1//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements a simple loop unroller. It works best when loops have
10// been canonicalized by the -indvars pass, allowing it to determine the trip
11// counts of loops easily.
12//===----------------------------------------------------------------------===//
13
14#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
15#include "llvm/ADT/DenseMap.h"
16#include "llvm/ADT/DenseMapInfo.h"
17#include "llvm/ADT/DenseSet.h"
18#include "llvm/ADT/STLExtras.h"
19#include "llvm/ADT/SetVector.h"
20#include "llvm/ADT/SmallPtrSet.h"
21#include "llvm/ADT/SmallVector.h"
22#include "llvm/ADT/StringRef.h"
23#include "llvm/Analysis/AssumptionCache.h"
24#include "llvm/Analysis/BlockFrequencyInfo.h"
25#include "llvm/Analysis/CodeMetrics.h"
26#include "llvm/Analysis/LoopAnalysisManager.h"
27#include "llvm/Analysis/LoopInfo.h"
28#include "llvm/Analysis/LoopPass.h"
29#include "llvm/Analysis/LoopUnrollAnalyzer.h"
30#include "llvm/Analysis/MemorySSA.h"
31#include "llvm/Analysis/OptimizationRemarkEmitter.h"
32#include "llvm/Analysis/ProfileSummaryInfo.h"
33#include "llvm/Analysis/ScalarEvolution.h"
34#include "llvm/Analysis/TargetTransformInfo.h"
35#include "llvm/Analysis/UniformityAnalysis.h"
36#include "llvm/IR/BasicBlock.h"
37#include "llvm/IR/CFG.h"
38#include "llvm/IR/Constant.h"
39#include "llvm/IR/Constants.h"
40#include "llvm/IR/DiagnosticInfo.h"
41#include "llvm/IR/Dominators.h"
42#include "llvm/IR/Function.h"
43#include "llvm/IR/Instruction.h"
44#include "llvm/IR/Instructions.h"
45#include "llvm/IR/Metadata.h"
46#include "llvm/IR/PassManager.h"
47#include "llvm/InitializePasses.h"
48#include "llvm/Pass.h"
49#include "llvm/Support/Casting.h"
50#include "llvm/Support/CommandLine.h"
51#include "llvm/Support/Debug.h"
52#include "llvm/Support/ErrorHandling.h"
53#include "llvm/Support/raw_ostream.h"
54#include "llvm/Transforms/Scalar.h"
55#include "llvm/Transforms/Scalar/LoopPassManager.h"
56#include "llvm/Transforms/Utils.h"
57#include "llvm/Transforms/Utils/LoopPeel.h"
58#include "llvm/Transforms/Utils/LoopSimplify.h"
59#include "llvm/Transforms/Utils/LoopUtils.h"
60#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
61#include "llvm/Transforms/Utils/SizeOpts.h"
62#include "llvm/Transforms/Utils/UnrollLoop.h"
63#include <algorithm>
64#include <cassert>
65#include <cstdint>
66#include <limits>
67#include <optional>
68#include <string>
69#include <tuple>
70#include <utility>
71
72using namespace llvm;
73
74#define DEBUG_TYPE "loop-unroll"
75
76cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(
77 "forget-scev-loop-unroll", cl::init(Val: false), cl::Hidden,
78 cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
79 " the current top-most loop. This is sometimes preferred to reduce"
80 " compile time."));
81
82static cl::opt<unsigned>
83 UnrollThreshold("unroll-threshold", cl::Hidden,
84 cl::desc("The cost threshold for loop unrolling"));
85
86static cl::opt<unsigned>
87 UnrollOptSizeThreshold(
88 "unroll-optsize-threshold", cl::init(Val: 0), cl::Hidden,
89 cl::desc("The cost threshold for loop unrolling when optimizing for "
90 "size"));
91
92static cl::opt<unsigned> UnrollPartialThreshold(
93 "unroll-partial-threshold", cl::Hidden,
94 cl::desc("The cost threshold for partial loop unrolling"));
95
96static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
97 "unroll-max-percent-threshold-boost", cl::init(Val: 400), cl::Hidden,
98 cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
99 "to the threshold when aggressively unrolling a loop due to the "
100 "dynamic cost savings. If completely unrolling a loop will reduce "
101 "the total runtime from X to Y, we boost the loop unroll "
102 "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
103 "X/Y). This limit avoids excessive code bloat."));
104
105static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
106 "unroll-max-iteration-count-to-analyze", cl::init(Val: 10), cl::Hidden,
107 cl::desc("Don't allow loop unrolling to simulate more than this number of "
108 "iterations when checking full unroll profitability"));
109
110static cl::opt<unsigned> UnrollCount(
111 "unroll-count", cl::Hidden,
112 cl::desc("Use this unroll count for all loops including those with "
113 "unroll_count pragma values, for testing purposes"));
114
115static cl::opt<unsigned> UnrollMaxCount(
116 "unroll-max-count", cl::Hidden,
117 cl::desc("Set the max unroll count for partial and runtime unrolling, for"
118 "testing purposes"));
119
120static cl::opt<unsigned> UnrollFullMaxCount(
121 "unroll-full-max-count", cl::Hidden,
122 cl::desc(
123 "Set the max unroll count for full unrolling, for testing purposes"));
124
125static cl::opt<bool>
126 UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
127 cl::desc("Allows loops to be partially unrolled until "
128 "-unroll-threshold loop size is reached."));
129
130static cl::opt<bool> UnrollAllowRemainder(
131 "unroll-allow-remainder", cl::Hidden,
132 cl::desc("Allow generation of a loop remainder (extra iterations) "
133 "when unrolling a loop."));
134
135static cl::opt<bool>
136 UnrollRuntime("unroll-runtime", cl::Hidden,
137 cl::desc("Unroll loops with run-time trip counts"));
138
139static cl::opt<unsigned> UnrollMaxUpperBound(
140 "unroll-max-upperbound", cl::init(Val: 8), cl::Hidden,
141 cl::desc(
142 "The max of trip count upper bound that is considered in unrolling"));
143
144static cl::opt<unsigned> PragmaUnrollThreshold(
145 "pragma-unroll-threshold", cl::init(Val: 16 * 1024), cl::Hidden,
146 cl::desc("Unrolled size limit for loops with unroll metadata "
147 "(full, enable, or count)."));
148
149static cl::opt<unsigned> FlatLoopTripCountThreshold(
150 "flat-loop-tripcount-threshold", cl::init(Val: 5), cl::Hidden,
151 cl::desc("If the runtime tripcount for the loop is lower than the "
152 "threshold, the loop is considered as flat and will be less "
153 "aggressively unrolled."));
154
155static cl::opt<bool> UnrollUnrollRemainder(
156 "unroll-remainder", cl::Hidden,
157 cl::desc("Allow the loop remainder to be unrolled."));
158
159// This option isn't ever intended to be enabled, it serves to allow
160// experiments to check the assumptions about when this kind of revisit is
161// necessary.
162static cl::opt<bool> UnrollRevisitChildLoops(
163 "unroll-revisit-child-loops", cl::Hidden,
164 cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
165 "This shouldn't typically be needed as child loops (or their "
166 "clones) were already visited."));
167
168static cl::opt<unsigned> UnrollThresholdAggressive(
169 "unroll-threshold-aggressive", cl::init(Val: 300), cl::Hidden,
170 cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
171 "optimizations"));
172static cl::opt<unsigned>
173 UnrollThresholdDefault("unroll-threshold-default", cl::init(Val: 150),
174 cl::Hidden,
175 cl::desc("Default threshold (max size of unrolled "
176 "loop), used in all but O3 optimizations"));
177
178static cl::opt<unsigned> PragmaUnrollFullMaxIterations(
179 "pragma-unroll-full-max-iterations", cl::init(Val: 1'000'000), cl::Hidden,
180 cl::desc("Maximum allowed iterations to unroll under pragma unroll full."));
181
182/// A magic value for use with the Threshold parameter to indicate
183/// that the loop unroll should be performed regardless of how much
184/// code expansion would result.
185static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
186
187/// Gather the various unrolling parameters based on the defaults, compiler
188/// flags, TTI overrides and user specified parameters.
189TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
190 Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
191 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
192 OptimizationRemarkEmitter &ORE, int OptLevel,
193 std::optional<unsigned> UserThreshold, std::optional<unsigned> UserCount,
194 std::optional<bool> UserAllowPartial, std::optional<bool> UserRuntime,
195 std::optional<bool> UserUpperBound,
196 std::optional<unsigned> UserFullUnrollMaxCount) {
197 TargetTransformInfo::UnrollingPreferences UP;
198
199 // Set up the defaults
200 UP.Threshold =
201 OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;
202 UP.MaxPercentThresholdBoost = 400;
203 UP.OptSizeThreshold = UnrollOptSizeThreshold;
204 UP.PartialThreshold = 150;
205 UP.PartialOptSizeThreshold = UnrollOptSizeThreshold;
206 UP.Count = 0;
207 UP.DefaultUnrollRuntimeCount = 8;
208 UP.MaxCount = std::numeric_limits<unsigned>::max();
209 UP.MaxUpperBound = UnrollMaxUpperBound;
210 UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
211 UP.BEInsns = 2;
212 UP.Partial = false;
213 UP.Runtime = false;
214 UP.AllowRemainder = true;
215 UP.UnrollRemainder = false;
216 UP.AllowExpensiveTripCount = false;
217 UP.Force = false;
218 UP.UpperBound = false;
219 UP.UnrollAndJam = false;
220 UP.UnrollAndJamInnerLoopThreshold = 60;
221 UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
222 UP.SCEVExpansionBudget = SCEVCheapExpansionBudget;
223 UP.RuntimeUnrollMultiExit = false;
224 UP.AddAdditionalAccumulators = false;
225
226 // Override with any target specific settings
227 TTI.getUnrollingPreferences(L, SE, UP, ORE: &ORE);
228
229 // Apply size attributes
230 bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
231 // Let unroll hints / pragmas take precedence over PGSO.
232 (hasUnrollTransformation(L) != TM_ForcedByUser &&
233 llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI,
234 QueryType: PGSOQueryType::IRPass));
235 if (OptForSize) {
236 UP.Threshold = UP.OptSizeThreshold;
237 UP.PartialThreshold = UP.PartialOptSizeThreshold;
238 UP.MaxPercentThresholdBoost = 100;
239 }
240
241 // Apply any user values specified by cl::opt
242 if (UnrollThreshold.getNumOccurrences() > 0)
243 UP.Threshold = UnrollThreshold;
244 if (UnrollPartialThreshold.getNumOccurrences() > 0)
245 UP.PartialThreshold = UnrollPartialThreshold;
246 if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
247 UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
248 if (UnrollMaxCount.getNumOccurrences() > 0)
249 UP.MaxCount = UnrollMaxCount;
250 if (UnrollMaxUpperBound.getNumOccurrences() > 0)
251 UP.MaxUpperBound = UnrollMaxUpperBound;
252 if (UnrollFullMaxCount.getNumOccurrences() > 0)
253 UP.FullUnrollMaxCount = UnrollFullMaxCount;
254 if (UnrollAllowPartial.getNumOccurrences() > 0)
255 UP.Partial = UnrollAllowPartial;
256 if (UnrollAllowRemainder.getNumOccurrences() > 0)
257 UP.AllowRemainder = UnrollAllowRemainder;
258 if (UnrollRuntime.getNumOccurrences() > 0)
259 UP.Runtime = UnrollRuntime;
260 if (UnrollMaxUpperBound == 0)
261 UP.UpperBound = false;
262 if (UnrollUnrollRemainder.getNumOccurrences() > 0)
263 UP.UnrollRemainder = UnrollUnrollRemainder;
264 if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
265 UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
266
267 // Apply user values provided by argument
268 if (UserThreshold) {
269 UP.Threshold = *UserThreshold;
270 UP.PartialThreshold = *UserThreshold;
271 }
272 if (UserCount)
273 UP.Count = *UserCount;
274 if (UserAllowPartial)
275 UP.Partial = *UserAllowPartial;
276 if (UserRuntime)
277 UP.Runtime = *UserRuntime;
278 if (UserUpperBound)
279 UP.UpperBound = *UserUpperBound;
280 if (UserFullUnrollMaxCount)
281 UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
282
283 return UP;
284}
285
286namespace {
287
288/// A struct to densely store the state of an instruction after unrolling at
289/// each iteration.
290///
291/// This is designed to work like a tuple of <Instruction *, int> for the
292/// purposes of hashing and lookup, but to be able to associate two boolean
293/// states with each key.
294struct UnrolledInstState {
295 Instruction *I;
296 int Iteration : 30;
297 unsigned IsFree : 1;
298 unsigned IsCounted : 1;
299};
300
301/// Hashing and equality testing for a set of the instruction states.
302struct UnrolledInstStateKeyInfo {
303 using PtrInfo = DenseMapInfo<Instruction *>;
304 using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;
305
306 static inline unsigned getHashValue(const UnrolledInstState &S) {
307 return PairInfo::getHashValue(PairVal: {S.I, S.Iteration});
308 }
309
310 static inline bool isEqual(const UnrolledInstState &LHS,
311 const UnrolledInstState &RHS) {
312 return PairInfo::isEqual(LHS: {LHS.I, LHS.Iteration}, RHS: {RHS.I, RHS.Iteration});
313 }
314};
315
316struct EstimatedUnrollCost {
317 /// The estimated cost after unrolling.
318 unsigned UnrolledCost;
319
320 /// The estimated dynamic cost of executing the instructions in the
321 /// rolled form.
322 unsigned RolledDynamicCost;
323};
324
325} // end anonymous namespace
326
327/// Figure out if the loop is worth full unrolling.
328///
329/// Complete loop unrolling can make some loads constant, and we need to know
330/// if that would expose any further optimization opportunities. This routine
331/// estimates this optimization. It computes cost of unrolled loop
332/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
333/// dynamic cost we mean that we won't count costs of blocks that are known not
334/// to be executed (i.e. if we have a branch in the loop and we know that at the
335/// given iteration its condition would be resolved to true, we won't add up the
336/// cost of the 'false'-block).
337/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
338/// the analysis failed (no benefits expected from the unrolling, or the loop is
339/// too big to analyze), the returned value is std::nullopt.
340static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
341 const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
342 const SmallPtrSetImpl<const Value *> &EphValues,
343 const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
344 unsigned MaxIterationsCountToAnalyze) {
345 // We want to be able to scale offsets by the trip count and add more offsets
346 // to them without checking for overflows, and we already don't want to
347 // analyze *massive* trip counts, so we force the max to be reasonably small.
348 assert(MaxIterationsCountToAnalyze <
349 (unsigned)(std::numeric_limits<int>::max() / 2) &&
350 "The unroll iterations max is too large!");
351
352 // Only analyze inner loops. We can't properly estimate cost of nested loops
353 // and we won't visit inner loops again anyway.
354 if (!L->isInnermost()) {
355 LLVM_DEBUG(dbgs().indent(3)
356 << "Not analyzing loop cost: not an innermost loop.\n");
357 return std::nullopt;
358 }
359
360 // Don't simulate loops with a big or unknown tripcount
361 if (!TripCount || TripCount > MaxIterationsCountToAnalyze) {
362 LLVM_DEBUG(dbgs().indent(3)
363 << "Not analyzing loop cost: trip count "
364 << (TripCount ? "too large" : "unknown") << ".\n");
365 return std::nullopt;
366 }
367
368 SmallSetVector<BasicBlock *, 16> BBWorklist;
369 SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
370 DenseMap<Value *, Value *> SimplifiedValues;
371 SmallVector<std::pair<Value *, Value *>, 4> SimplifiedInputValues;
372
373 // The estimated cost of the unrolled form of the loop. We try to estimate
374 // this by simplifying as much as we can while computing the estimate.
375 InstructionCost UnrolledCost = 0;
376
377 // We also track the estimated dynamic (that is, actually executed) cost in
378 // the rolled form. This helps identify cases when the savings from unrolling
379 // aren't just exposing dead control flows, but actual reduced dynamic
380 // instructions due to the simplifications which we expect to occur after
381 // unrolling.
382 InstructionCost RolledDynamicCost = 0;
383
384 // We track the simplification of each instruction in each iteration. We use
385 // this to recursively merge costs into the unrolled cost on-demand so that
386 // we don't count the cost of any dead code. This is essentially a map from
387 // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
388 DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
389
390 // A small worklist used to accumulate cost of instructions from each
391 // observable and reached root in the loop.
392 SmallVector<Instruction *, 16> CostWorklist;
393
394 // PHI-used worklist used between iterations while accumulating cost.
395 SmallVector<Instruction *, 4> PHIUsedList;
396
397 // Helper function to accumulate cost for instructions in the loop.
398 auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
399 assert(Iteration >= 0 && "Cannot have a negative iteration!");
400 assert(CostWorklist.empty() && "Must start with an empty cost list");
401 assert(PHIUsedList.empty() && "Must start with an empty phi used list");
402 CostWorklist.push_back(Elt: &RootI);
403 TargetTransformInfo::TargetCostKind CostKind =
404 RootI.getFunction()->hasMinSize() ?
405 TargetTransformInfo::TCK_CodeSize :
406 TargetTransformInfo::TCK_SizeAndLatency;
407 for (;; --Iteration) {
408 do {
409 Instruction *I = CostWorklist.pop_back_val();
410
411 // InstCostMap only uses I and Iteration as a key, the other two values
412 // don't matter here.
413 auto CostIter = InstCostMap.find(V: {.I: I, .Iteration: Iteration, .IsFree: 0, .IsCounted: 0});
414 if (CostIter == InstCostMap.end())
415 // If an input to a PHI node comes from a dead path through the loop
416 // we may have no cost data for it here. What that actually means is
417 // that it is free.
418 continue;
419 auto &Cost = *CostIter;
420 if (Cost.IsCounted)
421 // Already counted this instruction.
422 continue;
423
424 // Mark that we are counting the cost of this instruction now.
425 Cost.IsCounted = true;
426
427 // If this is a PHI node in the loop header, just add it to the PHI set.
428 if (auto *PhiI = dyn_cast<PHINode>(Val: I))
429 if (PhiI->getParent() == L->getHeader()) {
430 assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
431 "inherently simplify during unrolling.");
432 if (Iteration == 0)
433 continue;
434
435 // Push the incoming value from the backedge into the PHI used list
436 // if it is an in-loop instruction. We'll use this to populate the
437 // cost worklist for the next iteration (as we count backwards).
438 if (auto *OpI = dyn_cast<Instruction>(
439 Val: PhiI->getIncomingValueForBlock(BB: L->getLoopLatch())))
440 if (L->contains(Inst: OpI))
441 PHIUsedList.push_back(Elt: OpI);
442 continue;
443 }
444
445 // First accumulate the cost of this instruction.
446 if (!Cost.IsFree) {
447 // Consider simplified operands in instruction cost.
448 SmallVector<Value *, 4> Operands;
449 transform(Range: I->operands(), d_first: std::back_inserter(x&: Operands),
450 F: [&](Value *Op) {
451 if (auto Res = SimplifiedValues.lookup(Val: Op))
452 return Res;
453 return Op;
454 });
455 UnrolledCost += TTI.getInstructionCost(U: I, Operands, CostKind);
456 LLVM_DEBUG(dbgs().indent(3)
457 << "Adding cost of instruction (iteration " << Iteration
458 << "): ");
459 LLVM_DEBUG(I->dump());
460 }
461
462 // We must count the cost of every operand which is not free,
463 // recursively. If we reach a loop PHI node, simply add it to the set
464 // to be considered on the next iteration (backwards!).
465 for (Value *Op : I->operands()) {
466 // Check whether this operand is free due to being a constant or
467 // outside the loop.
468 auto *OpI = dyn_cast<Instruction>(Val: Op);
469 if (!OpI || !L->contains(Inst: OpI))
470 continue;
471
472 // Otherwise accumulate its cost.
473 CostWorklist.push_back(Elt: OpI);
474 }
475 } while (!CostWorklist.empty());
476
477 if (PHIUsedList.empty())
478 // We've exhausted the search.
479 break;
480
481 assert(Iteration > 0 &&
482 "Cannot track PHI-used values past the first iteration!");
483 CostWorklist.append(in_start: PHIUsedList.begin(), in_end: PHIUsedList.end());
484 PHIUsedList.clear();
485 }
486 };
487
488 // Ensure that we don't violate the loop structure invariants relied on by
489 // this analysis.
490 assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
491 assert(L->isLCSSAForm(DT) &&
492 "Must have loops in LCSSA form to track live-out values.");
493
494 LLVM_DEBUG(dbgs().indent(3)
495 << "Starting LoopUnroll profitability analysis...\n");
496
497 TargetTransformInfo::TargetCostKind CostKind =
498 L->getHeader()->getParent()->hasMinSize() ?
499 TargetTransformInfo::TCK_CodeSize : TargetTransformInfo::TCK_SizeAndLatency;
500 // Simulate execution of each iteration of the loop counting instructions,
501 // which would be simplified.
502 // Since the same load will take different values on different iterations,
503 // we literally have to go through all loop's iterations.
504 for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
505 LLVM_DEBUG(dbgs().indent(3) << "Analyzing iteration " << Iteration << "\n");
506
507 // Prepare for the iteration by collecting any simplified entry or backedge
508 // inputs.
509 for (Instruction &I : *L->getHeader()) {
510 auto *PHI = dyn_cast<PHINode>(Val: &I);
511 if (!PHI)
512 break;
513
514 // The loop header PHI nodes must have exactly two input: one from the
515 // loop preheader and one from the loop latch.
516 assert(
517 PHI->getNumIncomingValues() == 2 &&
518 "Must have an incoming value only for the preheader and the latch.");
519
520 Value *V = PHI->getIncomingValueForBlock(
521 BB: Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
522 if (Iteration != 0 && SimplifiedValues.count(Val: V))
523 V = SimplifiedValues.lookup(Val: V);
524 SimplifiedInputValues.push_back(Elt: {PHI, V});
525 }
526
527 // Now clear and re-populate the map for the next iteration.
528 SimplifiedValues.clear();
529 while (!SimplifiedInputValues.empty())
530 SimplifiedValues.insert(KV: SimplifiedInputValues.pop_back_val());
531
532 UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
533
534 BBWorklist.clear();
535 BBWorklist.insert(X: L->getHeader());
536 // Note that we *must not* cache the size, this loop grows the worklist.
537 for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
538 BasicBlock *BB = BBWorklist[Idx];
539
540 // Visit all instructions in the given basic block and try to simplify
541 // it. We don't change the actual IR, just count optimization
542 // opportunities.
543 for (Instruction &I : *BB) {
544 // These won't get into the final code - don't even try calculating the
545 // cost for them.
546 if (EphValues.count(Ptr: &I))
547 continue;
548
549 // Track this instruction's expected baseline cost when executing the
550 // rolled loop form.
551 RolledDynamicCost += TTI.getInstructionCost(U: &I, CostKind);
552
553 // Visit the instruction to analyze its loop cost after unrolling,
554 // and if the visitor returns true, mark the instruction as free after
555 // unrolling and continue.
556 bool IsFree = Analyzer.visit(I);
557 bool Inserted = InstCostMap.insert(V: {.I: &I, .Iteration: (int)Iteration,
558 .IsFree: (unsigned)IsFree,
559 /*IsCounted*/ false}).second;
560 (void)Inserted;
561 assert(Inserted && "Cannot have a state for an unvisited instruction!");
562
563 if (IsFree)
564 continue;
565
566 // Can't properly model a cost of a call.
567 // FIXME: With a proper cost model we should be able to do it.
568 if (auto *CI = dyn_cast<CallInst>(Val: &I)) {
569 const Function *Callee = CI->getCalledFunction();
570 if (!Callee || TTI.isLoweredToCall(F: Callee)) {
571 LLVM_DEBUG(dbgs().indent(3)
572 << "Can't analyze cost of loop with call\n");
573 return std::nullopt;
574 }
575 }
576
577 // If the instruction might have a side-effect recursively account for
578 // the cost of it and all the instructions leading up to it.
579 if (I.mayHaveSideEffects())
580 AddCostRecursively(I, Iteration);
581
582 // If unrolled body turns out to be too big, bail out.
583 if (UnrolledCost > MaxUnrolledLoopSize) {
584 LLVM_DEBUG({
585 dbgs().indent(3) << "Exceeded threshold.. exiting.\n";
586 dbgs().indent(3)
587 << "UnrolledCost: " << UnrolledCost
588 << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize << "\n";
589 });
590 return std::nullopt;
591 }
592 }
593
594 Instruction *TI = BB->getTerminator();
595
596 auto getSimplifiedConstant = [&](Value *V) -> Constant * {
597 if (SimplifiedValues.count(Val: V))
598 V = SimplifiedValues.lookup(Val: V);
599 return dyn_cast<Constant>(Val: V);
600 };
601
602 // Add in the live successors by first checking whether we have terminator
603 // that may be simplified based on the values simplified by this call.
604 BasicBlock *KnownSucc = nullptr;
605 if (CondBrInst *BI = dyn_cast<CondBrInst>(Val: TI)) {
606 if (auto *SimpleCond = getSimplifiedConstant(BI->getCondition())) {
607 // Just take the first successor if condition is undef
608 if (isa<UndefValue>(Val: SimpleCond))
609 KnownSucc = BI->getSuccessor(i: 0);
610 else if (ConstantInt *SimpleCondVal =
611 dyn_cast<ConstantInt>(Val: SimpleCond))
612 KnownSucc = BI->getSuccessor(i: SimpleCondVal->isZero() ? 1 : 0);
613 }
614 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) {
615 if (auto *SimpleCond = getSimplifiedConstant(SI->getCondition())) {
616 // Just take the first successor if condition is undef
617 if (isa<UndefValue>(Val: SimpleCond))
618 KnownSucc = SI->getSuccessor(idx: 0);
619 else if (ConstantInt *SimpleCondVal =
620 dyn_cast<ConstantInt>(Val: SimpleCond))
621 KnownSucc = SI->findCaseValue(C: SimpleCondVal)->getCaseSuccessor();
622 }
623 }
624 if (KnownSucc) {
625 if (L->contains(BB: KnownSucc))
626 BBWorklist.insert(X: KnownSucc);
627 else
628 ExitWorklist.insert(X: {BB, KnownSucc});
629 continue;
630 }
631
632 // Add BB's successors to the worklist.
633 for (BasicBlock *Succ : successors(BB))
634 if (L->contains(BB: Succ))
635 BBWorklist.insert(X: Succ);
636 else
637 ExitWorklist.insert(X: {BB, Succ});
638 AddCostRecursively(*TI, Iteration);
639 }
640
641 // If we found no optimization opportunities on the first iteration, we
642 // won't find them on later ones too.
643 if (UnrolledCost == RolledDynamicCost) {
644 LLVM_DEBUG({
645 dbgs().indent(3) << "No opportunities found.. exiting.\n";
646 dbgs().indent(3) << "UnrolledCost: " << UnrolledCost << "\n";
647 });
648 return std::nullopt;
649 }
650 }
651
652 while (!ExitWorklist.empty()) {
653 BasicBlock *ExitingBB, *ExitBB;
654 std::tie(args&: ExitingBB, args&: ExitBB) = ExitWorklist.pop_back_val();
655
656 for (Instruction &I : *ExitBB) {
657 auto *PN = dyn_cast<PHINode>(Val: &I);
658 if (!PN)
659 break;
660
661 Value *Op = PN->getIncomingValueForBlock(BB: ExitingBB);
662 if (auto *OpI = dyn_cast<Instruction>(Val: Op))
663 if (L->contains(Inst: OpI))
664 AddCostRecursively(*OpI, TripCount - 1);
665 }
666 }
667
668 assert(UnrolledCost.isValid() && RolledDynamicCost.isValid() &&
669 "All instructions must have a valid cost, whether the "
670 "loop is rolled or unrolled.");
671
672 LLVM_DEBUG({
673 dbgs().indent(3) << "Analysis finished:\n";
674 dbgs().indent(3) << "UnrolledCost: " << UnrolledCost
675 << ", RolledDynamicCost: " << RolledDynamicCost << "\n";
676 });
677 return {{.UnrolledCost: unsigned(UnrolledCost.getValue()),
678 .RolledDynamicCost: unsigned(RolledDynamicCost.getValue())}};
679}
680
681UnrollCostEstimator::UnrollCostEstimator(
682 const Loop *L, const TargetTransformInfo &TTI,
683 const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns,
684 bool TripCountIsUniform) {
685 CodeMetrics Metrics;
686 for (BasicBlock *BB : L->blocks())
687 Metrics.analyzeBasicBlock(BB, TTI, EphValues, /* PrepareForLTO= */ false,
688 L);
689 NumInlineCandidates = Metrics.NumInlineCandidates;
690 NotDuplicatable = Metrics.notDuplicatable;
691 Convergence = Metrics.Convergence;
692 LoopSize = Metrics.NumInsts;
693 // Convergent operations make the remainder prelude unsafe by adding a
694 // control-flow dependency, unless the trip count is uniform per
695 // UniformityInfo, in which case all paths agree and the remainder is safe.
696 ConvergenceAllowsRuntime =
697 (Metrics.Convergence != ConvergenceKind::Uncontrolled &&
698 !getLoopConvergenceHeart(TheLoop: L)) ||
699 TripCountIsUniform;
700
701 // Don't allow an estimate of size zero. This would allows unrolling of loops
702 // with huge iteration counts, which is a compile time problem even if it's
703 // not a problem for code quality. Also, the code using this size may assume
704 // that each loop has at least three instructions (likely a conditional
705 // branch, a comparison feeding that branch, and some kind of loop increment
706 // feeding that comparison instruction).
707 if (LoopSize.isValid() && LoopSize < BEInsns + 1)
708 // This is an open coded max() on InstructionCost
709 LoopSize = BEInsns + 1;
710}
711
712bool UnrollCostEstimator::canUnroll(OptimizationRemarkEmitter *ORE,
713 const Loop *L) const {
714 auto ReportCannotUnroll = [&](StringRef Reason) {
715 LLVM_DEBUG(dbgs().indent(1) << "Not unrolling: " << Reason << ".\n");
716 if (ORE && L)
717 ORE->emit(RemarkBuilder: [&]() {
718 return OptimizationRemarkMissed(DEBUG_TYPE, "CannotUnrollLoop",
719 L->getStartLoc(), L->getHeader())
720 << "unable to unroll loop: " << Reason;
721 });
722 };
723
724 if (Convergence == ConvergenceKind::ExtendedLoop) {
725 ReportCannotUnroll("contains convergent operations");
726 return false;
727 }
728 if (!LoopSize.isValid()) {
729 ReportCannotUnroll("loop size could not be computed");
730 return false;
731 }
732 if (NotDuplicatable) {
733 ReportCannotUnroll("contains non-duplicatable instructions");
734 return false;
735 }
736 return true;
737}
738
739uint64_t UnrollCostEstimator::getUnrolledLoopSize(
740 const TargetTransformInfo::UnrollingPreferences &UP,
741 unsigned CountOverwrite) const {
742 unsigned LS = LoopSize.getValue();
743 assert(LS >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
744 if (CountOverwrite)
745 return static_cast<uint64_t>(LS - UP.BEInsns) * CountOverwrite + UP.BEInsns;
746 else
747 return static_cast<uint64_t>(LS - UP.BEInsns) * UP.Count + UP.BEInsns;
748}
749
750// Returns true if the loop has an unroll(full) pragma.
751static bool hasUnrollFullPragma(const Loop *L) {
752 return getUnrollMetadataForLoop(L, Name: "llvm.loop.unroll.full");
753}
754
755// Returns true if the loop has an unroll(enable) pragma. This metadata is used
756// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
757static bool hasUnrollEnablePragma(const Loop *L) {
758 return getUnrollMetadataForLoop(L, Name: "llvm.loop.unroll.enable");
759}
760
761// Returns true if the loop has a runtime unroll(disable) pragma.
762static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
763 return getUnrollMetadataForLoop(L, Name: "llvm.loop.unroll.runtime.disable");
764}
765
766/// Returns true if the SCEV expression is uniform, i.e., all threads in a
767/// convergent execution agree on its value. Recursively checks operands.
768/// Returns false if the SCEV could not be computed.
769static bool isSCEVUniform(const SCEV *S, UniformityInfo &UI) {
770 if (isa<SCEVCouldNotCompute>(Val: S))
771 return false;
772 if (isa<SCEVConstant>(Val: S))
773 return true;
774 if (auto *U = dyn_cast<SCEVUnknown>(Val: S))
775 return UI.isUniformAtDef(V: U->getValue());
776 for (const SCEV *Op : S->operands()) {
777 if (!isSCEVUniform(S: Op, UI))
778 return false;
779 }
780 return true;
781}
782
783// If loop has an unroll_count pragma return the (necessarily
784// positive) value from the pragma. Otherwise return 0.
785static unsigned unrollCountPragmaValue(const Loop *L) {
786 MDNode *MD = getUnrollMetadataForLoop(L, Name: "llvm.loop.unroll.count");
787 if (MD) {
788 assert(MD->getNumOperands() == 2 &&
789 "Unroll count hint metadata should have two operands.");
790 unsigned Count =
791 mdconst::extract<ConstantInt>(MD: MD->getOperand(I: 1))->getZExtValue();
792 assert(Count >= 1 && "Unroll count must be positive.");
793 return Count;
794 }
795 return 0;
796}
797
798UnrollPragmaInfo::UnrollPragmaInfo(const Loop *L)
799 : UserUnrollCount(UnrollCount.getNumOccurrences() > 0),
800 PragmaFullUnroll(hasUnrollFullPragma(L)),
801 PragmaCount(unrollCountPragmaValue(L)),
802 PragmaEnableUnroll(hasUnrollEnablePragma(L)),
803 PragmaRuntimeUnrollDisable(hasRuntimeUnrollDisablePragma(L)),
804 ExplicitUnroll(PragmaCount > 0 || PragmaFullUnroll ||
805 PragmaEnableUnroll || UserUnrollCount) {}
806
807// Computes the boosting factor for complete unrolling.
808// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
809// be beneficial to fully unroll the loop even if unrolledcost is large. We
810// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
811// the unroll threshold.
812static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
813 unsigned MaxPercentThresholdBoost) {
814 if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
815 return 100;
816 else if (Cost.UnrolledCost != 0)
817 // The boosting factor is RolledDynamicCost / UnrolledCost
818 return std::min(a: 100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
819 b: MaxPercentThresholdBoost);
820 else
821 return MaxPercentThresholdBoost;
822}
823
824static std::optional<unsigned>
825shouldPragmaUnroll(Loop *L, const UnrollPragmaInfo &PInfo,
826 const unsigned TripMultiple, const unsigned TripCount,
827 unsigned MaxTripCount, const UnrollCostEstimator UCE,
828 const TargetTransformInfo::UnrollingPreferences &UP,
829 OptimizationRemarkEmitter *ORE) {
830
831 // Using unroll pragma
832 // 1st priority is unroll count set by "unroll-count" option.
833
834 if (PInfo.UserUnrollCount) {
835 if (UP.AllowRemainder &&
836 UCE.getUnrolledLoopSize(UP, CountOverwrite: (unsigned)UnrollCount) < UP.Threshold) {
837 LLVM_DEBUG(dbgs().indent(2) << "Unrolling with user-specified count: "
838 << UnrollCount << ".\n");
839 return (unsigned)UnrollCount;
840 }
841 LLVM_DEBUG(dbgs().indent(2)
842 << "Not unrolling with user count " << UnrollCount << ": "
843 << (UP.AllowRemainder ? "exceeds threshold"
844 : "remainder not allowed")
845 << ".\n");
846 }
847
848 // 2nd priority is unroll count set by pragma.
849 if (PInfo.PragmaCount > 0) {
850 if ((UP.AllowRemainder || (TripMultiple % PInfo.PragmaCount == 0))) {
851 LLVM_DEBUG(dbgs().indent(2) << "Unrolling with pragma count: "
852 << PInfo.PragmaCount << ".\n");
853 return PInfo.PragmaCount;
854 }
855 LLVM_DEBUG(dbgs().indent(2)
856 << "Not unrolling with pragma count " << PInfo.PragmaCount
857 << ": remainder not allowed, count does not divide trip "
858 << "multiple " << TripMultiple << ".\n");
859 ORE->emit(RemarkBuilder: [&]() {
860 return OptimizationRemarkAnalysis(DEBUG_TYPE, "PragmaUnrollCountRejected",
861 L->getStartLoc(), L->getHeader())
862 << "may be unable to unroll loop with count "
863 << ore::NV("PragmaCount", PInfo.PragmaCount)
864 << ": remainder loop is not allowed and count does not divide "
865 "trip multiple "
866 << ore::NV("TripMultiple", TripMultiple);
867 });
868 }
869
870 if (PInfo.PragmaFullUnroll) {
871 if (TripCount != 0) {
872 // Certain cases with UBSAN can cause trip count to be calculated as
873 // INT_MAX, Block full unrolling at a reasonable limit so that the
874 // compiler doesn't hang trying to unroll the loop. See PR77842
875 if (TripCount > PragmaUnrollFullMaxIterations) {
876 LLVM_DEBUG(dbgs().indent(2)
877 << "Won't unroll; trip count is too large.\n");
878 ORE->emit(RemarkBuilder: [&]() {
879 return OptimizationRemarkAnalysis(DEBUG_TYPE,
880 "PragmaFullUnrollTripCountTooLarge",
881 L->getStartLoc(), L->getHeader())
882 << "may be unable to fully unroll loop: trip count "
883 << ore::NV("TripCount", TripCount) << " exceeds limit "
884 << ore::NV("Limit", PragmaUnrollFullMaxIterations);
885 });
886 return std::nullopt;
887 }
888
889 LLVM_DEBUG(dbgs().indent(2)
890 << "Fully unrolling with trip count: " << TripCount << ".\n");
891 return TripCount;
892 }
893 LLVM_DEBUG(dbgs().indent(2)
894 << "Not fully unrolling: unknown trip count.\n");
895 ORE->emit(RemarkBuilder: [&]() {
896 return OptimizationRemarkAnalysis(DEBUG_TYPE,
897 "PragmaFullUnrollUnknownTripCount",
898 L->getStartLoc(), L->getHeader())
899 << "may be unable to fully unroll loop: trip count is unknown";
900 });
901 }
902
903 if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount &&
904 MaxTripCount <= UP.MaxUpperBound) {
905 LLVM_DEBUG(dbgs().indent(2)
906 << "Unrolling with max trip count: " << MaxTripCount << ".\n");
907 return MaxTripCount;
908 }
909
910 return std::nullopt;
911}
912
913static std::optional<unsigned> shouldFullUnroll(
914 Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
915 ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
916 const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
917 const TargetTransformInfo::UnrollingPreferences &UP) {
918 assert(FullUnrollTripCount && "should be non-zero!");
919
920 if (FullUnrollTripCount > UP.FullUnrollMaxCount) {
921 LLVM_DEBUG(dbgs().indent(2)
922 << "Not unrolling: trip count " << FullUnrollTripCount
923 << " exceeds max count " << UP.FullUnrollMaxCount << ".\n");
924 return std::nullopt;
925 }
926
927 // When computing the unrolled size, note that BEInsns are not replicated
928 // like the rest of the loop body.
929 uint64_t UnrolledSize = UCE.getUnrolledLoopSize(UP, CountOverwrite: FullUnrollTripCount);
930 if (UnrolledSize < UP.Threshold) {
931 LLVM_DEBUG(dbgs().indent(2) << "Unrolling: size " << UnrolledSize
932 << " < threshold " << UP.Threshold << ".\n");
933 return FullUnrollTripCount;
934 }
935
936 LLVM_DEBUG(dbgs().indent(2)
937 << "Unrolled size " << UnrolledSize << " exceeds threshold "
938 << UP.Threshold << "; checking for cost benefit.\n");
939
940 // The loop isn't that small, but we still can fully unroll it if that
941 // helps to remove a significant number of instructions.
942 // To check that, run additional analysis on the loop.
943 if (std::optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
944 L, TripCount: FullUnrollTripCount, DT, SE, EphValues, TTI,
945 MaxUnrolledLoopSize: UP.Threshold * UP.MaxPercentThresholdBoost / 100,
946 MaxIterationsCountToAnalyze: UP.MaxIterationsCountToAnalyze)) {
947 unsigned Boost =
948 getFullUnrollBoostingFactor(Cost: *Cost, MaxPercentThresholdBoost: UP.MaxPercentThresholdBoost);
949 unsigned BoostedThreshold = UP.Threshold * Boost / 100;
950 if (Cost->UnrolledCost < BoostedThreshold) {
951 LLVM_DEBUG(dbgs().indent(2) << "Profitable after cost analysis.\n");
952 return FullUnrollTripCount;
953 }
954 LLVM_DEBUG(dbgs().indent(2)
955 << "Not unrolling: cost " << Cost->UnrolledCost
956 << " >= boosted threshold " << BoostedThreshold << ".\n");
957 }
958
959 return std::nullopt;
960}
961
962static std::optional<unsigned>
963shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
964 const UnrollCostEstimator UCE,
965 const TargetTransformInfo::UnrollingPreferences &UP) {
966
967 if (!TripCount)
968 return std::nullopt;
969
970 if (!UP.Partial) {
971 LLVM_DEBUG(dbgs().indent(2) << "Will not try to unroll partially because "
972 << "-unroll-allow-partial not given\n");
973 return 0;
974 }
975 unsigned count = UP.Count;
976 if (count == 0)
977 count = TripCount;
978 if (UP.PartialThreshold != NoThreshold) {
979 // Reduce unroll count to be modulo of TripCount for partial unrolling.
980 if (UCE.getUnrolledLoopSize(UP, CountOverwrite: count) > UP.PartialThreshold) {
981 unsigned NewCount =
982 (std::max(a: UP.PartialThreshold, b: UP.BEInsns + 1) - UP.BEInsns) /
983 (LoopSize - UP.BEInsns);
984 LLVM_DEBUG(dbgs().indent(2)
985 << "Unrolled size exceeds threshold; reducing count "
986 << "from " << count << " to " << NewCount << ".\n");
987 count = NewCount;
988 }
989 if (count > UP.MaxCount)
990 count = UP.MaxCount;
991 while (count != 0 && TripCount % count != 0)
992 count--;
993 if (UP.AllowRemainder && count <= 1) {
994 // If there is no Count that is modulo of TripCount, set Count to
995 // largest power-of-two factor that satisfies the threshold limit.
996 // As we'll create fixup loop, do the type of unrolling only if
997 // remainder loop is allowed.
998 // Note: DefaultUnrollRuntimeCount is used as a reasonable starting point
999 // even though this is partial unrolling (not runtime unrolling).
1000 count = UP.DefaultUnrollRuntimeCount;
1001 while (count != 0 &&
1002 UCE.getUnrolledLoopSize(UP, CountOverwrite: count) > UP.PartialThreshold)
1003 count >>= 1;
1004 }
1005 if (count < 2) {
1006 LLVM_DEBUG(dbgs().indent(2)
1007 << "Will not partially unroll: no profitable count.\n");
1008 count = 0;
1009 }
1010 } else {
1011 count = TripCount;
1012 }
1013 if (count > UP.MaxCount)
1014 count = UP.MaxCount;
1015
1016 LLVM_DEBUG(dbgs().indent(2)
1017 << "Partially unrolling with count: " << count << "\n");
1018
1019 return count;
1020}
1021// Calculates unroll count and writes it to UP.Count.
1022// Unless IgnoreUser is true, will also use metadata and command-line options
1023// that are specific to the LoopUnroll pass (which, for instance, are
1024// irrelevant for the LoopUnrollAndJam pass).
1025// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
1026// many LoopUnroll-specific options. The shared functionality should be
1027// refactored into it own function.
1028void llvm::computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
1029 DominatorTree &DT, LoopInfo *LI,
1030 AssumptionCache *AC, ScalarEvolution &SE,
1031 const SmallPtrSetImpl<const Value *> &EphValues,
1032 OptimizationRemarkEmitter *ORE,
1033 const unsigned TripCount,
1034 const unsigned MaxTripCount, const bool MaxOrZero,
1035 const unsigned TripMultiple,
1036 const UnrollCostEstimator &UCE,
1037 TargetTransformInfo::UnrollingPreferences &UP,
1038 TargetTransformInfo::PeelingPreferences &PP) {
1039
1040 unsigned LoopSize = UCE.getRolledLoopSize();
1041
1042 LLVM_DEBUG(dbgs().indent(1) << "Computing unroll count: TripCount="
1043 << TripCount << ", MaxTripCount=" << MaxTripCount
1044 << (MaxOrZero ? " (MaxOrZero)" : "")
1045 << ", TripMultiple=" << TripMultiple << "\n");
1046
1047 UnrollPragmaInfo PInfo(L);
1048 LLVM_DEBUG({
1049 if (PInfo.ExplicitUnroll) {
1050 dbgs().indent(1) << "Explicit unroll requested:";
1051 if (PInfo.UserUnrollCount)
1052 dbgs() << " user-count";
1053 if (PInfo.PragmaFullUnroll)
1054 dbgs() << " pragma-full";
1055 if (PInfo.PragmaCount > 0)
1056 dbgs() << " pragma-count(" << PInfo.PragmaCount << ")";
1057 if (PInfo.PragmaEnableUnroll)
1058 dbgs() << " pragma-enable";
1059 dbgs() << "\n";
1060 }
1061 });
1062
1063 // Use an explicit peel count that has been specified for testing. In this
1064 // case it's not permitted to also specify an explicit unroll count.
1065 if (PP.PeelCount) {
1066 if (UnrollCount.getNumOccurrences() > 0) {
1067 reportFatalUsageError(reason: "Cannot specify both explicit peel count and "
1068 "explicit unroll count");
1069 }
1070 LLVM_DEBUG(dbgs().indent(2)
1071 << "Using explicit peel count: " << PP.PeelCount << ".\n");
1072 UP.Count = 1;
1073 UP.Runtime = false;
1074 return;
1075 }
1076
1077 // If a user provided an explicit unroll pragma (with or without count),
1078 // enable runtime unrolling and override expensive trip count checks.
1079 if (PInfo.PragmaEnableUnroll || PInfo.PragmaCount > 0) {
1080 UP.AllowExpensiveTripCount = true;
1081 UP.Runtime = true;
1082 }
1083
1084 // Check for explicit Count.
1085 // 1st priority is unroll count set by "unroll-count" option.
1086 // 2nd priority is unroll count set by pragma.
1087 LLVM_DEBUG(dbgs().indent(1) << "Trying pragma unroll...\n");
1088 if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount,
1089 MaxTripCount, UCE, UP, ORE)) {
1090 UP.Count = *UnrollFactor;
1091
1092 if (PInfo.UserUnrollCount || (PInfo.PragmaCount > 0)) {
1093 UP.AllowExpensiveTripCount = true;
1094 UP.Force = true;
1095 }
1096 return;
1097 } else {
1098 if (PInfo.ExplicitUnroll && TripCount != 0) {
1099 // If the loop has an unrolling pragma, we want to be more aggressive with
1100 // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
1101 // value which is larger than the default limits.
1102 UP.Threshold = std::max<unsigned>(a: UP.Threshold, b: PragmaUnrollThreshold);
1103 UP.PartialThreshold =
1104 std::max<unsigned>(a: UP.PartialThreshold, b: PragmaUnrollThreshold);
1105 }
1106 }
1107
1108 // 3rd priority is exact full unrolling. This will eliminate all copies
1109 // of some exit test.
1110 LLVM_DEBUG(dbgs().indent(1) << "Trying full unroll...\n");
1111 assert(UP.Count == 0);
1112 if (TripCount) {
1113 if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
1114 FullUnrollTripCount: TripCount, UCE, UP)) {
1115 UP.Count = *UnrollFactor;
1116 return;
1117 }
1118 }
1119
1120 // 4th priority is bounded unrolling.
1121 // We can unroll by the upper bound amount if it's generally allowed or if
1122 // we know that the loop is executed either the upper bound or zero times.
1123 // (MaxOrZero unrolling keeps only the first loop test, so the number of
1124 // loop tests remains the same compared to the non-unrolled version, whereas
1125 // the generic upper bound unrolling keeps all but the last loop test so the
1126 // number of loop tests goes up which may end up being worse on targets with
1127 // constrained branch predictor resources so is controlled by an option.)
1128 // In addition we only unroll small upper bounds.
1129 // Note that the cost of bounded unrolling is always strictly greater than
1130 // cost of exact full unrolling. As such, if we have an exact count and
1131 // found it unprofitable, we'll never chose to bounded unroll.
1132 LLVM_DEBUG(dbgs().indent(1) << "Trying upper-bound unroll...\n");
1133 if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
1134 MaxTripCount <= UP.MaxUpperBound) {
1135 if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
1136 FullUnrollTripCount: MaxTripCount, UCE, UP)) {
1137 UP.Count = *UnrollFactor;
1138 return;
1139 }
1140 }
1141
1142 // 5th priority is loop peeling.
1143 LLVM_DEBUG(dbgs().indent(1) << "Trying loop peeling...\n");
1144 computePeelCount(L, LoopSize, PP, TripCount, DT, SE, TTI, AC, Threshold: UP.Threshold);
1145 if (PP.PeelCount) {
1146 LLVM_DEBUG(dbgs().indent(2)
1147 << "Peeling with count: " << PP.PeelCount << ".\n");
1148 UP.Runtime = false;
1149 UP.Count = 1;
1150 return;
1151 }
1152
1153 // Before starting partial unrolling, set up.partial to true,
1154 // if user explicitly asked for unrolling
1155 if (TripCount)
1156 UP.Partial |= PInfo.ExplicitUnroll;
1157
1158 // 6th priority is partial unrolling.
1159 // Try partial unroll only when TripCount could be statically calculated.
1160 LLVM_DEBUG(dbgs().indent(1) << "Trying partial unroll...\n");
1161 if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) {
1162 UP.Count = *UnrollFactor;
1163 return;
1164 }
1165 assert(TripCount == 0 &&
1166 "All cases when TripCount is constant should be covered here.");
1167
1168 // 7th priority is runtime unrolling.
1169 LLVM_DEBUG(dbgs().indent(1) << "Trying runtime unroll...\n");
1170 // Don't unroll a runtime trip count loop when it is disabled.
1171 if (PInfo.PragmaRuntimeUnrollDisable) {
1172 LLVM_DEBUG(dbgs().indent(2)
1173 << "Not runtime unrolling: disabled by pragma.\n");
1174 return;
1175 }
1176
1177 // Don't unroll a small upper bound loop unless user or TTI asked to do so.
1178 if (MaxTripCount && !UP.Force && MaxTripCount <= UP.MaxUpperBound) {
1179 LLVM_DEBUG(dbgs().indent(2) << "Not runtime unrolling: max trip count "
1180 << MaxTripCount << " is small (<= "
1181 << UP.MaxUpperBound << ") and not forced.\n");
1182 return;
1183 }
1184
1185 // Check if the runtime trip count is too small when profile is available.
1186 if (L->getHeader()->getParent()->hasProfileData()) {
1187 if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
1188 if (*ProfileTripCount < FlatLoopTripCountThreshold)
1189 return;
1190 else
1191 UP.AllowExpensiveTripCount = true;
1192 }
1193 }
1194 if (!UP.Runtime) {
1195 LLVM_DEBUG(dbgs().indent(2)
1196 << "Will not try to unroll loop with runtime trip count "
1197 << "because -unroll-runtime not given\n");
1198 return;
1199 }
1200
1201 assert(UP.Count == 0);
1202 UP.Count = UP.DefaultUnrollRuntimeCount;
1203
1204 // Reduce unroll count to be the largest power-of-two factor of
1205 // the original count which satisfies the threshold limit.
1206 while (UP.Count != 0 &&
1207 UCE.getUnrolledLoopSize(UP) > UP.PartialThreshold)
1208 UP.Count >>= 1;
1209
1210#ifndef NDEBUG
1211 unsigned OrigCount = UP.Count;
1212#endif
1213
1214 if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
1215 while (UP.Count != 0 && TripMultiple % UP.Count != 0)
1216 UP.Count >>= 1;
1217 LLVM_DEBUG(dbgs().indent(2)
1218 << "Remainder loop is restricted (that could be architecture "
1219 "specific or because the loop contains a convergent "
1220 "instruction), so unroll count must divide the trip "
1221 "multiple, "
1222 << TripMultiple << ". Reducing unroll count from " << OrigCount
1223 << " to " << UP.Count << ".\n");
1224 }
1225
1226 if (UP.Count > UP.MaxCount)
1227 UP.Count = UP.MaxCount;
1228
1229 if (MaxTripCount && UP.Count > MaxTripCount)
1230 UP.Count = MaxTripCount;
1231
1232 if (UP.Count < 2)
1233 UP.Count = 0;
1234 else
1235 LLVM_DEBUG(dbgs().indent(2)
1236 << "Runtime unrolling with count: " << UP.Count << "\n");
1237 return;
1238}
1239
1240static LoopUnrollResult
1241tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
1242 const TargetTransformInfo &TTI, AssumptionCache &AC,
1243 OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
1244 ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
1245 bool OnlyFullUnroll, bool OnlyWhenForced, bool ForgetAllSCEV,
1246 std::optional<unsigned> ProvidedCount,
1247 std::optional<unsigned> ProvidedThreshold,
1248 std::optional<bool> ProvidedAllowPartial,
1249 std::optional<bool> ProvidedRuntime,
1250 std::optional<bool> ProvidedUpperBound,
1251 std::optional<bool> ProvidedAllowPeeling,
1252 std::optional<bool> ProvidedAllowProfileBasedPeeling,
1253 std::optional<unsigned> ProvidedFullUnrollMaxCount,
1254 UniformityInfo *UI = nullptr, AAResults *AA = nullptr) {
1255
1256 LLVM_DEBUG(dbgs() << "Loop Unroll: F["
1257 << L->getHeader()->getParent()->getName() << "] Loop %"
1258 << L->getHeader()->getName()
1259 << " (depth=" << L->getLoopDepth() << ")\n");
1260 TransformationMode TM = hasUnrollTransformation(L);
1261 if (TM & TM_Disable) {
1262 LLVM_DEBUG(dbgs().indent(1) << "Not unrolling: transformation disabled by "
1263 << "metadata.\n");
1264 return LoopUnrollResult::Unmodified;
1265 }
1266
1267 // If this loop isn't forced to be unrolled, avoid unrolling it when the
1268 // parent loop has an explicit unroll-and-jam pragma. This is to prevent
1269 // automatic unrolling from interfering with the user requested
1270 // transformation.
1271 Loop *ParentL = L->getParentLoop();
1272 if (ParentL != nullptr &&
1273 hasUnrollAndJamTransformation(L: ParentL) == TM_ForcedByUser &&
1274 hasUnrollTransformation(L) != TM_ForcedByUser) {
1275 LLVM_DEBUG(dbgs().indent(1) << "Not unrolling loop since parent loop has"
1276 << " llvm.loop.unroll_and_jam.\n");
1277 return LoopUnrollResult::Unmodified;
1278 }
1279
1280 // If this loop isn't forced to be unrolled, avoid unrolling it when the
1281 // loop has an explicit unroll-and-jam pragma. This is to prevent automatic
1282 // unrolling from interfering with the user requested transformation.
1283 if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser &&
1284 hasUnrollTransformation(L) != TM_ForcedByUser) {
1285 LLVM_DEBUG(
1286 dbgs().indent(1)
1287 << "Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");
1288 return LoopUnrollResult::Unmodified;
1289 }
1290
1291 if (!L->isLoopSimplifyForm()) {
1292 LLVM_DEBUG(dbgs().indent(1)
1293 << "Not unrolling loop which is not in loop-simplify form.\n");
1294 if (TM & TM_ForcedByUser) {
1295 ORE.emit(RemarkBuilder: [&]() {
1296 return OptimizationRemarkMissed(DEBUG_TYPE, "NotInLoopSimplifyForm",
1297 L->getStartLoc(), L->getHeader())
1298 << "unable to unroll loop: not in loop-simplify form";
1299 });
1300 }
1301 return LoopUnrollResult::Unmodified;
1302 }
1303
1304 // When automatic unrolling is disabled, do not unroll unless overridden for
1305 // this loop.
1306 if (OnlyWhenForced && !(TM & TM_Enable)) {
1307 LLVM_DEBUG(dbgs().indent(1) << "Not unrolling: automatic unrolling "
1308 << "disabled and loop not explicitly "
1309 << "enabled.\n");
1310 return LoopUnrollResult::Unmodified;
1311 }
1312
1313 bool OptForSize = L->getHeader()->getParent()->hasOptSize();
1314 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
1315 L, SE, TTI, BFI, PSI, ORE, OptLevel, UserThreshold: ProvidedThreshold, UserCount: ProvidedCount,
1316 UserAllowPartial: ProvidedAllowPartial, UserRuntime: ProvidedRuntime, UserUpperBound: ProvidedUpperBound,
1317 UserFullUnrollMaxCount: ProvidedFullUnrollMaxCount);
1318 TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
1319 L, SE, TTI, UserAllowPeeling: ProvidedAllowPeeling, UserAllowProfileBasedPeeling: ProvidedAllowProfileBasedPeeling, UnrollingSpecficValues: true);
1320
1321 // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
1322 // as threshold later on.
1323 if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
1324 !OptForSize) {
1325 LLVM_DEBUG(dbgs().indent(1) << "Not unrolling: all thresholds are zero.\n");
1326 if (TM & TM_ForcedByUser) {
1327 ORE.emit(RemarkBuilder: [&]() {
1328 return OptimizationRemarkMissed(DEBUG_TYPE, "UnrollThresholdsZero",
1329 L->getStartLoc(), L->getHeader())
1330 << "unable to unroll loop: unroll threshold is zero";
1331 });
1332 }
1333 return LoopUnrollResult::Unmodified;
1334 }
1335
1336 SmallPtrSet<const Value *, 32> EphValues;
1337 CodeMetrics::collectEphemeralValues(L, AC: &AC, EphValues);
1338
1339 // Check if the backedge-taken count is uniform before constructing UCE.
1340 // This is used to allow runtime unrolling with a remainder for convergent
1341 // loops when all threads agree on the trip count.
1342 const SCEV *BTC = SE.getBackedgeTakenCount(L);
1343 bool TripCountIsUniform = UI && isSCEVUniform(S: BTC, UI&: *UI);
1344 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns, TripCountIsUniform);
1345 if (!UCE.canUnroll(ORE: (TM & TM_ForcedByUser) ? &ORE : nullptr, L))
1346 return LoopUnrollResult::Unmodified;
1347
1348 unsigned LoopSize = UCE.getRolledLoopSize();
1349 LLVM_DEBUG(dbgs() << "Loop Size = " << LoopSize << "\n");
1350
1351 // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
1352 // later), to (fully) unroll loops, if it does not increase code size.
1353 if (OptForSize)
1354 UP.Threshold = std::max(a: UP.Threshold, b: LoopSize + 1);
1355
1356 if (UCE.NumInlineCandidates != 0) {
1357 LLVM_DEBUG(dbgs().indent(1)
1358 << "Not unrolling loop with inlinable calls.\n");
1359 if (TM & TM_ForcedByUser) {
1360 ORE.emit(RemarkBuilder: [&]() {
1361 return OptimizationRemarkMissed(DEBUG_TYPE,
1362 "InlineCandidatesPreventUnroll",
1363 L->getStartLoc(), L->getHeader())
1364 << "unable to unroll loop: contains inlinable calls";
1365 });
1366 }
1367 return LoopUnrollResult::Unmodified;
1368 }
1369
1370 // Find the smallest exact trip count for any exit. This is an upper bound
1371 // on the loop trip count, but an exit at an earlier iteration is still
1372 // possible. An unroll by the smallest exact trip count guarantees that all
1373 // branches relating to at least one exit can be eliminated. This is unlike
1374 // the max trip count, which only guarantees that the backedge can be broken.
1375 unsigned TripCount = 0;
1376 unsigned TripMultiple = 1;
1377 SmallVector<BasicBlock *, 8> ExitingBlocks;
1378 L->getExitingBlocks(ExitingBlocks);
1379 for (BasicBlock *ExitingBlock : ExitingBlocks)
1380 if (unsigned TC = SE.getSmallConstantTripCount(L, ExitingBlock))
1381 if (!TripCount || TC < TripCount)
1382 TripCount = TripMultiple = TC;
1383
1384 if (!TripCount) {
1385 // If no exact trip count is known, determine the trip multiple of either
1386 // the loop latch or the single exiting block.
1387 // TODO: Relax for multiple exits.
1388 BasicBlock *ExitingBlock = L->getLoopLatch();
1389 if (!ExitingBlock || !L->isLoopExiting(BB: ExitingBlock))
1390 ExitingBlock = L->getExitingBlock();
1391 if (ExitingBlock)
1392 TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
1393 }
1394
1395 // If the loop contains a convergent operation, the prelude we'd add
1396 // to do the first few instructions before we hit the unrolled loop
1397 // is unsafe -- it adds a control-flow dependency to the convergent
1398 // operation. Therefore restrict remainder loop (try unrolling without).
1399 UP.AllowRemainder &= UCE.ConvergenceAllowsRuntime;
1400
1401 // Try to find the trip count upper bound if we cannot find the exact trip
1402 // count.
1403 unsigned MaxTripCount = 0;
1404 bool MaxOrZero = false;
1405 if (!TripCount) {
1406 MaxTripCount = SE.getSmallConstantMaxTripCount(L);
1407 MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
1408 }
1409
1410 // computeUnrollCount() decides whether it is beneficial to use upper bound to
1411 // fully unroll the loop.
1412 computeUnrollCount(L, TTI, DT, LI, AC: &AC, SE, EphValues, ORE: &ORE, TripCount,
1413 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP);
1414 if (!UP.Count) {
1415 LLVM_DEBUG(dbgs().indent(1)
1416 << "Not unrolling: no viable strategy found.\n");
1417 if (TM & TM_ForcedByUser) {
1418 ORE.emit(RemarkBuilder: [&]() {
1419 return OptimizationRemarkMissed(DEBUG_TYPE, "NoUnrollStrategy",
1420 L->getStartLoc(), L->getHeader())
1421 << "unable to unroll loop: no viable unroll count found";
1422 });
1423 }
1424 return LoopUnrollResult::Unmodified;
1425 }
1426
1427 UP.Runtime &= UCE.ConvergenceAllowsRuntime;
1428
1429 if (PP.PeelCount) {
1430 assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step");
1431 LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()
1432 << " with iteration count " << PP.PeelCount << "!\n");
1433 ORE.emit(RemarkBuilder: [&]() {
1434 return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
1435 L->getHeader())
1436 << "peeled loop by " << ore::NV("PeelCount", PP.PeelCount)
1437 << " iterations";
1438 });
1439
1440 ValueToValueMapTy VMap;
1441 peelLoop(L, PeelCount: PP.PeelCount, PeelLast: PP.PeelLast, LI, SE: &SE, DT, AC: &AC, PreserveLCSSA,
1442 VMap);
1443 simplifyLoopAfterUnroll(L, SimplifyIVs: true, LI, SE: &SE, DT: &DT, AC: &AC, TTI: &TTI, Blocks: L->getBlocks(),
1444 AA: nullptr);
1445 // If the loop was peeled, we already "used up" the profile information
1446 // we had, so we don't want to unroll or peel again.
1447 if (PP.PeelProfiledIterations)
1448 L->setLoopAlreadyUnrolled();
1449 return LoopUnrollResult::PartiallyUnrolled;
1450 }
1451
1452 // Do not attempt partial/runtime unrolling in FullLoopUnrolling
1453 if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) ||
1454 UP.Count < TripCount || UP.Count < MaxTripCount)) {
1455 LLVM_DEBUG(dbgs().indent(1)
1456 << "Not attempting partial/runtime unroll in FullLoopUnroll.\n");
1457 return LoopUnrollResult::Unmodified;
1458 }
1459
1460 // At this point, UP.Runtime indicates that run-time unrolling is allowed.
1461 // However, we only want to actually perform it if we don't know the trip
1462 // count and the unroll count doesn't divide the known trip multiple.
1463 // TODO: This decision should probably be pushed up into
1464 // computeUnrollCount().
1465 UP.Runtime &= TripCount == 0 && TripMultiple % UP.Count != 0;
1466
1467 // Save loop properties before it is transformed.
1468 MDNode *OrigLoopID = L->getLoopID();
1469 UnrollPragmaInfo PInfo(L);
1470 DebugLoc LoopStartLoc = L->getStartLoc();
1471 BasicBlock *LoopHeader = L->getHeader();
1472
1473 // Unroll the loop.
1474 Loop *RemainderLoop = nullptr;
1475 UnrollLoopOptions ULO;
1476 ULO.Count = UP.Count;
1477 ULO.Force = UP.Force;
1478 ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount;
1479 ULO.UnrollRemainder = UP.UnrollRemainder;
1480 ULO.Runtime = UP.Runtime;
1481 ULO.ForgetAllSCEV = ForgetAllSCEV;
1482 ULO.Heart = getLoopConvergenceHeart(TheLoop: L);
1483 ULO.SCEVExpansionBudget = UP.SCEVExpansionBudget;
1484 ULO.RuntimeUnrollMultiExit = UP.RuntimeUnrollMultiExit;
1485 ULO.AddAdditionalAccumulators = UP.AddAdditionalAccumulators;
1486 LoopUnrollResult UnrollResult = UnrollLoop(
1487 L, ULO, LI, SE: &SE, DT: &DT, AC: &AC, TTI: &TTI, ORE: &ORE, PreserveLCSSA, RemainderLoop: &RemainderLoop, AA);
1488 if (UnrollResult == LoopUnrollResult::Unmodified) {
1489 if (PInfo.ExplicitUnroll) {
1490 LLVM_DEBUG(dbgs().indent(1)
1491 << "Failed to unroll loop as explicitly requested.\n");
1492 ORE.emit(RemarkBuilder: [&]() {
1493 return OptimizationRemarkMissed(DEBUG_TYPE, "FailedToUnrollAsRequested",
1494 LoopStartLoc, LoopHeader)
1495 << "failed to unroll loop as explicitly requested";
1496 });
1497 }
1498 return LoopUnrollResult::Unmodified;
1499 }
1500
1501 if (PInfo.PragmaFullUnroll && ULO.Count != TripCount) {
1502 ORE.emit(RemarkBuilder: [&]() {
1503 return OptimizationRemarkMissed(DEBUG_TYPE, "FullUnrollAsDirectedFailed",
1504 LoopStartLoc, LoopHeader)
1505 << "unable to fully unroll loop as directed; "
1506 << "unrolled by factor " << ore::NV("UnrollCount", ULO.Count);
1507 });
1508 }
1509 if (PInfo.PragmaCount > 0 && ULO.Count != PInfo.PragmaCount) {
1510 ORE.emit(RemarkBuilder: [&]() {
1511 return OptimizationRemarkMissed(DEBUG_TYPE, "UnrollCountDiffers",
1512 LoopStartLoc, LoopHeader)
1513 << "unable to unroll loop with requested count "
1514 << ore::NV("RequestedCount", PInfo.PragmaCount)
1515 << "; unrolled by factor " << ore::NV("UnrollCount", ULO.Count);
1516 });
1517 }
1518
1519 if (RemainderLoop) {
1520 std::optional<MDNode *> RemainderLoopID =
1521 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopUnrollFollowupAll,
1522 LLVMLoopUnrollFollowupRemainder});
1523 if (RemainderLoopID)
1524 RemainderLoop->setLoopID(*RemainderLoopID);
1525 }
1526
1527 if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
1528 std::optional<MDNode *> NewLoopID =
1529 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopUnrollFollowupAll,
1530 LLVMLoopUnrollFollowupUnrolled});
1531 if (NewLoopID) {
1532 L->setLoopID(*NewLoopID);
1533
1534 // Do not setLoopAlreadyUnrolled if loop attributes have been specified
1535 // explicitly.
1536 return UnrollResult;
1537 }
1538 }
1539
1540 // If loop has an unroll count pragma or unrolled by explicitly set count
1541 // mark loop as unrolled to prevent unrolling beyond that requested.
1542 if (UnrollResult != LoopUnrollResult::FullyUnrolled && PInfo.ExplicitUnroll)
1543 L->setLoopAlreadyUnrolled();
1544
1545 return UnrollResult;
1546}
1547
1548namespace {
1549
1550class LoopUnroll : public LoopPass {
1551public:
1552 static char ID; // Pass ID, replacement for typeid
1553
1554 int OptLevel;
1555
1556 /// If false, use a cost model to determine whether unrolling of a loop is
1557 /// profitable. If true, only loops that explicitly request unrolling via
1558 /// metadata are considered. All other loops are skipped.
1559 bool OnlyWhenForced;
1560
1561 /// If false, when SCEV is invalidated, only forget everything in the
1562 /// top-most loop (call forgetTopMostLoop), of the loop being processed.
1563 /// Otherwise, forgetAllLoops and rebuild when needed next.
1564 bool ForgetAllSCEV;
1565
1566 std::optional<unsigned> ProvidedCount;
1567 std::optional<unsigned> ProvidedThreshold;
1568 std::optional<bool> ProvidedAllowPartial;
1569 std::optional<bool> ProvidedRuntime;
1570 std::optional<bool> ProvidedUpperBound;
1571 std::optional<bool> ProvidedAllowPeeling;
1572 std::optional<bool> ProvidedAllowProfileBasedPeeling;
1573 std::optional<unsigned> ProvidedFullUnrollMaxCount;
1574
1575 LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
1576 bool ForgetAllSCEV = false,
1577 std::optional<unsigned> Threshold = std::nullopt,
1578 std::optional<unsigned> Count = std::nullopt,
1579 std::optional<bool> AllowPartial = std::nullopt,
1580 std::optional<bool> Runtime = std::nullopt,
1581 std::optional<bool> UpperBound = std::nullopt,
1582 std::optional<bool> AllowPeeling = std::nullopt,
1583 std::optional<bool> AllowProfileBasedPeeling = std::nullopt,
1584 std::optional<unsigned> ProvidedFullUnrollMaxCount = std::nullopt)
1585 : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
1586 ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
1587 ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
1588 ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
1589 ProvidedAllowPeeling(AllowPeeling),
1590 ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
1591 ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
1592 initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
1593 }
1594
1595 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
1596 if (skipLoop(L))
1597 return false;
1598
1599 Function &F = *L->getHeader()->getParent();
1600
1601 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1602 LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1603 ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1604 const TargetTransformInfo &TTI =
1605 getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1606 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1607 UniformityInfo *UI =
1608 TTI.hasBranchDivergence(F: &F)
1609 ? &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo()
1610 : nullptr;
1611 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
1612 // pass. Function analyses need to be preserved across loop transformations
1613 // but ORE cannot be preserved (see comment before the pass definition).
1614 OptimizationRemarkEmitter ORE(&F);
1615 bool PreserveLCSSA = mustPreserveAnalysisID(AID&: LCSSAID);
1616
1617 LoopUnrollResult Result = tryToUnrollLoop(
1618 L, DT, LI, SE, TTI, AC, ORE, BFI: nullptr, PSI: nullptr, PreserveLCSSA, OptLevel,
1619 /*OnlyFullUnroll*/ false, OnlyWhenForced, ForgetAllSCEV, ProvidedCount,
1620 ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
1621 ProvidedUpperBound, ProvidedAllowPeeling,
1622 ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount, UI);
1623
1624 if (Result == LoopUnrollResult::FullyUnrolled)
1625 LPM.markLoopAsDeleted(L&: *L);
1626
1627 return Result != LoopUnrollResult::Unmodified;
1628 }
1629
1630 /// This transformation requires natural loop information & requires that
1631 /// loop preheaders be inserted into the CFG...
1632 void getAnalysisUsage(AnalysisUsage &AU) const override {
1633 AU.addRequired<AssumptionCacheTracker>();
1634 AU.addRequired<TargetTransformInfoWrapperPass>();
1635 AU.addRequired<UniformityInfoWrapperPass>();
1636 // FIXME: Loop passes are required to preserve domtree, and for now we just
1637 // recreate dom info if anything gets unrolled.
1638 getLoopAnalysisUsage(AU);
1639 }
1640};
1641
1642} // end anonymous namespace
1643
1644char LoopUnroll::ID = 0;
1645
1646INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1647INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1648INITIALIZE_PASS_DEPENDENCY(LoopPass)
1649INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1650INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
1651INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1652
1653Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
1654 bool ForgetAllSCEV, int Threshold, int Count,
1655 int AllowPartial, int Runtime, int UpperBound,
1656 int AllowPeeling) {
1657 // TODO: It would make more sense for this function to take the optionals
1658 // directly, but that's dangerous since it would silently break out of tree
1659 // callers.
1660 return new LoopUnroll(
1661 OptLevel, OnlyWhenForced, ForgetAllSCEV,
1662 Threshold == -1 ? std::nullopt : std::optional<unsigned>(Threshold),
1663 Count == -1 ? std::nullopt : std::optional<unsigned>(Count),
1664 AllowPartial == -1 ? std::nullopt : std::optional<bool>(AllowPartial),
1665 Runtime == -1 ? std::nullopt : std::optional<bool>(Runtime),
1666 UpperBound == -1 ? std::nullopt : std::optional<bool>(UpperBound),
1667 AllowPeeling == -1 ? std::nullopt : std::optional<bool>(AllowPeeling));
1668}
1669
1670PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
1671 LoopStandardAnalysisResults &AR,
1672 LPMUpdater &Updater) {
1673 // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
1674 // pass. Function analyses need to be preserved across loop transformations
1675 // but ORE cannot be preserved (see comment before the pass definition).
1676 OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
1677
1678 // Keep track of the previous loop structure so we can identify new loops
1679 // created by unrolling.
1680 Loop *ParentL = L.getParentLoop();
1681 SmallPtrSet<Loop *, 4> OldLoops;
1682 if (ParentL)
1683 OldLoops.insert_range(R&: *ParentL);
1684 else
1685 OldLoops.insert_range(R&: AR.LI);
1686
1687 std::string LoopName = std::string(L.getName());
1688
1689 bool Changed =
1690 tryToUnrollLoop(L: &L, DT&: AR.DT, LI: &AR.LI, SE&: AR.SE, TTI: AR.TTI, AC&: AR.AC, ORE,
1691 /*BFI*/ nullptr, /*PSI*/ nullptr,
1692 /*PreserveLCSSA*/ true, OptLevel, /*OnlyFullUnroll*/ true,
1693 OnlyWhenForced, ForgetAllSCEV: ForgetSCEV, /*Count*/ ProvidedCount: std::nullopt,
1694 /*Threshold*/ ProvidedThreshold: std::nullopt, /*AllowPartial*/ ProvidedAllowPartial: false,
1695 /*Runtime*/ ProvidedRuntime: false, /*UpperBound*/ ProvidedUpperBound: false,
1696 /*AllowPeeling*/ ProvidedAllowPeeling: true,
1697 /*AllowProfileBasedPeeling*/ ProvidedAllowProfileBasedPeeling: false,
1698 /*FullUnrollMaxCount*/ ProvidedFullUnrollMaxCount: std::nullopt) !=
1699 LoopUnrollResult::Unmodified;
1700 if (!Changed)
1701 return PreservedAnalyses::all();
1702
1703 // The parent must not be damaged by unrolling!
1704#ifndef NDEBUG
1705 if (ParentL)
1706 ParentL->verifyLoop();
1707#endif
1708
1709 // Unrolling can do several things to introduce new loops into a loop nest:
1710 // - Full unrolling clones child loops within the current loop but then
1711 // removes the current loop making all of the children appear to be new
1712 // sibling loops.
1713 //
1714 // When a new loop appears as a sibling loop after fully unrolling,
1715 // its nesting structure has fundamentally changed and we want to revisit
1716 // it to reflect that.
1717 //
1718 // When unrolling has removed the current loop, we need to tell the
1719 // infrastructure that it is gone.
1720 //
1721 // Finally, we support a debugging/testing mode where we revisit child loops
1722 // as well. These are not expected to require further optimizations as either
1723 // they or the loop they were cloned from have been directly visited already.
1724 // But the debugging mode allows us to check this assumption.
1725 bool IsCurrentLoopValid = false;
1726 SmallVector<Loop *, 4> SibLoops;
1727 if (ParentL)
1728 SibLoops.append(in_start: ParentL->begin(), in_end: ParentL->end());
1729 else
1730 SibLoops.append(in_start: AR.LI.begin(), in_end: AR.LI.end());
1731 erase_if(C&: SibLoops, P: [&](Loop *SibLoop) {
1732 if (SibLoop == &L) {
1733 IsCurrentLoopValid = true;
1734 return true;
1735 }
1736
1737 // Otherwise erase the loop from the list if it was in the old loops.
1738 return OldLoops.contains(Ptr: SibLoop);
1739 });
1740 Updater.addSiblingLoops(NewSibLoops: SibLoops);
1741
1742 if (!IsCurrentLoopValid) {
1743 Updater.markLoopAsDeleted(L, Name: LoopName);
1744 } else {
1745 // We can only walk child loops if the current loop remained valid.
1746 if (UnrollRevisitChildLoops) {
1747 // Walk *all* of the child loops.
1748 SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
1749 Updater.addChildLoops(NewChildLoops: ChildLoops);
1750 }
1751 }
1752
1753 return getLoopPassPreservedAnalyses();
1754}
1755
1756PreservedAnalyses LoopUnrollPass::run(Function &F,
1757 FunctionAnalysisManager &AM) {
1758 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
1759 // There are no loops in the function. Return before computing other expensive
1760 // analyses.
1761 if (LI.empty())
1762 return PreservedAnalyses::all();
1763 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
1764 auto &TTI = AM.getResult<TargetIRAnalysis>(IR&: F);
1765 auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
1766 auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
1767 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
1768 AAResults &AA = AM.getResult<AAManager>(IR&: F);
1769
1770 UniformityInfo *UI = TTI.hasBranchDivergence(F: &F)
1771 ? &AM.getResult<UniformityInfoAnalysis>(IR&: F)
1772 : nullptr;
1773
1774 LoopAnalysisManager *LAM = nullptr;
1775 if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(IR&: F))
1776 LAM = &LAMProxy->getManager();
1777
1778 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
1779 ProfileSummaryInfo *PSI =
1780 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
1781 auto *BFI = (PSI && PSI->hasProfileSummary()) ?
1782 &AM.getResult<BlockFrequencyAnalysis>(IR&: F) : nullptr;
1783
1784 bool Changed = false;
1785
1786 // The unroller requires loops to be in simplified form, and also needs LCSSA.
1787 // Since simplification may add new inner loops, it has to run before the
1788 // legality and profitability checks. This means running the loop unroller
1789 // will simplify all loops, regardless of whether anything end up being
1790 // unrolled.
1791 for (const auto &L : LI) {
1792 Changed |=
1793 simplifyLoop(L, DT: &DT, LI: &LI, SE: &SE, AC: &AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */);
1794 Changed |= formLCSSARecursively(L&: *L, DT, LI: &LI, SE: &SE);
1795 }
1796
1797 // Add the loop nests in the reverse order of LoopInfo. See method
1798 // declaration.
1799 SmallPriorityWorklist<Loop *, 4> Worklist;
1800 appendLoopsToWorklist(LI, Worklist);
1801
1802 while (!Worklist.empty()) {
1803 // Because the LoopInfo stores the loops in RPO, we walk the worklist
1804 // from back to front so that we work forward across the CFG, which
1805 // for unrolling is only needed to get optimization remarks emitted in
1806 // a forward order.
1807 Loop &L = *Worklist.pop_back_val();
1808#ifndef NDEBUG
1809 Loop *ParentL = L.getParentLoop();
1810#endif
1811
1812 // Check if the profile summary indicates that the profiled application
1813 // has a huge working set size, in which case we disable peeling to avoid
1814 // bloating it further.
1815 std::optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
1816 if (PSI && PSI->hasHugeWorkingSetSize())
1817 LocalAllowPeeling = false;
1818 std::string LoopName = std::string(L.getName());
1819 // The API here is quite complex to call and we allow to select some
1820 // flavors of unrolling during construction time (by setting UnrollOpts).
1821 LoopUnrollResult Result = tryToUnrollLoop(
1822 L: &L, DT, LI: &LI, SE, TTI, AC, ORE, BFI, PSI,
1823 /*PreserveLCSSA*/ true, OptLevel: UnrollOpts.OptLevel, /*OnlyFullUnroll*/ false,
1824 OnlyWhenForced: UnrollOpts.OnlyWhenForced, ForgetAllSCEV: UnrollOpts.ForgetSCEV,
1825 /*Count*/ ProvidedCount: std::nullopt,
1826 /*Threshold*/ ProvidedThreshold: std::nullopt, ProvidedAllowPartial: UnrollOpts.AllowPartial,
1827 ProvidedRuntime: UnrollOpts.AllowRuntime, ProvidedUpperBound: UnrollOpts.AllowUpperBound, ProvidedAllowPeeling: LocalAllowPeeling,
1828 ProvidedAllowProfileBasedPeeling: UnrollOpts.AllowProfileBasedPeeling, ProvidedFullUnrollMaxCount: UnrollOpts.FullUnrollMaxCount, UI,
1829 AA: &AA);
1830 Changed |= Result != LoopUnrollResult::Unmodified;
1831
1832 // The parent must not be damaged by unrolling!
1833#ifndef NDEBUG
1834 if (Result != LoopUnrollResult::Unmodified && ParentL)
1835 ParentL->verifyLoop();
1836#endif
1837
1838 // Clear any cached analysis results for L if we removed it completely.
1839 if (LAM && Result == LoopUnrollResult::FullyUnrolled)
1840 LAM->clear(IR&: L, Name: LoopName);
1841 }
1842
1843 if (!Changed)
1844 return PreservedAnalyses::all();
1845
1846 return getLoopPassPreservedAnalyses();
1847}
1848
1849void LoopUnrollPass::printPipeline(
1850 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
1851 static_cast<PassInfoMixin<LoopUnrollPass> *>(this)->printPipeline(
1852 OS, MapClassName2PassName);
1853 OS << '<';
1854 if (UnrollOpts.AllowPartial != std::nullopt)
1855 OS << (*UnrollOpts.AllowPartial ? "" : "no-") << "partial;";
1856 if (UnrollOpts.AllowPeeling != std::nullopt)
1857 OS << (*UnrollOpts.AllowPeeling ? "" : "no-") << "peeling;";
1858 if (UnrollOpts.AllowRuntime != std::nullopt)
1859 OS << (*UnrollOpts.AllowRuntime ? "" : "no-") << "runtime;";
1860 if (UnrollOpts.AllowUpperBound != std::nullopt)
1861 OS << (*UnrollOpts.AllowUpperBound ? "" : "no-") << "upperbound;";
1862 if (UnrollOpts.AllowProfileBasedPeeling != std::nullopt)
1863 OS << (*UnrollOpts.AllowProfileBasedPeeling ? "" : "no-")
1864 << "profile-peeling;";
1865 if (UnrollOpts.FullUnrollMaxCount != std::nullopt)
1866 OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << ';';
1867 OS << 'O' << UnrollOpts.OptLevel;
1868 OS << '>';
1869}
1870