LoopUnrollPass.cpp source code [llvm_projects/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp]

1	//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This pass implements a simple loop unroller. It works best when loops have
10	// been canonicalized by the -indvars pass, allowing it to determine the trip
11	// counts of loops easily.
12	//===----------------------------------------------------------------------===//
13
14	#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
15	#include "llvm/ADT/DenseMap.h"
16	#include "llvm/ADT/DenseMapInfo.h"
17	#include "llvm/ADT/DenseSet.h"
18	#include "llvm/ADT/STLExtras.h"
19	#include "llvm/ADT/SetVector.h"
20	#include "llvm/ADT/SmallPtrSet.h"
21	#include "llvm/ADT/SmallVector.h"
22	#include "llvm/ADT/StringRef.h"
23	#include "llvm/Analysis/AssumptionCache.h"
24	#include "llvm/Analysis/BlockFrequencyInfo.h"
25	#include "llvm/Analysis/CodeMetrics.h"
26	#include "llvm/Analysis/LoopAnalysisManager.h"
27	#include "llvm/Analysis/LoopInfo.h"
28	#include "llvm/Analysis/LoopPass.h"
29	#include "llvm/Analysis/LoopUnrollAnalyzer.h"
30	#include "llvm/Analysis/MemorySSA.h"
31	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
32	#include "llvm/Analysis/ProfileSummaryInfo.h"
33	#include "llvm/Analysis/ScalarEvolution.h"
34	#include "llvm/Analysis/TargetTransformInfo.h"
35	#include "llvm/IR/BasicBlock.h"
36	#include "llvm/IR/CFG.h"
37	#include "llvm/IR/Constant.h"
38	#include "llvm/IR/Constants.h"
39	#include "llvm/IR/DiagnosticInfo.h"
40	#include "llvm/IR/Dominators.h"
41	#include "llvm/IR/Function.h"
42	#include "llvm/IR/Instruction.h"
43	#include "llvm/IR/Instructions.h"
44	#include "llvm/IR/Metadata.h"
45	#include "llvm/IR/PassManager.h"
46	#include "llvm/InitializePasses.h"
47	#include "llvm/Pass.h"
48	#include "llvm/Support/Casting.h"
49	#include "llvm/Support/CommandLine.h"
50	#include "llvm/Support/Debug.h"
51	#include "llvm/Support/ErrorHandling.h"
52	#include "llvm/Support/raw_ostream.h"
53	#include "llvm/Transforms/Scalar.h"
54	#include "llvm/Transforms/Scalar/LoopPassManager.h"
55	#include "llvm/Transforms/Utils.h"
56	#include "llvm/Transforms/Utils/LoopPeel.h"
57	#include "llvm/Transforms/Utils/LoopSimplify.h"
58	#include "llvm/Transforms/Utils/LoopUtils.h"
59	#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
60	#include "llvm/Transforms/Utils/SizeOpts.h"
61	#include "llvm/Transforms/Utils/UnrollLoop.h"
62	#include <algorithm>
63	#include <cassert>
64	#include <cstdint>
65	#include <limits>
66	#include <optional>
67	#include <string>
68	#include <tuple>
69	#include <utility>
70
71	using namespace llvm;
72
73	#define DEBUG_TYPE "loop-unroll"
74
75	cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(
76	"forget-scev-loop-unroll", cl::init(Val: false), cl::Hidden,
77	cl::desc ("Forget everything in SCEV when doing LoopUnroll, instead of just"
78	" the current top-most loop. This is sometimes preferred to reduce"
79	" compile time."));
80
81	static cl::opt<unsigned>
82	UnrollThreshold("unroll-threshold", cl::Hidden,
83	cl::desc ("The cost threshold for loop unrolling"));
84
85	static cl::opt<unsigned>
86	UnrollOptSizeThreshold(
87	"unroll-optsize-threshold", cl::init(Val: `0`), cl::Hidden,
88	cl::desc ("The cost threshold for loop unrolling when optimizing for "
89	"size"));
90
91	static cl::opt<unsigned> UnrollPartialThreshold(
92	"unroll-partial-threshold", cl::Hidden,
93	cl::desc ("The cost threshold for partial loop unrolling"));
94
95	static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
96	"unroll-max-percent-threshold-boost", cl::init(Val: `400`), cl::Hidden,
97	cl::desc ("The maximum 'boost' (represented as a percentage >= 100) applied "
98	"to the threshold when aggressively unrolling a loop due to the "
99	"dynamic cost savings. If completely unrolling a loop will reduce "
100	"the total runtime from X to Y, we boost the loop unroll "
101	"threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
102	"X/Y). This limit avoids excessive code bloat."));
103
104	static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
105	"unroll-max-iteration-count-to-analyze", cl::init(Val: `10`), cl::Hidden,
106	cl::desc ("Don't allow loop unrolling to simulate more than this number of "
107	"iterations when checking full unroll profitability"));
108
109	static cl::opt<unsigned> UnrollCount(
110	"unroll-count", cl::Hidden,
111	cl::desc ("Use this unroll count for all loops including those with "
112	"unroll_count pragma values, for testing purposes"));
113
114	static cl::opt<unsigned> UnrollMaxCount(
115	"unroll-max-count", cl::Hidden,
116	cl::desc ("Set the max unroll count for partial and runtime unrolling, for"
117	"testing purposes"));
118
119	static cl::opt<unsigned> UnrollFullMaxCount(
120	"unroll-full-max-count", cl::Hidden,
121	cl::desc (
122	"Set the max unroll count for full unrolling, for testing purposes"));
123
124	static cl::opt<bool>
125	UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
126	cl::desc ("Allows loops to be partially unrolled until "
127	"-unroll-threshold loop size is reached."));
128
129	static cl::opt<bool> UnrollAllowRemainder(
130	"unroll-allow-remainder", cl::Hidden,
131	cl::desc ("Allow generation of a loop remainder (extra iterations) "
132	"when unrolling a loop."));
133
134	static cl::opt<bool>
135	UnrollRuntime("unroll-runtime", cl::Hidden,
136	cl::desc ("Unroll loops with run-time trip counts"));
137
138	static cl::opt<unsigned> UnrollMaxUpperBound(
139	"unroll-max-upperbound", cl::init(Val: `8`), cl::Hidden,
140	cl::desc (
141	"The max of trip count upper bound that is considered in unrolling"));
142
143	static cl::opt<unsigned> PragmaUnrollThreshold(
144	"pragma-unroll-threshold", cl::init(Val: `16` * `1024`), cl::Hidden,
145	cl::desc ("Unrolled size limit for loops with unroll metadata "
146	"(full, enable, or count)."));
147
148	static cl::opt<unsigned> FlatLoopTripCountThreshold(
149	"flat-loop-tripcount-threshold", cl::init(Val: `5`), cl::Hidden,
150	cl::desc ("If the runtime tripcount for the loop is lower than the "
151	"threshold, the loop is considered as flat and will be less "
152	"aggressively unrolled."));
153
154	static cl::opt<bool> UnrollUnrollRemainder(
155	"unroll-remainder", cl::Hidden,
156	cl::desc ("Allow the loop remainder to be unrolled."));
157
158	// This option isn't ever intended to be enabled, it serves to allow
159	// experiments to check the assumptions about when this kind of revisit is
160	// necessary.
161	static cl::opt<bool> UnrollRevisitChildLoops(
162	"unroll-revisit-child-loops", cl::Hidden,
163	cl::desc ("Enqueue and re-visit child loops in the loop PM after unrolling. "
164	"This shouldn't typically be needed as child loops (or their "
165	"clones) were already visited."));
166
167	static cl::opt<unsigned> UnrollThresholdAggressive(
168	"unroll-threshold-aggressive", cl::init(Val: `300`), cl::Hidden,
169	cl::desc ("Threshold (max size of unrolled loop) to use in aggressive (O3) "
170	"optimizations"));
171	static cl::opt<unsigned>
172	UnrollThresholdDefault("unroll-threshold-default", cl::init(Val: `150`),
173	cl::Hidden,
174	cl::desc ("Default threshold (max size of unrolled "
175	"loop), used in all but O3 optimizations"));
176
177	static cl::opt<unsigned> PragmaUnrollFullMaxIterations(
178	"pragma-unroll-full-max-iterations", cl::init(Val: `1'000'000`), cl::Hidden,
179	cl::desc ("Maximum allowed iterations to unroll under pragma unroll full."));
180
/// A magic value for use with the Threshold parameter to indicate
/// that the loop unroll should be performed regardless of how much
/// code expansion would result. It is the largest representable unsigned.
static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
185
186	/// Gather the various unrolling parameters based on the defaults, compiler
187	/// flags, TTI overrides and user specified parameters.
188	TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
189	Loop L, ScalarEvolution &SE, const* TargetTransformInfo &TTI,
190	BlockFrequencyInfo BFI, ProfileSummaryInfo PSI,
191	OptimizationRemarkEmitter &ORE, int OptLevel,
192	std::optional<unsigned> UserThreshold, std::optional<unsigned> UserCount,
193	std::optional<bool> UserAllowPartial, std::optional<bool> UserRuntime,
194	std::optional<bool> UserUpperBound,
195	std::optional<unsigned> UserFullUnrollMaxCount) {
196	TargetTransformInfo::UnrollingPreferences UP;
197
198	// Set up the defaults
199	UP.Threshold =
200	OptLevel > `2` ? UnrollThresholdAggressive : UnrollThresholdDefault;
201	UP.MaxPercentThresholdBoost = `400`;
202	UP.OptSizeThreshold = UnrollOptSizeThreshold;
203	UP.PartialThreshold = `150`;
204	UP.PartialOptSizeThreshold = UnrollOptSizeThreshold;
205	UP.Count = `0`;
206	UP.DefaultUnrollRuntimeCount = `8`;
207	UP.MaxCount = std::numeric_limits<unsigned>::max();
208	UP.MaxUpperBound = UnrollMaxUpperBound;
209	UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
210	UP.BEInsns = `2`;
211	UP.Partial = false;
212	UP.Runtime = false;
213	UP.AllowRemainder = true;
214	UP.UnrollRemainder = false;
215	UP.AllowExpensiveTripCount = false;
216	UP.Force = false;
217	UP.UpperBound = false;
218	UP.UnrollAndJam = false;
219	UP.UnrollAndJamInnerLoopThreshold = `60`;
220	UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
221	UP.SCEVExpansionBudget = SCEVCheapExpansionBudget;
222	UP.RuntimeUnrollMultiExit = false;
223	UP.AddAdditionalAccumulators = false;
224
225	// Override with any target specific settings
226	TTI.getUnrollingPreferences(L, SE, UP, ORE: &ORE);
227
228	// Apply size attributes
229	bool OptForSize = L->getHeader()->getParent()->hasOptSize() \|\|
230	// Let unroll hints / pragmas take precedence over PGSO.
231	(hasUnrollTransformation(L) != TM_ForcedByUser &&
232	llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI,
233	QueryType: PGSOQueryType::IRPass));
234	if (OptForSize) {
235	UP.Threshold = UP.OptSizeThreshold;
236	UP.PartialThreshold = UP.PartialOptSizeThreshold;
237	UP.MaxPercentThresholdBoost = `100`;
238	}
239
240	// Apply any user values specified by cl::opt
241	if (UnrollThreshold.getNumOccurrences() > `0`)
242	UP.Threshold = UnrollThreshold;
243	if (UnrollPartialThreshold.getNumOccurrences() > `0`)
244	UP.PartialThreshold = UnrollPartialThreshold;
245	if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > `0`)
246	UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
247	if (UnrollMaxCount.getNumOccurrences() > `0`)
248	UP.MaxCount = UnrollMaxCount;
249	if (UnrollMaxUpperBound.getNumOccurrences() > `0`)
250	UP.MaxUpperBound = UnrollMaxUpperBound;
251	if (UnrollFullMaxCount.getNumOccurrences() > `0`)
252	UP.FullUnrollMaxCount = UnrollFullMaxCount;
253	if (UnrollAllowPartial.getNumOccurrences() > `0`)
254	UP.Partial = UnrollAllowPartial;
255	if (UnrollAllowRemainder.getNumOccurrences() > `0`)
256	UP.AllowRemainder = UnrollAllowRemainder;
257	if (UnrollRuntime.getNumOccurrences() > `0`)
258	UP.Runtime = UnrollRuntime;
259	if (UnrollMaxUpperBound == `0`)
260	UP.UpperBound = false;
261	if (UnrollUnrollRemainder.getNumOccurrences() > `0`)
262	UP.UnrollRemainder = UnrollUnrollRemainder;
263	if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > `0`)
264	UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
265
266	// Apply user values provided by argument
267	if (UserThreshold) {
268	UP.Threshold = *UserThreshold;
269	UP.PartialThreshold = *UserThreshold;
270	}
271	if (UserCount)
272	UP.Count = *UserCount;
273	if (UserAllowPartial)
274	UP.Partial = *UserAllowPartial;
275	if (UserRuntime)
276	UP.Runtime = *UserRuntime;
277	if (UserUpperBound)
278	UP.UpperBound = *UserUpperBound;
279	if (UserFullUnrollMaxCount)
280	UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
281
282	return UP;
283	}
284
285	namespace {
286
287	/// A struct to densely store the state of an instruction after unrolling at
288	/// each iteration.
289	///
290	/// This is designed to work like a tuple of <Instruction , int> for the*
291	/// purposes of hashing and lookup, but to be able to associate two boolean
292	/// states with each key.
293	struct UnrolledInstState {
294	Instruction *I;
295	int Iteration : `30`;
296	unsigned IsFree : `1`;
297	unsigned IsCounted : `1`;
298	};
299
300	/// Hashing and equality testing for a set of the instruction states.
301	struct UnrolledInstStateKeyInfo {
302	using PtrInfo = DenseMapInfo<Instruction *>;
303	using PairInfo = DenseMapInfo<std::pair<Instruction , int*>>;
304
305	static inline UnrolledInstState getEmptyKey() {
306	return {.I: PtrInfo::getEmptyKey(), .Iteration: `0`, .IsFree: `0`, .IsCounted: `0`};
307	}
308
309	static inline UnrolledInstState getTombstoneKey() {
310	return {.I: PtrInfo::getTombstoneKey(), .Iteration: `0`, .IsFree: `0`, .IsCounted: `0`};
311	}
312
313	static inline unsigned getHashValue(const UnrolledInstState &S) {
314	return PairInfo::getHashValue(PairVal: {S.I, S.Iteration});
315	}
316
317	static inline bool isEqual(const UnrolledInstState &LHS,
318	const UnrolledInstState &RHS) {
319	return PairInfo::isEqual(LHS: {LHS.I, LHS.Iteration}, RHS: {RHS.I, RHS.Iteration});
320	}
321	};
322
/// Pair of cost estimates produced by the full-unroll profitability analysis.
struct EstimatedUnrollCost {
  /// The estimated cost after unrolling.
  unsigned UnrolledCost;

  /// The estimated dynamic cost of executing the instructions in the
  /// rolled form.
  unsigned RolledDynamicCost;
};
331
332	} // end anonymous namespace
333
334	/// Figure out if the loop is worth full unrolling.
335	///
336	/// Complete loop unrolling can make some loads constant, and we need to know
337	/// if that would expose any further optimization opportunities. This routine
338	/// estimates this optimization. It computes cost of unrolled loop
339	/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
340	/// dynamic cost we mean that we won't count costs of blocks that are known not
341	/// to be executed (i.e. if we have a branch in the loop and we know that at the
342	/// given iteration its condition would be resolved to true, we won't add up the
343	/// cost of the 'false'-block).
344	/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
345	/// the analysis failed (no benefits expected from the unrolling, or the loop is
346	/// too big to analyze), the returned value is std::nullopt.
347	static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
348	const Loop L, unsigned* TripCount, DominatorTree &DT, ScalarEvolution &SE,
349	const SmallPtrSetImpl<const Value *> &EphValues,
350	const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
351	unsigned MaxIterationsCountToAnalyze) {
352	// We want to be able to scale offsets by the trip count and add more offsets
353	// to them without checking for overflows, and we already don't want to
354	// analyze massive* trip counts, so we force the max to be reasonably small.*
355	assert(MaxIterationsCountToAnalyze <
356	(unsigned)(std::numeric_limits<int>::max() / `2`) &&
357	"The unroll iterations max is too large!");
358
359	// Only analyze inner loops. We can't properly estimate cost of nested loops
360	// and we won't visit inner loops again anyway.
361	if (!L->isInnermost()) {
362	LLVM_DEBUG(dbgs().indent(`3`)
363	<< "Not analyzing loop cost: not an innermost loop.\n");
364	return std::nullopt;
365	}
366
367	// Don't simulate loops with a big or unknown tripcount
368	if (!TripCount \|\| TripCount > MaxIterationsCountToAnalyze) {
369	LLVM_DEBUG(dbgs().indent(`3`)
370	<< "Not analyzing loop cost: trip count "
371	<< (TripCount ? "too large" : "unknown") << ".\n");
372	return std::nullopt;
373	}
374
375	SmallSetVector<BasicBlock *, `16`> BBWorklist;
376	SmallSetVector<std::pair<BasicBlock , BasicBlock >, `4`> ExitWorklist;
377	DenseMap<Value , Value > SimplifiedValues;
378	SmallVector<std::pair<Value , Value >, `4`> SimplifiedInputValues;
379
380	// The estimated cost of the unrolled form of the loop. We try to estimate
381	// this by simplifying as much as we can while computing the estimate.
382	InstructionCost UnrolledCost = `0`;
383
384	// We also track the estimated dynamic (that is, actually executed) cost in
385	// the rolled form. This helps identify cases when the savings from unrolling
386	// aren't just exposing dead control flows, but actual reduced dynamic
387	// instructions due to the simplifications which we expect to occur after
388	// unrolling.
389	InstructionCost RolledDynamicCost = `0`;
390
391	// We track the simplification of each instruction in each iteration. We use
392	// this to recursively merge costs into the unrolled cost on-demand so that
393	// we don't count the cost of any dead code. This is essentially a map from
394	// <instruction, int> to <bool, bool>, but stored as a densely packed struct.
395	DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
396
397	// A small worklist used to accumulate cost of instructions from each
398	// observable and reached root in the loop.
399	SmallVector<Instruction *, `16`> CostWorklist;
400
401	// PHI-used worklist used between iterations while accumulating cost.
402	SmallVector<Instruction *, `4`> PHIUsedList;
403
404	// Helper function to accumulate cost for instructions in the loop.
405	auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
406	assert(Iteration >= `0` && "Cannot have a negative iteration!");
407	assert(CostWorklist.empty() && "Must start with an empty cost list");
408	assert(PHIUsedList.empty() && "Must start with an empty phi used list");
409	CostWorklist.push_back(Elt: &RootI);
410	TargetTransformInfo::TargetCostKind CostKind =
411	RootI.getFunction()->hasMinSize() ?
412	TargetTransformInfo::TCK_CodeSize :
413	TargetTransformInfo::TCK_SizeAndLatency;
414	for (;; --Iteration) {
415	do {
416	Instruction *I = CostWorklist.pop_back_val();
417
418	// InstCostMap only uses I and Iteration as a key, the other two values
419	// don't matter here.
420	auto CostIter = InstCostMap.find(V: {.I: I, .Iteration: Iteration, .IsFree: `0`, .IsCounted: `0`});
421	if (CostIter == InstCostMap.end())
422	// If an input to a PHI node comes from a dead path through the loop
423	// we may have no cost data for it here. What that actually means is
424	// that it is free.
425	continue;
426	auto &Cost = *CostIter;
427	if (Cost.IsCounted)
428	// Already counted this instruction.
429	continue;
430
431	// Mark that we are counting the cost of this instruction now.
432	Cost.IsCounted = true;
433
434	// If this is a PHI node in the loop header, just add it to the PHI set.
435	if (auto *PhiI = dyn_cast<PHINode>(Val: I))
436	if (PhiI->getParent() == L->getHeader()) {
437	assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
438	"inherently simplify during unrolling.");
439	if (Iteration == `0`)
440	continue;
441
442	// Push the incoming value from the backedge into the PHI used list
443	// if it is an in-loop instruction. We'll use this to populate the
444	// cost worklist for the next iteration (as we count backwards).
445	if (auto *OpI = dyn_cast<Instruction>(
446	Val: PhiI->getIncomingValueForBlock(BB: L->getLoopLatch())))
447	if (L->contains(Inst: OpI))
448	PHIUsedList.push_back(Elt: OpI);
449	continue;
450	}
451
452	// First accumulate the cost of this instruction.
453	if (!Cost.IsFree) {
454	// Consider simplified operands in instruction cost.
455	SmallVector<Value *, `4`> Operands;
456	transform(Range: I->operands(), d_first: std::back_inserter(x&: Operands),
457	F: [&](Value *Op) {
458	if (auto Res = SimplifiedValues.lookup(Val: Op))
459	return Res;
460	return Op;
461	});
462	UnrolledCost += TTI.getInstructionCost(U: I, Operands, CostKind);
463	LLVM_DEBUG(dbgs().indent(`3`)
464	<< "Adding cost of instruction (iteration " << Iteration
465	<< "): ");
466	LLVM_DEBUG(I->dump());
467	}
468
469	// We must count the cost of every operand which is not free,
470	// recursively. If we reach a loop PHI node, simply add it to the set
471	// to be considered on the next iteration (backwards!).
472	for (Value *Op : I->operands()) {
473	// Check whether this operand is free due to being a constant or
474	// outside the loop.
475	auto *OpI = dyn_cast<Instruction>(Val: Op);
476	if (!OpI \|\| !L->contains(Inst: OpI))
477	continue;
478
479	// Otherwise accumulate its cost.
480	CostWorklist.push_back(Elt: OpI);
481	}
482	} while (!CostWorklist.empty());
483
484	if (PHIUsedList.empty())
485	// We've exhausted the search.
486	break;
487
488	assert(Iteration > `0` &&
489	"Cannot track PHI-used values past the first iteration!");
490	CostWorklist.append(in_start: PHIUsedList.begin(), in_end: PHIUsedList.end());
491	PHIUsedList.clear();
492	}
493	};
494
495	// Ensure that we don't violate the loop structure invariants relied on by
496	// this analysis.
497	assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
498	assert(L->isLCSSAForm(DT) &&
499	"Must have loops in LCSSA form to track live-out values.");
500
501	LLVM_DEBUG(dbgs().indent(`3`)
502	<< "Starting LoopUnroll profitability analysis...\n");
503
504	TargetTransformInfo::TargetCostKind CostKind =
505	L->getHeader()->getParent()->hasMinSize() ?
506	TargetTransformInfo::TCK_CodeSize : TargetTransformInfo::TCK_SizeAndLatency;
507	// Simulate execution of each iteration of the loop counting instructions,
508	// which would be simplified.
509	// Since the same load will take different values on different iterations,
510	// we literally have to go through all loop's iterations.
511	for (unsigned Iteration = `0`; Iteration < TripCount; ++Iteration) {
512	LLVM_DEBUG(dbgs().indent(`3`) << "Analyzing iteration " << Iteration << "\n");
513
514	// Prepare for the iteration by collecting any simplified entry or backedge
515	// inputs.
516	for (Instruction &I : *L->getHeader()) {
517	auto *PHI = dyn_cast<PHINode>(Val: &I);
518	if (!PHI)
519	break;
520
521	// The loop header PHI nodes must have exactly two input: one from the
522	// loop preheader and one from the loop latch.
523	assert(
524	PHI->getNumIncomingValues() == `2` &&
525	"Must have an incoming value only for the preheader and the latch.");
526
527	Value *V = PHI->getIncomingValueForBlock(
528	BB: Iteration == `0` ? L->getLoopPreheader() : L->getLoopLatch());
529	if (Iteration != `0` && SimplifiedValues.count(Val: V))
530	V = SimplifiedValues.lookup(Val: V);
531	SimplifiedInputValues.push_back(Elt: {PHI, V});
532	}
533
534	// Now clear and re-populate the map for the next iteration.
535	SimplifiedValues.clear();
536	while (!SimplifiedInputValues.empty())
537	SimplifiedValues.insert(KV: SimplifiedInputValues.pop_back_val());
538
539	UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
540
541	BBWorklist.clear();
542	BBWorklist.insert(X: L->getHeader());
543	// Note that we must not* cache the size, this loop grows the worklist.*
544	for (unsigned Idx = `0`; Idx != BBWorklist.size(); ++Idx) {
545	BasicBlock *BB = BBWorklist [Idx];
546
547	// Visit all instructions in the given basic block and try to simplify
548	// it. We don't change the actual IR, just count optimization
549	// opportunities.
550	for (Instruction &I : *BB) {
551	// These won't get into the final code - don't even try calculating the
552	// cost for them.
553	if (EphValues.count(Ptr: &I))
554	continue;
555
556	// Track this instruction's expected baseline cost when executing the
557	// rolled loop form.
558	RolledDynamicCost += TTI.getInstructionCost(U: &I, CostKind);
559
560	// Visit the instruction to analyze its loop cost after unrolling,
561	// and if the visitor returns true, mark the instruction as free after
562	// unrolling and continue.
563	bool IsFree = Analyzer.visit(I);
564	bool Inserted = InstCostMap.insert(V: {.I: &I, .Iteration: (int)Iteration,
565	.IsFree: (unsigned)IsFree,
566	/IsCounted/ false}).second;
567	(void)Inserted;
568	assert(Inserted && "Cannot have a state for an unvisited instruction!");
569
570	if (IsFree)
571	continue;
572
573	// Can't properly model a cost of a call.
574	// FIXME: With a proper cost model we should be able to do it.
575	if (auto *CI = dyn_cast<CallInst>(Val: &I)) {
576	const Function *Callee = CI->getCalledFunction();
577	if (!Callee \|\| TTI.isLoweredToCall(F: Callee)) {
578	LLVM_DEBUG(dbgs().indent(`3`)
579	<< "Can't analyze cost of loop with call\n");
580	return std::nullopt;
581	}
582	}
583
584	// If the instruction might have a side-effect recursively account for
585	// the cost of it and all the instructions leading up to it.
586	if (I.mayHaveSideEffects())
587	AddCostRecursively (I, Iteration);
588
589	// If unrolled body turns out to be too big, bail out.
590	if (UnrolledCost > MaxUnrolledLoopSize) {
591	LLVM_DEBUG({
592	dbgs().indent(`3`) << "Exceeded threshold.. exiting.\n";
593	dbgs().indent(`3`)
594	<< "UnrolledCost: " << UnrolledCost
595	<< ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize << "\n";
596	});
597	return std::nullopt;
598	}
599	}
600
601	Instruction *TI = BB->getTerminator();
602
603	auto getSimplifiedConstant = [&](Value V) -> Constant {
604	if (SimplifiedValues.count(Val: V))
605	V = SimplifiedValues.lookup(Val: V);
606	return dyn_cast<Constant>(Val: V);
607	};
608
609	// Add in the live successors by first checking whether we have terminator
610	// that may be simplified based on the values simplified by this call.
611	BasicBlock KnownSucc = nullptr*;
612	if (CondBrInst *BI = dyn_cast<CondBrInst>(Val: TI)) {
613	if (auto *SimpleCond = getSimplifiedConstant (BI->getCondition())) {
614	// Just take the first successor if condition is undef
615	if (isa<UndefValue>(Val: SimpleCond))
616	KnownSucc = BI->getSuccessor(i: `0`);
617	else if (ConstantInt *SimpleCondVal =
618	dyn_cast<ConstantInt>(Val: SimpleCond))
619	KnownSucc = BI->getSuccessor(i: SimpleCondVal->isZero() ? `1` : `0`);
620	}
621	} else if (SwitchInst *SI = dyn_cast<SwitchInst>(Val: TI)) {
622	if (auto *SimpleCond = getSimplifiedConstant (SI->getCondition())) {
623	// Just take the first successor if condition is undef
624	if (isa<UndefValue>(Val: SimpleCond))
625	KnownSucc = SI->getSuccessor(idx: `0`);
626	else if (ConstantInt *SimpleCondVal =
627	dyn_cast<ConstantInt>(Val: SimpleCond))
628	KnownSucc = SI->findCaseValue(C: SimpleCondVal)->getCaseSuccessor();
629	}
630	}
631	if (KnownSucc) {
632	if (L->contains(BB: KnownSucc))
633	BBWorklist.insert(X: KnownSucc);
634	else
635	ExitWorklist.insert(X: {BB, KnownSucc});
636	continue;
637	}
638
639	// Add BB's successors to the worklist.
640	for (BasicBlock *Succ : successors(BB))
641	if (L->contains(BB: Succ))
642	BBWorklist.insert(X: Succ);
643	else
644	ExitWorklist.insert(X: {BB, Succ});
645	AddCostRecursively (*TI, Iteration);
646	}
647
648	// If we found no optimization opportunities on the first iteration, we
649	// won't find them on later ones too.
650	if (UnrolledCost == RolledDynamicCost) {
651	LLVM_DEBUG({
652	dbgs().indent(`3`) << "No opportunities found.. exiting.\n";
653	dbgs().indent(`3`) << "UnrolledCost: " << UnrolledCost << "\n";
654	});
655	return std::nullopt;
656	}
657	}
658
659	while (!ExitWorklist.empty()) {
660	BasicBlock ExitingBB, ExitBB;
661	std::tie(args&: ExitingBB, args&: ExitBB) = ExitWorklist.pop_back_val();
662
663	for (Instruction &I : *ExitBB) {
664	auto *PN = dyn_cast<PHINode>(Val: &I);
665	if (!PN)
666	break;
667
668	Value *Op = PN->getIncomingValueForBlock(BB: ExitingBB);
669	if (auto *OpI = dyn_cast<Instruction>(Val: Op))
670	if (L->contains(Inst: OpI))
671	AddCostRecursively (*OpI, TripCount - `1`);
672	}
673	}
674
675	assert(UnrolledCost.isValid() && RolledDynamicCost.isValid() &&
676	"All instructions must have a valid cost, whether the "
677	"loop is rolled or unrolled.");
678
679	LLVM_DEBUG({
680	dbgs().indent(`3`) << "Analysis finished:\n";
681	dbgs().indent(`3`) << "UnrolledCost: " << UnrolledCost
682	<< ", RolledDynamicCost: " << RolledDynamicCost << "\n";
683	});
684	return {{.UnrolledCost: unsigned(UnrolledCost.getValue()),
685	.RolledDynamicCost: unsigned(RolledDynamicCost.getValue())}};
686	}
687
688	UnrollCostEstimator::UnrollCostEstimator(
689	const Loop L, const* TargetTransformInfo &TTI,
690	const SmallPtrSetImpl<const Value > &EphValues, unsigned* BEInsns) {
691	CodeMetrics Metrics;
692	for (BasicBlock *BB : L->blocks())
693	Metrics.analyzeBasicBlock(BB, TTI, EphValues, / PrepareForLTO= / false,
694	L);
695	NumInlineCandidates = Metrics.NumInlineCandidates;
696	NotDuplicatable = Metrics.notDuplicatable;
697	Convergence = Metrics.Convergence;
698	LoopSize = Metrics.NumInsts;
699	ConvergenceAllowsRuntime =
700	Metrics.Convergence != ConvergenceKind::Uncontrolled &&
701	!getLoopConvergenceHeart(TheLoop: L);
702
703	// Don't allow an estimate of size zero. This would allows unrolling of loops
704	// with huge iteration counts, which is a compile time problem even if it's
705	// not a problem for code quality. Also, the code using this size may assume
706	// that each loop has at least three instructions (likely a conditional
707	// branch, a comparison feeding that branch, and some kind of loop increment
708	// feeding that comparison instruction).
709	if (LoopSize.isValid() && LoopSize < BEInsns + `1`)
710	// This is an open coded max() on InstructionCost
711	LoopSize = BEInsns + `1`;
712	}
713
714	bool UnrollCostEstimator::canUnroll() const {
715	if (Convergence == ConvergenceKind::ExtendedLoop) {
716	LLVM_DEBUG(dbgs().indent(`1`)
717	<< "Not unrolling: contains convergent operations.\n");
718	return false;
719	}
720	if (!LoopSize.isValid()) {
721	LLVM_DEBUG(dbgs().indent(`1`)
722	<< "Not unrolling: loop size could not be computed.\n");
723	return false;
724	}
725	if (NotDuplicatable) {
726	LLVM_DEBUG(dbgs().indent(`1`)
727	<< "Not unrolling: contains non-duplicatable instructions.\n");
728	return false;
729	}
730	return true;
731	}
732
733	uint64_t UnrollCostEstimator::getUnrolledLoopSize(
734	const TargetTransformInfo::UnrollingPreferences &UP,
735	unsigned CountOverwrite) const {
736	unsigned LS = LoopSize.getValue();
737	assert(LS >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
738	if (CountOverwrite)
739	return static_cast<uint64_t>(LS - UP.BEInsns) * CountOverwrite + UP.BEInsns;
740	else
741	return static_cast<uint64_t>(LS - UP.BEInsns) * UP.Count + UP.BEInsns;
742	}
743
744	// Returns true if the loop has an unroll(full) pragma.
745	static bool hasUnrollFullPragma(const Loop *L) {
746	return getUnrollMetadataForLoop(L, Name: "llvm.loop.unroll.full");
747	}
748
749	// Returns true if the loop has an unroll(enable) pragma. This metadata is used
750	// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
751	static bool hasUnrollEnablePragma(const Loop *L) {
752	return getUnrollMetadataForLoop(L, Name: "llvm.loop.unroll.enable");
753	}
754
755	// Returns true if the loop has an runtime unroll(disable) pragma.
756	static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
757	return getUnrollMetadataForLoop(L, Name: "llvm.loop.unroll.runtime.disable");
758	}
759
760	// If loop has an unroll_count pragma return the (necessarily
761	// positive) value from the pragma. Otherwise return 0.
762	static unsigned unrollCountPragmaValue(const Loop *L) {
763	MDNode *MD = getUnrollMetadataForLoop(L, Name: "llvm.loop.unroll.count");
764	if (MD) {
765	assert(MD->getNumOperands() == `2` &&
766	"Unroll count hint metadata should have two operands.");
767	unsigned Count =
768	mdconst::extract<ConstantInt>(MD: MD->getOperand(I: `1`))->getZExtValue();
769	assert(Count >= `1` && "Unroll count must be positive.");
770	return Count;
771	}
772	return `0`;
773	}
774
775	UnrollPragmaInfo::UnrollPragmaInfo(const Loop *L)
776	: UserUnrollCount(UnrollCount.getNumOccurrences() > `0`),
777	PragmaFullUnroll(hasUnrollFullPragma(L)),
778	PragmaCount(unrollCountPragmaValue(L)),
779	PragmaEnableUnroll(hasUnrollEnablePragma(L)),
780	PragmaRuntimeUnrollDisable(hasRuntimeUnrollDisablePragma(L)),
781	ExplicitUnroll(PragmaCount > `0` \|\| PragmaFullUnroll \|\|
782	PragmaEnableUnroll \|\| UserUnrollCount) {}
783
784	// Computes the boosting factor for complete unrolling.
785	// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
786	// be beneficial to fully unroll the loop even if unrolledcost is large. We
787	// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
788	// the unroll threshold.
789	static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
790	unsigned MaxPercentThresholdBoost) {
791	if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / `100`)
792	return `100`;
793	else if (Cost.UnrolledCost != `0`)
794	// The boosting factor is RolledDynamicCost / UnrolledCost
795	return std::min(a: `100` * Cost.RolledDynamicCost / Cost.UnrolledCost,
796	b: MaxPercentThresholdBoost);
797	else
798	return MaxPercentThresholdBoost;
799	}
800
801	static std::optional<unsigned>
802	shouldPragmaUnroll(Loop L, const* UnrollPragmaInfo &PInfo,
803	const unsigned TripMultiple, const unsigned TripCount,
804	unsigned MaxTripCount, const UnrollCostEstimator UCE,
805	const TargetTransformInfo::UnrollingPreferences &UP) {
806
807	// Using unroll pragma
808	// 1st priority is unroll count set by "unroll-count" option.
809
810	if (PInfo.UserUnrollCount) {
811	if (UP.AllowRemainder &&
812	UCE.getUnrolledLoopSize(UP, CountOverwrite: (unsigned)UnrollCount) < UP.Threshold) {
813	LLVM_DEBUG(dbgs().indent(`2`) << "Unrolling with user-specified count: "
814	<< UnrollCount << ".\n");
815	return (unsigned)UnrollCount;
816	}
817	LLVM_DEBUG(dbgs().indent(`2`)
818	<< "Not unrolling with user count " << UnrollCount << ": "
819	<< (UP.AllowRemainder ? "exceeds threshold"
820	: "remainder not allowed")
821	<< ".\n");
822	}
823
824	// 2nd priority is unroll count set by pragma.
825	if (PInfo.PragmaCount > `0`) {
826	if ((UP.AllowRemainder \|\| (TripMultiple % PInfo.PragmaCount == `0`))) {
827	LLVM_DEBUG(dbgs().indent(`2`) << "Unrolling with pragma count: "
828	<< PInfo.PragmaCount << ".\n");
829	return PInfo.PragmaCount;
830	}
831	LLVM_DEBUG(dbgs().indent(`2`)
832	<< "Not unrolling with pragma count " << PInfo.PragmaCount
833	<< ": remainder not allowed, count does not divide trip "
834	<< "multiple " << TripMultiple << ".\n");
835	}
836
837	if (PInfo.PragmaFullUnroll) {
838	if (TripCount != `0`) {
839	// Certain cases with UBSAN can cause trip count to be calculated as
840	// INT_MAX, Block full unrolling at a reasonable limit so that the
841	// compiler doesn't hang trying to unroll the loop. See PR77842
842	if (TripCount > PragmaUnrollFullMaxIterations) {
843	LLVM_DEBUG(dbgs().indent(`2`)
844	<< "Won't unroll; trip count is too large.\n");
845	return std::nullopt;
846	}
847
848	LLVM_DEBUG(dbgs().indent(`2`)
849	<< "Fully unrolling with trip count: " << TripCount << ".\n");
850	return TripCount;
851	}
852	LLVM_DEBUG(dbgs().indent(`2`)
853	<< "Not fully unrolling: unknown trip count.\n");
854	}
855
856	if (PInfo.PragmaEnableUnroll && !TripCount && MaxTripCount &&
857	MaxTripCount <= UP.MaxUpperBound) {
858	LLVM_DEBUG(dbgs().indent(`2`)
859	<< "Unrolling with max trip count: " << MaxTripCount << ".\n");
860	return MaxTripCount;
861	}
862
863	return std::nullopt;
864	}
865
866	static std::optional<unsigned> shouldFullUnroll(
867	Loop L, const* TargetTransformInfo &TTI, DominatorTree &DT,
868	ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
869	const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
870	const TargetTransformInfo::UnrollingPreferences &UP) {
871	assert(FullUnrollTripCount && "should be non-zero!");
872
873	if (FullUnrollTripCount > UP.FullUnrollMaxCount) {
874	LLVM_DEBUG(dbgs().indent(`2`)
875	<< "Not unrolling: trip count " << FullUnrollTripCount
876	<< " exceeds max count " << UP.FullUnrollMaxCount << ".\n");
877	return std::nullopt;
878	}
879
880	// When computing the unrolled size, note that BEInsns are not replicated
881	// like the rest of the loop body.
882	uint64_t UnrolledSize = UCE.getUnrolledLoopSize(UP);
883	if (UnrolledSize < UP.Threshold) {
884	LLVM_DEBUG(dbgs().indent(`2`) << "Unrolling: size " << UnrolledSize
885	<< " < threshold " << UP.Threshold << ".\n");
886	return FullUnrollTripCount;
887	}
888
889	LLVM_DEBUG(dbgs().indent(`2`)
890	<< "Unrolled size " << UnrolledSize << " exceeds threshold "
891	<< UP.Threshold << "; checking for cost benefit.\n");
892
893	// The loop isn't that small, but we still can fully unroll it if that
894	// helps to remove a significant number of instructions.
895	// To check that, run additional analysis on the loop.
896	if (std::optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
897	L, TripCount: FullUnrollTripCount, DT, SE, EphValues, TTI,
898	MaxUnrolledLoopSize: UP.Threshold * UP.MaxPercentThresholdBoost / `100`,
899	MaxIterationsCountToAnalyze: UP.MaxIterationsCountToAnalyze)) {
900	unsigned Boost =
901	getFullUnrollBoostingFactor(Cost: *Cost, MaxPercentThresholdBoost: UP.MaxPercentThresholdBoost);
902	unsigned BoostedThreshold = UP.Threshold * Boost / `100`;
903	if (Cost ->UnrolledCost < BoostedThreshold) {
904	LLVM_DEBUG(dbgs().indent(`2`) << "Profitable after cost analysis.\n");
905	return FullUnrollTripCount;
906	}
907	LLVM_DEBUG(dbgs().indent(`2`)
908	<< "Not unrolling: cost " << Cost->UnrolledCost
909	<< " >= boosted threshold " << BoostedThreshold << ".\n");
910	}
911
912	return std::nullopt;
913	}
914
915	static std::optional<unsigned>
916	shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
917	const UnrollCostEstimator UCE,
918	const TargetTransformInfo::UnrollingPreferences &UP) {
919
920	if (!TripCount)
921	return std::nullopt;
922
923	if (!UP.Partial) {
924	LLVM_DEBUG(dbgs().indent(`2`) << "Will not try to unroll partially because "
925	<< "-unroll-allow-partial not given\n");
926	return `0`;
927	}
928	unsigned count = UP.Count;
929	if (count == `0`)
930	count = TripCount;
931	if (UP.PartialThreshold != NoThreshold) {
932	// Reduce unroll count to be modulo of TripCount for partial unrolling.
933	if (UCE.getUnrolledLoopSize(UP, CountOverwrite: count) > UP.PartialThreshold) {
934	unsigned NewCount =
935	(std::max(a: UP.PartialThreshold, b: UP.BEInsns + `1`) - UP.BEInsns) /
936	(LoopSize - UP.BEInsns);
937	LLVM_DEBUG(dbgs().indent(`2`)
938	<< "Unrolled size exceeds threshold; reducing count "
939	<< "from " << count << " to " << NewCount << ".\n");
940	count = NewCount;
941	}
942	if (count > UP.MaxCount)
943	count = UP.MaxCount;
944	while (count != `0` && TripCount % count != `0`)
945	count--;
946	if (UP.AllowRemainder && count <= `1`) {
947	// If there is no Count that is modulo of TripCount, set Count to
948	// largest power-of-two factor that satisfies the threshold limit.
949	// As we'll create fixup loop, do the type of unrolling only if
950	// remainder loop is allowed.
951	// Note: DefaultUnrollRuntimeCount is used as a reasonable starting point
952	// even though this is partial unrolling (not runtime unrolling).
953	count = UP.DefaultUnrollRuntimeCount;
954	while (count != `0` &&
955	UCE.getUnrolledLoopSize(UP, CountOverwrite: count) > UP.PartialThreshold)
956	count >>= `1`;
957	}
958	if (count < `2`) {
959	LLVM_DEBUG(dbgs().indent(`2`)
960	<< "Will not partially unroll: no profitable count.\n");
961	count = `0`;
962	}
963	} else {
964	count = TripCount;
965	}
966	if (count > UP.MaxCount)
967	count = UP.MaxCount;
968
969	LLVM_DEBUG(dbgs().indent(`2`)
970	<< "Partially unrolling with count: " << count << "\n");
971
972	return count;
973	}
974	// Calculates unroll count and writes it to UP.Count.
975	// Unless IgnoreUser is true, will also use metadata and command-line options
976	// that are specific to the LoopUnroll pass (which, for instance, are
977	// irrelevant for the LoopUnrollAndJam pass).
978	// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
979	// many LoopUnroll-specific options. The shared functionality should be
980	// refactored into it own function.
981	void llvm::computeUnrollCount(Loop L, const* TargetTransformInfo &TTI,
982	DominatorTree &DT, LoopInfo *LI,
983	AssumptionCache *AC, ScalarEvolution &SE,
984	const SmallPtrSetImpl<const Value *> &EphValues,
985	OptimizationRemarkEmitter *ORE,
986	const unsigned TripCount,
987	const unsigned MaxTripCount, const bool MaxOrZero,
988	const unsigned TripMultiple,
989	const UnrollCostEstimator &UCE,
990	TargetTransformInfo::UnrollingPreferences &UP,
991	TargetTransformInfo::PeelingPreferences &PP) {
992
993	unsigned LoopSize = UCE.getRolledLoopSize();
994
995	LLVM_DEBUG(dbgs().indent(`1`) << "Computing unroll count: TripCount="
996	<< TripCount << ", MaxTripCount=" << MaxTripCount
997	<< (MaxOrZero ? " (MaxOrZero)" : "")
998	<< ", TripMultiple=" << TripMultiple << "\n");
999
1000	UnrollPragmaInfo PInfo(L);
1001	LLVM_DEBUG({
1002	if (PInfo.ExplicitUnroll) {
1003	dbgs().indent(`1`) << "Explicit unroll requested:";
1004	if (PInfo.UserUnrollCount)
1005	dbgs() << " user-count";
1006	if (PInfo.PragmaFullUnroll)
1007	dbgs() << " pragma-full";
1008	if (PInfo.PragmaCount > `0`)
1009	dbgs() << " pragma-count(" << PInfo.PragmaCount << ")";
1010	if (PInfo.PragmaEnableUnroll)
1011	dbgs() << " pragma-enable";
1012	dbgs() << "\n";
1013	}
1014	});
1015
1016	// Use an explicit peel count that has been specified for testing. In this
1017	// case it's not permitted to also specify an explicit unroll count.
1018	if (PP.PeelCount) {
1019	if (UnrollCount.getNumOccurrences() > `0`) {
1020	reportFatalUsageError(reason: "Cannot specify both explicit peel count and "
1021	"explicit unroll count");
1022	}
1023	LLVM_DEBUG(dbgs().indent(`2`)
1024	<< "Using explicit peel count: " << PP.PeelCount << ".\n");
1025	UP.Count = `1`;
1026	UP.Runtime = false;
1027	return;
1028	}
1029	// Check for explicit Count.
1030	// 1st priority is unroll count set by "unroll-count" option.
1031	// 2nd priority is unroll count set by pragma.
1032	LLVM_DEBUG(dbgs().indent(`1`) << "Trying pragma unroll...\n");
1033	if (auto UnrollFactor = shouldPragmaUnroll(L, PInfo, TripMultiple, TripCount,
1034	MaxTripCount, UCE, UP)) {
1035	UP.Count = *UnrollFactor;
1036
1037	if (PInfo.UserUnrollCount \|\| (PInfo.PragmaCount > `0`)) {
1038	UP.AllowExpensiveTripCount = true;
1039	UP.Force = true;
1040	}
1041	UP.Runtime \|= (PInfo.PragmaCount > `0`);
1042	return;
1043	} else {
1044	if (PInfo.ExplicitUnroll && TripCount != `0`) {
1045	// If the loop has an unrolling pragma, we want to be more aggressive with
1046	// unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
1047	// value which is larger than the default limits.
1048	UP.Threshold = std::max<unsigned>(a: UP.Threshold, b: PragmaUnrollThreshold);
1049	UP.PartialThreshold =
1050	std::max<unsigned>(a: UP.PartialThreshold, b: PragmaUnrollThreshold);
1051	}
1052	}
1053
1054	// 3rd priority is exact full unrolling. This will eliminate all copies
1055	// of some exit test.
1056	LLVM_DEBUG(dbgs().indent(`1`) << "Trying full unroll...\n");
1057	UP.Count = `0`;
1058	if (TripCount) {
1059	UP.Count = TripCount;
1060	if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
1061	FullUnrollTripCount: TripCount, UCE, UP)) {
1062	UP.Count = *UnrollFactor;
1063	return;
1064	}
1065	}
1066
1067	// 4th priority is bounded unrolling.
1068	// We can unroll by the upper bound amount if it's generally allowed or if
1069	// we know that the loop is executed either the upper bound or zero times.
1070	// (MaxOrZero unrolling keeps only the first loop test, so the number of
1071	// loop tests remains the same compared to the non-unrolled version, whereas
1072	// the generic upper bound unrolling keeps all but the last loop test so the
1073	// number of loop tests goes up which may end up being worse on targets with
1074	// constrained branch predictor resources so is controlled by an option.)
1075	// In addition we only unroll small upper bounds.
1076	// Note that the cost of bounded unrolling is always strictly greater than
1077	// cost of exact full unrolling. As such, if we have an exact count and
1078	// found it unprofitable, we'll never chose to bounded unroll.
1079	LLVM_DEBUG(dbgs().indent(`1`) << "Trying upper-bound unroll...\n");
1080	if (!TripCount && MaxTripCount && (UP.UpperBound \|\| MaxOrZero) &&
1081	MaxTripCount <= UP.MaxUpperBound) {
1082	UP.Count = MaxTripCount;
1083	if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
1084	FullUnrollTripCount: MaxTripCount, UCE, UP)) {
1085	UP.Count = *UnrollFactor;
1086	return;
1087	}
1088	}
1089
1090	// 5th priority is loop peeling.
1091	LLVM_DEBUG(dbgs().indent(`1`) << "Trying loop peeling...\n");
1092	computePeelCount(L, LoopSize, PP, TripCount, DT, SE, TTI, AC, Threshold: UP.Threshold);
1093	if (PP.PeelCount) {
1094	LLVM_DEBUG(dbgs().indent(`2`)
1095	<< "Peeling with count: " << PP.PeelCount << ".\n");
1096	UP.Runtime = false;
1097	UP.Count = `1`;
1098	return;
1099	}
1100
1101	// Before starting partial unrolling, set up.partial to true,
1102	// if user explicitly asked for unrolling
1103	if (TripCount)
1104	UP.Partial \|= PInfo.ExplicitUnroll;
1105
1106	// 6th priority is partial unrolling.
1107	// Try partial unroll only when TripCount could be statically calculated.
1108	LLVM_DEBUG(dbgs().indent(`1`) << "Trying partial unroll...\n");
1109	if (auto UnrollFactor = shouldPartialUnroll(LoopSize, TripCount, UCE, UP)) {
1110	UP.Count = *UnrollFactor;
1111
1112	if ((PInfo.PragmaFullUnroll \|\| PInfo.PragmaEnableUnroll) && TripCount &&
1113	UP.Count != TripCount)
1114	ORE->emit(RemarkBuilder: [&]() {
1115	return OptimizationRemarkMissed (DEBUG_TYPE,
1116	"FullUnrollAsDirectedTooLarge",
1117	L->getStartLoc(), L->getHeader())
1118	<< "unable to fully unroll loop as directed by unroll metadata "
1119	"because unrolled size is too large";
1120	});
1121
1122	if (UP.PartialThreshold != NoThreshold) {
1123	if (UP.Count == `0`) {
1124	if (PInfo.PragmaEnableUnroll)
1125	ORE->emit(RemarkBuilder: [&]() {
1126	return OptimizationRemarkMissed (DEBUG_TYPE,
1127	"UnrollAsDirectedTooLarge",
1128	L->getStartLoc(), L->getHeader())
1129	<< "unable to unroll loop as directed by "
1130	"llvm.loop.unroll.enable metadata because unrolled size "
1131	"is too large";
1132	});
1133	}
1134	}
1135	return;
1136	}
1137	assert(TripCount == `0` &&
1138	"All cases when TripCount is constant should be covered here.");
1139	if (PInfo.PragmaFullUnroll)
1140	ORE->emit(RemarkBuilder: [&]() {
1141	return OptimizationRemarkMissed (
1142	DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
1143	L->getStartLoc(), L->getHeader())
1144	<< "unable to fully unroll loop as directed by "
1145	"llvm.loop.unroll.full metadata because loop has a runtime "
1146	"trip count";
1147	});
1148
1149	// 7th priority is runtime unrolling.
1150	LLVM_DEBUG(dbgs().indent(`1`) << "Trying runtime unroll...\n");
1151	// Don't unroll a runtime trip count loop when it is disabled.
1152	if (PInfo.PragmaRuntimeUnrollDisable) {
1153	LLVM_DEBUG(dbgs().indent(`2`)
1154	<< "Not runtime unrolling: disabled by pragma.\n");
1155	UP.Count = `0`;
1156	return;
1157	}
1158
1159	// Don't unroll a small upper bound loop unless user or TTI asked to do so.
1160	if (MaxTripCount && !UP.Force && MaxTripCount < UP.MaxUpperBound) {
1161	LLVM_DEBUG(dbgs().indent(`2`)
1162	<< "Not runtime unrolling: max trip count " << MaxTripCount
1163	<< " is small (< " << UP.MaxUpperBound << ") and not forced.\n");
1164	UP.Count = `0`;
1165	return;
1166	}
1167
1168	// Check if the runtime trip count is too small when profile is available.
1169	if (L->getHeader()->getParent()->hasProfileData()) {
1170	if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
1171	if (*ProfileTripCount < FlatLoopTripCountThreshold)
1172	return;
1173	else
1174	UP.AllowExpensiveTripCount = true;
1175	}
1176	}
1177	UP.Runtime \|= PInfo.PragmaEnableUnroll \|\| PInfo.PragmaCount > `0` \|\|
1178	PInfo.UserUnrollCount;
1179	if (!UP.Runtime) {
1180	LLVM_DEBUG(dbgs().indent(`2`)
1181	<< "Will not try to unroll loop with runtime trip count "
1182	<< "because -unroll-runtime not given\n");
1183	UP.Count = `0`;
1184	return;
1185	}
1186	if (UP.Count == `0`)
1187	UP.Count = UP.DefaultUnrollRuntimeCount;
1188
1189	// Reduce unroll count to be the largest power-of-two factor of
1190	// the original count which satisfies the threshold limit.
1191	while (UP.Count != `0` &&
1192	UCE.getUnrolledLoopSize(UP) > UP.PartialThreshold)
1193	UP.Count >>= `1`;
1194
1195	#ifndef NDEBUG
1196	unsigned OrigCount = UP.Count;
1197	#endif
1198
1199	if (!UP.AllowRemainder && UP.Count != `0` && (TripMultiple % UP.Count) != `0`) {
1200	while (UP.Count != `0` && TripMultiple % UP.Count != `0`)
1201	UP.Count >>= `1`;
1202	LLVM_DEBUG(dbgs().indent(`2`)
1203	<< "Remainder loop is restricted (that could be architecture "
1204	"specific or because the loop contains a convergent "
1205	"instruction), so unroll count must divide the trip "
1206	"multiple, "
1207	<< TripMultiple << ". Reducing unroll count from " << OrigCount
1208	<< " to " << UP.Count << ".\n");
1209
1210	using namespace ore;
1211
1212	if (PInfo.PragmaCount > `0` && !UP.AllowRemainder)
1213	ORE->emit(RemarkBuilder: [&]() {
1214	return OptimizationRemarkMissed (DEBUG_TYPE,
1215	"DifferentUnrollCountFromDirected",
1216	L->getStartLoc(), L->getHeader())
1217	<< "Unable to unroll loop the number of times directed by "
1218	"llvm.loop.unroll.count metadata because remainder loop is "
1219	"restricted (that could be architecture specific or because "
1220	"the loop contains a convergent instruction) and so must "
1221	"have an unroll count that divides the loop trip multiple of "
1222	<< NV ("TripMultiple", TripMultiple) << ". Unrolling instead "
1223	<< NV ("UnrollCount", UP.Count) << " time(s).";
1224	});
1225	}
1226
1227	if (UP.Count > UP.MaxCount)
1228	UP.Count = UP.MaxCount;
1229
1230	if (MaxTripCount && UP.Count > MaxTripCount)
1231	UP.Count = MaxTripCount;
1232
1233	LLVM_DEBUG(dbgs().indent(`2`)
1234	<< "Runtime unrolling with count: " << UP.Count << "\n");
1235	if (UP.Count < `2`)
1236	UP.Count = `0`;
1237	return;
1238	}
1239
1240	static LoopUnrollResult
1241	tryToUnrollLoop(Loop L, DominatorTree &DT, LoopInfo LI, ScalarEvolution &SE,
1242	const TargetTransformInfo &TTI, AssumptionCache &AC,
1243	OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
1244	ProfileSummaryInfo PSI, bool* PreserveLCSSA, int OptLevel,
1245	bool OnlyFullUnroll, bool OnlyWhenForced, bool ForgetAllSCEV,
1246	std::optional<unsigned> ProvidedCount,
1247	std::optional<unsigned> ProvidedThreshold,
1248	std::optional<bool> ProvidedAllowPartial,
1249	std::optional<bool> ProvidedRuntime,
1250	std::optional<bool> ProvidedUpperBound,
1251	std::optional<bool> ProvidedAllowPeeling,
1252	std::optional<bool> ProvidedAllowProfileBasedPeeling,
1253	std::optional<unsigned> ProvidedFullUnrollMaxCount,
1254	AAResults AA = nullptr*) {
1255
1256	LLVM_DEBUG(dbgs() << "Loop Unroll: F["
1257	<< L->getHeader()->getParent()->getName() << "] Loop %"
1258	<< L->getHeader()->getName()
1259	<< " (depth=" << L->getLoopDepth() << ")\n");
1260	TransformationMode TM = hasUnrollTransformation(L);
1261	if (TM & TM_Disable) {
1262	LLVM_DEBUG(dbgs().indent(`1`) << "Not unrolling: transformation disabled by "
1263	<< "metadata.\n");
1264	return LoopUnrollResult::Unmodified;
1265	}
1266
1267	// If this loop isn't forced to be unrolled, avoid unrolling it when the
1268	// parent loop has an explicit unroll-and-jam pragma. This is to prevent
1269	// automatic unrolling from interfering with the user requested
1270	// transformation.
1271	Loop *ParentL = L->getParentLoop();
1272	if (ParentL != nullptr &&
1273	hasUnrollAndJamTransformation(L: ParentL) == TM_ForcedByUser &&
1274	hasUnrollTransformation(L) != TM_ForcedByUser) {
1275	LLVM_DEBUG(dbgs().indent(`1`) << "Not unrolling loop since parent loop has"
1276	<< " llvm.loop.unroll_and_jam.\n");
1277	return LoopUnrollResult::Unmodified;
1278	}
1279
1280	// If this loop isn't forced to be unrolled, avoid unrolling it when the
1281	// loop has an explicit unroll-and-jam pragma. This is to prevent automatic
1282	// unrolling from interfering with the user requested transformation.
1283	if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser &&
1284	hasUnrollTransformation(L) != TM_ForcedByUser) {
1285	LLVM_DEBUG(
1286	dbgs().indent(`1`)
1287	<< "Not unrolling loop since it has llvm.loop.unroll_and_jam.\n");
1288	return LoopUnrollResult::Unmodified;
1289	}
1290
1291	if (!L->isLoopSimplifyForm()) {
1292	LLVM_DEBUG(dbgs().indent(`1`)
1293	<< "Not unrolling loop which is not in loop-simplify form.\n");
1294	return LoopUnrollResult::Unmodified;
1295	}
1296
1297	// When automatic unrolling is disabled, do not unroll unless overridden for
1298	// this loop.
1299	if (OnlyWhenForced && !(TM & TM_Enable)) {
1300	LLVM_DEBUG(dbgs().indent(`1`) << "Not unrolling: automatic unrolling "
1301	<< "disabled and loop not explicitly "
1302	<< "enabled.\n");
1303	return LoopUnrollResult::Unmodified;
1304	}
1305
1306	bool OptForSize = L->getHeader()->getParent()->hasOptSize();
1307	TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
1308	L, SE, TTI, BFI, PSI, ORE, OptLevel, UserThreshold: ProvidedThreshold, UserCount: ProvidedCount,
1309	UserAllowPartial: ProvidedAllowPartial, UserRuntime: ProvidedRuntime, UserUpperBound: ProvidedUpperBound,
1310	UserFullUnrollMaxCount: ProvidedFullUnrollMaxCount);
1311	TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
1312	L, SE, TTI, UserAllowPeeling: ProvidedAllowPeeling, UserAllowProfileBasedPeeling: ProvidedAllowProfileBasedPeeling, UnrollingSpecficValues: true);
1313
1314	// Exit early if unrolling is disabled. For OptForSize, we pick the loop size
1315	// as threshold later on.
1316	if (UP.Threshold == `0` && (!UP.Partial \|\| UP.PartialThreshold == `0`) &&
1317	!OptForSize) {
1318	LLVM_DEBUG(dbgs().indent(`1`) << "Not unrolling: all thresholds are zero.\n");
1319	return LoopUnrollResult::Unmodified;
1320	}
1321
1322	SmallPtrSet<const Value *, `32`> EphValues;
1323	CodeMetrics::collectEphemeralValues(L, AC: &AC, EphValues);
1324
1325	UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
1326	if (!UCE.canUnroll())
1327	return LoopUnrollResult::Unmodified;
1328
1329	unsigned LoopSize = UCE.getRolledLoopSize();
1330	LLVM_DEBUG(dbgs() << "Loop Size = " << LoopSize << "\n");
1331
1332	// When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
1333	// later), to (fully) unroll loops, if it does not increase code size.
1334	if (OptForSize)
1335	UP.Threshold = std::max(a: UP.Threshold, b: LoopSize + `1`);
1336
1337	if (UCE.NumInlineCandidates != `0`) {
1338	LLVM_DEBUG(dbgs().indent(`1`)
1339	<< "Not unrolling loop with inlinable calls.\n");
1340	return LoopUnrollResult::Unmodified;
1341	}
1342
1343	// Find the smallest exact trip count for any exit. This is an upper bound
1344	// on the loop trip count, but an exit at an earlier iteration is still
1345	// possible. An unroll by the smallest exact trip count guarantees that all
1346	// branches relating to at least one exit can be eliminated. This is unlike
1347	// the max trip count, which only guarantees that the backedge can be broken.
1348	unsigned TripCount = `0`;
1349	unsigned TripMultiple = `1`;
1350	SmallVector<BasicBlock *, `8`> ExitingBlocks;
1351	L->getExitingBlocks(ExitingBlocks);
1352	for (BasicBlock *ExitingBlock : ExitingBlocks)
1353	if (unsigned TC = SE.getSmallConstantTripCount(L, ExitingBlock))
1354	if (!TripCount \|\| TC < TripCount)
1355	TripCount = TripMultiple = TC;
1356
1357	if (!TripCount) {
1358	// If no exact trip count is known, determine the trip multiple of either
1359	// the loop latch or the single exiting block.
1360	// TODO: Relax for multiple exits.
1361	BasicBlock *ExitingBlock = L->getLoopLatch();
1362	if (!ExitingBlock \|\| !L->isLoopExiting(BB: ExitingBlock))
1363	ExitingBlock = L->getExitingBlock();
1364	if (ExitingBlock)
1365	TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
1366	}
1367
1368	// If the loop contains a convergent operation, the prelude we'd add
1369	// to do the first few instructions before we hit the unrolled loop
1370	// is unsafe -- it adds a control-flow dependency to the convergent
1371	// operation. Therefore restrict remainder loop (try unrolling without).
1372	//
1373	// TODO: This is somewhat conservative; we could allow the remainder if the
1374	// trip count is uniform.
1375	UP.AllowRemainder &= UCE.ConvergenceAllowsRuntime;
1376
1377	// Try to find the trip count upper bound if we cannot find the exact trip
1378	// count.
1379	unsigned MaxTripCount = `0`;
1380	bool MaxOrZero = false;
1381	if (!TripCount) {
1382	MaxTripCount = SE.getSmallConstantMaxTripCount(L);
1383	MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
1384	}
1385
1386	// computeUnrollCount() decides whether it is beneficial to use upper bound to
1387	// fully unroll the loop.
1388	computeUnrollCount(L, TTI, DT, LI, AC: &AC, SE, EphValues, ORE: &ORE, TripCount,
1389	MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP);
1390	if (!UP.Count) {
1391	LLVM_DEBUG(dbgs().indent(`1`)
1392	<< "Not unrolling: no viable strategy found.\n");
1393	return LoopUnrollResult::Unmodified;
1394	}
1395
1396	UP.Runtime &= UCE.ConvergenceAllowsRuntime;
1397
1398	if (PP.PeelCount) {
1399	assert(UP.Count == `1` && "Cannot perform peel and unroll in the same step");
1400	LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName()
1401	<< " with iteration count " << PP.PeelCount << "!\n");
1402	ORE.emit(RemarkBuilder: [&]() {
1403	return OptimizationRemark (DEBUG_TYPE, "Peeled", L->getStartLoc(),
1404	L->getHeader())
1405	<< "peeled loop by " << ore::NV ("PeelCount", PP.PeelCount)
1406	<< " iterations";
1407	});
1408
1409	ValueToValueMapTy VMap;
1410	peelLoop(L, PeelCount: PP.PeelCount, PeelLast: PP.PeelLast, LI, SE: &SE, DT, AC: &AC, PreserveLCSSA,
1411	VMap);
1412	simplifyLoopAfterUnroll(L, SimplifyIVs: true, LI, SE: &SE, DT: &DT, AC: &AC, TTI: &TTI, AA: nullptr);
1413	// If the loop was peeled, we already "used up" the profile information
1414	// we had, so we don't want to unroll or peel again.
1415	if (PP.PeelProfiledIterations)
1416	L->setLoopAlreadyUnrolled();
1417	return LoopUnrollResult::PartiallyUnrolled;
1418	}
1419
1420	// Do not attempt partial/runtime unrolling in FullLoopUnrolling
1421	if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) \|\|
1422	UP.Count < TripCount \|\| UP.Count < MaxTripCount)) {
1423	LLVM_DEBUG(dbgs().indent(`1`)
1424	<< "Not attempting partial/runtime unroll in FullLoopUnroll.\n");
1425	return LoopUnrollResult::Unmodified;
1426	}
1427
1428	// At this point, UP.Runtime indicates that run-time unrolling is allowed.
1429	// However, we only want to actually perform it if we don't know the trip
1430	// count and the unroll count doesn't divide the known trip multiple.
1431	// TODO: This decision should probably be pushed up into
1432	// computeUnrollCount().
1433	UP.Runtime &= TripCount == `0` && TripMultiple % UP.Count != `0`;
1434
1435	// Save loop properties before it is transformed.
1436	MDNode *OrigLoopID = L->getLoopID();
1437
1438	// Unroll the loop.
1439	Loop RemainderLoop = nullptr*;
1440	UnrollLoopOptions ULO;
1441	ULO.Count = UP.Count;
1442	ULO.Force = UP.Force;
1443	ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount;
1444	ULO.UnrollRemainder = UP.UnrollRemainder;
1445	ULO.Runtime = UP.Runtime;
1446	ULO.ForgetAllSCEV = ForgetAllSCEV;
1447	ULO.Heart = getLoopConvergenceHeart(TheLoop: L);
1448	ULO.SCEVExpansionBudget = UP.SCEVExpansionBudget;
1449	ULO.RuntimeUnrollMultiExit = UP.RuntimeUnrollMultiExit;
1450	ULO.AddAdditionalAccumulators = UP.AddAdditionalAccumulators;
1451	LoopUnrollResult UnrollResult = UnrollLoop(
1452	L, ULO, LI, SE: &SE, DT: &DT, AC: &AC, TTI: &TTI, ORE: &ORE, PreserveLCSSA, RemainderLoop: &RemainderLoop, AA);
1453	if (UnrollResult == LoopUnrollResult::Unmodified)
1454	return LoopUnrollResult::Unmodified;
1455
1456	if (RemainderLoop) {
1457	std::optional<MDNode *> RemainderLoopID =
1458	makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopUnrollFollowupAll,
1459	LLVMLoopUnrollFollowupRemainder});
1460	if (RemainderLoopID)
1461	RemainderLoop->setLoopID(*RemainderLoopID);
1462	}
1463
1464	if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
1465	std::optional<MDNode *> NewLoopID =
1466	makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopUnrollFollowupAll,
1467	LLVMLoopUnrollFollowupUnrolled});
1468	if (NewLoopID) {
1469	L->setLoopID(*NewLoopID);
1470
1471	// Do not setLoopAlreadyUnrolled if loop attributes have been specified
1472	// explicitly.
1473	return UnrollResult;
1474	}
1475	}
1476
1477	// If loop has an unroll count pragma or unrolled by explicitly set count
1478	// mark loop as unrolled to prevent unrolling beyond that requested.
1479	if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
1480	UnrollPragmaInfo (L).ExplicitUnroll)
1481	L->setLoopAlreadyUnrolled();
1482
1483	return UnrollResult;
1484	}
1485
1486	namespace {
1487
1488	class LoopUnroll : public LoopPass {
1489	public:
1490	static char ID; // Pass ID, replacement for typeid
1491
1492	int OptLevel;
1493
1494	/// If false, use a cost model to determine whether unrolling of a loop is
1495	/// profitable. If true, only loops that explicitly request unrolling via
1496	/// metadata are considered. All other loops are skipped.
1497	bool OnlyWhenForced;
1498
1499	/// If false, when SCEV is invalidated, only forget everything in the
1500	/// top-most loop (call forgetTopMostLoop), of the loop being processed.
1501	/// Otherwise, forgetAllLoops and rebuild when needed next.
1502	bool ForgetAllSCEV;
1503
1504	std::optional<unsigned> ProvidedCount;
1505	std::optional<unsigned> ProvidedThreshold;
1506	std::optional<bool> ProvidedAllowPartial;
1507	std::optional<bool> ProvidedRuntime;
1508	std::optional<bool> ProvidedUpperBound;
1509	std::optional<bool> ProvidedAllowPeeling;
1510	std::optional<bool> ProvidedAllowProfileBasedPeeling;
1511	std::optional<unsigned> ProvidedFullUnrollMaxCount;
1512
1513	LoopUnroll(int OptLevel = `2`, bool OnlyWhenForced = false,
1514	bool ForgetAllSCEV = false,
1515	std::optional<unsigned> Threshold = std::nullopt,
1516	std::optional<unsigned> Count = std::nullopt,
1517	std::optional<bool> AllowPartial = std::nullopt,
1518	std::optional<bool> Runtime = std::nullopt,
1519	std::optional<bool> UpperBound = std::nullopt,
1520	std::optional<bool> AllowPeeling = std::nullopt,
1521	std::optional<bool> AllowProfileBasedPeeling = std::nullopt,
1522	std::optional<unsigned> ProvidedFullUnrollMaxCount = std::nullopt)
1523	: LoopPass (ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
1524	ForgetAllSCEV(ForgetAllSCEV), ProvidedCount (std::move(Count)),
1525	ProvidedThreshold (Threshold), ProvidedAllowPartial (AllowPartial),
1526	ProvidedRuntime (Runtime), ProvidedUpperBound (UpperBound),
1527	ProvidedAllowPeeling (AllowPeeling),
1528	ProvidedAllowProfileBasedPeeling (AllowProfileBasedPeeling),
1529	ProvidedFullUnrollMaxCount (ProvidedFullUnrollMaxCount) {
1530	initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
1531	}
1532
1533	bool runOnLoop(Loop *L, LPPassManager &LPM) override {
1534	if (skipLoop(L))
1535	return false;
1536
1537	Function &F = *L->getHeader()->getParent();
1538
1539	auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1540	LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1541	ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1542	const TargetTransformInfo &TTI =
1543	getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1544	auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1545	// For the old PM, we can't use OptimizationRemarkEmitter as an analysis
1546	// pass. Function analyses need to be preserved across loop transformations
1547	// but ORE cannot be preserved (see comment before the pass definition).
1548	OptimizationRemarkEmitter ORE(&F);
1549	bool PreserveLCSSA = mustPreserveAnalysisID(AID&: LCSSAID);
1550
1551	LoopUnrollResult Result = tryToUnrollLoop(
1552	L, DT, LI, SE, TTI, AC, ORE, BFI: nullptr, PSI: nullptr, PreserveLCSSA, OptLevel,
1553	/OnlyFullUnroll/ false, OnlyWhenForced, ForgetAllSCEV, ProvidedCount,
1554	ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
1555	ProvidedUpperBound, ProvidedAllowPeeling,
1556	ProvidedAllowProfileBasedPeeling, ProvidedFullUnrollMaxCount);
1557
1558	if (Result == LoopUnrollResult::FullyUnrolled)
1559	LPM.markLoopAsDeleted(L&: *L);
1560
1561	return Result != LoopUnrollResult::Unmodified;
1562	}
1563
1564	/// This transformation requires natural loop information & requires that
1565	/// loop preheaders be inserted into the CFG...
1566	void getAnalysisUsage(AnalysisUsage &AU) const override {
1567	AU.addRequired<AssumptionCacheTracker>();
1568	AU.addRequired<TargetTransformInfoWrapperPass>();
1569	// FIXME: Loop passes are required to preserve domtree, and for now we just
1570	// recreate dom info if anything gets unrolled.
1571	getLoopAnalysisUsage(AU);
1572	}
1573	};
1574
1575	} // end anonymous namespace
1576
1577	char LoopUnroll::ID = `0`;
1578
1579	INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1580	INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
1581	INITIALIZE_PASS_DEPENDENCY(LoopPass)
1582	INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
1583	INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
1584
1585	Pass llvm::createLoopUnrollPass(int* OptLevel, bool OnlyWhenForced,
1586	bool ForgetAllSCEV, int Threshold, int Count,
1587	int AllowPartial, int Runtime, int UpperBound,
1588	int AllowPeeling) {
1589	// TODO: It would make more sense for this function to take the optionals
1590	// directly, but that's dangerous since it would silently break out of tree
1591	// callers.
1592	return new LoopUnroll (
1593	OptLevel, OnlyWhenForced, ForgetAllSCEV,
1594	Threshold == -`1` ? std::nullopt : std::optional<unsigned>(Threshold),
1595	Count == -`1` ? std::nullopt : std::optional<unsigned>(Count),
1596	AllowPartial == -`1` ? std::nullopt : std::optional<bool>(AllowPartial),
1597	Runtime == -`1` ? std::nullopt : std::optional<bool>(Runtime),
1598	UpperBound == -`1` ? std::nullopt : std::optional<bool>(UpperBound),
1599	AllowPeeling == -`1` ? std::nullopt : std::optional<bool>(AllowPeeling));
1600	}
1601
1602	PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
1603	LoopStandardAnalysisResults &AR,
1604	LPMUpdater &Updater) {
1605	// For the new PM, we can't use OptimizationRemarkEmitter as an analysis
1606	// pass. Function analyses need to be preserved across loop transformations
1607	// but ORE cannot be preserved (see comment before the pass definition).
1608	OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
1609
1610	// Keep track of the previous loop structure so we can identify new loops
1611	// created by unrolling.
1612	Loop *ParentL = L.getParentLoop();
1613	SmallPtrSet<Loop *, `4`> OldLoops;
1614	if (ParentL)
1615	OldLoops.insert_range(R&: *ParentL);
1616	else
1617	OldLoops.insert_range(R&: AR.LI);
1618
1619	std::string LoopName = std::string (L.getName());
1620
1621	bool Changed =
1622	tryToUnrollLoop(L: &L, DT&: AR.DT, LI: &AR.LI, SE&: AR.SE, TTI: AR.TTI, AC&: AR.AC, ORE,
1623	/BFI/ nullptr, /PSI/ nullptr,
1624	/PreserveLCSSA/ true, OptLevel, /OnlyFullUnroll/ true,
1625	OnlyWhenForced, ForgetAllSCEV: ForgetSCEV, /Count/ ProvidedCount: std::nullopt,
1626	/Threshold/ ProvidedThreshold: std::nullopt, /AllowPartial/ ProvidedAllowPartial: false,
1627	/Runtime/ ProvidedRuntime: false, /UpperBound/ ProvidedUpperBound: false,
1628	/AllowPeeling/ ProvidedAllowPeeling: true,
1629	/AllowProfileBasedPeeling/ ProvidedAllowProfileBasedPeeling: false,
1630	/FullUnrollMaxCount/ ProvidedFullUnrollMaxCount: std::nullopt) !=
1631	LoopUnrollResult::Unmodified;
1632	if (!Changed)
1633	return PreservedAnalyses::all();
1634
1635	// The parent must not be damaged by unrolling!
1636	#ifndef NDEBUG
1637	if (ParentL)
1638	ParentL->verifyLoop();
1639	#endif
1640
1641	// Unrolling can do several things to introduce new loops into a loop nest:
1642	// - Full unrolling clones child loops within the current loop but then
1643	// removes the current loop making all of the children appear to be new
1644	// sibling loops.
1645	//
1646	// When a new loop appears as a sibling loop after fully unrolling,
1647	// its nesting structure has fundamentally changed and we want to revisit
1648	// it to reflect that.
1649	//
1650	// When unrolling has removed the current loop, we need to tell the
1651	// infrastructure that it is gone.
1652	//
1653	// Finally, we support a debugging/testing mode where we revisit child loops
1654	// as well. These are not expected to require further optimizations as either
1655	// they or the loop they were cloned from have been directly visited already.
1656	// But the debugging mode allows us to check this assumption.
1657	bool IsCurrentLoopValid = false;
1658	SmallVector<Loop *, `4`> SibLoops;
1659	if (ParentL)
1660	SibLoops.append(in_start: ParentL->begin(), in_end: ParentL->end());
1661	else
1662	SibLoops.append(in_start: AR.LI.begin(), in_end: AR.LI.end());
1663	erase_if(C&: SibLoops, P: [&](Loop *SibLoop) {
1664	if (SibLoop == &L) {
1665	IsCurrentLoopValid = true;
1666	return true;
1667	}
1668
1669	// Otherwise erase the loop from the list if it was in the old loops.
1670	return OldLoops.contains(Ptr: SibLoop);
1671	});
1672	Updater.addSiblingLoops(NewSibLoops: SibLoops);
1673
1674	if (!IsCurrentLoopValid) {
1675	Updater.markLoopAsDeleted(L, Name: LoopName);
1676	} else {
1677	// We can only walk child loops if the current loop remained valid.
1678	if (UnrollRevisitChildLoops) {
1679	// Walk all* of the child loops.*
1680	SmallVector<Loop *, `4`> ChildLoops(L.begin(), L.end());
1681	Updater.addChildLoops(NewChildLoops: ChildLoops);
1682	}
1683	}
1684
1685	return getLoopPassPreservedAnalyses();
1686	}
1687
1688	PreservedAnalyses LoopUnrollPass::run(Function &F,
1689	FunctionAnalysisManager &AM) {
1690	auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
1691	// There are no loops in the function. Return before computing other expensive
1692	// analyses.
1693	if (LI.empty())
1694	return PreservedAnalyses::all();
1695	auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
1696	auto &TTI = AM.getResult<TargetIRAnalysis>(IR&: F);
1697	auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
1698	auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
1699	auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
1700	AAResults &AA = AM.getResult<AAManager>(IR&: F);
1701
1702	LoopAnalysisManager LAM = nullptr*;
1703	if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(IR&: F))
1704	LAM = &LAMProxy->getManager();
1705
1706	auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
1707	ProfileSummaryInfo *PSI =
1708	MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
1709	auto *BFI = (PSI && PSI->hasProfileSummary()) ?
1710	&AM.getResult<BlockFrequencyAnalysis>(IR&: F) : nullptr;
1711
1712	bool Changed = false;
1713
1714	// The unroller requires loops to be in simplified form, and also needs LCSSA.
1715	// Since simplification may add new inner loops, it has to run before the
1716	// legality and profitability checks. This means running the loop unroller
1717	// will simplify all loops, regardless of whether anything end up being
1718	// unrolled.
1719	for (const auto &L : LI) {
1720	Changed \|=
1721	simplifyLoop(L, DT: &DT, LI: &LI, SE: &SE, AC: &AC, MSSAU: nullptr, PreserveLCSSA: false / PreserveLCSSA /);
1722	Changed \|= formLCSSARecursively(L&: *L, DT, LI: &LI, SE: &SE);
1723	}
1724
1725	// Add the loop nests in the reverse order of LoopInfo. See method
1726	// declaration.
1727	SmallPriorityWorklist<Loop *, `4`> Worklist;
1728	appendLoopsToWorklist(LI, Worklist);
1729
1730	while (!Worklist.empty()) {
1731	// Because the LoopInfo stores the loops in RPO, we walk the worklist
1732	// from back to front so that we work forward across the CFG, which
1733	// for unrolling is only needed to get optimization remarks emitted in
1734	// a forward order.
1735	Loop &L = *Worklist.pop_back_val();
1736	#ifndef NDEBUG
1737	Loop *ParentL = L.getParentLoop();
1738	#endif
1739
1740	// Check if the profile summary indicates that the profiled application
1741	// has a huge working set size, in which case we disable peeling to avoid
1742	// bloating it further.
1743	std::optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
1744	if (PSI && PSI->hasHugeWorkingSetSize())
1745	LocalAllowPeeling = false;
1746	std::string LoopName = std::string (L.getName());
1747	// The API here is quite complex to call and we allow to select some
1748	// flavors of unrolling during construction time (by setting UnrollOpts).
1749	LoopUnrollResult Result = tryToUnrollLoop(
1750	L: &L, DT, LI: &LI, SE, TTI, AC, ORE, BFI, PSI,
1751	/PreserveLCSSA/ true, OptLevel: UnrollOpts.OptLevel, /OnlyFullUnroll/ false,
1752	OnlyWhenForced: UnrollOpts.OnlyWhenForced, ForgetAllSCEV: UnrollOpts.ForgetSCEV,
1753	/Count/ ProvidedCount: std::nullopt,
1754	/Threshold/ ProvidedThreshold: std::nullopt, ProvidedAllowPartial: UnrollOpts.AllowPartial,
1755	ProvidedRuntime: UnrollOpts.AllowRuntime, ProvidedUpperBound: UnrollOpts.AllowUpperBound, ProvidedAllowPeeling: LocalAllowPeeling,
1756	ProvidedAllowProfileBasedPeeling: UnrollOpts.AllowProfileBasedPeeling, ProvidedFullUnrollMaxCount: UnrollOpts.FullUnrollMaxCount,
1757	AA: &AA);
1758	Changed \|= Result != LoopUnrollResult::Unmodified;
1759
1760	// The parent must not be damaged by unrolling!
1761	#ifndef NDEBUG
1762	if (Result != LoopUnrollResult::Unmodified && ParentL)
1763	ParentL->verifyLoop();
1764	#endif
1765
1766	// Clear any cached analysis results for L if we removed it completely.
1767	if (LAM && Result == LoopUnrollResult::FullyUnrolled)
1768	LAM->clear(IR&: L, Name: LoopName);
1769	}
1770
1771	if (!Changed)
1772	return PreservedAnalyses::all();
1773
1774	return getLoopPassPreservedAnalyses();
1775	}
1776
1777	void LoopUnrollPass::printPipeline(
1778	raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
1779	static_cast<PassInfoMixin<LoopUnrollPass> >(this*)->printPipeline(
1780	OS, MapClassName2PassName);
1781	OS << `'<'`;
1782	if (UnrollOpts.AllowPartial != std::nullopt)
1783	OS << (*UnrollOpts.AllowPartial ? "" : "no-") << "partial;";
1784	if (UnrollOpts.AllowPeeling != std::nullopt)
1785	OS << (*UnrollOpts.AllowPeeling ? "" : "no-") << "peeling;";
1786	if (UnrollOpts.AllowRuntime != std::nullopt)
1787	OS << (*UnrollOpts.AllowRuntime ? "" : "no-") << "runtime;";
1788	if (UnrollOpts.AllowUpperBound != std::nullopt)
1789	OS << (*UnrollOpts.AllowUpperBound ? "" : "no-") << "upperbound;";
1790	if (UnrollOpts.AllowProfileBasedPeeling != std::nullopt)
1791	OS << (*UnrollOpts.AllowProfileBasedPeeling ? "" : "no-")
1792	<< "profile-peeling;";
1793	if (UnrollOpts.FullUnrollMaxCount != std::nullopt)
1794	OS << "full-unroll-max=" << UnrollOpts.FullUnrollMaxCount << `';'`;
1795	OS << `'O'` << UnrollOpts.OptLevel;
1796	OS << `'>'`;
1797	}
1798

// Source: llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp (LLVM project)