1//===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements some loop unrolling utilities. It does not define any
10// actual pass or policy, but provides a single function to perform loop
11// unrolling.
12//
13// The process of unrolling can produce extraneous basic blocks linked with
14// unconditional branches. This will be corrected in the future.
15//
16//===----------------------------------------------------------------------===//
17
18#include "llvm/ADT/ArrayRef.h"
19#include "llvm/ADT/DenseMap.h"
20#include "llvm/ADT/STLExtras.h"
21#include "llvm/ADT/ScopedHashTable.h"
22#include "llvm/ADT/SetVector.h"
23#include "llvm/ADT/SmallVector.h"
24#include "llvm/ADT/Statistic.h"
25#include "llvm/ADT/StringRef.h"
26#include "llvm/ADT/Twine.h"
27#include "llvm/Analysis/AliasAnalysis.h"
28#include "llvm/Analysis/AssumptionCache.h"
29#include "llvm/Analysis/DomTreeUpdater.h"
30#include "llvm/Analysis/InstructionSimplify.h"
31#include "llvm/Analysis/LoopInfo.h"
32#include "llvm/Analysis/LoopIterator.h"
33#include "llvm/Analysis/MemorySSA.h"
34#include "llvm/Analysis/OptimizationRemarkEmitter.h"
35#include "llvm/Analysis/ScalarEvolution.h"
36#include "llvm/IR/BasicBlock.h"
37#include "llvm/IR/CFG.h"
38#include "llvm/IR/Constants.h"
39#include "llvm/IR/DebugInfoMetadata.h"
40#include "llvm/IR/DebugLoc.h"
41#include "llvm/IR/DiagnosticInfo.h"
42#include "llvm/IR/Dominators.h"
43#include "llvm/IR/Function.h"
44#include "llvm/IR/IRBuilder.h"
45#include "llvm/IR/Instruction.h"
46#include "llvm/IR/Instructions.h"
47#include "llvm/IR/IntrinsicInst.h"
48#include "llvm/IR/Metadata.h"
49#include "llvm/IR/PatternMatch.h"
50#include "llvm/IR/Use.h"
51#include "llvm/IR/User.h"
52#include "llvm/IR/ValueHandle.h"
53#include "llvm/IR/ValueMap.h"
54#include "llvm/Support/Casting.h"
55#include "llvm/Support/CommandLine.h"
56#include "llvm/Support/Debug.h"
57#include "llvm/Support/GenericDomTree.h"
58#include "llvm/Support/raw_ostream.h"
59#include "llvm/Transforms/Utils/BasicBlockUtils.h"
60#include "llvm/Transforms/Utils/Cloning.h"
61#include "llvm/Transforms/Utils/Local.h"
62#include "llvm/Transforms/Utils/LoopSimplify.h"
63#include "llvm/Transforms/Utils/LoopUtils.h"
64#include "llvm/Transforms/Utils/SimplifyIndVar.h"
65#include "llvm/Transforms/Utils/UnrollLoop.h"
66#include "llvm/Transforms/Utils/ValueMapper.h"
67#include <assert.h>
68#include <numeric>
69#include <vector>
70
71namespace llvm {
72class DataLayout;
73class Value;
74} // namespace llvm
75
76using namespace llvm;
77
78#define DEBUG_TYPE "loop-unroll"
79
80// TODO: Should these be here or in LoopUnroll?
81STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
82STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
83STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional "
84 "latch (completely or otherwise)");
85
86static cl::opt<bool>
87UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(Val: false), cl::Hidden,
88 cl::desc("Allow runtime unrolled loops to be unrolled "
89 "with epilog instead of prolog."));
90
91static cl::opt<bool>
92UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
93 cl::desc("Verify domtree after unrolling"),
94#ifdef EXPENSIVE_CHECKS
95 cl::init(true)
96#else
97 cl::init(Val: false)
98#endif
99 );
100
101static cl::opt<bool>
102UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden,
103 cl::desc("Verify loopinfo after unrolling"),
104#ifdef EXPENSIVE_CHECKS
105 cl::init(true)
106#else
107 cl::init(Val: false)
108#endif
109 );
110
111static cl::opt<bool> UnrollAddParallelReductions(
112 "unroll-add-parallel-reductions", cl::init(Val: false), cl::Hidden,
113 cl::desc("Allow unrolling to add parallel reduction phis."));
114
115/// Check if unrolling created a situation where we need to insert phi nodes to
116/// preserve LCSSA form.
117/// \param Blocks is a vector of basic blocks representing unrolled loop.
118/// \param L is the outer loop.
/// It's possible that some of the blocks are in L, and some are not. In this
/// case, if there is a use outside L and a definition inside L, we need to
/// insert a phi-node; otherwise LCSSA will be broken.
122/// The function is just a helper function for llvm::UnrollLoop that returns
123/// true if this situation occurs, indicating that LCSSA needs to be fixed.
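///
/// A minimal sketch of the broken case (hypothetical IR; names are made up):
///
///   body:                      ; block inside L
///     %def = add i32 %x, 1
///     ...
///   tail:                      ; unrolled block that ended up outside L
///     %use = add i32 %def, 2   ; use outside L of a def inside L:
///                              ; an LCSSA phi is required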
124static bool needToInsertPhisForLCSSA(Loop *L,
125 const std::vector<BasicBlock *> &Blocks,
126 LoopInfo *LI) {
127 for (BasicBlock *BB : Blocks) {
128 if (LI->getLoopFor(BB) == L)
129 continue;
130 for (Instruction &I : *BB) {
131 for (Use &U : I.operands()) {
132 if (const auto *Def = dyn_cast<Instruction>(Val&: U)) {
133 Loop *DefLoop = LI->getLoopFor(BB: Def->getParent());
134 if (!DefLoop)
135 continue;
136 if (DefLoop->contains(L))
137 return true;
138 }
139 }
140 }
141 }
142 return false;
143}
144
145/// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
146/// and adds a mapping from the original loop to the new loop to NewLoops.
/// Returns a pointer to the loop that OriginalBB was part of if a new loop
/// was created, and nullptr otherwise.
149const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
150 BasicBlock *ClonedBB, LoopInfo *LI,
151 NewLoopsMap &NewLoops) {
  // Figure out which loop the cloned block is in.
153 const Loop *OldLoop = LI->getLoopFor(BB: OriginalBB);
154 assert(OldLoop && "Should (at least) be in the loop being unrolled!");
155
156 Loop *&NewLoop = NewLoops[OldLoop];
157 if (!NewLoop) {
158 // Found a new sub-loop.
159 assert(OriginalBB == OldLoop->getHeader() &&
160 "Header should be first in RPO");
161
162 NewLoop = LI->AllocateLoop();
163 Loop *NewLoopParent = NewLoops.lookup(Val: OldLoop->getParentLoop());
164
165 if (NewLoopParent)
166 NewLoopParent->addChildLoop(NewChild: NewLoop);
167 else
168 LI->addTopLevelLoop(New: NewLoop);
169
170 NewLoop->addBasicBlockToLoop(NewBB: ClonedBB, LI&: *LI);
171 return OldLoop;
172 } else {
173 NewLoop->addBasicBlockToLoop(NewBB: ClonedBB, LI&: *LI);
174 return nullptr;
175 }
176}
177
/// The function chooses which type of unroll (epilog or prolog) is more
/// profitable.
/// Epilog unroll is more profitable when there is a PHI that starts from a
/// constant. In this case the epilog will leave the PHI starting from the
/// constant, but the prolog will convert it to a non-constant.
183///
184/// loop:
185/// PN = PHI [I, Latch], [CI, PreHeader]
186/// I = foo(PN)
187/// ...
188///
189/// Epilog unroll case.
190/// loop:
191/// PN = PHI [I2, Latch], [CI, PreHeader]
192/// I1 = foo(PN)
193/// I2 = foo(I1)
194/// ...
195/// Prolog unroll case.
196/// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
197/// loop:
198/// PN = PHI [I2, Latch], [NewPN, PreHeader]
199/// I1 = foo(PN)
200/// I2 = foo(I1)
201/// ...
202///
203static bool isEpilogProfitable(Loop *L) {
204 BasicBlock *PreHeader = L->getLoopPreheader();
205 BasicBlock *Header = L->getHeader();
206 assert(PreHeader && Header);
207 for (const PHINode &PN : Header->phis()) {
208 if (isa<ConstantInt>(Val: PN.getIncomingValueForBlock(BB: PreHeader)))
209 return true;
210 }
211 return false;
212}
213
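// The machinery below implements a small CSE pass over loads, keyed by the
// pointer's SCEV so that syntactically different but SCEV-equal addresses
// unify. A rough sketch of what it catches in unrolled bodies (hypothetical
// IR):
//
//   %a = load i32, ptr %p    ; recorded as (SCEV(%p) -> %a, generation G)
//   ...                      ; no instruction that may write to memory
//   %b = load i32, ptr %p    ; lookup hits at generation G; uses of %b are
//                            ; replaced with %a
//
// Any instruction that may write to memory bumps the generation; a stale
// entry is then only reused if MemorySSA proves the earlier load is not
// clobbered on the way to the later one.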
214struct LoadValue {
215 Instruction *DefI = nullptr;
216 unsigned Generation = 0;
217 LoadValue() = default;
218 LoadValue(Instruction *Inst, unsigned Generation)
219 : DefI(Inst), Generation(Generation) {}
220};
221
222class StackNode {
223 ScopedHashTable<const SCEV *, LoadValue>::ScopeTy LoadScope;
224 unsigned CurrentGeneration;
225 unsigned ChildGeneration;
226 DomTreeNode *Node;
227 DomTreeNode::const_iterator ChildIter;
228 DomTreeNode::const_iterator EndIter;
229 bool Processed = false;
230
231public:
232 StackNode(ScopedHashTable<const SCEV *, LoadValue> &AvailableLoads,
233 unsigned cg, DomTreeNode *N, DomTreeNode::const_iterator Child,
234 DomTreeNode::const_iterator End)
235 : LoadScope(AvailableLoads), CurrentGeneration(cg), ChildGeneration(cg),
236 Node(N), ChildIter(Child), EndIter(End) {}
237 // Accessors.
238 unsigned currentGeneration() const { return CurrentGeneration; }
239 unsigned childGeneration() const { return ChildGeneration; }
240 void childGeneration(unsigned generation) { ChildGeneration = generation; }
241 DomTreeNode *node() { return Node; }
242 DomTreeNode::const_iterator childIter() const { return ChildIter; }
243
244 DomTreeNode *nextChild() {
245 DomTreeNode *Child = *ChildIter;
246 ++ChildIter;
247 return Child;
248 }
249
250 DomTreeNode::const_iterator end() const { return EndIter; }
251 bool isProcessed() const { return Processed; }
252 void process() { Processed = true; }
253};
254
255Value *getMatchingValue(LoadValue LV, LoadInst *LI, unsigned CurrentGeneration,
256 BatchAAResults &BAA,
257 function_ref<MemorySSA *()> GetMSSA) {
258 if (!LV.DefI)
259 return nullptr;
260 if (LV.DefI->getType() != LI->getType())
261 return nullptr;
262 if (LV.Generation != CurrentGeneration) {
263 MemorySSA *MSSA = GetMSSA();
264 if (!MSSA)
265 return nullptr;
266 auto *EarlierMA = MSSA->getMemoryAccess(I: LV.DefI);
267 MemoryAccess *LaterDef =
268 MSSA->getWalker()->getClobberingMemoryAccess(I: LI, AA&: BAA);
269 if (!MSSA->dominates(A: LaterDef, B: EarlierMA))
270 return nullptr;
271 }
272 return LV.DefI;
273}
274
275void loadCSE(Loop *L, DominatorTree &DT, ScalarEvolution &SE, LoopInfo &LI,
276 BatchAAResults &BAA, function_ref<MemorySSA *()> GetMSSA) {
277 ScopedHashTable<const SCEV *, LoadValue> AvailableLoads;
278 SmallVector<std::unique_ptr<StackNode>> NodesToProcess;
279 DomTreeNode *HeaderD = DT.getNode(BB: L->getHeader());
280 NodesToProcess.emplace_back(Args: new StackNode(AvailableLoads, 0, HeaderD,
281 HeaderD->begin(), HeaderD->end()));
282
283 unsigned CurrentGeneration = 0;
284 while (!NodesToProcess.empty()) {
285 StackNode *NodeToProcess = &*NodesToProcess.back();
286
287 CurrentGeneration = NodeToProcess->currentGeneration();
288
289 if (!NodeToProcess->isProcessed()) {
290 // Process the node.
291
      // If this block has a single predecessor, then the predecessor is the
      // parent of the domtree node and all of the live out memory values are
      // still current in this block. If this block has multiple predecessors,
      // then they could have invalidated the live-out memory values of our
      // parent value. For now, just be conservative and invalidate memory if
      // this block has multiple predecessors.
299 if (!NodeToProcess->node()->getBlock()->getSinglePredecessor())
300 ++CurrentGeneration;
301 for (auto &I : make_early_inc_range(Range&: *NodeToProcess->node()->getBlock())) {
302
303 auto *Load = dyn_cast<LoadInst>(Val: &I);
304 if (!Load || !Load->isSimple()) {
305 if (I.mayWriteToMemory())
306 CurrentGeneration++;
307 continue;
308 }
309
310 const SCEV *PtrSCEV = SE.getSCEV(V: Load->getPointerOperand());
311 LoadValue LV = AvailableLoads.lookup(Key: PtrSCEV);
312 if (Value *M =
313 getMatchingValue(LV, LI: Load, CurrentGeneration, BAA, GetMSSA)) {
314 if (LI.replacementPreservesLCSSAForm(From: Load, To: M)) {
315 Load->replaceAllUsesWith(V: M);
316 Load->eraseFromParent();
317 }
318 } else {
319 AvailableLoads.insert(Key: PtrSCEV, Val: LoadValue(Load, CurrentGeneration));
320 }
321 }
322 NodeToProcess->childGeneration(generation: CurrentGeneration);
323 NodeToProcess->process();
324 } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
325 // Push the next child onto the stack.
326 DomTreeNode *Child = NodeToProcess->nextChild();
327 if (!L->contains(BB: Child->getBlock()))
328 continue;
329 NodesToProcess.emplace_back(
330 Args: new StackNode(AvailableLoads, NodeToProcess->childGeneration(), Child,
331 Child->begin(), Child->end()));
332 } else {
333 // It has been processed, and there are no more children to process,
334 // so delete it and pop it off the stack.
335 NodesToProcess.pop_back();
336 }
337 }
338}
339
/// Perform some cleanup and simplifications on loops after unrolling. It is
/// useful to simplify the IVs in the new loop, as well as do a quick
/// simplify/dce pass of the instructions.
343void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
344 ScalarEvolution *SE, DominatorTree *DT,
345 AssumptionCache *AC,
346 const TargetTransformInfo *TTI,
347 AAResults *AA) {
348 using namespace llvm::PatternMatch;
349
350 // Simplify any new induction variables in the partially unrolled loop.
351 if (SE && SimplifyIVs) {
352 SmallVector<WeakTrackingVH, 16> DeadInsts;
353 simplifyLoopIVs(L, SE, DT, LI, TTI, Dead&: DeadInsts);
354
355 // Aggressively clean up dead instructions that simplifyLoopIVs already
356 // identified. Any remaining should be cleaned up below.
357 while (!DeadInsts.empty()) {
358 Value *V = DeadInsts.pop_back_val();
359 if (Instruction *Inst = dyn_cast_or_null<Instruction>(Val: V))
360 RecursivelyDeleteTriviallyDeadInstructions(V: Inst);
361 }
362
363 if (AA) {
364 std::unique_ptr<MemorySSA> MSSA = nullptr;
365 BatchAAResults BAA(*AA);
366 loadCSE(L, DT&: *DT, SE&: *SE, LI&: *LI, BAA, GetMSSA: [L, AA, DT, &MSSA]() -> MemorySSA * {
367 if (!MSSA)
368 MSSA.reset(p: new MemorySSA(*L, AA, DT));
369 return &*MSSA;
370 });
371 }
372 }
373
374 // At this point, the code is well formed. Perform constprop, instsimplify,
375 // and dce.
376 const DataLayout &DL = L->getHeader()->getDataLayout();
377 SmallVector<WeakTrackingVH, 16> DeadInsts;
378 for (BasicBlock *BB : L->getBlocks()) {
379 // Remove repeated debug instructions after loop unrolling.
380 if (BB->getParent()->getSubprogram())
381 RemoveRedundantDbgInstrs(BB);
382
383 for (Instruction &Inst : llvm::make_early_inc_range(Range&: *BB)) {
384 if (Value *V = simplifyInstruction(I: &Inst, Q: {DL, nullptr, DT, AC}))
385 if (LI->replacementPreservesLCSSAForm(From: &Inst, To: V))
386 Inst.replaceAllUsesWith(V);
387 if (isInstructionTriviallyDead(I: &Inst))
388 DeadInsts.emplace_back(Args: &Inst);
389
390 // Fold ((add X, C1), C2) to (add X, C1+C2). This is very common in
391 // unrolled loops, and handling this early allows following code to
392 // identify the IV as a "simple recurrence" without first folding away
393 // a long chain of adds.
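      // For example, after unrolling by 4 an IV chain such as
      //   %i1 = add nuw i64 %i0, 1 ; %i2 = add nuw i64 %i1, 1 ; ...
      // folds to %i2 = add nuw i64 %i0, 2, etc.; wrap flags are kept only
      // when both adds had them (and, for nsw, when C1+C2 did not overflow).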
394 {
395 Value *X;
396 const APInt *C1, *C2;
397 if (match(V: &Inst, P: m_Add(L: m_Add(L: m_Value(V&: X), R: m_APInt(Res&: C1)), R: m_APInt(Res&: C2)))) {
398 auto *InnerI = dyn_cast<Instruction>(Val: Inst.getOperand(i: 0));
399 auto *InnerOBO = cast<OverflowingBinaryOperator>(Val: Inst.getOperand(i: 0));
400 bool SignedOverflow;
401 APInt NewC = C1->sadd_ov(RHS: *C2, Overflow&: SignedOverflow);
402 Inst.setOperand(i: 0, Val: X);
403 Inst.setOperand(i: 1, Val: ConstantInt::get(Ty: Inst.getType(), V: NewC));
404 Inst.setHasNoUnsignedWrap(Inst.hasNoUnsignedWrap() &&
405 InnerOBO->hasNoUnsignedWrap());
406 Inst.setHasNoSignedWrap(Inst.hasNoSignedWrap() &&
407 InnerOBO->hasNoSignedWrap() &&
408 !SignedOverflow);
409 if (InnerI && isInstructionTriviallyDead(I: InnerI))
410 DeadInsts.emplace_back(Args&: InnerI);
411 }
412 }
413 }
414 // We can't do recursive deletion until we're done iterating, as we might
415 // have a phi which (potentially indirectly) uses instructions later in
416 // the block we're iterating through.
417 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
418 }
419}
420
421// Loops containing convergent instructions that are uncontrolled or controlled
422// from outside the loop must have a count that divides their TripMultiple.
423LLVM_ATTRIBUTE_USED
424static bool canHaveUnrollRemainder(const Loop *L) {
425 if (getLoopConvergenceHeart(TheLoop: L))
426 return false;
427
428 // Check for uncontrolled convergent operations.
429 for (auto &BB : L->blocks()) {
430 for (auto &I : *BB) {
431 if (isa<ConvergenceControlInst>(Val: I))
432 return true;
433 if (auto *CB = dyn_cast<CallBase>(Val: &I))
434 if (CB->isConvergent())
435 return CB->getConvergenceControlToken();
436 }
437 }
438 return true;
439}
440
441/// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
442/// can only fail when the loop's latch block is not terminated by a conditional
443/// branch instruction. However, if the trip count (and multiple) are not known,
444/// loop unrolling will mostly produce more code that is no faster.
445///
446/// If Runtime is true then UnrollLoop will try to insert a prologue or
447/// epilogue that ensures the latch has a trip multiple of Count. UnrollLoop
448/// will not runtime-unroll the loop if computing the run-time trip count will
449/// be expensive and AllowExpensiveTripCount is false.
450///
451/// The LoopInfo Analysis that is passed will be kept consistent.
452///
453/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
454/// DominatorTree if they are non-null.
455///
456/// If RemainderLoop is non-null, it will receive the remainder loop (if
457/// required and not fully unrolled).
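///
/// A minimal calling sketch (assuming the remaining UnrollLoopOptions fields
/// default-initialize as declared in UnrollLoop.h):
///
///   UnrollLoopOptions ULO;
///   ULO.Count = 4;
///   ULO.Runtime = true; // allow a remainder loop for unknown trip counts
///   LoopUnrollResult Res =
///       UnrollLoop(L, ULO, LI, SE, DT, AC, TTI, ORE,
///                  /*PreserveLCSSA=*/true, /*RemainderLoop=*/nullptr,
///                  /*AA=*/nullptr);
///   if (Res != LoopUnrollResult::Unmodified) {
///     // LI is consistent; SE and DT were preserved since they are non-null.
///   }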
458LoopUnrollResult
459llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
460 ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
461 const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE,
462 bool PreserveLCSSA, Loop **RemainderLoop, AAResults *AA) {
463 assert(DT && "DomTree is required");
464
465 if (!L->getLoopPreheader()) {
466 LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
467 return LoopUnrollResult::Unmodified;
468 }
469
470 if (!L->getLoopLatch()) {
471 LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
472 return LoopUnrollResult::Unmodified;
473 }
474
475 // Loops with indirectbr cannot be cloned.
476 if (!L->isSafeToClone()) {
477 LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
478 return LoopUnrollResult::Unmodified;
479 }
480
481 if (L->getHeader()->hasAddressTaken()) {
482 // The loop-rotate pass can be helpful to avoid this in many cases.
483 LLVM_DEBUG(
484 dbgs() << " Won't unroll loop: address of header block is taken.\n");
485 return LoopUnrollResult::Unmodified;
486 }
487
488 assert(ULO.Count > 0);
489
490 // All these values should be taken only after peeling because they might have
491 // changed.
492 BasicBlock *Preheader = L->getLoopPreheader();
493 BasicBlock *Header = L->getHeader();
494 BasicBlock *LatchBlock = L->getLoopLatch();
495 SmallVector<BasicBlock *, 4> ExitBlocks;
496 L->getExitBlocks(ExitBlocks);
497 std::vector<BasicBlock *> OriginalLoopBlocks = L->getBlocks();
498
499 const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L);
500 const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L);
501 std::optional<unsigned> OriginalTripCount =
502 llvm::getLoopEstimatedTripCount(L);
503 BranchProbability OriginalLoopProb = llvm::getLoopProbability(L);
504
505 // Effectively "DCE" unrolled iterations that are beyond the max tripcount
506 // and will never be executed.
507 if (MaxTripCount && ULO.Count > MaxTripCount)
508 ULO.Count = MaxTripCount;
509
510 struct ExitInfo {
511 unsigned TripCount;
512 unsigned TripMultiple;
513 unsigned BreakoutTrip;
514 bool ExitOnTrue;
515 BasicBlock *FirstExitingBlock = nullptr;
516 SmallVector<BasicBlock *> ExitingBlocks;
517 };
518 DenseMap<BasicBlock *, ExitInfo> ExitInfos;
519 SmallVector<BasicBlock *, 4> ExitingBlocks;
520 L->getExitingBlocks(ExitingBlocks);
521 for (auto *ExitingBlock : ExitingBlocks) {
522 // The folding code is not prepared to deal with non-branch instructions
523 // right now.
524 auto *BI = dyn_cast<BranchInst>(Val: ExitingBlock->getTerminator());
525 if (!BI)
526 continue;
527
528 ExitInfo &Info = ExitInfos[ExitingBlock];
529 Info.TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
530 Info.TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
531 if (Info.TripCount != 0) {
532 Info.BreakoutTrip = Info.TripCount % ULO.Count;
533 Info.TripMultiple = 0;
534 } else {
535 Info.BreakoutTrip = Info.TripMultiple =
536 (unsigned)std::gcd(m: ULO.Count, n: Info.TripMultiple);
537 }
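    // Worked example: a known TripCount of 10 with Count == 4 gives
    // BreakoutTrip == 2 and TripMultiple == 0, so in the fold-branches logic
    // below only the copy of this exit whose successor position j equals 2
    // stays conditional; every other copy is rewritten into an unconditional
    // branch. With an unknown TripCount but TripMultiple == 8 and Count == 4,
    // BreakoutTrip == TripMultiple == gcd(4, 8) == 4, and only copies with
    // j % 4 == 0 stay conditional.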
538 Info.ExitOnTrue = !L->contains(BB: BI->getSuccessor(i: 0));
539 Info.ExitingBlocks.push_back(Elt: ExitingBlock);
540 LLVM_DEBUG(dbgs() << " Exiting block %" << ExitingBlock->getName()
541 << ": TripCount=" << Info.TripCount
542 << ", TripMultiple=" << Info.TripMultiple
543 << ", BreakoutTrip=" << Info.BreakoutTrip << "\n");
544 }
545
546 // Are we eliminating the loop control altogether? Note that we can know
547 // we're eliminating the backedge without knowing exactly which iteration
548 // of the unrolled body exits.
549 const bool CompletelyUnroll = ULO.Count == MaxTripCount;
550
551 const bool PreserveOnlyFirst = CompletelyUnroll && MaxOrZero;
552
553 // There's no point in performing runtime unrolling if this unroll count
554 // results in a full unroll.
555 if (CompletelyUnroll)
556 ULO.Runtime = false;
557
558 // Go through all exits of L and see if there are any phi-nodes there. We just
559 // conservatively assume that they're inserted to preserve LCSSA form, which
560 // means that complete unrolling might break this form. We need to either fix
561 // it in-place after the transformation, or entirely rebuild LCSSA. TODO: For
562 // now we just recompute LCSSA for the outer loop, but it should be possible
563 // to fix it in-place.
564 bool NeedToFixLCSSA =
565 PreserveLCSSA && CompletelyUnroll &&
566 any_of(Range&: ExitBlocks,
567 P: [](const BasicBlock *BB) { return isa<PHINode>(Val: BB->begin()); });
568
569 // The current loop unroll pass can unroll loops that have
570 // (1) single latch; and
571 // (2a) latch is unconditional; or
572 // (2b) latch is conditional and is an exiting block
573 // FIXME: The implementation can be extended to work with more complicated
574 // cases, e.g. loops with multiple latches.
575 BranchInst *LatchBI = dyn_cast<BranchInst>(Val: LatchBlock->getTerminator());
576
577 // A conditional branch which exits the loop, which can be optimized to an
578 // unconditional branch in the unrolled loop in some cases.
579 bool LatchIsExiting = L->isLoopExiting(BB: LatchBlock);
580 if (!LatchBI || (LatchBI->isConditional() && !LatchIsExiting)) {
    LLVM_DEBUG(
        dbgs() << "Can't unroll; a conditional latch must exit the loop\n");
583 return LoopUnrollResult::Unmodified;
584 }
585
586 assert((!ULO.Runtime || canHaveUnrollRemainder(L)) &&
587 "Can't runtime unroll if loop contains a convergent operation.");
588
589 bool EpilogProfitability =
590 UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
591 : isEpilogProfitable(L);
592
593 if (ULO.Runtime &&
594 !UnrollRuntimeLoopRemainder(
595 L, Count: ULO.Count, AllowExpensiveTripCount: ULO.AllowExpensiveTripCount, UseEpilogRemainder: EpilogProfitability,
596 UnrollRemainder: ULO.UnrollRemainder, ForgetAllSCEV: ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
597 PreserveLCSSA, SCEVExpansionBudget: ULO.SCEVExpansionBudget, RuntimeUnrollMultiExit: ULO.RuntimeUnrollMultiExit,
598 ResultLoop: RemainderLoop, OriginalTripCount, OriginalLoopProb)) {
599 if (ULO.Force)
600 ULO.Runtime = false;
601 else {
602 LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
603 "generated when assuming runtime trip count\n");
604 return LoopUnrollResult::Unmodified;
605 }
606 }
607
608 using namespace ore;
609 // Report the unrolling decision.
610 if (CompletelyUnroll) {
611 LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
612 << " with trip count " << ULO.Count << "!\n");
613 if (ORE)
614 ORE->emit(RemarkBuilder: [&]() {
615 return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
616 L->getHeader())
617 << "completely unrolled loop with "
618 << NV("UnrollCount", ULO.Count) << " iterations";
619 });
620 } else {
621 LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by "
622 << ULO.Count);
623 if (ULO.Runtime)
624 LLVM_DEBUG(dbgs() << " with run-time trip count");
625 LLVM_DEBUG(dbgs() << "!\n");
626
627 if (ORE)
628 ORE->emit(RemarkBuilder: [&]() {
629 OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
630 L->getHeader());
631 Diag << "unrolled loop by a factor of " << NV("UnrollCount", ULO.Count);
632 if (ULO.Runtime)
633 Diag << " with run-time trip count";
634 return Diag;
635 });
636 }
637
638 // We are going to make changes to this loop. SCEV may be keeping cached info
639 // about it, in particular about backedge taken count. The changes we make
640 // are guaranteed to invalidate this information for our loop. It is tempting
  // to only invalidate the loop being unrolled, but that would be incorrect:
  // exiting branches from all inner loops affect the outer loops, so if
  // something changes inside them, any of the outer loops may change as well.
  // When we forget the outermost loop, we also forget all contained loops,
  // which is what we need here.
646 if (SE) {
647 if (ULO.ForgetAllSCEV)
648 SE->forgetAllLoops();
649 else {
650 SE->forgetTopmostLoop(L);
651 SE->forgetBlockAndLoopDispositions();
652 }
653 }
654
655 if (!LatchIsExiting)
656 ++NumUnrolledNotLatch;
657
658 // For the first iteration of the loop, we should use the precloned values for
659 // PHI nodes. Insert associations now.
660 ValueToValueMapTy LastValueMap;
661 std::vector<PHINode*> OrigPHINode;
662 for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(Val: I); ++I) {
663 OrigPHINode.push_back(x: cast<PHINode>(Val&: I));
664 }
665
666 // Collect phi nodes for reductions for which we can introduce multiple
667 // parallel reduction phis and compute the final reduction result after the
668 // loop. This requires a single exit block after unrolling. This is ensured by
669 // restricting to single-block loops where the unrolled iterations are known
670 // to not exit.
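  // An illustrative sketch of the rewrite (hypothetical IR, unroll count 2):
  //
  //   %red   = phi i32 [ %start, %ph ], [ %add,   %latch ]
  //   %red.1 = phi i32 [ 0,      %ph ], [ %add.1, %latch ] ; identity start
  //   %add   = add i32 %red,   %x
  //   %add.1 = add i32 %red.1, %x.1
  //
  // with the partial results recombined in the single exit block:
  //
  //   %bin.rdx = add i32 %add.1, %add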
671 DenseMap<PHINode *, RecurrenceDescriptor> Reductions;
672 bool CanAddAdditionalAccumulators =
673 (UnrollAddParallelReductions.getNumOccurrences() > 0
674 ? UnrollAddParallelReductions
675 : ULO.AddAdditionalAccumulators) &&
676 !CompletelyUnroll && L->getNumBlocks() == 1 &&
677 (ULO.Runtime ||
678 (ExitInfos.contains(Val: Header) && ((ExitInfos[Header].TripCount != 0 &&
679 ExitInfos[Header].BreakoutTrip == 0))));
680
681 // Limit parallelizing reductions to unroll counts of 4 or less for now.
682 // TODO: The number of parallel reductions should depend on the number of
683 // execution units. We also don't have to add a parallel reduction phi per
684 // unrolled iteration, but could for example add a parallel phi for every 2
685 // unrolled iterations.
686 if (CanAddAdditionalAccumulators && ULO.Count <= 4) {
687 for (PHINode &Phi : Header->phis()) {
688 auto RdxDesc = canParallelizeReductionWhenUnrolling(Phi, L, SE);
689 if (!RdxDesc)
690 continue;
691
692 // Only handle duplicate phis for a single reduction for now.
693 // TODO: Handle any number of reductions
694 if (!Reductions.empty())
695 continue;
696
697 Reductions[&Phi] = *RdxDesc;
698 }
699 }
700
701 std::vector<BasicBlock *> Headers;
702 std::vector<BasicBlock *> Latches;
703 Headers.push_back(x: Header);
704 Latches.push_back(x: LatchBlock);
705
706 // The current on-the-fly SSA update requires blocks to be processed in
707 // reverse postorder so that LastValueMap contains the correct value at each
708 // exit.
709 LoopBlocksDFS DFS(L);
710 DFS.perform(LI);
711
712 // Stash the DFS iterators before adding blocks to the loop.
713 LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
714 LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
715
716 std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();
717
718 // Loop Unrolling might create new loops. While we do preserve LoopInfo, we
719 // might break loop-simplified form for these loops (as they, e.g., would
720 // share the same exit blocks). We'll keep track of loops for which we can
721 // break this so that later we can re-simplify them.
722 SmallSetVector<Loop *, 4> LoopsToSimplify;
723 LoopsToSimplify.insert_range(R&: *L);
724
725 // When a FSDiscriminator is enabled, we don't need to add the multiply
726 // factors to the discriminators.
727 if (Header->getParent()->shouldEmitDebugInfoForProfiling() &&
728 !EnableFSDiscriminator)
729 for (BasicBlock *BB : L->getBlocks())
730 for (Instruction &I : *BB)
731 if (!I.isDebugOrPseudoInst())
732 if (const DILocation *DIL = I.getDebugLoc()) {
733 auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(DF: ULO.Count);
734 if (NewDIL)
735 I.setDebugLoc(*NewDIL);
736 else
737 LLVM_DEBUG(dbgs()
738 << "Failed to create new discriminator: "
739 << DIL->getFilename() << " Line: " << DIL->getLine());
740 }
741
742 // Identify what noalias metadata is inside the loop: if it is inside the
743 // loop, the associated metadata must be cloned for each iteration.
744 SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
745 identifyNoAliasScopesToClone(BBs: L->getBlocks(), NoAliasDeclScopes&: LoopLocalNoAliasDeclScopes);
746
  // We place the unrolled iterations immediately after the original loop
  // latch. This is a reasonable default placement if we don't have block
  // frequencies; if we do, the layout will be adjusted later.
750 auto BlockInsertPt = std::next(x: LatchBlock->getIterator());
751 SmallVector<Instruction *> PartialReductions;
752 for (unsigned It = 1; It != ULO.Count; ++It) {
753 SmallVector<BasicBlock *, 8> NewBlocks;
754 SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
755 NewLoops[L] = L;
756
757 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
758 ValueToValueMapTy VMap;
759 BasicBlock *New = CloneBasicBlock(BB: *BB, VMap, NameSuffix: "." + Twine(It));
760 Header->getParent()->insert(Position: BlockInsertPt, BB: New);
761
762 assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
763 "Header should not be in a sub-loop");
764 // Tell LI about New.
765 const Loop *OldLoop = addClonedBlockToLoopInfo(OriginalBB: *BB, ClonedBB: New, LI, NewLoops);
766 if (OldLoop)
767 LoopsToSimplify.insert(X: NewLoops[OldLoop]);
768
769 if (*BB == Header) {
770 // Loop over all of the PHI nodes in the block, changing them to use
771 // the incoming values from the previous block.
772 for (PHINode *OrigPHI : OrigPHINode) {
773 PHINode *NewPHI = cast<PHINode>(Val&: VMap[OrigPHI]);
774 Value *InVal = NewPHI->getIncomingValueForBlock(BB: LatchBlock);
775
776 // Use cloned phis as parallel phis for partial reductions, which will
777 // get combined to the final reduction result after the loop.
778 if (Reductions.contains(Val: OrigPHI)) {
779 // Collect partial reduction results.
780 if (PartialReductions.empty())
781 PartialReductions.push_back(Elt: cast<Instruction>(Val: InVal));
782 PartialReductions.push_back(Elt: cast<Instruction>(Val&: VMap[InVal]));
783
784 // Update the start value for the cloned phis to use the identity
785 // value for the reduction.
786 const RecurrenceDescriptor &RdxDesc = Reductions[OrigPHI];
787 NewPHI->setIncomingValueForBlock(
788 BB: L->getLoopPreheader(),
789 V: getRecurrenceIdentity(K: RdxDesc.getRecurrenceKind(),
790 Tp: OrigPHI->getType(),
791 FMF: RdxDesc.getFastMathFlags()));
792
793 // Update NewPHI to use the cloned value for the iteration and move
794 // to header.
795 NewPHI->replaceUsesOfWith(From: InVal, To: VMap[InVal]);
796 NewPHI->moveBefore(InsertPos: OrigPHI->getIterator());
797 continue;
798 }
799
800 if (Instruction *InValI = dyn_cast<Instruction>(Val: InVal))
801 if (It > 1 && L->contains(Inst: InValI))
802 InVal = LastValueMap[InValI];
803 VMap[OrigPHI] = InVal;
804 NewPHI->eraseFromParent();
805 }
806
807 // Eliminate copies of the loop heart intrinsic, if any.
808 if (ULO.Heart) {
809 auto it = VMap.find(Val: ULO.Heart);
810 assert(it != VMap.end());
811 Instruction *heartCopy = cast<Instruction>(Val&: it->second);
812 heartCopy->eraseFromParent();
813 VMap.erase(I: it);
814 }
815 }
816
817 // Remap source location atom instance. Do this now, rather than
818 // when we remap instructions, because remap is called once we've
819 // cloned all blocks (all the clones would get the same atom
820 // number).
821 if (!VMap.AtomMap.empty())
822 for (Instruction &I : *New)
823 RemapSourceAtom(I: &I, VM&: VMap);
824
825 // Update our running map of newest clones
826 LastValueMap[*BB] = New;
827 for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
828 VI != VE; ++VI)
829 LastValueMap[VI->first] = VI->second;
830
831 // Add phi entries for newly created values to all exit blocks.
832 for (BasicBlock *Succ : successors(BB: *BB)) {
833 if (L->contains(BB: Succ))
834 continue;
835 for (PHINode &PHI : Succ->phis()) {
836 Value *Incoming = PHI.getIncomingValueForBlock(BB: *BB);
837 ValueToValueMapTy::iterator It = LastValueMap.find(Val: Incoming);
838 if (It != LastValueMap.end())
839 Incoming = It->second;
840 PHI.addIncoming(V: Incoming, BB: New);
841 SE->forgetLcssaPhiWithNewPredecessor(L, V: &PHI);
842 }
843 }
844 // Keep track of new headers and latches as we create them, so that
845 // we can insert the proper branches later.
846 if (*BB == Header)
847 Headers.push_back(x: New);
848 if (*BB == LatchBlock)
849 Latches.push_back(x: New);
850
851 // Keep track of the exiting block and its successor block contained in
852 // the loop for the current iteration.
853 auto ExitInfoIt = ExitInfos.find(Val: *BB);
854 if (ExitInfoIt != ExitInfos.end())
855 ExitInfoIt->second.ExitingBlocks.push_back(Elt: New);
856
857 NewBlocks.push_back(Elt: New);
858 UnrolledLoopBlocks.push_back(x: New);
859
860 // Update DomTree: since we just copy the loop body, and each copy has a
861 // dedicated entry block (copy of the header block), this header's copy
862 // dominates all copied blocks. That means, dominance relations in the
863 // copied body are the same as in the original body.
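      // For example, if the body is a diamond H -> {A, B} -> J, the clones
      // A', B', J' receive idoms H', H', H' (mapped through LastValueMap),
      // while H' itself is dominated by the previous iteration's latch.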
864 if (*BB == Header)
865 DT->addNewBlock(BB: New, DomBB: Latches[It - 1]);
866 else {
867 auto BBDomNode = DT->getNode(BB: *BB);
868 auto BBIDom = BBDomNode->getIDom();
869 BasicBlock *OriginalBBIDom = BBIDom->getBlock();
870 DT->addNewBlock(
871 BB: New, DomBB: cast<BasicBlock>(Val&: LastValueMap[cast<Value>(Val: OriginalBBIDom)]));
872 }
873 }
874
875 // Remap all instructions in the most recent iteration.
876 // Key Instructions: Nothing to do - we've already remapped the atoms.
877 remapInstructionsInBlocks(Blocks: NewBlocks, VMap&: LastValueMap);
878 for (BasicBlock *NewBlock : NewBlocks)
879 for (Instruction &I : *NewBlock)
880 if (auto *II = dyn_cast<AssumeInst>(Val: &I))
881 AC->registerAssumption(CI: II);
882
883 {
884 // Identify what other metadata depends on the cloned version. After
885 // cloning, replace the metadata with the corrected version for both
886 // memory instructions and noalias intrinsics.
887 std::string ext = (Twine("It") + Twine(It)).str();
888 cloneAndAdaptNoAliasScopes(NoAliasDeclScopes: LoopLocalNoAliasDeclScopes, NewBlocks,
889 Context&: Header->getContext(), Ext: ext);
890 }
891 }
892
893 // Loop over the PHI nodes in the original block, setting incoming values.
894 for (PHINode *PN : OrigPHINode) {
895 if (CompletelyUnroll) {
896 PN->replaceAllUsesWith(V: PN->getIncomingValueForBlock(BB: Preheader));
897 PN->eraseFromParent();
898 } else if (ULO.Count > 1) {
899 if (Reductions.contains(Val: PN))
900 continue;
901
902 Value *InVal = PN->removeIncomingValue(BB: LatchBlock, DeletePHIIfEmpty: false);
903 // If this value was defined in the loop, take the value defined by the
904 // last iteration of the loop.
905 if (Instruction *InValI = dyn_cast<Instruction>(Val: InVal)) {
906 if (L->contains(Inst: InValI))
907 InVal = LastValueMap[InVal];
908 }
909 assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
910 PN->addIncoming(V: InVal, BB: Latches.back());
911 }
912 }
913
914 // Connect latches of the unrolled iterations to the headers of the next
915 // iteration. Currently they point to the header of the same iteration.
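  // For example, with Count == 3 this rewires Latches[0] -> Headers[1],
  // Latches[1] -> Headers[2], and Latches[2] -> Headers[0]; the last edge is
  // the new backedge, or is folded away below when completely unrolling.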
916 for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
917 unsigned j = (i + 1) % e;
918 Latches[i]->getTerminator()->replaceSuccessorWith(OldBB: Headers[i], NewBB: Headers[j]);
919 }
920
921 // Remove loop metadata copied from the original loop latch to branches that
922 // are no longer latches.
923 for (unsigned I = 0, E = Latches.size() - (CompletelyUnroll ? 0 : 1); I < E;
924 ++I)
925 Latches[I]->getTerminator()->setMetadata(KindID: LLVMContext::MD_loop, Node: nullptr);
926
927 // Update dominators of blocks we might reach through exits.
928 // Immediate dominator of such block might change, because we add more
929 // routes which can lead to the exit: we can now reach it from the copied
930 // iterations too.
931 if (ULO.Count > 1) {
932 for (auto *BB : OriginalLoopBlocks) {
933 auto *BBDomNode = DT->getNode(BB);
934 SmallVector<BasicBlock *, 16> ChildrenToUpdate;
935 for (auto *ChildDomNode : BBDomNode->children()) {
936 auto *ChildBB = ChildDomNode->getBlock();
937 if (!L->contains(BB: ChildBB))
938 ChildrenToUpdate.push_back(Elt: ChildBB);
939 }
940 // The new idom of the block will be the nearest common dominator
941 // of all copies of the previous idom. This is equivalent to the
942 // nearest common dominator of the previous idom and the first latch,
943 // which dominates all copies of the previous idom.
944 BasicBlock *NewIDom = DT->findNearestCommonDominator(A: BB, B: LatchBlock);
945 for (auto *ChildBB : ChildrenToUpdate)
946 DT->changeImmediateDominator(BB: ChildBB, NewBB: NewIDom);
947 }
948 }
949
950 assert(!UnrollVerifyDomtree ||
951 DT->verify(DominatorTree::VerificationLevel::Fast));
952
953 SmallVector<DominatorTree::UpdateType> DTUpdates;
954 auto SetDest = [&](BasicBlock *Src, bool WillExit, bool ExitOnTrue) {
955 auto *Term = cast<BranchInst>(Val: Src->getTerminator());
956 const unsigned Idx = ExitOnTrue ^ WillExit;
957 BasicBlock *Dest = Term->getSuccessor(i: Idx);
958 BasicBlock *DeadSucc = Term->getSuccessor(i: 1-Idx);
959
960 // Remove predecessors from all non-Dest successors.
961 DeadSucc->removePredecessor(Pred: Src, /* KeepOneInputPHIs */ true);
962
963 // Replace the conditional branch with an unconditional one.
964 auto *BI = BranchInst::Create(IfTrue: Dest, InsertBefore: Term->getIterator());
965 BI->setDebugLoc(Term->getDebugLoc());
966 Term->eraseFromParent();
967
968 DTUpdates.emplace_back(Args: DominatorTree::Delete, Args&: Src, Args&: DeadSucc);
969 };
970
971 auto WillExit = [&](const ExitInfo &Info, unsigned i, unsigned j,
972 bool IsLatch) -> std::optional<bool> {
973 if (CompletelyUnroll) {
974 if (PreserveOnlyFirst) {
975 if (i == 0)
976 return std::nullopt;
977 return j == 0;
978 }
979 // Complete (but possibly inexact) unrolling
980 if (j == 0)
981 return true;
982 if (Info.TripCount && j != Info.TripCount)
983 return false;
984 return std::nullopt;
985 }
986
987 if (ULO.Runtime) {
988 // If runtime unrolling inserts a prologue, information about non-latch
989 // exits may be stale.
990 if (IsLatch && j != 0)
991 return false;
992 return std::nullopt;
993 }
994
995 if (j != Info.BreakoutTrip &&
996 (Info.TripMultiple == 0 || j % Info.TripMultiple != 0)) {
997 // If we know the trip count or a multiple of it, we can safely use an
998 // unconditional branch for some iterations.
999 return false;
1000 }
1001 return std::nullopt;
1002 };
1003
1004 // Fold branches for iterations where we know that they will exit or not
1005 // exit.
1006 for (auto &Pair : ExitInfos) {
1007 ExitInfo &Info = Pair.second;
1008 for (unsigned i = 0, e = Info.ExitingBlocks.size(); i != e; ++i) {
1009 // The branch destination.
1010 unsigned j = (i + 1) % e;
1011 bool IsLatch = Pair.first == LatchBlock;
1012 std::optional<bool> KnownWillExit = WillExit(Info, i, j, IsLatch);
1013 if (!KnownWillExit) {
1014 if (!Info.FirstExitingBlock)
1015 Info.FirstExitingBlock = Info.ExitingBlocks[i];
1016 continue;
1017 }
1018
      // We don't fold known-exiting branches for non-latch exits here; this
      // ensures that all loop blocks and all exit blocks remain reachable in
      // the CFG.
1022 // TODO: We could fold these branches, but it would require much more
1023 // sophisticated updates to LoopInfo.
1024 if (*KnownWillExit && !IsLatch) {
1025 if (!Info.FirstExitingBlock)
1026 Info.FirstExitingBlock = Info.ExitingBlocks[i];
1027 continue;
1028 }
1029
1030 SetDest(Info.ExitingBlocks[i], *KnownWillExit, Info.ExitOnTrue);
1031 }
1032 }
1033
1034 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
1035 DomTreeUpdater *DTUToUse = &DTU;
1036 if (ExitingBlocks.size() == 1 && ExitInfos.size() == 1) {
1037 // Manually update the DT if there's a single exiting node. In that case
1038 // there's a single exit node and it is sufficient to update the nodes
1039 // immediately dominated by the original exiting block. They will become
1040 // dominated by the first exiting block that leaves the loop after
1041 // unrolling. Note that the CFG inside the loop does not change, so there's
1042 // no need to update the DT inside the unrolled loop.
1043 DTUToUse = nullptr;
1044 auto &[OriginalExit, Info] = *ExitInfos.begin();
1045 if (!Info.FirstExitingBlock)
1046 Info.FirstExitingBlock = Info.ExitingBlocks.back();
1047 for (auto *C : to_vector(Range: DT->getNode(BB: OriginalExit)->children())) {
1048 if (L->contains(BB: C->getBlock()))
1049 continue;
1050 C->setIDom(DT->getNode(BB: Info.FirstExitingBlock));
1051 }
1052 } else {
1053 DTU.applyUpdates(Updates: DTUpdates);
1054 }
1055
1056 // When completely unrolling, the last latch becomes unreachable.
1057 if (!LatchIsExiting && CompletelyUnroll) {
1058 // There is no need to update the DT here, because there must be a unique
1059 // latch. Hence if the latch is not exiting it must directly branch back to
1060 // the original loop header and does not dominate any nodes.
1061 assert(LatchBlock->getSingleSuccessor() && "Loop with multiple latches?");
1062 changeToUnreachable(I: Latches.back()->getTerminator(), PreserveLCSSA);
1063 }
1064
1065 // Merge adjacent basic blocks, if possible.
1066 for (BasicBlock *Latch : Latches) {
1067 BranchInst *Term = dyn_cast<BranchInst>(Val: Latch->getTerminator());
1068 assert((Term ||
1069 (CompletelyUnroll && !LatchIsExiting && Latch == Latches.back())) &&
1070 "Need a branch as terminator, except when fully unrolling with "
1071 "unconditional latch");
1072 if (Term && Term->isUnconditional()) {
1073 BasicBlock *Dest = Term->getSuccessor(i: 0);
1074 BasicBlock *Fold = Dest->getUniquePredecessor();
1075 if (MergeBlockIntoPredecessor(BB: Dest, /*DTU=*/DTUToUse, LI,
1076 /*MSSAU=*/nullptr, /*MemDep=*/nullptr,
1077 /*PredecessorWithTwoSuccessors=*/false,
1078 DT: DTUToUse ? nullptr : DT)) {
1079 // Dest has been folded into Fold. Update our worklists accordingly.
1080 llvm::replace(Range&: Latches, OldValue: Dest, NewValue: Fold);
1081 llvm::erase(C&: UnrolledLoopBlocks, V: Dest);
1082 }
1083 }
1084 }
1085
1086 // If there are partial reductions, create code in the exit block to compute
1087 // the final result and update users of the final result.
1088 if (!PartialReductions.empty()) {
1089 BasicBlock *ExitBlock = L->getExitBlock();
1090 assert(ExitBlock &&
1091 "Can only introduce parallel reduction phis with single exit block");
1092 assert(Reductions.size() == 1 &&
1093 "currently only a single reduction is supported");
1094 Value *FinalRdxValue = PartialReductions.back();
1095 Value *RdxResult = nullptr;
1096 for (PHINode &Phi : ExitBlock->phis()) {
1097 if (Phi.getIncomingValueForBlock(BB: L->getLoopLatch()) != FinalRdxValue)
1098 continue;
1099 if (!RdxResult) {
1100 RdxResult = PartialReductions.front();
1101 IRBuilder Builder(ExitBlock, ExitBlock->getFirstNonPHIIt());
1102 Builder.setFastMathFlags(Reductions.begin()->second.getFastMathFlags());
1103 RecurKind RK = Reductions.begin()->second.getRecurrenceKind();
1104 for (Instruction *RdxPart : drop_begin(RangeOrContainer&: PartialReductions)) {
1105 RdxResult = Builder.CreateBinOp(
1106 Opc: (Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(Kind: RK),
1107 LHS: RdxPart, RHS: RdxResult, Name: "bin.rdx");
1108 }
1109 NeedToFixLCSSA = true;
1110 for (Instruction *RdxPart : PartialReductions)
1111 RdxPart->dropPoisonGeneratingFlags();
1112 }
1113
1114 Phi.replaceAllUsesWith(V: RdxResult);
1115 }
1116 }
1117
1118 if (DTUToUse) {
1119 // Apply updates to the DomTree.
1120 DT = &DTU.getDomTree();
1121 }
1122 assert(!UnrollVerifyDomtree ||
1123 DT->verify(DominatorTree::VerificationLevel::Fast));
1124
1125 // At this point, the code is well formed. We now simplify the unrolled loop,
1126 // doing constant propagation and dead code elimination as we go.
1127 simplifyLoopAfterUnroll(L, SimplifyIVs: !CompletelyUnroll && ULO.Count > 1, LI, SE, DT, AC,
1128 TTI, AA);
1129
1130 NumCompletelyUnrolled += CompletelyUnroll;
1131 ++NumUnrolled;
1132
1133 Loop *OuterL = L->getParentLoop();
1134 // Update LoopInfo if the loop is completely removed.
1135 if (CompletelyUnroll) {
1136 LI->erase(L);
1137 // We shouldn't try to use `L` anymore.
1138 L = nullptr;
1139 } else {
1140 // Update metadata for the loop's branch weights and estimated trip count:
1141 // - If ULO.Runtime, UnrollRuntimeLoopRemainder sets the guard branch
1142 // weights, latch branch weights, and estimated trip count of the
1143 // remainder loop it creates. It also sets the branch weights for the
1144 // unrolled loop guard it creates. The branch weights for the unrolled
1145 // loop latch are adjusted below. FIXME: Handle prologue loops.
1146 // - Otherwise, if unrolled loop iteration latches become unconditional,
1147 // branch weights are adjusted above. FIXME: Actually handle such
1148 // unconditional latches.
1149 // - Otherwise, the original loop's branch weights are correct for the
1150 // unrolled loop, so do not adjust them.
1151 // - In all cases, the unrolled loop's estimated trip count is set below.
1152 //
1153 // As an example of the last case, consider what happens if the unroll count
1154 // is 4 for a loop with an estimated trip count of 10 when we do not create
1155 // a remainder loop and all iterations' latches remain conditional. Each
1156 // unrolled iteration's latch still has the same probability of exiting the
1157 // loop as it did when in the original loop, and thus it should still have
1158 // the same branch weights. Each unrolled iteration's non-zero probability
1159 // of exiting already appropriately reduces the probability of reaching the
1160 // remaining iterations just as it did in the original loop. Trying to also
1161 // adjust the branch weights of the final unrolled iteration's latch (i.e.,
1162 // the backedge for the unrolled loop as a whole) to reflect its new trip
1163 // count of 3 will erroneously further reduce its block frequencies.
1164 // However, in case an analysis later needs to estimate the trip count of
1165 // the unrolled loop as a whole without considering the branch weights for
1166 // each unrolled iteration's latch within it, we store the new trip count as
1167 // separate metadata.
1168 if (!OriginalLoopProb.isUnknown() && ULO.Runtime && EpilogProfitability) {
1169 // Where p is always the probability of executing at least 1 more
1170 // iteration, the probability for at least n more iterations is p^n.
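      // For example, with p == 0.9 and Count == 4 the unrolled latch is
      // assigned probability 0.9^4 ≈ 0.656 of executing at least one more
      // unrolled iteration.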
1171 setLoopProbability(L, P: OriginalLoopProb.pow(N: ULO.Count));
1172 }
1173 if (OriginalTripCount) {
1174 unsigned NewTripCount = *OriginalTripCount / ULO.Count;
1175 if (!ULO.Runtime && *OriginalTripCount % ULO.Count)
1176 ++NewTripCount;
1177 setLoopEstimatedTripCount(L, EstimatedTripCount: NewTripCount);
1178 }
1179 }
1180
  // LoopInfo should now be valid again; confirm that.
1182 if (UnrollVerifyLoopInfo)
1183 LI->verify(DomTree: *DT);
1184
1185 // After complete unrolling most of the blocks should be contained in OuterL.
1186 // However, some of them might happen to be out of OuterL (e.g. if they
1187 // precede a loop exit). In this case we might need to insert PHI nodes in
1188 // order to preserve LCSSA form.
1189 // We don't need to check this if we already know that we need to fix LCSSA
1190 // form.
1191 // TODO: For now we just recompute LCSSA for the outer loop in this case, but
1192 // it should be possible to fix it in-place.
1193 if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA)
1194 NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(L: OuterL, Blocks: UnrolledLoopBlocks, LI);
1195
1196 // Make sure that loop-simplify form is preserved. We want to simplify
1197 // at least one layer outside of the loop that was unrolled so that any
1198 // changes to the parent loop exposed by the unrolling are considered.
1199 if (OuterL) {
1200 // OuterL includes all loops for which we can break loop-simplify, so
1201 // it's sufficient to simplify only it (it'll recursively simplify inner
1202 // loops too).
1203 if (NeedToFixLCSSA) {
1204 // LCSSA must be performed on the outermost affected loop. The unrolled
1205 // loop's last loop latch is guaranteed to be in the outermost loop
1206 // after LoopInfo's been updated by LoopInfo::erase.
1207 Loop *LatchLoop = LI->getLoopFor(BB: Latches.back());
1208 Loop *FixLCSSALoop = OuterL;
1209 if (!FixLCSSALoop->contains(L: LatchLoop))
1210 while (FixLCSSALoop->getParentLoop() != LatchLoop)
1211 FixLCSSALoop = FixLCSSALoop->getParentLoop();
1212
1213 formLCSSARecursively(L&: *FixLCSSALoop, DT: *DT, LI, SE);
1214 } else if (PreserveLCSSA) {
1215 assert(OuterL->isLCSSAForm(*DT) &&
1216 "Loops should be in LCSSA form after loop-unroll.");
1217 }
1218
1219 // TODO: That potentially might be compile-time expensive. We should try
1220 // to fix the loop-simplified form incrementally.
1221 simplifyLoop(L: OuterL, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA);
1222 } else {
1223 // Simplify loops for which we might've broken loop-simplify form.
1224 for (Loop *SubLoop : LoopsToSimplify)
1225 simplifyLoop(L: SubLoop, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA);
1226 }
1227
1228 return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
1229 : LoopUnrollResult::PartiallyUnrolled;
1230}
1231
1232/// Given an llvm.loop loop id metadata node, returns the loop hint metadata
1233/// node with the given name (for example, "llvm.loop.unroll.count"). If no
1234/// such metadata node exists, then nullptr is returned.
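///
/// For example, given textual IR metadata such as:
///
///   !0 = distinct !{!0, !1}                       ; self-referential loop ID
///   !1 = !{!"llvm.loop.unroll.count", i32 4}
///
/// GetUnrollMetadata(!0, "llvm.loop.unroll.count") returns !1.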
1235MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
1236 // First operand should refer to the loop id itself.
1237 assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
1238 assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
1239
1240 for (const MDOperand &MDO : llvm::drop_begin(RangeOrContainer: LoopID->operands())) {
1241 MDNode *MD = dyn_cast<MDNode>(Val: MDO);
1242 if (!MD)
1243 continue;
1244
1245 MDString *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0));
1246 if (!S)
1247 continue;
1248
1249 if (Name == S->getString())
1250 return MD;
1251 }
1252 return nullptr;
1253}
1254
1255std::optional<RecurrenceDescriptor>
1256llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
1257 ScalarEvolution *SE) {
1258 RecurrenceDescriptor RdxDesc;
1259 if (!RecurrenceDescriptor::isReductionPHI(Phi: &Phi, TheLoop: L, RedDes&: RdxDesc,
1260 /*DemandedBits=*/DB: nullptr,
1261 /*AC=*/nullptr, /*DT=*/nullptr, SE))
1262 return std::nullopt;
1263 if (RdxDesc.hasUsesOutsideReductionChain())
1264 return std::nullopt;
1265 RecurKind RK = RdxDesc.getRecurrenceKind();
1266 // Skip unsupported reductions.
1267 // TODO: Handle additional reductions, including FP and min-max
1268 // reductions.
1269 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) ||
1270 RecurrenceDescriptor::isFindRecurrenceKind(Kind: RK) ||
1271 RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK))
1272 return std::nullopt;
1273
1274 if (RdxDesc.hasExactFPMath())
1275 return std::nullopt;
1276
1277 if (RdxDesc.IntermediateStore)
1278 return std::nullopt;
1279
1280 // Don't unroll reductions with constant ops; those can be folded to a
1281 // single induction update.
1282 if (any_of(Range: cast<Instruction>(Val: Phi.getIncomingValueForBlock(BB: L->getLoopLatch()))
1283 ->operands(),
1284 P: IsaPred<Constant>))
1285 return std::nullopt;
1286
1287 BasicBlock *Latch = L->getLoopLatch();
1288 if (!Latch ||
1289 !is_contained(
1290 Range: cast<Instruction>(Val: Phi.getIncomingValueForBlock(BB: Latch))->operands(),
1291 Element: &Phi))
1292 return std::nullopt;
1293
1294 return RdxDesc;
1295}
1296