LoopUnrollRuntime.cpp source code [llvm_projects/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp]

1	//===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file implements some loop unrolling utilities for loops with run-time
10	// trip counts. See LoopUnroll.cpp for unrolling loops with compile-time
11	// trip counts.
12	//
13	// The functions in this file are used to generate extra code when the
14	// run-time trip count modulo the unroll factor is not 0. When this is the
15	// case, we need to generate code to execute these 'left over' iterations.
16	//
17	// The current strategy generates an if-then-else sequence prior to the
18	// unrolled loop to execute the 'left over' iterations before or after the
19	// unrolled loop.
20	//
21	//===----------------------------------------------------------------------===//
22
23	#include "llvm/ADT/Statistic.h"
24	#include "llvm/Analysis/DomTreeUpdater.h"
25	#include "llvm/Analysis/InstructionSimplify.h"
26	#include "llvm/Analysis/LoopIterator.h"
27	#include "llvm/Analysis/ScalarEvolution.h"
28	#include "llvm/Analysis/ValueTracking.h"
29	#include "llvm/IR/BasicBlock.h"
30	#include "llvm/IR/Dominators.h"
31	#include "llvm/IR/MDBuilder.h"
32	#include "llvm/IR/Module.h"
33	#include "llvm/IR/ProfDataUtils.h"
34	#include "llvm/Support/CommandLine.h"
35	#include "llvm/Support/Debug.h"
36	#include "llvm/Support/raw_ostream.h"
37	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
38	#include "llvm/Transforms/Utils/Cloning.h"
39	#include "llvm/Transforms/Utils/Local.h"
40	#include "llvm/Transforms/Utils/LoopUtils.h"
41	#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
42	#include "llvm/Transforms/Utils/UnrollLoop.h"
43	#include <cmath>
44
45	using namespace llvm;
46
47	#define DEBUG_TYPE "loop-unroll"
48
49	STATISTIC(NumRuntimeUnrolled,
50	"Number of loops unrolled with run-time trip counts");
51	static cl::opt<bool> UnrollRuntimeMultiExit(
52	"unroll-runtime-multi-exit", cl::init(Val: false), cl::Hidden,
53	cl::desc ("Allow runtime unrolling for loops with multiple exits, when "
54	"epilog is generated"));
55	static cl::opt<bool> UnrollRuntimeOtherExitPredictable(
56	"unroll-runtime-other-exit-predictable", cl::init(Val: false), cl::Hidden,
57	cl::desc ("Assume the non latch exit block to be predictable"));
58
59	// Probability that the loop trip count is so small that after the prolog
60	// we do not enter the unrolled loop at all.
61	// It is unlikely that the loop trip count is smaller than the unroll factor;
62	// other than that, the choice of constant is not tuned yet.
63	static const uint32_t UnrolledLoopHeaderWeights[] = {`1`, `127`};
64	// Probability that the loop trip count is so small that we skip the unrolled
65	// loop completely and immediately enter the epilogue loop.
66	// It is unlikely that the loop trip count is smaller than the unroll factor;
67	// other than that, the choice of constant is not tuned yet.
68	static const uint32_t EpilogHeaderWeights[] = {`1`, `127`};
69
70	/// Connect the unrolling prolog code to the original loop.
71	/// The unrolling prolog code contains code to execute the
72	/// 'extra' iterations if the run-time trip count modulo the
73	/// unroll count is non-zero.
74	///
75	/// This function performs the following:
76	/// - Create PHI nodes at prolog end block to combine values
77	/// that exit the prolog code and jump around the prolog.
78	/// - Add a PHI operand to a PHI node at the loop exit block
79	/// for values that exit the prolog and go around the loop.
80	/// - Branch around the original loop if the trip count is less
81	/// than the unroll factor.
82	///
83	static void ConnectProlog(Loop L, Value BECount, unsigned Count,
84	BasicBlock *PrologExit,
85	BasicBlock *OriginalLoopLatchExit,
86	BasicBlock PreHeader, BasicBlock NewPreHeader,
87	ValueToValueMapTy &VMap, DominatorTree *DT,
88	LoopInfo LI, bool* PreserveLCSSA,
89	ScalarEvolution &SE) {
90	// Loop structure should be the following:
91	// Preheader
92	// PrologHeader
93	// ...
94	// PrologLatch
95	// PrologExit
96	// NewPreheader
97	// Header
98	// ...
99	// Latch
100	// LatchExit
101	BasicBlock *Latch = L->getLoopLatch();
102	assert(Latch && "Loop must have a latch");
103	BasicBlock *PrologLatch = cast<BasicBlock>(Val&: VMap [Latch]);
104
105	// Create a PHI node for each outgoing value from the original loop
106	// (which means it is an outgoing value from the prolog code too).
107	// The new PHI node is inserted in the prolog end basic block.
108	// The new PHI node value is added as an operand of a PHI node in either
109	// the loop header or the loop exit block.
110	for (BasicBlock *Succ : successors(BB: Latch)) {
111	for (PHINode &PN : Succ->phis()) {
112	// Add a new PHI node to the prolog end block and add the
113	// appropriate incoming values.
114	// TODO: This code assumes that the PrologExit (or the LatchExit block for
115	// prolog loop) contains only one predecessor from the loop, i.e. the
116	// PrologLatch. When supporting multiple-exiting block loops, we can have
117	// two or more blocks that have the LatchExit as the target in the
118	// original loop.
119	PHINode *NewPN = PHINode::Create(Ty: PN.getType(), NumReservedValues: `2`, NameStr: PN.getName() + ".unr");
120	NewPN->insertBefore(InsertPos: PrologExit->getFirstNonPHIIt());
121	// Adding a value to the new PHI node from the original loop preheader.
122	// This is the value that skips all the prolog code.
123	if (L->contains(Inst: &PN)) {
124	// Succ is loop header.
125	NewPN->addIncoming(V: PN.getIncomingValueForBlock(BB: NewPreHeader),
126	BB: PreHeader);
127	} else {
128	// Succ is LatchExit.
129	NewPN->addIncoming(V: PoisonValue::get(T: PN.getType()), BB: PreHeader);
130	}
131
132	Value *V = PN.getIncomingValueForBlock(BB: Latch);
133	if (Instruction *I = dyn_cast<Instruction>(Val: V)) {
134	if (L->contains(Inst: I)) {
135	V = VMap.lookup(Val: I);
136	}
137	}
138	// Adding a value to the new PHI node from the last prolog block
139	// that was created.
140	NewPN->addIncoming(V, BB: PrologLatch);
141
142	// Update the existing PHI node operand with the value from the
143	// new PHI node. How this is done depends on if the existing
144	// PHI node is in the original loop block, or the exit block.
145	if (L->contains(Inst: &PN))
146	PN.setIncomingValueForBlock(BB: NewPreHeader, V: NewPN);
147	else
148	PN.addIncoming(V: NewPN, BB: PrologExit);
149	SE.forgetLcssaPhiWithNewPredecessor(L, V: &PN);
150	}
151	}
152
153	// Make sure that created prolog loop is in simplified form
154	SmallVector<BasicBlock *, `4`> PrologExitPreds;
155	Loop *PrologLoop = LI->getLoopFor(BB: PrologLatch);
156	if (PrologLoop) {
157	for (BasicBlock *PredBB : predecessors(BB: PrologExit))
158	if (PrologLoop->contains(BB: PredBB))
159	PrologExitPreds.push_back(Elt: PredBB);
160
161	SplitBlockPredecessors(BB: PrologExit, Preds: PrologExitPreds, Suffix: ".unr-lcssa", DT, LI,
162	MSSAU: nullptr, PreserveLCSSA);
163	}
164
165	// Create a branch around the original loop, which is taken if there are no
166	// iterations remaining to be executed after running the prologue.
167	Instruction *InsertPt = PrologExit->getTerminator();
168	IRBuilder<> B(InsertPt);
169
170	assert(Count != `0` && "nonsensical Count!");
171
172	// If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1)
173	// This means %xtraiter is (BECount + 1) and all of the iterations of this
174	// loop were executed by the prologue. Note that if BECount <u (Count - 1)
175	// then (BECount + 1) cannot unsigned-overflow.
176	Value *BrLoopExit =
177	B.CreateICmpULT(LHS: BECount, RHS: ConstantInt::get(Ty: BECount->getType(), V: Count - `1`));
178	// Split the exit to maintain loop canonicalization guarantees
179	SmallVector<BasicBlock *, `4`> Preds(predecessors(BB: OriginalLoopLatchExit));
180	SplitBlockPredecessors(BB: OriginalLoopLatchExit, Preds, Suffix: ".unr-lcssa", DT, LI,
181	MSSAU: nullptr, PreserveLCSSA);
182	// Add the branch to the exit block (around the unrolled loop)
183	MDNode BranchWeights = nullptr*;
184	if (hasBranchWeightMD(I: *Latch->getTerminator())) {
185	// Assume loop is nearly always entered.
186	MDBuilder MDB(B.getContext());
187	BranchWeights = MDB.createBranchWeights(Weights: UnrolledLoopHeaderWeights);
188	}
189	B.CreateCondBr(Cond: BrLoopExit, True: OriginalLoopLatchExit, False: NewPreHeader,
190	BranchWeights);
191	InsertPt->eraseFromParent();
192	if (DT) {
193	auto *NewDom = DT->findNearestCommonDominator(A: OriginalLoopLatchExit,
194	B: PrologExit);
195	DT->changeImmediateDominator(BB: OriginalLoopLatchExit, NewBB: NewDom);
196	}
197	}
198
199	/// Assume, due to our position in the remainder loop or its guard, anywhere
200	/// from 0 to \p N more iterations can possibly execute. Among such cases in
201	/// the original loop (with loop probability \p OriginalLoopProb), what is the
202	/// probability of executing at least one more iteration?
203	static BranchProbability
204	probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) {
205	// OriginalLoopProb == 1 would produce a division by zero in the calculation
206	// below. The problem is that case indicates an always infinite loop, but a
207	// remainder loop cannot be calculated at run time if the original loop is
208	// infinite as infinity % UnrollCount is undefined. We then choose
209	// probabilities indicating that all remainder loop iterations will always
210	// execute.
211	//
212	// Currently, the remainder loop here is an epilogue, which cannot be reached
213	// if the original loop is infinite, so the aforementioned choice is
214	// arbitrary.
215	//
216	// FIXME: Branch weights still need to be fixed in the case of prologues
217	// (issue #135812). In that case, the aforementioned choice seems reasonable
218	// for the goal of maintaining the original loop's block frequencies. That
219	// is, an infinite loop's initial iterations are not skipped, and the prologue
220	// loop body might have unique blocks that execute a finite number of times
221	// if, for example, the original loop body contains conditionals like i <
222	// UnrollCount.
223	if (OriginalLoopProb == BranchProbability::getOne())
224	return BranchProbability::getOne();
225
226	// Each of these variables holds the original loop's probability that the
227	// number of iterations it will execute is some m in the specified range.
228	BranchProbability ProbOne = OriginalLoopProb; // 1 <= m
229	BranchProbability ProbTooMany = ProbOne.pow(N: N + `1`); // N + 1 <= m
230	BranchProbability ProbNotTooMany = ProbTooMany.getCompl(); // 0 <= m <= N
231	BranchProbability ProbOneNotTooMany = ProbOne - ProbTooMany; // 1 <= m <= N
232	return ProbOneNotTooMany / ProbNotTooMany;
233	}
234
235	/// Connect the unrolling epilog code to the original loop.
236	/// The unrolling epilog code contains code to execute the
237	/// 'extra' iterations if the run-time trip count modulo the
238	/// unroll count is non-zero.
239	///
240	/// This function performs the following:
241	/// - Update PHI nodes at the epilog loop exit
242	/// - Create PHI nodes at the unrolling loop exit and epilog preheader to
243	/// combine values that exit the unrolling loop code and jump around it.
244	/// - Update PHI operands in the epilog loop by the new PHI nodes
245	/// - At the unrolling loop exit, branch around the epilog loop if extra iters
246	// (ModVal) is zero.
247	/// - At the epilog preheader, add an llvm.assume call that extra iters is
248	/// non-zero. If the unrolling loop exit is the predecessor, the above new
249	/// branch guarantees that assumption. If the unrolling loop preheader is the
250	/// predecessor, then the required first iteration from the original loop has
251	/// yet to be executed, so it must be executed in the epilog loop. If we
252	/// later unroll the epilog loop, that llvm.assume call somehow enables
253	/// ScalarEvolution to compute a epilog loop maximum trip count, which enables
254	/// eliminating the branch at the end of the final unrolled epilog iteration.
255	///
256	static void ConnectEpilog(Loop L, Value ModVal, BasicBlock *NewExit,
257	BasicBlock Exit, BasicBlock PreHeader,
258	BasicBlock EpilogPreHeader, BasicBlock NewPreHeader,
259	ValueToValueMapTy &VMap, DominatorTree *DT,
260	LoopInfo LI, bool* PreserveLCSSA, ScalarEvolution &SE,
261	unsigned Count, AssumptionCache &AC,
262	BranchProbability OriginalLoopProb) {
263	BasicBlock *Latch = L->getLoopLatch();
264	assert(Latch && "Loop must have a latch");
265	BasicBlock *EpilogLatch = cast<BasicBlock>(Val&: VMap [Latch]);
266
267	// Loop structure should be the following:
268	//
269	// PreHeader
270	// NewPreHeader
271	// Header
272	// ...
273	// Latch
274	// NewExit (PN)
275	// EpilogPreHeader
276	// EpilogHeader
277	// ...
278	// EpilogLatch
279	// Exit (EpilogPN)
280
281	// Update PHI nodes at Exit.
282	for (PHINode &PN : NewExit->phis()) {
283	// PN should be used in another PHI located in Exit block as
284	// Exit was split by SplitBlockPredecessors into Exit and NewExit
285	// Basically it should look like:
286	// NewExit:
287	// PN = PHI [I, Latch]
288	// ...
289	// Exit:
290	// EpilogPN = PHI [PN, EpilogPreHeader], [X, Exit2], [Y, Exit2.epil]
291	//
292	// Exits from non-latch blocks point to the original exit block and the
293	// epilogue edges have already been added.
294	//
295	// There is EpilogPreHeader incoming block instead of NewExit as
296	// NewExit was split 1 more time to get EpilogPreHeader.
297	assert(PN.hasOneUse() && "The phi should have 1 use");
298	PHINode *EpilogPN = cast<PHINode>(Val: PN.use_begin()->getUser());
299	assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
300
301	Value *V = PN.getIncomingValueForBlock(BB: Latch);
302	Instruction *I = dyn_cast<Instruction>(Val: V);
303	if (I && L->contains(Inst: I))
304	// If value comes from an instruction in the loop add VMap value.
305	V = VMap.lookup(Val: I);
306	// For the instruction out of the loop, constant or undefined value
307	// insert value itself.
308	EpilogPN->addIncoming(V, BB: EpilogLatch);
309
310	assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= `0` &&
311	"EpilogPN should have EpilogPreHeader incoming block");
312	// Change EpilogPreHeader incoming block to NewExit.
313	EpilogPN->setIncomingBlock(i: EpilogPN->getBasicBlockIndex(BB: EpilogPreHeader),
314	BB: NewExit);
315	// Now PHIs should look like:
316	// NewExit:
317	// PN = PHI [I, Latch]
318	// ...
319	// Exit:
320	// EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
321	}
322
323	// Create PHI nodes at NewExit (from the unrolling loop Latch) and at
324	// EpilogPreHeader (from PreHeader and NewExit). Update corresponding PHI
325	// nodes in epilog loop.
326	for (BasicBlock *Succ : successors(BB: Latch)) {
327	// Skip this as we already updated phis in exit blocks.
328	if (!L->contains(BB: Succ))
329	continue;
330
331	// Succ here appears to always be just L->getHeader(). Otherwise, how do we
332	// know its corresponding epilog block (from VMap) is EpilogHeader and thus
333	// EpilogPreHeader is the right incoming block for VPN, as set below?
334	// TODO: Can we thus avoid the enclosing loop over successors?
335	assert(Succ == L->getHeader() &&
336	"Expect the only in-loop successor of latch to be the loop header");
337
338	for (PHINode &PN : Succ->phis()) {
339	// Add new PHI nodes to the loop exit block.
340	PHINode NewPN0 = PHINode::Create(Ty: PN.getType(), /NumReservedValues=/*`1`,
341	NameStr: PN.getName() + ".unr");
342	NewPN0->insertBefore(InsertPos: NewExit->getFirstNonPHIIt());
343	// Add value to the new PHI node from the unrolling loop latch.
344	NewPN0->addIncoming(V: PN.getIncomingValueForBlock(BB: Latch), BB: Latch);
345
346	// Add new PHI nodes to EpilogPreHeader.
347	PHINode NewPN1 = PHINode::Create(Ty: PN.getType(), /NumReservedValues=/*`2`,
348	NameStr: PN.getName() + ".epil.init");
349	NewPN1->insertBefore(InsertPos: EpilogPreHeader->getFirstNonPHIIt());
350	// Add value to the new PHI node from the unrolling loop preheader.
351	NewPN1->addIncoming(V: PN.getIncomingValueForBlock(BB: NewPreHeader), BB: PreHeader);
352	// Add value to the new PHI node from the epilog loop guard.
353	NewPN1->addIncoming(V: NewPN0, BB: NewExit);
354
355	// Update the existing PHI node operand with the value from the new PHI
356	// node. Corresponding instruction in epilog loop should be PHI.
357	PHINode *VPN = cast<PHINode>(Val&: VMap [&PN]);
358	VPN->setIncomingValueForBlock(BB: EpilogPreHeader, V: NewPN1);
359	}
360	}
361
362	// In NewExit, branch around the epilog loop if no extra iters.
363	Instruction *InsertPt = NewExit->getTerminator();
364	IRBuilder<> B(InsertPt);
365	Value *BrLoopExit = B.CreateIsNotNull(Arg: ModVal, Name: "lcmp.mod");
366	assert(Exit && "Loop must have a single exit block only");
367	// Split the epilogue exit to maintain loop canonicalization guarantees
368	SmallVector<BasicBlock*, `4`> Preds(predecessors(BB: Exit));
369	SplitBlockPredecessors(BB: Exit, Preds, Suffix: ".epilog-lcssa", DT, LI, MSSAU: nullptr,
370	PreserveLCSSA);
371	// Add the branch to the exit block (around the epilog loop)
372	MDNode BranchWeights = nullptr*;
373	if (OriginalLoopProb.isUnknown() &&
374	hasBranchWeightMD(I: *Latch->getTerminator())) {
375	// Assume equal distribution in interval [0, Count).
376	MDBuilder MDB(B.getContext());
377	BranchWeights = MDB.createBranchWeights(TrueWeight: `1`, FalseWeight: Count - `1`);
378	}
379	CondBrInst *RemainderLoopGuard =
380	B.CreateCondBr(Cond: BrLoopExit, True: EpilogPreHeader, False: Exit, BranchWeights);
381	if (!OriginalLoopProb.isUnknown()) {
382	setBranchProbability(B: RemainderLoopGuard,
383	P: probOfNextInRemainder(OriginalLoopProb, N: Count - `1`),
384	/ForFirstTarget=/true);
385	}
386	InsertPt->eraseFromParent();
387	if (DT) {
388	auto *NewDom = DT->findNearestCommonDominator(A: Exit, B: NewExit);
389	DT->changeImmediateDominator(BB: Exit, NewBB: NewDom);
390	}
391
392	// In EpilogPreHeader, assume extra iters is non-zero.
393	IRBuilder<> B2(EpilogPreHeader, EpilogPreHeader->getFirstNonPHIIt());
394	Value *ModIsNotNull = B2.CreateIsNotNull(Arg: ModVal, Name: "lcmp.mod");
395	AssumeInst *AI = cast<AssumeInst>(Val: B2.CreateAssumption(Cond: ModIsNotNull));
396	AC.registerAssumption(CI: AI);
397	}
398
399	/// Create a clone of the blocks in a loop and connect them together. A new
400	/// loop will be created including all cloned blocks, and the iterator of the
401	/// new loop switched to count NewIter down to 0.
402	/// The cloned blocks should be inserted between InsertTop and InsertBot.
403	/// InsertTop should be new preheader, InsertBot new loop exit.
404	/// Returns the new cloned loop that is created.
405	static Loop CloneLoopBlocks(Loop L, Value *NewIter,
406	const bool UseEpilogRemainder,
407	const bool UnrollRemainder, BasicBlock *InsertTop,
408	BasicBlock InsertBot, BasicBlock Preheader,
409	std::vector<BasicBlock *> &NewBlocks,
410	LoopBlocksDFS &LoopBlocks, ValueToValueMapTy &VMap,
411	DominatorTree DT, LoopInfo LI, unsigned Count,
412	std::optional<unsigned> OriginalTripCount,
413	BranchProbability OriginalLoopProb) {
414	StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
415	BasicBlock *Header = L->getHeader();
416	BasicBlock *Latch = L->getLoopLatch();
417	Function *F = Header->getParent();
418	LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
419	LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
420	Loop *ParentLoop = L->getParentLoop();
421	NewLoopsMap NewLoops;
422	NewLoops [ParentLoop] = ParentLoop;
423
424	// For each block in the original loop, create a new copy,
425	// and update the value map with the newly created values.
426	for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
427	BasicBlock NewBB = CloneBasicBlock(BB: BB, VMap, NameSuffix: "." + suffix, F);
428	NewBlocks.push_back(x: NewBB);
429
430	addClonedBlockToLoopInfo(OriginalBB: *BB, ClonedBB: NewBB, LI, NewLoops);
431
432	VMap [*BB] = NewBB;
433	if (Header == *BB) {
434	// For the first block, add a CFG connection to this newly
435	// created block.
436	InsertTop->getTerminator()->setSuccessor(Idx: `0`, BB: NewBB);
437	}
438
439	if (DT) {
440	if (Header == *BB) {
441	// The header is dominated by the preheader.
442	DT->addNewBlock(BB: NewBB, DomBB: InsertTop);
443	} else {
444	// Copy information from original loop to unrolled loop.
445	BasicBlock IDomBB = DT->getNode(BB: BB)->getIDom()->getBlock();
446	DT->addNewBlock(BB: NewBB, DomBB: cast<BasicBlock>(Val&: VMap [IDomBB]));
447	}
448	}
449
450	if (Latch == *BB) {
451	// For the last block, create a loop back to cloned head.
452	VMap.erase(Val: (*BB)->getTerminator());
453	// Use an incrementing IV. Pre-incr/post-incr is backedge/trip count.
454	// Subtle: NewIter can be 0 if we wrapped when computing the trip count,
455	// thus we must compare the post-increment (wrapping) value.
456	BasicBlock *FirstLoopBB = cast<BasicBlock>(Val&: VMap [Header]);
457	CondBrInst *LatchBR = cast<CondBrInst>(Val: NewBB->getTerminator());
458	IRBuilder<> Builder(LatchBR);
459	PHINode *NewIdx =
460	PHINode::Create(Ty: NewIter->getType(), NumReservedValues: `2`, NameStr: suffix + ".iter");
461	NewIdx->insertBefore(InsertPos: FirstLoopBB->getFirstNonPHIIt());
462	auto *Zero = ConstantInt::get(Ty: NewIdx->getType(), V: `0`);
463	auto *One = ConstantInt::get(Ty: NewIdx->getType(), V: `1`);
464	Value *IdxNext =
465	Builder.CreateAdd(LHS: NewIdx, RHS: One, Name: NewIdx->getName() + ".next");
466	Value *IdxCmp = Builder.CreateICmpNE(LHS: IdxNext, RHS: NewIter, Name: NewIdx->getName() + ".cmp");
467	MDNode BranchWeights = nullptr*;
468	if ((OriginalLoopProb.isUnknown() \|\| !UseEpilogRemainder) &&
469	hasBranchWeightMD(I: *LatchBR)) {
470	uint32_t ExitWeight;
471	uint32_t BackEdgeWeight;
472	if (Count >= `3`) {
473	// Note: We do not enter this loop for zero-remainders. The check
474	// is at the end of the loop. We assume equal distribution between
475	// possible remainders in [1, Count).
476	ExitWeight = `1`;
477	BackEdgeWeight = (Count - `2`) / `2`;
478	} else {
479	// Unnecessary backedge, should never be taken. The conditional
480	// jump should be optimized away later.
481	ExitWeight = `1`;
482	BackEdgeWeight = `0`;
483	}
484	MDBuilder MDB(Builder.getContext());
485	BranchWeights = MDB.createBranchWeights(TrueWeight: BackEdgeWeight, FalseWeight: ExitWeight);
486	}
487	CondBrInst *RemainderLoopLatch =
488	Builder.CreateCondBr(Cond: IdxCmp, True: FirstLoopBB, False: InsertBot, BranchWeights);
489	if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
490	// Compute the total frequency of the original loop body from the
491	// remainder iterations. Once we've reached them, the first of them
492	// always executes, so its frequency and probability are 1.
493	double FreqRemIters = `1`;
494	if (Count > `2`) {
495	BranchProbability ProbReaching = BranchProbability::getOne();
496	for (unsigned N = Count - `2`; N >= `1`; --N) {
497	ProbReaching *= probOfNextInRemainder(OriginalLoopProb, N);
498	FreqRemIters += ProbReaching.toDouble();
499	}
500	}
501	// Solve for the loop probability that would produce that frequency.
502	// Sum(i=0..inf)(Prob^i) = 1/(1-Prob) = FreqRemIters.
503	BranchProbability Prob =
504	BranchProbability::getBranchProbability(Prob: `1` - `1` / FreqRemIters);
505	setBranchProbability(B: RemainderLoopLatch, P: Prob, /ForFirstTarget=/true);
506	}
507	NewIdx->addIncoming(V: Zero, BB: InsertTop);
508	NewIdx->addIncoming(V: IdxNext, BB: NewBB);
509	LatchBR->eraseFromParent();
510	}
511	}
512
513	// Change the incoming values to the ones defined in the preheader or
514	// cloned loop.
515	for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(Val: I); ++I) {
516	PHINode NewPHI = cast<PHINode>(Val&: VMap [&I]);
517	unsigned idx = NewPHI->getBasicBlockIndex(BB: Preheader);
518	NewPHI->setIncomingBlock(i: idx, BB: InsertTop);
519	BasicBlock *NewLatch = cast<BasicBlock>(Val&: VMap [Latch]);
520	idx = NewPHI->getBasicBlockIndex(BB: Latch);
521	Value *InVal = NewPHI->getIncomingValue(i: idx);
522	NewPHI->setIncomingBlock(i: idx, BB: NewLatch);
523	if (Value *V = VMap.lookup(Val: InVal))
524	NewPHI->setIncomingValue(i: idx, V);
525	}
526
527	Loop *NewLoop = NewLoops [L];
528	assert(NewLoop && "L should have been cloned");
529
530	if (OriginalTripCount && UseEpilogRemainder)
531	setLoopEstimatedTripCount(L: NewLoop, EstimatedTripCount: *OriginalTripCount % Count);
532
533	// Add unroll disable metadata to disable future unrolling for this loop.
534	if (!UnrollRemainder)
535	NewLoop->setLoopAlreadyUnrolled();
536	return NewLoop;
537	}
538
539	/// Returns true if we can profitably unroll the multi-exit loop L.
540	static bool canProfitablyRuntimeUnrollMultiExitLoop(
541	Loop L, const* TargetTransformInfo *TTI,
542	SmallVectorImpl<BasicBlock > &OtherExits, BasicBlock LatchExit,
543	bool UseEpilogRemainder) {
544
545	// The main pain point with multi-exit loop unrolling is that once unrolled,
546	// we will not be able to merge all blocks into a straight line code.
547	// There are branches within the unrolled loop that go to the OtherExits.
548	// The second point is the increase in code size, but this is true
549	// irrespective of multiple exits.
550
551	// Note: Both the heuristics below are coarse grained. We are essentially
552	// enabling unrolling of loops that have a single side exit other than the
553	// normal LatchExit (i.e. exiting into a deoptimize block).
554	// The heuristics considered are:
555	// 1. low number of branches in the unrolled version.
556	// 2. high predictability of these extra branches.
557	// We avoid unrolling loops that have more than two exiting blocks. This
558	// limits the total number of branches in the unrolled loop to be atmost
559	// the unroll factor (since one of the exiting blocks is the latch block).
560	SmallVector<BasicBlock*, `4`> ExitingBlocks;
561	L->getExitingBlocks(ExitingBlocks);
562	if (ExitingBlocks.size() > `2`)
563	return false;
564
565	// Allow unrolling of loops with no non latch exit blocks.
566	if (OtherExits.size() == `0`)
567	return true;
568
569	if (OtherExits.size() != `1`)
570	return false;
571
572	// When UnrollRuntimeOtherExitPredictable is specified, we assume the other
573	// exit branch is predictable even if it has no deoptimize call.
574	if (UnrollRuntimeOtherExitPredictable)
575	return true;
576
577	// The second heuristic is that L has one exit other than the latchexit and
578	// that exit is highly unlikely.
579	if (TTI) {
580	BasicBlock *LatchBB = L->getLoopLatch();
581	assert(LatchBB && "Expected loop to have a latch");
582	BasicBlock *NonLatchExitingBlock =
583	(ExitingBlocks [`0`] == LatchBB) ? ExitingBlocks [`1`] : ExitingBlocks [`0`];
584	auto BranchProb =
585	llvm::getBranchProbability(Src: NonLatchExitingBlock, Dst: OtherExits [`0`]);
586	// If BranchProbability could not be extracted (returns unknown), then
587	// don't return and do the check for deopt block.
588	if (!BranchProb.isUnknown()) {
589	auto Threshold = TTI->getPredictableBranchThreshold().getCompl();
590	return BranchProb < Threshold;
591	}
592	}
593
594	// We know that deoptimize blocks are rarely taken, which also implies the
595	// branch leading to the deoptimize block is highly unlikely.
596	return OtherExits [`0`]->getPostdominatingDeoptimizeCall();
597	// TODO: These can be fine-tuned further to consider code size or deopt states
598	// that are captured by the deoptimize exit block.
599	// Also, we can extend this to support more cases, if we actually
600	// know of kinds of multiexit loops that would benefit from unrolling.
601	}
602
603	/// Calculate ModVal = (BECount + 1) % Count on the abstract integer domain
604	/// accounting for the possibility of unsigned overflow in the 2s complement
605	/// domain. Preconditions:
606	/// 1) TripCount = BECount + 1 (allowing overflow)
607	/// 2) Log2(Count) <= BitWidth(BECount)
608	static Value CreateTripRemainder(IRBuilder<> &B, Value BECount,
609	Value TripCount, unsigned* Count) {
610	// Note that TripCount is BECount + 1.
611	if (isPowerOf2_32(Value: Count))
612	// If the expression is zero, then either:
613	// 1. There are no iterations to be run in the prolog/epilog loop.
614	// OR
615	// 2. The addition computing TripCount overflowed.
616	//
617	// If (2) is true, we know that TripCount really is (1 << BEWidth) and so
618	// the number of iterations that remain to be run in the original loop is a
619	// multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (a
620	// precondition of this method).
621	return B.CreateAnd(LHS: TripCount, RHS: Count - `1`, Name: "xtraiter");
622
623	// As (BECount + 1) can potentially unsigned overflow we count
624	// (BECount % Count) + 1 which is overflow safe as BECount % Count < Count.
625	Constant *CountC = ConstantInt::get(Ty: BECount->getType(), V: Count);
626	Value *ModValTmp = B.CreateURem(LHS: BECount, RHS: CountC);
627	Value *ModValAdd = B.CreateAdd(LHS: ModValTmp,
628	RHS: ConstantInt::get(Ty: ModValTmp->getType(), V: `1`));
629	// At that point (BECount % Count) + 1 could be equal to Count.
630	// To handle this case we need to take mod by Count one more time.
631	return B.CreateURem(LHS: ModValAdd, RHS: CountC, Name: "xtraiter");
632	}
633
634
635	/// Insert code in the prolog/epilog code when unrolling a loop with a
636	/// run-time trip-count.
637	///
638	/// This method assumes that the loop unroll factor is total number
639	/// of loop bodies in the loop after unrolling. (Some folks refer
640	/// to the unroll factor as the number of extra* copies added).*
641	/// We assume also that the loop unroll factor is a power-of-two. So, after
642	/// unrolling the loop, the number of loop bodies executed is 2,
643	/// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch
644	/// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
645	/// the switch instruction is generated.
646	///
647	/// *Prolog case*
648	/// extraiters = tripcount % loopfactor
649	/// if (extraiters == 0) jump Loop:
650	/// else jump Prol:
651	/// Prol: LoopBody;
652	/// extraiters -= 1 // Omitted if unroll factor is 2.
653	/// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
654	/// if (tripcount < loopfactor) jump End:
655	/// Loop:
656	/// ...
657	/// End:
658	///
659	/// *Epilog case*
660	/// extraiters = tripcount % loopfactor
661	/// if (tripcount < loopfactor) jump LoopExit:
662	/// unroll_iters = tripcount - extraiters
663	/// Loop: LoopBody; (executes unroll_iter times);
664	/// unroll_iter -= 1
665	/// if (unroll_iter != 0) jump Loop:
666	/// LoopExit:
667	/// if (extraiters == 0) jump EpilExit:
668	/// Epil: LoopBody; (executes extraiters times)
669	/// extraiters -= 1 // Omitted if unroll factor is 2.
670	/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
671	/// EpilExit:
672
673	bool llvm::UnrollRuntimeLoopRemainder(
674	Loop L, unsigned* Count, bool AllowExpensiveTripCount,
675	bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
676	LoopInfo LI, ScalarEvolution SE, DominatorTree DT, AssumptionCache AC,
677	const TargetTransformInfo TTI, bool* PreserveLCSSA,
678	unsigned SCEVExpansionBudget, bool RuntimeUnrollMultiExit,
679	Loop *ResultLoop, std::optional<unsigned*> OriginalTripCount,
680	BranchProbability OriginalLoopProb) {
681	LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
682	LLVM_DEBUG(L->dump());
683	LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
684	: dbgs() << "Using prolog remainder.\n");
685
686	// Make sure the loop is in canonical form.
687	if (!L->isLoopSimplifyForm()) {
688	LLVM_DEBUG(dbgs() << "Not in simplify form!\n");
689	return false;
690	}
691
692	// Guaranteed by LoopSimplifyForm.
693	BasicBlock *Latch = L->getLoopLatch();
694	BasicBlock *Header = L->getHeader();
695
696	CondBrInst *LatchBR = dyn_cast<CondBrInst>(Val: Latch->getTerminator());
697
698	if (!LatchBR) {
699	// The loop-rotate pass can be helpful to avoid this in many cases.
700	LLVM_DEBUG(
701	dbgs()
702	<< "Loop latch not terminated by a conditional branch.\n");
703	return false;
704	}
705
706	unsigned ExitIndex = LatchBR->getSuccessor(i: `0`) == Header ? `1` : `0`;
707	BasicBlock *LatchExit = LatchBR->getSuccessor(i: ExitIndex);
708
709	if (L->contains(BB: LatchExit)) {
710	// Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
711	// targets of the Latch be an exit block out of the loop.
712	LLVM_DEBUG(
713	dbgs()
714	<< "One of the loop latch successors must be the exit block.\n");
715	return false;
716	}
717
718	// These are exit blocks other than the target of the latch exiting block.
719	SmallVector<BasicBlock *, `4`> OtherExits;
720	L->getUniqueNonLatchExitBlocks(ExitBlocks&: OtherExits);
721	// Support only single exit and exiting block unless multi-exit loop
722	// unrolling is enabled.
723	if (!L->getExitingBlock() \|\| OtherExits.size()) {
724	// We rely on LCSSA form being preserved when the exit blocks are transformed.
725	// (Note that only an off-by-default mode of the old PM disables PreserveLCCA.)
726	if (!PreserveLCSSA)
727	return false;
728
729	// Priority goes to UnrollRuntimeMultiExit if it's supplied.
730	if (UnrollRuntimeMultiExit.getNumOccurrences()) {
731	if (!UnrollRuntimeMultiExit)
732	return false;
733	} else {
734	// Otherwise perform multi-exit unrolling, if either the target indicates
735	// it is profitable or the general profitability heuristics apply.
736	if (!RuntimeUnrollMultiExit &&
737	!canProfitablyRuntimeUnrollMultiExitLoop(
738	L, TTI, OtherExits, LatchExit, UseEpilogRemainder)) {
739	LLVM_DEBUG(dbgs() << "Multiple exit/exiting blocks in loop and "
740	"multi-exit unrolling not enabled!\n");
741	return false;
742	}
743	}
744	}
745	// Use Scalar Evolution to compute the trip count. This allows more loops to
746	// be unrolled than relying on induction var simplification.
747	if (!SE)
748	return false;
749
750	// Only unroll loops with a computable trip count.
751	// We calculate the backedge count by using getExitCount on the Latch block,
752	// which is proven to be the only exiting block in this loop. This is same as
753	// calculating getBackedgeTakenCount on the loop (which computes SCEV for all
754	// exiting blocks).
755	const SCEV *BECountSC = SE->getExitCount(L, ExitingBlock: Latch);
756	if (isa<SCEVCouldNotCompute>(Val: BECountSC)) {
757	LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
758	return false;
759	}
760
761	unsigned BEWidth = cast<IntegerType>(Val: BECountSC->getType())->getBitWidth();
762
763	// Add 1 since the backedge count doesn't include the first loop iteration.
764	// (Note that overflow can occur, this is handled explicitly below)
765	const SCEV *TripCountSC =
766	SE->getAddExpr(LHS: BECountSC, RHS: SE->getConstant(Ty: BECountSC->getType(), V: `1`));
767	if (isa<SCEVCouldNotCompute>(Val: TripCountSC)) {
768	LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
769	return false;
770	}
771
772	BasicBlock *PreHeader = L->getLoopPreheader();
773	Instruction *PreHeaderBR = PreHeader->getTerminator();
774	SCEVExpander Expander(*SE, "loop-unroll");
775	if (!AllowExpensiveTripCount &&
776	Expander.isHighCostExpansion(Exprs: TripCountSC, L, Budget: SCEVExpansionBudget, TTI,
777	At: PreHeaderBR)) {
778	LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
779	return false;
780	}
781
782	// This constraint lets us deal with an overflowing trip count easily; see the
783	// comment on ModVal below.
784	if (Log2_32(Value: Count) > BEWidth) {
785	LLVM_DEBUG(
786	dbgs()
787	<< "Count failed constraint on overflow trip count calculation.\n");
788	return false;
789	}
790
791	// Loop structure is the following:
792	//
793	// PreHeader
794	// Header
795	// ...
796	// Latch
797	// LatchExit
798
799	BasicBlock *NewPreHeader;
800	BasicBlock NewExit = nullptr*;
801	BasicBlock PrologExit = nullptr*;
802	BasicBlock EpilogPreHeader = nullptr*;
803	BasicBlock PrologPreHeader = nullptr*;
804
805	if (UseEpilogRemainder) {
806	// If epilog remainder
807	// Split PreHeader to insert a branch around loop for unrolling.
808	NewPreHeader = SplitBlock(Old: PreHeader, SplitPt: PreHeader->getTerminator(), DT, LI);
809	NewPreHeader->setName(PreHeader->getName() + ".new");
810	// Split LatchExit to create phi nodes from branch above.
811	NewExit = SplitBlockPredecessors(BB: LatchExit, Preds: {Latch}, Suffix: ".unr-lcssa", DT, LI,
812	MSSAU: nullptr, PreserveLCSSA);
813	// NewExit gets its DebugLoc from LatchExit, which is not part of the
814	// original Loop.
815	// Fix this by setting Loop's DebugLoc to NewExit.
816	auto *NewExitTerminator = NewExit->getTerminator();
817	NewExitTerminator->setDebugLoc(Header->getTerminator()->getDebugLoc());
818	// Split NewExit to insert epilog remainder loop.
819	EpilogPreHeader = SplitBlock(Old: NewExit, SplitPt: NewExitTerminator, DT, LI);
820	EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
821
822	// If the latch exits from multiple level of nested loops, then
823	// by assumption there must be another loop exit which branches to the
824	// outer loop and we must adjust the loop for the newly inserted blocks
825	// to account for the fact that our epilogue is still in the same outer
826	// loop. Note that this leaves loopinfo temporarily out of sync with the
827	// CFG until the actual epilogue loop is inserted.
828	if (auto *ParentL = L->getParentLoop())
829	if (LI->getLoopFor(BB: LatchExit) != ParentL) {
830	LI->removeBlock(BB: NewExit);
831	ParentL->addBasicBlockToLoop(NewBB: NewExit, LI&: *LI);
832	LI->removeBlock(BB: EpilogPreHeader);
833	ParentL->addBasicBlockToLoop(NewBB: EpilogPreHeader, LI&: *LI);
834	}
835
836	} else {
837	// If prolog remainder
838	// Split the original preheader twice to insert prolog remainder loop
839	PrologPreHeader = SplitEdge(From: PreHeader, To: Header, DT, LI);
840	PrologPreHeader->setName(Header->getName() + ".prol.preheader");
841	PrologExit = SplitBlock(Old: PrologPreHeader, SplitPt: PrologPreHeader->getTerminator(),
842	DT, LI);
843	PrologExit->setName(Header->getName() + ".prol.loopexit");
844	// Split PrologExit to get NewPreHeader.
845	NewPreHeader = SplitBlock(Old: PrologExit, SplitPt: PrologExit->getTerminator(), DT, LI);
846	NewPreHeader->setName(PreHeader->getName() + ".new");
847	}
848	// Loop structure should be the following:
849	// Epilog Prolog
850	//
851	// PreHeader PreHeader
852	// NewPreHeader PrologPreHeader
853	// Header PrologExit*
854	// ... NewPreHeader*
855	// Latch Header
856	// NewExit ...*
857	// EpilogPreHeader Latch*
858	// LatchExit LatchExit
859
860	// Calculate conditions for branch around loop for unrolling
861	// in epilog case and around prolog remainder loop in prolog case.
862	// Compute the number of extra iterations required, which is:
863	// extra iterations = run-time trip count % loop unroll factor
864	PreHeaderBR = PreHeader->getTerminator();
865	IRBuilder<> B(PreHeaderBR);
866	Value *TripCount = Expander.expandCodeFor(SH: TripCountSC, Ty: TripCountSC->getType(),
867	I: PreHeaderBR);
868	Value *BECount;
869	// If there are other exits before the latch, that may cause the latch exit
870	// branch to never be executed, and the latch exit count may be poison.
871	// In this case, freeze the TripCount and base BECount on the frozen
872	// TripCount. We will introduce two branches using these values, and it's
873	// important that they see a consistent value (which would not be guaranteed
874	// if were frozen independently.)
875	if ((!OtherExits.empty() \|\| !SE->loopHasNoAbnormalExits(L)) &&
876	!isGuaranteedNotToBeUndefOrPoison(V: TripCount, AC, CtxI: PreHeaderBR, DT)) {
877	TripCount = B.CreateFreeze(V: TripCount);
878	BECount =
879	B.CreateAdd(LHS: TripCount, RHS: Constant::getAllOnesValue(Ty: TripCount->getType()));
880	} else {
881	// If we don't need to freeze, use SCEVExpander for BECount as well, to
882	// allow slightly better value reuse.
883	BECount =
884	Expander.expandCodeFor(SH: BECountSC, Ty: BECountSC->getType(), I: PreHeaderBR);
885	}
886
887	Value * const ModVal = CreateTripRemainder(B, BECount, TripCount, Count);
888
889	Value *BranchVal =
890	UseEpilogRemainder ? B.CreateICmpULT(LHS: BECount,
891	RHS: ConstantInt::get(Ty: BECount->getType(),
892	V: Count - `1`)) :
893	B.CreateIsNotNull(Arg: ModVal, Name: "lcmp.mod");
894	BasicBlock *RemainderLoop =
895	UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
896	BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
897	// Branch to either remainder (extra iterations) loop or unrolling loop.
898	MDNode BranchWeights = nullptr*;
899	if ((OriginalLoopProb.isUnknown() \|\| !UseEpilogRemainder) &&
900	hasBranchWeightMD(I: *Latch->getTerminator())) {
901	// Assume loop is nearly always entered.
902	MDBuilder MDB(B.getContext());
903	BranchWeights = MDB.createBranchWeights(Weights: EpilogHeaderWeights);
904	}
905	CondBrInst *UnrollingLoopGuard =
906	B.CreateCondBr(Cond: BranchVal, True: RemainderLoop, False: UnrollingLoop, BranchWeights);
907	if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
908	// The original loop's first iteration always happens. Compute the
909	// probability of the original loop executing Count-1 iterations after that
910	// to complete the first iteration of the unrolled loop.
911	BranchProbability ProbOne = OriginalLoopProb;
912	BranchProbability ProbRest = ProbOne.pow(N: Count - `1`);
913	setBranchProbability(B: UnrollingLoopGuard, P: ProbRest,
914	/ForFirstTarget=/false);
915	}
916	PreHeaderBR->eraseFromParent();
917	if (DT) {
918	if (UseEpilogRemainder)
919	DT->changeImmediateDominator(BB: EpilogPreHeader, NewBB: PreHeader);
920	else
921	DT->changeImmediateDominator(BB: PrologExit, NewBB: PreHeader);
922	}
923	Function *F = Header->getParent();
924	// Get an ordered list of blocks in the loop to help with the ordering of the
925	// cloned blocks in the prolog/epilog code
926	LoopBlocksDFS LoopBlocks(L);
927	LoopBlocks.perform(LI);
928
929	//
930	// For each extra loop iteration, create a copy of the loop's basic blocks
931	// and generate a condition that branches to the copy depending on the
932	// number of 'left over' iterations.
933	//
934	std::vector<BasicBlock *> NewBlocks;
935	ValueToValueMapTy VMap;
936
937	// Clone all the basic blocks in the loop. If Count is 2, we don't clone
938	// the loop, otherwise we create a cloned loop to execute the extra
939	// iterations. This function adds the appropriate CFG connections.
940	BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
941	BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
942	Loop *remainderLoop =
943	CloneLoopBlocks(L, NewIter: ModVal, UseEpilogRemainder, UnrollRemainder, InsertTop,
944	InsertBot, Preheader: NewPreHeader, NewBlocks, LoopBlocks, VMap, DT,
945	LI, Count, OriginalTripCount, OriginalLoopProb);
946
947	// Insert the cloned blocks into the function.
948	F->splice(ToIt: InsertBot->getIterator(), FromF: F, FromBeginIt: NewBlocks [`0`]->getIterator(), FromEndIt: F->end());
949
950	// Now the loop blocks are cloned and the other exiting blocks from the
951	// remainder are connected to the original Loop's exit blocks. The remaining
952	// work is to update the phi nodes in the original loop, and take in the
953	// values from the cloned region.
954	for (auto *BB : OtherExits) {
955	// Given we preserve LCSSA form, we know that the values used outside the
956	// loop will be used through these phi nodes at the exit blocks that are
957	// transformed below.
958	for (PHINode &PN : BB->phis()) {
959	unsigned oldNumOperands = PN.getNumIncomingValues();
960	// Add the incoming values from the remainder code to the end of the phi
961	// node.
962	for (unsigned i = `0`; i < oldNumOperands; i++){
963	auto *PredBB =PN.getIncomingBlock(i);
964	if (PredBB == Latch)
965	// The latch exit is handled separately, see connectX
966	continue;
967	if (!L->contains(BB: PredBB))
968	// Even if we had dedicated exits, the code above inserted an
969	// extra branch which can reach the latch exit.
970	continue;
971
972	auto *V = PN.getIncomingValue(i);
973	if (Instruction *I = dyn_cast<Instruction>(Val: V))
974	if (L->contains(Inst: I))
975	V = VMap.lookup(Val: I);
976	PN.addIncoming(V, BB: cast<BasicBlock>(Val&: VMap [PredBB]));
977	}
978	}
979	#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
980	for (BasicBlock *SuccBB : successors(BB)) {
981	assert(!(llvm::is_contained(OtherExits, SuccBB) \|\| SuccBB == LatchExit) &&
982	"Breaks the definition of dedicated exits!");
983	}
984	#endif
985	}
986
987	// Update the immediate dominator of the exit blocks and blocks that are
988	// reachable from the exit blocks. This is needed because we now have paths
989	// from both the original loop and the remainder code reaching the exit
990	// blocks. While the IDom of these exit blocks were from the original loop,
991	// now the IDom is the preheader (which decides whether the original loop or
992	// remainder code should run) unless the block still has just the original
993	// predecessor (such as NewExit in the case of an epilog remainder).
994	if (DT && !L->getExitingBlock()) {
995	SmallVector<BasicBlock *, `16`> ChildrenToUpdate;
996	// NB! We have to examine the dom children of all loop blocks, not just
997	// those which are the IDom of the exit blocks. This is because blocks
998	// reachable from the exit blocks can have their IDom as the nearest common
999	// dominator of the exit blocks.
1000	for (auto *BB : L->blocks()) {
1001	auto *DomNodeBB = DT->getNode(BB);
1002	for (auto *DomChild : DomNodeBB->children()) {
1003	auto *DomChildBB = DomChild->getBlock();
1004	if (!L->contains(L: LI->getLoopFor(BB: DomChildBB)) &&
1005	DomChildBB->getUniquePredecessor() != BB)
1006	ChildrenToUpdate.push_back(Elt: DomChildBB);
1007	}
1008	}
1009	for (auto *BB : ChildrenToUpdate)
1010	DT->changeImmediateDominator(BB, NewBB: PreHeader);
1011	}
1012
1013	// Loop structure should be the following:
1014	// Epilog Prolog
1015	//
1016	// PreHeader PreHeader
1017	// NewPreHeader PrologPreHeader
1018	// Header PrologHeader
1019	// ... ...
1020	// Latch PrologLatch
1021	// NewExit PrologExit
1022	// EpilogPreHeader NewPreHeader
1023	// EpilogHeader Header
1024	// ... ...
1025	// EpilogLatch Latch
1026	// LatchExit LatchExit
1027
1028	// Rewrite the cloned instruction operands to use the values created when the
1029	// clone is created.
1030	for (BasicBlock *BB : NewBlocks) {
1031	Module *M = BB->getModule();
1032	for (Instruction &I : *BB) {
1033	RemapInstruction(I: &I, VM&: VMap,
1034	Flags: RF_NoModuleLevelChanges \| RF_IgnoreMissingLocals);
1035	RemapDbgRecordRange(M, Range: I.getDbgRecordRange(), VM&: VMap,
1036	Flags: RF_NoModuleLevelChanges \| RF_IgnoreMissingLocals);
1037	}
1038	}
1039
1040	if (UseEpilogRemainder) {
1041	// Connect the epilog code to the original loop and update the
1042	// PHI functions.
1043	ConnectEpilog(L, ModVal, NewExit, Exit: LatchExit, PreHeader, EpilogPreHeader,
1044	NewPreHeader, VMap, DT, LI, PreserveLCSSA, SE&: SE, Count, AC&: AC,
1045	OriginalLoopProb);
1046
1047	// Update counter in loop for unrolling.
1048	// Use an incrementing IV. Pre-incr/post-incr is backedge/trip count.
1049	// Subtle: TestVal can be 0 if we wrapped when computing the trip count,
1050	// thus we must compare the post-increment (wrapping) value.
1051	IRBuilder<> B2(NewPreHeader->getTerminator());
1052	Value *TestVal = B2.CreateSub(LHS: TripCount, RHS: ModVal, Name: "unroll_iter");
1053	CondBrInst *LatchBR = cast<CondBrInst>(Val: Latch->getTerminator());
1054	PHINode *NewIdx = PHINode::Create(Ty: TestVal->getType(), NumReservedValues: `2`, NameStr: "niter");
1055	NewIdx->insertBefore(InsertPos: Header->getFirstNonPHIIt());
1056	B2.SetInsertPoint(LatchBR);
1057	auto *Zero = ConstantInt::get(Ty: NewIdx->getType(), V: `0`);
1058	auto *One = ConstantInt::get(Ty: NewIdx->getType(), V: `1`);
1059	Value *IdxNext = B2.CreateAdd(LHS: NewIdx, RHS: One, Name: NewIdx->getName() + ".next");
1060	auto Pred = LatchBR->getSuccessor(i: `0`) == Header ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
1061	Value *IdxCmp = B2.CreateICmp(P: Pred, LHS: IdxNext, RHS: TestVal, Name: NewIdx->getName() + ".ncmp");
1062	NewIdx->addIncoming(V: Zero, BB: NewPreHeader);
1063	NewIdx->addIncoming(V: IdxNext, BB: Latch);
1064	LatchBR->setCondition(IdxCmp);
1065	} else {
1066	// Connect the prolog code to the original loop and update the
1067	// PHI functions.
1068	ConnectProlog(L, BECount, Count, PrologExit, OriginalLoopLatchExit: LatchExit, PreHeader,
1069	NewPreHeader, VMap, DT, LI, PreserveLCSSA, SE&: *SE);
1070	}
1071
1072	// If this loop is nested, then the loop unroller changes the code in the any
1073	// of its parent loops, so the Scalar Evolution pass needs to be run again.
1074	SE->forgetTopmostLoop(L);
1075
1076	// Verify that the Dom Tree and Loop Info are correct.
1077	#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
1078	if (DT) {
1079	assert(DT->verify(DominatorTree::VerificationLevel::Full));
1080	LI->verify(*DT);
1081	}
1082	#endif
1083
1084	// For unroll factor 2 remainder loop will have 1 iteration.
1085	if (Count == `2` && DT && LI && SE) {
1086	// TODO: This code could probably be pulled out into a helper function
1087	// (e.g. breakLoopBackedgeAndSimplify) and reused in loop-deletion.
1088	BasicBlock *RemainderLatch = remainderLoop->getLoopLatch();
1089	assert(RemainderLatch);
1090	SmallVector<BasicBlock *> RemainderBlocks(remainderLoop->getBlocks());
1091	breakLoopBackedge(L: remainderLoop, DT&: DT, SE&: SE, LI&: LI, MSSA: nullptr*);
1092	remainderLoop = nullptr;
1093
1094	// Simplify loop values after breaking the backedge
1095	const DataLayout &DL = L->getHeader()->getDataLayout();
1096	SmallVector<WeakTrackingVH, `16`> DeadInsts;
1097	for (BasicBlock *BB : RemainderBlocks) {
1098	for (Instruction &Inst : llvm::make_early_inc_range(Range&: *BB)) {
1099	if (Value V = simplifyInstruction(I: &Inst, Q: {DL, nullptr*, DT, AC}))
1100	if (LI->replacementPreservesLCSSAForm(From: &Inst, To: V))
1101	Inst.replaceAllUsesWith(V);
1102	if (isInstructionTriviallyDead(I: &Inst))
1103	DeadInsts.emplace_back(Args: &Inst);
1104	}
1105	// We can't do recursive deletion until we're done iterating, as we might
1106	// have a phi which (potentially indirectly) uses instructions later in
1107	// the block we're iterating through.
1108	RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
1109	}
1110
1111	// Merge latch into exit block.
1112	auto *ExitBB = RemainderLatch->getSingleSuccessor();
1113	assert(ExitBB && "required after breaking cond br backedge");
1114	DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
1115	MergeBlockIntoPredecessor(BB: ExitBB, DTU: &DTU, LI);
1116	}
1117
1118	// Canonicalize to LoopSimplifyForm both original and remainder loops. We
1119	// cannot rely on the LoopUnrollPass to do this because it only does
1120	// canonicalization for parent/subloops and not the sibling loops.
1121	if (OtherExits.size() > `0`) {
1122	// Generate dedicated exit blocks for the original loop, to preserve
1123	// LoopSimplifyForm.
1124	formDedicatedExitBlocks(L, DT, LI, MSSAU: nullptr, PreserveLCSSA);
1125	// Generate dedicated exit blocks for the remainder loop if one exists, to
1126	// preserve LoopSimplifyForm.
1127	if (remainderLoop)
1128	formDedicatedExitBlocks(L: remainderLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA);
1129	}
1130
1131	auto UnrollResult = LoopUnrollResult::Unmodified;
1132	if (remainderLoop && UnrollRemainder) {
1133	LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
1134	UnrollLoopOptions ULO;
1135	ULO.Count = Count - `1`;
1136	ULO.Force = false;
1137	ULO.Runtime = false;
1138	ULO.AllowExpensiveTripCount = false;
1139	ULO.UnrollRemainder = false;
1140	ULO.ForgetAllSCEV = ForgetAllSCEV;
1141	assert(!getLoopConvergenceHeart(L) &&
1142	"A loop with a convergence heart does not allow runtime unrolling.");
1143	UnrollResult = UnrollLoop(L: remainderLoop, ULO, LI, SE, DT, AC, TTI,
1144	/ORE/ nullptr, PreserveLCSSA);
1145	}
1146
1147	if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
1148	*ResultLoop = remainderLoop;
1149	NumRuntimeUnrolled ++;
1150	return true;
1151	}
1152

Browse the source code of llvm_projects/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp