LoopRotationUtils.cpp source code [llvm_projects/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp]

1	//===----------------- LoopRotationUtils.cpp -----------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file provides utilities to convert a loop into a loop with bottom test.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "llvm/Transforms/Utils/LoopRotationUtils.h"
14	#include "llvm/ADT/Statistic.h"
15	#include "llvm/Analysis/AssumptionCache.h"
16	#include "llvm/Analysis/CodeMetrics.h"
17	#include "llvm/Analysis/DomTreeUpdater.h"
18	#include "llvm/Analysis/InstructionSimplify.h"
19	#include "llvm/Analysis/LoopInfo.h"
20	#include "llvm/Analysis/MemorySSA.h"
21	#include "llvm/Analysis/MemorySSAUpdater.h"
22	#include "llvm/Analysis/ScalarEvolution.h"
23	#include "llvm/Analysis/ValueTracking.h"
24	#include "llvm/IR/CFG.h"
25	#include "llvm/IR/DebugInfo.h"
26	#include "llvm/IR/Dominators.h"
27	#include "llvm/IR/IntrinsicInst.h"
28	#include "llvm/IR/MDBuilder.h"
29	#include "llvm/IR/ProfDataUtils.h"
30	#include "llvm/Support/CommandLine.h"
31	#include "llvm/Support/Debug.h"
32	#include "llvm/Support/raw_ostream.h"
33	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
34	#include "llvm/Transforms/Utils/Cloning.h"
35	#include "llvm/Transforms/Utils/Local.h"
36	#include "llvm/Transforms/Utils/SSAUpdater.h"
37	#include "llvm/Transforms/Utils/ValueMapper.h"
38	using namespace llvm;
39
40	#define DEBUG_TYPE "loop-rotate"
41
42	STATISTIC(NumNotRotatedDueToHeaderSize,
43	"Number of loops not rotated due to the header size");
44	STATISTIC(NumInstrsHoisted,
45	"Number of instructions hoisted into loop preheader");
46	STATISTIC(NumInstrsDuplicated,
47	"Number of instructions cloned into loop preheader");
48
49	// Probability that a rotated loop has zero trip count / is never entered.
50	static constexpr uint32_t ZeroTripCountWeights[] = {`1`, `127`};
51
52	namespace {
53	/// A simple loop rotation transformation.
54	class LoopRotate {
55	const unsigned MaxHeaderSize;
56	LoopInfo *LI;
57	const TargetTransformInfo *TTI;
58	AssumptionCache *AC;
59	DominatorTree *DT;
60	ScalarEvolution *SE;
61	MemorySSAUpdater *MSSAU;
62	const SimplifyQuery &SQ;
63	bool RotationOnly;
64	bool IsUtilMode;
65	bool PrepareForLTO;
66
67	public:
68	LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
69	const TargetTransformInfo TTI, AssumptionCache AC,
70	DominatorTree DT, ScalarEvolution SE, MemorySSAUpdater *MSSAU,
71	const SimplifyQuery &SQ, bool RotationOnly, bool IsUtilMode,
72	bool PrepareForLTO)
73	: MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
74	MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly),
75	IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO) {}
76	bool processLoop(Loop *L);
77
78	private:
79	bool rotateLoop(Loop L, bool* SimplifiedLatch);
80	bool simplifyLoopLatch(Loop *L);
81	};
82	} // end anonymous namespace
83
84	/// Insert (K, V) pair into the ValueToValueMap, and verify the key did not
85	/// previously exist in the map, and the value was inserted.
86	static void InsertNewValueIntoMap(ValueToValueMapTy &VM, Value K, Value V) {
87	bool Inserted = VM.insert(KV: {K, V}).second;
88	assert(Inserted);
89	(void)Inserted;
90	}
91	/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
92	/// old header into the preheader. If there were uses of the values produced by
93	/// these instruction that were outside of the loop, we have to insert PHI nodes
94	/// to merge the two values. Do this now.
95	static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
96	BasicBlock *OrigPreheader,
97	ValueToValueMapTy &ValueMap,
98	ScalarEvolution *SE,
99	SmallVectorImpl<PHINode> InsertedPHIs) {
100	// Remove PHI node entries that are no longer live.
101	BasicBlock::iterator I, E = OrigHeader->end();
102	for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(Val&: I); ++I)
103	PN->removeIncomingValue(BB: OrigPreheader);
104
105	// Now fix up users of the instructions in OrigHeader, inserting PHI nodes
106	// as necessary.
107	SSAUpdater SSA(InsertedPHIs);
108	for (I = OrigHeader->begin(); I != E; ++I) {
109	Value OrigHeaderVal = &I;
110
111	// If there are no uses of the value (e.g. because it returns void), there
112	// is nothing to rewrite.
113	if (OrigHeaderVal->use_empty())
114	continue;
115
116	Value *OrigPreHeaderVal = ValueMap.lookup(Val: OrigHeaderVal);
117
118	// The value now exits in two versions: the initial value in the preheader
119	// and the loop "next" value in the original header.
120	SSA.Initialize(Ty: OrigHeaderVal->getType(), Name: OrigHeaderVal->getName());
121	// Force re-computation of OrigHeaderVal, as some users now need to use the
122	// new PHI node.
123	if (SE)
124	SE->forgetValue(V: OrigHeaderVal);
125	SSA.AddAvailableValue(BB: OrigHeader, V: OrigHeaderVal);
126	SSA.AddAvailableValue(BB: OrigPreheader, V: OrigPreHeaderVal);
127
128	// Visit each use of the OrigHeader instruction.
129	for (Use &U : llvm::make_early_inc_range(Range: OrigHeaderVal->uses())) {
130	// SSAUpdater can't handle a non-PHI use in the same block as an
131	// earlier def. We can easily handle those cases manually.
132	Instruction *UserInst = cast<Instruction>(Val: U.getUser());
133	if (!isa<PHINode>(Val: UserInst)) {
134	BasicBlock *UserBB = UserInst->getParent();
135
136	// The original users in the OrigHeader are already using the
137	// original definitions.
138	if (UserBB == OrigHeader)
139	continue;
140
141	// Users in the OrigPreHeader need to use the value to which the
142	// original definitions are mapped.
143	if (UserBB == OrigPreheader) {
144	U = OrigPreHeaderVal;
145	continue;
146	}
147	}
148
149	// Anything else can be handled by SSAUpdater.
150	SSA.RewriteUse(U);
151	}
152
153	// Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
154	// intrinsics.
155	SmallVector<DbgVariableRecord *, `1`> DbgVariableRecords;
156	llvm::findDbgValues(V: OrigHeaderVal, DbgVariableRecords);
157
158	for (DbgVariableRecord *DVR : DbgVariableRecords) {
159	// The original users in the OrigHeader are already using the original
160	// definitions.
161	BasicBlock *UserBB = DVR->getMarker()->getParent();
162	if (UserBB == OrigHeader)
163	continue;
164
165	// Users in the OrigPreHeader need to use the value to which the
166	// original definitions are mapped and anything else can be handled by
167	// the SSAUpdater. To avoid adding PHINodes, check if the value is
168	// available in UserBB, if not substitute poison.
169	Value *NewVal;
170	if (UserBB == OrigPreheader)
171	NewVal = OrigPreHeaderVal;
172	else if (SSA.HasValueForBlock(BB: UserBB))
173	NewVal = SSA.GetValueInMiddleOfBlock(BB: UserBB);
174	else
175	NewVal = PoisonValue::get(T: OrigHeaderVal->getType());
176	DVR->replaceVariableLocationOp(OldValue: OrigHeaderVal, NewValue: NewVal);
177	}
178	}
179	}
180
181	// Assuming both header and latch are exiting, look for a phi which is only
182	// used outside the loop (via a LCSSA phi) in the exit from the header.
183	// This means that rotating the loop can remove the phi.
184	static bool profitableToRotateLoopExitingLatch(Loop *L) {
185	BasicBlock *Header = L->getHeader();
186	CondBrInst *BI = dyn_cast<CondBrInst>(Val: Header->getTerminator());
187	BasicBlock *HeaderExit = BI->getSuccessor(i: `0`);
188	if (L->contains(BB: HeaderExit))
189	HeaderExit = BI->getSuccessor(i: `1`);
190
191	for (auto &Phi : Header->phis()) {
192	// Look for uses of this phi in the loop/via exits other than the header.
193	if (llvm::any_of(Range: Phi.users(), P: [HeaderExit](const User *U) {
194	return cast<Instruction>(Val: U)->getParent() != HeaderExit;
195	}))
196	continue;
197	return true;
198	}
199	return false;
200	}
201
202	static void updateBranchWeights(CondBrInst &PreHeaderBI, CondBrInst &LoopBI,
203	bool HasConditionalPreHeader,
204	bool SuccsSwapped) {
205	MDNode *WeightMD = getBranchWeightMDNode(I: PreHeaderBI);
206	if (WeightMD == nullptr)
207	return;
208
209	// LoopBI should currently be a clone of PreHeaderBI with the same
210	// metadata. But we double check to make sure we don't have a degenerate case
211	// where instsimplify changed the instructions.
212	if (WeightMD != getBranchWeightMDNode(I: LoopBI))
213	return;
214
215	SmallVector<uint32_t, `2`> Weights;
216	extractFromBranchWeightMD32(ProfileData: WeightMD, Weights);
217	if (Weights.size() != `2`)
218	return;
219	uint32_t OrigLoopExitWeight = Weights [`0`];
220	uint32_t OrigLoopBackedgeWeight = Weights [`1`];
221
222	if (SuccsSwapped)
223	std::swap(a&: OrigLoopExitWeight, b&: OrigLoopBackedgeWeight);
224
225	// Update branch weights. Consider the following edge-counts:
226	//
227	// \| \|-------- \|
228	// V V \| V
229	// Br i1 ... \| Br i1 ...
230	// \| \| \| \| \|
231	// x\| y\| \| becomes: \| y0\| \|-----
232	// V V \| \| V V \|
233	// Exit Loop \| \| Loop \|
234	// \| \| \| Br i1 ... \|
235	// ----- \| \| \| \|
236	// x0\| x1\| y1 \| \|
237	// V V ----
238	// Exit
239	//
240	// The following must hold:
241	// - x == x0 + x1 # counts to "exit" must stay the same.
242	// - y0 == x - x0 == x1 # how often loop was entered at all.
243	// - y1 == y - y0 # How often loop was repeated (after first iter.).
244	//
245	// We cannot generally deduce how often we had a zero-trip count loop so we
246	// have to make a guess for how to distribute x among the new x0 and x1.
247
248	uint32_t ExitWeight0; // aka x0
249	uint32_t ExitWeight1; // aka x1
250	uint32_t EnterWeight; // aka y0
251	uint32_t LoopBackWeight; // aka y1
252	if (OrigLoopExitWeight > `0` && OrigLoopBackedgeWeight > `0`) {
253	ExitWeight0 = `0`;
254	if (HasConditionalPreHeader) {
255	// Here we cannot know how many 0-trip count loops we have, so we guess:
256	if (OrigLoopBackedgeWeight >= OrigLoopExitWeight) {
257	// If the loop count is bigger than the exit count then we set
258	// probabilities as if 0-trip count nearly never happens.
259	ExitWeight0 = ZeroTripCountWeights[`0`];
260	// Scale up counts if necessary so we can match `ZeroTripCountWeights`
261	// for the `ExitWeight0`:`ExitWeight1` (aka `x0`:`x1` ratio`) ratio.
262	while (OrigLoopExitWeight < ZeroTripCountWeights[`1`] + ExitWeight0) {
263	// ... but don't overflow.
264	uint32_t const HighBit = uint32_t{`1`} << (sizeof(uint32_t) * `8` - `1`);
265	if ((OrigLoopBackedgeWeight & HighBit) != `0` \|\|
266	(OrigLoopExitWeight & HighBit) != `0`)
267	break;
268	OrigLoopBackedgeWeight <<= `1`;
269	OrigLoopExitWeight <<= `1`;
270	}
271	} else {
272	// If there's a higher exit-count than backedge-count then we set
273	// probabilities as if there are only 0-trip and 1-trip cases.
274	ExitWeight0 = OrigLoopExitWeight - OrigLoopBackedgeWeight;
275	}
276	} else {
277	// Theoretically, if the loop body must be executed at least once, the
278	// backedge count must be not less than exit count. However the branch
279	// weight collected by sampling-based PGO may be not very accurate due to
280	// sampling. Therefore this workaround is required here to avoid underflow
281	// of unsigned in following update of branch weight.
282	if (OrigLoopExitWeight > OrigLoopBackedgeWeight)
283	OrigLoopBackedgeWeight = OrigLoopExitWeight;
284	}
285	assert(OrigLoopExitWeight >= ExitWeight0 && "Bad branch weight");
286	ExitWeight1 = OrigLoopExitWeight - ExitWeight0;
287	EnterWeight = ExitWeight1;
288	assert(OrigLoopBackedgeWeight >= EnterWeight && "Bad branch weight");
289	LoopBackWeight = OrigLoopBackedgeWeight - EnterWeight;
290	} else if (OrigLoopExitWeight == `0`) {
291	if (OrigLoopBackedgeWeight == `0`) {
292	// degenerate case... keep everything zero...
293	ExitWeight0 = `0`;
294	ExitWeight1 = `0`;
295	EnterWeight = `0`;
296	LoopBackWeight = `0`;
297	} else {
298	// Special case "LoopExitWeight == 0" weights which behaves like an
299	// endless where we don't want loop-enttry (y0) to be the same as
300	// loop-exit (x1).
301	ExitWeight0 = `0`;
302	ExitWeight1 = `0`;
303	EnterWeight = `1`;
304	LoopBackWeight = OrigLoopBackedgeWeight;
305	}
306	} else {
307	// loop is never entered.
308	assert(OrigLoopBackedgeWeight == `0` && "remaining case is backedge zero");
309	ExitWeight0 = `1`;
310	ExitWeight1 = `1`;
311	EnterWeight = `0`;
312	LoopBackWeight = `0`;
313	}
314
315	const uint32_t LoopBIWeights[] = {
316	SuccsSwapped ? LoopBackWeight : ExitWeight1,
317	SuccsSwapped ? ExitWeight1 : LoopBackWeight,
318	};
319	setBranchWeights(I&: LoopBI, Weights: LoopBIWeights, /IsExpected=/false);
320	if (HasConditionalPreHeader) {
321	const uint32_t PreHeaderBIWeights[] = {
322	SuccsSwapped ? EnterWeight : ExitWeight0,
323	SuccsSwapped ? ExitWeight0 : EnterWeight,
324	};
325	setBranchWeights(I&: PreHeaderBI, Weights: PreHeaderBIWeights, /IsExpected=/false);
326	}
327	}
328
329	/// Rotate loop LP. Return true if the loop is rotated.
330	///
331	/// \param SimplifiedLatch is true if the latch was just folded into the final
332	/// loop exit. In this case we may want to rotate even though the new latch is
333	/// now an exiting branch. This rotation would have happened had the latch not
334	/// been simplified. However, if SimplifiedLatch is false, then we avoid
335	/// rotating loops in which the latch exits to avoid excessive or endless
336	/// rotation. LoopRotate should be repeatable and converge to a canonical
337	/// form. This property is satisfied because simplifying the loop latch can only
338	/// happen once across multiple invocations of the LoopRotate pass.
339	bool LoopRotate::rotateLoop(Loop L, bool* SimplifiedLatch) {
340	// If the loop has only one block then there is not much to rotate.
341	if (L->getBlocks().size() == `1`)
342	return false;
343
344	bool Rotated = false;
345	BasicBlock *OrigHeader = L->getHeader();
346	BasicBlock *OrigLatch = L->getLoopLatch();
347
348	CondBrInst *BI = dyn_cast<CondBrInst>(Val: OrigHeader->getTerminator());
349	if (!BI)
350	return Rotated;
351
352	// If the loop header is not one of the loop exiting blocks then
353	// either this loop is already rotated or it is not
354	// suitable for loop rotation transformations.
355	if (!L->isLoopExiting(BB: OrigHeader))
356	return Rotated;
357
358	// If the loop latch already contains a branch that leaves the loop then the
359	// loop is already rotated.
360	if (!OrigLatch)
361	return Rotated;
362
363	// Rotate if the loop latch was just simplified. Or if it makes the loop exit
364	// count computable. Or if we think it will be profitable.
365	if (L->isLoopExiting(BB: OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
366	!profitableToRotateLoopExitingLatch(L))
367	return Rotated;
368
369	// Check size of original header and reject loop if it is very big or we can't
370	// duplicate blocks inside it.
371	{
372	SmallPtrSet<const Value *, `32`> EphValues;
373	CodeMetrics::collectEphemeralValues(L, AC, EphValues);
374
375	CodeMetrics Metrics;
376	Metrics.analyzeBasicBlock(BB: OrigHeader, TTI: *TTI, EphValues, PrepareForLTO);
377	if (Metrics.notDuplicatable) {
378	LLVM_DEBUG(
379	dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
380	<< " instructions: ";
381	L->dump());
382	return Rotated;
383	}
384	if (Metrics.Convergence != ConvergenceKind::None) {
385	LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
386	"instructions: ";
387	L->dump());
388	return Rotated;
389	}
390	if (!Metrics.NumInsts.isValid()) {
391	LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions"
392	" with invalid cost: ";
393	L->dump());
394	return Rotated;
395	}
396	if (Metrics.NumInsts > MaxHeaderSize) {
397	LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains "
398	<< Metrics.NumInsts
399	<< " instructions, which is more than the threshold ("
400	<< MaxHeaderSize << " instructions): ";
401	L->dump());
402	++NumNotRotatedDueToHeaderSize;
403	return Rotated;
404	}
405
406	// When preparing for LTO, avoid rotating loops with calls that could be
407	// inlined during the LTO stage.
408	if (PrepareForLTO && Metrics.NumInlineCandidates > `0`)
409	return Rotated;
410	}
411
412	// Now, this loop is suitable for rotation.
413	BasicBlock *OrigPreheader = L->getLoopPreheader();
414
415	// If the loop could not be converted to canonical form, it must have an
416	// indirectbr in it, just give up.
417	if (!OrigPreheader \|\| !L->hasDedicatedExits())
418	return Rotated;
419
420	// Anything ScalarEvolution may know about this loop or the PHI nodes
421	// in its header will soon be invalidated. We should also invalidate
422	// all outer loops because insertion and deletion of blocks that happens
423	// during the rotation may violate invariants related to backedge taken
424	// infos in them.
425	if (SE) {
426	SE->forgetTopmostLoop(L);
427	// We may hoist some instructions out of loop. In case if they were cached
428	// as "loop variant" or "loop computable", these caches must be dropped.
429	// We also may fold basic blocks, so cached block dispositions also need
430	// to be dropped.
431	SE->forgetBlockAndLoopDispositions();
432	}
433
434	LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
435	if (MSSAU && VerifyMemorySSA)
436	MSSAU->getMemorySSA()->verifyMemorySSA();
437
438	// Find new Loop header. NewHeader is a Header's one and only successor
439	// that is inside loop. Header's other successor is outside the
440	// loop. Otherwise loop is not suitable for rotation.
441	BasicBlock *Exit = BI->getSuccessor(i: `0`);
442	BasicBlock *NewHeader = BI->getSuccessor(i: `1`);
443	bool BISuccsSwapped = L->contains(BB: Exit);
444	if (BISuccsSwapped)
445	std::swap(a&: Exit, b&: NewHeader);
446	assert(NewHeader && "Unable to determine new loop header");
447	assert(L->contains(NewHeader) && !L->contains(Exit) &&
448	"Unable to determine loop header and exit blocks");
449
450	// This code assumes that the new header has exactly one predecessor.
451	// Remove any single-entry PHI nodes in it.
452	assert(NewHeader->getSinglePredecessor() &&
453	"New header doesn't have one pred!");
454	FoldSingleEntryPHINodes(BB: NewHeader);
455
456	// Begin by walking OrigHeader and populating ValueMap with an entry for
457	// each Instruction.
458	BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
459	ValueToValueMapTy ValueMap, ValueMapMSSA;
460
461	// For PHI nodes, the value available in OldPreHeader is just the
462	// incoming value from OldPreHeader.
463	for (; PHINode *PN = dyn_cast<PHINode>(Val&: I); ++I)
464	InsertNewValueIntoMap(VM&: ValueMap, K: PN,
465	V: PN->getIncomingValueForBlock(BB: OrigPreheader));
466
467	// For the rest of the instructions, either hoist to the OrigPreheader if
468	// possible or create a clone in the OldPreHeader if not.
469	Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
470
471	// Record all debug records preceding LoopEntryBranch to avoid
472	// duplication.
473	using DbgHash =
474	std::pair<std::pair<hash_code, DILocalVariable >, DIExpression >;
475	auto makeHash = [](const DbgVariableRecord *D) -> DbgHash {
476	auto VarLocOps = D->location_ops();
477	return {{hash_combine_range(R&: VarLocOps), D->getVariable()},
478	D->getExpression()};
479	};
480
481	SmallDenseSet<DbgHash, `8`> DbgRecords;
482	// Build DbgVariableRecord hashes for DbgVariableRecords attached to the
483	// terminator.
484	for (const DbgVariableRecord &DVR :
485	filterDbgVars(R: OrigPreheader->getTerminator()->getDbgRecordRange()))
486	DbgRecords.insert(V: makeHash (&DVR));
487
488	// Remember the local noalias scope declarations in the header. After the
489	// rotation, they must be duplicated and the scope must be cloned. This
490	// avoids unwanted interaction across iterations.
491	SmallVector<NoAliasScopeDeclInst *, `6`> NoAliasDeclInstructions;
492	for (Instruction &I : *OrigHeader)
493	if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I))
494	NoAliasDeclInstructions.push_back(Elt: Decl);
495
496	Module *M = OrigHeader->getModule();
497
498	// Track the next DbgRecord to clone. If we have a sequence where an
499	// instruction is hoisted instead of being cloned:
500	// DbgRecord blah
501	// %foo = add i32 0, 0
502	// DbgRecord xyzzy
503	// %bar = call i32 @foobar()
504	// where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once
505	// attached to %foo, then when %foo his hoisted it will "fall down" onto the
506	// function call:
507	// DbgRecord blah
508	// DbgRecord xyzzy
509	// %bar = call i32 @foobar()
510	// causing it to appear attached to the call too.
511	//
512	// To avoid this, cloneDebugInfoFrom takes an optional "start cloning from
513	// here" position to account for this behaviour. We point it at any
514	// DbgRecords on the next instruction, here labelled xyzzy, before we hoist
515	// %foo. Later, we only only clone DbgRecords from that position (xyzzy)
516	// onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as
517	// a range because it gives us a natural way of testing whether
518	// there were DbgRecords on the next instruction before we hoisted things).
519	iterator_range<DbgRecord::self_iterator> NextDbgInsts =
520	(I != E) ? I ->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange();
521
522	while (I != E) {
523	Instruction Inst = &I ++;
524
525	// If the instruction's operands are invariant and it doesn't read or write
526	// memory, then it is safe to hoist. Doing this doesn't change the order of
527	// execution in the preheader, but does prevent the instruction from
528	// executing in each iteration of the loop. This means it is safe to hoist
529	// something that might trap, but isn't safe to hoist something that reads
530	// memory (without proving that the loop doesn't write).
531	if (L->hasLoopInvariantOperands(I: Inst) && !Inst->mayReadFromMemory() &&
532	!Inst->mayWriteToMemory() && !Inst->isTerminator() &&
533	!isa<AllocaInst>(Val: Inst) &&
534	// It is not safe to hoist the value of these instructions in
535	// coroutines, as the addresses of otherwise eligible variables (e.g.
536	// thread-local variables and errno) may change if the coroutine is
537	// resumed in a different thread.Therefore, we disable this
538	// optimization for correctness. However, this may block other correct
539	// optimizations.
540	// FIXME: This should be reverted once we have a better model for
541	// memory access in coroutines.
542	!Inst->getFunction()->isPresplitCoroutine()) {
543
544	if (!NextDbgInsts.empty()) {
545	auto DbgValueRange =
546	LoopEntryBranch->cloneDebugInfoFrom(From: Inst, FromHere: NextDbgInsts.begin());
547	RemapDbgRecordRange(M, Range: DbgValueRange, VM&: ValueMap,
548	Flags: RF_NoModuleLevelChanges \| RF_IgnoreMissingLocals);
549	// Erase anything we've seen before.
550	for (DbgVariableRecord &DVR :
551	make_early_inc_range(Range: filterDbgVars(R: DbgValueRange)))
552	if (DbgRecords.count(V: makeHash (&DVR)))
553	DVR.eraseFromParent();
554	}
555
556	NextDbgInsts = I ->getDbgRecordRange();
557
558	Inst->moveBefore(InsertPos: LoopEntryBranch->getIterator());
559
560	++NumInstrsHoisted;
561	continue;
562	}
563
564	// Otherwise, create a duplicate of the instruction.
565	Instruction *C = Inst->clone();
566	if (const DebugLoc &DL = C->getDebugLoc())
567	mapAtomInstance(DL, VMap&: ValueMap);
568
569	C->insertBefore(InsertPos: LoopEntryBranch->getIterator());
570
571	++NumInstrsDuplicated;
572
573	if (!NextDbgInsts.empty()) {
574	auto Range = C->cloneDebugInfoFrom(From: Inst, FromHere: NextDbgInsts.begin());
575	RemapDbgRecordRange(M, Range, VM&: ValueMap,
576	Flags: RF_NoModuleLevelChanges \| RF_IgnoreMissingLocals);
577	NextDbgInsts = DbgMarker::getEmptyDbgRecordRange();
578	// Erase anything we've seen before.
579	for (DbgVariableRecord &DVR : make_early_inc_range(Range: filterDbgVars(R: Range)))
580	if (DbgRecords.count(V: makeHash (&DVR)))
581	DVR.eraseFromParent();
582	}
583
584	// Eagerly remap the operands of the instruction.
585	RemapInstruction(I: C, VM&: ValueMap,
586	Flags: RF_NoModuleLevelChanges \| RF_IgnoreMissingLocals);
587
588	// With the operands remapped, see if the instruction constant folds or is
589	// otherwise simplifyable. This commonly occurs because the entry from PHI
590	// nodes allows icmps and other instructions to fold.
591	Value *V = simplifyInstruction(I: C, Q: SQ);
592	if (V && LI->replacementPreservesLCSSAForm(From: C, To: V)) {
593	// If so, then delete the temporary instruction and stick the folded value
594	// in the map.
595	InsertNewValueIntoMap(VM&: ValueMap, K: Inst, V);
596	if (!C->mayHaveSideEffects()) {
597	C->eraseFromParent();
598	C = nullptr;
599	}
600	} else {
601	InsertNewValueIntoMap(VM&: ValueMap, K: Inst, V: C);
602	}
603	if (C) {
604	// Otherwise, stick the new instruction into the new block!
605	C->setName(Inst->getName());
606
607	if (auto *II = dyn_cast<AssumeInst>(Val: C))
608	AC->registerAssumption(CI: II);
609	// MemorySSA cares whether the cloned instruction was inserted or not, and
610	// not whether it can be remapped to a simplified value.
611	if (MSSAU)
612	InsertNewValueIntoMap(VM&: ValueMapMSSA, K: Inst, V: C);
613	}
614	}
615
616	if (!NoAliasDeclInstructions.empty()) {
617	// There are noalias scope declarations:
618	// (general):
619	// Original: OrigPre { OrigHeader NewHeader ... Latch }
620	// after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader }
621	//
622	// with D: llvm.experimental.noalias.scope.decl,
623	// U: !noalias or !alias.scope depending on D
624	// ... { D U1 U2 } can transform into:
625	// (0) : ... { D U1 U2 } // no relevant rotation for this part
626	// (1) : ... D' { U1 U2 D } // D is part of OrigHeader
627	// (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader
628	//
629	// We now want to transform:
630	// (1) -> : ... D' { D U1 U2 D'' }
631	// (2) -> : ... D' U1' { D U2 D'' U1'' }
632	// D: original llvm.experimental.noalias.scope.decl
633	// D', U1': duplicate with replaced scopes
634	// D'', U1'': different duplicate with replaced scopes
635	// This ensures a safe fallback to 'may_alias' introduced by the rotate,
636	// as U1'' and U1' scopes will not be compatible wrt to the local restrict
637
638	// Clone the llvm.experimental.noalias.decl again for the NewHeader.
639	BasicBlock::iterator NewHeaderInsertionPoint =
640	NewHeader->getFirstNonPHIIt();
641	for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) {
642	LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:"
643	<< *NAD << "\n");
644	Instruction *NewNAD = NAD->clone();
645	NewNAD->insertBefore(BB&: *NewHeader, InsertPos: NewHeaderInsertionPoint);
646	}
647
648	// Scopes must now be duplicated, once for OrigHeader and once for
649	// OrigPreHeader'.
650	{
651	auto &Context = NewHeader->getContext();
652
653	SmallVector<MDNode *, `8`> NoAliasDeclScopes;
654	for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions)
655	NoAliasDeclScopes.push_back(Elt: NAD->getScopeList());
656
657	LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n");
658	cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, NewBlocks: {OrigHeader}, Context,
659	Ext: "h.rot");
660	LLVM_DEBUG(OrigHeader->dump());
661
662	// Keep the compile time impact low by only adapting the inserted block
663	// of instructions in the OrigPreHeader. This might result in slightly
664	// more aliasing between these instructions and those that were already
665	// present, but it will be much faster when the original PreHeader is
666	// large.
667	LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n");
668	auto *FirstDecl =
669	cast<Instruction>(Val&: ValueMap [*NoAliasDeclInstructions.begin()]);
670	auto *LastInst = &OrigPreheader->back();
671	cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, IStart: FirstDecl, IEnd: LastInst,
672	Context, Ext: "pre.rot");
673	LLVM_DEBUG(OrigPreheader->dump());
674
675	LLVM_DEBUG(dbgs() << " Updated NewHeader:\n");
676	LLVM_DEBUG(NewHeader->dump());
677	}
678	}
679
680	// Along with all the other instructions, we just cloned OrigHeader's
681	// terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
682	// successors by duplicating their incoming values for OrigHeader.
683	for (BasicBlock *SuccBB : successors(BB: OrigHeader))
684	for (BasicBlock::iterator BI = SuccBB->begin();
685	PHINode *PN = dyn_cast<PHINode>(Val&: BI); ++BI)
686	PN->addIncoming(V: PN->getIncomingValueForBlock(BB: OrigHeader), BB: OrigPreheader);
687
688	// Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
689	// OrigPreHeader's old terminator (the original branch into the loop), and
690	// remove the corresponding incoming values from the PHI nodes in OrigHeader.
691	LoopEntryBranch->eraseFromParent();
692	OrigPreheader->flushTerminatorDbgRecords();
693
694	// Update MemorySSA before the rewrite call below changes the 1:1
695	// instruction:cloned_instruction_or_value mapping.
696	if (MSSAU) {
697	InsertNewValueIntoMap(VM&: ValueMapMSSA, K: OrigHeader, V: OrigPreheader);
698	MSSAU->updateForClonedBlockIntoPred(BB: OrigHeader, P1: OrigPreheader,
699	VM: ValueMapMSSA);
700	}
701
702	SmallVector<PHINode *, `2`> InsertedPHIs;
703	// If there were any uses of instructions in the duplicated block outside the
704	// loop, update them, inserting PHI nodes as required
705	RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE,
706	InsertedPHIs: &InsertedPHIs);
707
708	// Attach debug records to the new phis if that phi uses a value that
709	// previously had debug metadata attached. This keeps the debug info
710	// up-to-date in the loop body.
711	if (!InsertedPHIs.empty())
712	insertDebugValuesForPHIs(BB: OrigHeader, InsertedPHIs);
713
714	// NewHeader is now the header of the loop.
715	L->moveToHeader(BB: NewHeader);
716	assert(L->getHeader() == NewHeader && "Latch block is our new header");
717
718	// Inform DT about changes to the CFG.
719	if (DT) {
720	// The OrigPreheader branches to the NewHeader and Exit now. Then, inform
721	// the DT about the removed edge to the OrigHeader (that got removed).
722	SmallVector<DominatorTree::UpdateType, `3`> Updates = {
723	{DominatorTree::Insert, OrigPreheader, Exit},
724	{DominatorTree::Insert, OrigPreheader, NewHeader},
725	{DominatorTree::Delete, OrigPreheader, OrigHeader}};
726
727	if (MSSAU) {
728	MSSAU->applyUpdates(Updates, DT&: DT, /UpdateDT=/UpdateDTFirst: true*);
729	if (VerifyMemorySSA)
730	MSSAU->getMemorySSA()->verifyMemorySSA();
731	} else {
732	DT->applyUpdates(Updates);
733	}
734	}
735
736	// At this point, we've finished our major CFG changes. As part of cloning
737	// the loop into the preheader we've simplified instructions and the
738	// duplicated conditional branch may now be branching on a constant. If it is
739	// branching on a constant and if that constant means that we enter the loop,
740	// then we fold away the cond branch to an uncond branch. This simplifies the
741	// loop in cases important for nested loops, and it also means we don't have
742	// to split as many edges.
743	CondBrInst *PHBI = cast<CondBrInst>(Val: OrigPreheader->getTerminator());
744	const Value *Cond = PHBI->getCondition();
745	const bool HasConditionalPreHeader =
746	!isa<ConstantInt>(Val: Cond) \|\|
747	PHBI->getSuccessor(i: cast<ConstantInt>(Val: Cond)->isZero()) != NewHeader;
748
749	updateBranchWeights(PreHeaderBI&: PHBI, LoopBI&: BI, HasConditionalPreHeader, SuccsSwapped: BISuccsSwapped);
750
751	if (HasConditionalPreHeader) {
752	// The conditional branch can't be folded, handle the general case.
753	// Split edges as necessary to preserve LoopSimplify form.
754
755	// Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
756	// thus is not a preheader anymore.
757	// Split the edge to form a real preheader.
758	BasicBlock *NewPH = SplitCriticalEdge(
759	Src: OrigPreheader, Dst: NewHeader,
760	Options: CriticalEdgeSplittingOptions (DT, LI, MSSAU).setPreserveLCSSA());
761	NewPH->setName(NewHeader->getName() + ".lr.ph");
762
763	// Preserve canonical loop form, which means that 'Exit' should have only
764	// one predecessor. Note that Exit could be an exit block for multiple
765	// nested loops, causing both of the edges to now be critical and need to
766	// be split.
767	SmallVector<BasicBlock *, `4`> ExitPreds(predecessors(BB: Exit));
768	bool SplitLatchEdge = false;
769	for (BasicBlock *ExitPred : ExitPreds) {
770	// We only need to split loop exit edges.
771	Loop *PredLoop = LI->getLoopFor(BB: ExitPred);
772	if (!PredLoop \|\| PredLoop->contains(BB: Exit) \|\|
773	isa<IndirectBrInst>(Val: ExitPred->getTerminator()))
774	continue;
775	SplitLatchEdge \|= L->getLoopLatch() == ExitPred;
776	BasicBlock *ExitSplit = SplitCriticalEdge(
777	Src: ExitPred, Dst: Exit,
778	Options: CriticalEdgeSplittingOptions (DT, LI, MSSAU).setPreserveLCSSA());
779	ExitSplit->moveBefore(MovePos: Exit);
780	}
781	assert(SplitLatchEdge &&
782	"Despite splitting all preds, failed to split latch exit?");
783	(void)SplitLatchEdge;
784	} else {
785	// We can fold the conditional branch in the preheader, this makes things
786	// simpler. The first step is to remove the extra edge to the Exit block.
787	Exit->removePredecessor(Pred: OrigPreheader, KeepOneInputPHIs: true /preserve LCSSA/);
788	UncondBrInst *NewBI = UncondBrInst::Create(IfTrue: NewHeader, InsertBefore: PHBI->getIterator());
789	NewBI->setDebugLoc(PHBI->getDebugLoc());
790	PHBI->eraseFromParent();
791
792	// With our CFG finalized, update DomTree if it is available.
793	if (DT)
794	DT->deleteEdge(From: OrigPreheader, To: Exit);
795
796	// Update MSSA too, if available.
797	if (MSSAU)
798	MSSAU->removeEdge(From: OrigPreheader, To: Exit);
799	}
800
801	assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
802	assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
803
804	if (MSSAU && VerifyMemorySSA)
805	MSSAU->getMemorySSA()->verifyMemorySSA();
806
807	// Now that the CFG and DomTree are in a consistent state again, try to merge
808	// the OrigHeader block into OrigLatch. This will succeed if they are
809	// connected by an unconditional branch. This is just a cleanup so the
810	// emitted code isn't too gross in this common case.
811	DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
812	BasicBlock *PredBB = OrigHeader->getUniquePredecessor();
813	bool DidMerge = MergeBlockIntoPredecessor(BB: OrigHeader, DTU: &DTU, LI, MSSAU);
814	if (DidMerge)
815	RemoveRedundantDbgInstrs(BB: PredBB);
816
817	if (MSSAU && VerifyMemorySSA)
818	MSSAU->getMemorySSA()->verifyMemorySSA();
819
820	LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
821
822	return true;
823	}
824
825	/// Determine whether the instructions in this range may be safely and cheaply
826	/// speculated. This is not an important enough situation to develop complex
827	/// heuristics. We handle a single arithmetic instruction along with any type
828	/// conversions.
829	static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
830	BasicBlock::iterator End, Loop *L) {
831	bool seenIncrement = false;
832	bool MultiExitLoop = false;
833
834	if (!L->getExitingBlock())
835	MultiExitLoop = true;
836
837	for (BasicBlock::iterator I = Begin; I != End; ++I) {
838
839	if (!isSafeToSpeculativelyExecute(I: &*I))
840	return false;
841
842	switch (I ->getOpcode()) {
843	default:
844	return false;
845	case Instruction::GetElementPtr:
846	// GEPs are cheap if all indices are constant.
847	if (!cast<GEPOperator>(Val&: I)->hasAllConstantIndices())
848	return false;
849	// fall-thru to increment case
850	[[fallthrough]];
851	case Instruction::Add:
852	case Instruction::Sub:
853	case Instruction::And:
854	case Instruction::Or:
855	case Instruction::Xor:
856	case Instruction::Shl:
857	case Instruction::LShr:
858	case Instruction::AShr: {
859	Value *IVOpnd =
860	!isa<Constant>(Val: I ->getOperand(i: `0`))
861	? I ->getOperand(i: `0`)
862	: !isa<Constant>(Val: I ->getOperand(i: `1`)) ? I ->getOperand(i: `1`) : nullptr;
863	if (!IVOpnd)
864	return false;
865
866	// If increment operand is used outside of the loop, this speculation
867	// could cause extra live range interference.
868	if (MultiExitLoop) {
869	for (User *UseI : IVOpnd->users()) {
870	auto *UserInst = cast<Instruction>(Val: UseI);
871	if (!L->contains(Inst: UserInst))
872	return false;
873	}
874	}
875
876	if (seenIncrement)
877	return false;
878	seenIncrement = true;
879	break;
880	}
881	case Instruction::Trunc:
882	case Instruction::ZExt:
883	case Instruction::SExt:
884	// ignore type conversions
885	break;
886	}
887	}
888	return true;
889	}
890
891	/// Fold the loop tail into the loop exit by speculating the loop tail
892	/// instructions. Typically, this is a single post-increment. In the case of a
893	/// simple 2-block loop, hoisting the increment can be much better than
894	/// duplicating the entire loop header. In the case of loops with early exits,
895	/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
896	/// canonical form so downstream passes can handle it.
897	///
898	/// I don't believe this invalidates SCEV.
899	bool LoopRotate::simplifyLoopLatch(Loop *L) {
900	BasicBlock *Latch = L->getLoopLatch();
901	if (!Latch \|\| Latch->hasAddressTaken())
902	return false;
903
904	UncondBrInst *Jmp = dyn_cast<UncondBrInst>(Val: Latch->getTerminator());
905	if (!Jmp)
906	return false;
907
908	BasicBlock *LastExit = Latch->getSinglePredecessor();
909	if (!LastExit \|\| !L->isLoopExiting(BB: LastExit))
910	return false;
911
912	if (!isa<UncondBrInst, CondBrInst>(Val: LastExit->getTerminator()))
913	return false;
914
915	if (!shouldSpeculateInstrs(Begin: Latch->begin(), End: Jmp->getIterator(), L))
916	return false;
917
918	LLVM_DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
919	<< LastExit->getName() << "\n");
920
921	DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
922	MergeBlockIntoPredecessor(BB: Latch, DTU: &DTU, LI, MSSAU, MemDep: nullptr,
923	/PredecessorWithTwoSuccessors=/true);
924
925	if (SE) {
926	// Merging blocks may remove blocks reference in the block disposition cache. Clear the cache.
927	SE->forgetBlockAndLoopDispositions();
928	}
929
930	if (MSSAU && VerifyMemorySSA)
931	MSSAU->getMemorySSA()->verifyMemorySSA();
932
933	return true;
934	}
935
936	/// Rotate \c L, and return true if any modification was made.
937	bool LoopRotate::processLoop(Loop *L) {
938	// Save the loop metadata.
939	MDNode *LoopMD = L->getLoopID();
940
941	bool SimplifiedLatch = false;
942
943	// Simplify the loop latch before attempting to rotate the header
944	// upward. Rotation may not be needed if the loop tail can be folded into the
945	// loop exit.
946	if (!RotationOnly)
947	SimplifiedLatch = simplifyLoopLatch(L);
948
949	bool MadeChange = rotateLoop(L, SimplifiedLatch);
950	assert((!MadeChange \|\| L->isLoopExiting(L->getLoopLatch())) &&
951	"Loop latch should be exiting after loop-rotate.");
952
953	// Restore the loop metadata.
954	// NB! We presume LoopRotation DOESN'T ADD its own metadata.
955	if ((MadeChange \|\| SimplifiedLatch) && LoopMD)
956	L->setLoopID(LoopMD);
957
958	return MadeChange \|\| SimplifiedLatch;
959	}
960
961
962	/// The utility to convert a loop into a loop with bottom test.
963	bool llvm::LoopRotation(Loop L, LoopInfo LI, const TargetTransformInfo *TTI,
964	AssumptionCache AC, DominatorTree DT,
965	ScalarEvolution SE, MemorySSAUpdater MSSAU,
966	const SimplifyQuery &SQ, bool RotationOnly = true,
967	unsigned Threshold = unsigned(-`1`),
968	bool IsUtilMode = true, bool PrepareForLTO) {
969	LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly,
970	IsUtilMode, PrepareForLTO);
971	return LR.processLoop(L);
972	}
973

Browse the source code of llvm_projects/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp