LoopRotationUtils.cpp source code [llvm_projects/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp]

1	//===----------------- LoopRotationUtils.cpp -----------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file provides utilities to convert a loop into a loop with bottom test.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "llvm/Transforms/Utils/LoopRotationUtils.h"
14	#include "llvm/ADT/Statistic.h"
15	#include "llvm/Analysis/AssumptionCache.h"
16	#include "llvm/Analysis/CodeMetrics.h"
17	#include "llvm/Analysis/DomTreeUpdater.h"
18	#include "llvm/Analysis/InstructionSimplify.h"
19	#include "llvm/Analysis/LoopInfo.h"
20	#include "llvm/Analysis/MemorySSA.h"
21	#include "llvm/Analysis/MemorySSAUpdater.h"
22	#include "llvm/Analysis/ScalarEvolution.h"
23	#include "llvm/Analysis/ValueTracking.h"
24	#include "llvm/IR/CFG.h"
25	#include "llvm/IR/DebugInfo.h"
26	#include "llvm/IR/Dominators.h"
27	#include "llvm/IR/IntrinsicInst.h"
28	#include "llvm/IR/MDBuilder.h"
29	#include "llvm/IR/ProfDataUtils.h"
30	#include "llvm/Support/CommandLine.h"
31	#include "llvm/Support/Debug.h"
32	#include "llvm/Support/raw_ostream.h"
33	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
34	#include "llvm/Transforms/Utils/Cloning.h"
35	#include "llvm/Transforms/Utils/Local.h"
36	#include "llvm/Transforms/Utils/SSAUpdater.h"
37	#include "llvm/Transforms/Utils/ValueMapper.h"
38	using namespace llvm;
39
40	#define DEBUG_TYPE "loop-rotate"
41
42	STATISTIC(NumNotRotatedDueToHeaderSize,
43	"Number of loops not rotated due to the header size");
44	STATISTIC(NumInstrsHoisted,
45	"Number of instructions hoisted into loop preheader");
46	STATISTIC(NumInstrsDuplicated,
47	"Number of instructions cloned into loop preheader");
48	STATISTIC(NumRotated, "Number of loops rotated");
49
50	static cl::opt<bool>
51	MultiRotate("loop-rotate-multi", cl::init(Val: false), cl::Hidden,
52	cl::desc ("Allow loop rotation multiple times in order to reach "
53	"a better latch exit"));
54
// Probability that a rotated loop has zero trip count / is never entered,
// expressed as a {zero-trip, entered} weight pair.
static constexpr uint32_t ZeroTripCountWeights[] = {1, 127};
57
58	namespace {
59	/// A simple loop rotation transformation.
60	class LoopRotate {
61	const unsigned MaxHeaderSize;
62	LoopInfo *LI;
63	const TargetTransformInfo *TTI;
64	AssumptionCache *AC;
65	DominatorTree *DT;
66	ScalarEvolution *SE;
67	MemorySSAUpdater *MSSAU;
68	const SimplifyQuery &SQ;
69	bool RotationOnly;
70	bool IsUtilMode;
71	bool PrepareForLTO;
72
73	public:
74	LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
75	const TargetTransformInfo TTI, AssumptionCache AC,
76	DominatorTree DT, ScalarEvolution SE, MemorySSAUpdater *MSSAU,
77	const SimplifyQuery &SQ, bool RotationOnly, bool IsUtilMode,
78	bool PrepareForLTO)
79	: MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
80	MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly),
81	IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO) {}
82	bool processLoop(Loop *L);
83
84	private:
85	bool rotateLoop(Loop L, bool* SimplifiedLatch);
86	bool simplifyLoopLatch(Loop *L);
87	};
88	} // end anonymous namespace
89
90	/// Insert (K, V) pair into the ValueToValueMap, and verify the key did not
91	/// previously exist in the map, and the value was inserted.
92	static void InsertNewValueIntoMap(ValueToValueMapTy &VM, Value K, Value V) {
93	bool Inserted = VM.insert(KV: {K, V}).second;
94	assert(Inserted);
95	(void)Inserted;
96	}
97	/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
98	/// old header into the preheader. If there were uses of the values produced by
99	/// these instruction that were outside of the loop, we have to insert PHI nodes
100	/// to merge the two values. Do this now.
101	static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
102	BasicBlock *OrigPreheader,
103	ValueToValueMapTy &ValueMap,
104	ScalarEvolution *SE,
105	SmallVectorImpl<PHINode> InsertedPHIs) {
106	// Remove PHI node entries that are no longer live.
107	BasicBlock::iterator I, E = OrigHeader->end();
108	for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(Val&: I); ++I)
109	PN->removeIncomingValue(Idx: PN->getBasicBlockIndex(BB: OrigPreheader));
110
111	// Now fix up users of the instructions in OrigHeader, inserting PHI nodes
112	// as necessary.
113	SSAUpdater SSA(InsertedPHIs);
114	for (I = OrigHeader->begin(); I != E; ++I) {
115	Value OrigHeaderVal = &I;
116
117	// If there are no uses of the value (e.g. because it returns void), there
118	// is nothing to rewrite.
119	if (OrigHeaderVal->use_empty())
120	continue;
121
122	Value *OrigPreHeaderVal = ValueMap.lookup(Val: OrigHeaderVal);
123
124	// The value now exits in two versions: the initial value in the preheader
125	// and the loop "next" value in the original header.
126	SSA.Initialize(Ty: OrigHeaderVal->getType(), Name: OrigHeaderVal->getName());
127	// Force re-computation of OrigHeaderVal, as some users now need to use the
128	// new PHI node.
129	if (SE)
130	SE->forgetValue(V: OrigHeaderVal);
131	SSA.AddAvailableValue(BB: OrigHeader, V: OrigHeaderVal);
132	SSA.AddAvailableValue(BB: OrigPreheader, V: OrigPreHeaderVal);
133
134	// Visit each use of the OrigHeader instruction.
135	for (Use &U : llvm::make_early_inc_range(Range: OrigHeaderVal->uses())) {
136	// SSAUpdater can't handle a non-PHI use in the same block as an
137	// earlier def. We can easily handle those cases manually.
138	Instruction *UserInst = cast<Instruction>(Val: U.getUser());
139	if (!isa<PHINode>(Val: UserInst)) {
140	BasicBlock *UserBB = UserInst->getParent();
141
142	// The original users in the OrigHeader are already using the
143	// original definitions.
144	if (UserBB == OrigHeader)
145	continue;
146
147	// Users in the OrigPreHeader need to use the value to which the
148	// original definitions are mapped.
149	if (UserBB == OrigPreheader) {
150	U = OrigPreHeaderVal;
151	continue;
152	}
153	}
154
155	// Anything else can be handled by SSAUpdater.
156	SSA.RewriteUse(U);
157	}
158
159	// Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
160	// intrinsics.
161	SmallVector<DbgValueInst *, `1`> DbgValues;
162	SmallVector<DbgVariableRecord *, `1`> DbgVariableRecords;
163	llvm::findDbgValues(DbgValues, V: OrigHeaderVal, DbgVariableRecords: &DbgVariableRecords);
164	for (auto &DbgValue : DbgValues) {
165	// The original users in the OrigHeader are already using the original
166	// definitions.
167	BasicBlock *UserBB = DbgValue->getParent();
168	if (UserBB == OrigHeader)
169	continue;
170
171	// Users in the OrigPreHeader need to use the value to which the
172	// original definitions are mapped and anything else can be handled by
173	// the SSAUpdater. To avoid adding PHINodes, check if the value is
174	// available in UserBB, if not substitute poison.
175	Value *NewVal;
176	if (UserBB == OrigPreheader)
177	NewVal = OrigPreHeaderVal;
178	else if (SSA.HasValueForBlock(BB: UserBB))
179	NewVal = SSA.GetValueInMiddleOfBlock(BB: UserBB);
180	else
181	NewVal = PoisonValue::get(T: OrigHeaderVal->getType());
182	DbgValue->replaceVariableLocationOp(OldValue: OrigHeaderVal, NewValue: NewVal);
183	}
184
185	// RemoveDIs: duplicate implementation for non-instruction debug-info
186	// storage in DbgVariableRecords.
187	for (DbgVariableRecord *DVR : DbgVariableRecords) {
188	// The original users in the OrigHeader are already using the original
189	// definitions.
190	BasicBlock *UserBB = DVR->getMarker()->getParent();
191	if (UserBB == OrigHeader)
192	continue;
193
194	// Users in the OrigPreHeader need to use the value to which the
195	// original definitions are mapped and anything else can be handled by
196	// the SSAUpdater. To avoid adding PHINodes, check if the value is
197	// available in UserBB, if not substitute poison.
198	Value *NewVal;
199	if (UserBB == OrigPreheader)
200	NewVal = OrigPreHeaderVal;
201	else if (SSA.HasValueForBlock(BB: UserBB))
202	NewVal = SSA.GetValueInMiddleOfBlock(BB: UserBB);
203	else
204	NewVal = PoisonValue::get(T: OrigHeaderVal->getType());
205	DVR->replaceVariableLocationOp(OldValue: OrigHeaderVal, NewValue: NewVal);
206	}
207	}
208	}
209
210	// Assuming both header and latch are exiting, look for a phi which is only
211	// used outside the loop (via a LCSSA phi) in the exit from the header.
212	// This means that rotating the loop can remove the phi.
213	static bool profitableToRotateLoopExitingLatch(Loop *L) {
214	BasicBlock *Header = L->getHeader();
215	BranchInst *BI = dyn_cast<BranchInst>(Val: Header->getTerminator());
216	assert(BI && BI->isConditional() && "need header with conditional exit");
217	BasicBlock *HeaderExit = BI->getSuccessor(i: `0`);
218	if (L->contains(BB: HeaderExit))
219	HeaderExit = BI->getSuccessor(i: `1`);
220
221	for (auto &Phi : Header->phis()) {
222	// Look for uses of this phi in the loop/via exits other than the header.
223	if (llvm::any_of(Range: Phi.users(), P: [HeaderExit](const User *U) {
224	return cast<Instruction>(Val: U)->getParent() != HeaderExit;
225	}))
226	continue;
227	return true;
228	}
229	return false;
230	}
231
232	// Check that latch exit is deoptimizing (which means - very unlikely to happen)
233	// and there is another exit from the loop which is non-deoptimizing.
234	// If we rotate latch to that exit our loop has a better chance of being fully
235	// canonical.
236	//
237	// It can give false positives in some rare cases.
238	static bool canRotateDeoptimizingLatchExit(Loop *L) {
239	BasicBlock *Latch = L->getLoopLatch();
240	assert(Latch && "need latch");
241	BranchInst *BI = dyn_cast<BranchInst>(Val: Latch->getTerminator());
242	// Need normal exiting latch.
243	if (!BI \|\| !BI->isConditional())
244	return false;
245
246	BasicBlock *Exit = BI->getSuccessor(i: `1`);
247	if (L->contains(BB: Exit))
248	Exit = BI->getSuccessor(i: `0`);
249
250	// Latch exit is non-deoptimizing, no need to rotate.
251	if (!Exit->getPostdominatingDeoptimizeCall())
252	return false;
253
254	SmallVector<BasicBlock *, `4`> Exits;
255	L->getUniqueExitBlocks(ExitBlocks&: Exits);
256	if (!Exits.empty()) {
257	// There is at least one non-deoptimizing exit.
258	//
259	// Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact,
260	// as it can conservatively return false for deoptimizing exits with
261	// complex enough control flow down to deoptimize call.
262	//
263	// That means here we can report success for a case where
264	// all exits are deoptimizing but one of them has complex enough
265	// control flow (e.g. with loops).
266	//
267	// That should be a very rare case and false positives for this function
268	// have compile-time effect only.
269	return any_of(Range&: Exits, P: [](const BasicBlock *BB) {
270	return !BB->getPostdominatingDeoptimizeCall();
271	});
272	}
273	return false;
274	}
275
276	static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI,
277	bool HasConditionalPreHeader,
278	bool SuccsSwapped) {
279	MDNode *WeightMD = getBranchWeightMDNode(I: PreHeaderBI);
280	if (WeightMD == nullptr)
281	return;
282
283	// LoopBI should currently be a clone of PreHeaderBI with the same
284	// metadata. But we double check to make sure we don't have a degenerate case
285	// where instsimplify changed the instructions.
286	if (WeightMD != getBranchWeightMDNode(I: LoopBI))
287	return;
288
289	SmallVector<uint32_t, `2`> Weights;
290	extractFromBranchWeightMD32(ProfileData: WeightMD, Weights);
291	if (Weights.size() != `2`)
292	return;
293	uint32_t OrigLoopExitWeight = Weights [`0`];
294	uint32_t OrigLoopBackedgeWeight = Weights [`1`];
295
296	if (SuccsSwapped)
297	std::swap(a&: OrigLoopExitWeight, b&: OrigLoopBackedgeWeight);
298
299	// Update branch weights. Consider the following edge-counts:
300	//
301	// \| \|-------- \|
302	// V V \| V
303	// Br i1 ... \| Br i1 ...
304	// \| \| \| \| \|
305	// x\| y\| \| becomes: \| y0\| \|-----
306	// V V \| \| V V \|
307	// Exit Loop \| \| Loop \|
308	// \| \| \| Br i1 ... \|
309	// ----- \| \| \| \|
310	// x0\| x1\| y1 \| \|
311	// V V ----
312	// Exit
313	//
314	// The following must hold:
315	// - x == x0 + x1 # counts to "exit" must stay the same.
316	// - y0 == x - x0 == x1 # how often loop was entered at all.
317	// - y1 == y - y0 # How often loop was repeated (after first iter.).
318	//
319	// We cannot generally deduce how often we had a zero-trip count loop so we
320	// have to make a guess for how to distribute x among the new x0 and x1.
321
322	uint32_t ExitWeight0; // aka x0
323	uint32_t ExitWeight1; // aka x1
324	uint32_t EnterWeight; // aka y0
325	uint32_t LoopBackWeight; // aka y1
326	if (OrigLoopExitWeight > `0` && OrigLoopBackedgeWeight > `0`) {
327	ExitWeight0 = `0`;
328	if (HasConditionalPreHeader) {
329	// Here we cannot know how many 0-trip count loops we have, so we guess:
330	if (OrigLoopBackedgeWeight >= OrigLoopExitWeight) {
331	// If the loop count is bigger than the exit count then we set
332	// probabilities as if 0-trip count nearly never happens.
333	ExitWeight0 = ZeroTripCountWeights[`0`];
334	// Scale up counts if necessary so we can match `ZeroTripCountWeights`
335	// for the `ExitWeight0`:`ExitWeight1` (aka `x0`:`x1` ratio`) ratio.
336	while (OrigLoopExitWeight < ZeroTripCountWeights[`1`] + ExitWeight0) {
337	// ... but don't overflow.
338	uint32_t const HighBit = uint32_t{`1`} << (sizeof(uint32_t) * `8` - `1`);
339	if ((OrigLoopBackedgeWeight & HighBit) != `0` \|\|
340	(OrigLoopExitWeight & HighBit) != `0`)
341	break;
342	OrigLoopBackedgeWeight <<= `1`;
343	OrigLoopExitWeight <<= `1`;
344	}
345	} else {
346	// If there's a higher exit-count than backedge-count then we set
347	// probabilities as if there are only 0-trip and 1-trip cases.
348	ExitWeight0 = OrigLoopExitWeight - OrigLoopBackedgeWeight;
349	}
350	} else {
351	// Theoretically, if the loop body must be executed at least once, the
352	// backedge count must be not less than exit count. However the branch
353	// weight collected by sampling-based PGO may be not very accurate due to
354	// sampling. Therefore this workaround is required here to avoid underflow
355	// of unsigned in following update of branch weight.
356	if (OrigLoopExitWeight > OrigLoopBackedgeWeight)
357	OrigLoopBackedgeWeight = OrigLoopExitWeight;
358	}
359	assert(OrigLoopExitWeight >= ExitWeight0 && "Bad branch weight");
360	ExitWeight1 = OrigLoopExitWeight - ExitWeight0;
361	EnterWeight = ExitWeight1;
362	assert(OrigLoopBackedgeWeight >= EnterWeight && "Bad branch weight");
363	LoopBackWeight = OrigLoopBackedgeWeight - EnterWeight;
364	} else if (OrigLoopExitWeight == `0`) {
365	if (OrigLoopBackedgeWeight == `0`) {
366	// degenerate case... keep everything zero...
367	ExitWeight0 = `0`;
368	ExitWeight1 = `0`;
369	EnterWeight = `0`;
370	LoopBackWeight = `0`;
371	} else {
372	// Special case "LoopExitWeight == 0" weights which behaves like an
373	// endless where we don't want loop-enttry (y0) to be the same as
374	// loop-exit (x1).
375	ExitWeight0 = `0`;
376	ExitWeight1 = `0`;
377	EnterWeight = `1`;
378	LoopBackWeight = OrigLoopBackedgeWeight;
379	}
380	} else {
381	// loop is never entered.
382	assert(OrigLoopBackedgeWeight == `0` && "remaining case is backedge zero");
383	ExitWeight0 = `1`;
384	ExitWeight1 = `1`;
385	EnterWeight = `0`;
386	LoopBackWeight = `0`;
387	}
388
389	const uint32_t LoopBIWeights[] = {
390	SuccsSwapped ? LoopBackWeight : ExitWeight1,
391	SuccsSwapped ? ExitWeight1 : LoopBackWeight,
392	};
393	setBranchWeights(I&: LoopBI, Weights: LoopBIWeights, /IsExpected=/false);
394	if (HasConditionalPreHeader) {
395	const uint32_t PreHeaderBIWeights[] = {
396	SuccsSwapped ? EnterWeight : ExitWeight0,
397	SuccsSwapped ? ExitWeight0 : EnterWeight,
398	};
399	setBranchWeights(I&: PreHeaderBI, Weights: PreHeaderBIWeights, /IsExpected=/false);
400	}
401	}
402
403	/// Rotate loop LP. Return true if the loop is rotated.
404	///
405	/// \param SimplifiedLatch is true if the latch was just folded into the final
406	/// loop exit. In this case we may want to rotate even though the new latch is
407	/// now an exiting branch. This rotation would have happened had the latch not
408	/// been simplified. However, if SimplifiedLatch is false, then we avoid
409	/// rotating loops in which the latch exits to avoid excessive or endless
410	/// rotation. LoopRotate should be repeatable and converge to a canonical
411	/// form. This property is satisfied because simplifying the loop latch can only
412	/// happen once across multiple invocations of the LoopRotate pass.
413	///
414	/// If -loop-rotate-multi is enabled we can do multiple rotations in one go
415	/// so to reach a suitable (non-deoptimizing) exit.
416	bool LoopRotate::rotateLoop(Loop L, bool* SimplifiedLatch) {
417	// If the loop has only one block then there is not much to rotate.
418	if (L->getBlocks().size() == `1`)
419	return false;
420
421	bool Rotated = false;
422	do {
423	BasicBlock *OrigHeader = L->getHeader();
424	BasicBlock *OrigLatch = L->getLoopLatch();
425
426	BranchInst *BI = dyn_cast<BranchInst>(Val: OrigHeader->getTerminator());
427	if (!BI \|\| BI->isUnconditional())
428	return Rotated;
429
430	// If the loop header is not one of the loop exiting blocks then
431	// either this loop is already rotated or it is not
432	// suitable for loop rotation transformations.
433	if (!L->isLoopExiting(BB: OrigHeader))
434	return Rotated;
435
436	// If the loop latch already contains a branch that leaves the loop then the
437	// loop is already rotated.
438	if (!OrigLatch)
439	return Rotated;
440
441	// Rotate if either the loop latch does not* exit the loop, or if the loop*
442	// latch was just simplified. Or if we think it will be profitable.
443	if (L->isLoopExiting(BB: OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
444	!profitableToRotateLoopExitingLatch(L) &&
445	!canRotateDeoptimizingLatchExit(L))
446	return Rotated;
447
448	// Check size of original header and reject loop if it is very big or we can't
449	// duplicate blocks inside it.
450	{
451	SmallPtrSet<const Value *, `32`> EphValues;
452	CodeMetrics::collectEphemeralValues(L, AC, EphValues);
453
454	CodeMetrics Metrics;
455	Metrics.analyzeBasicBlock(BB: OrigHeader, TTI: *TTI, EphValues, PrepareForLTO);
456	if (Metrics.notDuplicatable) {
457	LLVM_DEBUG(
458	dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
459	<< " instructions: ";
460	L->dump());
461	return Rotated;
462	}
463	if (Metrics.Convergence != ConvergenceKind::None) {
464	LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
465	"instructions: ";
466	L->dump());
467	return Rotated;
468	}
469	if (!Metrics.NumInsts.isValid()) {
470	LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains instructions"
471	" with invalid cost: ";
472	L->dump());
473	return Rotated;
474	}
475	if (Metrics.NumInsts > MaxHeaderSize) {
476	LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains "
477	<< Metrics.NumInsts
478	<< " instructions, which is more than the threshold ("
479	<< MaxHeaderSize << " instructions): ";
480	L->dump());
481	++NumNotRotatedDueToHeaderSize;
482	return Rotated;
483	}
484
485	// When preparing for LTO, avoid rotating loops with calls that could be
486	// inlined during the LTO stage.
487	if (PrepareForLTO && Metrics.NumInlineCandidates > `0`)
488	return Rotated;
489	}
490
491	// Now, this loop is suitable for rotation.
492	BasicBlock *OrigPreheader = L->getLoopPreheader();
493
494	// If the loop could not be converted to canonical form, it must have an
495	// indirectbr in it, just give up.
496	if (!OrigPreheader \|\| !L->hasDedicatedExits())
497	return Rotated;
498
499	// Anything ScalarEvolution may know about this loop or the PHI nodes
500	// in its header will soon be invalidated. We should also invalidate
501	// all outer loops because insertion and deletion of blocks that happens
502	// during the rotation may violate invariants related to backedge taken
503	// infos in them.
504	if (SE) {
505	SE->forgetTopmostLoop(L);
506	// We may hoist some instructions out of loop. In case if they were cached
507	// as "loop variant" or "loop computable", these caches must be dropped.
508	// We also may fold basic blocks, so cached block dispositions also need
509	// to be dropped.
510	SE->forgetBlockAndLoopDispositions();
511	}
512
513	LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
514	if (MSSAU && VerifyMemorySSA)
515	MSSAU->getMemorySSA()->verifyMemorySSA();
516
517	// Find new Loop header. NewHeader is a Header's one and only successor
518	// that is inside loop. Header's other successor is outside the
519	// loop. Otherwise loop is not suitable for rotation.
520	BasicBlock *Exit = BI->getSuccessor(i: `0`);
521	BasicBlock *NewHeader = BI->getSuccessor(i: `1`);
522	bool BISuccsSwapped = L->contains(BB: Exit);
523	if (BISuccsSwapped)
524	std::swap(a&: Exit, b&: NewHeader);
525	assert(NewHeader && "Unable to determine new loop header");
526	assert(L->contains(NewHeader) && !L->contains(Exit) &&
527	"Unable to determine loop header and exit blocks");
528
529	// This code assumes that the new header has exactly one predecessor.
530	// Remove any single-entry PHI nodes in it.
531	assert(NewHeader->getSinglePredecessor() &&
532	"New header doesn't have one pred!");
533	FoldSingleEntryPHINodes(BB: NewHeader);
534
535	// Begin by walking OrigHeader and populating ValueMap with an entry for
536	// each Instruction.
537	BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
538	ValueToValueMapTy ValueMap, ValueMapMSSA;
539
540	// For PHI nodes, the value available in OldPreHeader is just the
541	// incoming value from OldPreHeader.
542	for (; PHINode *PN = dyn_cast<PHINode>(Val&: I); ++I)
543	InsertNewValueIntoMap(VM&: ValueMap, K: PN,
544	V: PN->getIncomingValueForBlock(BB: OrigPreheader));
545
546	// For the rest of the instructions, either hoist to the OrigPreheader if
547	// possible or create a clone in the OldPreHeader if not.
548	Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
549
550	// Record all debug records preceding LoopEntryBranch to avoid
551	// duplication.
552	using DbgHash =
553	std::pair<std::pair<hash_code, DILocalVariable >, DIExpression >;
554	auto makeHash = [](const DbgVariableRecord *D) -> DbgHash {
555	auto VarLocOps = D->location_ops();
556	return {{hash_combine_range(R&: VarLocOps), D->getVariable()},
557	D->getExpression()};
558	};
559
560	SmallDenseSet<DbgHash, `8`> DbgRecords;
561	// Build DbgVariableRecord hashes for DbgVariableRecords attached to the
562	// terminator.
563	for (const DbgVariableRecord &DVR :
564	filterDbgVars(R: OrigPreheader->getTerminator()->getDbgRecordRange()))
565	DbgRecords.insert(V: makeHash (&DVR));
566
567	// Remember the local noalias scope declarations in the header. After the
568	// rotation, they must be duplicated and the scope must be cloned. This
569	// avoids unwanted interaction across iterations.
570	SmallVector<NoAliasScopeDeclInst *, `6`> NoAliasDeclInstructions;
571	for (Instruction &I : *OrigHeader)
572	if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(Val: &I))
573	NoAliasDeclInstructions.push_back(Elt: Decl);
574
575	Module *M = OrigHeader->getModule();
576
577	// Track the next DbgRecord to clone. If we have a sequence where an
578	// instruction is hoisted instead of being cloned:
579	// DbgRecord blah
580	// %foo = add i32 0, 0
581	// DbgRecord xyzzy
582	// %bar = call i32 @foobar()
583	// where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once
584	// attached to %foo, then when %foo his hoisted it will "fall down" onto the
585	// function call:
586	// DbgRecord blah
587	// DbgRecord xyzzy
588	// %bar = call i32 @foobar()
589	// causing it to appear attached to the call too.
590	//
591	// To avoid this, cloneDebugInfoFrom takes an optional "start cloning from
592	// here" position to account for this behaviour. We point it at any
593	// DbgRecords on the next instruction, here labelled xyzzy, before we hoist
594	// %foo. Later, we only only clone DbgRecords from that position (xyzzy)
595	// onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as
596	// a range because it gives us a natural way of testing whether
597	// there were DbgRecords on the next instruction before we hoisted things).
598	iterator_range<DbgRecord::self_iterator> NextDbgInsts =
599	(I != E) ? I ->getDbgRecordRange() : DbgMarker::getEmptyDbgRecordRange();
600
601	while (I != E) {
602	Instruction Inst = &I ++;
603
604	// If the instruction's operands are invariant and it doesn't read or write
605	// memory, then it is safe to hoist. Doing this doesn't change the order of
606	// execution in the preheader, but does prevent the instruction from
607	// executing in each iteration of the loop. This means it is safe to hoist
608	// something that might trap, but isn't safe to hoist something that reads
609	// memory (without proving that the loop doesn't write).
610	if (L->hasLoopInvariantOperands(I: Inst) && !Inst->mayReadFromMemory() &&
611	!Inst->mayWriteToMemory() && !Inst->isTerminator() &&
612	!isa<AllocaInst>(Val: Inst) &&
613	// It is not safe to hoist the value of these instructions in
614	// coroutines, as the addresses of otherwise eligible variables (e.g.
615	// thread-local variables and errno) may change if the coroutine is
616	// resumed in a different thread.Therefore, we disable this
617	// optimization for correctness. However, this may block other correct
618	// optimizations.
619	// FIXME: This should be reverted once we have a better model for
620	// memory access in coroutines.
621	!Inst->getFunction()->isPresplitCoroutine()) {
622
623	if (!NextDbgInsts.empty()) {
624	auto DbgValueRange =
625	LoopEntryBranch->cloneDebugInfoFrom(From: Inst, FromHere: NextDbgInsts.begin());
626	RemapDbgRecordRange(M, Range: DbgValueRange, VM&: ValueMap,
627	Flags: RF_NoModuleLevelChanges \| RF_IgnoreMissingLocals);
628	// Erase anything we've seen before.
629	for (DbgVariableRecord &DVR :
630	make_early_inc_range(Range: filterDbgVars(R: DbgValueRange)))
631	if (DbgRecords.count(V: makeHash (&DVR)))
632	DVR.eraseFromParent();
633	}
634
635	NextDbgInsts = I ->getDbgRecordRange();
636
637	Inst->moveBefore(InsertPos: LoopEntryBranch->getIterator());
638
639	++NumInstrsHoisted;
640	continue;
641	}
642
643	// Otherwise, create a duplicate of the instruction.
644	Instruction *C = Inst->clone();
645	if (const DebugLoc &DL = C->getDebugLoc())
646	mapAtomInstance(DL, VMap&: ValueMap);
647
648	C->insertBefore(InsertPos: LoopEntryBranch->getIterator());
649
650	++NumInstrsDuplicated;
651
652	if (!NextDbgInsts.empty()) {
653	auto Range = C->cloneDebugInfoFrom(From: Inst, FromHere: NextDbgInsts.begin());
654	RemapDbgRecordRange(M, Range, VM&: ValueMap,
655	Flags: RF_NoModuleLevelChanges \| RF_IgnoreMissingLocals);
656	NextDbgInsts = DbgMarker::getEmptyDbgRecordRange();
657	// Erase anything we've seen before.
658	for (DbgVariableRecord &DVR :
659	make_early_inc_range(Range: filterDbgVars(R: Range)))
660	if (DbgRecords.count(V: makeHash (&DVR)))
661	DVR.eraseFromParent();
662	}
663
664	// Eagerly remap the operands of the instruction.
665	RemapInstruction(I: C, VM&: ValueMap,
666	Flags: RF_NoModuleLevelChanges \| RF_IgnoreMissingLocals);
667
668	// With the operands remapped, see if the instruction constant folds or is
669	// otherwise simplifyable. This commonly occurs because the entry from PHI
670	// nodes allows icmps and other instructions to fold.
671	Value *V = simplifyInstruction(I: C, Q: SQ);
672	if (V && LI->replacementPreservesLCSSAForm(From: C, To: V)) {
673	// If so, then delete the temporary instruction and stick the folded value
674	// in the map.
675	InsertNewValueIntoMap(VM&: ValueMap, K: Inst, V);
676	if (!C->mayHaveSideEffects()) {
677	C->eraseFromParent();
678	C = nullptr;
679	}
680	} else {
681	InsertNewValueIntoMap(VM&: ValueMap, K: Inst, V: C);
682	}
683	if (C) {
684	// Otherwise, stick the new instruction into the new block!
685	C->setName(Inst->getName());
686
687	if (auto *II = dyn_cast<AssumeInst>(Val: C))
688	AC->registerAssumption(CI: II);
689	// MemorySSA cares whether the cloned instruction was inserted or not, and
690	// not whether it can be remapped to a simplified value.
691	if (MSSAU)
692	InsertNewValueIntoMap(VM&: ValueMapMSSA, K: Inst, V: C);
693	}
694	}
695
696	if (!NoAliasDeclInstructions.empty()) {
697	// There are noalias scope declarations:
698	// (general):
699	// Original: OrigPre { OrigHeader NewHeader ... Latch }
700	// after: (OrigPre+OrigHeader') { NewHeader ... Latch OrigHeader }
701	//
702	// with D: llvm.experimental.noalias.scope.decl,
703	// U: !noalias or !alias.scope depending on D
704	// ... { D U1 U2 } can transform into:
705	// (0) : ... { D U1 U2 } // no relevant rotation for this part
706	// (1) : ... D' { U1 U2 D } // D is part of OrigHeader
707	// (2) : ... D' U1' { U2 D U1 } // D, U1 are part of OrigHeader
708	//
709	// We now want to transform:
710	// (1) -> : ... D' { D U1 U2 D'' }
711	// (2) -> : ... D' U1' { D U2 D'' U1'' }
712	// D: original llvm.experimental.noalias.scope.decl
713	// D', U1': duplicate with replaced scopes
714	// D'', U1'': different duplicate with replaced scopes
715	// This ensures a safe fallback to 'may_alias' introduced by the rotate,
716	// as U1'' and U1' scopes will not be compatible wrt to the local restrict
717
718	// Clone the llvm.experimental.noalias.decl again for the NewHeader.
719	BasicBlock::iterator NewHeaderInsertionPoint =
720	NewHeader->getFirstNonPHIIt();
721	for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions) {
722	LLVM_DEBUG(dbgs() << " Cloning llvm.experimental.noalias.scope.decl:"
723	<< *NAD << "\n");
724	Instruction *NewNAD = NAD->clone();
725	NewNAD->insertBefore(BB&: *NewHeader, InsertPos: NewHeaderInsertionPoint);
726	}
727
728	// Scopes must now be duplicated, once for OrigHeader and once for
729	// OrigPreHeader'.
730	{
731	auto &Context = NewHeader->getContext();
732
733	SmallVector<MDNode *, `8`> NoAliasDeclScopes;
734	for (NoAliasScopeDeclInst *NAD : NoAliasDeclInstructions)
735	NoAliasDeclScopes.push_back(Elt: NAD->getScopeList());
736
737	LLVM_DEBUG(dbgs() << " Updating OrigHeader scopes\n");
738	cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, NewBlocks: {OrigHeader}, Context,
739	Ext: "h.rot");
740	LLVM_DEBUG(OrigHeader->dump());
741
742	// Keep the compile time impact low by only adapting the inserted block
743	// of instructions in the OrigPreHeader. This might result in slightly
744	// more aliasing between these instructions and those that were already
745	// present, but it will be much faster when the original PreHeader is
746	// large.
747	LLVM_DEBUG(dbgs() << " Updating part of OrigPreheader scopes\n");
748	auto *FirstDecl =
749	cast<Instruction>(Val&: ValueMap [*NoAliasDeclInstructions.begin()]);
750	auto *LastInst = &OrigPreheader->back();
751	cloneAndAdaptNoAliasScopes(NoAliasDeclScopes, IStart: FirstDecl, IEnd: LastInst,
752	Context, Ext: "pre.rot");
753	LLVM_DEBUG(OrigPreheader->dump());
754
755	LLVM_DEBUG(dbgs() << " Updated NewHeader:\n");
756	LLVM_DEBUG(NewHeader->dump());
757	}
758	}
759
760	// Along with all the other instructions, we just cloned OrigHeader's
761	// terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
762	// successors by duplicating their incoming values for OrigHeader.
763	for (BasicBlock *SuccBB : successors(BB: OrigHeader))
764	for (BasicBlock::iterator BI = SuccBB->begin();
765	PHINode *PN = dyn_cast<PHINode>(Val&: BI); ++BI)
766	PN->addIncoming(V: PN->getIncomingValueForBlock(BB: OrigHeader), BB: OrigPreheader);
767
768	// Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
769	// OrigPreHeader's old terminator (the original branch into the loop), and
770	// remove the corresponding incoming values from the PHI nodes in OrigHeader.
771	LoopEntryBranch->eraseFromParent();
772	OrigPreheader->flushTerminatorDbgRecords();
773
774	// Update MemorySSA before the rewrite call below changes the 1:1
775	// instruction:cloned_instruction_or_value mapping.
776	if (MSSAU) {
777	InsertNewValueIntoMap(VM&: ValueMapMSSA, K: OrigHeader, V: OrigPreheader);
778	MSSAU->updateForClonedBlockIntoPred(BB: OrigHeader, P1: OrigPreheader,
779	VM: ValueMapMSSA);
780	}
781
782	SmallVector<PHINode*, `2`> InsertedPHIs;
783	// If there were any uses of instructions in the duplicated block outside the
784	// loop, update them, inserting PHI nodes as required
785	RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, SE,
786	InsertedPHIs: &InsertedPHIs);
787
788	// Attach debug records to the new phis if that phi uses a value that
789	// previously had debug metadata attached. This keeps the debug info
790	// up-to-date in the loop body.
791	if (!InsertedPHIs.empty())
792	insertDebugValuesForPHIs(BB: OrigHeader, InsertedPHIs);
793
794	// NewHeader is now the header of the loop.
795	L->moveToHeader(BB: NewHeader);
796	assert(L->getHeader() == NewHeader && "Latch block is our new header");
797
798	// Inform DT about changes to the CFG.
799	if (DT) {
800	// The OrigPreheader branches to the NewHeader and Exit now. Then, inform
801	// the DT about the removed edge to the OrigHeader (that got removed).
802	SmallVector<DominatorTree::UpdateType, `3`> Updates = {
803	{DominatorTree::Insert, OrigPreheader, Exit},
804	{DominatorTree::Insert, OrigPreheader, NewHeader},
805	{DominatorTree::Delete, OrigPreheader, OrigHeader}};
806
807	if (MSSAU) {
808	MSSAU->applyUpdates(Updates, DT&: DT, /UpdateDT=/UpdateDTFirst: true*);
809	if (VerifyMemorySSA)
810	MSSAU->getMemorySSA()->verifyMemorySSA();
811	} else {
812	DT->applyUpdates(Updates);
813	}
814	}
815
816	// At this point, we've finished our major CFG changes. As part of cloning
817	// the loop into the preheader we've simplified instructions and the
818	// duplicated conditional branch may now be branching on a constant. If it is
819	// branching on a constant and if that constant means that we enter the loop,
820	// then we fold away the cond branch to an uncond branch. This simplifies the
821	// loop in cases important for nested loops, and it also means we don't have
822	// to split as many edges.
823	BranchInst *PHBI = cast<BranchInst>(Val: OrigPreheader->getTerminator());
824	assert(PHBI->isConditional() && "Should be clone of BI condbr!");
825	const Value *Cond = PHBI->getCondition();
826	const bool HasConditionalPreHeader =
827	!isa<ConstantInt>(Val: Cond) \|\|
828	PHBI->getSuccessor(i: cast<ConstantInt>(Val: Cond)->isZero()) != NewHeader;
829
830	updateBranchWeights(PreHeaderBI&: PHBI, LoopBI&: BI, HasConditionalPreHeader, SuccsSwapped: BISuccsSwapped);
831
832	if (HasConditionalPreHeader) {
833	// The conditional branch can't be folded, handle the general case.
834	// Split edges as necessary to preserve LoopSimplify form.
835
836	// Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
837	// thus is not a preheader anymore.
838	// Split the edge to form a real preheader.
839	BasicBlock *NewPH = SplitCriticalEdge(
840	Src: OrigPreheader, Dst: NewHeader,
841	Options: CriticalEdgeSplittingOptions (DT, LI, MSSAU).setPreserveLCSSA());
842	NewPH->setName(NewHeader->getName() + ".lr.ph");
843
844	// Preserve canonical loop form, which means that 'Exit' should have only
845	// one predecessor. Note that Exit could be an exit block for multiple
846	// nested loops, causing both of the edges to now be critical and need to
847	// be split.
848	SmallVector<BasicBlock *, `4`> ExitPreds(predecessors(BB: Exit));
849	bool SplitLatchEdge = false;
850	for (BasicBlock *ExitPred : ExitPreds) {
851	// We only need to split loop exit edges.
852	Loop *PredLoop = LI->getLoopFor(BB: ExitPred);
853	if (!PredLoop \|\| PredLoop->contains(BB: Exit) \|\|
854	isa<IndirectBrInst>(Val: ExitPred->getTerminator()))
855	continue;
856	SplitLatchEdge \|= L->getLoopLatch() == ExitPred;
857	BasicBlock *ExitSplit = SplitCriticalEdge(
858	Src: ExitPred, Dst: Exit,
859	Options: CriticalEdgeSplittingOptions (DT, LI, MSSAU).setPreserveLCSSA());
860	ExitSplit->moveBefore(MovePos: Exit);
861	}
862	assert(SplitLatchEdge &&
863	"Despite splitting all preds, failed to split latch exit?");
864	(void)SplitLatchEdge;
865	} else {
866	// We can fold the conditional branch in the preheader, this makes things
867	// simpler. The first step is to remove the extra edge to the Exit block.
868	Exit->removePredecessor(Pred: OrigPreheader, KeepOneInputPHIs: true /preserve LCSSA/);
869	BranchInst *NewBI = BranchInst::Create(IfTrue: NewHeader, InsertBefore: PHBI->getIterator());
870	NewBI->setDebugLoc(PHBI->getDebugLoc());
871	PHBI->eraseFromParent();
872
873	// With our CFG finalized, update DomTree if it is available.
874	if (DT) DT->deleteEdge(From: OrigPreheader, To: Exit);
875
876	// Update MSSA too, if available.
877	if (MSSAU)
878	MSSAU->removeEdge(From: OrigPreheader, To: Exit);
879	}
880
881	assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
882	assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
883
884	if (MSSAU && VerifyMemorySSA)
885	MSSAU->getMemorySSA()->verifyMemorySSA();
886
887	// Now that the CFG and DomTree are in a consistent state again, try to merge
888	// the OrigHeader block into OrigLatch. This will succeed if they are
889	// connected by an unconditional branch. This is just a cleanup so the
890	// emitted code isn't too gross in this common case.
891	DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
892	BasicBlock *PredBB = OrigHeader->getUniquePredecessor();
893	bool DidMerge = MergeBlockIntoPredecessor(BB: OrigHeader, DTU: &DTU, LI, MSSAU);
894	if (DidMerge)
895	RemoveRedundantDbgInstrs(BB: PredBB);
896
897	if (MSSAU && VerifyMemorySSA)
898	MSSAU->getMemorySSA()->verifyMemorySSA();
899
900	LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
901
902	++NumRotated;
903
904	Rotated = true;
905	SimplifiedLatch = false;
906
907	// Check that new latch is a deoptimizing exit and then repeat rotation if possible.
908	// Deoptimizing latch exit is not a generally typical case, so we just loop over.
909	// TODO: if it becomes a performance bottleneck extend rotation algorithm
910	// to handle multiple rotations in one go.
911	} while (MultiRotate && canRotateDeoptimizingLatchExit(L));
912
913
914	return true;
915	}
916
917	/// Determine whether the instructions in this range may be safely and cheaply
918	/// speculated. This is not an important enough situation to develop complex
919	/// heuristics. We handle a single arithmetic instruction along with any type
920	/// conversions.
921	static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
922	BasicBlock::iterator End, Loop *L) {
923	bool seenIncrement = false;
924	bool MultiExitLoop = false;
925
926	if (!L->getExitingBlock())
927	MultiExitLoop = true;
928
929	for (BasicBlock::iterator I = Begin; I != End; ++I) {
930
931	if (!isSafeToSpeculativelyExecute(I: &*I))
932	return false;
933
934	switch (I ->getOpcode()) {
935	default:
936	return false;
937	case Instruction::GetElementPtr:
938	// GEPs are cheap if all indices are constant.
939	if (!cast<GEPOperator>(Val&: I)->hasAllConstantIndices())
940	return false;
941	// fall-thru to increment case
942	[[fallthrough]];
943	case Instruction::Add:
944	case Instruction::Sub:
945	case Instruction::And:
946	case Instruction::Or:
947	case Instruction::Xor:
948	case Instruction::Shl:
949	case Instruction::LShr:
950	case Instruction::AShr: {
951	Value *IVOpnd =
952	!isa<Constant>(Val: I ->getOperand(i: `0`))
953	? I ->getOperand(i: `0`)
954	: !isa<Constant>(Val: I ->getOperand(i: `1`)) ? I ->getOperand(i: `1`) : nullptr;
955	if (!IVOpnd)
956	return false;
957
958	// If increment operand is used outside of the loop, this speculation
959	// could cause extra live range interference.
960	if (MultiExitLoop) {
961	for (User *UseI : IVOpnd->users()) {
962	auto *UserInst = cast<Instruction>(Val: UseI);
963	if (!L->contains(Inst: UserInst))
964	return false;
965	}
966	}
967
968	if (seenIncrement)
969	return false;
970	seenIncrement = true;
971	break;
972	}
973	case Instruction::Trunc:
974	case Instruction::ZExt:
975	case Instruction::SExt:
976	// ignore type conversions
977	break;
978	}
979	}
980	return true;
981	}
982
983	/// Fold the loop tail into the loop exit by speculating the loop tail
984	/// instructions. Typically, this is a single post-increment. In the case of a
985	/// simple 2-block loop, hoisting the increment can be much better than
986	/// duplicating the entire loop header. In the case of loops with early exits,
987	/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
988	/// canonical form so downstream passes can handle it.
989	///
990	/// I don't believe this invalidates SCEV.
991	bool LoopRotate::simplifyLoopLatch(Loop *L) {
992	BasicBlock *Latch = L->getLoopLatch();
993	if (!Latch \|\| Latch->hasAddressTaken())
994	return false;
995
996	BranchInst *Jmp = dyn_cast<BranchInst>(Val: Latch->getTerminator());
997	if (!Jmp \|\| !Jmp->isUnconditional())
998	return false;
999
1000	BasicBlock *LastExit = Latch->getSinglePredecessor();
1001	if (!LastExit \|\| !L->isLoopExiting(BB: LastExit))
1002	return false;
1003
1004	BranchInst *BI = dyn_cast<BranchInst>(Val: LastExit->getTerminator());
1005	if (!BI)
1006	return false;
1007
1008	if (!shouldSpeculateInstrs(Begin: Latch->begin(), End: Jmp->getIterator(), L))
1009	return false;
1010
1011	LLVM_DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
1012	<< LastExit->getName() << "\n");
1013
1014	DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
1015	MergeBlockIntoPredecessor(BB: Latch, DTU: &DTU, LI, MSSAU, MemDep: nullptr,
1016	/PredecessorWithTwoSuccessors=/true);
1017
1018	if (SE) {
1019	// Merging blocks may remove blocks reference in the block disposition cache. Clear the cache.
1020	SE->forgetBlockAndLoopDispositions();
1021	}
1022
1023	if (MSSAU && VerifyMemorySSA)
1024	MSSAU->getMemorySSA()->verifyMemorySSA();
1025
1026	return true;
1027	}
1028
1029	/// Rotate \c L, and return true if any modification was made.
1030	bool LoopRotate::processLoop(Loop *L) {
1031	// Save the loop metadata.
1032	MDNode *LoopMD = L->getLoopID();
1033
1034	bool SimplifiedLatch = false;
1035
1036	// Simplify the loop latch before attempting to rotate the header
1037	// upward. Rotation may not be needed if the loop tail can be folded into the
1038	// loop exit.
1039	if (!RotationOnly)
1040	SimplifiedLatch = simplifyLoopLatch(L);
1041
1042	bool MadeChange = rotateLoop(L, SimplifiedLatch);
1043	assert((!MadeChange \|\| L->isLoopExiting(L->getLoopLatch())) &&
1044	"Loop latch should be exiting after loop-rotate.");
1045
1046	// Restore the loop metadata.
1047	// NB! We presume LoopRotation DOESN'T ADD its own metadata.
1048	if ((MadeChange \|\| SimplifiedLatch) && LoopMD)
1049	L->setLoopID(LoopMD);
1050
1051	return MadeChange \|\| SimplifiedLatch;
1052	}
1053
1054
1055	/// The utility to convert a loop into a loop with bottom test.
1056	bool llvm::LoopRotation(Loop L, LoopInfo LI, const TargetTransformInfo *TTI,
1057	AssumptionCache AC, DominatorTree DT,
1058	ScalarEvolution SE, MemorySSAUpdater MSSAU,
1059	const SimplifyQuery &SQ, bool RotationOnly = true,
1060	unsigned Threshold = unsigned(-`1`),
1061	bool IsUtilMode = true, bool PrepareForLTO) {
1062	LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly,
1063	IsUtilMode, PrepareForLTO);
1064	return LR.processLoop(L);
1065	}
1066

Browse the source code of llvm_projects/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp