LoopInterchange.cpp source code [llvm_projects/llvm/lib/Transforms/Scalar/LoopInterchange.cpp]

1	//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This Pass handles loop interchange transform.
10	// This pass interchanges loops to provide a more cache-friendly memory access
11	// patterns.
12	//
13	//===----------------------------------------------------------------------===//
14
15	#include "llvm/Transforms/Scalar/LoopInterchange.h"
16	#include "llvm/ADT/STLExtras.h"
17	#include "llvm/ADT/SmallSet.h"
18	#include "llvm/ADT/SmallVector.h"
19	#include "llvm/ADT/Statistic.h"
20	#include "llvm/ADT/StringMap.h"
21	#include "llvm/ADT/StringRef.h"
22	#include "llvm/Analysis/DependenceAnalysis.h"
23	#include "llvm/Analysis/LoopCacheAnalysis.h"
24	#include "llvm/Analysis/LoopInfo.h"
25	#include "llvm/Analysis/LoopNestAnalysis.h"
26	#include "llvm/Analysis/LoopPass.h"
27	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
28	#include "llvm/Analysis/ScalarEvolution.h"
29	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
30	#include "llvm/IR/BasicBlock.h"
31	#include "llvm/IR/DiagnosticInfo.h"
32	#include "llvm/IR/Dominators.h"
33	#include "llvm/IR/Function.h"
34	#include "llvm/IR/IRBuilder.h"
35	#include "llvm/IR/InstrTypes.h"
36	#include "llvm/IR/Instruction.h"
37	#include "llvm/IR/Instructions.h"
38	#include "llvm/IR/User.h"
39	#include "llvm/IR/Value.h"
40	#include "llvm/Support/Casting.h"
41	#include "llvm/Support/CommandLine.h"
42	#include "llvm/Support/Debug.h"
43	#include "llvm/Support/ErrorHandling.h"
44	#include "llvm/Support/raw_ostream.h"
45	#include "llvm/Transforms/Scalar/LoopPassManager.h"
46	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
47	#include "llvm/Transforms/Utils/Local.h"
48	#include "llvm/Transforms/Utils/LoopUtils.h"
49	#include <cassert>
50	#include <utility>
51	#include <vector>
52
53	using namespace llvm;
54
55	#define DEBUG_TYPE "loop-interchange"
56
57	STATISTIC(LoopsInterchanged, "Number of loops interchanged");
58
59	static cl::opt<int> LoopInterchangeCostThreshold(
60	"loop-interchange-threshold", cl::init(Val: `0`), cl::Hidden,
61	cl::desc ("Interchange if you gain more than this number"));
62
63	// Maximum number of load-stores that can be handled in the dependency matrix.
64	static cl::opt<unsigned int> MaxMemInstrCount(
65	"loop-interchange-max-meminstr-count", cl::init(Val: `64`), cl::Hidden,
66	cl::desc (
67	"Maximum number of load-store instructions that should be handled "
68	"in the dependency matrix. Higher value may lead to more interchanges "
69	"at the cost of compile-time"));
70
71	namespace {
72
73	using LoopVector = SmallVector<Loop *, `8`>;
74
75	/// A list of direction vectors. Each entry represents a direction vector
76	/// corresponding to one or more dependencies existing in the loop nest. The
77	/// length of all direction vectors is equal and is N + 1, where N is the depth
78	/// of the loop nest. The first N elements correspond to the dependency
79	/// direction of each N loops. The last one indicates whether this entry is
80	/// forward dependency ('<') or not (''). The term "forward" aligns with what*
81	/// is defined in LoopAccessAnalysis.
82	// TODO: Check if we can use a sparse matrix here.
83	using CharMatrix = std::vector<std::vector<char>>;
84
85	/// Types of rules used in profitability check.
86	enum class RuleTy {
87	PerLoopCacheAnalysis,
88	PerInstrOrderCost,
89	ForVectorization,
90	Ignore
91	};
92
93	} // end anonymous namespace
94
95	// Minimum loop depth supported.
96	static cl::opt<unsigned int> MinLoopNestDepth(
97	"loop-interchange-min-loop-nest-depth", cl::init(Val: `2`), cl::Hidden,
98	cl::desc ("Minimum depth of loop nest considered for the transform"));
99
100	// Maximum loop depth supported.
101	static cl::opt<unsigned int> MaxLoopNestDepth(
102	"loop-interchange-max-loop-nest-depth", cl::init(Val: `10`), cl::Hidden,
103	cl::desc ("Maximum depth of loop nest considered for the transform"));
104
105	// We prefer cache cost to vectorization by default.
106	static cl::list<RuleTy> Profitabilities(
107	"loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated,
108	cl::Hidden,
109	cl::desc ("List of profitability heuristics to be used. They are applied in "
110	"the given order"),
111	cl::list_init<RuleTy>(Vals: {RuleTy::PerLoopCacheAnalysis,
112	RuleTy::PerInstrOrderCost,
113	RuleTy::ForVectorization}),
114	cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
115	"Prioritize loop cache cost"),
116	clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
117	"Prioritize the IVs order of each instruction"),
118	clEnumValN(RuleTy::ForVectorization, "vectorize",
119	"Prioritize vectorization"),
120	clEnumValN(RuleTy::Ignore, "ignore",
121	"Ignore profitability, force interchange (does not "
122	"work with other options)")));
123
124	// Support for the inner-loop reduction pattern.
125	static cl::opt<bool> EnableReduction2Memory(
126	"loop-interchange-reduction-to-mem", cl::init(Val: false), cl::Hidden,
127	cl::desc ("Support for the inner-loop reduction pattern."));
128
129	#ifndef NDEBUG
130	static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) {
131	SmallSet<RuleTy, `4`> Set;
132	for (RuleTy Rule : Rules) {
133	if (!Set.insert(Rule).second)
134	return false;
135	if (Rule == RuleTy::Ignore)
136	return false;
137	}
138	return true;
139	}
140
141	static void printDepMatrix(CharMatrix &DepMatrix) {
142	for (auto &Row : DepMatrix) {
143	// Drop the last element because it is a flag indicating whether this is
144	// forward dependency or not, which doesn't affect the legality check.
145	for (char D : drop_end(Row))
146	LLVM_DEBUG(dbgs() << D << " ");
147	LLVM_DEBUG(dbgs() << "\n");
148	}
149	}
150
151	/// Return true if \p Src appears before \p Dst in the same basic block.
152	/// Precondition: \p Src and \Dst are distinct instructions within the same
153	/// basic block.
154	static bool inThisOrder(const Instruction Src, const* Instruction *Dst) {
155	assert(Src->getParent() == Dst->getParent() && Src != Dst &&
156	"Expected Src and Dst to be different instructions in the same BB");
157
158	bool FoundSrc = false;
159	for (const Instruction &I : *(Src->getParent())) {
160	if (&I == Src) {
161	FoundSrc = true;
162	continue;
163	}
164	if (&I == Dst)
165	return FoundSrc;
166	}
167
168	llvm_unreachable("Dst not found");
169	}
170	#endif
171
172	static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
173	Loop L, DependenceInfo DI,
174	ScalarEvolution *SE,
175	OptimizationRemarkEmitter *ORE) {
176	using ValueVector = SmallVector<Value *, `16`>;
177
178	ValueVector MemInstr;
179
180	// For each block.
181	for (BasicBlock *BB : L->blocks()) {
182	// Scan the BB and collect legal loads and stores.
183	for (Instruction &I : *BB) {
184	if (!isa<Instruction>(Val: I))
185	return false;
186	if (auto *Ld = dyn_cast<LoadInst>(Val: &I)) {
187	if (!Ld->isSimple())
188	return false;
189	MemInstr.push_back(Elt: &I);
190	} else if (auto *St = dyn_cast<StoreInst>(Val: &I)) {
191	if (!St->isSimple())
192	return false;
193	MemInstr.push_back(Elt: &I);
194	}
195	}
196	}
197
198	LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
199	<< " Loads and Stores to analyze\n");
200	if (MemInstr.size() > MaxMemInstrCount) {
201	LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
202	<< MaxMemInstrCount << " load/stores in a loop\n");
203	ORE->emit(RemarkBuilder: [&]() {
204	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedLoop",
205	L->getStartLoc(), L->getHeader())
206	<< "Number of loads/stores exceeded, the supported maximum "
207	"can be increased with option "
208	"-loop-interchange-maxmeminstr-count.";
209	});
210	return false;
211	}
212	ValueVector::iterator I, IE, J, JE;
213
214	// Manage direction vectors that are already seen. Map each direction vector
215	// to an index of DepMatrix at which it is stored.
216	StringMap<unsigned> Seen;
217
218	for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
219	for (J = I, JE = MemInstr.end(); J != JE; ++J) {
220	std::vector<char> Dep;
221	Instruction Src = cast<Instruction>(Val: I);
222	Instruction Dst = cast<Instruction>(Val: J);
223	// Ignore Input dependencies.
224	if (isa<LoadInst>(Val: Src) && isa<LoadInst>(Val: Dst))
225	continue;
226	// Track Output, Flow, and Anti dependencies.
227	if (auto D = DI->depends(Src, Dst)) {
228	assert(D->isOrdered() && "Expected an output, flow or anti dep.");
229	// If the direction vector is negative, normalize it to
230	// make it non-negative.
231	if (D ->normalize(SE))
232	LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
233	LLVM_DEBUG(StringRef DepType =
234	D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
235	dbgs() << "Found " << DepType
236	<< " dependency between Src and Dst\n"
237	<< " Src:" << Src << "\n Dst:" << Dst << `'\n'`);
238	unsigned Levels = D ->getLevels();
239	char Direction;
240	for (unsigned II = `1`; II <= Levels; ++II) {
241	// `DVEntry::LE` is converted to ``. This is because `LE` means `<`*
242	// or `=`, for which we don't have an equivalent representation, so
243	// that the conservative approximation is necessary. The same goes for
244	// `DVEntry::GE`.
245	// TODO: Use of fine-grained expressions allows for more accurate
246	// analysis.
247	unsigned Dir = D ->getDirection(Level: II);
248	if (Dir == Dependence::DVEntry::LT)
249	Direction = `'<'`;
250	else if (Dir == Dependence::DVEntry::GT)
251	Direction = `'>'`;
252	else if (Dir == Dependence::DVEntry::EQ)
253	Direction = `'='`;
254	else
255	Direction = `'*'`;
256	Dep.push_back(x: Direction);
257	}
258
259	// If the Dependence object doesn't have any information, fill the
260	// dependency vector with ''.*
261	if (D ->isConfused()) {
262	assert(Dep.empty() && "Expected empty dependency vector");
263	Dep.assign(n: Level, val: `'*'`);
264	}
265
266	while (Dep.size() != Level) {
267	Dep.push_back(x: `'I'`);
268	}
269
270	// If all the elements of any direction vector have only '', legality*
271	// can't be proven. Exit early to save compile time.
272	if (all_of(Range&: Dep, P: equal_to(Arg: `'*'`))) {
273	ORE->emit(RemarkBuilder: [&]() {
274	return OptimizationRemarkMissed (DEBUG_TYPE, "Dependence",
275	L->getStartLoc(), L->getHeader())
276	<< "All loops have dependencies in all directions.";
277	});
278	return false;
279	}
280
281	// Test whether the dependency is forward or not.
282	bool IsKnownForward = true;
283	if (Src->getParent() != Dst->getParent()) {
284	// In general, when Src and Dst are in different BBs, the execution
285	// order of them within a single iteration is not guaranteed. Treat
286	// conservatively as not-forward dependency in this case.
287	IsKnownForward = false;
288	} else {
289	// Src and Dst are in the same BB. If they are the different
290	// instructions, Src should appear before Dst in the BB as they are
291	// stored to MemInstr in that order.
292	assert((Src == Dst \|\| inThisOrder(Src, Dst)) &&
293	"Unexpected instructions");
294
295	// If the Dependence object is reversed (due to normalization), it
296	// represents the dependency from Dst to Src, meaning it is a backward
297	// dependency. Otherwise it should be a forward dependency.
298	bool IsReversed = D ->getSrc() != Src;
299	if (IsReversed)
300	IsKnownForward = false;
301	}
302
303	// Initialize the last element. Assume forward dependencies only; it
304	// will be updated later if there is any non-forward dependency.
305	Dep.push_back(x: `'<'`);
306
307	// The last element should express the "summary" among one or more
308	// direction vectors whose first N elements are the same (where N is
309	// the depth of the loop nest). Hence we exclude the last element from
310	// the Seen map.
311	auto [Ite, Inserted] = Seen.try_emplace(
312	Key: StringRef (Dep.data(), Dep.size() - `1`), Args: DepMatrix.size());
313
314	// Make sure we only add unique entries to the dependency matrix.
315	if (Inserted)
316	DepMatrix.push_back(x: Dep);
317
318	// If we cannot prove that this dependency is forward, change the last
319	// element of the corresponding entry. Since a `[... ]` dependency*
320	// includes a `[... <]` dependency, we do not need to keep both and
321	// change the existing entry instead.
322	if (!IsKnownForward)
323	DepMatrix [Ite ->second].back() = `'*'`;
324	}
325	}
326	}
327
328	return true;
329	}
330
331	// A loop is moved from index 'from' to an index 'to'. Update the Dependence
332	// matrix by exchanging the two columns.
333	static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
334	unsigned ToIndx) {
335	for (auto &Row : DepMatrix)
336	std::swap(a&: Row [ToIndx], b&: Row [FromIndx]);
337	}
338
339	// Check if a direction vector is lexicographically positive. Return true if it
340	// is positive, nullopt if it is "zero", otherwise false.
341	// [Theorem] A permutation of the loops in a perfect nest is legal if and only
342	// if the direction matrix, after the same permutation is applied to its
343	// columns, has no ">" direction as the leftmost non-"=" direction in any row.
344	static std::optional<bool>
345	isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
346	for (unsigned char Direction : DV.slice(N: Begin, M: End - Begin)) {
347	if (Direction == `'<'`)
348	return true;
349	if (Direction == `'>'` \|\| Direction == `'*'`)
350	return false;
351	}
352	return std::nullopt;
353	}
354
355	// Checks if it is legal to interchange 2 loops.
356	static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
357	unsigned InnerLoopId,
358	unsigned OuterLoopId) {
359	unsigned NumRows = DepMatrix.size();
360	std::vector<char> Cur;
361	// For each row check if it is valid to interchange.
362	for (unsigned Row = `0`; Row < NumRows; ++Row) {
363	// Create temporary DepVector check its lexicographical order
364	// before and after swapping OuterLoop vs InnerLoop
365	Cur = DepMatrix [Row];
366
367	// If the surrounding loops already ensure that the direction vector is
368	// lexicographically positive, nothing within the loop will be able to break
369	// the dependence. In such a case we can skip the subsequent check.
370	if (isLexicographicallyPositive(DV: Cur, Begin: `0`, End: OuterLoopId) == true)
371	continue;
372
373	// Check if the direction vector is lexicographically positive (or zero)
374	// for both before/after exchanged. Ignore the last element because it
375	// doesn't affect the legality.
376	if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size() - `1`) == false)
377	return false;
378	std::swap(a&: Cur [InnerLoopId], b&: Cur [OuterLoopId]);
379	if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size() - `1`) == false)
380	return false;
381	}
382	return true;
383	}
384
385	static void populateWorklist(Loop &L, LoopVector &LoopList) {
386	LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
387	<< L.getHeader()->getParent()->getName() << " Loop: %"
388	<< L.getHeader()->getName() << `'\n'`);
389	assert(LoopList.empty() && "LoopList should initially be empty!");
390	Loop *CurrentLoop = &L;
391	const std::vector<Loop > Vec = &CurrentLoop->getSubLoops();
392	while (!Vec->empty()) {
393	// The current loop has multiple subloops in it hence it is not tightly
394	// nested.
395	// Discard all loops above it added into Worklist.
396	if (Vec->size() != `1`) {
397	LoopList = {};
398	return;
399	}
400
401	LoopList.push_back(Elt: CurrentLoop);
402	CurrentLoop = Vec->front();
403	Vec = &CurrentLoop->getSubLoops();
404	}
405	LoopList.push_back(Elt: CurrentLoop);
406	}
407
408	static bool hasSupportedLoopDepth(ArrayRef<Loop *> LoopList,
409	OptimizationRemarkEmitter &ORE) {
410	unsigned LoopNestDepth = LoopList.size();
411	if (LoopNestDepth < MinLoopNestDepth \|\| LoopNestDepth > MaxLoopNestDepth) {
412	LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
413	<< ", the supported range is [" << MinLoopNestDepth
414	<< ", " << MaxLoopNestDepth << "].\n");
415	Loop *OuterLoop = LoopList.front();
416	ORE.emit(RemarkBuilder: [&]() {
417	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedLoopNestDepth",
418	OuterLoop->getStartLoc(),
419	OuterLoop->getHeader())
420	<< "Unsupported depth of loop nest, the supported range is ["
421	<< std::to_string(val: MinLoopNestDepth) << ", "
422	<< std::to_string(val: MaxLoopNestDepth) << "].\n";
423	});
424	return false;
425	}
426	return true;
427	}
428
429	static bool isComputableLoopNest(ScalarEvolution *SE,
430	ArrayRef<Loop *> LoopList) {
431	for (Loop *L : LoopList) {
432	const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
433	if (isa<SCEVCouldNotCompute>(Val: ExitCountOuter)) {
434	LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
435	return false;
436	}
437	if (L->getNumBackEdges() != `1`) {
438	LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
439	return false;
440	}
441	if (!L->getExitingBlock()) {
442	LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
443	return false;
444	}
445	}
446	return true;
447	}
448
449	namespace {
450
451	/// LoopInterchangeLegality checks if it is legal to interchange the loop.
452	class LoopInterchangeLegality {
453	public:
454	LoopInterchangeLegality(Loop Outer, Loop Inner, ScalarEvolution *SE,
455	OptimizationRemarkEmitter ORE, DominatorTree DT)
456	: OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
457
458	/// Check if the loops can be interchanged.
459	bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
460	CharMatrix &DepMatrix);
461
462	/// Discover induction PHIs in the header of \p L. Induction
463	/// PHIs are added to \p Inductions.
464	bool findInductions(Loop L, SmallVectorImpl<PHINode > &Inductions);
465
466	/// Check if the loop structure is understood. We do not handle triangular
467	/// loops for now.
468	bool isLoopStructureUnderstood();
469
470	bool currentLimitations();
471
472	const SmallPtrSetImpl<PHINode > &getOuterInnerReductions() const* {
473	return OuterInnerReductions;
474	}
475
476	const ArrayRef<PHINode > getInnerLoopInductions() const* {
477	return InnerLoopInductions;
478	}
479
480	ArrayRef<Instruction > getHasNoWrapReductions() const* {
481	return HasNoWrapReductions;
482	}
483
484	/// Record reductions in the inner loop. Currently supported reductions:
485	/// - initialized from a constant.
486	/// - reduction PHI node has only one user.
487	/// - located in the innermost loop.
488	struct InnerReduction {
489	/// The reduction itself.
490	PHINode *Reduction;
491	Value *Init;
492	Value *Next;
493	/// The Lcssa PHI.
494	PHINode *LcssaPhi;
495	/// Store reduction result into memory object.
496	StoreInst *LcssaStore;
497	/// The memory Location.
498	Value *MemRef;
499	Type *ElemTy;
500	};
501
502	ArrayRef<InnerReduction> getInnerReductions() const {
503	return InnerReductions;
504	}
505
506	private:
507	bool tightlyNested(Loop Outer, Loop Inner);
508	bool containsUnsafeInstructions(BasicBlock BB, Instruction Skip);
509
510	/// Discover induction and reduction PHIs in the header of \p L. Induction
511	/// PHIs are added to \p Inductions, reductions are added to
512	/// OuterInnerReductions. When the outer loop is passed, the inner loop needs
513	/// to be passed as \p InnerLoop.
514	bool findInductionAndReductions(Loop *L,
515	SmallVector<PHINode *, `8`> &Inductions,
516	Loop *InnerLoop);
517
518	/// Detect and record the reduction of the inner loop. Add them to
519	/// InnerReductions.
520	///
521	/// innerloop:
522	/// Re = phi<0.0, Next>
523	/// Next = Re op ...
524	/// OuterLoopLatch:
525	/// Lcssa = phi<Next> ; lcssa phi
526	/// store Lcssa, MemRef ; LcssaStore
527	///
528	bool isInnerReduction(Loop L, PHINode Phi,
529	SmallVectorImpl<Instruction *> &HasNoWrapInsts);
530
531	Loop *OuterLoop;
532	Loop *InnerLoop;
533
534	ScalarEvolution *SE;
535	DominatorTree *DT;
536
537	/// Interface to emit optimization remarks.
538	OptimizationRemarkEmitter *ORE;
539
540	/// Set of reduction PHIs taking part of a reduction across the inner and
541	/// outer loop.
542	SmallPtrSet<PHINode *, `4`> OuterInnerReductions;
543
544	/// Set of inner loop induction PHIs
545	SmallVector<PHINode *, `8`> InnerLoopInductions;
546
547	/// Hold instructions that have nuw/nsw flags and involved in reductions,
548	/// like integer addition/multiplication. Those flags must be dropped when
549	/// interchanging the loops.
550	SmallVector<Instruction *, `4`> HasNoWrapReductions;
551
552	/// Vector of reductions in the inner loop.
553	SmallVector<InnerReduction, `8`> InnerReductions;
554	};
555
556	/// Manages information utilized by the profitability check for cache. The main
557	/// purpose of this class is to delay the computation of CacheCost until it is
558	/// actually needed.
559	class CacheCostManager {
560	Loop *OutermostLoop;
561	LoopStandardAnalysisResults *AR;
562	DependenceInfo *DI;
563
564	/// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
565	/// that the result can be nullptr.
566	std::optional<std::unique_ptr<CacheCost>> CC;
567
568	/// Maps each loop to an index representing the optimal position within the
569	/// loop-nest, as determined by the cache cost analysis.
570	DenseMap<const Loop , unsigned*> CostMap;
571
572	void computeIfUnitinialized();
573
574	public:
575	CacheCostManager(Loop OutermostLoop, LoopStandardAnalysisResults AR,
576	DependenceInfo *DI)
577	: OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
578	CacheCost *getCacheCost();
579	const DenseMap<const Loop , unsigned*> &getCostMap();
580	};
581
582	/// LoopInterchangeProfitability checks if it is profitable to interchange the
583	/// loop.
584	class LoopInterchangeProfitability {
585	public:
586	LoopInterchangeProfitability(Loop Outer, Loop Inner, ScalarEvolution *SE,
587	OptimizationRemarkEmitter *ORE)
588	: OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
589
590	/// Check if the loop interchange is profitable.
591	bool isProfitable(const Loop InnerLoop, const* Loop *OuterLoop,
592	unsigned InnerLoopId, unsigned OuterLoopId,
593	CharMatrix &DepMatrix, CacheCostManager &CCM);
594
595	private:
596	int getInstrOrderCost();
597	std::optional<bool> isProfitablePerLoopCacheAnalysis(
598	const DenseMap<const Loop , unsigned> &CostMap, CacheCost CC);
599	std::optional<bool> isProfitablePerInstrOrderCost();
600	std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
601	unsigned OuterLoopId,
602	CharMatrix &DepMatrix);
603	Loop *OuterLoop;
604	Loop *InnerLoop;
605
606	/// Scev analysis.
607	ScalarEvolution *SE;
608
609	/// Interface to emit optimization remarks.
610	OptimizationRemarkEmitter *ORE;
611	};
612
613	/// LoopInterchangeTransform interchanges the loop.
614	class LoopInterchangeTransform {
615	public:
616	LoopInterchangeTransform(Loop Outer, Loop Inner, ScalarEvolution *SE,
617	LoopInfo LI, DominatorTree DT,
618	const LoopInterchangeLegality &LIL)
619	: OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
620
621	/// Interchange OuterLoop and InnerLoop.
622	bool transform(ArrayRef<Instruction *> DropNoWrapInsts);
623	void reduction2Memory();
624	void restructureLoops(Loop NewInner, Loop NewOuter,
625	BasicBlock *OrigInnerPreHeader,
626	BasicBlock *OrigOuterPreHeader);
627	void removeChildLoop(Loop OuterLoop, Loop InnerLoop);
628
629	private:
630	bool adjustLoopLinks();
631	bool adjustLoopBranches();
632
633	Loop *OuterLoop;
634	Loop *InnerLoop;
635
636	/// Scev analysis.
637	ScalarEvolution *SE;
638
639	LoopInfo *LI;
640	DominatorTree *DT;
641
642	const LoopInterchangeLegality &LIL;
643	};
644
645	struct LoopInterchange {
646	ScalarEvolution SE = nullptr*;
647	LoopInfo LI = nullptr*;
648	DependenceInfo DI = nullptr*;
649	DominatorTree DT = nullptr*;
650	LoopStandardAnalysisResults AR = nullptr*;
651
652	/// Interface to emit optimization remarks.
653	OptimizationRemarkEmitter *ORE;
654
655	LoopInterchange(ScalarEvolution SE, LoopInfo LI, DependenceInfo *DI,
656	DominatorTree DT, LoopStandardAnalysisResults AR,
657	OptimizationRemarkEmitter *ORE)
658	: SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
659
660	bool run(Loop *L) {
661	if (L->getParentLoop())
662	return false;
663	SmallVector<Loop *, `8`> LoopList;
664	populateWorklist(L&: *L, LoopList);
665	return processLoopList(LoopList);
666	}
667
668	bool run(LoopNest &LN) {
669	SmallVector<Loop *, `8`> LoopList(LN.getLoops());
670	for (unsigned I = `1`; I < LoopList.size(); ++I)
671	if (LoopList [I]->getParentLoop() != LoopList [I - `1`])
672	return false;
673	return processLoopList(LoopList);
674	}
675
676	unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
677	// TODO: Add a better heuristic to select the loop to be interchanged based
678	// on the dependence matrix. Currently we select the innermost loop.
679	return LoopList.size() - `1`;
680	}
681
682	bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
683	bool Changed = false;
684
685	// Ensure proper loop nest depth.
686	assert(hasSupportedLoopDepth(LoopList, *ORE) &&
687	"Unsupported depth of loop nest.");
688
689	unsigned LoopNestDepth = LoopList.size();
690
691	LLVM_DEBUG({
692	dbgs() << "Processing LoopList of size = " << LoopNestDepth
693	<< " containing the following loops:\n";
694	for (auto *L : LoopList) {
695	dbgs() << " - ";
696	L->print(dbgs());
697	}
698	});
699
700	CharMatrix DependencyMatrix;
701	Loop OuterMostLoop = (LoopList.begin());
702	if (!populateDependencyMatrix(DepMatrix&: DependencyMatrix, Level: LoopNestDepth,
703	L: OuterMostLoop, DI, SE, ORE)) {
704	LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
705	return false;
706	}
707
708	LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
709	printDepMatrix(DependencyMatrix));
710
711	// Get the Outermost loop exit.
712	BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
713	if (!LoopNestExit) {
714	LLVM_DEBUG(dbgs() << "OuterMostLoop '" << OuterMostLoop->getName()
715	<< "' needs an unique exit block");
716	return false;
717	}
718
719	unsigned SelecLoopId = selectLoopForInterchange(LoopList);
720	CacheCostManager CCM(LoopList [`0`], AR, DI);
721	// We try to achieve the globally optimal memory access for the loopnest,
722	// and do interchange based on a bubble-sort fasion. We start from
723	// the innermost loop, move it outwards to the best possible position
724	// and repeat this process.
725	for (unsigned j = SelecLoopId; j > `0`; j--) {
726	bool ChangedPerIter = false;
727	for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
728	bool Interchanged =
729	processLoop(LoopList, InnerLoopId: i, OuterLoopId: i - `1`, DependencyMatrix, CCM);
730	ChangedPerIter \|= Interchanged;
731	Changed \|= Interchanged;
732	}
733	// Early abort if there was no interchange during an entire round of
734	// moving loops outwards.
735	if (!ChangedPerIter)
736	break;
737	}
738	return Changed;
739	}
740
741	bool processLoop(SmallVectorImpl<Loop > &LoopList, unsigned* InnerLoopId,
742	unsigned OuterLoopId,
743	std::vector<std::vector<char>> &DependencyMatrix,
744	CacheCostManager &CCM) {
745	Loop *OuterLoop = LoopList [OuterLoopId];
746	Loop *InnerLoop = LoopList [InnerLoopId];
747	LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
748	<< " and OuterLoopId = " << OuterLoopId << "\n");
749	LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
750	if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DepMatrix&: DependencyMatrix)) {
751	LLVM_DEBUG(dbgs() << "Cannot prove legality, not interchanging loops '"
752	<< OuterLoop->getName() << "' and '"
753	<< InnerLoop->getName() << "'\n");
754	return false;
755	}
756	LLVM_DEBUG(dbgs() << "Loops '" << OuterLoop->getName() << "' and '"
757	<< InnerLoop->getName()
758	<< "' are legal to interchange\n");
759	LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
760	if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
761	DepMatrix&: DependencyMatrix, CCM)) {
762	LLVM_DEBUG(dbgs() << "Interchanging loops '" << OuterLoop->getName()
763	<< "' and '" << InnerLoop->getName()
764	<< "' not profitable.\n");
765	return false;
766	}
767
768	ORE->emit(RemarkBuilder: [&]() {
769	return OptimizationRemark (DEBUG_TYPE, "Interchanged",
770	InnerLoop->getStartLoc(),
771	InnerLoop->getHeader())
772	<< "Loop interchanged with enclosing loop.";
773	});
774
775	LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
776	LIT.transform(DropNoWrapInsts: LIL.getHasNoWrapReductions());
777	LLVM_DEBUG(dbgs() << "Loops interchanged: outer loop '"
778	<< OuterLoop->getName() << "' and inner loop '"
779	<< InnerLoop->getName() << "'\n");
780	LoopsInterchanged ++;
781
782	llvm::formLCSSARecursively(L&: OuterLoop, DT: DT, LI, SE);
783
784	// Loops interchanged, update LoopList accordingly.
785	std::swap(a&: LoopList [OuterLoopId], b&: LoopList [InnerLoopId]);
786	// Update the DependencyMatrix
787	interChangeDependencies(DepMatrix&: DependencyMatrix, FromIndx: InnerLoopId, ToIndx: OuterLoopId);
788
789	LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
790	printDepMatrix(DependencyMatrix));
791
792	return true;
793	}
794	};
795
796	} // end anonymous namespace
797
798	bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
799	Instruction *Skip) {
800	return any_of(Range&: BB, P: [Skip](const* Instruction &I) {
801	if (&I == Skip)
802	return false;
803	return I.mayHaveSideEffects() \|\| I.mayReadFromMemory();
804	});
805	}
806
807	bool LoopInterchangeLegality::tightlyNested(Loop OuterLoop, Loop InnerLoop) {
808	BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
809	BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
810	BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
811
812	LLVM_DEBUG(dbgs() << "Checking if loops '" << OuterLoop->getName()
813	<< "' and '" << InnerLoop->getName()
814	<< "' are tightly nested\n");
815
816	// A perfectly nested loop will not have any branch in between the outer and
817	// inner block i.e. outer header will branch to either inner preheader and
818	// outerloop latch.
819	for (BasicBlock *Succ : successors(BB: OuterLoopHeader))
820	if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
821	Succ != OuterLoopLatch)
822	return false;
823
824	LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
825
826	// The inner loop reduction pattern requires storing the LCSSA PHI in
827	// the OuterLoop Latch. Therefore, when reduction2Memory is enabled, skip
828	// that store during checks.
829	Instruction Skip = nullptr*;
830	assert(InnerReductions.size() <= `1` &&
831	"So far we only support at most one reduction.");
832	if (InnerReductions.size() == `1`)
833	Skip = InnerReductions [`0`].LcssaStore;
834
835	// We do not have any basic block in between now make sure the outer header
836	// and outer loop latch doesn't contain any unsafe instructions.
837	if (containsUnsafeInstructions(BB: OuterLoopHeader, Skip) \|\|
838	containsUnsafeInstructions(BB: OuterLoopLatch, Skip))
839	return false;
840
841	// Also make sure the inner loop preheader does not contain any unsafe
842	// instructions. Note that all instructions in the preheader will be moved to
843	// the outer loop header when interchanging.
844	if (InnerLoopPreHeader != OuterLoopHeader &&
845	containsUnsafeInstructions(BB: InnerLoopPreHeader, Skip))
846	return false;
847
848	BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
849	// Ensure the inner loop exit block flows to the outer loop latch possibly
850	// through empty blocks.
851	const BasicBlock &SuccInner =
852	LoopNest::skipEmptyBlockUntil(From: InnerLoopExit, End: OuterLoopLatch);
853	if (&SuccInner != OuterLoopLatch) {
854	LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
855	<< " does not lead to the outer loop latch.\n";);
856	return false;
857	}
858	// The inner loop exit block does flow to the outer loop latch and not some
859	// other BBs, now make sure it contains safe instructions, since it will be
860	// moved into the (new) inner loop after interchange.
861	if (containsUnsafeInstructions(BB: InnerLoopExit, Skip))
862	return false;
863
864	LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
865	// We have a perfect loop nest.
866	return true;
867	}
868
869	bool LoopInterchangeLegality::isLoopStructureUnderstood() {
870	BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
871	for (PHINode *InnerInduction : InnerLoopInductions) {
872	unsigned Num = InnerInduction->getNumOperands();
873	for (unsigned i = `0`; i < Num; ++i) {
874	Value *Val = InnerInduction->getOperand(i_nocapture: i);
875	if (isa<Constant>(Val))
876	continue;
877	Instruction *I = dyn_cast<Instruction>(Val);
878	if (!I)
879	return false;
880	// TODO: Handle triangular loops.
881	// e.g. for(int i=0;i<N;i++)
882	// for(int j=i;j<N;j++)
883	unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
884	if (InnerInduction->getIncomingBlock(i: IncomBlockIndx) ==
885	InnerLoopPreheader &&
886	!OuterLoop->isLoopInvariant(V: I)) {
887	return false;
888	}
889	}
890	}
891
892	// TODO: Handle triangular loops of another form.
893	// e.g. for(int i=0;i<N;i++)
894	// for(int j=0;j<i;j++)
895	// or,
896	// for(int i=0;i<N;i++)
897	// for(int j=0;ji<N;j++)*
898	BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
899	CondBrInst *InnerLoopLatchBI =
900	dyn_cast<CondBrInst>(Val: InnerLoopLatch->getTerminator());
901	if (!InnerLoopLatchBI)
902	return false;
903	if (CmpInst *InnerLoopCmp =
904	dyn_cast<CmpInst>(Val: InnerLoopLatchBI->getCondition())) {
905	Value *Op0 = InnerLoopCmp->getOperand(i_nocapture: `0`);
906	Value *Op1 = InnerLoopCmp->getOperand(i_nocapture: `1`);
907
908	// LHS and RHS of the inner loop exit condition, e.g.,
909	// in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
910	Value Left = nullptr*;
911	Value Right = nullptr*;
912
913	// Check if V only involves inner loop induction variable.
914	// Return true if V is InnerInduction, or a cast from
915	// InnerInduction, or a binary operator that involves
916	// InnerInduction and a constant.
917	std::function<bool(Value *)> IsPathToInnerIndVar;
918	IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value V) -> bool* {
919	if (llvm::is_contained(Range&: InnerLoopInductions, Element: V))
920	return true;
921	if (isa<Constant>(Val: V))
922	return true;
923	const Instruction *I = dyn_cast<Instruction>(Val: V);
924	if (!I)
925	return false;
926	if (isa<CastInst>(Val: I))
927	return IsPathToInnerIndVar (I->getOperand(i: `0`));
928	if (isa<BinaryOperator>(Val: I))
929	return IsPathToInnerIndVar (I->getOperand(i: `0`)) &&
930	IsPathToInnerIndVar (I->getOperand(i: `1`));
931	return false;
932	};
933
934	// In case of multiple inner loop indvars, it is okay if LHS and RHS
935	// are both inner indvar related variables.
936	if (IsPathToInnerIndVar (Op0) && IsPathToInnerIndVar (Op1))
937	return true;
938
939	// Otherwise we check if the cmp instruction compares an inner indvar
940	// related variable (Left) with a outer loop invariant (Right).
941	if (IsPathToInnerIndVar (Op0) && !isa<Constant>(Val: Op0)) {
942	Left = Op0;
943	Right = Op1;
944	} else if (IsPathToInnerIndVar (Op1) && !isa<Constant>(Val: Op1)) {
945	Left = Op1;
946	Right = Op0;
947	}
948
949	if (Left == nullptr)
950	return false;
951
952	const SCEV *S = SE->getSCEV(V: Right);
953	if (!SE->isLoopInvariant(S, L: OuterLoop))
954	return false;
955	}
956
957	return true;
958	}
959
960	// If SV is a LCSSA PHI node with a single incoming value, return the incoming
961	// value.
962	static Value followLCSSA(Value SV) {
963	PHINode *PHI = dyn_cast<PHINode>(Val: SV);
964	if (!PHI)
965	return SV;
966
967	if (PHI->getNumIncomingValues() != `1`)
968	return SV;
969	return followLCSSA(SV: PHI->getIncomingValue(i: `0`));
970	}
971
972	static bool checkReductionKind(Loop L, PHINode PHI,
973	SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
974	RecurrenceDescriptor RD;
975	if (RecurrenceDescriptor::isReductionPHI(Phi: PHI, TheLoop: L, RedDes&: RD)) {
976	// Detect floating point reduction only when it can be reordered.
977	if (RD.getExactFPMathInst() != nullptr)
978	return false;
979
980	RecurKind RK = RD.getRecurrenceKind();
981	switch (RK) {
982	case RecurKind::Or:
983	case RecurKind::And:
984	case RecurKind::Xor:
985	case RecurKind::SMin:
986	case RecurKind::SMax:
987	case RecurKind::UMin:
988	case RecurKind::UMax:
989	case RecurKind::FAdd:
990	case RecurKind::FMul:
991	case RecurKind::FMin:
992	case RecurKind::FMax:
993	case RecurKind::FMinimum:
994	case RecurKind::FMaximum:
995	case RecurKind::FMinimumNum:
996	case RecurKind::FMaximumNum:
997	case RecurKind::FMulAdd:
998	case RecurKind::AnyOf:
999	return true;
1000
1001	// Change the order of integer addition/multiplication may change the
1002	// semantics. Consider the following case:
1003	//
1004	// int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
1005	// int sum = 0;
1006	// for (int i = 0; i < 2; i++)
1007	// for (int j = 0; j < 2; j++)
1008	// sum += A[j][i];
1009	//
1010	// If the above loops are exchanged, the addition will cause an
1011	// overflow. To prevent this, we must drop the nuw/nsw flags from the
1012	// addition/multiplication instructions when we actually exchanges the
1013	// loops.
1014	case RecurKind::Add:
1015	case RecurKind::Mul: {
1016	unsigned OpCode = RecurrenceDescriptor::getOpcode(Kind: RK);
1017	SmallVector<Instruction *, `4`> Ops = RD.getReductionOpChain(Phi: PHI, L);
1018
1019	// Bail out when we fail to collect reduction instructions chain.
1020	if (Ops.empty())
1021	return false;
1022
1023	for (Instruction *I : Ops) {
1024	assert(I->getOpcode() == OpCode &&
1025	"Expected the instruction to be the reduction operation");
1026	(void)OpCode;
1027
1028	// If the instruction has nuw/nsw flags, we must drop them when the
1029	// transformation is actually performed.
1030	if (I->hasNoSignedWrap() \|\| I->hasNoUnsignedWrap())
1031	HasNoWrapInsts.push_back(Elt: I);
1032	}
1033	return true;
1034	}
1035
1036	default:
1037	return false;
1038	}
1039	} else
1040	return false;
1041	}
1042
1043	// Check V's users to see if it is involved in a reduction in L.
1044	static PHINode *
1045	findInnerReductionPhi(Loop L, Value V,
1046	SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1047	// Reduction variables cannot be constants.
1048	if (isa<Constant>(Val: V))
1049	return nullptr;
1050
1051	for (Value *User : V->users()) {
1052	if (PHINode *PHI = dyn_cast<PHINode>(Val: User)) {
1053	if (PHI->getNumIncomingValues() == `1`)
1054	continue;
1055
1056	if (checkReductionKind(L, PHI, HasNoWrapInsts))
1057	return PHI;
1058	else
1059	return nullptr;
1060	}
1061	}
1062
1063	return nullptr;
1064	}
1065
1066	bool LoopInterchangeLegality::isInnerReduction(
1067	Loop L, PHINode Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1068
1069	// Only support reduction2Mem when the loop nest to be interchanged is
1070	// the innermost two loops.
1071	if (!L->isInnermost()) {
1072	LLVM_DEBUG(dbgs() << "Only supported when the loop is the innermost.\n");
1073	ORE->emit(RemarkBuilder: [&]() {
1074	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedInnerReduction",
1075	L->getStartLoc(), L->getHeader())
1076	<< "Only supported when the loop is the innermost.";
1077	});
1078	return false;
1079	}
1080
1081	if (Phi->getNumIncomingValues() != `2`)
1082	return false;
1083
1084	Value *Init = Phi->getIncomingValueForBlock(BB: L->getLoopPreheader());
1085	Value *Next = Phi->getIncomingValueForBlock(BB: L->getLoopLatch());
1086
1087	// So far only supports constant initial value.
1088	if (!isa<Constant>(Val: Init)) {
1089	LLVM_DEBUG(
1090	dbgs()
1091	<< "Only supported for the reduction with a constant initial value.\n");
1092	ORE->emit(RemarkBuilder: [&]() {
1093	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedInnerReduction",
1094	L->getStartLoc(), L->getHeader())
1095	<< "Only supported for the reduction with a constant initial "
1096	"value.";
1097	});
1098	return false;
1099	}
1100
1101	// The reduction result must live in the inner loop.
1102	if (Instruction *I = dyn_cast<Instruction>(Val: Next)) {
1103	BasicBlock *BB = I->getParent();
1104	if (!L->contains(BB))
1105	return false;
1106	}
1107
1108	// The reduction should have only one user.
1109	if (!Phi->hasOneUser())
1110	return false;
1111
1112	// Check the reduction kind.
1113	if (!checkReductionKind(L, PHI: Phi, HasNoWrapInsts))
1114	return false;
1115
1116	// Find lcssa_phi in OuterLoop's Latch
1117	BasicBlock *ExitBlock = L->getExitBlock();
1118	if (!ExitBlock)
1119	return false;
1120
1121	PHINode *Lcssa = NULL;
1122	for (auto *U : Next->users()) {
1123	if (auto *P = dyn_cast<PHINode>(Val: U)) {
1124	if (P == Phi)
1125	continue;
1126
1127	if (Lcssa == NULL && P->getParent() == ExitBlock &&
1128	P->getIncomingValueForBlock(BB: L->getLoopLatch()) == Next)
1129	Lcssa = P;
1130	else
1131	return false;
1132	} else
1133	return false;
1134	}
1135	if (!Lcssa)
1136	return false;
1137
1138	if (!Lcssa->hasOneUser()) {
1139	LLVM_DEBUG(dbgs() << "Only supported when the reduction is used once in "
1140	"the outer loop.\n");
1141	ORE->emit(RemarkBuilder: [&]() {
1142	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedInnerReduction",
1143	L->getStartLoc(), L->getHeader())
1144	<< "Only supported when the reduction is used once in the outer "
1145	"loop.";
1146	});
1147	return false;
1148	}
1149
1150	StoreInst *LcssaStore =
1151	dyn_cast<StoreInst>(Val: Lcssa->getUniqueUndroppableUser());
1152	if (!LcssaStore \|\| LcssaStore->getParent() != ExitBlock)
1153	return false;
1154
1155	Value *MemRef = LcssaStore->getOperand(i_nocapture: `1`);
1156	Type *ElemTy = LcssaStore->getOperand(i_nocapture: `0`)->getType();
1157
1158	// LcssaStore stores the reduction result in BB.
1159	// When the reduction is initialized from a constant value, we need to load
1160	// from the memory object into the target basic block of the inner loop. This
1161	// means the memory reference was used prematurely. So we must ensure that the
1162	// memory reference does not dominate the target basic block.
1163	// TODO: Move the memory reference definition into the loop header.
1164	if (!DT->dominates(Def: dyn_cast<Instruction>(Val: MemRef), BB: L->getHeader())) {
1165	LLVM_DEBUG(dbgs() << "Only supported when memory reference dominate "
1166	"the inner loop.\n");
1167	ORE->emit(RemarkBuilder: [&]() {
1168	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedInnerReduction",
1169	L->getStartLoc(), L->getHeader())
1170	<< "Only supported when memory reference dominate the inner "
1171	"loop.";
1172	});
1173	return false;
1174	}
1175
1176	// Found a reduction in the inner loop.
1177	InnerReduction SR;
1178	SR.Reduction = Phi;
1179	SR.Init = Init;
1180	SR.Next = Next;
1181	SR.LcssaPhi = Lcssa;
1182	SR.LcssaStore = LcssaStore;
1183	SR.MemRef = MemRef;
1184	SR.ElemTy = ElemTy;
1185
1186	InnerReductions.push_back(Elt: SR);
1187	return true;
1188	}
1189
1190	bool LoopInterchangeLegality::findInductionAndReductions(
1191	Loop L, SmallVector<PHINode , `8`> &Inductions, Loop *InnerLoop) {
1192	if (!L->getLoopLatch() \|\| !L->getLoopPredecessor())
1193	return false;
1194	for (PHINode &PHI : L->getHeader()->phis()) {
1195	InductionDescriptor ID;
1196	if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID))
1197	Inductions.push_back(Elt: &PHI);
1198	else {
1199	// PHIs in inner loops need to be part of a reduction in the outer loop,
1200	// discovered when checking the PHIs of the outer loop earlier.
1201	if (!InnerLoop) {
1202	if (OuterInnerReductions.count(Ptr: &PHI)) {
1203	LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
1204	} else if (EnableReduction2Memory &&
1205	isInnerReduction(L, Phi: &PHI, HasNoWrapInsts&: HasNoWrapReductions)) {
1206	LLVM_DEBUG(dbgs() << "Found a reduction in the inner loop: \n"
1207	<< PHI << `'\n'`);
1208	} else
1209	return false;
1210	} else {
1211	assert(PHI.getNumIncomingValues() == `2` &&
1212	"Phis in loop header should have exactly 2 incoming values");
1213	// Check if we have a PHI node in the outer loop that has a reduction
1214	// result from the inner loop as an incoming value.
1215	Value *V = followLCSSA(SV: PHI.getIncomingValueForBlock(BB: L->getLoopLatch()));
1216	PHINode *InnerRedPhi =
1217	findInnerReductionPhi(L: InnerLoop, V, HasNoWrapInsts&: HasNoWrapReductions);
1218	if (!InnerRedPhi \|\|
1219	!llvm::is_contained(Range: InnerRedPhi->incoming_values(), Element: &PHI)) {
1220	LLVM_DEBUG(
1221	dbgs()
1222	<< "Failed to recognize PHI as an induction or reduction.\n");
1223	return false;
1224	}
1225	OuterInnerReductions.insert(Ptr: &PHI);
1226	OuterInnerReductions.insert(Ptr: InnerRedPhi);
1227	}
1228	}
1229	}
1230
1231	// For now we only support at most one reduction.
1232	if (InnerReductions.size() > `1`) {
1233	LLVM_DEBUG(dbgs() << "Only supports at most one reduction.\n");
1234	ORE->emit(RemarkBuilder: [&]() {
1235	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedInnerReduction",
1236	L->getStartLoc(), L->getHeader())
1237	<< "Only supports at most one reduction.";
1238	});
1239	return false;
1240	}
1241
1242	return true;
1243	}
1244
1245	// This function indicates the current limitations in the transform as a result
1246	// of which we do not proceed.
1247	bool LoopInterchangeLegality::currentLimitations() {
1248	BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1249
1250	// transform currently expects the loop latches to also be the exiting
1251	// blocks.
1252	if (InnerLoop->getExitingBlock() != InnerLoopLatch \|\|
1253	OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() \|\|
1254	!isa<CondBrInst>(Val: InnerLoopLatch->getTerminator()) \|\|
1255	!isa<CondBrInst>(Val: OuterLoop->getLoopLatch()->getTerminator())) {
1256	LLVM_DEBUG(
1257	dbgs() << "Loops where the latch is not the exiting block are not"
1258	<< " supported currently.\n");
1259	ORE->emit(RemarkBuilder: [&]() {
1260	return OptimizationRemarkMissed (DEBUG_TYPE, "ExitingNotLatch",
1261	OuterLoop->getStartLoc(),
1262	OuterLoop->getHeader())
1263	<< "Loops where the latch is not the exiting block cannot be"
1264	" interchange currently.";
1265	});
1266	return true;
1267	}
1268
1269	SmallVector<PHINode *, `8`> Inductions;
1270	if (!findInductionAndReductions(L: OuterLoop, Inductions, InnerLoop)) {
1271	LLVM_DEBUG(
1272	dbgs() << "Only outer loops with induction or reduction PHI nodes "
1273	<< "are supported currently.\n");
1274	ORE->emit(RemarkBuilder: [&]() {
1275	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedPHIOuter",
1276	OuterLoop->getStartLoc(),
1277	OuterLoop->getHeader())
1278	<< "Only outer loops with induction or reduction PHI nodes can be"
1279	" interchanged currently.";
1280	});
1281	return true;
1282	}
1283
1284	Inductions.clear();
1285	// For multi-level loop nests, make sure that all phi nodes for inner loops
1286	// at all levels can be recognized as a induction or reduction phi. Bail out
1287	// if a phi node at a certain nesting level cannot be properly recognized.
1288	Loop *CurLevelLoop = OuterLoop;
1289	while (!CurLevelLoop->getSubLoops().empty()) {
1290	// We already made sure that the loop nest is tightly nested.
1291	CurLevelLoop = CurLevelLoop->getSubLoops().front();
1292	if (!findInductionAndReductions(L: CurLevelLoop, Inductions, InnerLoop: nullptr)) {
1293	LLVM_DEBUG(
1294	dbgs() << "Only inner loops with induction or reduction PHI nodes "
1295	<< "are supported currently.\n");
1296	ORE->emit(RemarkBuilder: [&]() {
1297	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedPHIInner",
1298	CurLevelLoop->getStartLoc(),
1299	CurLevelLoop->getHeader())
1300	<< "Only inner loops with induction or reduction PHI nodes can be"
1301	" interchange currently.";
1302	});
1303	return true;
1304	}
1305	}
1306
1307	// TODO: Triangular loops are not handled for now.
1308	if (!isLoopStructureUnderstood()) {
1309	LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1310	ORE->emit(RemarkBuilder: [&]() {
1311	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedStructureInner",
1312	InnerLoop->getStartLoc(),
1313	InnerLoop->getHeader())
1314	<< "Inner loop structure not understood currently.";
1315	});
1316	return true;
1317	}
1318
1319	return false;
1320	}
1321
1322	bool LoopInterchangeLegality::findInductions(
1323	Loop L, SmallVectorImpl<PHINode > &Inductions) {
1324	for (PHINode &PHI : L->getHeader()->phis()) {
1325	InductionDescriptor ID;
1326	if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID))
1327	Inductions.push_back(Elt: &PHI);
1328	}
1329	return !Inductions.empty();
1330	}
1331
1332	// We currently only support LCSSA PHI nodes in the inner loop exit, if their
1333	// users are either reduction PHIs or PHIs outside the outer loop (which means
1334	// the we are only interested in the final value after the loop).
1335	static bool
1336	areInnerLoopExitPHIsSupported(Loop InnerL, Loop OuterL,
1337	SmallPtrSetImpl<PHINode *> &Reductions,
1338	PHINode *LcssaReduction) {
1339	BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
1340	for (PHINode &PHI : InnerExit->phis()) {
1341	// The reduction LCSSA PHI will have only one incoming block, which comes
1342	// from the loop latch.
1343	if (PHI.getNumIncomingValues() > `1`)
1344	return false;
1345	if (&PHI == LcssaReduction)
1346	return true;
1347	if (any_of(Range: PHI.users(), P: [&Reductions, OuterL](User *U) {
1348	PHINode *PN = dyn_cast<PHINode>(Val: U);
1349	return !PN \|\|
1350	(!Reductions.count(Ptr: PN) && OuterL->contains(BB: PN->getParent()));
1351	})) {
1352	return false;
1353	}
1354	}
1355	return true;
1356	}
1357
1358	// We currently support LCSSA PHI nodes in the outer loop exit, if their
1359	// incoming values do not come from the outer loop latch or if the
1360	// outer loop latch has a single predecessor. In that case, the value will
1361	// be available if both the inner and outer loop conditions are true, which
1362	// will still be true after interchanging. If we have multiple predecessor,
1363	// that may not be the case, e.g. because the outer loop latch may be executed
1364	// if the inner loop is not executed.
1365	static bool areOuterLoopExitPHIsSupported(Loop OuterLoop, Loop InnerLoop) {
1366	BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1367	for (PHINode &PHI : LoopNestExit->phis()) {
1368	for (Value *Incoming : PHI.incoming_values()) {
1369	Instruction *IncomingI = dyn_cast<Instruction>(Val: Incoming);
1370	if (!IncomingI \|\| IncomingI->getParent() != OuterLoop->getLoopLatch())
1371	continue;
1372
1373	// The incoming value is defined in the outer loop latch. Currently we
1374	// only support that in case the outer loop latch has a single predecessor.
1375	// This guarantees that the outer loop latch is executed if and only if
1376	// the inner loop is executed (because tightlyNested() guarantees that the
1377	// outer loop header only branches to the inner loop or the outer loop
1378	// latch).
1379	// FIXME: We could weaken this logic and allow multiple predecessors,
1380	// if the values are produced outside the loop latch. We would need
1381	// additional logic to update the PHI nodes in the exit block as
1382	// well.
1383	if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1384	return false;
1385	}
1386	}
1387	return true;
1388	}
1389
1390	// In case of multi-level nested loops, it may occur that lcssa phis exist in
1391	// the latch of InnerLoop, i.e., when defs of the incoming values are further
1392	// inside the loopnest. Sometimes those incoming values are not available
1393	// after interchange, since the original inner latch will become the new outer
1394	// latch which may have predecessor paths that do not include those incoming
1395	// values.
1396	// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1397	// multi-level loop nests.
1398	static bool areInnerLoopLatchPHIsSupported(Loop OuterLoop, Loop InnerLoop) {
1399	if (InnerLoop->getSubLoops().empty())
1400	return true;
1401	// If the original outer latch has only one predecessor, then values defined
1402	// further inside the looploop, e.g., in the innermost loop, will be available
1403	// at the new outer latch after interchange.
1404	if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1405	return true;
1406
1407	// The outer latch has more than one predecessors, i.e., the inner
1408	// exit and the inner header.
1409	// PHI nodes in the inner latch are lcssa phis where the incoming values
1410	// are defined further inside the loopnest. Check if those phis are used
1411	// in the original inner latch. If that is the case then bail out since
1412	// those incoming values may not be available at the new outer latch.
1413	BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1414	for (PHINode &PHI : InnerLoopLatch->phis()) {
1415	for (auto *U : PHI.users()) {
1416	Instruction *UI = cast<Instruction>(Val: U);
1417	if (InnerLoopLatch == UI->getParent())
1418	return false;
1419	}
1420	}
1421	return true;
1422	}
1423
1424	bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1425	unsigned OuterLoopId,
1426	CharMatrix &DepMatrix) {
1427	if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1428	LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1429	<< " and OuterLoopId = " << OuterLoopId
1430	<< " due to dependence\n");
1431	ORE->emit(RemarkBuilder: [&]() {
1432	return OptimizationRemarkMissed (DEBUG_TYPE, "Dependence",
1433	InnerLoop->getStartLoc(),
1434	InnerLoop->getHeader())
1435	<< "Cannot interchange loops due to dependences.";
1436	});
1437	return false;
1438	}
1439	// Check if outer and inner loop contain legal instructions only.
1440	for (auto *BB : OuterLoop->blocks())
1441	for (Instruction &I : BB->instructionsWithoutDebug())
1442	if (CallInst *CI = dyn_cast<CallInst>(Val: &I)) {
1443	// readnone functions do not prevent interchanging.
1444	if (CI->onlyWritesMemory())
1445	continue;
1446	LLVM_DEBUG(
1447	dbgs() << "Loops with call instructions cannot be interchanged "
1448	<< "safely.");
1449	ORE->emit(RemarkBuilder: [&]() {
1450	return OptimizationRemarkMissed (DEBUG_TYPE, "CallInst",
1451	CI->getDebugLoc(),
1452	CI->getParent())
1453	<< "Cannot interchange loops due to call instruction.";
1454	});
1455
1456	return false;
1457	}
1458
1459	if (!findInductions(L: InnerLoop, Inductions&: InnerLoopInductions)) {
1460	LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
1461	return false;
1462	}
1463
1464	if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1465	LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1466	ORE->emit(RemarkBuilder: [&]() {
1467	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1468	InnerLoop->getStartLoc(),
1469	InnerLoop->getHeader())
1470	<< "Cannot interchange loops because unsupported PHI nodes found "
1471	"in inner loop latch.";
1472	});
1473	return false;
1474	}
1475
1476	// TODO: The loops could not be interchanged due to current limitations in the
1477	// transform module.
1478	if (currentLimitations()) {
1479	LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1480	return false;
1481	}
1482
1483	// Check if the loops are tightly nested.
1484	if (!tightlyNested(OuterLoop, InnerLoop)) {
1485	LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1486	ORE->emit(RemarkBuilder: [&]() {
1487	return OptimizationRemarkMissed (DEBUG_TYPE, "NotTightlyNested",
1488	InnerLoop->getStartLoc(),
1489	InnerLoop->getHeader())
1490	<< "Cannot interchange loops because they are not tightly "
1491	"nested.";
1492	});
1493	return false;
1494	}
1495
1496	// The LCSSA PHI for the reduction has passed checks before; its user
1497	// is a store instruction.
1498	PHINode LcssaReduction = nullptr*;
1499	assert(InnerReductions.size() <= `1` &&
1500	"So far we only support at most one reduction.");
1501	if (InnerReductions.size() == `1`)
1502	LcssaReduction = InnerReductions [`0`].LcssaPhi;
1503
1504	if (!areInnerLoopExitPHIsSupported(InnerL: OuterLoop, OuterL: InnerLoop, Reductions&: OuterInnerReductions,
1505	LcssaReduction)) {
1506	LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1507	ORE->emit(RemarkBuilder: [&]() {
1508	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedExitPHI",
1509	InnerLoop->getStartLoc(),
1510	InnerLoop->getHeader())
1511	<< "Found unsupported PHI node in loop exit.";
1512	});
1513	return false;
1514	}
1515
1516	if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1517	LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1518	ORE->emit(RemarkBuilder: [&]() {
1519	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedExitPHI",
1520	OuterLoop->getStartLoc(),
1521	OuterLoop->getHeader())
1522	<< "Found unsupported PHI node in loop exit.";
1523	});
1524	return false;
1525	}
1526
1527	return true;
1528	}
1529
1530	void CacheCostManager::computeIfUnitinialized() {
1531	if (CC.has_value())
1532	return;
1533
1534	LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1535	CC = CacheCost::getCacheCost(Root&: OutermostLoop, AR&: AR, DI&: *DI);
1536	// Obtain the loop vector returned from loop cache analysis beforehand,
1537	// and put each <Loop, index> pair into a map for constant time query
1538	// later. Indices in loop vector reprsent the optimal order of the
1539	// corresponding loop, e.g., given a loopnest with depth N, index 0
1540	// indicates the loop should be placed as the outermost loop and index N
1541	// indicates the loop should be placed as the innermost loop.
1542	//
1543	// For the old pass manager CacheCost would be null.
1544	if (CC != nullptr*)
1545	for (const auto &[Idx, Cost] : enumerate(First: (*CC)->getLoopCosts()))
1546	CostMap [Cost.first] = Idx;
1547	}
1548
1549	CacheCost *CacheCostManager::getCacheCost() {
1550	computeIfUnitinialized();
1551	return CC ->get();
1552	}
1553
1554	const DenseMap<const Loop , unsigned*> &CacheCostManager::getCostMap() {
1555	computeIfUnitinialized();
1556	return CostMap;
1557	}
1558
1559	int LoopInterchangeProfitability::getInstrOrderCost() {
1560	unsigned GoodOrder, BadOrder;
1561	BadOrder = GoodOrder = `0`;
1562	for (BasicBlock *BB : InnerLoop->blocks()) {
1563	for (Instruction &Ins : *BB) {
1564	if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &Ins)) {
1565	bool FoundInnerInduction = false;
1566	bool FoundOuterInduction = false;
1567	for (Value *Op : GEP->operands()) {
1568	// Skip operands that are not SCEV-able.
1569	if (!SE->isSCEVable(Ty: Op->getType()))
1570	continue;
1571
1572	const SCEV *OperandVal = SE->getSCEV(V: Op);
1573	const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: OperandVal);
1574	if (!AR)
1575	continue;
1576
1577	// If we find the inner induction after an outer induction e.g.
1578	// for(int i=0;i<N;i++)
1579	// for(int j=0;j<N;j++)
1580	// A[i][j] = A[i-1][j-1]+k;
1581	// then it is a good order.
1582	if (AR->getLoop() == InnerLoop) {
1583	// We found an InnerLoop induction after OuterLoop induction. It is
1584	// a good order.
1585	FoundInnerInduction = true;
1586	if (FoundOuterInduction) {
1587	GoodOrder++;
1588	break;
1589	}
1590	}
1591	// If we find the outer induction after an inner induction e.g.
1592	// for(int i=0;i<N;i++)
1593	// for(int j=0;j<N;j++)
1594	// A[j][i] = A[j-1][i-1]+k;
1595	// then it is a bad order.
1596	if (AR->getLoop() == OuterLoop) {
1597	// We found an OuterLoop induction after InnerLoop induction. It is
1598	// a bad order.
1599	FoundOuterInduction = true;
1600	if (FoundInnerInduction) {
1601	BadOrder++;
1602	break;
1603	}
1604	}
1605	}
1606	}
1607	}
1608	}
1609	return GoodOrder - BadOrder;
1610	}
1611
1612	std::optional<bool>
1613	LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1614	const DenseMap<const Loop , unsigned> &CostMap, CacheCost CC) {
1615	// This is the new cost model returned from loop cache analysis.
1616	// A smaller index means the loop should be placed an outer loop, and vice
1617	// versa.
1618	auto InnerLoopIt = CostMap.find(Val: InnerLoop);
1619	if (InnerLoopIt == CostMap.end())
1620	return std::nullopt;
1621	auto OuterLoopIt = CostMap.find(Val: OuterLoop);
1622	if (OuterLoopIt == CostMap.end())
1623	return std::nullopt;
1624
1625	if (CC->getLoopCost(L: OuterLoop) == CC->getLoopCost(L: InnerLoop))
1626	return std::nullopt;
1627	unsigned InnerIndex = InnerLoopIt ->second;
1628	unsigned OuterIndex = OuterLoopIt ->second;
1629	LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1630	<< ", OuterIndex = " << OuterIndex << "\n");
1631	assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1632	"numbers to each loop");
1633	return std::optional<bool>(InnerIndex < OuterIndex);
1634	}
1635
1636	std::optional<bool>
1637	LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1638	// Legacy cost model: this is rough cost estimation algorithm. It counts the
1639	// good and bad order of induction variables in the instruction and allows
1640	// reordering if number of bad orders is more than good.
1641	int Cost = getInstrOrderCost();
1642	LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1643	if (Cost < `0` && Cost < LoopInterchangeCostThreshold)
1644	return std::optional<bool>(true);
1645
1646	return std::nullopt;
1647	}
1648
1649	/// Return true if we can vectorize the loop specified by \p LoopId.
1650	static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1651	for (const auto &Dep : DepMatrix) {
1652	char Dir = Dep [LoopId];
1653	char DepType = Dep.back();
1654	assert((DepType == `'<'` \|\| DepType == `'*'`) &&
1655	"Unexpected element in dependency vector");
1656
1657	// There are no loop-carried dependencies.
1658	if (Dir == `'='` \|\| Dir == `'I'`)
1659	continue;
1660
1661	// DepType being '<' means that this direction vector represents a forward
1662	// dependency. In principle, a loop with '<' direction can be vectorized in
1663	// this case.
1664	if (Dir == `'<'` && DepType == `'<'`)
1665	continue;
1666
1667	// We cannot prove that the loop is vectorizable.
1668	return false;
1669	}
1670	return true;
1671	}
1672
1673	std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1674	unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1675	// If the outer loop cannot be vectorized, it is not profitable to move this
1676	// to inner position.
1677	if (!canVectorize(DepMatrix, LoopId: OuterLoopId))
1678	return false;
1679
1680	// If the inner loop cannot be vectorized but the outer loop can be, then it
1681	// is profitable to interchange to enable inner loop parallelism.
1682	if (!canVectorize(DepMatrix, LoopId: InnerLoopId))
1683	return true;
1684
1685	// If both the inner and the outer loop can be vectorized, it is necessary to
1686	// check the cost of each vectorized loop for profitability decision. At this
1687	// time we do not have a cost model to estimate them, so return nullopt.
1688	// TODO: Estimate the cost of vectorized loop when both the outer and the
1689	// inner loop can be vectorized.
1690	return std::nullopt;
1691	}
1692
1693	bool LoopInterchangeProfitability::isProfitable(
1694	const Loop InnerLoop, const* Loop OuterLoop, unsigned* InnerLoopId,
1695	unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1696	// Do not consider loops with a backedge that isn't taken, e.g. an
1697	// unconditional branch true/false, as candidates for interchange.
1698	// TODO: when interchange is forced, we should probably also allow
1699	// interchange for these loops, and thus this logic should be moved just
1700	// below the cost-model ignore check below. But this check is done first
1701	// to avoid the issue in #163954.
1702	const SCEV *InnerBTC = SE->getBackedgeTakenCount(L: InnerLoop);
1703	const SCEV *OuterBTC = SE->getBackedgeTakenCount(L: OuterLoop);
1704	if (InnerBTC && InnerBTC->isZero()) {
1705	LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1706	"single iteration loop\n");
1707	return false;
1708	}
1709	if (OuterBTC && OuterBTC->isZero()) {
1710	LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1711	"single iteration loop\n");
1712	return false;
1713	}
1714
1715	// Return true if interchange is forced and the cost-model ignored.
1716	if (Profitabilities.size() == `1` && Profitabilities [`0`] == RuleTy::Ignore)
1717	return true;
1718	assert(noDuplicateRulesAndIgnore(Profitabilities) &&
1719	"Duplicate rules and option 'ignore' are not allowed");
1720
1721	// isProfitable() is structured to avoid endless loop interchange. If the
1722	// highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1723	// decide the profitability then, profitability check will stop and return the
1724	// analysis result. If it failed to determine it (e.g., cache analysis failed
1725	// to analyze the loopnest due to delinearization issues) then go ahead the
1726	// second highest priority rule (isProfitablePerInstrOrderCost by default).
1727	// Likewise, if it failed to analysis the profitability then only, the last
1728	// rule (isProfitableForVectorization by default) will decide.
1729	std::optional<bool> shouldInterchange;
1730	for (RuleTy RT : Profitabilities) {
1731	switch (RT) {
1732	case RuleTy::PerLoopCacheAnalysis: {
1733	CacheCost *CC = CCM.getCacheCost();
1734	const DenseMap<const Loop , unsigned*> &CostMap = CCM.getCostMap();
1735	shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1736	break;
1737	}
1738	case RuleTy::PerInstrOrderCost:
1739	shouldInterchange = isProfitablePerInstrOrderCost();
1740	break;
1741	case RuleTy::ForVectorization:
1742	shouldInterchange =
1743	isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1744	break;
1745	case RuleTy::Ignore:
1746	llvm_unreachable("Option 'ignore' is not supported with other options");
1747	break;
1748	}
1749
1750	// If this rule could determine the profitability, don't call subsequent
1751	// rules.
1752	if (shouldInterchange.has_value())
1753	break;
1754	}
1755
1756	if (!shouldInterchange.has_value()) {
1757	ORE->emit(RemarkBuilder: [&]() {
1758	return OptimizationRemarkMissed (DEBUG_TYPE, "InterchangeNotProfitable",
1759	InnerLoop->getStartLoc(),
1760	InnerLoop->getHeader())
1761	<< "Insufficient information to calculate the cost of loop for "
1762	"interchange.";
1763	});
1764	return false;
1765	} else if (!shouldInterchange.value()) {
1766	ORE->emit(RemarkBuilder: [&]() {
1767	return OptimizationRemarkMissed (DEBUG_TYPE, "InterchangeNotProfitable",
1768	InnerLoop->getStartLoc(),
1769	InnerLoop->getHeader())
1770	<< "Interchanging loops is not considered to improve cache "
1771	"locality nor vectorization.";
1772	});
1773	return false;
1774	}
1775	return true;
1776	}
1777
1778	void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1779	Loop *InnerLoop) {
1780	for (Loop L : OuterLoop)
1781	if (L == InnerLoop) {
1782	OuterLoop->removeChildLoop(Child: L);
1783	return;
1784	}
1785	llvm_unreachable("Couldn't find loop");
1786	}
1787
1788	/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1789	/// new inner and outer loop after interchanging: NewInner is the original
1790	/// outer loop and NewOuter is the original inner loop.
1791	///
1792	/// Before interchanging, we have the following structure
1793	/// Outer preheader
1794	// Outer header
1795	// Inner preheader
1796	// Inner header
1797	// Inner body
1798	// Inner latch
1799	// outer bbs
1800	// Outer latch
1801	//
1802	// After interchanging:
1803	// Inner preheader
1804	// Inner header
1805	// Outer preheader
1806	// Outer header
1807	// Inner body
1808	// outer bbs
1809	// Outer latch
1810	// Inner latch
1811	void LoopInterchangeTransform::restructureLoops(
1812	Loop NewInner, Loop NewOuter, BasicBlock *OrigInnerPreHeader,
1813	BasicBlock *OrigOuterPreHeader) {
1814	Loop *OuterLoopParent = OuterLoop->getParentLoop();
1815	// The original inner loop preheader moves from the new inner loop to
1816	// the parent loop, if there is one.
1817	NewInner->removeBlockFromLoop(BB: OrigInnerPreHeader);
1818	LI->changeLoopFor(BB: OrigInnerPreHeader, L: OuterLoopParent);
1819
1820	// Switch the loop levels.
1821	if (OuterLoopParent) {
1822	// Remove the loop from its parent loop.
1823	removeChildLoop(OuterLoop: OuterLoopParent, InnerLoop: NewInner);
1824	removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter);
1825	OuterLoopParent->addChildLoop(NewChild: NewOuter);
1826	} else {
1827	removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter);
1828	LI->changeTopLevelLoop(OldLoop: NewInner, NewLoop: NewOuter);
1829	}
1830	while (!NewOuter->isInnermost())
1831	NewInner->addChildLoop(NewChild: NewOuter->removeChildLoop(I: NewOuter->begin()));
1832	NewOuter->addChildLoop(NewChild: NewInner);
1833
1834	// BBs from the original inner loop.
1835	SmallVector<BasicBlock *, `8`> OrigInnerBBs(NewOuter->blocks());
1836
1837	// Add BBs from the original outer loop to the original inner loop (excluding
1838	// BBs already in inner loop)
1839	for (BasicBlock *BB : NewInner->blocks())
1840	if (LI->getLoopFor(BB) == NewInner)
1841	NewOuter->addBlockEntry(BB);
1842
1843	// Now remove inner loop header and latch from the new inner loop and move
1844	// other BBs (the loop body) to the new inner loop.
1845	BasicBlock *OuterHeader = NewOuter->getHeader();
1846	BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1847	for (BasicBlock *BB : OrigInnerBBs) {
1848	// Nothing will change for BBs in child loops.
1849	if (LI->getLoopFor(BB) != NewOuter)
1850	continue;
1851	// Remove the new outer loop header and latch from the new inner loop.
1852	if (BB == OuterHeader \|\| BB == OuterLatch)
1853	NewInner->removeBlockFromLoop(BB);
1854	else
1855	LI->changeLoopFor(BB, L: NewInner);
1856	}
1857
1858	// The preheader of the original outer loop becomes part of the new
1859	// outer loop.
1860	NewOuter->addBlockEntry(BB: OrigOuterPreHeader);
1861	LI->changeLoopFor(BB: OrigOuterPreHeader, L: NewOuter);
1862
1863	// Tell SE that we move the loops around.
1864	SE->forgetLoop(L: NewOuter);
1865	}
1866
1867	/// User can write, or optimizers can generate the reduction for inner loop.
1868	/// To make the interchange valid, apply Reduction2Mem by moving the
1869	/// initializer and store instructions into the inner loop. So far we only
1870	/// handle cases where the reduction variable is initialized to a constant.
1871	/// For example, below code:
1872	///
1873	/// loop:
1874	/// re = phi<0.0, next>
1875	/// next = re op ...
1876	/// endloop
1877	/// reduc_sum = phi<next> // lcssa phi
1878	/// MEM_REF[idx] = reduc_sum // LcssaStore
1879	///
1880	/// is transformed into:
1881	///
1882	/// loop:
1883	/// tmp = MEM_REF[idx];
1884	/// new_var = !first_iteration ? tmp : 0.0;
1885	/// next = new_var op ...
1886	/// MEM_REF[idx] = next; // after moving
1887	/// endloop
1888	///
1889	/// In this way the initial const is used in the first iteration of loop.
1890	void LoopInterchangeTransform::reduction2Memory() {
1891	ArrayRef<LoopInterchangeLegality::InnerReduction> InnerReductions =
1892	LIL.getInnerReductions();
1893
1894	assert(InnerReductions.size() == `1` &&
1895	"So far we only support at most one reduction.");
1896
1897	LoopInterchangeLegality::InnerReduction SR = InnerReductions [`0`];
1898	BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1899	IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
1900
1901	// Check if it's the first iteration.
1902	LLVMContext &Context = InnerLoopHeader->getContext();
1903	PHINode *FirstIter =
1904	Builder.CreatePHI(Ty: Type::getInt1Ty(C&: Context), NumReservedValues: `2`, Name: "first.iter");
1905	FirstIter->addIncoming(V: ConstantInt::get(Ty: Type::getInt1Ty(C&: Context), V: `1`),
1906	BB: InnerLoop->getLoopPreheader());
1907	FirstIter->addIncoming(V: ConstantInt::get(Ty: Type::getInt1Ty(C&: Context), V: `0`),
1908	BB: InnerLoop->getLoopLatch());
1909	assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
1910
1911	// When the reduction is initialized from a constant value, we need to add
1912	// a stmt loading from the memory object to target basic block in inner
1913	// loop.
1914	Instruction *LoadMem = Builder.CreateLoad(Ty: SR.ElemTy, Ptr: SR.MemRef);
1915
1916	// Init new_var to MEM_REF or CONST depending on if it is the first iteration.
1917	Value *NewVar = Builder.CreateSelect(C: FirstIter, True: SR.Init, False: LoadMem, Name: "new.var");
1918
1919	// Replace all uses of the reduction variable with a new variable.
1920	SR.Reduction->replaceAllUsesWith(V: NewVar);
1921
1922	// Move store instruction into inner loop, just after reduction next's
1923	// definition.
1924	SR.LcssaStore->setOperand(i_nocapture: `0`, Val_nocapture: SR.Next);
1925	SR.LcssaStore->moveAfter(MovePos: dyn_cast<Instruction>(Val: SR.Next));
1926	}
1927
1928	bool LoopInterchangeTransform::transform(
1929	ArrayRef<Instruction *> DropNoWrapInsts) {
1930	bool Transformed = false;
1931
1932	ArrayRef<LoopInterchangeLegality::InnerReduction> InnerReductions =
1933	LIL.getInnerReductions();
1934	if (InnerReductions.size() == `1`)
1935	reduction2Memory();
1936
1937	if (InnerLoop->getSubLoops().empty()) {
1938	BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1939	LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1940	auto &InductionPHIs = LIL.getInnerLoopInductions();
1941	if (InductionPHIs.empty()) {
1942	LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1943	return false;
1944	}
1945
1946	SmallVector<Instruction *, `8`> InnerIndexVarList;
1947	for (PHINode *CurInductionPHI : InductionPHIs) {
1948	if (CurInductionPHI->getIncomingBlock(i: `0`) == InnerLoopPreHeader)
1949	InnerIndexVarList.push_back(
1950	Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: `1`)));
1951	else
1952	InnerIndexVarList.push_back(
1953	Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: `0`)));
1954	}
1955
1956	// Create a new latch block for the inner loop. We split at the
1957	// current latch's terminator and then move the condition and all
1958	// operands that are not either loop-invariant or the induction PHI into the
1959	// new latch block.
1960	BasicBlock *NewLatch =
1961	SplitBlock(Old: InnerLoop->getLoopLatch(),
1962	SplitPt: InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1963
1964	SmallSetVector<Instruction *, `4`> WorkList;
1965	unsigned i = `0`;
1966	auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1967	for (; i < WorkList.size(); i++) {
1968	// Duplicate instruction and move it the new latch. Update uses that
1969	// have been moved.
1970	Instruction *NewI = WorkList [i]->clone();
1971	NewI->insertBefore(InsertPos: NewLatch->getFirstNonPHIIt());
1972	assert(!NewI->mayHaveSideEffects() &&
1973	"Moving instructions with side-effects may change behavior of "
1974	"the loop nest!");
1975	for (Use &U : llvm::make_early_inc_range(Range: WorkList [i]->uses())) {
1976	Instruction *UserI = cast<Instruction>(Val: U.getUser());
1977	if (!InnerLoop->contains(BB: UserI->getParent()) \|\|
1978	UserI->getParent() == NewLatch \|\|
1979	llvm::is_contained(Range: InductionPHIs, Element: UserI))
1980	U.set(NewI);
1981	}
1982	// Add operands of moved instruction to the worklist, except if they are
1983	// outside the inner loop or are the induction PHI.
1984	for (Value *Op : WorkList [i]->operands()) {
1985	Instruction *OpI = dyn_cast<Instruction>(Val: Op);
1986	if (!OpI \|\|
1987	this->LI->getLoopFor(BB: OpI->getParent()) != this->InnerLoop \|\|
1988	llvm::is_contained(Range: InductionPHIs, Element: OpI))
1989	continue;
1990	WorkList.insert(X: OpI);
1991	}
1992	}
1993	};
1994
1995	// FIXME: Should we interchange when we have a constant condition?
1996	Instruction *CondI = dyn_cast<Instruction>(
1997	Val: cast<CondBrInst>(Val: InnerLoop->getLoopLatch()->getTerminator())
1998	->getCondition());
1999	if (CondI)
2000	WorkList.insert(X: CondI);
2001	MoveInstructions ();
2002	for (Instruction *InnerIndexVar : InnerIndexVarList)
2003	WorkList.insert(X: cast<Instruction>(Val: InnerIndexVar));
2004	MoveInstructions ();
2005	}
2006
2007	// Ensure the inner loop phi nodes have a separate basic block.
2008	BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2009	if (&*InnerLoopHeader->getFirstNonPHIIt() !=
2010	InnerLoopHeader->getTerminator()) {
2011	SplitBlock(Old: InnerLoopHeader, SplitPt: InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
2012	LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
2013	}
2014
2015	// Instructions in the original inner loop preheader may depend on values
2016	// defined in the outer loop header. Move them there, because the original
2017	// inner loop preheader will become the entry into the interchanged loop nest.
2018	// Currently we move all instructions and rely on LICM to move invariant
2019	// instructions outside the loop nest.
2020	BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2021	BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2022	if (InnerLoopPreHeader != OuterLoopHeader) {
2023	for (Instruction &I :
2024	make_early_inc_range(Range: make_range(x: InnerLoopPreHeader->begin(),
2025	y: std::prev(x: InnerLoopPreHeader->end()))))
2026	I.moveBeforePreserving(MovePos: OuterLoopHeader->getTerminator()->getIterator());
2027	}
2028
2029	Transformed \|= adjustLoopLinks();
2030	if (!Transformed) {
2031	LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
2032	return false;
2033	}
2034
2035	// Finally, drop the nsw/nuw flags from the instructions for reduction
2036	// calculations.
2037	for (Instruction *Reduction : DropNoWrapInsts) {
2038	Reduction->setHasNoSignedWrap(false);
2039	Reduction->setHasNoUnsignedWrap(false);
2040	}
2041
2042	return true;
2043	}
2044
2045	/// \brief Move all instructions except the terminator from FromBB right before
2046	/// InsertBefore
2047	static void moveBBContents(BasicBlock FromBB, Instruction InsertBefore) {
2048	BasicBlock *ToBB = InsertBefore->getParent();
2049
2050	ToBB->splice(ToIt: InsertBefore->getIterator(), FromBB, FromBeginIt: FromBB->begin(),
2051	FromEndIt: FromBB->getTerminator()->getIterator());
2052	}
2053
2054	/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
2055	static void swapBBContents(BasicBlock BB1, BasicBlock BB2) {
2056	// Save all non-terminator instructions of BB1 into TempInstrs and unlink them
2057	// from BB1 afterwards.
2058	auto Iter = map_range(C&: BB1, F: [](Instruction &I) { return* &I; });
2059	SmallVector<Instruction *, `4`> TempInstrs(Iter.begin(), std::prev(x: Iter.end()));
2060	for (Instruction *I : TempInstrs)
2061	I->removeFromParent();
2062
2063	// Move instructions from BB2 to BB1.
2064	moveBBContents(FromBB: BB2, InsertBefore: BB1->getTerminator());
2065
2066	// Move instructions from TempInstrs to BB2.
2067	for (Instruction *I : TempInstrs)
2068	I->insertBefore(InsertPos: BB2->getTerminator()->getIterator());
2069	}
2070
2071	// Update BI to jump to NewBB instead of OldBB. Records updates to the
2072	// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
2073	// \p OldBB is exactly once in BI's successor list.
2074	static void updateSuccessor(Instruction Term, BasicBlock OldBB,
2075	BasicBlock *NewBB,
2076	std::vector<DominatorTree::UpdateType> &DTUpdates,
2077	bool MustUpdateOnce = true) {
2078	assert((!MustUpdateOnce \|\| llvm::count(successors(Term), OldBB) == `1`) &&
2079	"BI must jump to OldBB exactly once.");
2080	bool Changed = false;
2081	for (Use &Op : Term->operands())
2082	if (Op == OldBB) {
2083	Op.set(NewBB);
2084	Changed = true;
2085	}
2086
2087	if (Changed) {
2088	DTUpdates.push_back(
2089	x: {DominatorTree::UpdateKind::Insert, Term->getParent(), NewBB});
2090	DTUpdates.push_back(
2091	x: {DominatorTree::UpdateKind::Delete, Term->getParent(), OldBB});
2092	}
2093	assert(Changed && "Expected a successor to be updated");
2094	}
2095
2096	// Move Lcssa PHIs to the right place.
2097	static void moveLCSSAPhis(BasicBlock InnerExit, BasicBlock InnerHeader,
2098	BasicBlock InnerLatch, BasicBlock OuterHeader,
2099	BasicBlock OuterLatch, BasicBlock OuterExit,
2100	Loop InnerLoop, LoopInfo LI) {
2101
2102	// Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
2103	// defined either in the header or latch. Those blocks will become header and
2104	// latch of the new outer loop, and the only possible users can PHI nodes
2105	// in the exit block of the loop nest or the outer loop header (reduction
2106	// PHIs, in that case, the incoming value must be defined in the inner loop
2107	// header). We can just substitute the user with the incoming value and remove
2108	// the PHI.
2109	for (PHINode &P : make_early_inc_range(Range: InnerExit->phis())) {
2110	assert(P.getNumIncomingValues() == `1` &&
2111	"Only loops with a single exit are supported!");
2112
2113	// Incoming values are guaranteed be instructions currently.
2114	auto IncI = cast<Instruction>(Val: P.getIncomingValueForBlock(BB: InnerLatch));
2115	// In case of multi-level nested loops, follow LCSSA to find the incoming
2116	// value defined from the innermost loop.
2117	auto IncIInnerMost = cast<Instruction>(Val: followLCSSA(SV: IncI));
2118	// Skip phis with incoming values from the inner loop body, excluding the
2119	// header and latch.
2120	if (IncIInnerMost->getParent() != InnerLatch &&
2121	IncIInnerMost->getParent() != InnerHeader)
2122	continue;
2123
2124	assert(all_of(P.users(),
2125	[OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
2126	return (cast<PHINode>(U)->getParent() == OuterHeader &&
2127	IncI->getParent() == InnerHeader) \|\|
2128	cast<PHINode>(U)->getParent() == OuterExit;
2129	}) &&
2130	"Can only replace phis iff the uses are in the loop nest exit or "
2131	"the incoming value is defined in the inner header (it will "
2132	"dominate all loop blocks after interchanging)");
2133	P.replaceAllUsesWith(V: IncI);
2134	P.eraseFromParent();
2135	}
2136
2137	SmallVector<PHINode *, `8`> LcssaInnerExit(
2138	llvm::make_pointer_range(Range: InnerExit->phis()));
2139
2140	SmallVector<PHINode *, `8`> LcssaInnerLatch(
2141	llvm::make_pointer_range(Range: InnerLatch->phis()));
2142
2143	// Lcssa PHIs for values used outside the inner loop are in InnerExit.
2144	// If a PHI node has users outside of InnerExit, it has a use outside the
2145	// interchanged loop and we have to preserve it. We move these to
2146	// InnerLatch, which will become the new exit block for the innermost
2147	// loop after interchanging.
2148	for (PHINode *P : LcssaInnerExit)
2149	P->moveBefore(InsertPos: InnerLatch->getFirstNonPHIIt());
2150
2151	// If the inner loop latch contains LCSSA PHIs, those come from a child loop
2152	// and we have to move them to the new inner latch.
2153	for (PHINode *P : LcssaInnerLatch)
2154	P->moveBefore(InsertPos: InnerExit->getFirstNonPHIIt());
2155
2156	// Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
2157	// incoming values defined in the outer loop, we have to add a new PHI
2158	// in the inner loop latch, which became the exit block of the outer loop,
2159	// after interchanging.
2160	if (OuterExit) {
2161	for (PHINode &P : OuterExit->phis()) {
2162	if (P.getNumIncomingValues() != `1`)
2163	continue;
2164	// Skip Phis with incoming values defined in the inner loop. Those should
2165	// already have been updated.
2166	auto I = dyn_cast<Instruction>(Val: P.getIncomingValue(i: `0`));
2167	if (!I \|\| LI->getLoopFor(BB: I->getParent()) == InnerLoop)
2168	continue;
2169
2170	PHINode *NewPhi = dyn_cast<PHINode>(Val: P.clone());
2171	NewPhi->setIncomingValue(i: `0`, V: P.getIncomingValue(i: `0`));
2172	NewPhi->setIncomingBlock(i: `0`, BB: OuterLatch);
2173	// We might have incoming edges from other BBs, i.e., the original outer
2174	// header.
2175	for (auto *Pred : predecessors(BB: InnerLatch)) {
2176	if (Pred == OuterLatch)
2177	continue;
2178	NewPhi->addIncoming(V: P.getIncomingValue(i: `0`), BB: Pred);
2179	}
2180	NewPhi->insertBefore(InsertPos: InnerLatch->getFirstNonPHIIt());
2181	P.setIncomingValue(i: `0`, V: NewPhi);
2182	}
2183	}
2184
2185	// Now adjust the incoming blocks for the LCSSA PHIs.
2186	// For PHIs moved from Inner's exit block, we need to replace Inner's latch
2187	// with the new latch.
2188	InnerLatch->replacePhiUsesWith(Old: InnerLatch, New: OuterLatch);
2189	}
2190
2191	/// This deals with a corner case when a LCSSA phi node appears in a non-exit
2192	/// block: the outer loop latch block does not need to be exit block of the
2193	/// inner loop. Consider a loop that was in LCSSA form, but then some
2194	/// transformation like loop-unswitch comes along and creates an empty block,
2195	/// where BB5 in this example is the outer loop latch block:
2196	///
2197	/// BB4:
2198	/// br label %BB5
2199	/// BB5:
2200	/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
2201	/// br outer.header
2202	///
2203	/// Interchange then brings it in LCSSA form again resulting in this chain of
2204	/// single-input phi nodes:
2205	///
2206	/// BB4:
2207	/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
2208	/// br label %BB5
2209	/// BB5:
2210	/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
2211	///
2212	/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
2213	/// use before the def if we don't check this. The solution is to simplify
2214	/// lcssa phi nodes (remove) if they appear in non-exit blocks.
2215	///
2216	static void simplifyLCSSAPhis(Loop OuterLoop, Loop InnerLoop) {
2217	BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
2218	BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2219
2220	// Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
2221	if (OuterLoopLatch == InnerLoopExit)
2222	return;
2223
2224	// Collect and remove phis in non-exit blocks if they have 1 input.
2225	SmallVector<PHINode *, `8`> Phis(
2226	llvm::make_pointer_range(Range: OuterLoopLatch->phis()));
2227	for (PHINode *Phi : Phis) {
2228	assert(Phi->getNumIncomingValues() == `1` && "Single input phi expected");
2229	LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
2230	<< "\n");
2231	Phi->replaceAllUsesWith(V: Phi->getIncomingValue(i: `0`));
2232	Phi->eraseFromParent();
2233	}
2234	}
2235
2236	bool LoopInterchangeTransform::adjustLoopBranches() {
2237	LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
2238	std::vector<DominatorTree::UpdateType> DTUpdates;
2239
2240	BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2241	BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2242
2243	assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
2244	InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
2245	InnerLoopPreHeader && "Guaranteed by loop-simplify form");
2246
2247	simplifyLCSSAPhis(OuterLoop, InnerLoop);
2248
2249	// Ensure that both preheaders do not contain PHI nodes and have single
2250	// predecessors. This allows us to move them easily. We use
2251	// InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
2252	// preheaders do not satisfy those conditions.
2253	if (isa<PHINode>(Val: OuterLoopPreHeader->begin()) \|\|
2254	!OuterLoopPreHeader->getUniquePredecessor())
2255	OuterLoopPreHeader =
2256	InsertPreheaderForLoop(L: OuterLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true);
2257	if (InnerLoopPreHeader == OuterLoop->getHeader())
2258	InnerLoopPreHeader =
2259	InsertPreheaderForLoop(L: InnerLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true);
2260
2261	// Adjust the loop preheader
2262	BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2263	BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2264	BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
2265	BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2266	BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
2267	BasicBlock *InnerLoopLatchPredecessor =
2268	InnerLoopLatch->getUniquePredecessor();
2269	BasicBlock *InnerLoopLatchSuccessor;
2270	BasicBlock *OuterLoopLatchSuccessor;
2271
2272	CondBrInst *OuterLoopLatchBI =
2273	dyn_cast<CondBrInst>(Val: OuterLoopLatch->getTerminator());
2274	CondBrInst *InnerLoopLatchBI =
2275	dyn_cast<CondBrInst>(Val: InnerLoopLatch->getTerminator());
2276	Instruction *OuterLoopHeaderBI = OuterLoopHeader->getTerminator();
2277	Instruction *InnerLoopHeaderBI = InnerLoopHeader->getTerminator();
2278
2279	if (!OuterLoopPredecessor \|\| !InnerLoopLatchPredecessor \|\|
2280	!OuterLoopLatchBI \|\| !InnerLoopLatchBI \|\| !OuterLoopHeaderBI \|\|
2281	!InnerLoopHeaderBI)
2282	return false;
2283
2284	Instruction *InnerLoopLatchPredecessorBI =
2285	InnerLoopLatchPredecessor->getTerminator();
2286	Instruction *OuterLoopPredecessorBI = OuterLoopPredecessor->getTerminator();
2287
2288	if (!OuterLoopPredecessorBI \|\| !InnerLoopLatchPredecessorBI)
2289	return false;
2290	BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
2291	if (!InnerLoopHeaderSuccessor)
2292	return false;
2293
2294	// Adjust Loop Preheader and headers.
2295	// The branches in the outer loop predecessor and the outer loop header can
2296	// be unconditional branches or conditional branches with duplicates. Consider
2297	// this when updating the successors.
2298	updateSuccessor(Term: OuterLoopPredecessorBI, OldBB: OuterLoopPreHeader,
2299	NewBB: InnerLoopPreHeader, DTUpdates, /MustUpdateOnce=/false);
2300	// The outer loop header might or might not branch to the outer latch.
2301	// We are guaranteed to branch to the inner loop preheader.
2302	if (llvm::is_contained(Range: successors(I: OuterLoopHeaderBI), Element: OuterLoopLatch)) {
2303	// In this case the outerLoopHeader should branch to the InnerLoopLatch.
2304	updateSuccessor(Term: OuterLoopHeaderBI, OldBB: OuterLoopLatch, NewBB: InnerLoopLatch,
2305	DTUpdates,
2306	/MustUpdateOnce=/false);
2307	}
2308	updateSuccessor(Term: OuterLoopHeaderBI, OldBB: InnerLoopPreHeader,
2309	NewBB: InnerLoopHeaderSuccessor, DTUpdates,
2310	/MustUpdateOnce=/false);
2311
2312	// Adjust reduction PHI's now that the incoming block has changed.
2313	InnerLoopHeaderSuccessor->replacePhiUsesWith(Old: InnerLoopHeader,
2314	New: OuterLoopHeader);
2315
2316	updateSuccessor(Term: InnerLoopHeaderBI, OldBB: InnerLoopHeaderSuccessor,
2317	NewBB: OuterLoopPreHeader, DTUpdates);
2318
2319	// -------------Adjust loop latches-----------
2320	if (InnerLoopLatchBI->getSuccessor(i: `0`) == InnerLoopHeader)
2321	InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: `1`);
2322	else
2323	InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: `0`);
2324
2325	updateSuccessor(Term: InnerLoopLatchPredecessorBI, OldBB: InnerLoopLatch,
2326	NewBB: InnerLoopLatchSuccessor, DTUpdates);
2327
2328	if (OuterLoopLatchBI->getSuccessor(i: `0`) == OuterLoopHeader)
2329	OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: `1`);
2330	else
2331	OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: `0`);
2332
2333	updateSuccessor(Term: InnerLoopLatchBI, OldBB: InnerLoopLatchSuccessor,
2334	NewBB: OuterLoopLatchSuccessor, DTUpdates);
2335	updateSuccessor(Term: OuterLoopLatchBI, OldBB: OuterLoopLatchSuccessor, NewBB: InnerLoopLatch,
2336	DTUpdates);
2337
2338	DT->applyUpdates(Updates: DTUpdates);
2339	restructureLoops(NewInner: OuterLoop, NewOuter: InnerLoop, OrigInnerPreHeader: InnerLoopPreHeader,
2340	OrigOuterPreHeader: OuterLoopPreHeader);
2341
2342	moveLCSSAPhis(InnerExit: InnerLoopLatchSuccessor, InnerHeader: InnerLoopHeader, InnerLatch: InnerLoopLatch,
2343	OuterHeader: OuterLoopHeader, OuterLatch: OuterLoopLatch, OuterExit: InnerLoop->getExitBlock(),
2344	InnerLoop, LI);
2345	// For PHIs in the exit block of the outer loop, outer's latch has been
2346	// replaced by Inners'.
2347	OuterLoopLatchSuccessor->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch);
2348
2349	auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2350	// Now update the reduction PHIs in the inner and outer loop headers.
2351	SmallVector<PHINode *, `4`> InnerLoopPHIs, OuterLoopPHIs;
2352	for (PHINode &PHI : InnerLoopHeader->phis())
2353	if (OuterInnerReductions.contains(Ptr: &PHI))
2354	InnerLoopPHIs.push_back(Elt: &PHI);
2355
2356	for (PHINode &PHI : OuterLoopHeader->phis())
2357	if (OuterInnerReductions.contains(Ptr: &PHI))
2358	OuterLoopPHIs.push_back(Elt: &PHI);
2359
2360	// Now move the remaining reduction PHIs from outer to inner loop header and
2361	// vice versa. The PHI nodes must be part of a reduction across the inner and
2362	// outer loop and all the remains to do is and updating the incoming blocks.
2363	for (PHINode *PHI : OuterLoopPHIs) {
2364	LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2365	PHI->moveBefore(InsertPos: InnerLoopHeader->getFirstNonPHIIt());
2366	assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2367	}
2368	for (PHINode *PHI : InnerLoopPHIs) {
2369	LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2370	PHI->moveBefore(InsertPos: OuterLoopHeader->getFirstNonPHIIt());
2371	assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2372	}
2373
2374	// Update the incoming blocks for moved PHI nodes.
2375	OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopPreHeader, New: OuterLoopPreHeader);
2376	OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopLatch, New: OuterLoopLatch);
2377	InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopPreHeader, New: InnerLoopPreHeader);
2378	InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch);
2379
2380	// Values defined in the outer loop header could be used in the inner loop
2381	// latch. In that case, we need to create LCSSA phis for them, because after
2382	// interchanging they will be defined in the new inner loop and used in the
2383	// new outer loop.
2384	SmallVector<Instruction *, `4`> MayNeedLCSSAPhis;
2385	for (Instruction &I :
2386	make_range(x: OuterLoopHeader->begin(), y: std::prev(x: OuterLoopHeader->end())))
2387	MayNeedLCSSAPhis.push_back(Elt: &I);
2388	formLCSSAForInstructions(Worklist&: MayNeedLCSSAPhis, DT: DT, LI: LI, SE);
2389
2390	return true;
2391	}
2392
2393	bool LoopInterchangeTransform::adjustLoopLinks() {
2394	// Adjust all branches in the inner and outer loop.
2395	bool Changed = adjustLoopBranches();
2396	if (Changed) {
2397	// We have interchanged the preheaders so we need to interchange the data in
2398	// the preheaders as well. This is because the content of the inner
2399	// preheader was previously executed inside the outer loop.
2400	BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2401	BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2402	swapBBContents(BB1: OuterLoopPreHeader, BB2: InnerLoopPreHeader);
2403	}
2404	return Changed;
2405	}
2406
2407	PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
2408	LoopAnalysisManager &AM,
2409	LoopStandardAnalysisResults &AR,
2410	LPMUpdater &U) {
2411	Function &F = *LN.getParent();
2412	SmallVector<Loop *, `8`> LoopList(LN.getLoops());
2413
2414	if (MaxMemInstrCount < `1`) {
2415	LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
2416	return PreservedAnalyses::all();
2417	}
2418	OptimizationRemarkEmitter ORE(&F);
2419
2420	// Ensure minimum depth of the loop nest to do the interchange.
2421	if (!hasSupportedLoopDepth(LoopList, ORE))
2422	return PreservedAnalyses::all();
2423	// Ensure computable loop nest.
2424	if (!isComputableLoopNest(SE: &AR.SE, LoopList)) {
2425	LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2426	return PreservedAnalyses::all();
2427	}
2428
2429	ORE.emit(RemarkBuilder: [&]() {
2430	return OptimizationRemarkAnalysis (DEBUG_TYPE, "Dependence",
2431	LN.getOutermostLoop().getStartLoc(),
2432	LN.getOutermostLoop().getHeader())
2433	<< "Computed dependence info, invoking the transform.";
2434	});
2435
2436	DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2437	if (!LoopInterchange (&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2438	return PreservedAnalyses::all();
2439	U.markLoopNestChanged(Changed: true);
2440	return getLoopPassPreservedAnalyses();
2441	}
2442

Browse the source code of llvm_projects/llvm/lib/Transforms/Scalar/LoopInterchange.cpp