LoopInterchange.cpp source code [llvm_projects/llvm/lib/Transforms/Scalar/LoopInterchange.cpp]

1	//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This Pass handles loop interchange transform.
10	// This pass interchanges loops to provide a more cache-friendly memory access
11	// patterns.
12	//
13	//===----------------------------------------------------------------------===//
14
15	#include "llvm/Transforms/Scalar/LoopInterchange.h"
16	#include "llvm/ADT/STLExtras.h"
17	#include "llvm/ADT/SmallSet.h"
18	#include "llvm/ADT/SmallVector.h"
19	#include "llvm/ADT/Statistic.h"
20	#include "llvm/ADT/StringRef.h"
21	#include "llvm/ADT/StringSet.h"
22	#include "llvm/Analysis/DependenceAnalysis.h"
23	#include "llvm/Analysis/LoopCacheAnalysis.h"
24	#include "llvm/Analysis/LoopInfo.h"
25	#include "llvm/Analysis/LoopNestAnalysis.h"
26	#include "llvm/Analysis/LoopPass.h"
27	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
28	#include "llvm/Analysis/ScalarEvolution.h"
29	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
30	#include "llvm/IR/BasicBlock.h"
31	#include "llvm/IR/DiagnosticInfo.h"
32	#include "llvm/IR/Dominators.h"
33	#include "llvm/IR/Function.h"
34	#include "llvm/IR/InstrTypes.h"
35	#include "llvm/IR/Instruction.h"
36	#include "llvm/IR/Instructions.h"
37	#include "llvm/IR/User.h"
38	#include "llvm/IR/Value.h"
39	#include "llvm/Support/Casting.h"
40	#include "llvm/Support/CommandLine.h"
41	#include "llvm/Support/Debug.h"
42	#include "llvm/Support/ErrorHandling.h"
43	#include "llvm/Support/raw_ostream.h"
44	#include "llvm/Transforms/Scalar/LoopPassManager.h"
45	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
46	#include "llvm/Transforms/Utils/LoopUtils.h"
47	#include <cassert>
48	#include <utility>
49	#include <vector>
50
51	using namespace llvm;
52
53	#define DEBUG_TYPE "loop-interchange"
54
55	STATISTIC(LoopsInterchanged, "Number of loops interchanged");
56
57	static cl::opt<int> LoopInterchangeCostThreshold(
58	"loop-interchange-threshold", cl::init(Val: `0`), cl::Hidden,
59	cl::desc ("Interchange if you gain more than this number"));
60
61	// Maximum number of load-stores that can be handled in the dependency matrix.
62	static cl::opt<unsigned int> MaxMemInstrCount(
63	"loop-interchange-max-meminstr-count", cl::init(Val: `64`), cl::Hidden,
64	cl::desc (
65	"Maximum number of load-store instructions that should be handled "
66	"in the dependency matrix. Higher value may lead to more interchanges "
67	"at the cost of compile-time"));
68
69	namespace {
70
71	using LoopVector = SmallVector<Loop *, `8`>;
72
73	// TODO: Check if we can use a sparse matrix here.
74	using CharMatrix = std::vector<std::vector<char>>;
75
76	/// Types of rules used in profitability check.
77	enum class RuleTy {
78	PerLoopCacheAnalysis,
79	PerInstrOrderCost,
80	ForVectorization,
81	};
82
83	} // end anonymous namespace
84
85	// Minimum loop depth supported.
86	static cl::opt<unsigned int> MinLoopNestDepth(
87	"loop-interchange-min-loop-nest-depth", cl::init(Val: `2`), cl::Hidden,
88	cl::desc ("Minimum depth of loop nest considered for the transform"));
89
90	// Maximum loop depth supported.
91	static cl::opt<unsigned int> MaxLoopNestDepth(
92	"loop-interchange-max-loop-nest-depth", cl::init(Val: `10`), cl::Hidden,
93	cl::desc ("Maximum depth of loop nest considered for the transform"));
94
95	// We prefer cache cost to vectorization by default.
96	static cl::list<RuleTy> Profitabilities(
97	"loop-interchange-profitabilities", cl::ZeroOrMore,
98	cl::MiscFlags::CommaSeparated, cl::Hidden,
99	cl::desc ("List of profitability heuristics to be used. They are applied in "
100	"the given order"),
101	cl::list_init<RuleTy>(Vals: {RuleTy::PerLoopCacheAnalysis,
102	RuleTy::PerInstrOrderCost,
103	RuleTy::ForVectorization}),
104	cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
105	"Prioritize loop cache cost"),
106	clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
107	"Prioritize the IVs order of each instruction"),
108	clEnumValN(RuleTy::ForVectorization, "vectorize",
109	"Prioritize vectorization")));
110
111	#ifndef NDEBUG
112	static bool noDuplicateRules(ArrayRef<RuleTy> Rules) {
113	SmallSet<RuleTy, `4`> Set;
114	for (RuleTy Rule : Rules)
115	if (!Set.insert(Rule).second)
116	return false;
117	return true;
118	}
119
120	static void printDepMatrix(CharMatrix &DepMatrix) {
121	for (auto &Row : DepMatrix) {
122	for (auto D : Row)
123	LLVM_DEBUG(dbgs() << D << " ");
124	LLVM_DEBUG(dbgs() << "\n");
125	}
126	}
127	#endif
128
129	static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
130	Loop L, DependenceInfo DI,
131	ScalarEvolution *SE,
132	OptimizationRemarkEmitter *ORE) {
133	using ValueVector = SmallVector<Value *, `16`>;
134
135	ValueVector MemInstr;
136
137	// For each block.
138	for (BasicBlock *BB : L->blocks()) {
139	// Scan the BB and collect legal loads and stores.
140	for (Instruction &I : *BB) {
141	if (!isa<Instruction>(Val: I))
142	return false;
143	if (auto *Ld = dyn_cast<LoadInst>(Val: &I)) {
144	if (!Ld->isSimple())
145	return false;
146	MemInstr.push_back(Elt: &I);
147	} else if (auto *St = dyn_cast<StoreInst>(Val: &I)) {
148	if (!St->isSimple())
149	return false;
150	MemInstr.push_back(Elt: &I);
151	}
152	}
153	}
154
155	LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
156	<< " Loads and Stores to analyze\n");
157	if (MemInstr.size() > MaxMemInstrCount) {
158	LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
159	<< MaxMemInstrCount << " load/stores in a loop\n");
160	ORE->emit(RemarkBuilder: [&]() {
161	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedLoop",
162	L->getStartLoc(), L->getHeader())
163	<< "Number of loads/stores exceeded, the supported maximum "
164	"can be increased with option "
165	"-loop-interchange-maxmeminstr-count.";
166	});
167	return false;
168	}
169	ValueVector::iterator I, IE, J, JE;
170	StringSet<> Seen;
171
172	for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
173	for (J = I, JE = MemInstr.end(); J != JE; ++J) {
174	std::vector<char> Dep;
175	Instruction Src = cast<Instruction>(Val: I);
176	Instruction Dst = cast<Instruction>(Val: J);
177	// Ignore Input dependencies.
178	if (isa<LoadInst>(Val: Src) && isa<LoadInst>(Val: Dst))
179	continue;
180	// Track Output, Flow, and Anti dependencies.
181	if (auto D = DI->depends(Src, Dst)) {
182	assert(D->isOrdered() && "Expected an output, flow or anti dep.");
183	// If the direction vector is negative, normalize it to
184	// make it non-negative.
185	if (D ->normalize(SE))
186	LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
187	LLVM_DEBUG(StringRef DepType =
188	D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
189	dbgs() << "Found " << DepType
190	<< " dependency between Src and Dst\n"
191	<< " Src:" << Src << "\n Dst:" << Dst << `'\n'`);
192	unsigned Levels = D ->getLevels();
193	char Direction;
194	for (unsigned II = `1`; II <= Levels; ++II) {
195	// `DVEntry::LE` is converted to ``. This is because `LE` means `<`*
196	// or `=`, for which we don't have an equivalent representation, so
197	// that the conservative approximation is necessary. The same goes for
198	// `DVEntry::GE`.
199	// TODO: Use of fine-grained expressions allows for more accurate
200	// analysis.
201	unsigned Dir = D ->getDirection(Level: II);
202	if (Dir == Dependence::DVEntry::LT)
203	Direction = `'<'`;
204	else if (Dir == Dependence::DVEntry::GT)
205	Direction = `'>'`;
206	else if (Dir == Dependence::DVEntry::EQ)
207	Direction = `'='`;
208	else
209	Direction = `'*'`;
210	Dep.push_back(x: Direction);
211	}
212
213	// If the Dependence object doesn't have any information, fill the
214	// dependency vector with ''.*
215	if (D ->isConfused()) {
216	assert(Dep.empty() && "Expected empty dependency vector");
217	Dep.assign(n: Level, val: `'*'`);
218	}
219
220	while (Dep.size() != Level) {
221	Dep.push_back(x: `'I'`);
222	}
223
224	// Make sure we only add unique entries to the dependency matrix.
225	if (Seen.insert(key: StringRef (Dep.data(), Dep.size())).second)
226	DepMatrix.push_back(x: Dep);
227	}
228	}
229	}
230
231	return true;
232	}
233
234	// A loop is moved from index 'from' to an index 'to'. Update the Dependence
235	// matrix by exchanging the two columns.
236	static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
237	unsigned ToIndx) {
238	for (auto &Row : DepMatrix)
239	std::swap(a&: Row [ToIndx], b&: Row [FromIndx]);
240	}
241
242	// Check if a direction vector is lexicographically positive. Return true if it
243	// is positive, nullopt if it is "zero", otherwise false.
244	// [Theorem] A permutation of the loops in a perfect nest is legal if and only
245	// if the direction matrix, after the same permutation is applied to its
246	// columns, has no ">" direction as the leftmost non-"=" direction in any row.
247	static std::optional<bool>
248	isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
249	for (unsigned char Direction : DV.slice(N: Begin, M: End - Begin)) {
250	if (Direction == `'<'`)
251	return true;
252	if (Direction == `'>'` \|\| Direction == `'*'`)
253	return false;
254	}
255	return std::nullopt;
256	}
257
258	// Checks if it is legal to interchange 2 loops.
259	static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
260	unsigned InnerLoopId,
261	unsigned OuterLoopId) {
262	unsigned NumRows = DepMatrix.size();
263	std::vector<char> Cur;
264	// For each row check if it is valid to interchange.
265	for (unsigned Row = `0`; Row < NumRows; ++Row) {
266	// Create temporary DepVector check its lexicographical order
267	// before and after swapping OuterLoop vs InnerLoop
268	Cur = DepMatrix [Row];
269
270	// If the surrounding loops already ensure that the direction vector is
271	// lexicographically positive, nothing within the loop will be able to break
272	// the dependence. In such a case we can skip the subsequent check.
273	if (isLexicographicallyPositive(DV: Cur, Begin: `0`, End: OuterLoopId) == true)
274	continue;
275
276	// Check if the direction vector is lexicographically positive (or zero)
277	// for both before/after exchanged.
278	if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size()) == false)
279	return false;
280	std::swap(a&: Cur [InnerLoopId], b&: Cur [OuterLoopId]);
281	if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size()) == false)
282	return false;
283	}
284	return true;
285	}
286
287	static void populateWorklist(Loop &L, LoopVector &LoopList) {
288	LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
289	<< L.getHeader()->getParent()->getName() << " Loop: %"
290	<< L.getHeader()->getName() << `'\n'`);
291	assert(LoopList.empty() && "LoopList should initially be empty!");
292	Loop *CurrentLoop = &L;
293	const std::vector<Loop > Vec = &CurrentLoop->getSubLoops();
294	while (!Vec->empty()) {
295	// The current loop has multiple subloops in it hence it is not tightly
296	// nested.
297	// Discard all loops above it added into Worklist.
298	if (Vec->size() != `1`) {
299	LoopList = {};
300	return;
301	}
302
303	LoopList.push_back(Elt: CurrentLoop);
304	CurrentLoop = Vec->front();
305	Vec = &CurrentLoop->getSubLoops();
306	}
307	LoopList.push_back(Elt: CurrentLoop);
308	}
309
310	static bool hasSupportedLoopDepth(ArrayRef<Loop *> LoopList,
311	OptimizationRemarkEmitter &ORE) {
312	unsigned LoopNestDepth = LoopList.size();
313	if (LoopNestDepth < MinLoopNestDepth \|\| LoopNestDepth > MaxLoopNestDepth) {
314	LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
315	<< ", the supported range is [" << MinLoopNestDepth
316	<< ", " << MaxLoopNestDepth << "].\n");
317	Loop *OuterLoop = LoopList.front();
318	ORE.emit(RemarkBuilder: [&]() {
319	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedLoopNestDepth",
320	OuterLoop->getStartLoc(),
321	OuterLoop->getHeader())
322	<< "Unsupported depth of loop nest, the supported range is ["
323	<< std::to_string(val: MinLoopNestDepth) << ", "
324	<< std::to_string(val: MaxLoopNestDepth) << "].\n";
325	});
326	return false;
327	}
328	return true;
329	}
330
331	static bool isComputableLoopNest(ScalarEvolution *SE,
332	ArrayRef<Loop *> LoopList) {
333	for (Loop *L : LoopList) {
334	const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
335	if (isa<SCEVCouldNotCompute>(Val: ExitCountOuter)) {
336	LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
337	return false;
338	}
339	if (L->getNumBackEdges() != `1`) {
340	LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
341	return false;
342	}
343	if (!L->getExitingBlock()) {
344	LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
345	return false;
346	}
347	}
348	return true;
349	}
350
351	namespace {
352
353	/// LoopInterchangeLegality checks if it is legal to interchange the loop.
354	class LoopInterchangeLegality {
355	public:
356	LoopInterchangeLegality(Loop Outer, Loop Inner, ScalarEvolution *SE,
357	OptimizationRemarkEmitter *ORE)
358	: OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
359
360	/// Check if the loops can be interchanged.
361	bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
362	CharMatrix &DepMatrix);
363
364	/// Discover induction PHIs in the header of \p L. Induction
365	/// PHIs are added to \p Inductions.
366	bool findInductions(Loop L, SmallVectorImpl<PHINode > &Inductions);
367
368	/// Check if the loop structure is understood. We do not handle triangular
369	/// loops for now.
370	bool isLoopStructureUnderstood();
371
372	bool currentLimitations();
373
374	const SmallPtrSetImpl<PHINode > &getOuterInnerReductions() const* {
375	return OuterInnerReductions;
376	}
377
378	const ArrayRef<PHINode > getInnerLoopInductions() const* {
379	return InnerLoopInductions;
380	}
381
382	private:
383	bool tightlyNested(Loop Outer, Loop Inner);
384	bool containsUnsafeInstructions(BasicBlock *BB);
385
386	/// Discover induction and reduction PHIs in the header of \p L. Induction
387	/// PHIs are added to \p Inductions, reductions are added to
388	/// OuterInnerReductions. When the outer loop is passed, the inner loop needs
389	/// to be passed as \p InnerLoop.
390	bool findInductionAndReductions(Loop *L,
391	SmallVector<PHINode *, `8`> &Inductions,
392	Loop *InnerLoop);
393
394	Loop *OuterLoop;
395	Loop *InnerLoop;
396
397	ScalarEvolution *SE;
398
399	/// Interface to emit optimization remarks.
400	OptimizationRemarkEmitter *ORE;
401
402	/// Set of reduction PHIs taking part of a reduction across the inner and
403	/// outer loop.
404	SmallPtrSet<PHINode *, `4`> OuterInnerReductions;
405
406	/// Set of inner loop induction PHIs
407	SmallVector<PHINode *, `8`> InnerLoopInductions;
408	};
409
410	/// LoopInterchangeProfitability checks if it is profitable to interchange the
411	/// loop.
412	class LoopInterchangeProfitability {
413	public:
414	LoopInterchangeProfitability(Loop Outer, Loop Inner, ScalarEvolution *SE,
415	OptimizationRemarkEmitter *ORE)
416	: OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
417
418	/// Check if the loop interchange is profitable.
419	bool isProfitable(const Loop InnerLoop, const* Loop *OuterLoop,
420	unsigned InnerLoopId, unsigned OuterLoopId,
421	CharMatrix &DepMatrix,
422	const DenseMap<const Loop , unsigned*> &CostMap,
423	std::unique_ptr<CacheCost> &CC);
424
425	private:
426	int getInstrOrderCost();
427	std::optional<bool> isProfitablePerLoopCacheAnalysis(
428	const DenseMap<const Loop , unsigned*> &CostMap,
429	std::unique_ptr<CacheCost> &CC);
430	std::optional<bool> isProfitablePerInstrOrderCost();
431	std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
432	unsigned OuterLoopId,
433	CharMatrix &DepMatrix);
434	Loop *OuterLoop;
435	Loop *InnerLoop;
436
437	/// Scev analysis.
438	ScalarEvolution *SE;
439
440	/// Interface to emit optimization remarks.
441	OptimizationRemarkEmitter *ORE;
442	};
443
444	/// LoopInterchangeTransform interchanges the loop.
445	class LoopInterchangeTransform {
446	public:
447	LoopInterchangeTransform(Loop Outer, Loop Inner, ScalarEvolution *SE,
448	LoopInfo LI, DominatorTree DT,
449	const LoopInterchangeLegality &LIL)
450	: OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
451
452	/// Interchange OuterLoop and InnerLoop.
453	bool transform();
454	void restructureLoops(Loop NewInner, Loop NewOuter,
455	BasicBlock *OrigInnerPreHeader,
456	BasicBlock *OrigOuterPreHeader);
457	void removeChildLoop(Loop OuterLoop, Loop InnerLoop);
458
459	private:
460	bool adjustLoopLinks();
461	bool adjustLoopBranches();
462
463	Loop *OuterLoop;
464	Loop *InnerLoop;
465
466	/// Scev analysis.
467	ScalarEvolution *SE;
468
469	LoopInfo *LI;
470	DominatorTree *DT;
471
472	const LoopInterchangeLegality &LIL;
473	};
474
475	struct LoopInterchange {
476	ScalarEvolution SE = nullptr*;
477	LoopInfo LI = nullptr*;
478	DependenceInfo DI = nullptr*;
479	DominatorTree DT = nullptr*;
480	std::unique_ptr<CacheCost> CC = nullptr;
481
482	/// Interface to emit optimization remarks.
483	OptimizationRemarkEmitter *ORE;
484
485	LoopInterchange(ScalarEvolution SE, LoopInfo LI, DependenceInfo *DI,
486	DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
487	OptimizationRemarkEmitter *ORE)
488	: SE(SE), LI(LI), DI(DI), DT(DT), CC (std::move(CC)), ORE(ORE) {}
489
490	bool run(Loop *L) {
491	if (L->getParentLoop())
492	return false;
493	SmallVector<Loop *, `8`> LoopList;
494	populateWorklist(L&: *L, LoopList);
495	return processLoopList(LoopList);
496	}
497
498	bool run(LoopNest &LN) {
499	SmallVector<Loop *, `8`> LoopList(LN.getLoops());
500	for (unsigned I = `1`; I < LoopList.size(); ++I)
501	if (LoopList [I]->getParentLoop() != LoopList [I - `1`])
502	return false;
503	return processLoopList(LoopList);
504	}
505
506	unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
507	// TODO: Add a better heuristic to select the loop to be interchanged based
508	// on the dependence matrix. Currently we select the innermost loop.
509	return LoopList.size() - `1`;
510	}
511
512	bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
513	bool Changed = false;
514
515	// Ensure proper loop nest depth.
516	assert(hasSupportedLoopDepth(LoopList, *ORE) &&
517	"Unsupported depth of loop nest.");
518
519	unsigned LoopNestDepth = LoopList.size();
520
521	LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
522	<< "\n");
523
524	CharMatrix DependencyMatrix;
525	Loop OuterMostLoop = (LoopList.begin());
526	if (!populateDependencyMatrix(DepMatrix&: DependencyMatrix, Level: LoopNestDepth,
527	L: OuterMostLoop, DI, SE, ORE)) {
528	LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
529	return false;
530	}
531
532	LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
533	printDepMatrix(DependencyMatrix));
534
535	// Get the Outermost loop exit.
536	BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
537	if (!LoopNestExit) {
538	LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
539	return false;
540	}
541
542	unsigned SelecLoopId = selectLoopForInterchange(LoopList);
543	// Obtain the loop vector returned from loop cache analysis beforehand,
544	// and put each <Loop, index> pair into a map for constant time query
545	// later. Indices in loop vector reprsent the optimal order of the
546	// corresponding loop, e.g., given a loopnest with depth N, index 0
547	// indicates the loop should be placed as the outermost loop and index N
548	// indicates the loop should be placed as the innermost loop.
549	//
550	// For the old pass manager CacheCost would be null.
551	DenseMap<const Loop , unsigned*> CostMap;
552	if (CC != nullptr) {
553	for (const auto &[Idx, Cost] : enumerate(First: CC ->getLoopCosts()))
554	CostMap [Cost.first] = Idx;
555	}
556	// We try to achieve the globally optimal memory access for the loopnest,
557	// and do interchange based on a bubble-sort fasion. We start from
558	// the innermost loop, move it outwards to the best possible position
559	// and repeat this process.
560	for (unsigned j = SelecLoopId; j > `0`; j--) {
561	bool ChangedPerIter = false;
562	for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
563	bool Interchanged =
564	processLoop(LoopList, InnerLoopId: i, OuterLoopId: i - `1`, DependencyMatrix, CostMap);
565	ChangedPerIter \|= Interchanged;
566	Changed \|= Interchanged;
567	}
568	// Early abort if there was no interchange during an entire round of
569	// moving loops outwards.
570	if (!ChangedPerIter)
571	break;
572	}
573	return Changed;
574	}
575
576	bool processLoop(SmallVectorImpl<Loop > &LoopList, unsigned* InnerLoopId,
577	unsigned OuterLoopId,
578	std::vector<std::vector<char>> &DependencyMatrix,
579	const DenseMap<const Loop , unsigned*> &CostMap) {
580	Loop *OuterLoop = LoopList [OuterLoopId];
581	Loop *InnerLoop = LoopList [InnerLoopId];
582	LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
583	<< " and OuterLoopId = " << OuterLoopId << "\n");
584	LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
585	if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DepMatrix&: DependencyMatrix)) {
586	LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
587	return false;
588	}
589	LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
590	LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
591	if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
592	DepMatrix&: DependencyMatrix, CostMap, CC)) {
593	LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
594	return false;
595	}
596
597	ORE->emit(RemarkBuilder: [&]() {
598	return OptimizationRemark (DEBUG_TYPE, "Interchanged",
599	InnerLoop->getStartLoc(),
600	InnerLoop->getHeader())
601	<< "Loop interchanged with enclosing loop.";
602	});
603
604	LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
605	LIT.transform();
606	LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
607	LoopsInterchanged ++;
608
609	llvm::formLCSSARecursively(L&: OuterLoop, DT: DT, LI, SE);
610
611	// Loops interchanged, update LoopList accordingly.
612	std::swap(a&: LoopList [OuterLoopId], b&: LoopList [InnerLoopId]);
613	// Update the DependencyMatrix
614	interChangeDependencies(DepMatrix&: DependencyMatrix, FromIndx: InnerLoopId, ToIndx: OuterLoopId);
615
616	LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
617	printDepMatrix(DependencyMatrix));
618
619	return true;
620	}
621	};
622
623	} // end anonymous namespace
624
625	bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
626	return any_of(Range&: BB, P: [](const* Instruction &I) {
627	return I.mayHaveSideEffects() \|\| I.mayReadFromMemory();
628	});
629	}
630
631	bool LoopInterchangeLegality::tightlyNested(Loop OuterLoop, Loop InnerLoop) {
632	BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
633	BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
634	BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
635
636	LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
637
638	// A perfectly nested loop will not have any branch in between the outer and
639	// inner block i.e. outer header will branch to either inner preheader and
640	// outerloop latch.
641	BranchInst *OuterLoopHeaderBI =
642	dyn_cast<BranchInst>(Val: OuterLoopHeader->getTerminator());
643	if (!OuterLoopHeaderBI)
644	return false;
645
646	for (BasicBlock *Succ : successors(I: OuterLoopHeaderBI))
647	if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
648	Succ != OuterLoopLatch)
649	return false;
650
651	LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
652	// We do not have any basic block in between now make sure the outer header
653	// and outer loop latch doesn't contain any unsafe instructions.
654	if (containsUnsafeInstructions(BB: OuterLoopHeader) \|\|
655	containsUnsafeInstructions(BB: OuterLoopLatch))
656	return false;
657
658	// Also make sure the inner loop preheader does not contain any unsafe
659	// instructions. Note that all instructions in the preheader will be moved to
660	// the outer loop header when interchanging.
661	if (InnerLoopPreHeader != OuterLoopHeader &&
662	containsUnsafeInstructions(BB: InnerLoopPreHeader))
663	return false;
664
665	BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
666	// Ensure the inner loop exit block flows to the outer loop latch possibly
667	// through empty blocks.
668	const BasicBlock &SuccInner =
669	LoopNest::skipEmptyBlockUntil(From: InnerLoopExit, End: OuterLoopLatch);
670	if (&SuccInner != OuterLoopLatch) {
671	LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
672	<< " does not lead to the outer loop latch.\n";);
673	return false;
674	}
675	// The inner loop exit block does flow to the outer loop latch and not some
676	// other BBs, now make sure it contains safe instructions, since it will be
677	// moved into the (new) inner loop after interchange.
678	if (containsUnsafeInstructions(BB: InnerLoopExit))
679	return false;
680
681	LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
682	// We have a perfect loop nest.
683	return true;
684	}
685
686	bool LoopInterchangeLegality::isLoopStructureUnderstood() {
687	BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
688	for (PHINode *InnerInduction : InnerLoopInductions) {
689	unsigned Num = InnerInduction->getNumOperands();
690	for (unsigned i = `0`; i < Num; ++i) {
691	Value *Val = InnerInduction->getOperand(i_nocapture: i);
692	if (isa<Constant>(Val))
693	continue;
694	Instruction *I = dyn_cast<Instruction>(Val);
695	if (!I)
696	return false;
697	// TODO: Handle triangular loops.
698	// e.g. for(int i=0;i<N;i++)
699	// for(int j=i;j<N;j++)
700	unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
701	if (InnerInduction->getIncomingBlock(i: IncomBlockIndx) ==
702	InnerLoopPreheader &&
703	!OuterLoop->isLoopInvariant(V: I)) {
704	return false;
705	}
706	}
707	}
708
709	// TODO: Handle triangular loops of another form.
710	// e.g. for(int i=0;i<N;i++)
711	// for(int j=0;j<i;j++)
712	// or,
713	// for(int i=0;i<N;i++)
714	// for(int j=0;ji<N;j++)*
715	BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
716	BranchInst *InnerLoopLatchBI =
717	dyn_cast<BranchInst>(Val: InnerLoopLatch->getTerminator());
718	if (!InnerLoopLatchBI->isConditional())
719	return false;
720	if (CmpInst *InnerLoopCmp =
721	dyn_cast<CmpInst>(Val: InnerLoopLatchBI->getCondition())) {
722	Value *Op0 = InnerLoopCmp->getOperand(i_nocapture: `0`);
723	Value *Op1 = InnerLoopCmp->getOperand(i_nocapture: `1`);
724
725	// LHS and RHS of the inner loop exit condition, e.g.,
726	// in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
727	Value Left = nullptr*;
728	Value Right = nullptr*;
729
730	// Check if V only involves inner loop induction variable.
731	// Return true if V is InnerInduction, or a cast from
732	// InnerInduction, or a binary operator that involves
733	// InnerInduction and a constant.
734	std::function<bool(Value *)> IsPathToInnerIndVar;
735	IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value V) -> bool* {
736	if (llvm::is_contained(Range&: InnerLoopInductions, Element: V))
737	return true;
738	if (isa<Constant>(Val: V))
739	return true;
740	const Instruction *I = dyn_cast<Instruction>(Val: V);
741	if (!I)
742	return false;
743	if (isa<CastInst>(Val: I))
744	return IsPathToInnerIndVar (I->getOperand(i: `0`));
745	if (isa<BinaryOperator>(Val: I))
746	return IsPathToInnerIndVar (I->getOperand(i: `0`)) &&
747	IsPathToInnerIndVar (I->getOperand(i: `1`));
748	return false;
749	};
750
751	// In case of multiple inner loop indvars, it is okay if LHS and RHS
752	// are both inner indvar related variables.
753	if (IsPathToInnerIndVar (Op0) && IsPathToInnerIndVar (Op1))
754	return true;
755
756	// Otherwise we check if the cmp instruction compares an inner indvar
757	// related variable (Left) with a outer loop invariant (Right).
758	if (IsPathToInnerIndVar (Op0) && !isa<Constant>(Val: Op0)) {
759	Left = Op0;
760	Right = Op1;
761	} else if (IsPathToInnerIndVar (Op1) && !isa<Constant>(Val: Op1)) {
762	Left = Op1;
763	Right = Op0;
764	}
765
766	if (Left == nullptr)
767	return false;
768
769	const SCEV *S = SE->getSCEV(V: Right);
770	if (!SE->isLoopInvariant(S, L: OuterLoop))
771	return false;
772	}
773
774	return true;
775	}
776
777	// If SV is a LCSSA PHI node with a single incoming value, return the incoming
778	// value.
779	static Value followLCSSA(Value SV) {
780	PHINode *PHI = dyn_cast<PHINode>(Val: SV);
781	if (!PHI)
782	return SV;
783
784	if (PHI->getNumIncomingValues() != `1`)
785	return SV;
786	return followLCSSA(SV: PHI->getIncomingValue(i: `0`));
787	}
788
789	// Check V's users to see if it is involved in a reduction in L.
790	static PHINode findInnerReductionPhi(Loop L, Value *V) {
791	// Reduction variables cannot be constants.
792	if (isa<Constant>(Val: V))
793	return nullptr;
794
795	for (Value *User : V->users()) {
796	if (PHINode *PHI = dyn_cast<PHINode>(Val: User)) {
797	if (PHI->getNumIncomingValues() == `1`)
798	continue;
799	RecurrenceDescriptor RD;
800	if (RecurrenceDescriptor::isReductionPHI(Phi: PHI, TheLoop: L, RedDes&: RD)) {
801	// Detect floating point reduction only when it can be reordered.
802	if (RD.getExactFPMathInst() != nullptr)
803	return nullptr;
804	return PHI;
805	}
806	return nullptr;
807	}
808	}
809
810	return nullptr;
811	}
812
813	bool LoopInterchangeLegality::findInductionAndReductions(
814	Loop L, SmallVector<PHINode , `8`> &Inductions, Loop *InnerLoop) {
815	if (!L->getLoopLatch() \|\| !L->getLoopPredecessor())
816	return false;
817	for (PHINode &PHI : L->getHeader()->phis()) {
818	InductionDescriptor ID;
819	if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID))
820	Inductions.push_back(Elt: &PHI);
821	else {
822	// PHIs in inner loops need to be part of a reduction in the outer loop,
823	// discovered when checking the PHIs of the outer loop earlier.
824	if (!InnerLoop) {
825	if (!OuterInnerReductions.count(Ptr: &PHI)) {
826	LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
827	"across the outer loop.\n");
828	return false;
829	}
830	} else {
831	assert(PHI.getNumIncomingValues() == `2` &&
832	"Phis in loop header should have exactly 2 incoming values");
833	// Check if we have a PHI node in the outer loop that has a reduction
834	// result from the inner loop as an incoming value.
835	Value *V = followLCSSA(SV: PHI.getIncomingValueForBlock(BB: L->getLoopLatch()));
836	PHINode *InnerRedPhi = findInnerReductionPhi(L: InnerLoop, V);
837	if (!InnerRedPhi \|\|
838	!llvm::is_contained(Range: InnerRedPhi->incoming_values(), Element: &PHI)) {
839	LLVM_DEBUG(
840	dbgs()
841	<< "Failed to recognize PHI as an induction or reduction.\n");
842	return false;
843	}
844	OuterInnerReductions.insert(Ptr: &PHI);
845	OuterInnerReductions.insert(Ptr: InnerRedPhi);
846	}
847	}
848	}
849	return true;
850	}
851
852	// This function indicates the current limitations in the transform as a result
853	// of which we do not proceed.
854	bool LoopInterchangeLegality::currentLimitations() {
855	BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
856
857	// transform currently expects the loop latches to also be the exiting
858	// blocks.
859	if (InnerLoop->getExitingBlock() != InnerLoopLatch \|\|
860	OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() \|\|
861	!isa<BranchInst>(Val: InnerLoopLatch->getTerminator()) \|\|
862	!isa<BranchInst>(Val: OuterLoop->getLoopLatch()->getTerminator())) {
863	LLVM_DEBUG(
864	dbgs() << "Loops where the latch is not the exiting block are not"
865	<< " supported currently.\n");
866	ORE->emit(RemarkBuilder: [&]() {
867	return OptimizationRemarkMissed (DEBUG_TYPE, "ExitingNotLatch",
868	OuterLoop->getStartLoc(),
869	OuterLoop->getHeader())
870	<< "Loops where the latch is not the exiting block cannot be"
871	" interchange currently.";
872	});
873	return true;
874	}
875
876	SmallVector<PHINode *, `8`> Inductions;
877	if (!findInductionAndReductions(L: OuterLoop, Inductions, InnerLoop)) {
878	LLVM_DEBUG(
879	dbgs() << "Only outer loops with induction or reduction PHI nodes "
880	<< "are supported currently.\n");
881	ORE->emit(RemarkBuilder: [&]() {
882	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedPHIOuter",
883	OuterLoop->getStartLoc(),
884	OuterLoop->getHeader())
885	<< "Only outer loops with induction or reduction PHI nodes can be"
886	" interchanged currently.";
887	});
888	return true;
889	}
890
891	Inductions.clear();
892	// For multi-level loop nests, make sure that all phi nodes for inner loops
893	// at all levels can be recognized as a induction or reduction phi. Bail out
894	// if a phi node at a certain nesting level cannot be properly recognized.
895	Loop *CurLevelLoop = OuterLoop;
896	while (!CurLevelLoop->getSubLoops().empty()) {
897	// We already made sure that the loop nest is tightly nested.
898	CurLevelLoop = CurLevelLoop->getSubLoops().front();
899	if (!findInductionAndReductions(L: CurLevelLoop, Inductions, InnerLoop: nullptr)) {
900	LLVM_DEBUG(
901	dbgs() << "Only inner loops with induction or reduction PHI nodes "
902	<< "are supported currently.\n");
903	ORE->emit(RemarkBuilder: [&]() {
904	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedPHIInner",
905	CurLevelLoop->getStartLoc(),
906	CurLevelLoop->getHeader())
907	<< "Only inner loops with induction or reduction PHI nodes can be"
908	" interchange currently.";
909	});
910	return true;
911	}
912	}
913
914	// TODO: Triangular loops are not handled for now.
915	if (!isLoopStructureUnderstood()) {
916	LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
917	ORE->emit(RemarkBuilder: [&]() {
918	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedStructureInner",
919	InnerLoop->getStartLoc(),
920	InnerLoop->getHeader())
921	<< "Inner loop structure not understood currently.";
922	});
923	return true;
924	}
925
926	return false;
927	}
928
929	bool LoopInterchangeLegality::findInductions(
930	Loop L, SmallVectorImpl<PHINode > &Inductions) {
931	for (PHINode &PHI : L->getHeader()->phis()) {
932	InductionDescriptor ID;
933	if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID))
934	Inductions.push_back(Elt: &PHI);
935	}
936	return !Inductions.empty();
937	}
938
939	// We currently only support LCSSA PHI nodes in the inner loop exit, if their
940	// users are either reduction PHIs or PHIs outside the outer loop (which means
941	// the we are only interested in the final value after the loop).
942	static bool
943	areInnerLoopExitPHIsSupported(Loop InnerL, Loop OuterL,
944	SmallPtrSetImpl<PHINode *> &Reductions) {
945	BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
946	for (PHINode &PHI : InnerExit->phis()) {
947	// Reduction lcssa phi will have only 1 incoming block that from loop latch.
948	if (PHI.getNumIncomingValues() > `1`)
949	return false;
950	if (any_of(Range: PHI.users(), P: [&Reductions, OuterL](User *U) {
951	PHINode *PN = dyn_cast<PHINode>(Val: U);
952	return !PN \|\|
953	(!Reductions.count(Ptr: PN) && OuterL->contains(BB: PN->getParent()));
954	})) {
955	return false;
956	}
957	}
958	return true;
959	}
960
961	// We currently support LCSSA PHI nodes in the outer loop exit, if their
962	// incoming values do not come from the outer loop latch or if the
963	// outer loop latch has a single predecessor. In that case, the value will
964	// be available if both the inner and outer loop conditions are true, which
965	// will still be true after interchanging. If we have multiple predecessor,
966	// that may not be the case, e.g. because the outer loop latch may be executed
967	// if the inner loop is not executed.
968	static bool areOuterLoopExitPHIsSupported(Loop OuterLoop, Loop InnerLoop) {
969	BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
970	for (PHINode &PHI : LoopNestExit->phis()) {
971	for (Value *Incoming : PHI.incoming_values()) {
972	Instruction *IncomingI = dyn_cast<Instruction>(Val: Incoming);
973	if (!IncomingI \|\| IncomingI->getParent() != OuterLoop->getLoopLatch())
974	continue;
975
976	// The incoming value is defined in the outer loop latch. Currently we
977	// only support that in case the outer loop latch has a single predecessor.
978	// This guarantees that the outer loop latch is executed if and only if
979	// the inner loop is executed (because tightlyNested() guarantees that the
980	// outer loop header only branches to the inner loop or the outer loop
981	// latch).
982	// FIXME: We could weaken this logic and allow multiple predecessors,
983	// if the values are produced outside the loop latch. We would need
984	// additional logic to update the PHI nodes in the exit block as
985	// well.
986	if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
987	return false;
988	}
989	}
990	return true;
991	}
992
993	// In case of multi-level nested loops, it may occur that lcssa phis exist in
994	// the latch of InnerLoop, i.e., when defs of the incoming values are further
995	// inside the loopnest. Sometimes those incoming values are not available
996	// after interchange, since the original inner latch will become the new outer
997	// latch which may have predecessor paths that do not include those incoming
998	// values.
999	// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1000	// multi-level loop nests.
1001	static bool areInnerLoopLatchPHIsSupported(Loop OuterLoop, Loop InnerLoop) {
1002	if (InnerLoop->getSubLoops().empty())
1003	return true;
1004	// If the original outer latch has only one predecessor, then values defined
1005	// further inside the looploop, e.g., in the innermost loop, will be available
1006	// at the new outer latch after interchange.
1007	if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1008	return true;
1009
1010	// The outer latch has more than one predecessors, i.e., the inner
1011	// exit and the inner header.
1012	// PHI nodes in the inner latch are lcssa phis where the incoming values
1013	// are defined further inside the loopnest. Check if those phis are used
1014	// in the original inner latch. If that is the case then bail out since
1015	// those incoming values may not be available at the new outer latch.
1016	BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1017	for (PHINode &PHI : InnerLoopLatch->phis()) {
1018	for (auto *U : PHI.users()) {
1019	Instruction *UI = cast<Instruction>(Val: U);
1020	if (InnerLoopLatch == UI->getParent())
1021	return false;
1022	}
1023	}
1024	return true;
1025	}
1026
1027	bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1028	unsigned OuterLoopId,
1029	CharMatrix &DepMatrix) {
1030	if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1031	LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1032	<< " and OuterLoopId = " << OuterLoopId
1033	<< " due to dependence\n");
1034	ORE->emit(RemarkBuilder: [&]() {
1035	return OptimizationRemarkMissed (DEBUG_TYPE, "Dependence",
1036	InnerLoop->getStartLoc(),
1037	InnerLoop->getHeader())
1038	<< "Cannot interchange loops due to dependences.";
1039	});
1040	return false;
1041	}
1042	// Check if outer and inner loop contain legal instructions only.
1043	for (auto *BB : OuterLoop->blocks())
1044	for (Instruction &I : BB->instructionsWithoutDebug())
1045	if (CallInst *CI = dyn_cast<CallInst>(Val: &I)) {
1046	// readnone functions do not prevent interchanging.
1047	if (CI->onlyWritesMemory())
1048	continue;
1049	LLVM_DEBUG(
1050	dbgs() << "Loops with call instructions cannot be interchanged "
1051	<< "safely.");
1052	ORE->emit(RemarkBuilder: [&]() {
1053	return OptimizationRemarkMissed (DEBUG_TYPE, "CallInst",
1054	CI->getDebugLoc(),
1055	CI->getParent())
1056	<< "Cannot interchange loops due to call instruction.";
1057	});
1058
1059	return false;
1060	}
1061
1062	if (!findInductions(L: InnerLoop, Inductions&: InnerLoopInductions)) {
1063	LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
1064	return false;
1065	}
1066
1067	if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1068	LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1069	ORE->emit(RemarkBuilder: [&]() {
1070	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1071	InnerLoop->getStartLoc(),
1072	InnerLoop->getHeader())
1073	<< "Cannot interchange loops because unsupported PHI nodes found "
1074	"in inner loop latch.";
1075	});
1076	return false;
1077	}
1078
1079	// TODO: The loops could not be interchanged due to current limitations in the
1080	// transform module.
1081	if (currentLimitations()) {
1082	LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1083	return false;
1084	}
1085
1086	// Check if the loops are tightly nested.
1087	if (!tightlyNested(OuterLoop, InnerLoop)) {
1088	LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1089	ORE->emit(RemarkBuilder: [&]() {
1090	return OptimizationRemarkMissed (DEBUG_TYPE, "NotTightlyNested",
1091	InnerLoop->getStartLoc(),
1092	InnerLoop->getHeader())
1093	<< "Cannot interchange loops because they are not tightly "
1094	"nested.";
1095	});
1096	return false;
1097	}
1098
1099	if (!areInnerLoopExitPHIsSupported(InnerL: OuterLoop, OuterL: InnerLoop,
1100	Reductions&: OuterInnerReductions)) {
1101	LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1102	ORE->emit(RemarkBuilder: [&]() {
1103	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedExitPHI",
1104	InnerLoop->getStartLoc(),
1105	InnerLoop->getHeader())
1106	<< "Found unsupported PHI node in loop exit.";
1107	});
1108	return false;
1109	}
1110
1111	if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1112	LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1113	ORE->emit(RemarkBuilder: [&]() {
1114	return OptimizationRemarkMissed (DEBUG_TYPE, "UnsupportedExitPHI",
1115	OuterLoop->getStartLoc(),
1116	OuterLoop->getHeader())
1117	<< "Found unsupported PHI node in loop exit.";
1118	});
1119	return false;
1120	}
1121
1122	return true;
1123	}
1124
1125	int LoopInterchangeProfitability::getInstrOrderCost() {
1126	unsigned GoodOrder, BadOrder;
1127	BadOrder = GoodOrder = `0`;
1128	for (BasicBlock *BB : InnerLoop->blocks()) {
1129	for (Instruction &Ins : *BB) {
1130	if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &Ins)) {
1131	bool FoundInnerInduction = false;
1132	bool FoundOuterInduction = false;
1133	for (Value *Op : GEP->operands()) {
1134	// Skip operands that are not SCEV-able.
1135	if (!SE->isSCEVable(Ty: Op->getType()))
1136	continue;
1137
1138	const SCEV *OperandVal = SE->getSCEV(V: Op);
1139	const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: OperandVal);
1140	if (!AR)
1141	continue;
1142
1143	// If we find the inner induction after an outer induction e.g.
1144	// for(int i=0;i<N;i++)
1145	// for(int j=0;j<N;j++)
1146	// A[i][j] = A[i-1][j-1]+k;
1147	// then it is a good order.
1148	if (AR->getLoop() == InnerLoop) {
1149	// We found an InnerLoop induction after OuterLoop induction. It is
1150	// a good order.
1151	FoundInnerInduction = true;
1152	if (FoundOuterInduction) {
1153	GoodOrder++;
1154	break;
1155	}
1156	}
1157	// If we find the outer induction after an inner induction e.g.
1158	// for(int i=0;i<N;i++)
1159	// for(int j=0;j<N;j++)
1160	// A[j][i] = A[j-1][i-1]+k;
1161	// then it is a bad order.
1162	if (AR->getLoop() == OuterLoop) {
1163	// We found an OuterLoop induction after InnerLoop induction. It is
1164	// a bad order.
1165	FoundOuterInduction = true;
1166	if (FoundInnerInduction) {
1167	BadOrder++;
1168	break;
1169	}
1170	}
1171	}
1172	}
1173	}
1174	}
1175	return GoodOrder - BadOrder;
1176	}
1177
1178	std::optional<bool>
1179	LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1180	const DenseMap<const Loop , unsigned*> &CostMap,
1181	std::unique_ptr<CacheCost> &CC) {
1182	// This is the new cost model returned from loop cache analysis.
1183	// A smaller index means the loop should be placed an outer loop, and vice
1184	// versa.
1185	auto InnerLoopIt = CostMap.find(Val: InnerLoop);
1186	if (InnerLoopIt == CostMap.end())
1187	return std::nullopt;
1188	auto OuterLoopIt = CostMap.find(Val: OuterLoop);
1189	if (OuterLoopIt == CostMap.end())
1190	return std::nullopt;
1191
1192	if (CC ->getLoopCost(L: OuterLoop) == CC ->getLoopCost(L: InnerLoop))
1193	return std::nullopt;
1194	unsigned InnerIndex = InnerLoopIt ->second;
1195	unsigned OuterIndex = OuterLoopIt ->second;
1196	LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1197	<< ", OuterIndex = " << OuterIndex << "\n");
1198	assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1199	"numbers to each loop");
1200	return std::optional<bool>(InnerIndex < OuterIndex);
1201	}
1202
1203	std::optional<bool>
1204	LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1205	// Legacy cost model: this is rough cost estimation algorithm. It counts the
1206	// good and bad order of induction variables in the instruction and allows
1207	// reordering if number of bad orders is more than good.
1208	int Cost = getInstrOrderCost();
1209	LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1210	if (Cost < `0` && Cost < LoopInterchangeCostThreshold)
1211	return std::optional<bool>(true);
1212
1213	return std::nullopt;
1214	}
1215
1216	/// Return true if we can vectorize the loop specified by \p LoopId.
1217	static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1218	for (const auto &Dep : DepMatrix) {
1219	char Dir = Dep [LoopId];
1220	if (Dir != `'I'` && Dir != `'='`)
1221	return false;
1222	}
1223	return true;
1224	}
1225
1226	std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1227	unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1228	// If the outer loop is not loop independent it is not profitable to move
1229	// this to inner position, since doing so would not enable inner loop
1230	// parallelism.
1231	if (!canVectorize(DepMatrix, LoopId: OuterLoopId))
1232	return false;
1233
1234	// If inner loop has dependence and outer loop is loop independent then it is
1235	// profitable to interchange to enable inner loop parallelism.
1236	if (!canVectorize(DepMatrix, LoopId: InnerLoopId))
1237	return true;
1238
1239	// If both the inner and the outer loop can be vectorized, it is necessary to
1240	// check the cost of each vectorized loop for profitability decision. At this
1241	// time we do not have a cost model to estimate them, so return nullopt.
1242	// TODO: Estimate the cost of vectorized loop when both the outer and the
1243	// inner loop can be vectorized.
1244	return std::nullopt;
1245	}
1246
1247	bool LoopInterchangeProfitability::isProfitable(
1248	const Loop InnerLoop, const* Loop OuterLoop, unsigned* InnerLoopId,
1249	unsigned OuterLoopId, CharMatrix &DepMatrix,
1250	const DenseMap<const Loop , unsigned*> &CostMap,
1251	std::unique_ptr<CacheCost> &CC) {
1252	// isProfitable() is structured to avoid endless loop interchange. If the
1253	// highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1254	// decide the profitability then, profitability check will stop and return the
1255	// analysis result. If it failed to determine it (e.g., cache analysis failed
1256	// to analyze the loopnest due to delinearization issues) then go ahead the
1257	// second highest priority rule (isProfitablePerInstrOrderCost by default).
1258	// Likewise, if it failed to analysis the profitability then only, the last
1259	// rule (isProfitableForVectorization by default) will decide.
1260	assert(noDuplicateRules(Profitabilities) && "Detect duplicate rules");
1261	std::optional<bool> shouldInterchange;
1262	for (RuleTy RT : Profitabilities) {
1263	switch (RT) {
1264	case RuleTy::PerLoopCacheAnalysis:
1265	shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1266	break;
1267	case RuleTy::PerInstrOrderCost:
1268	shouldInterchange = isProfitablePerInstrOrderCost();
1269	break;
1270	case RuleTy::ForVectorization:
1271	shouldInterchange =
1272	isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1273	break;
1274	}
1275
1276	// If this rule could determine the profitability, don't call subsequent
1277	// rules.
1278	if (shouldInterchange.has_value())
1279	break;
1280	}
1281
1282	if (!shouldInterchange.has_value()) {
1283	ORE->emit(RemarkBuilder: [&]() {
1284	return OptimizationRemarkMissed (DEBUG_TYPE, "InterchangeNotProfitable",
1285	InnerLoop->getStartLoc(),
1286	InnerLoop->getHeader())
1287	<< "Insufficient information to calculate the cost of loop for "
1288	"interchange.";
1289	});
1290	return false;
1291	} else if (!shouldInterchange.value()) {
1292	ORE->emit(RemarkBuilder: [&]() {
1293	return OptimizationRemarkMissed (DEBUG_TYPE, "InterchangeNotProfitable",
1294	InnerLoop->getStartLoc(),
1295	InnerLoop->getHeader())
1296	<< "Interchanging loops is not considered to improve cache "
1297	"locality nor vectorization.";
1298	});
1299	return false;
1300	}
1301	return true;
1302	}
1303
1304	void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1305	Loop *InnerLoop) {
1306	for (Loop L : OuterLoop)
1307	if (L == InnerLoop) {
1308	OuterLoop->removeChildLoop(Child: L);
1309	return;
1310	}
1311	llvm_unreachable("Couldn't find loop");
1312	}
1313
1314	/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1315	/// new inner and outer loop after interchanging: NewInner is the original
1316	/// outer loop and NewOuter is the original inner loop.
1317	///
1318	/// Before interchanging, we have the following structure
1319	/// Outer preheader
1320	// Outer header
1321	// Inner preheader
1322	// Inner header
1323	// Inner body
1324	// Inner latch
1325	// outer bbs
1326	// Outer latch
1327	//
1328	// After interchanging:
1329	// Inner preheader
1330	// Inner header
1331	// Outer preheader
1332	// Outer header
1333	// Inner body
1334	// outer bbs
1335	// Outer latch
1336	// Inner latch
1337	void LoopInterchangeTransform::restructureLoops(
1338	Loop NewInner, Loop NewOuter, BasicBlock *OrigInnerPreHeader,
1339	BasicBlock *OrigOuterPreHeader) {
1340	Loop *OuterLoopParent = OuterLoop->getParentLoop();
1341	// The original inner loop preheader moves from the new inner loop to
1342	// the parent loop, if there is one.
1343	NewInner->removeBlockFromLoop(BB: OrigInnerPreHeader);
1344	LI->changeLoopFor(BB: OrigInnerPreHeader, L: OuterLoopParent);
1345
1346	// Switch the loop levels.
1347	if (OuterLoopParent) {
1348	// Remove the loop from its parent loop.
1349	removeChildLoop(OuterLoop: OuterLoopParent, InnerLoop: NewInner);
1350	removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter);
1351	OuterLoopParent->addChildLoop(NewChild: NewOuter);
1352	} else {
1353	removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter);
1354	LI->changeTopLevelLoop(OldLoop: NewInner, NewLoop: NewOuter);
1355	}
1356	while (!NewOuter->isInnermost())
1357	NewInner->addChildLoop(NewChild: NewOuter->removeChildLoop(I: NewOuter->begin()));
1358	NewOuter->addChildLoop(NewChild: NewInner);
1359
1360	// BBs from the original inner loop.
1361	SmallVector<BasicBlock *, `8`> OrigInnerBBs(NewOuter->blocks());
1362
1363	// Add BBs from the original outer loop to the original inner loop (excluding
1364	// BBs already in inner loop)
1365	for (BasicBlock *BB : NewInner->blocks())
1366	if (LI->getLoopFor(BB) == NewInner)
1367	NewOuter->addBlockEntry(BB);
1368
1369	// Now remove inner loop header and latch from the new inner loop and move
1370	// other BBs (the loop body) to the new inner loop.
1371	BasicBlock *OuterHeader = NewOuter->getHeader();
1372	BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1373	for (BasicBlock *BB : OrigInnerBBs) {
1374	// Nothing will change for BBs in child loops.
1375	if (LI->getLoopFor(BB) != NewOuter)
1376	continue;
1377	// Remove the new outer loop header and latch from the new inner loop.
1378	if (BB == OuterHeader \|\| BB == OuterLatch)
1379	NewInner->removeBlockFromLoop(BB);
1380	else
1381	LI->changeLoopFor(BB, L: NewInner);
1382	}
1383
1384	// The preheader of the original outer loop becomes part of the new
1385	// outer loop.
1386	NewOuter->addBlockEntry(BB: OrigOuterPreHeader);
1387	LI->changeLoopFor(BB: OrigOuterPreHeader, L: NewOuter);
1388
1389	// Tell SE that we move the loops around.
1390	SE->forgetLoop(L: NewOuter);
1391	}
1392
1393	bool LoopInterchangeTransform::transform() {
1394	bool Transformed = false;
1395
1396	if (InnerLoop->getSubLoops().empty()) {
1397	BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1398	LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1399	auto &InductionPHIs = LIL.getInnerLoopInductions();
1400	if (InductionPHIs.empty()) {
1401	LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1402	return false;
1403	}
1404
1405	SmallVector<Instruction *, `8`> InnerIndexVarList;
1406	for (PHINode *CurInductionPHI : InductionPHIs) {
1407	if (CurInductionPHI->getIncomingBlock(i: `0`) == InnerLoopPreHeader)
1408	InnerIndexVarList.push_back(
1409	Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: `1`)));
1410	else
1411	InnerIndexVarList.push_back(
1412	Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: `0`)));
1413	}
1414
1415	// Create a new latch block for the inner loop. We split at the
1416	// current latch's terminator and then move the condition and all
1417	// operands that are not either loop-invariant or the induction PHI into the
1418	// new latch block.
1419	BasicBlock *NewLatch =
1420	SplitBlock(Old: InnerLoop->getLoopLatch(),
1421	SplitPt: InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1422
1423	SmallSetVector<Instruction *, `4`> WorkList;
1424	unsigned i = `0`;
1425	auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1426	for (; i < WorkList.size(); i++) {
1427	// Duplicate instruction and move it the new latch. Update uses that
1428	// have been moved.
1429	Instruction *NewI = WorkList [i]->clone();
1430	NewI->insertBefore(InsertPos: NewLatch->getFirstNonPHIIt());
1431	assert(!NewI->mayHaveSideEffects() &&
1432	"Moving instructions with side-effects may change behavior of "
1433	"the loop nest!");
1434	for (Use &U : llvm::make_early_inc_range(Range: WorkList [i]->uses())) {
1435	Instruction *UserI = cast<Instruction>(Val: U.getUser());
1436	if (!InnerLoop->contains(BB: UserI->getParent()) \|\|
1437	UserI->getParent() == NewLatch \|\|
1438	llvm::is_contained(Range: InductionPHIs, Element: UserI))
1439	U.set(NewI);
1440	}
1441	// Add operands of moved instruction to the worklist, except if they are
1442	// outside the inner loop or are the induction PHI.
1443	for (Value *Op : WorkList [i]->operands()) {
1444	Instruction *OpI = dyn_cast<Instruction>(Val: Op);
1445	if (!OpI \|\|
1446	this->LI->getLoopFor(BB: OpI->getParent()) != this->InnerLoop \|\|
1447	llvm::is_contained(Range: InductionPHIs, Element: OpI))
1448	continue;
1449	WorkList.insert(X: OpI);
1450	}
1451	}
1452	};
1453
1454	// FIXME: Should we interchange when we have a constant condition?
1455	Instruction *CondI = dyn_cast<Instruction>(
1456	Val: cast<BranchInst>(Val: InnerLoop->getLoopLatch()->getTerminator())
1457	->getCondition());
1458	if (CondI)
1459	WorkList.insert(X: CondI);
1460	MoveInstructions ();
1461	for (Instruction *InnerIndexVar : InnerIndexVarList)
1462	WorkList.insert(X: cast<Instruction>(Val: InnerIndexVar));
1463	MoveInstructions ();
1464	}
1465
1466	// Ensure the inner loop phi nodes have a separate basic block.
1467	BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1468	if (&*InnerLoopHeader->getFirstNonPHIIt() !=
1469	InnerLoopHeader->getTerminator()) {
1470	SplitBlock(Old: InnerLoopHeader, SplitPt: InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
1471	LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
1472	}
1473
1474	// Instructions in the original inner loop preheader may depend on values
1475	// defined in the outer loop header. Move them there, because the original
1476	// inner loop preheader will become the entry into the interchanged loop nest.
1477	// Currently we move all instructions and rely on LICM to move invariant
1478	// instructions outside the loop nest.
1479	BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1480	BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1481	if (InnerLoopPreHeader != OuterLoopHeader) {
1482	for (Instruction &I :
1483	make_early_inc_range(Range: make_range(x: InnerLoopPreHeader->begin(),
1484	y: std::prev(x: InnerLoopPreHeader->end()))))
1485	I.moveBeforePreserving(MovePos: OuterLoopHeader->getTerminator()->getIterator());
1486	}
1487
1488	Transformed \|= adjustLoopLinks();
1489	if (!Transformed) {
1490	LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
1491	return false;
1492	}
1493
1494	return true;
1495	}
1496
1497	/// \brief Move all instructions except the terminator from FromBB right before
1498	/// InsertBefore
1499	static void moveBBContents(BasicBlock FromBB, Instruction InsertBefore) {
1500	BasicBlock *ToBB = InsertBefore->getParent();
1501
1502	ToBB->splice(ToIt: InsertBefore->getIterator(), FromBB, FromBeginIt: FromBB->begin(),
1503	FromEndIt: FromBB->getTerminator()->getIterator());
1504	}
1505
1506	/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
1507	static void swapBBContents(BasicBlock BB1, BasicBlock BB2) {
1508	// Save all non-terminator instructions of BB1 into TempInstrs and unlink them
1509	// from BB1 afterwards.
1510	auto Iter = map_range(C&: BB1, F: [](Instruction &I) { return* &I; });
1511	SmallVector<Instruction *, `4`> TempInstrs(Iter.begin(), std::prev(x: Iter.end()));
1512	for (Instruction *I : TempInstrs)
1513	I->removeFromParent();
1514
1515	// Move instructions from BB2 to BB1.
1516	moveBBContents(FromBB: BB2, InsertBefore: BB1->getTerminator());
1517
1518	// Move instructions from TempInstrs to BB2.
1519	for (Instruction *I : TempInstrs)
1520	I->insertBefore(InsertPos: BB2->getTerminator()->getIterator());
1521	}
1522
1523	// Update BI to jump to NewBB instead of OldBB. Records updates to the
1524	// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
1525	// \p OldBB is exactly once in BI's successor list.
1526	static void updateSuccessor(BranchInst BI, BasicBlock OldBB,
1527	BasicBlock *NewBB,
1528	std::vector<DominatorTree::UpdateType> &DTUpdates,
1529	bool MustUpdateOnce = true) {
1530	assert((!MustUpdateOnce \|\| llvm::count(successors(BI), OldBB) == `1`) &&
1531	"BI must jump to OldBB exactly once.");
1532	bool Changed = false;
1533	for (Use &Op : BI->operands())
1534	if (Op == OldBB) {
1535	Op.set(NewBB);
1536	Changed = true;
1537	}
1538
1539	if (Changed) {
1540	DTUpdates.push_back(
1541	x: {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
1542	DTUpdates.push_back(
1543	x: {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
1544	}
1545	assert(Changed && "Expected a successor to be updated");
1546	}
1547
1548	// Move Lcssa PHIs to the right place.
1549	static void moveLCSSAPhis(BasicBlock InnerExit, BasicBlock InnerHeader,
1550	BasicBlock InnerLatch, BasicBlock OuterHeader,
1551	BasicBlock OuterLatch, BasicBlock OuterExit,
1552	Loop InnerLoop, LoopInfo LI) {
1553
1554	// Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
1555	// defined either in the header or latch. Those blocks will become header and
1556	// latch of the new outer loop, and the only possible users can PHI nodes
1557	// in the exit block of the loop nest or the outer loop header (reduction
1558	// PHIs, in that case, the incoming value must be defined in the inner loop
1559	// header). We can just substitute the user with the incoming value and remove
1560	// the PHI.
1561	for (PHINode &P : make_early_inc_range(Range: InnerExit->phis())) {
1562	assert(P.getNumIncomingValues() == `1` &&
1563	"Only loops with a single exit are supported!");
1564
1565	// Incoming values are guaranteed be instructions currently.
1566	auto IncI = cast<Instruction>(Val: P.getIncomingValueForBlock(BB: InnerLatch));
1567	// In case of multi-level nested loops, follow LCSSA to find the incoming
1568	// value defined from the innermost loop.
1569	auto IncIInnerMost = cast<Instruction>(Val: followLCSSA(SV: IncI));
1570	// Skip phis with incoming values from the inner loop body, excluding the
1571	// header and latch.
1572	if (IncIInnerMost->getParent() != InnerLatch &&
1573	IncIInnerMost->getParent() != InnerHeader)
1574	continue;
1575
1576	assert(all_of(P.users(),
1577	[OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
1578	return (cast<PHINode>(U)->getParent() == OuterHeader &&
1579	IncI->getParent() == InnerHeader) \|\|
1580	cast<PHINode>(U)->getParent() == OuterExit;
1581	}) &&
1582	"Can only replace phis iff the uses are in the loop nest exit or "
1583	"the incoming value is defined in the inner header (it will "
1584	"dominate all loop blocks after interchanging)");
1585	P.replaceAllUsesWith(V: IncI);
1586	P.eraseFromParent();
1587	}
1588
1589	SmallVector<PHINode *, `8`> LcssaInnerExit(
1590	llvm::make_pointer_range(Range: InnerExit->phis()));
1591
1592	SmallVector<PHINode *, `8`> LcssaInnerLatch(
1593	llvm::make_pointer_range(Range: InnerLatch->phis()));
1594
1595	// Lcssa PHIs for values used outside the inner loop are in InnerExit.
1596	// If a PHI node has users outside of InnerExit, it has a use outside the
1597	// interchanged loop and we have to preserve it. We move these to
1598	// InnerLatch, which will become the new exit block for the innermost
1599	// loop after interchanging.
1600	for (PHINode *P : LcssaInnerExit)
1601	P->moveBefore(InsertPos: InnerLatch->getFirstNonPHIIt());
1602
1603	// If the inner loop latch contains LCSSA PHIs, those come from a child loop
1604	// and we have to move them to the new inner latch.
1605	for (PHINode *P : LcssaInnerLatch)
1606	P->moveBefore(InsertPos: InnerExit->getFirstNonPHIIt());
1607
1608	// Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
1609	// incoming values defined in the outer loop, we have to add a new PHI
1610	// in the inner loop latch, which became the exit block of the outer loop,
1611	// after interchanging.
1612	if (OuterExit) {
1613	for (PHINode &P : OuterExit->phis()) {
1614	if (P.getNumIncomingValues() != `1`)
1615	continue;
1616	// Skip Phis with incoming values defined in the inner loop. Those should
1617	// already have been updated.
1618	auto I = dyn_cast<Instruction>(Val: P.getIncomingValue(i: `0`));
1619	if (!I \|\| LI->getLoopFor(BB: I->getParent()) == InnerLoop)
1620	continue;
1621
1622	PHINode *NewPhi = dyn_cast<PHINode>(Val: P.clone());
1623	NewPhi->setIncomingValue(i: `0`, V: P.getIncomingValue(i: `0`));
1624	NewPhi->setIncomingBlock(i: `0`, BB: OuterLatch);
1625	// We might have incoming edges from other BBs, i.e., the original outer
1626	// header.
1627	for (auto *Pred : predecessors(BB: InnerLatch)) {
1628	if (Pred == OuterLatch)
1629	continue;
1630	NewPhi->addIncoming(V: P.getIncomingValue(i: `0`), BB: Pred);
1631	}
1632	NewPhi->insertBefore(InsertPos: InnerLatch->getFirstNonPHIIt());
1633	P.setIncomingValue(i: `0`, V: NewPhi);
1634	}
1635	}
1636
1637	// Now adjust the incoming blocks for the LCSSA PHIs.
1638	// For PHIs moved from Inner's exit block, we need to replace Inner's latch
1639	// with the new latch.
1640	InnerLatch->replacePhiUsesWith(Old: InnerLatch, New: OuterLatch);
1641	}
1642
1643	bool LoopInterchangeTransform::adjustLoopBranches() {
1644	LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
1645	std::vector<DominatorTree::UpdateType> DTUpdates;
1646
1647	BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1648	BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1649
1650	assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
1651	InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
1652	InnerLoopPreHeader && "Guaranteed by loop-simplify form");
1653	// Ensure that both preheaders do not contain PHI nodes and have single
1654	// predecessors. This allows us to move them easily. We use
1655	// InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
1656	// preheaders do not satisfy those conditions.
1657	if (isa<PHINode>(Val: OuterLoopPreHeader->begin()) \|\|
1658	!OuterLoopPreHeader->getUniquePredecessor())
1659	OuterLoopPreHeader =
1660	InsertPreheaderForLoop(L: OuterLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true);
1661	if (InnerLoopPreHeader == OuterLoop->getHeader())
1662	InnerLoopPreHeader =
1663	InsertPreheaderForLoop(L: InnerLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true);
1664
1665	// Adjust the loop preheader
1666	BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1667	BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1668	BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1669	BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1670	BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1671	BasicBlock *InnerLoopLatchPredecessor =
1672	InnerLoopLatch->getUniquePredecessor();
1673	BasicBlock *InnerLoopLatchSuccessor;
1674	BasicBlock *OuterLoopLatchSuccessor;
1675
1676	BranchInst *OuterLoopLatchBI =
1677	dyn_cast<BranchInst>(Val: OuterLoopLatch->getTerminator());
1678	BranchInst *InnerLoopLatchBI =
1679	dyn_cast<BranchInst>(Val: InnerLoopLatch->getTerminator());
1680	BranchInst *OuterLoopHeaderBI =
1681	dyn_cast<BranchInst>(Val: OuterLoopHeader->getTerminator());
1682	BranchInst *InnerLoopHeaderBI =
1683	dyn_cast<BranchInst>(Val: InnerLoopHeader->getTerminator());
1684
1685	if (!OuterLoopPredecessor \|\| !InnerLoopLatchPredecessor \|\|
1686	!OuterLoopLatchBI \|\| !InnerLoopLatchBI \|\| !OuterLoopHeaderBI \|\|
1687	!InnerLoopHeaderBI)
1688	return false;
1689
1690	BranchInst *InnerLoopLatchPredecessorBI =
1691	dyn_cast<BranchInst>(Val: InnerLoopLatchPredecessor->getTerminator());
1692	BranchInst *OuterLoopPredecessorBI =
1693	dyn_cast<BranchInst>(Val: OuterLoopPredecessor->getTerminator());
1694
1695	if (!OuterLoopPredecessorBI \|\| !InnerLoopLatchPredecessorBI)
1696	return false;
1697	BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1698	if (!InnerLoopHeaderSuccessor)
1699	return false;
1700
1701	// Adjust Loop Preheader and headers.
1702	// The branches in the outer loop predecessor and the outer loop header can
1703	// be unconditional branches or conditional branches with duplicates. Consider
1704	// this when updating the successors.
1705	updateSuccessor(BI: OuterLoopPredecessorBI, OldBB: OuterLoopPreHeader,
1706	NewBB: InnerLoopPreHeader, DTUpdates, /MustUpdateOnce=/false);
1707	// The outer loop header might or might not branch to the outer latch.
1708	// We are guaranteed to branch to the inner loop preheader.
1709	if (llvm::is_contained(Range: OuterLoopHeaderBI->successors(), Element: OuterLoopLatch)) {
1710	// In this case the outerLoopHeader should branch to the InnerLoopLatch.
1711	updateSuccessor(BI: OuterLoopHeaderBI, OldBB: OuterLoopLatch, NewBB: InnerLoopLatch,
1712	DTUpdates,
1713	/MustUpdateOnce=/false);
1714	}
1715	updateSuccessor(BI: OuterLoopHeaderBI, OldBB: InnerLoopPreHeader,
1716	NewBB: InnerLoopHeaderSuccessor, DTUpdates,
1717	/MustUpdateOnce=/false);
1718
1719	// Adjust reduction PHI's now that the incoming block has changed.
1720	InnerLoopHeaderSuccessor->replacePhiUsesWith(Old: InnerLoopHeader,
1721	New: OuterLoopHeader);
1722
1723	updateSuccessor(BI: InnerLoopHeaderBI, OldBB: InnerLoopHeaderSuccessor,
1724	NewBB: OuterLoopPreHeader, DTUpdates);
1725
1726	// -------------Adjust loop latches-----------
1727	if (InnerLoopLatchBI->getSuccessor(i: `0`) == InnerLoopHeader)
1728	InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: `1`);
1729	else
1730	InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: `0`);
1731
1732	updateSuccessor(BI: InnerLoopLatchPredecessorBI, OldBB: InnerLoopLatch,
1733	NewBB: InnerLoopLatchSuccessor, DTUpdates);
1734
1735	if (OuterLoopLatchBI->getSuccessor(i: `0`) == OuterLoopHeader)
1736	OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: `1`);
1737	else
1738	OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: `0`);
1739
1740	updateSuccessor(BI: InnerLoopLatchBI, OldBB: InnerLoopLatchSuccessor,
1741	NewBB: OuterLoopLatchSuccessor, DTUpdates);
1742	updateSuccessor(BI: OuterLoopLatchBI, OldBB: OuterLoopLatchSuccessor, NewBB: InnerLoopLatch,
1743	DTUpdates);
1744
1745	DT->applyUpdates(Updates: DTUpdates);
1746	restructureLoops(NewInner: OuterLoop, NewOuter: InnerLoop, OrigInnerPreHeader: InnerLoopPreHeader,
1747	OrigOuterPreHeader: OuterLoopPreHeader);
1748
1749	moveLCSSAPhis(InnerExit: InnerLoopLatchSuccessor, InnerHeader: InnerLoopHeader, InnerLatch: InnerLoopLatch,
1750	OuterHeader: OuterLoopHeader, OuterLatch: OuterLoopLatch, OuterExit: InnerLoop->getExitBlock(),
1751	InnerLoop, LI);
1752	// For PHIs in the exit block of the outer loop, outer's latch has been
1753	// replaced by Inners'.
1754	OuterLoopLatchSuccessor->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch);
1755
1756	auto &OuterInnerReductions = LIL.getOuterInnerReductions();
1757	// Now update the reduction PHIs in the inner and outer loop headers.
1758	SmallVector<PHINode *, `4`> InnerLoopPHIs, OuterLoopPHIs;
1759	for (PHINode &PHI : InnerLoopHeader->phis())
1760	if (OuterInnerReductions.contains(Ptr: &PHI))
1761	InnerLoopPHIs.push_back(Elt: &PHI);
1762
1763	for (PHINode &PHI : OuterLoopHeader->phis())
1764	if (OuterInnerReductions.contains(Ptr: &PHI))
1765	OuterLoopPHIs.push_back(Elt: &PHI);
1766
1767	// Now move the remaining reduction PHIs from outer to inner loop header and
1768	// vice versa. The PHI nodes must be part of a reduction across the inner and
1769	// outer loop and all the remains to do is and updating the incoming blocks.
1770	for (PHINode *PHI : OuterLoopPHIs) {
1771	LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
1772	PHI->moveBefore(InsertPos: InnerLoopHeader->getFirstNonPHIIt());
1773	assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1774	}
1775	for (PHINode *PHI : InnerLoopPHIs) {
1776	LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
1777	PHI->moveBefore(InsertPos: OuterLoopHeader->getFirstNonPHIIt());
1778	assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1779	}
1780
1781	// Update the incoming blocks for moved PHI nodes.
1782	OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopPreHeader, New: OuterLoopPreHeader);
1783	OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopLatch, New: OuterLoopLatch);
1784	InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopPreHeader, New: InnerLoopPreHeader);
1785	InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch);
1786
1787	// Values defined in the outer loop header could be used in the inner loop
1788	// latch. In that case, we need to create LCSSA phis for them, because after
1789	// interchanging they will be defined in the new inner loop and used in the
1790	// new outer loop.
1791	SmallVector<Instruction *, `4`> MayNeedLCSSAPhis;
1792	for (Instruction &I :
1793	make_range(x: OuterLoopHeader->begin(), y: std::prev(x: OuterLoopHeader->end())))
1794	MayNeedLCSSAPhis.push_back(Elt: &I);
1795	formLCSSAForInstructions(Worklist&: MayNeedLCSSAPhis, DT: DT, LI: LI, SE);
1796
1797	return true;
1798	}
1799
1800	bool LoopInterchangeTransform::adjustLoopLinks() {
1801	// Adjust all branches in the inner and outer loop.
1802	bool Changed = adjustLoopBranches();
1803	if (Changed) {
1804	// We have interchanged the preheaders so we need to interchange the data in
1805	// the preheaders as well. This is because the content of the inner
1806	// preheader was previously executed inside the outer loop.
1807	BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1808	BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1809	swapBBContents(BB1: OuterLoopPreHeader, BB2: InnerLoopPreHeader);
1810	}
1811	return Changed;
1812	}
1813
1814	PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
1815	LoopAnalysisManager &AM,
1816	LoopStandardAnalysisResults &AR,
1817	LPMUpdater &U) {
1818	Function &F = *LN.getParent();
1819	SmallVector<Loop *, `8`> LoopList(LN.getLoops());
1820
1821	if (MaxMemInstrCount < `1`) {
1822	LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
1823	return PreservedAnalyses::all();
1824	}
1825	OptimizationRemarkEmitter ORE(&F);
1826
1827	// Ensure minimum depth of the loop nest to do the interchange.
1828	if (!hasSupportedLoopDepth(LoopList, ORE))
1829	return PreservedAnalyses::all();
1830	// Ensure computable loop nest.
1831	if (!isComputableLoopNest(SE: &AR.SE, LoopList)) {
1832	LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
1833	return PreservedAnalyses::all();
1834	}
1835
1836	ORE.emit(RemarkBuilder: [&]() {
1837	return OptimizationRemarkAnalysis (DEBUG_TYPE, "Dependence",
1838	LN.getOutermostLoop().getStartLoc(),
1839	LN.getOutermostLoop().getHeader())
1840	<< "Computed dependence info, invoking the transform.";
1841	});
1842
1843	DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
1844	std::unique_ptr<CacheCost> CC =
1845	CacheCost::getCacheCost(Root&: LN.getOutermostLoop(), AR, DI);
1846
1847	if (!LoopInterchange (&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
1848	return PreservedAnalyses::all();
1849	U.markLoopNestChanged(Changed: true);
1850	return getLoopPassPreservedAnalyses();
1851	}
1852

Browse the source code of llvm_projects/llvm/lib/Transforms/Scalar/LoopInterchange.cpp