LoopFuse.cpp source code [llvm_projects/llvm/lib/Transforms/Scalar/LoopFuse.cpp]

1	//===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	///
9	/// \file
10	/// This file implements the loop fusion pass.
11	/// The implementation is largely based on the following document:
12	///
13	/// Code Transformations to Augment the Scope of Loop Fusion in a
14	/// Production Compiler
15	/// Christopher Mark Barton
16	/// MSc Thesis
17	/// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
18	///
19	/// The general approach taken is to collect sets of control flow equivalent
20	/// loops and test whether they can be fused. The necessary conditions for
21	/// fusion are:
22	/// 1. The loops must be adjacent (there cannot be any statements between
23	/// the two loops).
24	/// 2. The loops must be conforming (they must execute the same number of
25	/// iterations).
26	/// 3. The loops must be control flow equivalent (if one loop executes, the
27	/// other is guaranteed to execute).
28	/// 4. There cannot be any negative distance dependencies between the loops.
29	/// If all of these conditions are satisfied, it is safe to fuse the loops.
30	///
31	/// This implementation creates FusionCandidates that represent the loop and the
32	/// necessary information needed by fusion. It then operates on the fusion
33	/// candidates, first confirming that the candidate is eligible for fusion. The
34	/// candidates are then collected into control flow equivalent sets, sorted in
35	/// dominance order. Each set of control flow equivalent candidates is then
36	/// traversed, attempting to fuse pairs of candidates in the set. If all
37	/// requirements for fusion are met, the two candidates are fused, creating a
38	/// new (fused) candidate which is then added back into the set to consider for
39	/// additional fusion.
40	///
41	/// This implementation currently does not make any modifications to remove
42	/// conditions for fusion. Code transformations to make loops conform to each of
43	/// the conditions for fusion are discussed in more detail in the document
44	/// above. These can be added to the current implementation in the future.
45	//===----------------------------------------------------------------------===//
46
47	#include "llvm/Transforms/Scalar/LoopFuse.h"
48	#include "llvm/ADT/Statistic.h"
49	#include "llvm/Analysis/AssumptionCache.h"
50	#include "llvm/Analysis/DependenceAnalysis.h"
51	#include "llvm/Analysis/DomTreeUpdater.h"
52	#include "llvm/Analysis/LoopInfo.h"
53	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
54	#include "llvm/Analysis/PostDominators.h"
55	#include "llvm/Analysis/ScalarEvolution.h"
56	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
57	#include "llvm/Analysis/TargetTransformInfo.h"
58	#include "llvm/IR/Function.h"
59	#include "llvm/IR/Verifier.h"
60	#include "llvm/Support/CommandLine.h"
61	#include "llvm/Support/Debug.h"
62	#include "llvm/Support/raw_ostream.h"
63	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
64	#include "llvm/Transforms/Utils/CodeMoverUtils.h"
65	#include "llvm/Transforms/Utils/LoopPeel.h"
66	#include "llvm/Transforms/Utils/LoopSimplify.h"
67
68	using namespace llvm;
69
70	#define DEBUG_TYPE "loop-fusion"
71
72	STATISTIC(FuseCounter, "Loops fused");
73	STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
74	STATISTIC(InvalidPreheader, "Loop has invalid preheader");
75	STATISTIC(InvalidHeader, "Loop has invalid header");
76	STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks");
77	STATISTIC(InvalidExitBlock, "Loop has invalid exit block");
78	STATISTIC(InvalidLatch, "Loop has invalid latch");
79	STATISTIC(InvalidLoop, "Loop is invalid");
80	STATISTIC(AddressTakenBB, "Basic block has address taken");
81	STATISTIC(MayThrowException, "Loop may throw an exception");
82	STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
83	STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
84	STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
85	STATISTIC(UnknownTripCount, "Loop has unknown trip count");
86	STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
87	STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
88	STATISTIC(NonAdjacent, "Loops are not adjacent");
89	STATISTIC(
90	NonEmptyPreheader,
91	"Loop has a non-empty preheader with instructions that cannot be moved");
92	STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
93	STATISTIC(NonIdenticalGuards, "Candidates have different guards");
94	STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with "
95	"instructions that cannot be moved");
96	STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "
97	"instructions that cannot be moved");
98	STATISTIC(NotRotated, "Candidate is not rotated");
99	STATISTIC(OnlySecondCandidateIsGuarded,
100	"The second candidate is guarded while the first one is not");
101	STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions.");
102	STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions.");
103
104	enum FusionDependenceAnalysisChoice {
105	FUSION_DEPENDENCE_ANALYSIS_SCEV,
106	FUSION_DEPENDENCE_ANALYSIS_DA,
107	FUSION_DEPENDENCE_ANALYSIS_ALL,
108	};
109
110	static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis(
111	"loop-fusion-dependence-analysis",
112	cl::desc ("Which dependence analysis should loop fusion use?"),
113	cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev",
114	"Use the scalar evolution interface"),
115	clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da",
116	"Use the dependence analysis interface"),
117	clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all",
118	"Use all available analyses")),
119	cl::Hidden, cl::init(Val: FUSION_DEPENDENCE_ANALYSIS_ALL));
120
121	static cl::opt<unsigned> FusionPeelMaxCount(
122	"loop-fusion-peel-max-count", cl::init(Val: `0`), cl::Hidden,
123	cl::desc ("Max number of iterations to be peeled from a loop, such that "
124	"fusion can take place"));
125
126	#ifndef NDEBUG
127	static cl::opt<bool>
128	VerboseFusionDebugging("loop-fusion-verbose-debug",
129	cl::desc("Enable verbose debugging for Loop Fusion"),
130	cl::Hidden, cl::init(false));
131	#endif
132
133	namespace {
134	/// This class is used to represent a candidate for loop fusion. When it is
135	/// constructed, it checks the conditions for loop fusion to ensure that it
136	/// represents a valid candidate. It caches several parts of a loop that are
137	/// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead
138	/// of continually querying the underlying Loop to retrieve these values. It is
139	/// assumed these will not change throughout loop fusion.
140	///
141	/// The invalidate method should be used to indicate that the FusionCandidate is
142	/// no longer a valid candidate for fusion. Similarly, the isValid() method can
143	/// be used to ensure that the FusionCandidate is still valid for fusion.
144	struct FusionCandidate {
145	/// Cache of parts of the loop used throughout loop fusion. These should not
146	/// need to change throughout the analysis and transformation.
147	/// These parts are cached to avoid repeatedly looking up in the Loop class.
148
149	/// Preheader of the loop this candidate represents
150	BasicBlock *Preheader;
151	/// Header of the loop this candidate represents
152	BasicBlock *Header;
153	/// Blocks in the loop that exit the loop
154	BasicBlock *ExitingBlock;
155	/// The successor block of this loop (where the exiting blocks go to)
156	BasicBlock *ExitBlock;
157	/// Latch of the loop
158	BasicBlock *Latch;
159	/// The loop that this fusion candidate represents
160	Loop *L;
161	/// Vector of instructions in this loop that read from memory
162	SmallVector<Instruction *, `16`> MemReads;
163	/// Vector of instructions in this loop that write to memory
164	SmallVector<Instruction *, `16`> MemWrites;
165	/// Are all of the members of this fusion candidate still valid
166	bool Valid;
167	/// Guard branch of the loop, if it exists
168	BranchInst *GuardBranch;
169	/// Peeling Paramaters of the Loop.
170	TTI::PeelingPreferences PP;
171	/// Can you Peel this Loop?
172	bool AbleToPeel;
173	/// Has this loop been Peeled
174	bool Peeled;
175
176	/// Dominator and PostDominator trees are needed for the
177	/// FusionCandidateCompare function, required by FusionCandidateSet to
178	/// determine where the FusionCandidate should be inserted into the set. These
179	/// are used to establish ordering of the FusionCandidates based on dominance.
180	DominatorTree &DT;
181	const PostDominatorTree *PDT;
182
183	OptimizationRemarkEmitter &ORE;
184
185	FusionCandidate(Loop L, DominatorTree &DT, const* PostDominatorTree *PDT,
186	OptimizationRemarkEmitter &ORE, TTI::PeelingPreferences PP)
187	: Preheader(L->getLoopPreheader()), Header(L->getHeader()),
188	ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
189	Latch(L->getLoopLatch()), L(L), Valid(true),
190	GuardBranch(L->getLoopGuardBranch()), PP (PP), AbleToPeel(canPeel(L)),
191	Peeled(false), DT(DT), PDT(PDT), ORE(ORE) {
192
193	// Walk over all blocks in the loop and check for conditions that may
194	// prevent fusion. For each block, walk over all instructions and collect
195	// the memory reads and writes If any instructions that prevent fusion are
196	// found, invalidate this object and return.
197	for (BasicBlock *BB : L->blocks()) {
198	if (BB->hasAddressTaken()) {
199	invalidate();
200	reportInvalidCandidate(Stat&: AddressTakenBB);
201	return;
202	}
203
204	for (Instruction &I : *BB) {
205	if (I.mayThrow()) {
206	invalidate();
207	reportInvalidCandidate(Stat&: MayThrowException);
208	return;
209	}
210	if (StoreInst *SI = dyn_cast<StoreInst>(Val: &I)) {
211	if (SI->isVolatile()) {
212	invalidate();
213	reportInvalidCandidate(Stat&: ContainsVolatileAccess);
214	return;
215	}
216	}
217	if (LoadInst *LI = dyn_cast<LoadInst>(Val: &I)) {
218	if (LI->isVolatile()) {
219	invalidate();
220	reportInvalidCandidate(Stat&: ContainsVolatileAccess);
221	return;
222	}
223	}
224	if (I.mayWriteToMemory())
225	MemWrites.push_back(Elt: &I);
226	if (I.mayReadFromMemory())
227	MemReads.push_back(Elt: &I);
228	}
229	}
230	}
231
232	/// Check if all members of the class are valid.
233	bool isValid() const {
234	return Preheader && Header && ExitingBlock && ExitBlock && Latch && L &&
235	!L->isInvalid() && Valid;
236	}
237
238	/// Verify that all members are in sync with the Loop object.
239	void verify() const {
240	assert(isValid() && "Candidate is not valid!!");
241	assert(!L->isInvalid() && "Loop is invalid!");
242	assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync");
243	assert(Header == L->getHeader() && "Header is out of sync");
244	assert(ExitingBlock == L->getExitingBlock() &&
245	"Exiting Blocks is out of sync");
246	assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync");
247	assert(Latch == L->getLoopLatch() && "Latch is out of sync");
248	}
249
250	/// Get the entry block for this fusion candidate.
251	///
252	/// If this fusion candidate represents a guarded loop, the entry block is the
253	/// loop guard block. If it represents an unguarded loop, the entry block is
254	/// the preheader of the loop.
255	BasicBlock getEntryBlock() const* {
256	if (GuardBranch)
257	return GuardBranch->getParent();
258	else
259	return Preheader;
260	}
261
262	/// After Peeling the loop is modified quite a bit, hence all of the Blocks
263	/// need to be updated accordingly.
264	void updateAfterPeeling() {
265	Preheader = L->getLoopPreheader();
266	Header = L->getHeader();
267	ExitingBlock = L->getExitingBlock();
268	ExitBlock = L->getExitBlock();
269	Latch = L->getLoopLatch();
270	verify();
271	}
272
273	/// Given a guarded loop, get the successor of the guard that is not in the
274	/// loop.
275	///
276	/// This method returns the successor of the loop guard that is not located
277	/// within the loop (i.e., the successor of the guard that is not the
278	/// preheader).
279	/// This method is only valid for guarded loops.
280	BasicBlock getNonLoopBlock() const* {
281	assert(GuardBranch && "Only valid on guarded loops.");
282	assert(GuardBranch->isConditional() &&
283	"Expecting guard to be a conditional branch.");
284	if (Peeled)
285	return GuardBranch->getSuccessor(i: `1`);
286	return (GuardBranch->getSuccessor(i: `0`) == Preheader)
287	? GuardBranch->getSuccessor(i: `1`)
288	: GuardBranch->getSuccessor(i: `0`);
289	}
290
291	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
292	LLVM_DUMP_METHOD void dump() const {
293	dbgs() << "\tGuardBranch: ";
294	if (GuardBranch)
295	dbgs() << *GuardBranch;
296	else
297	dbgs() << "nullptr";
298	dbgs() << "\n"
299	<< (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
300	<< "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
301	<< "\n"
302	<< "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
303	<< "\tExitingBB: "
304	<< (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
305	<< "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
306	<< "\n"
307	<< "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
308	<< "\tEntryBlock: "
309	<< (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
310	<< "\n";
311	}
312	#endif
313
314	/// Determine if a fusion candidate (representing a loop) is eligible for
315	/// fusion. Note that this only checks whether a single loop can be fused - it
316	/// does not check whether it is legal* to fuse two loops together.*
317	bool isEligibleForFusion(ScalarEvolution &SE) const {
318	if (!isValid()) {
319	LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
320	if (!Preheader)
321	++InvalidPreheader;
322	if (!Header)
323	++InvalidHeader;
324	if (!ExitingBlock)
325	++InvalidExitingBlock;
326	if (!ExitBlock)
327	++InvalidExitBlock;
328	if (!Latch)
329	++InvalidLatch;
330	if (L->isInvalid())
331	++InvalidLoop;
332
333	return false;
334	}
335
336	// Require ScalarEvolution to be able to determine a trip count.
337	if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
338	LLVM_DEBUG(dbgs() << "Loop " << L->getName()
339	<< " trip count not computable!\n");
340	return reportInvalidCandidate(Stat&: UnknownTripCount);
341	}
342
343	if (!L->isLoopSimplifyForm()) {
344	LLVM_DEBUG(dbgs() << "Loop " << L->getName()
345	<< " is not in simplified form!\n");
346	return reportInvalidCandidate(Stat&: NotSimplifiedForm);
347	}
348
349	if (!L->isRotatedForm()) {
350	LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n");
351	return reportInvalidCandidate(Stat&: NotRotated);
352	}
353
354	return true;
355	}
356
357	private:
358	// This is only used internally for now, to clear the MemWrites and MemReads
359	// list and setting Valid to false. I can't envision other uses of this right
360	// now, since once FusionCandidates are put into the FusionCandidateSet they
361	// are immutable. Thus, any time we need to change/update a FusionCandidate,
362	// we must create a new one and insert it into the FusionCandidateSet to
363	// ensure the FusionCandidateSet remains ordered correctly.
364	void invalidate() {
365	MemWrites.clear();
366	MemReads.clear();
367	Valid = false;
368	}
369
370	bool reportInvalidCandidate(llvm::Statistic &Stat) const {
371	using namespace ore;
372	assert(L && Preheader && "Fusion candidate not initialized properly!");
373	#if LLVM_ENABLE_STATS
374	++Stat;
375	ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(),
376	L->getStartLoc(), Preheader)
377	<< "[" << Preheader->getParent()->getName() << "]: "
378	<< "Loop is not a candidate for fusion: " << Stat.getDesc());
379	#endif
380	return false;
381	}
382	};
383
384	struct FusionCandidateCompare {
385	/// Comparison functor to sort two Control Flow Equivalent fusion candidates
386	/// into dominance order.
387	/// If LHS dominates RHS and RHS post-dominates LHS, return true;
388	/// If RHS dominates LHS and LHS post-dominates RHS, return false;
389	/// If both LHS and RHS are not dominating each other then, non-strictly
390	/// post dominate check will decide the order of candidates. If RHS
391	/// non-strictly post dominates LHS then, return true. If LHS non-strictly
392	/// post dominates RHS then, return false. If both are non-strictly post
393	/// dominate each other then, level in the post dominator tree will decide
394	/// the order of candidates.
395	bool operator()(const FusionCandidate &LHS,
396	const FusionCandidate &RHS) const {
397	const DominatorTree *DT = &(LHS.DT);
398
399	BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
400	BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
401
402	// Do not save PDT to local variable as it is only used in asserts and thus
403	// will trigger an unused variable warning if building without asserts.
404	assert(DT && LHS.PDT && "Expecting valid dominator tree");
405
406	// Do this compare first so if LHS == RHS, function returns false.
407	if (DT->dominates(A: RHSEntryBlock, B: LHSEntryBlock)) {
408	// RHS dominates LHS
409	// Verify LHS post-dominates RHS
410	assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
411	return false;
412	}
413
414	if (DT->dominates(A: LHSEntryBlock, B: RHSEntryBlock)) {
415	// Verify RHS Postdominates LHS
416	assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
417	return true;
418	}
419
420	// If two FusionCandidates are in the same level of dominator tree,
421	// they will not dominate each other, but may still be control flow
422	// equivalent. To sort those FusionCandidates, nonStrictlyPostDominate()
423	// function is needed.
424	bool WrongOrder =
425	nonStrictlyPostDominate(ThisBlock: LHSEntryBlock, OtherBlock: RHSEntryBlock, DT, PDT: LHS.PDT);
426	bool RightOrder =
427	nonStrictlyPostDominate(ThisBlock: RHSEntryBlock, OtherBlock: LHSEntryBlock, DT, PDT: LHS.PDT);
428	if (WrongOrder && RightOrder) {
429	// If common predecessor of LHS and RHS post dominates both
430	// FusionCandidates then, Order of FusionCandidate can be
431	// identified by its level in post dominator tree.
432	DomTreeNode *LNode = LHS.PDT->getNode(BB: LHSEntryBlock);
433	DomTreeNode *RNode = LHS.PDT->getNode(BB: RHSEntryBlock);
434	return LNode->getLevel() > RNode->getLevel();
435	} else if (WrongOrder)
436	return false;
437	else if (RightOrder)
438	return true;
439
440	// If LHS does not non-strict Postdominate RHS and RHS does not non-strict
441	// Postdominate LHS then, there is no dominance relationship between the
442	// two FusionCandidates. Thus, they should not be in the same set together.
443	llvm_unreachable(
444	"No dominance relationship between these fusion candidates!");
445	}
446	};
447
448	using LoopVector = SmallVector<Loop *, `4`>;
449
450	// Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
451	// order. Thus, if FC0 comes before* FC1 in a FusionCandidateSet, then FC0*
452	// dominates FC1 and FC1 post-dominates FC0.
453	// std::set was chosen because we want a sorted data structure with stable
454	// iterators. A subsequent patch to loop fusion will enable fusing non-adjacent
455	// loops by moving intervening code around. When this intervening code contains
456	// loops, those loops will be moved also. The corresponding FusionCandidates
457	// will also need to be moved accordingly. As this is done, having stable
458	// iterators will simplify the logic. Similarly, having an efficient insert that
459	// keeps the FusionCandidateSet sorted will also simplify the implementation.
460	using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
461	using FusionCandidateCollection = SmallVector<FusionCandidateSet, `4`>;
462
463	#if !defined(NDEBUG)
464	static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
465	const FusionCandidate &FC) {
466	if (FC.isValid())
467	OS << FC.Preheader->getName();
468	else
469	OS << "<Invalid>";
470
471	return OS;
472	}
473
474	static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
475	const FusionCandidateSet &CandSet) {
476	for (const FusionCandidate &FC : CandSet)
477	OS << FC << `'\n'`;
478
479	return OS;
480	}
481
482	static void
483	printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
484	dbgs() << "Fusion Candidates: \n";
485	for (const auto &CandidateSet : FusionCandidates) {
486	dbgs() << "* Fusion Candidate Set *\n";
487	dbgs() << CandidateSet;
488	dbgs() << "****************************\n";
489	}
490	}
491	#endif
492
493	/// Collect all loops in function at the same nest level, starting at the
494	/// outermost level.
495	///
496	/// This data structure collects all loops at the same nest level for a
497	/// given function (specified by the LoopInfo object). It starts at the
498	/// outermost level.
499	struct LoopDepthTree {
500	using LoopsOnLevelTy = SmallVector<LoopVector, `4`>;
501	using iterator = LoopsOnLevelTy::iterator;
502	using const_iterator = LoopsOnLevelTy::const_iterator;
503
504	LoopDepthTree(LoopInfo &LI) : Depth(`1`) {
505	if (!LI.empty())
506	LoopsOnLevel.emplace_back(Args: LoopVector (LI.rbegin(), LI.rend()));
507	}
508
509	/// Test whether a given loop has been removed from the function, and thus is
510	/// no longer valid.
511	bool isRemovedLoop(const Loop L) const* { return RemovedLoops.count(Ptr: L); }
512
513	/// Record that a given loop has been removed from the function and is no
514	/// longer valid.
515	void removeLoop(const Loop *L) { RemovedLoops.insert(Ptr: L); }
516
517	/// Descend the tree to the next (inner) nesting level
518	void descend() {
519	LoopsOnLevelTy LoopsOnNextLevel;
520
521	for (const LoopVector &LV : *this)
522	for (Loop *L : LV)
523	if (!isRemovedLoop(L) && L->begin() != L->end())
524	LoopsOnNextLevel.emplace_back(Args: LoopVector (L->begin(), L->end()));
525
526	LoopsOnLevel = LoopsOnNextLevel;
527	RemovedLoops.clear();
528	Depth++;
529	}
530
531	bool empty() const { return size() == `0`; }
532	size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); }
533	unsigned getDepth() const { return Depth; }
534
535	iterator begin() { return LoopsOnLevel.begin(); }
536	iterator end() { return LoopsOnLevel.end(); }
537	const_iterator begin() const { return LoopsOnLevel.begin(); }
538	const_iterator end() const { return LoopsOnLevel.end(); }
539
540	private:
541	/// Set of loops that have been removed from the function and are no longer
542	/// valid.
543	SmallPtrSet<const Loop *, `8`> RemovedLoops;
544
545	/// Depth of the current level, starting at 1 (outermost loops).
546	unsigned Depth;
547
548	/// Vector of loops at the current depth level that have the same parent loop
549	LoopsOnLevelTy LoopsOnLevel;
550	};
551
552	#ifndef NDEBUG
553	static void printLoopVector(const LoopVector &LV) {
554	dbgs() << "****************************\n";
555	for (auto *L : LV)
556	printLoop(*L, dbgs());
557	dbgs() << "****************************\n";
558	}
559	#endif
560
561	struct LoopFuser {
562	private:
563	// Sets of control flow equivalent fusion candidates for a given nest level.
564	FusionCandidateCollection FusionCandidates;
565
566	LoopDepthTree LDT;
567	DomTreeUpdater DTU;
568
569	LoopInfo &LI;
570	DominatorTree &DT;
571	DependenceInfo &DI;
572	ScalarEvolution &SE;
573	PostDominatorTree &PDT;
574	OptimizationRemarkEmitter &ORE;
575	AssumptionCache &AC;
576	const TargetTransformInfo &TTI;
577
578	public:
579	LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
580	ScalarEvolution &SE, PostDominatorTree &PDT,
581	OptimizationRemarkEmitter &ORE, const DataLayout &DL,
582	AssumptionCache &AC, const TargetTransformInfo &TTI)
583	: LDT (LI), DTU (DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
584	DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}
585
586	/// This is the main entry point for loop fusion. It will traverse the
587	/// specified function and collect candidate loops to fuse, starting at the
588	/// outermost nesting level and working inwards.
589	bool fuseLoops(Function &F) {
590	#ifndef NDEBUG
591	if (VerboseFusionDebugging) {
592	LI.print(dbgs());
593	}
594	#endif
595
596	LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName()
597	<< "\n");
598	bool Changed = false;
599
600	while (!LDT.empty()) {
601	LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth "
602	<< LDT.getDepth() << "\n";);
603
604	for (const LoopVector &LV : LDT) {
605	assert(LV.size() > `0` && "Empty loop set was build!");
606
607	// Skip singleton loop sets as they do not offer fusion opportunities on
608	// this level.
609	if (LV.size() == `1`)
610	continue;
611	#ifndef NDEBUG
612	if (VerboseFusionDebugging) {
613	LLVM_DEBUG({
614	dbgs() << " Visit loop set (#" << LV.size() << "):\n";
615	printLoopVector(LV);
616	});
617	}
618	#endif
619
620	collectFusionCandidates(LV);
621	Changed \|= fuseCandidates();
622	}
623
624	// Finished analyzing candidates at this level.
625	// Descend to the next level and clear all of the candidates currently
626	// collected. Note that it will not be possible to fuse any of the
627	// existing candidates with new candidates because the new candidates will
628	// be at a different nest level and thus not be control flow equivalent
629	// with all of the candidates collected so far.
630	LLVM_DEBUG(dbgs() << "Descend one level!\n");
631	LDT.descend();
632	FusionCandidates.clear();
633	}
634
635	if (Changed)
636	LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump(););
637
638	#ifndef NDEBUG
639	assert(DT.verify());
640	assert(PDT.verify());
641	LI.verify(DT);
642	SE.verify();
643	#endif
644
645	LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");
646	return Changed;
647	}
648
649	private:
650	/// Determine if two fusion candidates are control flow equivalent.
651	///
652	/// Two fusion candidates are control flow equivalent if when one executes,
653	/// the other is guaranteed to execute. This is determined using dominators
654	/// and post-dominators: if A dominates B and B post-dominates A then A and B
655	/// are control-flow equivalent.
656	bool isControlFlowEquivalent(const FusionCandidate &FC0,
657	const FusionCandidate &FC1) const {
658	assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
659
660	return ::isControlFlowEquivalent(BB0: FC0.getEntryBlock(), BB1: FC1.getEntryBlock(),
661	DT, PDT);
662	}
663
664	/// Iterate over all loops in the given loop set and identify the loops that
665	/// are eligible for fusion. Place all eligible fusion candidates into Control
666	/// Flow Equivalent sets, sorted by dominance.
667	void collectFusionCandidates(const LoopVector &LV) {
668	for (Loop *L : LV) {
669	TTI::PeelingPreferences PP =
670	gatherPeelingPreferences(L, SE, TTI, UserAllowPeeling: std::nullopt, UserAllowProfileBasedPeeling: std::nullopt);
671	FusionCandidate CurrCand(L, DT, &PDT, ORE, PP);
672	if (!CurrCand.isEligibleForFusion(SE))
673	continue;
674
675	// Go through each list in FusionCandidates and determine if L is control
676	// flow equivalent with the first loop in that list. If it is, append LV.
677	// If not, go to the next list.
678	// If no suitable list is found, start another list and add it to
679	// FusionCandidates.
680	bool FoundSet = false;
681
682	for (auto &CurrCandSet : FusionCandidates) {
683	if (isControlFlowEquivalent(FC0: *CurrCandSet.begin(), FC1: CurrCand)) {
684	CurrCandSet.insert(x: CurrCand);
685	FoundSet = true;
686	#ifndef NDEBUG
687	if (VerboseFusionDebugging)
688	LLVM_DEBUG(dbgs() << "Adding " << CurrCand
689	<< " to existing candidate set\n");
690	#endif
691	break;
692	}
693	}
694	if (!FoundSet) {
695	// No set was found. Create a new set and add to FusionCandidates
696	#ifndef NDEBUG
697	if (VerboseFusionDebugging)
698	LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n");
699	#endif
700	FusionCandidateSet NewCandSet;
701	NewCandSet.insert(x: CurrCand);
702	FusionCandidates.push_back(Elt: NewCandSet);
703	}
704	NumFusionCandidates ++;
705	}
706	}
707
708	/// Determine if it is beneficial to fuse two loops.
709	///
710	/// For now, this method simply returns true because we want to fuse as much
711	/// as possible (primarily to test the pass). This method will evolve, over
712	/// time, to add heuristics for profitability of fusion.
713	bool isBeneficialFusion(const FusionCandidate &FC0,
714	const FusionCandidate &FC1) {
715	return true;
716	}
717
718	/// Determine if two fusion candidates have the same trip count (i.e., they
719	/// execute the same number of iterations).
720	///
721	/// This function will return a pair of values. The first is a boolean,
722	/// stating whether or not the two candidates are known at compile time to
723	/// have the same TripCount. The second is the difference in the two
724	/// TripCounts. This information can be used later to determine whether or not
725	/// peeling can be performed on either one of the candidates.
726	std::pair<bool, std::optional<unsigned>>
727	haveIdenticalTripCounts(const FusionCandidate &FC0,
728	const FusionCandidate &FC1) const {
729	const SCEV *TripCount0 = SE.getBackedgeTakenCount(L: FC0.L);
730	if (isa<SCEVCouldNotCompute>(Val: TripCount0)) {
731	UncomputableTripCount ++;
732	LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
733	return {false, std::nullopt};
734	}
735
736	const SCEV *TripCount1 = SE.getBackedgeTakenCount(L: FC1.L);
737	if (isa<SCEVCouldNotCompute>(Val: TripCount1)) {
738	UncomputableTripCount ++;
739	LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
740	return {false, std::nullopt};
741	}
742
743	LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
744	<< *TripCount1 << " are "
745	<< (TripCount0 == TripCount1 ? "identical" : "different")
746	<< "\n");
747
748	if (TripCount0 == TripCount1)
749	return {true, `0`};
750
751	LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, "
752	"determining the difference between trip counts\n");
753
754	// Currently only considering loops with a single exit point
755	// and a non-constant trip count.
756	const unsigned TC0 = SE.getSmallConstantTripCount(L: FC0.L);
757	const unsigned TC1 = SE.getSmallConstantTripCount(L: FC1.L);
758
759	// If any of the tripcounts are zero that means that loop(s) do not have
760	// a single exit or a constant tripcount.
761	if (TC0 == `0` \|\| TC1 == `0`) {
762	LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not "
763	"have a constant number of iterations. Peeling "
764	"is not benefical\n");
765	return {false, std::nullopt};
766	}
767
768	std::optional<unsigned> Difference;
769	int Diff = TC0 - TC1;
770
771	if (Diff > `0`)
772	Difference = Diff;
773	else {
774	LLVM_DEBUG(
775	dbgs() << "Difference is less than 0. FC1 (second loop) has more "
776	"iterations than the first one. Currently not supported\n");
777	}
778
779	LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference
780	<< "\n");
781
782	return {false, Difference};
783	}
784
785	void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1,
786	unsigned PeelCount) {
787	assert(FC0.AbleToPeel && "Should be able to peel loop");
788
789	LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount
790	<< " iterations of the first loop. \n");
791
792	ValueToValueMapTy VMap;
793	FC0.Peeled =
794	peelLoop(L: FC0.L, PeelCount, PeelLast: false, LI: &LI, SE: &SE, DT, AC: &AC, PreserveLCSSA: true, VMap);
795	if (FC0.Peeled) {
796	LLVM_DEBUG(dbgs() << "Done Peeling\n");
797
798	#ifndef NDEBUG
799	auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1);
800
801	assert(IdenticalTripCount.first && *IdenticalTripCount.second == `0` &&
802	"Loops should have identical trip counts after peeling");
803	#endif
804
805	FC0.PP.PeelCount += PeelCount;
806
807	// Peeling does not update the PDT
808	PDT.recalculate(Func&: *FC0.Preheader->getParent());
809
810	FC0.updateAfterPeeling();
811
812	// In this case the iterations of the loop are constant, so the first
813	// loop will execute completely (will not jump from one of
814	// the peeled blocks to the second loop). Here we are updating the
815	// branch conditions of each of the peeled blocks, such that it will
816	// branch to its successor which is not the preheader of the second loop
817	// in the case of unguarded loops, or the succesors of the exit block of
818	// the first loop otherwise. Doing this update will ensure that the entry
819	// block of the first loop dominates the entry block of the second loop.
820	BasicBlock *BB =
821	FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader;
822	if (BB) {
823	SmallVector<DominatorTree::UpdateType, `8`> TreeUpdates;
824	SmallVector<Instruction *, `8`> WorkList;
825	for (BasicBlock *Pred : predecessors(BB)) {
826	if (Pred != FC0.ExitBlock) {
827	WorkList.emplace_back(Args: Pred->getTerminator());
828	TreeUpdates.emplace_back(
829	Args: DominatorTree::UpdateType (DominatorTree::Delete, Pred, BB));
830	}
831	}
832	// Cannot modify the predecessors inside the above loop as it will cause
833	// the iterators to be nullptrs, causing memory errors.
834	for (Instruction *CurrentBranch : WorkList) {
835	BasicBlock *Succ = CurrentBranch->getSuccessor(Idx: `0`);
836	if (Succ == BB)
837	Succ = CurrentBranch->getSuccessor(Idx: `1`);
838	ReplaceInstWithInst(From: CurrentBranch, To: BranchInst::Create(IfTrue: Succ));
839	}
840
841	DTU.applyUpdates(Updates: TreeUpdates);
842	DTU.flush();
843	}
844	LLVM_DEBUG(
845	dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount
846	<< " iterations from the first loop.\n"
847	"Both Loops have the same number of iterations now.\n");
848	}
849	}
850
851	/// Walk each set of control flow equivalent fusion candidates and attempt to
852	/// fuse them. This does a single linear traversal of all candidates in the
853	/// set. The conditions for legal fusion are checked at this point. If a pair
854	/// of fusion candidates passes all legality checks, they are fused together
855	/// and a new fusion candidate is created and added to the FusionCandidateSet.
856	/// The original fusion candidates are then removed, as they are no longer
857	/// valid.
858	bool fuseCandidates() {
859	bool Fused = false;
860	LLVM_DEBUG(printFusionCandidates(FusionCandidates));
861	for (auto &CandidateSet : FusionCandidates) {
862	if (CandidateSet.size() < `2`)
863	continue;
864
865	LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"
866	<< CandidateSet << "\n");
867
868	for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) {
869	assert(!LDT.isRemovedLoop(FC0->L) &&
870	"Should not have removed loops in CandidateSet!");
871	auto FC1 = FC0;
872	for (++FC1; FC1 != CandidateSet.end(); ++FC1) {
873	assert(!LDT.isRemovedLoop(FC1->L) &&
874	"Should not have removed loops in CandidateSet!");
875
876	LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump();
877	dbgs() << " with\n"; FC1->dump(); dbgs() << "\n");
878
879	FC0 ->verify();
880	FC1 ->verify();
881
882	// Check if the candidates have identical tripcounts (first value of
883	// pair), and if not check the difference in the tripcounts between
884	// the loops (second value of pair). The difference is not equal to
885	// std::nullopt iff the loops iterate a constant number of times, and
886	// have a single exit.
887	std::pair<bool, std::optional<unsigned>> IdenticalTripCountRes =
888	haveIdenticalTripCounts(FC0: FC0, FC1: FC1);
889	bool SameTripCount = IdenticalTripCountRes.first;
890	std::optional<unsigned> TCDifference = IdenticalTripCountRes.second;
891
892	// Here we are checking that FC0 (the first loop) can be peeled, and
893	// both loops have different tripcounts.
894	if (FC0 ->AbleToPeel && !SameTripCount && TCDifference) {
895	if (*TCDifference > FusionPeelMaxCount) {
896	LLVM_DEBUG(dbgs()
897	<< "Difference in loop trip counts: " << *TCDifference
898	<< " is greater than maximum peel count specificed: "
899	<< FusionPeelMaxCount << "\n");
900	} else {
901	// Dependent on peeling being performed on the first loop, and
902	// assuming all other conditions for fusion return true.
903	SameTripCount = true;
904	}
905	}
906
907	if (!SameTripCount) {
908	LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
909	"counts. Not fusing.\n");
910	reportLoopFusion<OptimizationRemarkMissed>(FC0: FC0, FC1: FC1,
911	Stat&: NonEqualTripCount);
912	continue;
913	}
914
915	if (!isAdjacent(FC0: FC0, FC1: FC1)) {
916	LLVM_DEBUG(dbgs()
917	<< "Fusion candidates are not adjacent. Not fusing.\n");
918	reportLoopFusion<OptimizationRemarkMissed>(FC0: FC0, FC1: FC1, Stat&: NonAdjacent);
919	continue;
920	}
921
922	if ((!FC0 ->GuardBranch && FC1 ->GuardBranch) \|\|
923	(FC0 ->GuardBranch && !FC1 ->GuardBranch)) {
924	LLVM_DEBUG(dbgs() << "The one of candidate is guarded while the "
925	"another one is not. Not fusing.\n");
926	reportLoopFusion<OptimizationRemarkMissed>(
927	FC0: FC0, FC1: FC1, Stat&: OnlySecondCandidateIsGuarded);
928	continue;
929	}
930
931	// Ensure that FC0 and FC1 have identical guards.
932	// If one (or both) are not guarded, this check is not necessary.
933	if (FC0 ->GuardBranch && FC1 ->GuardBranch &&
934	!haveIdenticalGuards(FC0: FC0, FC1: FC1) && !TCDifference) {
935	LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
936	"guards. Not Fusing.\n");
937	reportLoopFusion<OptimizationRemarkMissed>(FC0: FC0, FC1: FC1,
938	Stat&: NonIdenticalGuards);
939	continue;
940	}
941
942	if (FC0 ->GuardBranch) {
943	assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
944
945	if (!isSafeToMoveBefore(BB&: *FC0 ->ExitBlock,
946	InsertPoint&: *FC1 ->ExitBlock->getFirstNonPHIOrDbg(), DT,
947	PDT: &PDT, DI: &DI)) {
948	LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
949	"instructions in exit block. Not fusing.\n");
950	reportLoopFusion<OptimizationRemarkMissed>(FC0: FC0, FC1: FC1,
951	Stat&: NonEmptyExitBlock);
952	continue;
953	}
954
955	if (!isSafeToMoveBefore(
956	BB&: *FC1 ->GuardBranch->getParent(),
957	InsertPoint&: *FC0 ->GuardBranch->getParent()->getTerminator(), DT, PDT: &PDT,
958	DI: &DI)) {
959	LLVM_DEBUG(dbgs()
960	<< "Fusion candidate contains unsafe "
961	"instructions in guard block. Not fusing.\n");
962	reportLoopFusion<OptimizationRemarkMissed>(FC0: FC0, FC1: FC1,
963	Stat&: NonEmptyGuardBlock);
964	continue;
965	}
966	}
967
968	// Check the dependencies across the loops and do not fuse if it would
969	// violate them.
970	if (!dependencesAllowFusion(FC0: FC0, FC1: FC1)) {
971	LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
972	reportLoopFusion<OptimizationRemarkMissed>(FC0: FC0, FC1: FC1,
973	Stat&: InvalidDependencies);
974	continue;
975	}
976
977	// If the second loop has instructions in the pre-header, attempt to
978	// hoist them up to the first loop's pre-header or sink them into the
979	// body of the second loop.
980	SmallVector<Instruction *, `4`> SafeToHoist;
981	SmallVector<Instruction *, `4`> SafeToSink;
982	// At this point, this is the last remaining legality check.
983	// Which means if we can make this pre-header empty, we can fuse
984	// these loops
985	if (!isEmptyPreheader(FC: *FC1)) {
986	LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "
987	"preheader.\n");
988
989	// If it is not safe to hoist/sink all instructions in the
990	// pre-header, we cannot fuse these loops.
991	if (!collectMovablePreheaderInsts(FC0: FC0, FC1: FC1, SafeToHoist,
992	SafeToSink)) {
993	LLVM_DEBUG(dbgs() << "Could not hoist/sink all instructions in "
994	"Fusion Candidate Pre-header.\n"
995	<< "Not Fusing.\n");
996	reportLoopFusion<OptimizationRemarkMissed>(FC0: FC0, FC1: FC1,
997	Stat&: NonEmptyPreheader);
998	continue;
999	}
1000	}
1001
1002	bool BeneficialToFuse = isBeneficialFusion(FC0: FC0, FC1: FC1);
1003	LLVM_DEBUG(dbgs()
1004	<< "\tFusion appears to be "
1005	<< (BeneficialToFuse ? "" : "un") << "profitable!\n");
1006	if (!BeneficialToFuse) {
1007	reportLoopFusion<OptimizationRemarkMissed>(FC0: FC0, FC1: FC1,
1008	Stat&: FusionNotBeneficial);
1009	continue;
1010	}
1011	// All analysis has completed and has determined that fusion is legal
1012	// and profitable. At this point, start transforming the code and
1013	// perform fusion.
1014
1015	// Execute the hoist/sink operations on preheader instructions
1016	movePreheaderInsts(FC0: FC0, FC1: FC1, HoistInsts&: SafeToHoist, SinkInsts&: SafeToSink);
1017
1018	LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
1019	<< *FC1 << "\n");
1020
1021	FusionCandidate FC0Copy = *FC0;
1022	// Peel the loop after determining that fusion is legal. The Loops
1023	// will still be safe to fuse after the peeling is performed.
1024	bool Peel = TCDifference && *TCDifference > `0`;
1025	if (Peel)
1026	peelFusionCandidate(FC0&: FC0Copy, FC1: FC1, PeelCount: TCDifference);
1027
1028	// Report fusion to the Optimization Remarks.
1029	// Note this needs to be done before* performFusion because*
1030	// performFusion will change the original loops, making it not
1031	// possible to identify them after fusion is complete.
1032	reportLoopFusion<OptimizationRemark>(FC0: (Peel ? FC0Copy : FC0), FC1: FC1,
1033	Stat&: FuseCounter);
1034
1035	FusionCandidate FusedCand(
1036	performFusion(FC0: (Peel ? FC0Copy : FC0), FC1: FC1), DT, &PDT, ORE,
1037	FC0Copy.PP);
1038	FusedCand.verify();
1039	assert(FusedCand.isEligibleForFusion(SE) &&
1040	"Fused candidate should be eligible for fusion!");
1041
1042	// Notify the loop-depth-tree that these loops are not valid objects
1043	LDT.removeLoop(L: FC1 ->L);
1044
1045	CandidateSet.erase(position: FC0);
1046	CandidateSet.erase(position: FC1);
1047
1048	auto InsertPos = CandidateSet.insert(x: FusedCand);
1049
1050	assert(InsertPos.second &&
1051	"Unable to insert TargetCandidate in CandidateSet!");
1052
1053	// Reset FC0 and FC1 the new (fused) candidate. Subsequent iterations
1054	// of the FC1 loop will attempt to fuse the new (fused) loop with the
1055	// remaining candidates in the current candidate set.
1056	FC0 = FC1 = InsertPos.first;
1057
1058	LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet
1059	<< "\n");
1060
1061	Fused = true;
1062	}
1063	}
1064	}
1065	return Fused;
1066	}
1067
1068	// Returns true if the instruction \p I can be hoisted to the end of the
1069	// preheader of \p FC0. \p SafeToHoist contains the instructions that are
1070	// known to be safe to hoist. The instructions encountered that cannot be
1071	// hoisted are in \p NotHoisting.
1072	// TODO: Move functionality into CodeMoverUtils
1073	bool canHoistInst(Instruction &I,
1074	const SmallVector<Instruction *, `4`> &SafeToHoist,
1075	const SmallVector<Instruction *, `4`> &NotHoisting,
1076	const FusionCandidate &FC0) const {
1077	const BasicBlock *FC0PreheaderTarget = FC0.Preheader->getSingleSuccessor();
1078	assert(FC0PreheaderTarget &&
1079	"Expected single successor for loop preheader.");
1080
1081	for (Use &Op : I.operands()) {
1082	if (auto *OpInst = dyn_cast<Instruction>(Val&: Op)) {
1083	bool OpHoisted = is_contained(Range: SafeToHoist, Element: OpInst);
1084	// Check if we have already decided to hoist this operand. In this
1085	// case, it does not dominate FC0 yet, but will after we hoist it.
1086	if (!(OpHoisted \|\| DT.dominates(Def: OpInst, BB: FC0PreheaderTarget))) {
1087	return false;
1088	}
1089	}
1090	}
1091
1092	// PHIs in FC1's header only have FC0 blocks as predecessors. PHIs
1093	// cannot be hoisted and should be sunk to the exit of the fused loop.
1094	if (isa<PHINode>(Val: I))
1095	return false;
1096
1097	// If this isn't a memory inst, hoisting is safe
1098	if (!I.mayReadOrWriteMemory())
1099	return true;
1100
1101	LLVM_DEBUG(dbgs() << "Checking if this mem inst can be hoisted.\n");
1102	for (Instruction *NotHoistedInst : NotHoisting) {
1103	if (auto D = DI.depends(Src: &I, Dst: NotHoistedInst)) {
1104	// Dependency is not read-before-write, write-before-read or
1105	// write-before-write
1106	if (D ->isFlow() \|\| D ->isAnti() \|\| D ->isOutput()) {
1107	LLVM_DEBUG(dbgs() << "Inst depends on an instruction in FC1's "
1108	"preheader that is not being hoisted.\n");
1109	return false;
1110	}
1111	}
1112	}
1113
1114	for (Instruction *ReadInst : FC0.MemReads) {
1115	if (auto D = DI.depends(Src: ReadInst, Dst: &I)) {
1116	// Dependency is not read-before-write
1117	if (D ->isAnti()) {
1118	LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC0.\n");
1119	return false;
1120	}
1121	}
1122	}
1123
1124	for (Instruction *WriteInst : FC0.MemWrites) {
1125	if (auto D = DI.depends(Src: WriteInst, Dst: &I)) {
1126	// Dependency is not write-before-read or write-before-write
1127	if (D ->isFlow() \|\| D ->isOutput()) {
1128	LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC0.\n");
1129	return false;
1130	}
1131	}
1132	}
1133	return true;
1134	}
1135
1136	// Returns true if the instruction \p I can be sunk to the top of the exit
1137	// block of \p FC1.
1138	// TODO: Move functionality into CodeMoverUtils
1139	bool canSinkInst(Instruction &I, const FusionCandidate &FC1) const {
1140	for (User *U : I.users()) {
1141	if (auto *UI{dyn_cast<Instruction>(Val: U)}) {
1142	// Cannot sink if user in loop
1143	// If FC1 has phi users of this value, we cannot sink it into FC1.
1144	if (FC1.L->contains(Inst: UI)) {
1145	// Cannot hoist or sink this instruction. No hoisting/sinking
1146	// should take place, loops should not fuse
1147	return false;
1148	}
1149	}
1150	}
1151
1152	// If this isn't a memory inst, sinking is safe
1153	if (!I.mayReadOrWriteMemory())
1154	return true;
1155
1156	for (Instruction *ReadInst : FC1.MemReads) {
1157	if (auto D = DI.depends(Src: &I, Dst: ReadInst)) {
1158	// Dependency is not write-before-read
1159	if (D ->isFlow()) {
1160	LLVM_DEBUG(dbgs() << "Inst depends on a read instruction in FC1.\n");
1161	return false;
1162	}
1163	}
1164	}
1165
1166	for (Instruction *WriteInst : FC1.MemWrites) {
1167	if (auto D = DI.depends(Src: &I, Dst: WriteInst)) {
1168	// Dependency is not write-before-write or read-before-write
1169	if (D ->isOutput() \|\| D ->isAnti()) {
1170	LLVM_DEBUG(dbgs() << "Inst depends on a write instruction in FC1.\n");
1171	return false;
1172	}
1173	}
1174	}
1175
1176	return true;
1177	}
1178
1179	/// Collect instructions in the \p FC1 Preheader that can be hoisted
1180	/// to the \p FC0 Preheader or sunk into the \p FC1 Body
1181	bool collectMovablePreheaderInsts(
1182	const FusionCandidate &FC0, const FusionCandidate &FC1,
1183	SmallVector<Instruction *, `4`> &SafeToHoist,
1184	SmallVector<Instruction , `4`> &SafeToSink) const* {
1185	BasicBlock *FC1Preheader = FC1.Preheader;
1186	// Save the instructions that are not being hoisted, so we know not to hoist
1187	// mem insts that they dominate.
1188	SmallVector<Instruction *, `4`> NotHoisting;
1189
1190	for (Instruction &I : *FC1Preheader) {
1191	// Can't move a branch
1192	if (&I == FC1Preheader->getTerminator())
1193	continue;
1194	// If the instruction has side-effects, give up.
1195	// TODO: The case of mayReadFromMemory we can handle but requires
1196	// additional work with a dependence analysis so for now we give
1197	// up on memory reads.
1198	if (I.mayThrow() \|\| !I.willReturn()) {
1199	LLVM_DEBUG(dbgs() << "Inst: " << I << " may throw or won't return.\n");
1200	return false;
1201	}
1202
1203	LLVM_DEBUG(dbgs() << "Checking Inst: " << I << "\n");
1204
1205	if (I.isAtomic() \|\| I.isVolatile()) {
1206	LLVM_DEBUG(
1207	dbgs() << "\tInstruction is volatile or atomic. Cannot move it.\n");
1208	return false;
1209	}
1210
1211	if (canHoistInst(I, SafeToHoist, NotHoisting, FC0)) {
1212	SafeToHoist.push_back(Elt: &I);
1213	LLVM_DEBUG(dbgs() << "\tSafe to hoist.\n");
1214	} else {
1215	LLVM_DEBUG(dbgs() << "\tCould not hoist. Trying to sink...\n");
1216	NotHoisting.push_back(Elt: &I);
1217
1218	if (canSinkInst(I, FC1)) {
1219	SafeToSink.push_back(Elt: &I);
1220	LLVM_DEBUG(dbgs() << "\tSafe to sink.\n");
1221	} else {
1222	LLVM_DEBUG(dbgs() << "\tCould not sink.\n");
1223	return false;
1224	}
1225	}
1226	}
1227	LLVM_DEBUG(
1228	dbgs() << "All preheader instructions could be sunk or hoisted!\n");
1229	return true;
1230	}
1231
1232	/// Rewrite all additive recurrences in a SCEV to use a new loop.
1233	class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
1234	public:
1235	AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL,
1236	bool UseMax = true)
1237	: SCEVRewriteVisitor (SE), Valid(true), UseMax(UseMax), OldL(OldL),
1238	NewL(NewL) {}
1239
1240	const SCEV visitAddRecExpr(const* SCEVAddRecExpr *Expr) {
1241	const Loop *ExprL = Expr->getLoop();
1242	SmallVector<const SCEV *, `2`> Operands;
1243	if (ExprL == &OldL) {
1244	append_range(C&: Operands, R: Expr->operands());
1245	return SE.getAddRecExpr(Operands, L: &NewL, Flags: Expr->getNoWrapFlags());
1246	}
1247
1248	if (OldL.contains(L: ExprL)) {
1249	bool Pos = SE.isKnownPositive(S: Expr->getStepRecurrence(SE));
1250	if (!UseMax \|\| !Pos \|\| !Expr->isAffine()) {
1251	Valid = false;
1252	return Expr;
1253	}
1254	return visit(S: Expr->getStart());
1255	}
1256
1257	for (const SCEV *Op : Expr->operands())
1258	Operands.push_back(Elt: visit(S: Op));
1259	return SE.getAddRecExpr(Operands, L: ExprL, Flags: Expr->getNoWrapFlags());
1260	}
1261
1262	bool wasValidSCEV() const { return Valid; }
1263
1264	private:
1265	bool Valid, UseMax;
1266	const Loop &OldL, &NewL;
1267	};
1268
1269	/// Return false if the access functions of \p I0 and \p I1 could cause
1270	/// a negative dependence.
1271	bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0,
1272	Instruction &I1, bool EqualIsInvalid) {
1273	Value *Ptr0 = getLoadStorePointerOperand(V: &I0);
1274	Value *Ptr1 = getLoadStorePointerOperand(V: &I1);
1275	if (!Ptr0 \|\| !Ptr1)
1276	return false;
1277
1278	const SCEV *SCEVPtr0 = SE.getSCEVAtScope(V: Ptr0, L: &L0);
1279	const SCEV *SCEVPtr1 = SE.getSCEVAtScope(V: Ptr1, L: &L1);
1280	#ifndef NDEBUG
1281	if (VerboseFusionDebugging)
1282	LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs "
1283	<< *SCEVPtr1 << "\n");
1284	#endif
1285	AddRecLoopReplacer Rewriter(SE, L0, L1);
1286	SCEVPtr0 = Rewriter.visit(S: SCEVPtr0);
1287	#ifndef NDEBUG
1288	if (VerboseFusionDebugging)
1289	LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0
1290	<< " [Valid: " << Rewriter.wasValidSCEV() << "]\n");
1291	#endif
1292	if (!Rewriter.wasValidSCEV())
1293	return false;
1294
1295	// TODO: isKnownPredicate doesnt work well when one SCEV is loop carried (by
1296	// L0) and the other is not. We could check if it is monotone and test
1297	// the beginning and end value instead.
1298
1299	BasicBlock *L0Header = L0.getHeader();
1300	auto HasNonLinearDominanceRelation = [&](const SCEV *S) {
1301	const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Val: S);
1302	if (!AddRec)
1303	return false;
1304	return !DT.dominates(A: L0Header, B: AddRec->getLoop()->getHeader()) &&
1305	!DT.dominates(A: AddRec->getLoop()->getHeader(), B: L0Header);
1306	};
1307	if (SCEVExprContains(Root: SCEVPtr1, Pred: HasNonLinearDominanceRelation))
1308	return false;
1309
1310	ICmpInst::Predicate Pred =
1311	EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE;
1312	bool IsAlwaysGE = SE.isKnownPredicate(Pred, LHS: SCEVPtr0, RHS: SCEVPtr1);
1313	#ifndef NDEBUG
1314	if (VerboseFusionDebugging)
1315	LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0
1316	<< (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1
1317	<< "\n");
1318	#endif
1319	return IsAlwaysGE;
1320	}
1321
1322	/// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in
1323	/// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses
1324	/// specified by @p DepChoice are used to determine this.
1325	bool dependencesAllowFusion(const FusionCandidate &FC0,
1326	const FusionCandidate &FC1, Instruction &I0,
1327	Instruction &I1, bool AnyDep,
1328	FusionDependenceAnalysisChoice DepChoice) {
1329	#ifndef NDEBUG
1330	if (VerboseFusionDebugging) {
1331	LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : "
1332	<< DepChoice << "\n");
1333	}
1334	#endif
1335	switch (DepChoice) {
1336	case FUSION_DEPENDENCE_ANALYSIS_SCEV:
1337	return accessDiffIsPositive(L0: FC0.L, L1: FC1.L, I0, I1, EqualIsInvalid: AnyDep);
1338	case FUSION_DEPENDENCE_ANALYSIS_DA: {
1339	auto DepResult = DI.depends(Src: &I0, Dst: &I1);
1340	if (!DepResult)
1341	return true;
1342	#ifndef NDEBUG
1343	if (VerboseFusionDebugging) {
1344	LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs());
1345	dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: "
1346	<< (DepResult->isOrdered() ? "true" : "false")
1347	<< "]\n");
1348	LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels()
1349	<< "\n");
1350	}
1351	#endif
1352
1353	if (DepResult ->getNextPredecessor() \|\| DepResult ->getNextSuccessor())
1354	LLVM_DEBUG(
1355	dbgs() << "TODO: Implement pred/succ dependence handling!\n");
1356
1357	// TODO: Can we actually use the dependence info analysis here?
1358	return false;
1359	}
1360
1361	case FUSION_DEPENDENCE_ANALYSIS_ALL:
1362	return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
1363	DepChoice: FUSION_DEPENDENCE_ANALYSIS_SCEV) \|\|
1364	dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
1365	DepChoice: FUSION_DEPENDENCE_ANALYSIS_DA);
1366	}
1367
1368	llvm_unreachable("Unknown fusion dependence analysis choice!");
1369	}
1370
1371	/// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.
1372	bool dependencesAllowFusion(const FusionCandidate &FC0,
1373	const FusionCandidate &FC1) {
1374	LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
1375	<< "\n");
1376	assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
1377	assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
1378
1379	for (Instruction *WriteL0 : FC0.MemWrites) {
1380	for (Instruction *WriteL1 : FC1.MemWrites)
1381	if (!dependencesAllowFusion(FC0, FC1, I0&: WriteL0, I1&: WriteL1,
1382	/ AnyDep / false,
1383	DepChoice: FusionDependenceAnalysis)) {
1384	InvalidDependencies ++;
1385	return false;
1386	}
1387	for (Instruction *ReadL1 : FC1.MemReads)
1388	if (!dependencesAllowFusion(FC0, FC1, I0&: WriteL0, I1&: ReadL1,
1389	/ AnyDep / false,
1390	DepChoice: FusionDependenceAnalysis)) {
1391	InvalidDependencies ++;
1392	return false;
1393	}
1394	}
1395
1396	for (Instruction *WriteL1 : FC1.MemWrites) {
1397	for (Instruction *WriteL0 : FC0.MemWrites)
1398	if (!dependencesAllowFusion(FC0, FC1, I0&: WriteL0, I1&: WriteL1,
1399	/ AnyDep / false,
1400	DepChoice: FusionDependenceAnalysis)) {
1401	InvalidDependencies ++;
1402	return false;
1403	}
1404	for (Instruction *ReadL0 : FC0.MemReads)
1405	if (!dependencesAllowFusion(FC0, FC1, I0&: ReadL0, I1&: WriteL1,
1406	/ AnyDep / false,
1407	DepChoice: FusionDependenceAnalysis)) {
1408	InvalidDependencies ++;
1409	return false;
1410	}
1411	}
1412
1413	// Walk through all uses in FC1. For each use, find the reaching def. If the
1414	// def is located in FC0 then it is not safe to fuse.
1415	for (BasicBlock *BB : FC1.L->blocks())
1416	for (Instruction &I : *BB)
1417	for (auto &Op : I.operands())
1418	if (Instruction *Def = dyn_cast<Instruction>(Val&: Op))
1419	if (FC0.L->contains(BB: Def->getParent())) {
1420	InvalidDependencies ++;
1421	return false;
1422	}
1423
1424	return true;
1425	}
1426
1427	/// Determine if two fusion candidates are adjacent in the CFG.
1428	///
1429	/// This method will determine if there are additional basic blocks in the CFG
1430	/// between the exit of \p FC0 and the entry of \p FC1.
1431	/// If the two candidates are guarded loops, then it checks whether the
1432	/// non-loop successor of the \p FC0 guard branch is the entry block of \p
1433	/// FC1. If not, then the loops are not adjacent. If the two candidates are
1434	/// not guarded loops, then it checks whether the exit block of \p FC0 is the
1435	/// preheader of \p FC1.
1436	bool isAdjacent(const FusionCandidate &FC0,
1437	const FusionCandidate &FC1) const {
1438	// If the successor of the guard branch is FC1, then the loops are adjacent
1439	if (FC0.GuardBranch)
1440	return FC0.getNonLoopBlock() == FC1.getEntryBlock();
1441	else
1442	return FC0.ExitBlock == FC1.getEntryBlock();
1443	}
1444
1445	bool isEmptyPreheader(const FusionCandidate &FC) const {
1446	return FC.Preheader->size() == `1`;
1447	}
1448
1449	/// Hoist \p FC1 Preheader instructions to \p FC0 Preheader
1450	/// and sink others into the body of \p FC1.
1451	void movePreheaderInsts(const FusionCandidate &FC0,
1452	const FusionCandidate &FC1,
1453	SmallVector<Instruction *, `4`> &HoistInsts,
1454	SmallVector<Instruction , `4`> &SinkInsts) const* {
1455	// All preheader instructions except the branch must be hoisted or sunk
1456	assert(HoistInsts.size() + SinkInsts.size() == FC1.Preheader->size() - `1` &&
1457	"Attempting to sink and hoist preheader instructions, but not all "
1458	"the preheader instructions are accounted for.");
1459
1460	NumHoistedInsts += HoistInsts.size();
1461	NumSunkInsts += SinkInsts.size();
1462
1463	LLVM_DEBUG(if (VerboseFusionDebugging) {
1464	if (!HoistInsts.empty())
1465	dbgs() << "Hoisting: \n";
1466	for (Instruction *I : HoistInsts)
1467	dbgs() << *I << "\n";
1468	if (!SinkInsts.empty())
1469	dbgs() << "Sinking: \n";
1470	for (Instruction *I : SinkInsts)
1471	dbgs() << *I << "\n";
1472	});
1473
1474	for (Instruction *I : HoistInsts) {
1475	assert(I->getParent() == FC1.Preheader);
1476	I->moveBefore(BB&: *FC0.Preheader,
1477	I: FC0.Preheader->getTerminator()->getIterator());
1478	}
1479	// insert instructions in reverse order to maintain dominance relationship
1480	for (Instruction *I : reverse(C&: SinkInsts)) {
1481	assert(I->getParent() == FC1.Preheader);
1482	I->moveBefore(BB&: *FC1.ExitBlock, I: FC1.ExitBlock->getFirstInsertionPt());
1483	}
1484	}
1485
1486	/// Determine if two fusion candidates have identical guards
1487	///
1488	/// This method will determine if two fusion candidates have the same guards.
1489	/// The guards are considered the same if:
1490	/// 1. The instructions to compute the condition used in the compare are
1491	/// identical.
1492	/// 2. The successors of the guard have the same flow into/around the loop.
1493	/// If the compare instructions are identical, then the first successor of the
1494	/// guard must go to the same place (either the preheader of the loop or the
1495	/// NonLoopBlock). In other words, the first successor of both loops must
1496	/// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
1497	/// the NonLoopBlock). The same must be true for the second successor.
1498	bool haveIdenticalGuards(const FusionCandidate &FC0,
1499	const FusionCandidate &FC1) const {
1500	assert(FC0.GuardBranch && FC1.GuardBranch &&
1501	"Expecting FC0 and FC1 to be guarded loops.");
1502
1503	if (auto FC0CmpInst =
1504	dyn_cast<Instruction>(Val: FC0.GuardBranch->getCondition()))
1505	if (auto FC1CmpInst =
1506	dyn_cast<Instruction>(Val: FC1.GuardBranch->getCondition()))
1507	if (!FC0CmpInst->isIdenticalTo(I: FC1CmpInst))
1508	return false;
1509
1510	// The compare instructions are identical.
1511	// Now make sure the successor of the guards have the same flow into/around
1512	// the loop
1513	if (FC0.GuardBranch->getSuccessor(i: `0`) == FC0.Preheader)
1514	return (FC1.GuardBranch->getSuccessor(i: `0`) == FC1.Preheader);
1515	else
1516	return (FC1.GuardBranch->getSuccessor(i: `1`) == FC1.Preheader);
1517	}
1518
1519	/// Modify the latch branch of FC to be unconditional since successors of the
1520	/// branch are the same.
1521	void simplifyLatchBranch(const FusionCandidate &FC) const {
1522	BranchInst *FCLatchBranch = dyn_cast<BranchInst>(Val: FC.Latch->getTerminator());
1523	if (FCLatchBranch) {
1524	assert(FCLatchBranch->isConditional() &&
1525	FCLatchBranch->getSuccessor(`0`) == FCLatchBranch->getSuccessor(`1`) &&
1526	"Expecting the two successors of FCLatchBranch to be the same");
1527	BranchInst *NewBranch =
1528	BranchInst::Create(IfTrue: FCLatchBranch->getSuccessor(i: `0`));
1529	ReplaceInstWithInst(From: FCLatchBranch, To: NewBranch);
1530	}
1531	}
1532
1533	/// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has an unique
1534	/// successor, then merge FC0.Latch with its unique successor.
1535	void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) {
1536	moveInstructionsToTheBeginning(FromBB&: FC0.Latch, ToBB&: FC1.Latch, DT, PDT, DI);
1537	if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) {
1538	MergeBlockIntoPredecessor(BB: Succ, DTU: &DTU, LI: &LI);
1539	DTU.flush();
1540	}
1541	}
1542
1543	/// Fuse two fusion candidates, creating a new fused loop.
1544	///
1545	/// This method contains the mechanics of fusing two loops, represented by \p
1546	/// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1
1547	/// postdominates \p FC0 (making them control flow equivalent). It also
1548	/// assumes that the other conditions for fusion have been met: adjacent,
1549	/// identical trip counts, and no negative distance dependencies exist that
1550	/// would prevent fusion. Thus, there is no checking for these conditions in
1551	/// this method.
1552	///
1553	/// Fusion is performed by rewiring the CFG to update successor blocks of the
1554	/// components of the loop. Specifically, the following changes are done:
1555	///
1556	/// 1. The preheader of \p FC1 is removed as it is no longer necessary
1557	/// (because it is currently only a single statement block).
1558	/// 2. The latch of \p FC0 is modified to jump to the header of \p FC1.
1559	/// 3. The latch of \p FC1 is modified to jump to the header of \p FC0.
1560	/// 4. All blocks from \p FC1 are removed from FC1 and added to FC0.
1561	///
1562	/// All of these modifications are done with dominator tree updates, thus
1563	/// keeping the dominator (and post dominator) information up-to-date.
1564	///
1565	/// This can be improved in the future by actually merging blocks during
1566	/// fusion. For example, the preheader of \p FC1 can be merged with the
1567	/// preheader of \p FC0. This would allow loops with more than a single
1568	/// statement in the preheader to be fused. Similarly, the latch blocks of the
1569	/// two loops could also be fused into a single block. This will require
1570	/// analysis to prove it is safe to move the contents of the block past
1571	/// existing code, which currently has not been implemented.
1572	Loop performFusion(const* FusionCandidate &FC0, const FusionCandidate &FC1) {
1573	assert(FC0.isValid() && FC1.isValid() &&
1574	"Expecting valid fusion candidates");
1575
1576	LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
1577	dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
1578
1579	// Move instructions from the preheader of FC1 to the end of the preheader
1580	// of FC0.
1581	moveInstructionsToTheEnd(FromBB&: FC1.Preheader, ToBB&: FC0.Preheader, DT, PDT, DI);
1582
1583	// Fusing guarded loops is handled slightly differently than non-guarded
1584	// loops and has been broken out into a separate method instead of trying to
1585	// intersperse the logic within a single method.
1586	if (FC0.GuardBranch)
1587	return fuseGuardedLoops(FC0, FC1);
1588
1589	assert(FC1.Preheader ==
1590	(FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock));
1591	assert(FC1.Preheader->size() == `1` &&
1592	FC1.Preheader->getSingleSuccessor() == FC1.Header);
1593
1594	// Remember the phi nodes originally in the header of FC0 in order to rewire
1595	// them later. However, this is only necessary if the new loop carried
1596	// values might not dominate the exiting branch. While we do not generally
1597	// test if this is the case but simply insert intermediate phi nodes, we
1598	// need to make sure these intermediate phi nodes have different
1599	// predecessors. To this end, we filter the special case where the exiting
1600	// block is the latch block of the first loop. Nothing needs to be done
1601	// anyway as all loop carried values dominate the latch and thereby also the
1602	// exiting branch.
1603	SmallVector<PHINode *, `8`> OriginalFC0PHIs;
1604	if (FC0.ExitingBlock != FC0.Latch)
1605	for (PHINode &PHI : FC0.Header->phis())
1606	OriginalFC0PHIs.push_back(Elt: &PHI);
1607
1608	// Replace incoming blocks for header PHIs first.
1609	FC1.Preheader->replaceSuccessorsPhiUsesWith(New: FC0.Preheader);
1610	FC0.Latch->replaceSuccessorsPhiUsesWith(New: FC1.Latch);
1611
1612	// Then modify the control flow and update DT and PDT.
1613	SmallVector<DominatorTree::UpdateType, `8`> TreeUpdates;
1614
1615	// The old exiting block of the first loop (FC0) has to jump to the header
1616	// of the second as we need to execute the code in the second header block
1617	// regardless of the trip count. That is, if the trip count is 0, so the
1618	// back edge is never taken, we still have to execute both loop headers,
1619	// especially (but not only!) if the second is a do-while style loop.
1620	// However, doing so might invalidate the phi nodes of the first loop as
1621	// the new values do only need to dominate their latch and not the exiting
1622	// predicate. To remedy this potential problem we always introduce phi
1623	// nodes in the header of the second loop later that select the loop carried
1624	// value, if the second header was reached through an old latch of the
1625	// first, or undef otherwise. This is sound as exiting the first implies the
1626	// second will exit too, __without__ taking the back-edge. [Their
1627	// trip-counts are equal after all.
1628	// KB: Would this sequence be simpler to just make FC0.ExitingBlock go
1629	// to FC1.Header? I think this is basically what the three sequences are
1630	// trying to accomplish; however, doing this directly in the CFG may mean
1631	// the DT/PDT becomes invalid
1632	if (!FC0.Peeled) {
1633	FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC1.Preheader,
1634	To: FC1.Header);
1635	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1636	DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
1637	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1638	DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1639	} else {
1640	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1641	DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader));
1642
1643	// Remove the ExitBlock of the first Loop (also not needed)
1644	FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC0.ExitBlock,
1645	To: FC1.Header);
1646	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1647	DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
1648	FC0.ExitBlock->getTerminator()->eraseFromParent();
1649	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1650	DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1651	new UnreachableInst (FC0.ExitBlock->getContext(), FC0.ExitBlock);
1652	}
1653
1654	// The pre-header of L1 is not necessary anymore.
1655	assert(pred_empty(FC1.Preheader));
1656	FC1.Preheader->getTerminator()->eraseFromParent();
1657	new UnreachableInst (FC1.Preheader->getContext(), FC1.Preheader);
1658	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1659	DominatorTree::Delete, FC1.Preheader, FC1.Header));
1660
1661	// Moves the phi nodes from the second to the first loops header block.
1662	while (PHINode *PHI = dyn_cast<PHINode>(Val: &FC1.Header->front())) {
1663	if (SE.isSCEVable(Ty: PHI->getType()))
1664	SE.forgetValue(V: PHI);
1665	if (PHI->hasNUsesOrMore(N: `1`))
1666	PHI->moveBefore(InsertPos: FC0.Header->getFirstInsertionPt());
1667	else
1668	PHI->eraseFromParent();
1669	}
1670
1671	// Introduce new phi nodes in the second loop header to ensure
1672	// exiting the first and jumping to the header of the second does not break
1673	// the SSA property of the phis originally in the first loop. See also the
1674	// comment above.
1675	BasicBlock::iterator L1HeaderIP = FC1.Header->begin();
1676	for (PHINode *LCPHI : OriginalFC0PHIs) {
1677	int L1LatchBBIdx = LCPHI->getBasicBlockIndex(BB: FC1.Latch);
1678	assert(L1LatchBBIdx >= `0` &&
1679	"Expected loop carried value to be rewired at this point!");
1680
1681	Value *LCV = LCPHI->getIncomingValue(i: L1LatchBBIdx);
1682
1683	PHINode *L1HeaderPHI =
1684	PHINode::Create(Ty: LCV->getType(), NumReservedValues: `2`, NameStr: LCPHI->getName() + ".afterFC0");
1685	L1HeaderPHI->insertBefore(InsertPos: L1HeaderIP);
1686	L1HeaderPHI->addIncoming(V: LCV, BB: FC0.Latch);
1687	L1HeaderPHI->addIncoming(V: PoisonValue::get(T: LCV->getType()),
1688	BB: FC0.ExitingBlock);
1689
1690	LCPHI->setIncomingValue(i: L1LatchBBIdx, V: L1HeaderPHI);
1691	}
1692
1693	// Replace latch terminator destinations.
1694	FC0.Latch->getTerminator()->replaceUsesOfWith(From: FC0.Header, To: FC1.Header);
1695	FC1.Latch->getTerminator()->replaceUsesOfWith(From: FC1.Header, To: FC0.Header);
1696
1697	// Modify the latch branch of FC0 to be unconditional as both successors of
1698	// the branch are the same.
1699	simplifyLatchBranch(FC: FC0);
1700
1701	// If FC0.Latch and FC0.ExitingBlock are the same then we have already
1702	// performed the updates above.
1703	if (FC0.Latch != FC0.ExitingBlock)
1704	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1705	DominatorTree::Insert, FC0.Latch, FC1.Header));
1706
1707	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (DominatorTree::Delete,
1708	FC0.Latch, FC0.Header));
1709	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (DominatorTree::Insert,
1710	FC1.Latch, FC0.Header));
1711	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (DominatorTree::Delete,
1712	FC1.Latch, FC1.Header));
1713
1714	// Update DT/PDT
1715	DTU.applyUpdates(Updates: TreeUpdates);
1716
1717	LI.removeBlock(BB: FC1.Preheader);
1718	DTU.deleteBB(DelBB: FC1.Preheader);
1719	if (FC0.Peeled) {
1720	LI.removeBlock(BB: FC0.ExitBlock);
1721	DTU.deleteBB(DelBB: FC0.ExitBlock);
1722	}
1723
1724	DTU.flush();
1725
1726	// Is there a way to keep SE up-to-date so we don't need to forget the loops
1727	// and rebuild the information in subsequent passes of fusion?
1728	// Note: Need to forget the loops before merging the loop latches, as
1729	// mergeLatch may remove the only block in FC1.
1730	SE.forgetLoop(L: FC1.L);
1731	SE.forgetLoop(L: FC0.L);
1732	// Forget block dispositions as well, so that there are no dangling
1733	// pointers to erased/free'ed blocks.
1734	SE.forgetBlockAndLoopDispositions();
1735
1736	// Move instructions from FC0.Latch to FC1.Latch.
1737	// Note: mergeLatch requires an updated DT.
1738	mergeLatch(FC0, FC1);
1739
1740	// Merge the loops.
1741	SmallVector<BasicBlock *, `8`> Blocks(FC1.L->blocks());
1742	for (BasicBlock *BB : Blocks) {
1743	FC0.L->addBlockEntry(BB);
1744	FC1.L->removeBlockFromLoop(BB);
1745	if (LI.getLoopFor(BB) != FC1.L)
1746	continue;
1747	LI.changeLoopFor(BB, L: FC0.L);
1748	}
1749	while (!FC1.L->isInnermost()) {
1750	const auto &ChildLoopIt = FC1.L->begin();
1751	Loop ChildLoop = ChildLoopIt;
1752	FC1.L->removeChildLoop(I: ChildLoopIt);
1753	FC0.L->addChildLoop(NewChild: ChildLoop);
1754	}
1755
1756	// Delete the now empty loop L1.
1757	LI.erase(L: FC1.L);
1758
1759	#ifndef NDEBUG
1760	assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
1761	assert(DT.verify(DominatorTree::VerificationLevel::Fast));
1762	assert(PDT.verify());
1763	LI.verify(DT);
1764	SE.verify();
1765	#endif
1766
1767	LLVM_DEBUG(dbgs() << "Fusion done:\n");
1768
1769	return FC0.L;
1770	}
1771
1772	/// Report details on loop fusion opportunities.
1773	///
1774	/// This template function can be used to report both successful and missed
1775	/// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
1776	/// be one of:
1777	/// - OptimizationRemarkMissed to report when loop fusion is unsuccessful
1778	/// given two valid fusion candidates.
1779	/// - OptimizationRemark to report successful fusion of two fusion
1780	/// candidates.
1781	/// The remarks will be printed using the form:
1782	/// <path/filename>:<line number>:<column number>: [<function name>]:
1783	/// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
1784	template <typename RemarkKind>
1785	void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
1786	llvm::Statistic &Stat) {
1787	assert(FC0.Preheader && FC1.Preheader &&
1788	"Expecting valid fusion candidates");
1789	using namespace ore;
1790	#if LLVM_ENABLE_STATS
1791	++Stat;
1792	ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
1793	FC0.Preheader)
1794	<< "[" << FC0.Preheader->getParent()->getName()
1795	<< "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))
1796	<< " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
1797	<< ": " << Stat.getDesc());
1798	#endif
1799	}
1800
1801	/// Fuse two guarded fusion candidates, creating a new fused loop.
1802	///
1803	/// Fusing guarded loops is handled much the same way as fusing non-guarded
1804	/// loops. The rewiring of the CFG is slightly different though, because of
1805	/// the presence of the guards around the loops and the exit blocks after the
1806	/// loop body. As such, the new loop is rewired as follows:
1807	/// 1. Keep the guard branch from FC0 and use the non-loop block target
1808	/// from the FC1 guard branch.
1809	/// 2. Remove the exit block from FC0 (this exit block should be empty
1810	/// right now).
1811	/// 3. Remove the guard branch for FC1
1812	/// 4. Remove the preheader for FC1.
1813	/// The exit block successor for the latch of FC0 is updated to be the header
1814	/// of FC1 and the non-exit block successor of the latch of FC1 is updated to
1815	/// be the header of FC0, thus creating the fused loop.
1816	Loop fuseGuardedLoops(const* FusionCandidate &FC0,
1817	const FusionCandidate &FC1) {
1818	assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
1819
1820	BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
1821	BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
1822	BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
1823	BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
1824	BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor();
1825
1826	// Move instructions from the exit block of FC0 to the beginning of the exit
1827	// block of FC1, in the case that the FC0 loop has not been peeled. In the
1828	// case that FC0 loop is peeled, then move the instructions of the successor
1829	// of the FC0 Exit block to the beginning of the exit block of FC1.
1830	moveInstructionsToTheBeginning(
1831	FromBB&: (FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock), ToBB&: *FC1.ExitBlock,
1832	DT, PDT, DI);
1833
1834	// Move instructions from the guard block of FC1 to the end of the guard
1835	// block of FC0.
1836	moveInstructionsToTheEnd(FromBB&: FC1GuardBlock, ToBB&: FC0GuardBlock, DT, PDT, DI);
1837
1838	assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
1839
1840	SmallVector<DominatorTree::UpdateType, `8`> TreeUpdates;
1841
1842	////////////////////////////////////////////////////////////////////////////
1843	// Update the Loop Guard
1844	////////////////////////////////////////////////////////////////////////////
1845	// The guard for FC0 is updated to guard both FC0 and FC1. This is done by
1846	// changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
1847	// Thus, one path from the guard goes to the preheader for FC0 (and thus
1848	// executes the new fused loop) and the other path goes to the NonLoopBlock
1849	// for FC1 (where FC1 guard would have gone if FC1 was not executed).
1850	FC1NonLoopBlock->replacePhiUsesWith(Old: FC1GuardBlock, New: FC0GuardBlock);
1851	FC0.GuardBranch->replaceUsesOfWith(From: FC0NonLoopBlock, To: FC1NonLoopBlock);
1852
1853	BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock;
1854	BBToUpdate->getTerminator()->replaceUsesOfWith(From: FC1GuardBlock, To: FC1.Header);
1855
1856	// The guard of FC1 is not necessary anymore.
1857	FC1.GuardBranch->eraseFromParent();
1858	new UnreachableInst (FC1GuardBlock->getContext(), FC1GuardBlock);
1859
1860	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1861	DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
1862	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1863	DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
1864	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1865	DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
1866	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1867	DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
1868
1869	if (FC0.Peeled) {
1870	// Remove the Block after the ExitBlock of FC0
1871	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1872	DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock));
1873	FC0ExitBlockSuccessor->getTerminator()->eraseFromParent();
1874	new UnreachableInst (FC0ExitBlockSuccessor->getContext(),
1875	FC0ExitBlockSuccessor);
1876	}
1877
1878	assert(pred_empty(FC1GuardBlock) &&
1879	"Expecting guard block to have no predecessors");
1880	assert(succ_empty(FC1GuardBlock) &&
1881	"Expecting guard block to have no successors");
1882
1883	// Remember the phi nodes originally in the header of FC0 in order to rewire
1884	// them later. However, this is only necessary if the new loop carried
1885	// values might not dominate the exiting branch. While we do not generally
1886	// test if this is the case but simply insert intermediate phi nodes, we
1887	// need to make sure these intermediate phi nodes have different
1888	// predecessors. To this end, we filter the special case where the exiting
1889	// block is the latch block of the first loop. Nothing needs to be done
1890	// anyway as all loop carried values dominate the latch and thereby also the
1891	// exiting branch.
1892	// KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
1893	// (because the loops are rotated. Thus, nothing will ever be added to
1894	// OriginalFC0PHIs.
1895	SmallVector<PHINode *, `8`> OriginalFC0PHIs;
1896	if (FC0.ExitingBlock != FC0.Latch)
1897	for (PHINode &PHI : FC0.Header->phis())
1898	OriginalFC0PHIs.push_back(Elt: &PHI);
1899
1900	assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
1901
1902	// Replace incoming blocks for header PHIs first.
1903	FC1.Preheader->replaceSuccessorsPhiUsesWith(New: FC0.Preheader);
1904	FC0.Latch->replaceSuccessorsPhiUsesWith(New: FC1.Latch);
1905
1906	// The old exiting block of the first loop (FC0) has to jump to the header
1907	// of the second as we need to execute the code in the second header block
1908	// regardless of the trip count. That is, if the trip count is 0, so the
1909	// back edge is never taken, we still have to execute both loop headers,
1910	// especially (but not only!) if the second is a do-while style loop.
1911	// However, doing so might invalidate the phi nodes of the first loop as
1912	// the new values do only need to dominate their latch and not the exiting
1913	// predicate. To remedy this potential problem we always introduce phi
1914	// nodes in the header of the second loop later that select the loop carried
1915	// value, if the second header was reached through an old latch of the
1916	// first, or undef otherwise. This is sound as exiting the first implies the
1917	// second will exit too, __without__ taking the back-edge (their
1918	// trip-counts are equal after all).
1919	FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(From: FC0.ExitBlock,
1920	To: FC1.Header);
1921
1922	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1923	DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
1924	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1925	DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
1926
1927	// Remove FC0 Exit Block
1928	// The exit block for FC0 is no longer needed since control will flow
1929	// directly to the header of FC1. Since it is an empty block, it can be
1930	// removed at this point.
1931	// TODO: In the future, we can handle non-empty exit blocks my merging any
1932	// instructions from FC0 exit block into FC1 exit block prior to removing
1933	// the block.
1934	assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty");
1935	FC0.ExitBlock->getTerminator()->eraseFromParent();
1936	new UnreachableInst (FC0.ExitBlock->getContext(), FC0.ExitBlock);
1937
1938	// Remove FC1 Preheader
1939	// The pre-header of L1 is not necessary anymore.
1940	assert(pred_empty(FC1.Preheader));
1941	FC1.Preheader->getTerminator()->eraseFromParent();
1942	new UnreachableInst (FC1.Preheader->getContext(), FC1.Preheader);
1943	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1944	DominatorTree::Delete, FC1.Preheader, FC1.Header));
1945
1946	// Moves the phi nodes from the second to the first loops header block.
1947	while (PHINode *PHI = dyn_cast<PHINode>(Val: &FC1.Header->front())) {
1948	if (SE.isSCEVable(Ty: PHI->getType()))
1949	SE.forgetValue(V: PHI);
1950	if (PHI->hasNUsesOrMore(N: `1`))
1951	PHI->moveBefore(InsertPos: FC0.Header->getFirstInsertionPt());
1952	else
1953	PHI->eraseFromParent();
1954	}
1955
1956	// Introduce new phi nodes in the second loop header to ensure
1957	// exiting the first and jumping to the header of the second does not break
1958	// the SSA property of the phis originally in the first loop. See also the
1959	// comment above.
1960	BasicBlock::iterator L1HeaderIP = FC1.Header->begin();
1961	for (PHINode *LCPHI : OriginalFC0PHIs) {
1962	int L1LatchBBIdx = LCPHI->getBasicBlockIndex(BB: FC1.Latch);
1963	assert(L1LatchBBIdx >= `0` &&
1964	"Expected loop carried value to be rewired at this point!");
1965
1966	Value *LCV = LCPHI->getIncomingValue(i: L1LatchBBIdx);
1967
1968	PHINode *L1HeaderPHI =
1969	PHINode::Create(Ty: LCV->getType(), NumReservedValues: `2`, NameStr: LCPHI->getName() + ".afterFC0");
1970	L1HeaderPHI->insertBefore(InsertPos: L1HeaderIP);
1971	L1HeaderPHI->addIncoming(V: LCV, BB: FC0.Latch);
1972	L1HeaderPHI->addIncoming(V: PoisonValue::get(T: LCV->getType()),
1973	BB: FC0.ExitingBlock);
1974
1975	LCPHI->setIncomingValue(i: L1LatchBBIdx, V: L1HeaderPHI);
1976	}
1977
1978	// Update the latches
1979
1980	// Replace latch terminator destinations.
1981	FC0.Latch->getTerminator()->replaceUsesOfWith(From: FC0.Header, To: FC1.Header);
1982	FC1.Latch->getTerminator()->replaceUsesOfWith(From: FC1.Header, To: FC0.Header);
1983
1984	// Modify the latch branch of FC0 to be unconditional as both successors of
1985	// the branch are the same.
1986	simplifyLatchBranch(FC: FC0);
1987
1988	// If FC0.Latch and FC0.ExitingBlock are the same then we have already
1989	// performed the updates above.
1990	if (FC0.Latch != FC0.ExitingBlock)
1991	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (
1992	DominatorTree::Insert, FC0.Latch, FC1.Header));
1993
1994	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (DominatorTree::Delete,
1995	FC0.Latch, FC0.Header));
1996	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (DominatorTree::Insert,
1997	FC1.Latch, FC0.Header));
1998	TreeUpdates.emplace_back(Args: DominatorTree::UpdateType (DominatorTree::Delete,
1999	FC1.Latch, FC1.Header));
2000
2001	// All done
2002	// Apply the updates to the Dominator Tree and cleanup.
2003
2004	assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!");
2005	assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!");
2006
2007	// Update DT/PDT
2008	DTU.applyUpdates(Updates: TreeUpdates);
2009
2010	LI.removeBlock(BB: FC1GuardBlock);
2011	LI.removeBlock(BB: FC1.Preheader);
2012	LI.removeBlock(BB: FC0.ExitBlock);
2013	if (FC0.Peeled) {
2014	LI.removeBlock(BB: FC0ExitBlockSuccessor);
2015	DTU.deleteBB(DelBB: FC0ExitBlockSuccessor);
2016	}
2017	DTU.deleteBB(DelBB: FC1GuardBlock);
2018	DTU.deleteBB(DelBB: FC1.Preheader);
2019	DTU.deleteBB(DelBB: FC0.ExitBlock);
2020	DTU.flush();
2021
2022	// Is there a way to keep SE up-to-date so we don't need to forget the loops
2023	// and rebuild the information in subsequent passes of fusion?
2024	// Note: Need to forget the loops before merging the loop latches, as
2025	// mergeLatch may remove the only block in FC1.
2026	SE.forgetLoop(L: FC1.L);
2027	SE.forgetLoop(L: FC0.L);
2028	// Forget block dispositions as well, so that there are no dangling
2029	// pointers to erased/free'ed blocks.
2030	SE.forgetBlockAndLoopDispositions();
2031
2032	// Move instructions from FC0.Latch to FC1.Latch.
2033	// Note: mergeLatch requires an updated DT.
2034	mergeLatch(FC0, FC1);
2035
2036	// Merge the loops.
2037	SmallVector<BasicBlock *, `8`> Blocks(FC1.L->blocks());
2038	for (BasicBlock *BB : Blocks) {
2039	FC0.L->addBlockEntry(BB);
2040	FC1.L->removeBlockFromLoop(BB);
2041	if (LI.getLoopFor(BB) != FC1.L)
2042	continue;
2043	LI.changeLoopFor(BB, L: FC0.L);
2044	}
2045	while (!FC1.L->isInnermost()) {
2046	const auto &ChildLoopIt = FC1.L->begin();
2047	Loop ChildLoop = ChildLoopIt;
2048	FC1.L->removeChildLoop(I: ChildLoopIt);
2049	FC0.L->addChildLoop(NewChild: ChildLoop);
2050	}
2051
2052	// Delete the now empty loop L1.
2053	LI.erase(L: FC1.L);
2054
2055	#ifndef NDEBUG
2056	assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
2057	assert(DT.verify(DominatorTree::VerificationLevel::Fast));
2058	assert(PDT.verify());
2059	LI.verify(DT);
2060	SE.verify();
2061	#endif
2062
2063	LLVM_DEBUG(dbgs() << "Fusion done:\n");
2064
2065	return FC0.L;
2066	}
2067	};
2068	} // namespace
2069
2070	PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
2071	auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
2072	auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
2073	auto &DI = AM.getResult<DependenceAnalysis>(IR&: F);
2074	auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
2075	auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(IR&: F);
2076	auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
2077	auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
2078	const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(IR&: F);
2079	const DataLayout &DL = F.getDataLayout();
2080
2081	// Ensure loops are in simplifed form which is a pre-requisite for loop fusion
2082	// pass. Added only for new PM since the legacy PM has already added
2083	// LoopSimplify pass as a dependency.
2084	bool Changed = false;
2085	for (auto &L : LI) {
2086	Changed \|=
2087	simplifyLoop(L, DT: &DT, LI: &LI, SE: &SE, AC: &AC, MSSAU: nullptr, PreserveLCSSA: false / PreserveLCSSA /);
2088	}
2089	if (Changed)
2090	PDT.recalculate(Func&: F);
2091
2092	LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
2093	Changed \|= LF.fuseLoops(F);
2094	if (!Changed)
2095	return PreservedAnalyses::all();
2096
2097	PreservedAnalyses PA;
2098	PA.preserve<DominatorTreeAnalysis>();
2099	PA.preserve<PostDominatorTreeAnalysis>();
2100	PA.preserve<ScalarEvolutionAnalysis>();
2101	PA.preserve<LoopAnalysis>();
2102	return PA;
2103	}
2104

Browse the source code of llvm_projects/llvm/lib/Transforms/Scalar/LoopFuse.cpp