1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Transforms/Scalar/LoopInterchange.h"
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
18#include "llvm/ADT/SmallVector.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringMap.h"
21#include "llvm/ADT/StringRef.h"
22#include "llvm/Analysis/DependenceAnalysis.h"
23#include "llvm/Analysis/LoopCacheAnalysis.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/LoopNestAnalysis.h"
26#include "llvm/Analysis/LoopPass.h"
27#include "llvm/Analysis/OptimizationRemarkEmitter.h"
28#include "llvm/Analysis/ScalarEvolution.h"
29#include "llvm/Analysis/ScalarEvolutionExpressions.h"
30#include "llvm/IR/BasicBlock.h"
31#include "llvm/IR/DiagnosticInfo.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/InstrTypes.h"
36#include "llvm/IR/Instruction.h"
37#include "llvm/IR/Instructions.h"
38#include "llvm/IR/User.h"
39#include "llvm/IR/Value.h"
40#include "llvm/Support/Casting.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/Debug.h"
43#include "llvm/Support/ErrorHandling.h"
44#include "llvm/Support/raw_ostream.h"
45#include "llvm/Transforms/Scalar/LoopPassManager.h"
46#include "llvm/Transforms/Utils/BasicBlockUtils.h"
47#include "llvm/Transforms/Utils/Local.h"
48#include "llvm/Transforms/Utils/LoopUtils.h"
49#include <cassert>
50#include <utility>
51#include <vector>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "loop-interchange"
56
57STATISTIC(LoopsInterchanged, "Number of loops interchanged");
58
59static cl::opt<int> LoopInterchangeCostThreshold(
60 "loop-interchange-threshold", cl::init(Val: 0), cl::Hidden,
61 cl::desc("Interchange if you gain more than this number"));
62
63// Maximum number of load-stores that can be handled in the dependency matrix.
64static cl::opt<unsigned int> MaxMemInstrCount(
65 "loop-interchange-max-meminstr-count", cl::init(Val: 64), cl::Hidden,
66 cl::desc(
67 "Maximum number of load-store instructions that should be handled "
68 "in the dependency matrix. Higher value may lead to more interchanges "
69 "at the cost of compile-time"));
70
71namespace {
72
73using LoopVector = SmallVector<Loop *, 8>;
74
75/// A list of direction vectors. Each entry represents a direction vector
76/// corresponding to one or more dependencies existing in the loop nest. The
77/// length of all direction vectors is equal and is N + 1, where N is the depth
78/// of the loop nest. The first N elements correspond to the dependency
79/// direction of each N loops. The last one indicates whether this entry is
80/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
81/// is defined in LoopAccessAnalysis.
82// TODO: Check if we can use a sparse matrix here.
83using CharMatrix = std::vector<std::vector<char>>;
84
85/// Types of rules used in profitability check.
86enum class RuleTy {
87 PerLoopCacheAnalysis,
88 PerInstrOrderCost,
89 ForVectorization,
90 Ignore
91};
92
93} // end anonymous namespace
94
95// Minimum loop depth supported.
96static cl::opt<unsigned int> MinLoopNestDepth(
97 "loop-interchange-min-loop-nest-depth", cl::init(Val: 2), cl::Hidden,
98 cl::desc("Minimum depth of loop nest considered for the transform"));
99
100// Maximum loop depth supported.
101static cl::opt<unsigned int> MaxLoopNestDepth(
102 "loop-interchange-max-loop-nest-depth", cl::init(Val: 10), cl::Hidden,
103 cl::desc("Maximum depth of loop nest considered for the transform"));
104
105// We prefer cache cost to vectorization by default.
106static cl::list<RuleTy> Profitabilities(
107 "loop-interchange-profitabilities", cl::ZeroOrMore,
108 cl::MiscFlags::CommaSeparated, cl::Hidden,
109 cl::desc("List of profitability heuristics to be used. They are applied in "
110 "the given order"),
111 cl::list_init<RuleTy>(Vals: {RuleTy::PerLoopCacheAnalysis,
112 RuleTy::PerInstrOrderCost,
113 RuleTy::ForVectorization}),
114 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
115 "Prioritize loop cache cost"),
116 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
117 "Prioritize the IVs order of each instruction"),
118 clEnumValN(RuleTy::ForVectorization, "vectorize",
119 "Prioritize vectorization"),
120 clEnumValN(RuleTy::Ignore, "ignore",
121 "Ignore profitability, force interchange (does not "
122 "work with other options)")));
123
124// Support for the inner-loop reduction pattern.
125static cl::opt<bool> EnableReduction2Memory(
126 "loop-interchange-reduction-to-mem", cl::init(Val: false), cl::Hidden,
127 cl::desc("Support for the inner-loop reduction pattern."));
128
129#ifndef NDEBUG
130static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) {
131 SmallSet<RuleTy, 4> Set;
132 for (RuleTy Rule : Rules) {
133 if (!Set.insert(Rule).second)
134 return false;
135 if (Rule == RuleTy::Ignore)
136 return false;
137 }
138 return true;
139}
140
141static void printDepMatrix(CharMatrix &DepMatrix) {
142 for (auto &Row : DepMatrix) {
143 // Drop the last element because it is a flag indicating whether this is
144 // forward dependency or not, which doesn't affect the legality check.
145 for (char D : drop_end(Row))
146 LLVM_DEBUG(dbgs() << D << " ");
147 LLVM_DEBUG(dbgs() << "\n");
148 }
149}
150
151/// Return true if \p Src appears before \p Dst in the same basic block.
152/// Precondition: \p Src and \Dst are distinct instructions within the same
153/// basic block.
154static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
155 assert(Src->getParent() == Dst->getParent() && Src != Dst &&
156 "Expected Src and Dst to be different instructions in the same BB");
157
158 bool FoundSrc = false;
159 for (const Instruction &I : *(Src->getParent())) {
160 if (&I == Src) {
161 FoundSrc = true;
162 continue;
163 }
164 if (&I == Dst)
165 return FoundSrc;
166 }
167
168 llvm_unreachable("Dst not found");
169}
170#endif
171
172static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
173 Loop *L, DependenceInfo *DI,
174 ScalarEvolution *SE,
175 OptimizationRemarkEmitter *ORE) {
176 using ValueVector = SmallVector<Value *, 16>;
177
178 ValueVector MemInstr;
179
180 // For each block.
181 for (BasicBlock *BB : L->blocks()) {
182 // Scan the BB and collect legal loads and stores.
183 for (Instruction &I : *BB) {
184 if (!isa<Instruction>(Val: I))
185 return false;
186 if (auto *Ld = dyn_cast<LoadInst>(Val: &I)) {
187 if (!Ld->isSimple())
188 return false;
189 MemInstr.push_back(Elt: &I);
190 } else if (auto *St = dyn_cast<StoreInst>(Val: &I)) {
191 if (!St->isSimple())
192 return false;
193 MemInstr.push_back(Elt: &I);
194 }
195 }
196 }
197
198 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
199 << " Loads and Stores to analyze\n");
200 if (MemInstr.size() > MaxMemInstrCount) {
201 LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
202 << MaxMemInstrCount << " load/stores in a loop\n");
203 ORE->emit(RemarkBuilder: [&]() {
204 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
205 L->getStartLoc(), L->getHeader())
206 << "Number of loads/stores exceeded, the supported maximum "
207 "can be increased with option "
208 "-loop-interchange-maxmeminstr-count.";
209 });
210 return false;
211 }
212 ValueVector::iterator I, IE, J, JE;
213
214 // Manage direction vectors that are already seen. Map each direction vector
215 // to an index of DepMatrix at which it is stored.
216 StringMap<unsigned> Seen;
217
218 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
219 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
220 std::vector<char> Dep;
221 Instruction *Src = cast<Instruction>(Val: *I);
222 Instruction *Dst = cast<Instruction>(Val: *J);
223 // Ignore Input dependencies.
224 if (isa<LoadInst>(Val: Src) && isa<LoadInst>(Val: Dst))
225 continue;
226 // Track Output, Flow, and Anti dependencies.
227 if (auto D = DI->depends(Src, Dst)) {
228 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
229 // If the direction vector is negative, normalize it to
230 // make it non-negative.
231 if (D->normalize(SE))
232 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
233 LLVM_DEBUG(StringRef DepType =
234 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
235 dbgs() << "Found " << DepType
236 << " dependency between Src and Dst\n"
237 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
238 unsigned Levels = D->getLevels();
239 char Direction;
240 for (unsigned II = 1; II <= Levels; ++II) {
241 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
242 // or `=`, for which we don't have an equivalent representation, so
243 // that the conservative approximation is necessary. The same goes for
244 // `DVEntry::GE`.
245 // TODO: Use of fine-grained expressions allows for more accurate
246 // analysis.
247 unsigned Dir = D->getDirection(Level: II);
248 if (Dir == Dependence::DVEntry::LT)
249 Direction = '<';
250 else if (Dir == Dependence::DVEntry::GT)
251 Direction = '>';
252 else if (Dir == Dependence::DVEntry::EQ)
253 Direction = '=';
254 else
255 Direction = '*';
256 Dep.push_back(x: Direction);
257 }
258
259 // If the Dependence object doesn't have any information, fill the
260 // dependency vector with '*'.
261 if (D->isConfused()) {
262 assert(Dep.empty() && "Expected empty dependency vector");
263 Dep.assign(n: Level, val: '*');
264 }
265
266 while (Dep.size() != Level) {
267 Dep.push_back(x: 'I');
268 }
269
270 // If all the elements of any direction vector have only '*', legality
271 // can't be proven. Exit early to save compile time.
272 if (all_of(Range&: Dep, P: equal_to(Arg: '*'))) {
273 ORE->emit(RemarkBuilder: [&]() {
274 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
275 L->getStartLoc(), L->getHeader())
276 << "All loops have dependencies in all directions.";
277 });
278 return false;
279 }
280
281 // Test whether the dependency is forward or not.
282 bool IsKnownForward = true;
283 if (Src->getParent() != Dst->getParent()) {
284 // In general, when Src and Dst are in different BBs, the execution
285 // order of them within a single iteration is not guaranteed. Treat
286 // conservatively as not-forward dependency in this case.
287 IsKnownForward = false;
288 } else {
289 // Src and Dst are in the same BB. If they are the different
290 // instructions, Src should appear before Dst in the BB as they are
291 // stored to MemInstr in that order.
292 assert((Src == Dst || inThisOrder(Src, Dst)) &&
293 "Unexpected instructions");
294
295 // If the Dependence object is reversed (due to normalization), it
296 // represents the dependency from Dst to Src, meaning it is a backward
297 // dependency. Otherwise it should be a forward dependency.
298 bool IsReversed = D->getSrc() != Src;
299 if (IsReversed)
300 IsKnownForward = false;
301 }
302
303 // Initialize the last element. Assume forward dependencies only; it
304 // will be updated later if there is any non-forward dependency.
305 Dep.push_back(x: '<');
306
307 // The last element should express the "summary" among one or more
308 // direction vectors whose first N elements are the same (where N is
309 // the depth of the loop nest). Hence we exclude the last element from
310 // the Seen map.
311 auto [Ite, Inserted] = Seen.try_emplace(
312 Key: StringRef(Dep.data(), Dep.size() - 1), Args: DepMatrix.size());
313
314 // Make sure we only add unique entries to the dependency matrix.
315 if (Inserted)
316 DepMatrix.push_back(x: Dep);
317
318 // If we cannot prove that this dependency is forward, change the last
319 // element of the corresponding entry. Since a `[... *]` dependency
320 // includes a `[... <]` dependency, we do not need to keep both and
321 // change the existing entry instead.
322 if (!IsKnownForward)
323 DepMatrix[Ite->second].back() = '*';
324 }
325 }
326 }
327
328 return true;
329}
330
331// A loop is moved from index 'from' to an index 'to'. Update the Dependence
332// matrix by exchanging the two columns.
333static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
334 unsigned ToIndx) {
335 for (auto &Row : DepMatrix)
336 std::swap(a&: Row[ToIndx], b&: Row[FromIndx]);
337}
338
339// Check if a direction vector is lexicographically positive. Return true if it
340// is positive, nullopt if it is "zero", otherwise false.
341// [Theorem] A permutation of the loops in a perfect nest is legal if and only
342// if the direction matrix, after the same permutation is applied to its
343// columns, has no ">" direction as the leftmost non-"=" direction in any row.
344static std::optional<bool>
345isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
346 for (unsigned char Direction : DV.slice(N: Begin, M: End - Begin)) {
347 if (Direction == '<')
348 return true;
349 if (Direction == '>' || Direction == '*')
350 return false;
351 }
352 return std::nullopt;
353}
354
355// Checks if it is legal to interchange 2 loops.
356static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
357 unsigned InnerLoopId,
358 unsigned OuterLoopId) {
359 unsigned NumRows = DepMatrix.size();
360 std::vector<char> Cur;
361 // For each row check if it is valid to interchange.
362 for (unsigned Row = 0; Row < NumRows; ++Row) {
363 // Create temporary DepVector check its lexicographical order
364 // before and after swapping OuterLoop vs InnerLoop
365 Cur = DepMatrix[Row];
366
367 // If the surrounding loops already ensure that the direction vector is
368 // lexicographically positive, nothing within the loop will be able to break
369 // the dependence. In such a case we can skip the subsequent check.
370 if (isLexicographicallyPositive(DV: Cur, Begin: 0, End: OuterLoopId) == true)
371 continue;
372
373 // Check if the direction vector is lexicographically positive (or zero)
374 // for both before/after exchanged. Ignore the last element because it
375 // doesn't affect the legality.
376 if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size() - 1) == false)
377 return false;
378 std::swap(a&: Cur[InnerLoopId], b&: Cur[OuterLoopId]);
379 if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size() - 1) == false)
380 return false;
381 }
382 return true;
383}
384
385static void populateWorklist(Loop &L, LoopVector &LoopList) {
386 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
387 << L.getHeader()->getParent()->getName() << " Loop: %"
388 << L.getHeader()->getName() << '\n');
389 assert(LoopList.empty() && "LoopList should initially be empty!");
390 Loop *CurrentLoop = &L;
391 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
392 while (!Vec->empty()) {
393 // The current loop has multiple subloops in it hence it is not tightly
394 // nested.
395 // Discard all loops above it added into Worklist.
396 if (Vec->size() != 1) {
397 LoopList = {};
398 return;
399 }
400
401 LoopList.push_back(Elt: CurrentLoop);
402 CurrentLoop = Vec->front();
403 Vec = &CurrentLoop->getSubLoops();
404 }
405 LoopList.push_back(Elt: CurrentLoop);
406}
407
408static bool hasSupportedLoopDepth(ArrayRef<Loop *> LoopList,
409 OptimizationRemarkEmitter &ORE) {
410 unsigned LoopNestDepth = LoopList.size();
411 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
412 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
413 << ", the supported range is [" << MinLoopNestDepth
414 << ", " << MaxLoopNestDepth << "].\n");
415 Loop *OuterLoop = LoopList.front();
416 ORE.emit(RemarkBuilder: [&]() {
417 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
418 OuterLoop->getStartLoc(),
419 OuterLoop->getHeader())
420 << "Unsupported depth of loop nest, the supported range is ["
421 << std::to_string(val: MinLoopNestDepth) << ", "
422 << std::to_string(val: MaxLoopNestDepth) << "].\n";
423 });
424 return false;
425 }
426 return true;
427}
428
429static bool isComputableLoopNest(ScalarEvolution *SE,
430 ArrayRef<Loop *> LoopList) {
431 for (Loop *L : LoopList) {
432 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
433 if (isa<SCEVCouldNotCompute>(Val: ExitCountOuter)) {
434 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
435 return false;
436 }
437 if (L->getNumBackEdges() != 1) {
438 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
439 return false;
440 }
441 if (!L->getExitingBlock()) {
442 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
443 return false;
444 }
445 }
446 return true;
447}
448
449namespace {
450
451/// LoopInterchangeLegality checks if it is legal to interchange the loop.
452class LoopInterchangeLegality {
453public:
454 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
455 OptimizationRemarkEmitter *ORE, DominatorTree *DT)
456 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
457
458 /// Check if the loops can be interchanged.
459 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
460 CharMatrix &DepMatrix);
461
462 /// Discover induction PHIs in the header of \p L. Induction
463 /// PHIs are added to \p Inductions.
464 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
465
466 /// Check if the loop structure is understood. We do not handle triangular
467 /// loops for now.
468 bool isLoopStructureUnderstood();
469
470 bool currentLimitations();
471
472 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
473 return OuterInnerReductions;
474 }
475
476 const ArrayRef<PHINode *> getInnerLoopInductions() const {
477 return InnerLoopInductions;
478 }
479
480 ArrayRef<Instruction *> getHasNoWrapReductions() const {
481 return HasNoWrapReductions;
482 }
483
484 /// Record reductions in the inner loop. Currently supported reductions:
485 /// - initialized from a constant.
486 /// - reduction PHI node has only one user.
487 /// - located in the innermost loop.
488 struct InnerReduction {
489 /// The reduction itself.
490 PHINode *Reduction;
491 Value *Init;
492 Value *Next;
493 /// The Lcssa PHI.
494 PHINode *LcssaPhi;
495 /// Store reduction result into memory object.
496 StoreInst *LcssaStore;
497 /// The memory Location.
498 Value *MemRef;
499 Type *ElemTy;
500 };
501
502 ArrayRef<InnerReduction> getInnerReductions() const {
503 return InnerReductions;
504 }
505
506private:
507 bool tightlyNested(Loop *Outer, Loop *Inner);
508 bool containsUnsafeInstructions(BasicBlock *BB, Instruction *Skip);
509
510 /// Discover induction and reduction PHIs in the header of \p L. Induction
511 /// PHIs are added to \p Inductions, reductions are added to
512 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
513 /// to be passed as \p InnerLoop.
514 bool findInductionAndReductions(Loop *L,
515 SmallVector<PHINode *, 8> &Inductions,
516 Loop *InnerLoop);
517
518 /// Detect and record the reduction of the inner loop. Add them to
519 /// InnerReductions.
520 ///
521 /// innerloop:
522 /// Re = phi<0.0, Next>
523 /// Next = Re op ...
524 /// OuterLoopLatch:
525 /// Lcssa = phi<Next> ; lcssa phi
526 /// store Lcssa, MemRef ; LcssaStore
527 ///
528 bool isInnerReduction(Loop *L, PHINode *Phi,
529 SmallVectorImpl<Instruction *> &HasNoWrapInsts);
530
531 Loop *OuterLoop;
532 Loop *InnerLoop;
533
534 ScalarEvolution *SE;
535 DominatorTree *DT;
536
537 /// Interface to emit optimization remarks.
538 OptimizationRemarkEmitter *ORE;
539
540 /// Set of reduction PHIs taking part of a reduction across the inner and
541 /// outer loop.
542 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
543
544 /// Set of inner loop induction PHIs
545 SmallVector<PHINode *, 8> InnerLoopInductions;
546
547 /// Hold instructions that have nuw/nsw flags and involved in reductions,
548 /// like integer addition/multiplication. Those flags must be dropped when
549 /// interchanging the loops.
550 SmallVector<Instruction *, 4> HasNoWrapReductions;
551
552 /// Vector of reductions in the inner loop.
553 SmallVector<InnerReduction, 8> InnerReductions;
554};
555
556/// Manages information utilized by the profitability check for cache. The main
557/// purpose of this class is to delay the computation of CacheCost until it is
558/// actually needed.
559class CacheCostManager {
560 Loop *OutermostLoop;
561 LoopStandardAnalysisResults *AR;
562 DependenceInfo *DI;
563
564 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
565 /// that the result can be nullptr.
566 std::optional<std::unique_ptr<CacheCost>> CC;
567
568 /// Maps each loop to an index representing the optimal position within the
569 /// loop-nest, as determined by the cache cost analysis.
570 DenseMap<const Loop *, unsigned> CostMap;
571
572 void computeIfUnitinialized();
573
574public:
575 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
576 DependenceInfo *DI)
577 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
578 CacheCost *getCacheCost();
579 const DenseMap<const Loop *, unsigned> &getCostMap();
580};
581
582/// LoopInterchangeProfitability checks if it is profitable to interchange the
583/// loop.
584class LoopInterchangeProfitability {
585public:
586 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
587 OptimizationRemarkEmitter *ORE)
588 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
589
590 /// Check if the loop interchange is profitable.
591 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
592 unsigned InnerLoopId, unsigned OuterLoopId,
593 CharMatrix &DepMatrix, CacheCostManager &CCM);
594
595private:
596 int getInstrOrderCost();
597 std::optional<bool> isProfitablePerLoopCacheAnalysis(
598 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
599 std::optional<bool> isProfitablePerInstrOrderCost();
600 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
601 unsigned OuterLoopId,
602 CharMatrix &DepMatrix);
603 Loop *OuterLoop;
604 Loop *InnerLoop;
605
606 /// Scev analysis.
607 ScalarEvolution *SE;
608
609 /// Interface to emit optimization remarks.
610 OptimizationRemarkEmitter *ORE;
611};
612
613/// LoopInterchangeTransform interchanges the loop.
614class LoopInterchangeTransform {
615public:
616 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
617 LoopInfo *LI, DominatorTree *DT,
618 const LoopInterchangeLegality &LIL)
619 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
620
621 /// Interchange OuterLoop and InnerLoop.
622 bool transform(ArrayRef<Instruction *> DropNoWrapInsts);
623 void reduction2Memory();
624 void restructureLoops(Loop *NewInner, Loop *NewOuter,
625 BasicBlock *OrigInnerPreHeader,
626 BasicBlock *OrigOuterPreHeader);
627 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
628
629private:
630 bool adjustLoopLinks();
631 bool adjustLoopBranches();
632
633 Loop *OuterLoop;
634 Loop *InnerLoop;
635
636 /// Scev analysis.
637 ScalarEvolution *SE;
638
639 LoopInfo *LI;
640 DominatorTree *DT;
641
642 const LoopInterchangeLegality &LIL;
643};
644
645struct LoopInterchange {
646 ScalarEvolution *SE = nullptr;
647 LoopInfo *LI = nullptr;
648 DependenceInfo *DI = nullptr;
649 DominatorTree *DT = nullptr;
650 LoopStandardAnalysisResults *AR = nullptr;
651
652 /// Interface to emit optimization remarks.
653 OptimizationRemarkEmitter *ORE;
654
655 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
656 DominatorTree *DT, LoopStandardAnalysisResults *AR,
657 OptimizationRemarkEmitter *ORE)
658 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
659
660 bool run(Loop *L) {
661 if (L->getParentLoop())
662 return false;
663 SmallVector<Loop *, 8> LoopList;
664 populateWorklist(L&: *L, LoopList);
665 return processLoopList(LoopList);
666 }
667
668 bool run(LoopNest &LN) {
669 SmallVector<Loop *, 8> LoopList(LN.getLoops());
670 for (unsigned I = 1; I < LoopList.size(); ++I)
671 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
672 return false;
673 return processLoopList(LoopList);
674 }
675
676 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
677 // TODO: Add a better heuristic to select the loop to be interchanged based
678 // on the dependence matrix. Currently we select the innermost loop.
679 return LoopList.size() - 1;
680 }
681
682 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
683 bool Changed = false;
684
685 // Ensure proper loop nest depth.
686 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
687 "Unsupported depth of loop nest.");
688
689 unsigned LoopNestDepth = LoopList.size();
690
691 LLVM_DEBUG({
692 dbgs() << "Processing LoopList of size = " << LoopNestDepth
693 << " containing the following loops:\n";
694 for (auto *L : LoopList) {
695 dbgs() << " - ";
696 L->print(dbgs());
697 }
698 });
699
700 CharMatrix DependencyMatrix;
701 Loop *OuterMostLoop = *(LoopList.begin());
702 if (!populateDependencyMatrix(DepMatrix&: DependencyMatrix, Level: LoopNestDepth,
703 L: OuterMostLoop, DI, SE, ORE)) {
704 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
705 return false;
706 }
707
708 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
709 printDepMatrix(DependencyMatrix));
710
711 // Get the Outermost loop exit.
712 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
713 if (!LoopNestExit) {
714 LLVM_DEBUG(dbgs() << "OuterMostLoop '" << OuterMostLoop->getName()
715 << "' needs an unique exit block");
716 return false;
717 }
718
719 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
720 CacheCostManager CCM(LoopList[0], AR, DI);
721 // We try to achieve the globally optimal memory access for the loopnest,
722 // and do interchange based on a bubble-sort fasion. We start from
723 // the innermost loop, move it outwards to the best possible position
724 // and repeat this process.
725 for (unsigned j = SelecLoopId; j > 0; j--) {
726 bool ChangedPerIter = false;
727 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
728 bool Interchanged =
729 processLoop(LoopList, InnerLoopId: i, OuterLoopId: i - 1, DependencyMatrix, CCM);
730 ChangedPerIter |= Interchanged;
731 Changed |= Interchanged;
732 }
733 // Early abort if there was no interchange during an entire round of
734 // moving loops outwards.
735 if (!ChangedPerIter)
736 break;
737 }
738 return Changed;
739 }
740
741 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
742 unsigned OuterLoopId,
743 std::vector<std::vector<char>> &DependencyMatrix,
744 CacheCostManager &CCM) {
745 Loop *OuterLoop = LoopList[OuterLoopId];
746 Loop *InnerLoop = LoopList[InnerLoopId];
747 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
748 << " and OuterLoopId = " << OuterLoopId << "\n");
749 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
750 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DepMatrix&: DependencyMatrix)) {
751 LLVM_DEBUG(dbgs() << "Cannot prove legality, not interchanging loops '"
752 << OuterLoop->getName() << "' and '"
753 << InnerLoop->getName() << "'\n");
754 return false;
755 }
756 LLVM_DEBUG(dbgs() << "Loops '" << OuterLoop->getName() << "' and '"
757 << InnerLoop->getName()
758 << "' are legal to interchange\n");
759 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
760 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
761 DepMatrix&: DependencyMatrix, CCM)) {
762 LLVM_DEBUG(dbgs() << "Interchanging loops '" << OuterLoop->getName()
763 << "' and '" << InnerLoop->getName()
764 << "' not profitable.\n");
765 return false;
766 }
767
768 ORE->emit(RemarkBuilder: [&]() {
769 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
770 InnerLoop->getStartLoc(),
771 InnerLoop->getHeader())
772 << "Loop interchanged with enclosing loop.";
773 });
774
775 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
776 LIT.transform(DropNoWrapInsts: LIL.getHasNoWrapReductions());
777 LLVM_DEBUG(dbgs() << "Loops interchanged: outer loop '"
778 << OuterLoop->getName() << "' and inner loop '"
779 << InnerLoop->getName() << "'\n");
780 LoopsInterchanged++;
781
782 llvm::formLCSSARecursively(L&: *OuterLoop, DT: *DT, LI, SE);
783
784 // Loops interchanged, update LoopList accordingly.
785 std::swap(a&: LoopList[OuterLoopId], b&: LoopList[InnerLoopId]);
786 // Update the DependencyMatrix
787 interChangeDependencies(DepMatrix&: DependencyMatrix, FromIndx: InnerLoopId, ToIndx: OuterLoopId);
788
789 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
790 printDepMatrix(DependencyMatrix));
791
792 return true;
793 }
794};
795
796} // end anonymous namespace
797
798bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
799 Instruction *Skip) {
800 return any_of(Range&: *BB, P: [Skip](const Instruction &I) {
801 if (&I == Skip)
802 return false;
803 return I.mayHaveSideEffects() || I.mayReadFromMemory();
804 });
805}
806
807bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
808 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
809 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
810 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
811
812 LLVM_DEBUG(dbgs() << "Checking if loops '" << OuterLoop->getName()
813 << "' and '" << InnerLoop->getName()
814 << "' are tightly nested\n");
815
816 // A perfectly nested loop will not have any branch in between the outer and
817 // inner block i.e. outer header will branch to either inner preheader and
818 // outerloop latch.
819 BranchInst *OuterLoopHeaderBI =
820 dyn_cast<BranchInst>(Val: OuterLoopHeader->getTerminator());
821 if (!OuterLoopHeaderBI)
822 return false;
823
824 for (BasicBlock *Succ : successors(I: OuterLoopHeaderBI))
825 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
826 Succ != OuterLoopLatch)
827 return false;
828
829 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
830
831 // The inner loop reduction pattern requires storing the LCSSA PHI in
832 // the OuterLoop Latch. Therefore, when reduction2Memory is enabled, skip
833 // that store during checks.
834 Instruction *Skip = nullptr;
835 assert(InnerReductions.size() <= 1 &&
836 "So far we only support at most one reduction.");
837 if (InnerReductions.size() == 1)
838 Skip = InnerReductions[0].LcssaStore;
839
840 // We do not have any basic block in between now make sure the outer header
841 // and outer loop latch doesn't contain any unsafe instructions.
842 if (containsUnsafeInstructions(BB: OuterLoopHeader, Skip) ||
843 containsUnsafeInstructions(BB: OuterLoopLatch, Skip))
844 return false;
845
846 // Also make sure the inner loop preheader does not contain any unsafe
847 // instructions. Note that all instructions in the preheader will be moved to
848 // the outer loop header when interchanging.
849 if (InnerLoopPreHeader != OuterLoopHeader &&
850 containsUnsafeInstructions(BB: InnerLoopPreHeader, Skip))
851 return false;
852
853 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
854 // Ensure the inner loop exit block flows to the outer loop latch possibly
855 // through empty blocks.
856 const BasicBlock &SuccInner =
857 LoopNest::skipEmptyBlockUntil(From: InnerLoopExit, End: OuterLoopLatch);
858 if (&SuccInner != OuterLoopLatch) {
859 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
860 << " does not lead to the outer loop latch.\n";);
861 return false;
862 }
863 // The inner loop exit block does flow to the outer loop latch and not some
864 // other BBs, now make sure it contains safe instructions, since it will be
865 // moved into the (new) inner loop after interchange.
866 if (containsUnsafeInstructions(BB: InnerLoopExit, Skip))
867 return false;
868
869 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
870 // We have a perfect loop nest.
871 return true;
872}
873
874bool LoopInterchangeLegality::isLoopStructureUnderstood() {
875 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
876 for (PHINode *InnerInduction : InnerLoopInductions) {
877 unsigned Num = InnerInduction->getNumOperands();
878 for (unsigned i = 0; i < Num; ++i) {
879 Value *Val = InnerInduction->getOperand(i_nocapture: i);
880 if (isa<Constant>(Val))
881 continue;
882 Instruction *I = dyn_cast<Instruction>(Val);
883 if (!I)
884 return false;
885 // TODO: Handle triangular loops.
886 // e.g. for(int i=0;i<N;i++)
887 // for(int j=i;j<N;j++)
888 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
889 if (InnerInduction->getIncomingBlock(i: IncomBlockIndx) ==
890 InnerLoopPreheader &&
891 !OuterLoop->isLoopInvariant(V: I)) {
892 return false;
893 }
894 }
895 }
896
897 // TODO: Handle triangular loops of another form.
898 // e.g. for(int i=0;i<N;i++)
899 // for(int j=0;j<i;j++)
900 // or,
901 // for(int i=0;i<N;i++)
902 // for(int j=0;j*i<N;j++)
903 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
904 BranchInst *InnerLoopLatchBI =
905 dyn_cast<BranchInst>(Val: InnerLoopLatch->getTerminator());
906 if (!InnerLoopLatchBI->isConditional())
907 return false;
908 if (CmpInst *InnerLoopCmp =
909 dyn_cast<CmpInst>(Val: InnerLoopLatchBI->getCondition())) {
910 Value *Op0 = InnerLoopCmp->getOperand(i_nocapture: 0);
911 Value *Op1 = InnerLoopCmp->getOperand(i_nocapture: 1);
912
913 // LHS and RHS of the inner loop exit condition, e.g.,
914 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
915 Value *Left = nullptr;
916 Value *Right = nullptr;
917
918 // Check if V only involves inner loop induction variable.
919 // Return true if V is InnerInduction, or a cast from
920 // InnerInduction, or a binary operator that involves
921 // InnerInduction and a constant.
922 std::function<bool(Value *)> IsPathToInnerIndVar;
923 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
924 if (llvm::is_contained(Range&: InnerLoopInductions, Element: V))
925 return true;
926 if (isa<Constant>(Val: V))
927 return true;
928 const Instruction *I = dyn_cast<Instruction>(Val: V);
929 if (!I)
930 return false;
931 if (isa<CastInst>(Val: I))
932 return IsPathToInnerIndVar(I->getOperand(i: 0));
933 if (isa<BinaryOperator>(Val: I))
934 return IsPathToInnerIndVar(I->getOperand(i: 0)) &&
935 IsPathToInnerIndVar(I->getOperand(i: 1));
936 return false;
937 };
938
939 // In case of multiple inner loop indvars, it is okay if LHS and RHS
940 // are both inner indvar related variables.
941 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
942 return true;
943
944 // Otherwise we check if the cmp instruction compares an inner indvar
945 // related variable (Left) with a outer loop invariant (Right).
946 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Val: Op0)) {
947 Left = Op0;
948 Right = Op1;
949 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Val: Op1)) {
950 Left = Op1;
951 Right = Op0;
952 }
953
954 if (Left == nullptr)
955 return false;
956
957 const SCEV *S = SE->getSCEV(V: Right);
958 if (!SE->isLoopInvariant(S, L: OuterLoop))
959 return false;
960 }
961
962 return true;
963}
964
965// If SV is a LCSSA PHI node with a single incoming value, return the incoming
966// value.
967static Value *followLCSSA(Value *SV) {
968 PHINode *PHI = dyn_cast<PHINode>(Val: SV);
969 if (!PHI)
970 return SV;
971
972 if (PHI->getNumIncomingValues() != 1)
973 return SV;
974 return followLCSSA(SV: PHI->getIncomingValue(i: 0));
975}
976
977static bool checkReductionKind(Loop *L, PHINode *PHI,
978 SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
979 RecurrenceDescriptor RD;
980 if (RecurrenceDescriptor::isReductionPHI(Phi: PHI, TheLoop: L, RedDes&: RD)) {
981 // Detect floating point reduction only when it can be reordered.
982 if (RD.getExactFPMathInst() != nullptr)
983 return false;
984
985 RecurKind RK = RD.getRecurrenceKind();
986 switch (RK) {
987 case RecurKind::Or:
988 case RecurKind::And:
989 case RecurKind::Xor:
990 case RecurKind::SMin:
991 case RecurKind::SMax:
992 case RecurKind::UMin:
993 case RecurKind::UMax:
994 case RecurKind::FAdd:
995 case RecurKind::FMul:
996 case RecurKind::FMin:
997 case RecurKind::FMax:
998 case RecurKind::FMinimum:
999 case RecurKind::FMaximum:
1000 case RecurKind::FMinimumNum:
1001 case RecurKind::FMaximumNum:
1002 case RecurKind::FMulAdd:
1003 case RecurKind::AnyOf:
1004 return true;
1005
1006 // Change the order of integer addition/multiplication may change the
1007 // semantics. Consider the following case:
1008 //
1009 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
1010 // int sum = 0;
1011 // for (int i = 0; i < 2; i++)
1012 // for (int j = 0; j < 2; j++)
1013 // sum += A[j][i];
1014 //
1015 // If the above loops are exchanged, the addition will cause an
1016 // overflow. To prevent this, we must drop the nuw/nsw flags from the
1017 // addition/multiplication instructions when we actually exchanges the
1018 // loops.
1019 case RecurKind::Add:
1020 case RecurKind::Mul: {
1021 unsigned OpCode = RecurrenceDescriptor::getOpcode(Kind: RK);
1022 SmallVector<Instruction *, 4> Ops = RD.getReductionOpChain(Phi: PHI, L);
1023
1024 // Bail out when we fail to collect reduction instructions chain.
1025 if (Ops.empty())
1026 return false;
1027
1028 for (Instruction *I : Ops) {
1029 assert(I->getOpcode() == OpCode &&
1030 "Expected the instruction to be the reduction operation");
1031 (void)OpCode;
1032
1033 // If the instruction has nuw/nsw flags, we must drop them when the
1034 // transformation is actually performed.
1035 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
1036 HasNoWrapInsts.push_back(Elt: I);
1037 }
1038 return true;
1039 }
1040
1041 default:
1042 return false;
1043 }
1044 } else
1045 return false;
1046}
1047
1048// Check V's users to see if it is involved in a reduction in L.
1049static PHINode *
1050findInnerReductionPhi(Loop *L, Value *V,
1051 SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1052 // Reduction variables cannot be constants.
1053 if (isa<Constant>(Val: V))
1054 return nullptr;
1055
1056 for (Value *User : V->users()) {
1057 if (PHINode *PHI = dyn_cast<PHINode>(Val: User)) {
1058 if (PHI->getNumIncomingValues() == 1)
1059 continue;
1060
1061 if (checkReductionKind(L, PHI, HasNoWrapInsts))
1062 return PHI;
1063 else
1064 return nullptr;
1065 }
1066 }
1067
1068 return nullptr;
1069}
1070
1071bool LoopInterchangeLegality::isInnerReduction(
1072 Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1073
1074 // Only support reduction2Mem when the loop nest to be interchanged is
1075 // the innermost two loops.
1076 if (!L->isInnermost()) {
1077 LLVM_DEBUG(dbgs() << "Only supported when the loop is the innermost.\n");
1078 ORE->emit(RemarkBuilder: [&]() {
1079 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1080 L->getStartLoc(), L->getHeader())
1081 << "Only supported when the loop is the innermost.";
1082 });
1083 return false;
1084 }
1085
1086 if (Phi->getNumIncomingValues() != 2)
1087 return false;
1088
1089 Value *Init = Phi->getIncomingValueForBlock(BB: L->getLoopPreheader());
1090 Value *Next = Phi->getIncomingValueForBlock(BB: L->getLoopLatch());
1091
1092 // So far only supports constant initial value.
1093 if (!isa<Constant>(Val: Init)) {
1094 LLVM_DEBUG(
1095 dbgs()
1096 << "Only supported for the reduction with a constant initial value.\n");
1097 ORE->emit(RemarkBuilder: [&]() {
1098 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1099 L->getStartLoc(), L->getHeader())
1100 << "Only supported for the reduction with a constant initial "
1101 "value.";
1102 });
1103 return false;
1104 }
1105
1106 // The reduction result must live in the inner loop.
1107 if (Instruction *I = dyn_cast<Instruction>(Val: Next)) {
1108 BasicBlock *BB = I->getParent();
1109 if (!L->contains(BB))
1110 return false;
1111 }
1112
1113 // The reduction should have only one user.
1114 if (!Phi->hasOneUser())
1115 return false;
1116
1117 // Check the reduction kind.
1118 if (!checkReductionKind(L, PHI: Phi, HasNoWrapInsts))
1119 return false;
1120
1121 // Find lcssa_phi in OuterLoop's Latch
1122 BasicBlock *ExitBlock = L->getExitBlock();
1123 if (!ExitBlock)
1124 return false;
1125
1126 PHINode *Lcssa = NULL;
1127 for (auto *U : Next->users()) {
1128 if (auto *P = dyn_cast<PHINode>(Val: U)) {
1129 if (P == Phi)
1130 continue;
1131
1132 if (Lcssa == NULL && P->getParent() == ExitBlock &&
1133 P->getIncomingValueForBlock(BB: L->getLoopLatch()) == Next)
1134 Lcssa = P;
1135 else
1136 return false;
1137 } else
1138 return false;
1139 }
1140 if (!Lcssa)
1141 return false;
1142
1143 if (!Lcssa->hasOneUser()) {
1144 LLVM_DEBUG(dbgs() << "Only supported when the reduction is used once in "
1145 "the outer loop.\n");
1146 ORE->emit(RemarkBuilder: [&]() {
1147 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1148 L->getStartLoc(), L->getHeader())
1149 << "Only supported when the reduction is used once in the outer "
1150 "loop.";
1151 });
1152 return false;
1153 }
1154
1155 StoreInst *LcssaStore =
1156 dyn_cast<StoreInst>(Val: Lcssa->getUniqueUndroppableUser());
1157 if (!LcssaStore || LcssaStore->getParent() != ExitBlock)
1158 return false;
1159
1160 Value *MemRef = LcssaStore->getOperand(i_nocapture: 1);
1161 Type *ElemTy = LcssaStore->getOperand(i_nocapture: 0)->getType();
1162
1163 // LcssaStore stores the reduction result in BB.
1164 // When the reduction is initialized from a constant value, we need to load
1165 // from the memory object into the target basic block of the inner loop. This
1166 // means the memory reference was used prematurely. So we must ensure that the
1167 // memory reference does not dominate the target basic block.
1168 // TODO: Move the memory reference definition into the loop header.
1169 if (!DT->dominates(Def: dyn_cast<Instruction>(Val: MemRef), BB: L->getHeader())) {
1170 LLVM_DEBUG(dbgs() << "Only supported when memory reference dominate "
1171 "the inner loop.\n");
1172 ORE->emit(RemarkBuilder: [&]() {
1173 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1174 L->getStartLoc(), L->getHeader())
1175 << "Only supported when memory reference dominate the inner "
1176 "loop.";
1177 });
1178 return false;
1179 }
1180
1181 // Found a reduction in the inner loop.
1182 InnerReduction SR;
1183 SR.Reduction = Phi;
1184 SR.Init = Init;
1185 SR.Next = Next;
1186 SR.LcssaPhi = Lcssa;
1187 SR.LcssaStore = LcssaStore;
1188 SR.MemRef = MemRef;
1189 SR.ElemTy = ElemTy;
1190
1191 InnerReductions.push_back(Elt: SR);
1192 return true;
1193}
1194
1195bool LoopInterchangeLegality::findInductionAndReductions(
1196 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
1197 if (!L->getLoopLatch() || !L->getLoopPredecessor())
1198 return false;
1199 for (PHINode &PHI : L->getHeader()->phis()) {
1200 InductionDescriptor ID;
1201 if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID))
1202 Inductions.push_back(Elt: &PHI);
1203 else {
1204 // PHIs in inner loops need to be part of a reduction in the outer loop,
1205 // discovered when checking the PHIs of the outer loop earlier.
1206 if (!InnerLoop) {
1207 if (OuterInnerReductions.count(Ptr: &PHI)) {
1208 LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
1209 } else if (EnableReduction2Memory &&
1210 isInnerReduction(L, Phi: &PHI, HasNoWrapInsts&: HasNoWrapReductions)) {
1211 LLVM_DEBUG(dbgs() << "Found a reduction in the inner loop: \n"
1212 << PHI << '\n');
1213 } else
1214 return false;
1215 } else {
1216 assert(PHI.getNumIncomingValues() == 2 &&
1217 "Phis in loop header should have exactly 2 incoming values");
1218 // Check if we have a PHI node in the outer loop that has a reduction
1219 // result from the inner loop as an incoming value.
1220 Value *V = followLCSSA(SV: PHI.getIncomingValueForBlock(BB: L->getLoopLatch()));
1221 PHINode *InnerRedPhi =
1222 findInnerReductionPhi(L: InnerLoop, V, HasNoWrapInsts&: HasNoWrapReductions);
1223 if (!InnerRedPhi ||
1224 !llvm::is_contained(Range: InnerRedPhi->incoming_values(), Element: &PHI)) {
1225 LLVM_DEBUG(
1226 dbgs()
1227 << "Failed to recognize PHI as an induction or reduction.\n");
1228 return false;
1229 }
1230 OuterInnerReductions.insert(Ptr: &PHI);
1231 OuterInnerReductions.insert(Ptr: InnerRedPhi);
1232 }
1233 }
1234 }
1235
1236 // For now we only support at most one reduction.
1237 if (InnerReductions.size() > 1) {
1238 LLVM_DEBUG(dbgs() << "Only supports at most one reduction.\n");
1239 ORE->emit(RemarkBuilder: [&]() {
1240 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1241 L->getStartLoc(), L->getHeader())
1242 << "Only supports at most one reduction.";
1243 });
1244 return false;
1245 }
1246
1247 return true;
1248}
1249
1250// This function indicates the current limitations in the transform as a result
1251// of which we do not proceed.
1252bool LoopInterchangeLegality::currentLimitations() {
1253 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1254
1255 // transform currently expects the loop latches to also be the exiting
1256 // blocks.
1257 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
1258 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
1259 !isa<BranchInst>(Val: InnerLoopLatch->getTerminator()) ||
1260 !isa<BranchInst>(Val: OuterLoop->getLoopLatch()->getTerminator())) {
1261 LLVM_DEBUG(
1262 dbgs() << "Loops where the latch is not the exiting block are not"
1263 << " supported currently.\n");
1264 ORE->emit(RemarkBuilder: [&]() {
1265 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
1266 OuterLoop->getStartLoc(),
1267 OuterLoop->getHeader())
1268 << "Loops where the latch is not the exiting block cannot be"
1269 " interchange currently.";
1270 });
1271 return true;
1272 }
1273
1274 SmallVector<PHINode *, 8> Inductions;
1275 if (!findInductionAndReductions(L: OuterLoop, Inductions, InnerLoop)) {
1276 LLVM_DEBUG(
1277 dbgs() << "Only outer loops with induction or reduction PHI nodes "
1278 << "are supported currently.\n");
1279 ORE->emit(RemarkBuilder: [&]() {
1280 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
1281 OuterLoop->getStartLoc(),
1282 OuterLoop->getHeader())
1283 << "Only outer loops with induction or reduction PHI nodes can be"
1284 " interchanged currently.";
1285 });
1286 return true;
1287 }
1288
1289 Inductions.clear();
1290 // For multi-level loop nests, make sure that all phi nodes for inner loops
1291 // at all levels can be recognized as a induction or reduction phi. Bail out
1292 // if a phi node at a certain nesting level cannot be properly recognized.
1293 Loop *CurLevelLoop = OuterLoop;
1294 while (!CurLevelLoop->getSubLoops().empty()) {
1295 // We already made sure that the loop nest is tightly nested.
1296 CurLevelLoop = CurLevelLoop->getSubLoops().front();
1297 if (!findInductionAndReductions(L: CurLevelLoop, Inductions, InnerLoop: nullptr)) {
1298 LLVM_DEBUG(
1299 dbgs() << "Only inner loops with induction or reduction PHI nodes "
1300 << "are supported currently.\n");
1301 ORE->emit(RemarkBuilder: [&]() {
1302 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
1303 CurLevelLoop->getStartLoc(),
1304 CurLevelLoop->getHeader())
1305 << "Only inner loops with induction or reduction PHI nodes can be"
1306 " interchange currently.";
1307 });
1308 return true;
1309 }
1310 }
1311
1312 // TODO: Triangular loops are not handled for now.
1313 if (!isLoopStructureUnderstood()) {
1314 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1315 ORE->emit(RemarkBuilder: [&]() {
1316 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
1317 InnerLoop->getStartLoc(),
1318 InnerLoop->getHeader())
1319 << "Inner loop structure not understood currently.";
1320 });
1321 return true;
1322 }
1323
1324 return false;
1325}
1326
1327bool LoopInterchangeLegality::findInductions(
1328 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
1329 for (PHINode &PHI : L->getHeader()->phis()) {
1330 InductionDescriptor ID;
1331 if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID))
1332 Inductions.push_back(Elt: &PHI);
1333 }
1334 return !Inductions.empty();
1335}
1336
1337// We currently only support LCSSA PHI nodes in the inner loop exit, if their
1338// users are either reduction PHIs or PHIs outside the outer loop (which means
1339// the we are only interested in the final value after the loop).
1340static bool
1341areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
1342 SmallPtrSetImpl<PHINode *> &Reductions,
1343 PHINode *LcssaReduction) {
1344 BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
1345 for (PHINode &PHI : InnerExit->phis()) {
1346 // The reduction LCSSA PHI will have only one incoming block, which comes
1347 // from the loop latch.
1348 if (PHI.getNumIncomingValues() > 1)
1349 return false;
1350 if (&PHI == LcssaReduction)
1351 return true;
1352 if (any_of(Range: PHI.users(), P: [&Reductions, OuterL](User *U) {
1353 PHINode *PN = dyn_cast<PHINode>(Val: U);
1354 return !PN ||
1355 (!Reductions.count(Ptr: PN) && OuterL->contains(BB: PN->getParent()));
1356 })) {
1357 return false;
1358 }
1359 }
1360 return true;
1361}
1362
1363// We currently support LCSSA PHI nodes in the outer loop exit, if their
1364// incoming values do not come from the outer loop latch or if the
1365// outer loop latch has a single predecessor. In that case, the value will
1366// be available if both the inner and outer loop conditions are true, which
1367// will still be true after interchanging. If we have multiple predecessor,
1368// that may not be the case, e.g. because the outer loop latch may be executed
1369// if the inner loop is not executed.
1370static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1371 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1372 for (PHINode &PHI : LoopNestExit->phis()) {
1373 for (Value *Incoming : PHI.incoming_values()) {
1374 Instruction *IncomingI = dyn_cast<Instruction>(Val: Incoming);
1375 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1376 continue;
1377
1378 // The incoming value is defined in the outer loop latch. Currently we
1379 // only support that in case the outer loop latch has a single predecessor.
1380 // This guarantees that the outer loop latch is executed if and only if
1381 // the inner loop is executed (because tightlyNested() guarantees that the
1382 // outer loop header only branches to the inner loop or the outer loop
1383 // latch).
1384 // FIXME: We could weaken this logic and allow multiple predecessors,
1385 // if the values are produced outside the loop latch. We would need
1386 // additional logic to update the PHI nodes in the exit block as
1387 // well.
1388 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1389 return false;
1390 }
1391 }
1392 return true;
1393}
1394
1395// In case of multi-level nested loops, it may occur that lcssa phis exist in
1396// the latch of InnerLoop, i.e., when defs of the incoming values are further
1397// inside the loopnest. Sometimes those incoming values are not available
1398// after interchange, since the original inner latch will become the new outer
1399// latch which may have predecessor paths that do not include those incoming
1400// values.
1401// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1402// multi-level loop nests.
1403static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1404 if (InnerLoop->getSubLoops().empty())
1405 return true;
1406 // If the original outer latch has only one predecessor, then values defined
1407 // further inside the looploop, e.g., in the innermost loop, will be available
1408 // at the new outer latch after interchange.
1409 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1410 return true;
1411
1412 // The outer latch has more than one predecessors, i.e., the inner
1413 // exit and the inner header.
1414 // PHI nodes in the inner latch are lcssa phis where the incoming values
1415 // are defined further inside the loopnest. Check if those phis are used
1416 // in the original inner latch. If that is the case then bail out since
1417 // those incoming values may not be available at the new outer latch.
1418 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1419 for (PHINode &PHI : InnerLoopLatch->phis()) {
1420 for (auto *U : PHI.users()) {
1421 Instruction *UI = cast<Instruction>(Val: U);
1422 if (InnerLoopLatch == UI->getParent())
1423 return false;
1424 }
1425 }
1426 return true;
1427}
1428
1429bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1430 unsigned OuterLoopId,
1431 CharMatrix &DepMatrix) {
1432 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1433 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1434 << " and OuterLoopId = " << OuterLoopId
1435 << " due to dependence\n");
1436 ORE->emit(RemarkBuilder: [&]() {
1437 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1438 InnerLoop->getStartLoc(),
1439 InnerLoop->getHeader())
1440 << "Cannot interchange loops due to dependences.";
1441 });
1442 return false;
1443 }
1444 // Check if outer and inner loop contain legal instructions only.
1445 for (auto *BB : OuterLoop->blocks())
1446 for (Instruction &I : BB->instructionsWithoutDebug())
1447 if (CallInst *CI = dyn_cast<CallInst>(Val: &I)) {
1448 // readnone functions do not prevent interchanging.
1449 if (CI->onlyWritesMemory())
1450 continue;
1451 LLVM_DEBUG(
1452 dbgs() << "Loops with call instructions cannot be interchanged "
1453 << "safely.");
1454 ORE->emit(RemarkBuilder: [&]() {
1455 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
1456 CI->getDebugLoc(),
1457 CI->getParent())
1458 << "Cannot interchange loops due to call instruction.";
1459 });
1460
1461 return false;
1462 }
1463
1464 if (!findInductions(L: InnerLoop, Inductions&: InnerLoopInductions)) {
1465 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
1466 return false;
1467 }
1468
1469 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1470 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1471 ORE->emit(RemarkBuilder: [&]() {
1472 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1473 InnerLoop->getStartLoc(),
1474 InnerLoop->getHeader())
1475 << "Cannot interchange loops because unsupported PHI nodes found "
1476 "in inner loop latch.";
1477 });
1478 return false;
1479 }
1480
1481 // TODO: The loops could not be interchanged due to current limitations in the
1482 // transform module.
1483 if (currentLimitations()) {
1484 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1485 return false;
1486 }
1487
1488 // Check if the loops are tightly nested.
1489 if (!tightlyNested(OuterLoop, InnerLoop)) {
1490 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1491 ORE->emit(RemarkBuilder: [&]() {
1492 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1493 InnerLoop->getStartLoc(),
1494 InnerLoop->getHeader())
1495 << "Cannot interchange loops because they are not tightly "
1496 "nested.";
1497 });
1498 return false;
1499 }
1500
1501 // The LCSSA PHI for the reduction has passed checks before; its user
1502 // is a store instruction.
1503 PHINode *LcssaReduction = nullptr;
1504 assert(InnerReductions.size() <= 1 &&
1505 "So far we only support at most one reduction.");
1506 if (InnerReductions.size() == 1)
1507 LcssaReduction = InnerReductions[0].LcssaPhi;
1508
1509 if (!areInnerLoopExitPHIsSupported(InnerL: OuterLoop, OuterL: InnerLoop, Reductions&: OuterInnerReductions,
1510 LcssaReduction)) {
1511 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1512 ORE->emit(RemarkBuilder: [&]() {
1513 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1514 InnerLoop->getStartLoc(),
1515 InnerLoop->getHeader())
1516 << "Found unsupported PHI node in loop exit.";
1517 });
1518 return false;
1519 }
1520
1521 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1522 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1523 ORE->emit(RemarkBuilder: [&]() {
1524 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1525 OuterLoop->getStartLoc(),
1526 OuterLoop->getHeader())
1527 << "Found unsupported PHI node in loop exit.";
1528 });
1529 return false;
1530 }
1531
1532 return true;
1533}
1534
1535void CacheCostManager::computeIfUnitinialized() {
1536 if (CC.has_value())
1537 return;
1538
1539 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1540 CC = CacheCost::getCacheCost(Root&: *OutermostLoop, AR&: *AR, DI&: *DI);
1541 // Obtain the loop vector returned from loop cache analysis beforehand,
1542 // and put each <Loop, index> pair into a map for constant time query
1543 // later. Indices in loop vector reprsent the optimal order of the
1544 // corresponding loop, e.g., given a loopnest with depth N, index 0
1545 // indicates the loop should be placed as the outermost loop and index N
1546 // indicates the loop should be placed as the innermost loop.
1547 //
1548 // For the old pass manager CacheCost would be null.
1549 if (*CC != nullptr)
1550 for (const auto &[Idx, Cost] : enumerate(First: (*CC)->getLoopCosts()))
1551 CostMap[Cost.first] = Idx;
1552}
1553
1554CacheCost *CacheCostManager::getCacheCost() {
1555 computeIfUnitinialized();
1556 return CC->get();
1557}
1558
1559const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1560 computeIfUnitinialized();
1561 return CostMap;
1562}
1563
1564int LoopInterchangeProfitability::getInstrOrderCost() {
1565 unsigned GoodOrder, BadOrder;
1566 BadOrder = GoodOrder = 0;
1567 for (BasicBlock *BB : InnerLoop->blocks()) {
1568 for (Instruction &Ins : *BB) {
1569 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &Ins)) {
1570 bool FoundInnerInduction = false;
1571 bool FoundOuterInduction = false;
1572 for (Value *Op : GEP->operands()) {
1573 // Skip operands that are not SCEV-able.
1574 if (!SE->isSCEVable(Ty: Op->getType()))
1575 continue;
1576
1577 const SCEV *OperandVal = SE->getSCEV(V: Op);
1578 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: OperandVal);
1579 if (!AR)
1580 continue;
1581
1582 // If we find the inner induction after an outer induction e.g.
1583 // for(int i=0;i<N;i++)
1584 // for(int j=0;j<N;j++)
1585 // A[i][j] = A[i-1][j-1]+k;
1586 // then it is a good order.
1587 if (AR->getLoop() == InnerLoop) {
1588 // We found an InnerLoop induction after OuterLoop induction. It is
1589 // a good order.
1590 FoundInnerInduction = true;
1591 if (FoundOuterInduction) {
1592 GoodOrder++;
1593 break;
1594 }
1595 }
1596 // If we find the outer induction after an inner induction e.g.
1597 // for(int i=0;i<N;i++)
1598 // for(int j=0;j<N;j++)
1599 // A[j][i] = A[j-1][i-1]+k;
1600 // then it is a bad order.
1601 if (AR->getLoop() == OuterLoop) {
1602 // We found an OuterLoop induction after InnerLoop induction. It is
1603 // a bad order.
1604 FoundOuterInduction = true;
1605 if (FoundInnerInduction) {
1606 BadOrder++;
1607 break;
1608 }
1609 }
1610 }
1611 }
1612 }
1613 }
1614 return GoodOrder - BadOrder;
1615}
1616
1617std::optional<bool>
1618LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1619 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
1620 // This is the new cost model returned from loop cache analysis.
1621 // A smaller index means the loop should be placed an outer loop, and vice
1622 // versa.
1623 auto InnerLoopIt = CostMap.find(Val: InnerLoop);
1624 if (InnerLoopIt == CostMap.end())
1625 return std::nullopt;
1626 auto OuterLoopIt = CostMap.find(Val: OuterLoop);
1627 if (OuterLoopIt == CostMap.end())
1628 return std::nullopt;
1629
1630 if (CC->getLoopCost(L: *OuterLoop) == CC->getLoopCost(L: *InnerLoop))
1631 return std::nullopt;
1632 unsigned InnerIndex = InnerLoopIt->second;
1633 unsigned OuterIndex = OuterLoopIt->second;
1634 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1635 << ", OuterIndex = " << OuterIndex << "\n");
1636 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1637 "numbers to each loop");
1638 return std::optional<bool>(InnerIndex < OuterIndex);
1639}
1640
1641std::optional<bool>
1642LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1643 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1644 // good and bad order of induction variables in the instruction and allows
1645 // reordering if number of bad orders is more than good.
1646 int Cost = getInstrOrderCost();
1647 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1648 if (Cost < 0 && Cost < LoopInterchangeCostThreshold)
1649 return std::optional<bool>(true);
1650
1651 return std::nullopt;
1652}
1653
1654/// Return true if we can vectorize the loop specified by \p LoopId.
1655static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1656 for (const auto &Dep : DepMatrix) {
1657 char Dir = Dep[LoopId];
1658 char DepType = Dep.back();
1659 assert((DepType == '<' || DepType == '*') &&
1660 "Unexpected element in dependency vector");
1661
1662 // There are no loop-carried dependencies.
1663 if (Dir == '=' || Dir == 'I')
1664 continue;
1665
1666 // DepType being '<' means that this direction vector represents a forward
1667 // dependency. In principle, a loop with '<' direction can be vectorized in
1668 // this case.
1669 if (Dir == '<' && DepType == '<')
1670 continue;
1671
1672 // We cannot prove that the loop is vectorizable.
1673 return false;
1674 }
1675 return true;
1676}
1677
1678std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1679 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1680 // If the outer loop cannot be vectorized, it is not profitable to move this
1681 // to inner position.
1682 if (!canVectorize(DepMatrix, LoopId: OuterLoopId))
1683 return false;
1684
1685 // If the inner loop cannot be vectorized but the outer loop can be, then it
1686 // is profitable to interchange to enable inner loop parallelism.
1687 if (!canVectorize(DepMatrix, LoopId: InnerLoopId))
1688 return true;
1689
1690 // If both the inner and the outer loop can be vectorized, it is necessary to
1691 // check the cost of each vectorized loop for profitability decision. At this
1692 // time we do not have a cost model to estimate them, so return nullopt.
1693 // TODO: Estimate the cost of vectorized loop when both the outer and the
1694 // inner loop can be vectorized.
1695 return std::nullopt;
1696}
1697
1698bool LoopInterchangeProfitability::isProfitable(
1699 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1700 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1701 // Do not consider loops with a backedge that isn't taken, e.g. an
1702 // unconditional branch true/false, as candidates for interchange.
1703 // TODO: when interchange is forced, we should probably also allow
1704 // interchange for these loops, and thus this logic should be moved just
1705 // below the cost-model ignore check below. But this check is done first
1706 // to avoid the issue in #163954.
1707 const SCEV *InnerBTC = SE->getBackedgeTakenCount(L: InnerLoop);
1708 const SCEV *OuterBTC = SE->getBackedgeTakenCount(L: OuterLoop);
1709 if (InnerBTC && InnerBTC->isZero()) {
1710 LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1711 "single iteration loop\n");
1712 return false;
1713 }
1714 if (OuterBTC && OuterBTC->isZero()) {
1715 LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1716 "single iteration loop\n");
1717 return false;
1718 }
1719
1720 // Return true if interchange is forced and the cost-model ignored.
1721 if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore)
1722 return true;
1723 assert(noDuplicateRulesAndIgnore(Profitabilities) &&
1724 "Duplicate rules and option 'ignore' are not allowed");
1725
1726 // isProfitable() is structured to avoid endless loop interchange. If the
1727 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1728 // decide the profitability then, profitability check will stop and return the
1729 // analysis result. If it failed to determine it (e.g., cache analysis failed
1730 // to analyze the loopnest due to delinearization issues) then go ahead the
1731 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1732 // Likewise, if it failed to analysis the profitability then only, the last
1733 // rule (isProfitableForVectorization by default) will decide.
1734 std::optional<bool> shouldInterchange;
1735 for (RuleTy RT : Profitabilities) {
1736 switch (RT) {
1737 case RuleTy::PerLoopCacheAnalysis: {
1738 CacheCost *CC = CCM.getCacheCost();
1739 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
1740 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1741 break;
1742 }
1743 case RuleTy::PerInstrOrderCost:
1744 shouldInterchange = isProfitablePerInstrOrderCost();
1745 break;
1746 case RuleTy::ForVectorization:
1747 shouldInterchange =
1748 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1749 break;
1750 case RuleTy::Ignore:
1751 llvm_unreachable("Option 'ignore' is not supported with other options");
1752 break;
1753 }
1754
1755 // If this rule could determine the profitability, don't call subsequent
1756 // rules.
1757 if (shouldInterchange.has_value())
1758 break;
1759 }
1760
1761 if (!shouldInterchange.has_value()) {
1762 ORE->emit(RemarkBuilder: [&]() {
1763 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1764 InnerLoop->getStartLoc(),
1765 InnerLoop->getHeader())
1766 << "Insufficient information to calculate the cost of loop for "
1767 "interchange.";
1768 });
1769 return false;
1770 } else if (!shouldInterchange.value()) {
1771 ORE->emit(RemarkBuilder: [&]() {
1772 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1773 InnerLoop->getStartLoc(),
1774 InnerLoop->getHeader())
1775 << "Interchanging loops is not considered to improve cache "
1776 "locality nor vectorization.";
1777 });
1778 return false;
1779 }
1780 return true;
1781}
1782
1783void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1784 Loop *InnerLoop) {
1785 for (Loop *L : *OuterLoop)
1786 if (L == InnerLoop) {
1787 OuterLoop->removeChildLoop(Child: L);
1788 return;
1789 }
1790 llvm_unreachable("Couldn't find loop");
1791}
1792
1793/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1794/// new inner and outer loop after interchanging: NewInner is the original
1795/// outer loop and NewOuter is the original inner loop.
1796///
1797/// Before interchanging, we have the following structure
1798/// Outer preheader
1799// Outer header
1800// Inner preheader
1801// Inner header
1802// Inner body
1803// Inner latch
1804// outer bbs
1805// Outer latch
1806//
1807// After interchanging:
1808// Inner preheader
1809// Inner header
1810// Outer preheader
1811// Outer header
1812// Inner body
1813// outer bbs
1814// Outer latch
1815// Inner latch
1816void LoopInterchangeTransform::restructureLoops(
1817 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1818 BasicBlock *OrigOuterPreHeader) {
1819 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1820 // The original inner loop preheader moves from the new inner loop to
1821 // the parent loop, if there is one.
1822 NewInner->removeBlockFromLoop(BB: OrigInnerPreHeader);
1823 LI->changeLoopFor(BB: OrigInnerPreHeader, L: OuterLoopParent);
1824
1825 // Switch the loop levels.
1826 if (OuterLoopParent) {
1827 // Remove the loop from its parent loop.
1828 removeChildLoop(OuterLoop: OuterLoopParent, InnerLoop: NewInner);
1829 removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter);
1830 OuterLoopParent->addChildLoop(NewChild: NewOuter);
1831 } else {
1832 removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter);
1833 LI->changeTopLevelLoop(OldLoop: NewInner, NewLoop: NewOuter);
1834 }
1835 while (!NewOuter->isInnermost())
1836 NewInner->addChildLoop(NewChild: NewOuter->removeChildLoop(I: NewOuter->begin()));
1837 NewOuter->addChildLoop(NewChild: NewInner);
1838
1839 // BBs from the original inner loop.
1840 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1841
1842 // Add BBs from the original outer loop to the original inner loop (excluding
1843 // BBs already in inner loop)
1844 for (BasicBlock *BB : NewInner->blocks())
1845 if (LI->getLoopFor(BB) == NewInner)
1846 NewOuter->addBlockEntry(BB);
1847
1848 // Now remove inner loop header and latch from the new inner loop and move
1849 // other BBs (the loop body) to the new inner loop.
1850 BasicBlock *OuterHeader = NewOuter->getHeader();
1851 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1852 for (BasicBlock *BB : OrigInnerBBs) {
1853 // Nothing will change for BBs in child loops.
1854 if (LI->getLoopFor(BB) != NewOuter)
1855 continue;
1856 // Remove the new outer loop header and latch from the new inner loop.
1857 if (BB == OuterHeader || BB == OuterLatch)
1858 NewInner->removeBlockFromLoop(BB);
1859 else
1860 LI->changeLoopFor(BB, L: NewInner);
1861 }
1862
1863 // The preheader of the original outer loop becomes part of the new
1864 // outer loop.
1865 NewOuter->addBlockEntry(BB: OrigOuterPreHeader);
1866 LI->changeLoopFor(BB: OrigOuterPreHeader, L: NewOuter);
1867
1868 // Tell SE that we move the loops around.
1869 SE->forgetLoop(L: NewOuter);
1870}
1871
1872/// User can write, or optimizers can generate the reduction for inner loop.
1873/// To make the interchange valid, apply Reduction2Mem by moving the
1874/// initializer and store instructions into the inner loop. So far we only
1875/// handle cases where the reduction variable is initialized to a constant.
1876/// For example, below code:
1877///
1878/// loop:
1879/// re = phi<0.0, next>
1880/// next = re op ...
1881/// endloop
1882/// reduc_sum = phi<next> // lcssa phi
1883/// MEM_REF[idx] = reduc_sum // LcssaStore
1884///
1885/// is transformed into:
1886///
1887/// loop:
1888/// tmp = MEM_REF[idx];
1889/// new_var = !first_iteration ? tmp : 0.0;
1890/// next = new_var op ...
1891/// MEM_REF[idx] = next; // after moving
1892/// endloop
1893///
1894/// In this way the initial const is used in the first iteration of loop.
1895void LoopInterchangeTransform::reduction2Memory() {
1896 ArrayRef<LoopInterchangeLegality::InnerReduction> InnerReductions =
1897 LIL.getInnerReductions();
1898
1899 assert(InnerReductions.size() == 1 &&
1900 "So far we only support at most one reduction.");
1901
1902 LoopInterchangeLegality::InnerReduction SR = InnerReductions[0];
1903 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1904 IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
1905
1906 // Check if it's the first iteration.
1907 LLVMContext &Context = InnerLoopHeader->getContext();
1908 PHINode *FirstIter =
1909 Builder.CreatePHI(Ty: Type::getInt1Ty(C&: Context), NumReservedValues: 2, Name: "first.iter");
1910 FirstIter->addIncoming(V: ConstantInt::get(Ty: Type::getInt1Ty(C&: Context), V: 1),
1911 BB: InnerLoop->getLoopPreheader());
1912 FirstIter->addIncoming(V: ConstantInt::get(Ty: Type::getInt1Ty(C&: Context), V: 0),
1913 BB: InnerLoop->getLoopLatch());
1914 assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
1915
1916 // When the reduction is initialized from a constant value, we need to add
1917 // a stmt loading from the memory object to target basic block in inner
1918 // loop.
1919 Instruction *LoadMem = Builder.CreateLoad(Ty: SR.ElemTy, Ptr: SR.MemRef);
1920
1921 // Init new_var to MEM_REF or CONST depending on if it is the first iteration.
1922 Value *NewVar = Builder.CreateSelect(C: FirstIter, True: SR.Init, False: LoadMem, Name: "new.var");
1923
1924 // Replace all uses of the reduction variable with a new variable.
1925 SR.Reduction->replaceAllUsesWith(V: NewVar);
1926
1927 // Move store instruction into inner loop, just after reduction next's
1928 // definition.
1929 SR.LcssaStore->setOperand(i_nocapture: 0, Val_nocapture: SR.Next);
1930 SR.LcssaStore->moveAfter(MovePos: dyn_cast<Instruction>(Val: SR.Next));
1931}
1932
1933bool LoopInterchangeTransform::transform(
1934 ArrayRef<Instruction *> DropNoWrapInsts) {
1935 bool Transformed = false;
1936
1937 ArrayRef<LoopInterchangeLegality::InnerReduction> InnerReductions =
1938 LIL.getInnerReductions();
1939 if (InnerReductions.size() == 1)
1940 reduction2Memory();
1941
1942 if (InnerLoop->getSubLoops().empty()) {
1943 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1944 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1945 auto &InductionPHIs = LIL.getInnerLoopInductions();
1946 if (InductionPHIs.empty()) {
1947 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1948 return false;
1949 }
1950
1951 SmallVector<Instruction *, 8> InnerIndexVarList;
1952 for (PHINode *CurInductionPHI : InductionPHIs) {
1953 if (CurInductionPHI->getIncomingBlock(i: 0) == InnerLoopPreHeader)
1954 InnerIndexVarList.push_back(
1955 Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: 1)));
1956 else
1957 InnerIndexVarList.push_back(
1958 Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: 0)));
1959 }
1960
1961 // Create a new latch block for the inner loop. We split at the
1962 // current latch's terminator and then move the condition and all
1963 // operands that are not either loop-invariant or the induction PHI into the
1964 // new latch block.
1965 BasicBlock *NewLatch =
1966 SplitBlock(Old: InnerLoop->getLoopLatch(),
1967 SplitPt: InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1968
1969 SmallSetVector<Instruction *, 4> WorkList;
1970 unsigned i = 0;
1971 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1972 for (; i < WorkList.size(); i++) {
1973 // Duplicate instruction and move it the new latch. Update uses that
1974 // have been moved.
1975 Instruction *NewI = WorkList[i]->clone();
1976 NewI->insertBefore(InsertPos: NewLatch->getFirstNonPHIIt());
1977 assert(!NewI->mayHaveSideEffects() &&
1978 "Moving instructions with side-effects may change behavior of "
1979 "the loop nest!");
1980 for (Use &U : llvm::make_early_inc_range(Range: WorkList[i]->uses())) {
1981 Instruction *UserI = cast<Instruction>(Val: U.getUser());
1982 if (!InnerLoop->contains(BB: UserI->getParent()) ||
1983 UserI->getParent() == NewLatch ||
1984 llvm::is_contained(Range: InductionPHIs, Element: UserI))
1985 U.set(NewI);
1986 }
1987 // Add operands of moved instruction to the worklist, except if they are
1988 // outside the inner loop or are the induction PHI.
1989 for (Value *Op : WorkList[i]->operands()) {
1990 Instruction *OpI = dyn_cast<Instruction>(Val: Op);
1991 if (!OpI ||
1992 this->LI->getLoopFor(BB: OpI->getParent()) != this->InnerLoop ||
1993 llvm::is_contained(Range: InductionPHIs, Element: OpI))
1994 continue;
1995 WorkList.insert(X: OpI);
1996 }
1997 }
1998 };
1999
2000 // FIXME: Should we interchange when we have a constant condition?
2001 Instruction *CondI = dyn_cast<Instruction>(
2002 Val: cast<BranchInst>(Val: InnerLoop->getLoopLatch()->getTerminator())
2003 ->getCondition());
2004 if (CondI)
2005 WorkList.insert(X: CondI);
2006 MoveInstructions();
2007 for (Instruction *InnerIndexVar : InnerIndexVarList)
2008 WorkList.insert(X: cast<Instruction>(Val: InnerIndexVar));
2009 MoveInstructions();
2010 }
2011
2012 // Ensure the inner loop phi nodes have a separate basic block.
2013 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2014 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
2015 InnerLoopHeader->getTerminator()) {
2016 SplitBlock(Old: InnerLoopHeader, SplitPt: InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
2017 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
2018 }
2019
2020 // Instructions in the original inner loop preheader may depend on values
2021 // defined in the outer loop header. Move them there, because the original
2022 // inner loop preheader will become the entry into the interchanged loop nest.
2023 // Currently we move all instructions and rely on LICM to move invariant
2024 // instructions outside the loop nest.
2025 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2026 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2027 if (InnerLoopPreHeader != OuterLoopHeader) {
2028 for (Instruction &I :
2029 make_early_inc_range(Range: make_range(x: InnerLoopPreHeader->begin(),
2030 y: std::prev(x: InnerLoopPreHeader->end()))))
2031 I.moveBeforePreserving(MovePos: OuterLoopHeader->getTerminator()->getIterator());
2032 }
2033
2034 Transformed |= adjustLoopLinks();
2035 if (!Transformed) {
2036 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
2037 return false;
2038 }
2039
2040 // Finally, drop the nsw/nuw flags from the instructions for reduction
2041 // calculations.
2042 for (Instruction *Reduction : DropNoWrapInsts) {
2043 Reduction->setHasNoSignedWrap(false);
2044 Reduction->setHasNoUnsignedWrap(false);
2045 }
2046
2047 return true;
2048}
2049
2050/// \brief Move all instructions except the terminator from FromBB right before
2051/// InsertBefore
2052static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
2053 BasicBlock *ToBB = InsertBefore->getParent();
2054
2055 ToBB->splice(ToIt: InsertBefore->getIterator(), FromBB, FromBeginIt: FromBB->begin(),
2056 FromEndIt: FromBB->getTerminator()->getIterator());
2057}
2058
2059/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
2060static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
2061 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
2062 // from BB1 afterwards.
2063 auto Iter = map_range(C&: *BB1, F: [](Instruction &I) { return &I; });
2064 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(x: Iter.end()));
2065 for (Instruction *I : TempInstrs)
2066 I->removeFromParent();
2067
2068 // Move instructions from BB2 to BB1.
2069 moveBBContents(FromBB: BB2, InsertBefore: BB1->getTerminator());
2070
2071 // Move instructions from TempInstrs to BB2.
2072 for (Instruction *I : TempInstrs)
2073 I->insertBefore(InsertPos: BB2->getTerminator()->getIterator());
2074}
2075
2076// Update BI to jump to NewBB instead of OldBB. Records updates to the
2077// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
2078// \p OldBB is exactly once in BI's successor list.
2079static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
2080 BasicBlock *NewBB,
2081 std::vector<DominatorTree::UpdateType> &DTUpdates,
2082 bool MustUpdateOnce = true) {
2083 assert((!MustUpdateOnce || llvm::count(successors(BI), OldBB) == 1) &&
2084 "BI must jump to OldBB exactly once.");
2085 bool Changed = false;
2086 for (Use &Op : BI->operands())
2087 if (Op == OldBB) {
2088 Op.set(NewBB);
2089 Changed = true;
2090 }
2091
2092 if (Changed) {
2093 DTUpdates.push_back(
2094 x: {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
2095 DTUpdates.push_back(
2096 x: {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
2097 }
2098 assert(Changed && "Expected a successor to be updated");
2099}
2100
2101// Move Lcssa PHIs to the right place.
2102static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
2103 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
2104 BasicBlock *OuterLatch, BasicBlock *OuterExit,
2105 Loop *InnerLoop, LoopInfo *LI) {
2106
2107 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
2108 // defined either in the header or latch. Those blocks will become header and
2109 // latch of the new outer loop, and the only possible users can PHI nodes
2110 // in the exit block of the loop nest or the outer loop header (reduction
2111 // PHIs, in that case, the incoming value must be defined in the inner loop
2112 // header). We can just substitute the user with the incoming value and remove
2113 // the PHI.
2114 for (PHINode &P : make_early_inc_range(Range: InnerExit->phis())) {
2115 assert(P.getNumIncomingValues() == 1 &&
2116 "Only loops with a single exit are supported!");
2117
2118 // Incoming values are guaranteed be instructions currently.
2119 auto IncI = cast<Instruction>(Val: P.getIncomingValueForBlock(BB: InnerLatch));
2120 // In case of multi-level nested loops, follow LCSSA to find the incoming
2121 // value defined from the innermost loop.
2122 auto IncIInnerMost = cast<Instruction>(Val: followLCSSA(SV: IncI));
2123 // Skip phis with incoming values from the inner loop body, excluding the
2124 // header and latch.
2125 if (IncIInnerMost->getParent() != InnerLatch &&
2126 IncIInnerMost->getParent() != InnerHeader)
2127 continue;
2128
2129 assert(all_of(P.users(),
2130 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
2131 return (cast<PHINode>(U)->getParent() == OuterHeader &&
2132 IncI->getParent() == InnerHeader) ||
2133 cast<PHINode>(U)->getParent() == OuterExit;
2134 }) &&
2135 "Can only replace phis iff the uses are in the loop nest exit or "
2136 "the incoming value is defined in the inner header (it will "
2137 "dominate all loop blocks after interchanging)");
2138 P.replaceAllUsesWith(V: IncI);
2139 P.eraseFromParent();
2140 }
2141
2142 SmallVector<PHINode *, 8> LcssaInnerExit(
2143 llvm::make_pointer_range(Range: InnerExit->phis()));
2144
2145 SmallVector<PHINode *, 8> LcssaInnerLatch(
2146 llvm::make_pointer_range(Range: InnerLatch->phis()));
2147
2148 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
2149 // If a PHI node has users outside of InnerExit, it has a use outside the
2150 // interchanged loop and we have to preserve it. We move these to
2151 // InnerLatch, which will become the new exit block for the innermost
2152 // loop after interchanging.
2153 for (PHINode *P : LcssaInnerExit)
2154 P->moveBefore(InsertPos: InnerLatch->getFirstNonPHIIt());
2155
2156 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
2157 // and we have to move them to the new inner latch.
2158 for (PHINode *P : LcssaInnerLatch)
2159 P->moveBefore(InsertPos: InnerExit->getFirstNonPHIIt());
2160
2161 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
2162 // incoming values defined in the outer loop, we have to add a new PHI
2163 // in the inner loop latch, which became the exit block of the outer loop,
2164 // after interchanging.
2165 if (OuterExit) {
2166 for (PHINode &P : OuterExit->phis()) {
2167 if (P.getNumIncomingValues() != 1)
2168 continue;
2169 // Skip Phis with incoming values defined in the inner loop. Those should
2170 // already have been updated.
2171 auto I = dyn_cast<Instruction>(Val: P.getIncomingValue(i: 0));
2172 if (!I || LI->getLoopFor(BB: I->getParent()) == InnerLoop)
2173 continue;
2174
2175 PHINode *NewPhi = dyn_cast<PHINode>(Val: P.clone());
2176 NewPhi->setIncomingValue(i: 0, V: P.getIncomingValue(i: 0));
2177 NewPhi->setIncomingBlock(i: 0, BB: OuterLatch);
2178 // We might have incoming edges from other BBs, i.e., the original outer
2179 // header.
2180 for (auto *Pred : predecessors(BB: InnerLatch)) {
2181 if (Pred == OuterLatch)
2182 continue;
2183 NewPhi->addIncoming(V: P.getIncomingValue(i: 0), BB: Pred);
2184 }
2185 NewPhi->insertBefore(InsertPos: InnerLatch->getFirstNonPHIIt());
2186 P.setIncomingValue(i: 0, V: NewPhi);
2187 }
2188 }
2189
2190 // Now adjust the incoming blocks for the LCSSA PHIs.
2191 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
2192 // with the new latch.
2193 InnerLatch->replacePhiUsesWith(Old: InnerLatch, New: OuterLatch);
2194}
2195
2196/// This deals with a corner case when a LCSSA phi node appears in a non-exit
2197/// block: the outer loop latch block does not need to be exit block of the
2198/// inner loop. Consider a loop that was in LCSSA form, but then some
2199/// transformation like loop-unswitch comes along and creates an empty block,
2200/// where BB5 in this example is the outer loop latch block:
2201///
2202/// BB4:
2203/// br label %BB5
2204/// BB5:
2205/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
2206/// br outer.header
2207///
2208/// Interchange then brings it in LCSSA form again resulting in this chain of
2209/// single-input phi nodes:
2210///
2211/// BB4:
2212/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
2213/// br label %BB5
2214/// BB5:
2215/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
2216///
2217/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
2218/// use before the def if we don't check this. The solution is to simplify
2219/// lcssa phi nodes (remove) if they appear in non-exit blocks.
2220///
2221static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
2222 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
2223 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2224
2225 // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
2226 if (OuterLoopLatch == InnerLoopExit)
2227 return;
2228
2229 // Collect and remove phis in non-exit blocks if they have 1 input.
2230 SmallVector<PHINode *, 8> Phis(
2231 llvm::make_pointer_range(Range: OuterLoopLatch->phis()));
2232 for (PHINode *Phi : Phis) {
2233 assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
2234 LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
2235 << "\n");
2236 Phi->replaceAllUsesWith(V: Phi->getIncomingValue(i: 0));
2237 Phi->eraseFromParent();
2238 }
2239}
2240
2241bool LoopInterchangeTransform::adjustLoopBranches() {
2242 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
2243 std::vector<DominatorTree::UpdateType> DTUpdates;
2244
2245 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2246 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2247
2248 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
2249 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
2250 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
2251
2252 simplifyLCSSAPhis(OuterLoop, InnerLoop);
2253
2254 // Ensure that both preheaders do not contain PHI nodes and have single
2255 // predecessors. This allows us to move them easily. We use
2256 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
2257 // preheaders do not satisfy those conditions.
2258 if (isa<PHINode>(Val: OuterLoopPreHeader->begin()) ||
2259 !OuterLoopPreHeader->getUniquePredecessor())
2260 OuterLoopPreHeader =
2261 InsertPreheaderForLoop(L: OuterLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true);
2262 if (InnerLoopPreHeader == OuterLoop->getHeader())
2263 InnerLoopPreHeader =
2264 InsertPreheaderForLoop(L: InnerLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true);
2265
2266 // Adjust the loop preheader
2267 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2268 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2269 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
2270 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2271 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
2272 BasicBlock *InnerLoopLatchPredecessor =
2273 InnerLoopLatch->getUniquePredecessor();
2274 BasicBlock *InnerLoopLatchSuccessor;
2275 BasicBlock *OuterLoopLatchSuccessor;
2276
2277 BranchInst *OuterLoopLatchBI =
2278 dyn_cast<BranchInst>(Val: OuterLoopLatch->getTerminator());
2279 BranchInst *InnerLoopLatchBI =
2280 dyn_cast<BranchInst>(Val: InnerLoopLatch->getTerminator());
2281 BranchInst *OuterLoopHeaderBI =
2282 dyn_cast<BranchInst>(Val: OuterLoopHeader->getTerminator());
2283 BranchInst *InnerLoopHeaderBI =
2284 dyn_cast<BranchInst>(Val: InnerLoopHeader->getTerminator());
2285
2286 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
2287 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
2288 !InnerLoopHeaderBI)
2289 return false;
2290
2291 BranchInst *InnerLoopLatchPredecessorBI =
2292 dyn_cast<BranchInst>(Val: InnerLoopLatchPredecessor->getTerminator());
2293 BranchInst *OuterLoopPredecessorBI =
2294 dyn_cast<BranchInst>(Val: OuterLoopPredecessor->getTerminator());
2295
2296 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
2297 return false;
2298 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
2299 if (!InnerLoopHeaderSuccessor)
2300 return false;
2301
2302 // Adjust Loop Preheader and headers.
2303 // The branches in the outer loop predecessor and the outer loop header can
2304 // be unconditional branches or conditional branches with duplicates. Consider
2305 // this when updating the successors.
2306 updateSuccessor(BI: OuterLoopPredecessorBI, OldBB: OuterLoopPreHeader,
2307 NewBB: InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
2308 // The outer loop header might or might not branch to the outer latch.
2309 // We are guaranteed to branch to the inner loop preheader.
2310 if (llvm::is_contained(Range: OuterLoopHeaderBI->successors(), Element: OuterLoopLatch)) {
2311 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
2312 updateSuccessor(BI: OuterLoopHeaderBI, OldBB: OuterLoopLatch, NewBB: InnerLoopLatch,
2313 DTUpdates,
2314 /*MustUpdateOnce=*/false);
2315 }
2316 updateSuccessor(BI: OuterLoopHeaderBI, OldBB: InnerLoopPreHeader,
2317 NewBB: InnerLoopHeaderSuccessor, DTUpdates,
2318 /*MustUpdateOnce=*/false);
2319
2320 // Adjust reduction PHI's now that the incoming block has changed.
2321 InnerLoopHeaderSuccessor->replacePhiUsesWith(Old: InnerLoopHeader,
2322 New: OuterLoopHeader);
2323
2324 updateSuccessor(BI: InnerLoopHeaderBI, OldBB: InnerLoopHeaderSuccessor,
2325 NewBB: OuterLoopPreHeader, DTUpdates);
2326
2327 // -------------Adjust loop latches-----------
2328 if (InnerLoopLatchBI->getSuccessor(i: 0) == InnerLoopHeader)
2329 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: 1);
2330 else
2331 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: 0);
2332
2333 updateSuccessor(BI: InnerLoopLatchPredecessorBI, OldBB: InnerLoopLatch,
2334 NewBB: InnerLoopLatchSuccessor, DTUpdates);
2335
2336 if (OuterLoopLatchBI->getSuccessor(i: 0) == OuterLoopHeader)
2337 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: 1);
2338 else
2339 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: 0);
2340
2341 updateSuccessor(BI: InnerLoopLatchBI, OldBB: InnerLoopLatchSuccessor,
2342 NewBB: OuterLoopLatchSuccessor, DTUpdates);
2343 updateSuccessor(BI: OuterLoopLatchBI, OldBB: OuterLoopLatchSuccessor, NewBB: InnerLoopLatch,
2344 DTUpdates);
2345
2346 DT->applyUpdates(Updates: DTUpdates);
2347 restructureLoops(NewInner: OuterLoop, NewOuter: InnerLoop, OrigInnerPreHeader: InnerLoopPreHeader,
2348 OrigOuterPreHeader: OuterLoopPreHeader);
2349
2350 moveLCSSAPhis(InnerExit: InnerLoopLatchSuccessor, InnerHeader: InnerLoopHeader, InnerLatch: InnerLoopLatch,
2351 OuterHeader: OuterLoopHeader, OuterLatch: OuterLoopLatch, OuterExit: InnerLoop->getExitBlock(),
2352 InnerLoop, LI);
2353 // For PHIs in the exit block of the outer loop, outer's latch has been
2354 // replaced by Inners'.
2355 OuterLoopLatchSuccessor->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch);
2356
2357 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2358 // Now update the reduction PHIs in the inner and outer loop headers.
2359 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
2360 for (PHINode &PHI : InnerLoopHeader->phis())
2361 if (OuterInnerReductions.contains(Ptr: &PHI))
2362 InnerLoopPHIs.push_back(Elt: &PHI);
2363
2364 for (PHINode &PHI : OuterLoopHeader->phis())
2365 if (OuterInnerReductions.contains(Ptr: &PHI))
2366 OuterLoopPHIs.push_back(Elt: &PHI);
2367
2368 // Now move the remaining reduction PHIs from outer to inner loop header and
2369 // vice versa. The PHI nodes must be part of a reduction across the inner and
2370 // outer loop and all the remains to do is and updating the incoming blocks.
2371 for (PHINode *PHI : OuterLoopPHIs) {
2372 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2373 PHI->moveBefore(InsertPos: InnerLoopHeader->getFirstNonPHIIt());
2374 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2375 }
2376 for (PHINode *PHI : InnerLoopPHIs) {
2377 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2378 PHI->moveBefore(InsertPos: OuterLoopHeader->getFirstNonPHIIt());
2379 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2380 }
2381
2382 // Update the incoming blocks for moved PHI nodes.
2383 OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopPreHeader, New: OuterLoopPreHeader);
2384 OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopLatch, New: OuterLoopLatch);
2385 InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopPreHeader, New: InnerLoopPreHeader);
2386 InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch);
2387
2388 // Values defined in the outer loop header could be used in the inner loop
2389 // latch. In that case, we need to create LCSSA phis for them, because after
2390 // interchanging they will be defined in the new inner loop and used in the
2391 // new outer loop.
2392 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
2393 for (Instruction &I :
2394 make_range(x: OuterLoopHeader->begin(), y: std::prev(x: OuterLoopHeader->end())))
2395 MayNeedLCSSAPhis.push_back(Elt: &I);
2396 formLCSSAForInstructions(Worklist&: MayNeedLCSSAPhis, DT: *DT, LI: *LI, SE);
2397
2398 return true;
2399}
2400
2401bool LoopInterchangeTransform::adjustLoopLinks() {
2402 // Adjust all branches in the inner and outer loop.
2403 bool Changed = adjustLoopBranches();
2404 if (Changed) {
2405 // We have interchanged the preheaders so we need to interchange the data in
2406 // the preheaders as well. This is because the content of the inner
2407 // preheader was previously executed inside the outer loop.
2408 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2409 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2410 swapBBContents(BB1: OuterLoopPreHeader, BB2: InnerLoopPreHeader);
2411 }
2412 return Changed;
2413}
2414
2415PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
2416 LoopAnalysisManager &AM,
2417 LoopStandardAnalysisResults &AR,
2418 LPMUpdater &U) {
2419 Function &F = *LN.getParent();
2420 SmallVector<Loop *, 8> LoopList(LN.getLoops());
2421
2422 if (MaxMemInstrCount < 1) {
2423 LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
2424 return PreservedAnalyses::all();
2425 }
2426 OptimizationRemarkEmitter ORE(&F);
2427
2428 // Ensure minimum depth of the loop nest to do the interchange.
2429 if (!hasSupportedLoopDepth(LoopList, ORE))
2430 return PreservedAnalyses::all();
2431 // Ensure computable loop nest.
2432 if (!isComputableLoopNest(SE: &AR.SE, LoopList)) {
2433 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2434 return PreservedAnalyses::all();
2435 }
2436
2437 ORE.emit(RemarkBuilder: [&]() {
2438 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
2439 LN.getOutermostLoop().getStartLoc(),
2440 LN.getOutermostLoop().getHeader())
2441 << "Computed dependence info, invoking the transform.";
2442 });
2443
2444 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2445 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2446 return PreservedAnalyses::all();
2447 U.markLoopNestChanged(Changed: true);
2448 return getLoopPassPreservedAnalyses();
2449}
2450