1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Transforms/Scalar/LoopInterchange.h"
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
18#include "llvm/ADT/SmallVector.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringRef.h"
21#include "llvm/ADT/StringSet.h"
22#include "llvm/Analysis/DependenceAnalysis.h"
23#include "llvm/Analysis/LoopCacheAnalysis.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/LoopNestAnalysis.h"
26#include "llvm/Analysis/LoopPass.h"
27#include "llvm/Analysis/OptimizationRemarkEmitter.h"
28#include "llvm/Analysis/ScalarEvolution.h"
29#include "llvm/Analysis/ScalarEvolutionExpressions.h"
30#include "llvm/IR/BasicBlock.h"
31#include "llvm/IR/DiagnosticInfo.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/InstrTypes.h"
35#include "llvm/IR/Instruction.h"
36#include "llvm/IR/Instructions.h"
37#include "llvm/IR/User.h"
38#include "llvm/IR/Value.h"
39#include "llvm/Support/Casting.h"
40#include "llvm/Support/CommandLine.h"
41#include "llvm/Support/Debug.h"
42#include "llvm/Support/ErrorHandling.h"
43#include "llvm/Support/raw_ostream.h"
44#include "llvm/Transforms/Scalar/LoopPassManager.h"
45#include "llvm/Transforms/Utils/BasicBlockUtils.h"
46#include "llvm/Transforms/Utils/LoopUtils.h"
47#include <cassert>
48#include <utility>
49#include <vector>
50
51using namespace llvm;
52
53#define DEBUG_TYPE "loop-interchange"
54
55STATISTIC(LoopsInterchanged, "Number of loops interchanged");
56
57static cl::opt<int> LoopInterchangeCostThreshold(
58 "loop-interchange-threshold", cl::init(Val: 0), cl::Hidden,
59 cl::desc("Interchange if you gain more than this number"));
60
61// Maximum number of load-stores that can be handled in the dependency matrix.
62static cl::opt<unsigned int> MaxMemInstrCount(
63 "loop-interchange-max-meminstr-count", cl::init(Val: 64), cl::Hidden,
64 cl::desc(
65 "Maximum number of load-store instructions that should be handled "
66 "in the dependency matrix. Higher value may lead to more interchanges "
67 "at the cost of compile-time"));
68
69namespace {
70
71using LoopVector = SmallVector<Loop *, 8>;
72
73// TODO: Check if we can use a sparse matrix here.
74using CharMatrix = std::vector<std::vector<char>>;
75
76/// Types of rules used in profitability check.
77enum class RuleTy {
78 PerLoopCacheAnalysis,
79 PerInstrOrderCost,
80 ForVectorization,
81};
82
83} // end anonymous namespace
84
85// Minimum loop depth supported.
86static cl::opt<unsigned int> MinLoopNestDepth(
87 "loop-interchange-min-loop-nest-depth", cl::init(Val: 2), cl::Hidden,
88 cl::desc("Minimum depth of loop nest considered for the transform"));
89
90// Maximum loop depth supported.
91static cl::opt<unsigned int> MaxLoopNestDepth(
92 "loop-interchange-max-loop-nest-depth", cl::init(Val: 10), cl::Hidden,
93 cl::desc("Maximum depth of loop nest considered for the transform"));
94
95// We prefer cache cost to vectorization by default.
96static cl::list<RuleTy> Profitabilities(
97 "loop-interchange-profitabilities", cl::ZeroOrMore,
98 cl::MiscFlags::CommaSeparated, cl::Hidden,
99 cl::desc("List of profitability heuristics to be used. They are applied in "
100 "the given order"),
101 cl::list_init<RuleTy>(Vals: {RuleTy::PerLoopCacheAnalysis,
102 RuleTy::PerInstrOrderCost,
103 RuleTy::ForVectorization}),
104 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
105 "Prioritize loop cache cost"),
106 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
107 "Prioritize the IVs order of each instruction"),
108 clEnumValN(RuleTy::ForVectorization, "vectorize",
109 "Prioritize vectorization")));
110
111#ifndef NDEBUG
112static bool noDuplicateRules(ArrayRef<RuleTy> Rules) {
113 SmallSet<RuleTy, 4> Set;
114 for (RuleTy Rule : Rules)
115 if (!Set.insert(Rule).second)
116 return false;
117 return true;
118}
119
120static void printDepMatrix(CharMatrix &DepMatrix) {
121 for (auto &Row : DepMatrix) {
122 for (auto D : Row)
123 LLVM_DEBUG(dbgs() << D << " ");
124 LLVM_DEBUG(dbgs() << "\n");
125 }
126}
127#endif
128
129static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
130 Loop *L, DependenceInfo *DI,
131 ScalarEvolution *SE,
132 OptimizationRemarkEmitter *ORE) {
133 using ValueVector = SmallVector<Value *, 16>;
134
135 ValueVector MemInstr;
136
137 // For each block.
138 for (BasicBlock *BB : L->blocks()) {
139 // Scan the BB and collect legal loads and stores.
140 for (Instruction &I : *BB) {
141 if (!isa<Instruction>(Val: I))
142 return false;
143 if (auto *Ld = dyn_cast<LoadInst>(Val: &I)) {
144 if (!Ld->isSimple())
145 return false;
146 MemInstr.push_back(Elt: &I);
147 } else if (auto *St = dyn_cast<StoreInst>(Val: &I)) {
148 if (!St->isSimple())
149 return false;
150 MemInstr.push_back(Elt: &I);
151 }
152 }
153 }
154
155 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
156 << " Loads and Stores to analyze\n");
157 if (MemInstr.size() > MaxMemInstrCount) {
158 LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
159 << MaxMemInstrCount << " load/stores in a loop\n");
160 ORE->emit(RemarkBuilder: [&]() {
161 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
162 L->getStartLoc(), L->getHeader())
163 << "Number of loads/stores exceeded, the supported maximum "
164 "can be increased with option "
165 "-loop-interchange-maxmeminstr-count.";
166 });
167 return false;
168 }
169 ValueVector::iterator I, IE, J, JE;
170 StringSet<> Seen;
171
172 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
173 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
174 std::vector<char> Dep;
175 Instruction *Src = cast<Instruction>(Val: *I);
176 Instruction *Dst = cast<Instruction>(Val: *J);
177 // Ignore Input dependencies.
178 if (isa<LoadInst>(Val: Src) && isa<LoadInst>(Val: Dst))
179 continue;
180 // Track Output, Flow, and Anti dependencies.
181 if (auto D = DI->depends(Src, Dst)) {
182 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
183 // If the direction vector is negative, normalize it to
184 // make it non-negative.
185 if (D->normalize(SE))
186 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
187 LLVM_DEBUG(StringRef DepType =
188 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
189 dbgs() << "Found " << DepType
190 << " dependency between Src and Dst\n"
191 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
192 unsigned Levels = D->getLevels();
193 char Direction;
194 for (unsigned II = 1; II <= Levels; ++II) {
195 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
196 // or `=`, for which we don't have an equivalent representation, so
197 // that the conservative approximation is necessary. The same goes for
198 // `DVEntry::GE`.
199 // TODO: Use of fine-grained expressions allows for more accurate
200 // analysis.
201 unsigned Dir = D->getDirection(Level: II);
202 if (Dir == Dependence::DVEntry::LT)
203 Direction = '<';
204 else if (Dir == Dependence::DVEntry::GT)
205 Direction = '>';
206 else if (Dir == Dependence::DVEntry::EQ)
207 Direction = '=';
208 else
209 Direction = '*';
210 Dep.push_back(x: Direction);
211 }
212
213 // If the Dependence object doesn't have any information, fill the
214 // dependency vector with '*'.
215 if (D->isConfused()) {
216 assert(Dep.empty() && "Expected empty dependency vector");
217 Dep.assign(n: Level, val: '*');
218 }
219
220 while (Dep.size() != Level) {
221 Dep.push_back(x: 'I');
222 }
223
224 // Make sure we only add unique entries to the dependency matrix.
225 if (Seen.insert(key: StringRef(Dep.data(), Dep.size())).second)
226 DepMatrix.push_back(x: Dep);
227 }
228 }
229 }
230
231 return true;
232}
233
234// A loop is moved from index 'from' to an index 'to'. Update the Dependence
235// matrix by exchanging the two columns.
236static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
237 unsigned ToIndx) {
238 for (auto &Row : DepMatrix)
239 std::swap(a&: Row[ToIndx], b&: Row[FromIndx]);
240}
241
242// Check if a direction vector is lexicographically positive. Return true if it
243// is positive, nullopt if it is "zero", otherwise false.
244// [Theorem] A permutation of the loops in a perfect nest is legal if and only
245// if the direction matrix, after the same permutation is applied to its
246// columns, has no ">" direction as the leftmost non-"=" direction in any row.
247static std::optional<bool>
248isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
249 for (unsigned char Direction : DV.slice(N: Begin, M: End - Begin)) {
250 if (Direction == '<')
251 return true;
252 if (Direction == '>' || Direction == '*')
253 return false;
254 }
255 return std::nullopt;
256}
257
258// Checks if it is legal to interchange 2 loops.
259static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
260 unsigned InnerLoopId,
261 unsigned OuterLoopId) {
262 unsigned NumRows = DepMatrix.size();
263 std::vector<char> Cur;
264 // For each row check if it is valid to interchange.
265 for (unsigned Row = 0; Row < NumRows; ++Row) {
266 // Create temporary DepVector check its lexicographical order
267 // before and after swapping OuterLoop vs InnerLoop
268 Cur = DepMatrix[Row];
269
270 // If the surrounding loops already ensure that the direction vector is
271 // lexicographically positive, nothing within the loop will be able to break
272 // the dependence. In such a case we can skip the subsequent check.
273 if (isLexicographicallyPositive(DV: Cur, Begin: 0, End: OuterLoopId) == true)
274 continue;
275
276 // Check if the direction vector is lexicographically positive (or zero)
277 // for both before/after exchanged.
278 if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size()) == false)
279 return false;
280 std::swap(a&: Cur[InnerLoopId], b&: Cur[OuterLoopId]);
281 if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size()) == false)
282 return false;
283 }
284 return true;
285}
286
287static void populateWorklist(Loop &L, LoopVector &LoopList) {
288 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
289 << L.getHeader()->getParent()->getName() << " Loop: %"
290 << L.getHeader()->getName() << '\n');
291 assert(LoopList.empty() && "LoopList should initially be empty!");
292 Loop *CurrentLoop = &L;
293 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
294 while (!Vec->empty()) {
295 // The current loop has multiple subloops in it hence it is not tightly
296 // nested.
297 // Discard all loops above it added into Worklist.
298 if (Vec->size() != 1) {
299 LoopList = {};
300 return;
301 }
302
303 LoopList.push_back(Elt: CurrentLoop);
304 CurrentLoop = Vec->front();
305 Vec = &CurrentLoop->getSubLoops();
306 }
307 LoopList.push_back(Elt: CurrentLoop);
308}
309
310static bool hasSupportedLoopDepth(ArrayRef<Loop *> LoopList,
311 OptimizationRemarkEmitter &ORE) {
312 unsigned LoopNestDepth = LoopList.size();
313 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
314 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
315 << ", the supported range is [" << MinLoopNestDepth
316 << ", " << MaxLoopNestDepth << "].\n");
317 Loop *OuterLoop = LoopList.front();
318 ORE.emit(RemarkBuilder: [&]() {
319 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
320 OuterLoop->getStartLoc(),
321 OuterLoop->getHeader())
322 << "Unsupported depth of loop nest, the supported range is ["
323 << std::to_string(val: MinLoopNestDepth) << ", "
324 << std::to_string(val: MaxLoopNestDepth) << "].\n";
325 });
326 return false;
327 }
328 return true;
329}
330
331static bool isComputableLoopNest(ScalarEvolution *SE,
332 ArrayRef<Loop *> LoopList) {
333 for (Loop *L : LoopList) {
334 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
335 if (isa<SCEVCouldNotCompute>(Val: ExitCountOuter)) {
336 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
337 return false;
338 }
339 if (L->getNumBackEdges() != 1) {
340 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
341 return false;
342 }
343 if (!L->getExitingBlock()) {
344 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
345 return false;
346 }
347 }
348 return true;
349}
350
351namespace {
352
353/// LoopInterchangeLegality checks if it is legal to interchange the loop.
354class LoopInterchangeLegality {
355public:
356 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
357 OptimizationRemarkEmitter *ORE)
358 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
359
360 /// Check if the loops can be interchanged.
361 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
362 CharMatrix &DepMatrix);
363
364 /// Discover induction PHIs in the header of \p L. Induction
365 /// PHIs are added to \p Inductions.
366 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
367
368 /// Check if the loop structure is understood. We do not handle triangular
369 /// loops for now.
370 bool isLoopStructureUnderstood();
371
372 bool currentLimitations();
373
374 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
375 return OuterInnerReductions;
376 }
377
378 const ArrayRef<PHINode *> getInnerLoopInductions() const {
379 return InnerLoopInductions;
380 }
381
382private:
383 bool tightlyNested(Loop *Outer, Loop *Inner);
384 bool containsUnsafeInstructions(BasicBlock *BB);
385
386 /// Discover induction and reduction PHIs in the header of \p L. Induction
387 /// PHIs are added to \p Inductions, reductions are added to
388 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
389 /// to be passed as \p InnerLoop.
390 bool findInductionAndReductions(Loop *L,
391 SmallVector<PHINode *, 8> &Inductions,
392 Loop *InnerLoop);
393
394 Loop *OuterLoop;
395 Loop *InnerLoop;
396
397 ScalarEvolution *SE;
398
399 /// Interface to emit optimization remarks.
400 OptimizationRemarkEmitter *ORE;
401
402 /// Set of reduction PHIs taking part of a reduction across the inner and
403 /// outer loop.
404 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
405
406 /// Set of inner loop induction PHIs
407 SmallVector<PHINode *, 8> InnerLoopInductions;
408};
409
410/// LoopInterchangeProfitability checks if it is profitable to interchange the
411/// loop.
412class LoopInterchangeProfitability {
413public:
414 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
415 OptimizationRemarkEmitter *ORE)
416 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
417
418 /// Check if the loop interchange is profitable.
419 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
420 unsigned InnerLoopId, unsigned OuterLoopId,
421 CharMatrix &DepMatrix,
422 const DenseMap<const Loop *, unsigned> &CostMap,
423 std::unique_ptr<CacheCost> &CC);
424
425private:
426 int getInstrOrderCost();
427 std::optional<bool> isProfitablePerLoopCacheAnalysis(
428 const DenseMap<const Loop *, unsigned> &CostMap,
429 std::unique_ptr<CacheCost> &CC);
430 std::optional<bool> isProfitablePerInstrOrderCost();
431 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
432 unsigned OuterLoopId,
433 CharMatrix &DepMatrix);
434 Loop *OuterLoop;
435 Loop *InnerLoop;
436
437 /// Scev analysis.
438 ScalarEvolution *SE;
439
440 /// Interface to emit optimization remarks.
441 OptimizationRemarkEmitter *ORE;
442};
443
444/// LoopInterchangeTransform interchanges the loop.
445class LoopInterchangeTransform {
446public:
447 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
448 LoopInfo *LI, DominatorTree *DT,
449 const LoopInterchangeLegality &LIL)
450 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
451
452 /// Interchange OuterLoop and InnerLoop.
453 bool transform();
454 void restructureLoops(Loop *NewInner, Loop *NewOuter,
455 BasicBlock *OrigInnerPreHeader,
456 BasicBlock *OrigOuterPreHeader);
457 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
458
459private:
460 bool adjustLoopLinks();
461 bool adjustLoopBranches();
462
463 Loop *OuterLoop;
464 Loop *InnerLoop;
465
466 /// Scev analysis.
467 ScalarEvolution *SE;
468
469 LoopInfo *LI;
470 DominatorTree *DT;
471
472 const LoopInterchangeLegality &LIL;
473};
474
475struct LoopInterchange {
476 ScalarEvolution *SE = nullptr;
477 LoopInfo *LI = nullptr;
478 DependenceInfo *DI = nullptr;
479 DominatorTree *DT = nullptr;
480 std::unique_ptr<CacheCost> CC = nullptr;
481
482 /// Interface to emit optimization remarks.
483 OptimizationRemarkEmitter *ORE;
484
485 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
486 DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
487 OptimizationRemarkEmitter *ORE)
488 : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
489
490 bool run(Loop *L) {
491 if (L->getParentLoop())
492 return false;
493 SmallVector<Loop *, 8> LoopList;
494 populateWorklist(L&: *L, LoopList);
495 return processLoopList(LoopList);
496 }
497
498 bool run(LoopNest &LN) {
499 SmallVector<Loop *, 8> LoopList(LN.getLoops());
500 for (unsigned I = 1; I < LoopList.size(); ++I)
501 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
502 return false;
503 return processLoopList(LoopList);
504 }
505
506 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
507 // TODO: Add a better heuristic to select the loop to be interchanged based
508 // on the dependence matrix. Currently we select the innermost loop.
509 return LoopList.size() - 1;
510 }
511
512 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
513 bool Changed = false;
514
515 // Ensure proper loop nest depth.
516 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
517 "Unsupported depth of loop nest.");
518
519 unsigned LoopNestDepth = LoopList.size();
520
521 LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
522 << "\n");
523
524 CharMatrix DependencyMatrix;
525 Loop *OuterMostLoop = *(LoopList.begin());
526 if (!populateDependencyMatrix(DepMatrix&: DependencyMatrix, Level: LoopNestDepth,
527 L: OuterMostLoop, DI, SE, ORE)) {
528 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
529 return false;
530 }
531
532 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
533 printDepMatrix(DependencyMatrix));
534
535 // Get the Outermost loop exit.
536 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
537 if (!LoopNestExit) {
538 LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
539 return false;
540 }
541
542 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
543 // Obtain the loop vector returned from loop cache analysis beforehand,
544 // and put each <Loop, index> pair into a map for constant time query
545 // later. Indices in loop vector reprsent the optimal order of the
546 // corresponding loop, e.g., given a loopnest with depth N, index 0
547 // indicates the loop should be placed as the outermost loop and index N
548 // indicates the loop should be placed as the innermost loop.
549 //
550 // For the old pass manager CacheCost would be null.
551 DenseMap<const Loop *, unsigned> CostMap;
552 if (CC != nullptr) {
553 for (const auto &[Idx, Cost] : enumerate(First: CC->getLoopCosts()))
554 CostMap[Cost.first] = Idx;
555 }
556 // We try to achieve the globally optimal memory access for the loopnest,
557 // and do interchange based on a bubble-sort fasion. We start from
558 // the innermost loop, move it outwards to the best possible position
559 // and repeat this process.
560 for (unsigned j = SelecLoopId; j > 0; j--) {
561 bool ChangedPerIter = false;
562 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
563 bool Interchanged =
564 processLoop(LoopList, InnerLoopId: i, OuterLoopId: i - 1, DependencyMatrix, CostMap);
565 ChangedPerIter |= Interchanged;
566 Changed |= Interchanged;
567 }
568 // Early abort if there was no interchange during an entire round of
569 // moving loops outwards.
570 if (!ChangedPerIter)
571 break;
572 }
573 return Changed;
574 }
575
576 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
577 unsigned OuterLoopId,
578 std::vector<std::vector<char>> &DependencyMatrix,
579 const DenseMap<const Loop *, unsigned> &CostMap) {
580 Loop *OuterLoop = LoopList[OuterLoopId];
581 Loop *InnerLoop = LoopList[InnerLoopId];
582 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
583 << " and OuterLoopId = " << OuterLoopId << "\n");
584 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
585 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DepMatrix&: DependencyMatrix)) {
586 LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
587 return false;
588 }
589 LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
590 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
591 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
592 DepMatrix&: DependencyMatrix, CostMap, CC)) {
593 LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
594 return false;
595 }
596
597 ORE->emit(RemarkBuilder: [&]() {
598 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
599 InnerLoop->getStartLoc(),
600 InnerLoop->getHeader())
601 << "Loop interchanged with enclosing loop.";
602 });
603
604 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
605 LIT.transform();
606 LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
607 LoopsInterchanged++;
608
609 llvm::formLCSSARecursively(L&: *OuterLoop, DT: *DT, LI, SE);
610
611 // Loops interchanged, update LoopList accordingly.
612 std::swap(a&: LoopList[OuterLoopId], b&: LoopList[InnerLoopId]);
613 // Update the DependencyMatrix
614 interChangeDependencies(DepMatrix&: DependencyMatrix, FromIndx: InnerLoopId, ToIndx: OuterLoopId);
615
616 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
617 printDepMatrix(DependencyMatrix));
618
619 return true;
620 }
621};
622
623} // end anonymous namespace
624
625bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
626 return any_of(Range&: *BB, P: [](const Instruction &I) {
627 return I.mayHaveSideEffects() || I.mayReadFromMemory();
628 });
629}
630
631bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
632 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
633 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
634 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
635
636 LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
637
638 // A perfectly nested loop will not have any branch in between the outer and
639 // inner block i.e. outer header will branch to either inner preheader and
640 // outerloop latch.
641 BranchInst *OuterLoopHeaderBI =
642 dyn_cast<BranchInst>(Val: OuterLoopHeader->getTerminator());
643 if (!OuterLoopHeaderBI)
644 return false;
645
646 for (BasicBlock *Succ : successors(I: OuterLoopHeaderBI))
647 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
648 Succ != OuterLoopLatch)
649 return false;
650
651 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
652 // We do not have any basic block in between now make sure the outer header
653 // and outer loop latch doesn't contain any unsafe instructions.
654 if (containsUnsafeInstructions(BB: OuterLoopHeader) ||
655 containsUnsafeInstructions(BB: OuterLoopLatch))
656 return false;
657
658 // Also make sure the inner loop preheader does not contain any unsafe
659 // instructions. Note that all instructions in the preheader will be moved to
660 // the outer loop header when interchanging.
661 if (InnerLoopPreHeader != OuterLoopHeader &&
662 containsUnsafeInstructions(BB: InnerLoopPreHeader))
663 return false;
664
665 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
666 // Ensure the inner loop exit block flows to the outer loop latch possibly
667 // through empty blocks.
668 const BasicBlock &SuccInner =
669 LoopNest::skipEmptyBlockUntil(From: InnerLoopExit, End: OuterLoopLatch);
670 if (&SuccInner != OuterLoopLatch) {
671 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
672 << " does not lead to the outer loop latch.\n";);
673 return false;
674 }
675 // The inner loop exit block does flow to the outer loop latch and not some
676 // other BBs, now make sure it contains safe instructions, since it will be
677 // moved into the (new) inner loop after interchange.
678 if (containsUnsafeInstructions(BB: InnerLoopExit))
679 return false;
680
681 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
682 // We have a perfect loop nest.
683 return true;
684}
685
686bool LoopInterchangeLegality::isLoopStructureUnderstood() {
687 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
688 for (PHINode *InnerInduction : InnerLoopInductions) {
689 unsigned Num = InnerInduction->getNumOperands();
690 for (unsigned i = 0; i < Num; ++i) {
691 Value *Val = InnerInduction->getOperand(i_nocapture: i);
692 if (isa<Constant>(Val))
693 continue;
694 Instruction *I = dyn_cast<Instruction>(Val);
695 if (!I)
696 return false;
697 // TODO: Handle triangular loops.
698 // e.g. for(int i=0;i<N;i++)
699 // for(int j=i;j<N;j++)
700 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
701 if (InnerInduction->getIncomingBlock(i: IncomBlockIndx) ==
702 InnerLoopPreheader &&
703 !OuterLoop->isLoopInvariant(V: I)) {
704 return false;
705 }
706 }
707 }
708
709 // TODO: Handle triangular loops of another form.
710 // e.g. for(int i=0;i<N;i++)
711 // for(int j=0;j<i;j++)
712 // or,
713 // for(int i=0;i<N;i++)
714 // for(int j=0;j*i<N;j++)
715 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
716 BranchInst *InnerLoopLatchBI =
717 dyn_cast<BranchInst>(Val: InnerLoopLatch->getTerminator());
718 if (!InnerLoopLatchBI->isConditional())
719 return false;
720 if (CmpInst *InnerLoopCmp =
721 dyn_cast<CmpInst>(Val: InnerLoopLatchBI->getCondition())) {
722 Value *Op0 = InnerLoopCmp->getOperand(i_nocapture: 0);
723 Value *Op1 = InnerLoopCmp->getOperand(i_nocapture: 1);
724
725 // LHS and RHS of the inner loop exit condition, e.g.,
726 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
727 Value *Left = nullptr;
728 Value *Right = nullptr;
729
730 // Check if V only involves inner loop induction variable.
731 // Return true if V is InnerInduction, or a cast from
732 // InnerInduction, or a binary operator that involves
733 // InnerInduction and a constant.
734 std::function<bool(Value *)> IsPathToInnerIndVar;
735 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
736 if (llvm::is_contained(Range&: InnerLoopInductions, Element: V))
737 return true;
738 if (isa<Constant>(Val: V))
739 return true;
740 const Instruction *I = dyn_cast<Instruction>(Val: V);
741 if (!I)
742 return false;
743 if (isa<CastInst>(Val: I))
744 return IsPathToInnerIndVar(I->getOperand(i: 0));
745 if (isa<BinaryOperator>(Val: I))
746 return IsPathToInnerIndVar(I->getOperand(i: 0)) &&
747 IsPathToInnerIndVar(I->getOperand(i: 1));
748 return false;
749 };
750
751 // In case of multiple inner loop indvars, it is okay if LHS and RHS
752 // are both inner indvar related variables.
753 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
754 return true;
755
756 // Otherwise we check if the cmp instruction compares an inner indvar
757 // related variable (Left) with a outer loop invariant (Right).
758 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Val: Op0)) {
759 Left = Op0;
760 Right = Op1;
761 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Val: Op1)) {
762 Left = Op1;
763 Right = Op0;
764 }
765
766 if (Left == nullptr)
767 return false;
768
769 const SCEV *S = SE->getSCEV(V: Right);
770 if (!SE->isLoopInvariant(S, L: OuterLoop))
771 return false;
772 }
773
774 return true;
775}
776
777// If SV is a LCSSA PHI node with a single incoming value, return the incoming
778// value.
779static Value *followLCSSA(Value *SV) {
780 PHINode *PHI = dyn_cast<PHINode>(Val: SV);
781 if (!PHI)
782 return SV;
783
784 if (PHI->getNumIncomingValues() != 1)
785 return SV;
786 return followLCSSA(SV: PHI->getIncomingValue(i: 0));
787}
788
789// Check V's users to see if it is involved in a reduction in L.
790static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
791 // Reduction variables cannot be constants.
792 if (isa<Constant>(Val: V))
793 return nullptr;
794
795 for (Value *User : V->users()) {
796 if (PHINode *PHI = dyn_cast<PHINode>(Val: User)) {
797 if (PHI->getNumIncomingValues() == 1)
798 continue;
799 RecurrenceDescriptor RD;
800 if (RecurrenceDescriptor::isReductionPHI(Phi: PHI, TheLoop: L, RedDes&: RD)) {
801 // Detect floating point reduction only when it can be reordered.
802 if (RD.getExactFPMathInst() != nullptr)
803 return nullptr;
804 return PHI;
805 }
806 return nullptr;
807 }
808 }
809
810 return nullptr;
811}
812
813bool LoopInterchangeLegality::findInductionAndReductions(
814 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
815 if (!L->getLoopLatch() || !L->getLoopPredecessor())
816 return false;
817 for (PHINode &PHI : L->getHeader()->phis()) {
818 InductionDescriptor ID;
819 if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID))
820 Inductions.push_back(Elt: &PHI);
821 else {
822 // PHIs in inner loops need to be part of a reduction in the outer loop,
823 // discovered when checking the PHIs of the outer loop earlier.
824 if (!InnerLoop) {
825 if (!OuterInnerReductions.count(Ptr: &PHI)) {
826 LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
827 "across the outer loop.\n");
828 return false;
829 }
830 } else {
831 assert(PHI.getNumIncomingValues() == 2 &&
832 "Phis in loop header should have exactly 2 incoming values");
833 // Check if we have a PHI node in the outer loop that has a reduction
834 // result from the inner loop as an incoming value.
835 Value *V = followLCSSA(SV: PHI.getIncomingValueForBlock(BB: L->getLoopLatch()));
836 PHINode *InnerRedPhi = findInnerReductionPhi(L: InnerLoop, V);
837 if (!InnerRedPhi ||
838 !llvm::is_contained(Range: InnerRedPhi->incoming_values(), Element: &PHI)) {
839 LLVM_DEBUG(
840 dbgs()
841 << "Failed to recognize PHI as an induction or reduction.\n");
842 return false;
843 }
844 OuterInnerReductions.insert(Ptr: &PHI);
845 OuterInnerReductions.insert(Ptr: InnerRedPhi);
846 }
847 }
848 }
849 return true;
850}
851
852// This function indicates the current limitations in the transform as a result
853// of which we do not proceed.
854bool LoopInterchangeLegality::currentLimitations() {
855 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
856
857 // transform currently expects the loop latches to also be the exiting
858 // blocks.
859 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
860 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
861 !isa<BranchInst>(Val: InnerLoopLatch->getTerminator()) ||
862 !isa<BranchInst>(Val: OuterLoop->getLoopLatch()->getTerminator())) {
863 LLVM_DEBUG(
864 dbgs() << "Loops where the latch is not the exiting block are not"
865 << " supported currently.\n");
866 ORE->emit(RemarkBuilder: [&]() {
867 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
868 OuterLoop->getStartLoc(),
869 OuterLoop->getHeader())
870 << "Loops where the latch is not the exiting block cannot be"
871 " interchange currently.";
872 });
873 return true;
874 }
875
876 SmallVector<PHINode *, 8> Inductions;
877 if (!findInductionAndReductions(L: OuterLoop, Inductions, InnerLoop)) {
878 LLVM_DEBUG(
879 dbgs() << "Only outer loops with induction or reduction PHI nodes "
880 << "are supported currently.\n");
881 ORE->emit(RemarkBuilder: [&]() {
882 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
883 OuterLoop->getStartLoc(),
884 OuterLoop->getHeader())
885 << "Only outer loops with induction or reduction PHI nodes can be"
886 " interchanged currently.";
887 });
888 return true;
889 }
890
891 Inductions.clear();
892 // For multi-level loop nests, make sure that all phi nodes for inner loops
893 // at all levels can be recognized as a induction or reduction phi. Bail out
894 // if a phi node at a certain nesting level cannot be properly recognized.
895 Loop *CurLevelLoop = OuterLoop;
896 while (!CurLevelLoop->getSubLoops().empty()) {
897 // We already made sure that the loop nest is tightly nested.
898 CurLevelLoop = CurLevelLoop->getSubLoops().front();
899 if (!findInductionAndReductions(L: CurLevelLoop, Inductions, InnerLoop: nullptr)) {
900 LLVM_DEBUG(
901 dbgs() << "Only inner loops with induction or reduction PHI nodes "
902 << "are supported currently.\n");
903 ORE->emit(RemarkBuilder: [&]() {
904 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
905 CurLevelLoop->getStartLoc(),
906 CurLevelLoop->getHeader())
907 << "Only inner loops with induction or reduction PHI nodes can be"
908 " interchange currently.";
909 });
910 return true;
911 }
912 }
913
914 // TODO: Triangular loops are not handled for now.
915 if (!isLoopStructureUnderstood()) {
916 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
917 ORE->emit(RemarkBuilder: [&]() {
918 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
919 InnerLoop->getStartLoc(),
920 InnerLoop->getHeader())
921 << "Inner loop structure not understood currently.";
922 });
923 return true;
924 }
925
926 return false;
927}
928
929bool LoopInterchangeLegality::findInductions(
930 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
931 for (PHINode &PHI : L->getHeader()->phis()) {
932 InductionDescriptor ID;
933 if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID))
934 Inductions.push_back(Elt: &PHI);
935 }
936 return !Inductions.empty();
937}
938
939// We currently only support LCSSA PHI nodes in the inner loop exit, if their
940// users are either reduction PHIs or PHIs outside the outer loop (which means
941// the we are only interested in the final value after the loop).
942static bool
943areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
944 SmallPtrSetImpl<PHINode *> &Reductions) {
945 BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
946 for (PHINode &PHI : InnerExit->phis()) {
947 // Reduction lcssa phi will have only 1 incoming block that from loop latch.
948 if (PHI.getNumIncomingValues() > 1)
949 return false;
950 if (any_of(Range: PHI.users(), P: [&Reductions, OuterL](User *U) {
951 PHINode *PN = dyn_cast<PHINode>(Val: U);
952 return !PN ||
953 (!Reductions.count(Ptr: PN) && OuterL->contains(BB: PN->getParent()));
954 })) {
955 return false;
956 }
957 }
958 return true;
959}
960
961// We currently support LCSSA PHI nodes in the outer loop exit, if their
962// incoming values do not come from the outer loop latch or if the
963// outer loop latch has a single predecessor. In that case, the value will
964// be available if both the inner and outer loop conditions are true, which
965// will still be true after interchanging. If we have multiple predecessor,
966// that may not be the case, e.g. because the outer loop latch may be executed
967// if the inner loop is not executed.
968static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
969 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
970 for (PHINode &PHI : LoopNestExit->phis()) {
971 for (Value *Incoming : PHI.incoming_values()) {
972 Instruction *IncomingI = dyn_cast<Instruction>(Val: Incoming);
973 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
974 continue;
975
976 // The incoming value is defined in the outer loop latch. Currently we
977 // only support that in case the outer loop latch has a single predecessor.
978 // This guarantees that the outer loop latch is executed if and only if
979 // the inner loop is executed (because tightlyNested() guarantees that the
980 // outer loop header only branches to the inner loop or the outer loop
981 // latch).
982 // FIXME: We could weaken this logic and allow multiple predecessors,
983 // if the values are produced outside the loop latch. We would need
984 // additional logic to update the PHI nodes in the exit block as
985 // well.
986 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
987 return false;
988 }
989 }
990 return true;
991}
992
993// In case of multi-level nested loops, it may occur that lcssa phis exist in
994// the latch of InnerLoop, i.e., when defs of the incoming values are further
995// inside the loopnest. Sometimes those incoming values are not available
996// after interchange, since the original inner latch will become the new outer
997// latch which may have predecessor paths that do not include those incoming
998// values.
999// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1000// multi-level loop nests.
1001static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1002 if (InnerLoop->getSubLoops().empty())
1003 return true;
1004 // If the original outer latch has only one predecessor, then values defined
1005 // further inside the looploop, e.g., in the innermost loop, will be available
1006 // at the new outer latch after interchange.
1007 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1008 return true;
1009
1010 // The outer latch has more than one predecessors, i.e., the inner
1011 // exit and the inner header.
1012 // PHI nodes in the inner latch are lcssa phis where the incoming values
1013 // are defined further inside the loopnest. Check if those phis are used
1014 // in the original inner latch. If that is the case then bail out since
1015 // those incoming values may not be available at the new outer latch.
1016 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1017 for (PHINode &PHI : InnerLoopLatch->phis()) {
1018 for (auto *U : PHI.users()) {
1019 Instruction *UI = cast<Instruction>(Val: U);
1020 if (InnerLoopLatch == UI->getParent())
1021 return false;
1022 }
1023 }
1024 return true;
1025}
1026
1027bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1028 unsigned OuterLoopId,
1029 CharMatrix &DepMatrix) {
1030 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1031 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1032 << " and OuterLoopId = " << OuterLoopId
1033 << " due to dependence\n");
1034 ORE->emit(RemarkBuilder: [&]() {
1035 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1036 InnerLoop->getStartLoc(),
1037 InnerLoop->getHeader())
1038 << "Cannot interchange loops due to dependences.";
1039 });
1040 return false;
1041 }
1042 // Check if outer and inner loop contain legal instructions only.
1043 for (auto *BB : OuterLoop->blocks())
1044 for (Instruction &I : BB->instructionsWithoutDebug())
1045 if (CallInst *CI = dyn_cast<CallInst>(Val: &I)) {
1046 // readnone functions do not prevent interchanging.
1047 if (CI->onlyWritesMemory())
1048 continue;
1049 LLVM_DEBUG(
1050 dbgs() << "Loops with call instructions cannot be interchanged "
1051 << "safely.");
1052 ORE->emit(RemarkBuilder: [&]() {
1053 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
1054 CI->getDebugLoc(),
1055 CI->getParent())
1056 << "Cannot interchange loops due to call instruction.";
1057 });
1058
1059 return false;
1060 }
1061
1062 if (!findInductions(L: InnerLoop, Inductions&: InnerLoopInductions)) {
1063 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
1064 return false;
1065 }
1066
1067 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1068 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1069 ORE->emit(RemarkBuilder: [&]() {
1070 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1071 InnerLoop->getStartLoc(),
1072 InnerLoop->getHeader())
1073 << "Cannot interchange loops because unsupported PHI nodes found "
1074 "in inner loop latch.";
1075 });
1076 return false;
1077 }
1078
1079 // TODO: The loops could not be interchanged due to current limitations in the
1080 // transform module.
1081 if (currentLimitations()) {
1082 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1083 return false;
1084 }
1085
1086 // Check if the loops are tightly nested.
1087 if (!tightlyNested(OuterLoop, InnerLoop)) {
1088 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1089 ORE->emit(RemarkBuilder: [&]() {
1090 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1091 InnerLoop->getStartLoc(),
1092 InnerLoop->getHeader())
1093 << "Cannot interchange loops because they are not tightly "
1094 "nested.";
1095 });
1096 return false;
1097 }
1098
1099 if (!areInnerLoopExitPHIsSupported(InnerL: OuterLoop, OuterL: InnerLoop,
1100 Reductions&: OuterInnerReductions)) {
1101 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1102 ORE->emit(RemarkBuilder: [&]() {
1103 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1104 InnerLoop->getStartLoc(),
1105 InnerLoop->getHeader())
1106 << "Found unsupported PHI node in loop exit.";
1107 });
1108 return false;
1109 }
1110
1111 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1112 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1113 ORE->emit(RemarkBuilder: [&]() {
1114 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1115 OuterLoop->getStartLoc(),
1116 OuterLoop->getHeader())
1117 << "Found unsupported PHI node in loop exit.";
1118 });
1119 return false;
1120 }
1121
1122 return true;
1123}
1124
1125int LoopInterchangeProfitability::getInstrOrderCost() {
1126 unsigned GoodOrder, BadOrder;
1127 BadOrder = GoodOrder = 0;
1128 for (BasicBlock *BB : InnerLoop->blocks()) {
1129 for (Instruction &Ins : *BB) {
1130 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &Ins)) {
1131 bool FoundInnerInduction = false;
1132 bool FoundOuterInduction = false;
1133 for (Value *Op : GEP->operands()) {
1134 // Skip operands that are not SCEV-able.
1135 if (!SE->isSCEVable(Ty: Op->getType()))
1136 continue;
1137
1138 const SCEV *OperandVal = SE->getSCEV(V: Op);
1139 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: OperandVal);
1140 if (!AR)
1141 continue;
1142
1143 // If we find the inner induction after an outer induction e.g.
1144 // for(int i=0;i<N;i++)
1145 // for(int j=0;j<N;j++)
1146 // A[i][j] = A[i-1][j-1]+k;
1147 // then it is a good order.
1148 if (AR->getLoop() == InnerLoop) {
1149 // We found an InnerLoop induction after OuterLoop induction. It is
1150 // a good order.
1151 FoundInnerInduction = true;
1152 if (FoundOuterInduction) {
1153 GoodOrder++;
1154 break;
1155 }
1156 }
1157 // If we find the outer induction after an inner induction e.g.
1158 // for(int i=0;i<N;i++)
1159 // for(int j=0;j<N;j++)
1160 // A[j][i] = A[j-1][i-1]+k;
1161 // then it is a bad order.
1162 if (AR->getLoop() == OuterLoop) {
1163 // We found an OuterLoop induction after InnerLoop induction. It is
1164 // a bad order.
1165 FoundOuterInduction = true;
1166 if (FoundInnerInduction) {
1167 BadOrder++;
1168 break;
1169 }
1170 }
1171 }
1172 }
1173 }
1174 }
1175 return GoodOrder - BadOrder;
1176}
1177
1178std::optional<bool>
1179LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1180 const DenseMap<const Loop *, unsigned> &CostMap,
1181 std::unique_ptr<CacheCost> &CC) {
1182 // This is the new cost model returned from loop cache analysis.
1183 // A smaller index means the loop should be placed an outer loop, and vice
1184 // versa.
1185 auto InnerLoopIt = CostMap.find(Val: InnerLoop);
1186 if (InnerLoopIt == CostMap.end())
1187 return std::nullopt;
1188 auto OuterLoopIt = CostMap.find(Val: OuterLoop);
1189 if (OuterLoopIt == CostMap.end())
1190 return std::nullopt;
1191
1192 if (CC->getLoopCost(L: *OuterLoop) == CC->getLoopCost(L: *InnerLoop))
1193 return std::nullopt;
1194 unsigned InnerIndex = InnerLoopIt->second;
1195 unsigned OuterIndex = OuterLoopIt->second;
1196 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1197 << ", OuterIndex = " << OuterIndex << "\n");
1198 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1199 "numbers to each loop");
1200 return std::optional<bool>(InnerIndex < OuterIndex);
1201}
1202
1203std::optional<bool>
1204LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1205 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1206 // good and bad order of induction variables in the instruction and allows
1207 // reordering if number of bad orders is more than good.
1208 int Cost = getInstrOrderCost();
1209 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1210 if (Cost < 0 && Cost < LoopInterchangeCostThreshold)
1211 return std::optional<bool>(true);
1212
1213 return std::nullopt;
1214}
1215
1216/// Return true if we can vectorize the loop specified by \p LoopId.
1217static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1218 for (const auto &Dep : DepMatrix) {
1219 char Dir = Dep[LoopId];
1220 if (Dir != 'I' && Dir != '=')
1221 return false;
1222 }
1223 return true;
1224}
1225
1226std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1227 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1228 // If the outer loop is not loop independent it is not profitable to move
1229 // this to inner position, since doing so would not enable inner loop
1230 // parallelism.
1231 if (!canVectorize(DepMatrix, LoopId: OuterLoopId))
1232 return false;
1233
1234 // If inner loop has dependence and outer loop is loop independent then it is
1235 // profitable to interchange to enable inner loop parallelism.
1236 if (!canVectorize(DepMatrix, LoopId: InnerLoopId))
1237 return true;
1238
1239 // If both the inner and the outer loop can be vectorized, it is necessary to
1240 // check the cost of each vectorized loop for profitability decision. At this
1241 // time we do not have a cost model to estimate them, so return nullopt.
1242 // TODO: Estimate the cost of vectorized loop when both the outer and the
1243 // inner loop can be vectorized.
1244 return std::nullopt;
1245}
1246
1247bool LoopInterchangeProfitability::isProfitable(
1248 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1249 unsigned OuterLoopId, CharMatrix &DepMatrix,
1250 const DenseMap<const Loop *, unsigned> &CostMap,
1251 std::unique_ptr<CacheCost> &CC) {
1252 // isProfitable() is structured to avoid endless loop interchange. If the
1253 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1254 // decide the profitability then, profitability check will stop and return the
1255 // analysis result. If it failed to determine it (e.g., cache analysis failed
1256 // to analyze the loopnest due to delinearization issues) then go ahead the
1257 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1258 // Likewise, if it failed to analysis the profitability then only, the last
1259 // rule (isProfitableForVectorization by default) will decide.
1260 assert(noDuplicateRules(Profitabilities) && "Detect duplicate rules");
1261 std::optional<bool> shouldInterchange;
1262 for (RuleTy RT : Profitabilities) {
1263 switch (RT) {
1264 case RuleTy::PerLoopCacheAnalysis:
1265 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1266 break;
1267 case RuleTy::PerInstrOrderCost:
1268 shouldInterchange = isProfitablePerInstrOrderCost();
1269 break;
1270 case RuleTy::ForVectorization:
1271 shouldInterchange =
1272 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1273 break;
1274 }
1275
1276 // If this rule could determine the profitability, don't call subsequent
1277 // rules.
1278 if (shouldInterchange.has_value())
1279 break;
1280 }
1281
1282 if (!shouldInterchange.has_value()) {
1283 ORE->emit(RemarkBuilder: [&]() {
1284 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1285 InnerLoop->getStartLoc(),
1286 InnerLoop->getHeader())
1287 << "Insufficient information to calculate the cost of loop for "
1288 "interchange.";
1289 });
1290 return false;
1291 } else if (!shouldInterchange.value()) {
1292 ORE->emit(RemarkBuilder: [&]() {
1293 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1294 InnerLoop->getStartLoc(),
1295 InnerLoop->getHeader())
1296 << "Interchanging loops is not considered to improve cache "
1297 "locality nor vectorization.";
1298 });
1299 return false;
1300 }
1301 return true;
1302}
1303
1304void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1305 Loop *InnerLoop) {
1306 for (Loop *L : *OuterLoop)
1307 if (L == InnerLoop) {
1308 OuterLoop->removeChildLoop(Child: L);
1309 return;
1310 }
1311 llvm_unreachable("Couldn't find loop");
1312}
1313
1314/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1315/// new inner and outer loop after interchanging: NewInner is the original
1316/// outer loop and NewOuter is the original inner loop.
1317///
1318/// Before interchanging, we have the following structure
1319/// Outer preheader
1320// Outer header
1321// Inner preheader
1322// Inner header
1323// Inner body
1324// Inner latch
1325// outer bbs
1326// Outer latch
1327//
1328// After interchanging:
1329// Inner preheader
1330// Inner header
1331// Outer preheader
1332// Outer header
1333// Inner body
1334// outer bbs
1335// Outer latch
1336// Inner latch
1337void LoopInterchangeTransform::restructureLoops(
1338 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1339 BasicBlock *OrigOuterPreHeader) {
1340 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1341 // The original inner loop preheader moves from the new inner loop to
1342 // the parent loop, if there is one.
1343 NewInner->removeBlockFromLoop(BB: OrigInnerPreHeader);
1344 LI->changeLoopFor(BB: OrigInnerPreHeader, L: OuterLoopParent);
1345
1346 // Switch the loop levels.
1347 if (OuterLoopParent) {
1348 // Remove the loop from its parent loop.
1349 removeChildLoop(OuterLoop: OuterLoopParent, InnerLoop: NewInner);
1350 removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter);
1351 OuterLoopParent->addChildLoop(NewChild: NewOuter);
1352 } else {
1353 removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter);
1354 LI->changeTopLevelLoop(OldLoop: NewInner, NewLoop: NewOuter);
1355 }
1356 while (!NewOuter->isInnermost())
1357 NewInner->addChildLoop(NewChild: NewOuter->removeChildLoop(I: NewOuter->begin()));
1358 NewOuter->addChildLoop(NewChild: NewInner);
1359
1360 // BBs from the original inner loop.
1361 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1362
1363 // Add BBs from the original outer loop to the original inner loop (excluding
1364 // BBs already in inner loop)
1365 for (BasicBlock *BB : NewInner->blocks())
1366 if (LI->getLoopFor(BB) == NewInner)
1367 NewOuter->addBlockEntry(BB);
1368
1369 // Now remove inner loop header and latch from the new inner loop and move
1370 // other BBs (the loop body) to the new inner loop.
1371 BasicBlock *OuterHeader = NewOuter->getHeader();
1372 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1373 for (BasicBlock *BB : OrigInnerBBs) {
1374 // Nothing will change for BBs in child loops.
1375 if (LI->getLoopFor(BB) != NewOuter)
1376 continue;
1377 // Remove the new outer loop header and latch from the new inner loop.
1378 if (BB == OuterHeader || BB == OuterLatch)
1379 NewInner->removeBlockFromLoop(BB);
1380 else
1381 LI->changeLoopFor(BB, L: NewInner);
1382 }
1383
1384 // The preheader of the original outer loop becomes part of the new
1385 // outer loop.
1386 NewOuter->addBlockEntry(BB: OrigOuterPreHeader);
1387 LI->changeLoopFor(BB: OrigOuterPreHeader, L: NewOuter);
1388
1389 // Tell SE that we move the loops around.
1390 SE->forgetLoop(L: NewOuter);
1391}
1392
1393bool LoopInterchangeTransform::transform() {
1394 bool Transformed = false;
1395
1396 if (InnerLoop->getSubLoops().empty()) {
1397 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1398 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1399 auto &InductionPHIs = LIL.getInnerLoopInductions();
1400 if (InductionPHIs.empty()) {
1401 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1402 return false;
1403 }
1404
1405 SmallVector<Instruction *, 8> InnerIndexVarList;
1406 for (PHINode *CurInductionPHI : InductionPHIs) {
1407 if (CurInductionPHI->getIncomingBlock(i: 0) == InnerLoopPreHeader)
1408 InnerIndexVarList.push_back(
1409 Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: 1)));
1410 else
1411 InnerIndexVarList.push_back(
1412 Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: 0)));
1413 }
1414
1415 // Create a new latch block for the inner loop. We split at the
1416 // current latch's terminator and then move the condition and all
1417 // operands that are not either loop-invariant or the induction PHI into the
1418 // new latch block.
1419 BasicBlock *NewLatch =
1420 SplitBlock(Old: InnerLoop->getLoopLatch(),
1421 SplitPt: InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1422
1423 SmallSetVector<Instruction *, 4> WorkList;
1424 unsigned i = 0;
1425 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1426 for (; i < WorkList.size(); i++) {
1427 // Duplicate instruction and move it the new latch. Update uses that
1428 // have been moved.
1429 Instruction *NewI = WorkList[i]->clone();
1430 NewI->insertBefore(InsertPos: NewLatch->getFirstNonPHIIt());
1431 assert(!NewI->mayHaveSideEffects() &&
1432 "Moving instructions with side-effects may change behavior of "
1433 "the loop nest!");
1434 for (Use &U : llvm::make_early_inc_range(Range: WorkList[i]->uses())) {
1435 Instruction *UserI = cast<Instruction>(Val: U.getUser());
1436 if (!InnerLoop->contains(BB: UserI->getParent()) ||
1437 UserI->getParent() == NewLatch ||
1438 llvm::is_contained(Range: InductionPHIs, Element: UserI))
1439 U.set(NewI);
1440 }
1441 // Add operands of moved instruction to the worklist, except if they are
1442 // outside the inner loop or are the induction PHI.
1443 for (Value *Op : WorkList[i]->operands()) {
1444 Instruction *OpI = dyn_cast<Instruction>(Val: Op);
1445 if (!OpI ||
1446 this->LI->getLoopFor(BB: OpI->getParent()) != this->InnerLoop ||
1447 llvm::is_contained(Range: InductionPHIs, Element: OpI))
1448 continue;
1449 WorkList.insert(X: OpI);
1450 }
1451 }
1452 };
1453
1454 // FIXME: Should we interchange when we have a constant condition?
1455 Instruction *CondI = dyn_cast<Instruction>(
1456 Val: cast<BranchInst>(Val: InnerLoop->getLoopLatch()->getTerminator())
1457 ->getCondition());
1458 if (CondI)
1459 WorkList.insert(X: CondI);
1460 MoveInstructions();
1461 for (Instruction *InnerIndexVar : InnerIndexVarList)
1462 WorkList.insert(X: cast<Instruction>(Val: InnerIndexVar));
1463 MoveInstructions();
1464 }
1465
1466 // Ensure the inner loop phi nodes have a separate basic block.
1467 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1468 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
1469 InnerLoopHeader->getTerminator()) {
1470 SplitBlock(Old: InnerLoopHeader, SplitPt: InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
1471 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
1472 }
1473
1474 // Instructions in the original inner loop preheader may depend on values
1475 // defined in the outer loop header. Move them there, because the original
1476 // inner loop preheader will become the entry into the interchanged loop nest.
1477 // Currently we move all instructions and rely on LICM to move invariant
1478 // instructions outside the loop nest.
1479 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1480 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1481 if (InnerLoopPreHeader != OuterLoopHeader) {
1482 for (Instruction &I :
1483 make_early_inc_range(Range: make_range(x: InnerLoopPreHeader->begin(),
1484 y: std::prev(x: InnerLoopPreHeader->end()))))
1485 I.moveBeforePreserving(MovePos: OuterLoopHeader->getTerminator()->getIterator());
1486 }
1487
1488 Transformed |= adjustLoopLinks();
1489 if (!Transformed) {
1490 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
1491 return false;
1492 }
1493
1494 return true;
1495}
1496
1497/// \brief Move all instructions except the terminator from FromBB right before
1498/// InsertBefore
1499static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1500 BasicBlock *ToBB = InsertBefore->getParent();
1501
1502 ToBB->splice(ToIt: InsertBefore->getIterator(), FromBB, FromBeginIt: FromBB->begin(),
1503 FromEndIt: FromBB->getTerminator()->getIterator());
1504}
1505
1506/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
1507static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
1508 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
1509 // from BB1 afterwards.
1510 auto Iter = map_range(C&: *BB1, F: [](Instruction &I) { return &I; });
1511 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(x: Iter.end()));
1512 for (Instruction *I : TempInstrs)
1513 I->removeFromParent();
1514
1515 // Move instructions from BB2 to BB1.
1516 moveBBContents(FromBB: BB2, InsertBefore: BB1->getTerminator());
1517
1518 // Move instructions from TempInstrs to BB2.
1519 for (Instruction *I : TempInstrs)
1520 I->insertBefore(InsertPos: BB2->getTerminator()->getIterator());
1521}
1522
1523// Update BI to jump to NewBB instead of OldBB. Records updates to the
1524// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
1525// \p OldBB is exactly once in BI's successor list.
1526static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
1527 BasicBlock *NewBB,
1528 std::vector<DominatorTree::UpdateType> &DTUpdates,
1529 bool MustUpdateOnce = true) {
1530 assert((!MustUpdateOnce || llvm::count(successors(BI), OldBB) == 1) &&
1531 "BI must jump to OldBB exactly once.");
1532 bool Changed = false;
1533 for (Use &Op : BI->operands())
1534 if (Op == OldBB) {
1535 Op.set(NewBB);
1536 Changed = true;
1537 }
1538
1539 if (Changed) {
1540 DTUpdates.push_back(
1541 x: {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
1542 DTUpdates.push_back(
1543 x: {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
1544 }
1545 assert(Changed && "Expected a successor to be updated");
1546}
1547
1548// Move Lcssa PHIs to the right place.
1549static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
1550 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
1551 BasicBlock *OuterLatch, BasicBlock *OuterExit,
1552 Loop *InnerLoop, LoopInfo *LI) {
1553
1554 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
1555 // defined either in the header or latch. Those blocks will become header and
1556 // latch of the new outer loop, and the only possible users can PHI nodes
1557 // in the exit block of the loop nest or the outer loop header (reduction
1558 // PHIs, in that case, the incoming value must be defined in the inner loop
1559 // header). We can just substitute the user with the incoming value and remove
1560 // the PHI.
1561 for (PHINode &P : make_early_inc_range(Range: InnerExit->phis())) {
1562 assert(P.getNumIncomingValues() == 1 &&
1563 "Only loops with a single exit are supported!");
1564
1565 // Incoming values are guaranteed be instructions currently.
1566 auto IncI = cast<Instruction>(Val: P.getIncomingValueForBlock(BB: InnerLatch));
1567 // In case of multi-level nested loops, follow LCSSA to find the incoming
1568 // value defined from the innermost loop.
1569 auto IncIInnerMost = cast<Instruction>(Val: followLCSSA(SV: IncI));
1570 // Skip phis with incoming values from the inner loop body, excluding the
1571 // header and latch.
1572 if (IncIInnerMost->getParent() != InnerLatch &&
1573 IncIInnerMost->getParent() != InnerHeader)
1574 continue;
1575
1576 assert(all_of(P.users(),
1577 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
1578 return (cast<PHINode>(U)->getParent() == OuterHeader &&
1579 IncI->getParent() == InnerHeader) ||
1580 cast<PHINode>(U)->getParent() == OuterExit;
1581 }) &&
1582 "Can only replace phis iff the uses are in the loop nest exit or "
1583 "the incoming value is defined in the inner header (it will "
1584 "dominate all loop blocks after interchanging)");
1585 P.replaceAllUsesWith(V: IncI);
1586 P.eraseFromParent();
1587 }
1588
1589 SmallVector<PHINode *, 8> LcssaInnerExit(
1590 llvm::make_pointer_range(Range: InnerExit->phis()));
1591
1592 SmallVector<PHINode *, 8> LcssaInnerLatch(
1593 llvm::make_pointer_range(Range: InnerLatch->phis()));
1594
1595 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
1596 // If a PHI node has users outside of InnerExit, it has a use outside the
1597 // interchanged loop and we have to preserve it. We move these to
1598 // InnerLatch, which will become the new exit block for the innermost
1599 // loop after interchanging.
1600 for (PHINode *P : LcssaInnerExit)
1601 P->moveBefore(InsertPos: InnerLatch->getFirstNonPHIIt());
1602
1603 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
1604 // and we have to move them to the new inner latch.
1605 for (PHINode *P : LcssaInnerLatch)
1606 P->moveBefore(InsertPos: InnerExit->getFirstNonPHIIt());
1607
1608 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
1609 // incoming values defined in the outer loop, we have to add a new PHI
1610 // in the inner loop latch, which became the exit block of the outer loop,
1611 // after interchanging.
1612 if (OuterExit) {
1613 for (PHINode &P : OuterExit->phis()) {
1614 if (P.getNumIncomingValues() != 1)
1615 continue;
1616 // Skip Phis with incoming values defined in the inner loop. Those should
1617 // already have been updated.
1618 auto I = dyn_cast<Instruction>(Val: P.getIncomingValue(i: 0));
1619 if (!I || LI->getLoopFor(BB: I->getParent()) == InnerLoop)
1620 continue;
1621
1622 PHINode *NewPhi = dyn_cast<PHINode>(Val: P.clone());
1623 NewPhi->setIncomingValue(i: 0, V: P.getIncomingValue(i: 0));
1624 NewPhi->setIncomingBlock(i: 0, BB: OuterLatch);
1625 // We might have incoming edges from other BBs, i.e., the original outer
1626 // header.
1627 for (auto *Pred : predecessors(BB: InnerLatch)) {
1628 if (Pred == OuterLatch)
1629 continue;
1630 NewPhi->addIncoming(V: P.getIncomingValue(i: 0), BB: Pred);
1631 }
1632 NewPhi->insertBefore(InsertPos: InnerLatch->getFirstNonPHIIt());
1633 P.setIncomingValue(i: 0, V: NewPhi);
1634 }
1635 }
1636
1637 // Now adjust the incoming blocks for the LCSSA PHIs.
1638 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
1639 // with the new latch.
1640 InnerLatch->replacePhiUsesWith(Old: InnerLatch, New: OuterLatch);
1641}
1642
1643bool LoopInterchangeTransform::adjustLoopBranches() {
1644 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
1645 std::vector<DominatorTree::UpdateType> DTUpdates;
1646
1647 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1648 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1649
1650 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
1651 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
1652 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
1653 // Ensure that both preheaders do not contain PHI nodes and have single
1654 // predecessors. This allows us to move them easily. We use
1655 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
1656 // preheaders do not satisfy those conditions.
1657 if (isa<PHINode>(Val: OuterLoopPreHeader->begin()) ||
1658 !OuterLoopPreHeader->getUniquePredecessor())
1659 OuterLoopPreHeader =
1660 InsertPreheaderForLoop(L: OuterLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true);
1661 if (InnerLoopPreHeader == OuterLoop->getHeader())
1662 InnerLoopPreHeader =
1663 InsertPreheaderForLoop(L: InnerLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true);
1664
1665 // Adjust the loop preheader
1666 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1667 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1668 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1669 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1670 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1671 BasicBlock *InnerLoopLatchPredecessor =
1672 InnerLoopLatch->getUniquePredecessor();
1673 BasicBlock *InnerLoopLatchSuccessor;
1674 BasicBlock *OuterLoopLatchSuccessor;
1675
1676 BranchInst *OuterLoopLatchBI =
1677 dyn_cast<BranchInst>(Val: OuterLoopLatch->getTerminator());
1678 BranchInst *InnerLoopLatchBI =
1679 dyn_cast<BranchInst>(Val: InnerLoopLatch->getTerminator());
1680 BranchInst *OuterLoopHeaderBI =
1681 dyn_cast<BranchInst>(Val: OuterLoopHeader->getTerminator());
1682 BranchInst *InnerLoopHeaderBI =
1683 dyn_cast<BranchInst>(Val: InnerLoopHeader->getTerminator());
1684
1685 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1686 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1687 !InnerLoopHeaderBI)
1688 return false;
1689
1690 BranchInst *InnerLoopLatchPredecessorBI =
1691 dyn_cast<BranchInst>(Val: InnerLoopLatchPredecessor->getTerminator());
1692 BranchInst *OuterLoopPredecessorBI =
1693 dyn_cast<BranchInst>(Val: OuterLoopPredecessor->getTerminator());
1694
1695 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1696 return false;
1697 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1698 if (!InnerLoopHeaderSuccessor)
1699 return false;
1700
1701 // Adjust Loop Preheader and headers.
1702 // The branches in the outer loop predecessor and the outer loop header can
1703 // be unconditional branches or conditional branches with duplicates. Consider
1704 // this when updating the successors.
1705 updateSuccessor(BI: OuterLoopPredecessorBI, OldBB: OuterLoopPreHeader,
1706 NewBB: InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
1707 // The outer loop header might or might not branch to the outer latch.
1708 // We are guaranteed to branch to the inner loop preheader.
1709 if (llvm::is_contained(Range: OuterLoopHeaderBI->successors(), Element: OuterLoopLatch)) {
1710 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
1711 updateSuccessor(BI: OuterLoopHeaderBI, OldBB: OuterLoopLatch, NewBB: InnerLoopLatch,
1712 DTUpdates,
1713 /*MustUpdateOnce=*/false);
1714 }
1715 updateSuccessor(BI: OuterLoopHeaderBI, OldBB: InnerLoopPreHeader,
1716 NewBB: InnerLoopHeaderSuccessor, DTUpdates,
1717 /*MustUpdateOnce=*/false);
1718
1719 // Adjust reduction PHI's now that the incoming block has changed.
1720 InnerLoopHeaderSuccessor->replacePhiUsesWith(Old: InnerLoopHeader,
1721 New: OuterLoopHeader);
1722
1723 updateSuccessor(BI: InnerLoopHeaderBI, OldBB: InnerLoopHeaderSuccessor,
1724 NewBB: OuterLoopPreHeader, DTUpdates);
1725
1726 // -------------Adjust loop latches-----------
1727 if (InnerLoopLatchBI->getSuccessor(i: 0) == InnerLoopHeader)
1728 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: 1);
1729 else
1730 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: 0);
1731
1732 updateSuccessor(BI: InnerLoopLatchPredecessorBI, OldBB: InnerLoopLatch,
1733 NewBB: InnerLoopLatchSuccessor, DTUpdates);
1734
1735 if (OuterLoopLatchBI->getSuccessor(i: 0) == OuterLoopHeader)
1736 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: 1);
1737 else
1738 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: 0);
1739
1740 updateSuccessor(BI: InnerLoopLatchBI, OldBB: InnerLoopLatchSuccessor,
1741 NewBB: OuterLoopLatchSuccessor, DTUpdates);
1742 updateSuccessor(BI: OuterLoopLatchBI, OldBB: OuterLoopLatchSuccessor, NewBB: InnerLoopLatch,
1743 DTUpdates);
1744
1745 DT->applyUpdates(Updates: DTUpdates);
1746 restructureLoops(NewInner: OuterLoop, NewOuter: InnerLoop, OrigInnerPreHeader: InnerLoopPreHeader,
1747 OrigOuterPreHeader: OuterLoopPreHeader);
1748
1749 moveLCSSAPhis(InnerExit: InnerLoopLatchSuccessor, InnerHeader: InnerLoopHeader, InnerLatch: InnerLoopLatch,
1750 OuterHeader: OuterLoopHeader, OuterLatch: OuterLoopLatch, OuterExit: InnerLoop->getExitBlock(),
1751 InnerLoop, LI);
1752 // For PHIs in the exit block of the outer loop, outer's latch has been
1753 // replaced by Inners'.
1754 OuterLoopLatchSuccessor->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch);
1755
1756 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
1757 // Now update the reduction PHIs in the inner and outer loop headers.
1758 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
1759 for (PHINode &PHI : InnerLoopHeader->phis())
1760 if (OuterInnerReductions.contains(Ptr: &PHI))
1761 InnerLoopPHIs.push_back(Elt: &PHI);
1762
1763 for (PHINode &PHI : OuterLoopHeader->phis())
1764 if (OuterInnerReductions.contains(Ptr: &PHI))
1765 OuterLoopPHIs.push_back(Elt: &PHI);
1766
1767 // Now move the remaining reduction PHIs from outer to inner loop header and
1768 // vice versa. The PHI nodes must be part of a reduction across the inner and
1769 // outer loop and all the remains to do is and updating the incoming blocks.
1770 for (PHINode *PHI : OuterLoopPHIs) {
1771 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
1772 PHI->moveBefore(InsertPos: InnerLoopHeader->getFirstNonPHIIt());
1773 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1774 }
1775 for (PHINode *PHI : InnerLoopPHIs) {
1776 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
1777 PHI->moveBefore(InsertPos: OuterLoopHeader->getFirstNonPHIIt());
1778 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1779 }
1780
1781 // Update the incoming blocks for moved PHI nodes.
1782 OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopPreHeader, New: OuterLoopPreHeader);
1783 OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopLatch, New: OuterLoopLatch);
1784 InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopPreHeader, New: InnerLoopPreHeader);
1785 InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch);
1786
1787 // Values defined in the outer loop header could be used in the inner loop
1788 // latch. In that case, we need to create LCSSA phis for them, because after
1789 // interchanging they will be defined in the new inner loop and used in the
1790 // new outer loop.
1791 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
1792 for (Instruction &I :
1793 make_range(x: OuterLoopHeader->begin(), y: std::prev(x: OuterLoopHeader->end())))
1794 MayNeedLCSSAPhis.push_back(Elt: &I);
1795 formLCSSAForInstructions(Worklist&: MayNeedLCSSAPhis, DT: *DT, LI: *LI, SE);
1796
1797 return true;
1798}
1799
1800bool LoopInterchangeTransform::adjustLoopLinks() {
1801 // Adjust all branches in the inner and outer loop.
1802 bool Changed = adjustLoopBranches();
1803 if (Changed) {
1804 // We have interchanged the preheaders so we need to interchange the data in
1805 // the preheaders as well. This is because the content of the inner
1806 // preheader was previously executed inside the outer loop.
1807 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1808 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1809 swapBBContents(BB1: OuterLoopPreHeader, BB2: InnerLoopPreHeader);
1810 }
1811 return Changed;
1812}
1813
1814PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
1815 LoopAnalysisManager &AM,
1816 LoopStandardAnalysisResults &AR,
1817 LPMUpdater &U) {
1818 Function &F = *LN.getParent();
1819 SmallVector<Loop *, 8> LoopList(LN.getLoops());
1820
1821 if (MaxMemInstrCount < 1) {
1822 LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
1823 return PreservedAnalyses::all();
1824 }
1825 OptimizationRemarkEmitter ORE(&F);
1826
1827 // Ensure minimum depth of the loop nest to do the interchange.
1828 if (!hasSupportedLoopDepth(LoopList, ORE))
1829 return PreservedAnalyses::all();
1830 // Ensure computable loop nest.
1831 if (!isComputableLoopNest(SE: &AR.SE, LoopList)) {
1832 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
1833 return PreservedAnalyses::all();
1834 }
1835
1836 ORE.emit(RemarkBuilder: [&]() {
1837 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
1838 LN.getOutermostLoop().getStartLoc(),
1839 LN.getOutermostLoop().getHeader())
1840 << "Computed dependence info, invoking the transform.";
1841 });
1842
1843 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
1844 std::unique_ptr<CacheCost> CC =
1845 CacheCost::getCacheCost(Root&: LN.getOutermostLoop(), AR, DI);
1846
1847 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
1848 return PreservedAnalyses::all();
1849 U.markLoopNestChanged(Changed: true);
1850 return getLoopPassPreservedAnalyses();
1851}
1852