1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Transforms/Scalar/LoopInterchange.h"
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
18#include "llvm/ADT/SmallVector.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringMap.h"
21#include "llvm/ADT/StringRef.h"
22#include "llvm/Analysis/DependenceAnalysis.h"
23#include "llvm/Analysis/LoopCacheAnalysis.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/LoopNestAnalysis.h"
26#include "llvm/Analysis/LoopPass.h"
27#include "llvm/Analysis/OptimizationRemarkEmitter.h"
28#include "llvm/Analysis/ScalarEvolution.h"
29#include "llvm/Analysis/ScalarEvolutionExpressions.h"
30#include "llvm/IR/BasicBlock.h"
31#include "llvm/IR/DiagnosticInfo.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/InstrTypes.h"
36#include "llvm/IR/Instruction.h"
37#include "llvm/IR/Instructions.h"
38#include "llvm/IR/User.h"
39#include "llvm/IR/Value.h"
40#include "llvm/Support/Casting.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/Debug.h"
43#include "llvm/Support/ErrorHandling.h"
44#include "llvm/Support/raw_ostream.h"
45#include "llvm/Transforms/Scalar/LoopPassManager.h"
46#include "llvm/Transforms/Utils/BasicBlockUtils.h"
47#include "llvm/Transforms/Utils/Local.h"
48#include "llvm/Transforms/Utils/LoopUtils.h"
49#include <cassert>
50#include <utility>
51#include <vector>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "loop-interchange"
56
57STATISTIC(LoopsInterchanged, "Number of loops interchanged");
58
59static cl::opt<int> LoopInterchangeCostThreshold(
60 "loop-interchange-threshold", cl::init(Val: 0), cl::Hidden,
61 cl::desc("Interchange if you gain more than this number"));
62
63// Maximum number of load-stores that can be handled in the dependency matrix.
64static cl::opt<unsigned int> MaxMemInstrCount(
65 "loop-interchange-max-meminstr-count", cl::init(Val: 64), cl::Hidden,
66 cl::desc(
67 "Maximum number of load-store instructions that should be handled "
68 "in the dependency matrix. Higher value may lead to more interchanges "
69 "at the cost of compile-time"));
70
71namespace {
72
73using LoopVector = SmallVector<Loop *, 8>;
74
75/// A list of direction vectors. Each entry represents a direction vector
76/// corresponding to one or more dependencies existing in the loop nest. The
77/// length of all direction vectors is equal and is N + 1, where N is the depth
78/// of the loop nest. The first N elements correspond to the dependency
79/// direction of each N loops. The last one indicates whether this entry is
80/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
81/// is defined in LoopAccessAnalysis.
82// TODO: Check if we can use a sparse matrix here.
83using CharMatrix = std::vector<std::vector<char>>;
84
85/// Types of rules used in profitability check.
86enum class RuleTy {
87 PerLoopCacheAnalysis,
88 PerInstrOrderCost,
89 ForVectorization,
90 Ignore
91};
92
93} // end anonymous namespace
94
95// Minimum loop depth supported.
96static cl::opt<unsigned int> MinLoopNestDepth(
97 "loop-interchange-min-loop-nest-depth", cl::init(Val: 2), cl::Hidden,
98 cl::desc("Minimum depth of loop nest considered for the transform"));
99
100// Maximum loop depth supported.
101static cl::opt<unsigned int> MaxLoopNestDepth(
102 "loop-interchange-max-loop-nest-depth", cl::init(Val: 10), cl::Hidden,
103 cl::desc("Maximum depth of loop nest considered for the transform"));
104
105// We prefer cache cost to vectorization by default.
106static cl::list<RuleTy> Profitabilities(
107 "loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated,
108 cl::Hidden,
109 cl::desc("List of profitability heuristics to be used. They are applied in "
110 "the given order"),
111 cl::list_init<RuleTy>(Vals: {RuleTy::PerLoopCacheAnalysis,
112 RuleTy::PerInstrOrderCost,
113 RuleTy::ForVectorization}),
114 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
115 "Prioritize loop cache cost"),
116 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
117 "Prioritize the IVs order of each instruction"),
118 clEnumValN(RuleTy::ForVectorization, "vectorize",
119 "Prioritize vectorization"),
120 clEnumValN(RuleTy::Ignore, "ignore",
121 "Ignore profitability, force interchange (does not "
122 "work with other options)")));
123
124// Support for the inner-loop reduction pattern.
125static cl::opt<bool> EnableReduction2Memory(
126 "loop-interchange-reduction-to-mem", cl::init(Val: false), cl::Hidden,
127 cl::desc("Support for the inner-loop reduction pattern."));
128
129#ifndef NDEBUG
130static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) {
131 SmallSet<RuleTy, 4> Set;
132 for (RuleTy Rule : Rules) {
133 if (!Set.insert(Rule).second)
134 return false;
135 if (Rule == RuleTy::Ignore)
136 return false;
137 }
138 return true;
139}
140
141static void printDepMatrix(CharMatrix &DepMatrix) {
142 for (auto &Row : DepMatrix) {
143 // Drop the last element because it is a flag indicating whether this is
144 // forward dependency or not, which doesn't affect the legality check.
145 for (char D : drop_end(Row))
146 LLVM_DEBUG(dbgs() << D << " ");
147 LLVM_DEBUG(dbgs() << "\n");
148 }
149}
150
151/// Return true if \p Src appears before \p Dst in the same basic block.
152/// Precondition: \p Src and \Dst are distinct instructions within the same
153/// basic block.
154static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
155 assert(Src->getParent() == Dst->getParent() && Src != Dst &&
156 "Expected Src and Dst to be different instructions in the same BB");
157
158 bool FoundSrc = false;
159 for (const Instruction &I : *(Src->getParent())) {
160 if (&I == Src) {
161 FoundSrc = true;
162 continue;
163 }
164 if (&I == Dst)
165 return FoundSrc;
166 }
167
168 llvm_unreachable("Dst not found");
169}
170#endif
171
172static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
173 Loop *L, DependenceInfo *DI,
174 ScalarEvolution *SE,
175 OptimizationRemarkEmitter *ORE) {
176 using ValueVector = SmallVector<Value *, 16>;
177
178 ValueVector MemInstr;
179
180 // For each block.
181 for (BasicBlock *BB : L->blocks()) {
182 // Scan the BB and collect legal loads and stores.
183 for (Instruction &I : *BB) {
184 if (!isa<Instruction>(Val: I))
185 return false;
186 if (auto *Ld = dyn_cast<LoadInst>(Val: &I)) {
187 if (!Ld->isSimple())
188 return false;
189 MemInstr.push_back(Elt: &I);
190 } else if (auto *St = dyn_cast<StoreInst>(Val: &I)) {
191 if (!St->isSimple())
192 return false;
193 MemInstr.push_back(Elt: &I);
194 }
195 }
196 }
197
198 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
199 << " Loads and Stores to analyze\n");
200 if (MemInstr.size() > MaxMemInstrCount) {
201 LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
202 << MaxMemInstrCount << " load/stores in a loop\n");
203 ORE->emit(RemarkBuilder: [&]() {
204 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
205 L->getStartLoc(), L->getHeader())
206 << "Number of loads/stores exceeded, the supported maximum "
207 "can be increased with option "
208 "-loop-interchange-maxmeminstr-count.";
209 });
210 return false;
211 }
212 ValueVector::iterator I, IE, J, JE;
213
214 // Manage direction vectors that are already seen. Map each direction vector
215 // to an index of DepMatrix at which it is stored.
216 StringMap<unsigned> Seen;
217
218 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
219 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
220 std::vector<char> Dep;
221 Instruction *Src = cast<Instruction>(Val: *I);
222 Instruction *Dst = cast<Instruction>(Val: *J);
223 // Ignore Input dependencies.
224 if (isa<LoadInst>(Val: Src) && isa<LoadInst>(Val: Dst))
225 continue;
226 // Track Output, Flow, and Anti dependencies.
227 if (auto D = DI->depends(Src, Dst)) {
228 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
229 // If the direction vector is negative, normalize it to
230 // make it non-negative.
231 if (D->normalize(SE))
232 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
233 LLVM_DEBUG(StringRef DepType =
234 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
235 dbgs() << "Found " << DepType
236 << " dependency between Src and Dst\n"
237 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
238 unsigned Levels = D->getLevels();
239 char Direction;
240 for (unsigned II = 1; II <= Levels; ++II) {
241 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
242 // or `=`, for which we don't have an equivalent representation, so
243 // that the conservative approximation is necessary. The same goes for
244 // `DVEntry::GE`.
245 // TODO: Use of fine-grained expressions allows for more accurate
246 // analysis.
247 unsigned Dir = D->getDirection(Level: II);
248 if (Dir == Dependence::DVEntry::LT)
249 Direction = '<';
250 else if (Dir == Dependence::DVEntry::GT)
251 Direction = '>';
252 else if (Dir == Dependence::DVEntry::EQ)
253 Direction = '=';
254 else
255 Direction = '*';
256 Dep.push_back(x: Direction);
257 }
258
259 // If the Dependence object doesn't have any information, fill the
260 // dependency vector with '*'.
261 if (D->isConfused()) {
262 assert(Dep.empty() && "Expected empty dependency vector");
263 Dep.assign(n: Level, val: '*');
264 }
265
266 while (Dep.size() != Level) {
267 Dep.push_back(x: 'I');
268 }
269
270 // If all the elements of any direction vector have only '*', legality
271 // can't be proven. Exit early to save compile time.
272 if (all_of(Range&: Dep, P: equal_to(Arg: '*'))) {
273 ORE->emit(RemarkBuilder: [&]() {
274 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
275 L->getStartLoc(), L->getHeader())
276 << "All loops have dependencies in all directions.";
277 });
278 return false;
279 }
280
281 // Test whether the dependency is forward or not.
282 bool IsKnownForward = true;
283 if (Src->getParent() != Dst->getParent()) {
284 // In general, when Src and Dst are in different BBs, the execution
285 // order of them within a single iteration is not guaranteed. Treat
286 // conservatively as not-forward dependency in this case.
287 IsKnownForward = false;
288 } else {
289 // Src and Dst are in the same BB. If they are the different
290 // instructions, Src should appear before Dst in the BB as they are
291 // stored to MemInstr in that order.
292 assert((Src == Dst || inThisOrder(Src, Dst)) &&
293 "Unexpected instructions");
294
295 // If the Dependence object is reversed (due to normalization), it
296 // represents the dependency from Dst to Src, meaning it is a backward
297 // dependency. Otherwise it should be a forward dependency.
298 bool IsReversed = D->getSrc() != Src;
299 if (IsReversed)
300 IsKnownForward = false;
301 }
302
303 // Initialize the last element. Assume forward dependencies only; it
304 // will be updated later if there is any non-forward dependency.
305 Dep.push_back(x: '<');
306
307 // The last element should express the "summary" among one or more
308 // direction vectors whose first N elements are the same (where N is
309 // the depth of the loop nest). Hence we exclude the last element from
310 // the Seen map.
311 auto [Ite, Inserted] = Seen.try_emplace(
312 Key: StringRef(Dep.data(), Dep.size() - 1), Args: DepMatrix.size());
313
314 // Make sure we only add unique entries to the dependency matrix.
315 if (Inserted)
316 DepMatrix.push_back(x: Dep);
317
318 // If we cannot prove that this dependency is forward, change the last
319 // element of the corresponding entry. Since a `[... *]` dependency
320 // includes a `[... <]` dependency, we do not need to keep both and
321 // change the existing entry instead.
322 if (!IsKnownForward)
323 DepMatrix[Ite->second].back() = '*';
324 }
325 }
326 }
327
328 return true;
329}
330
331// A loop is moved from index 'from' to an index 'to'. Update the Dependence
332// matrix by exchanging the two columns.
333static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
334 unsigned ToIndx) {
335 for (auto &Row : DepMatrix)
336 std::swap(a&: Row[ToIndx], b&: Row[FromIndx]);
337}
338
339// Check if a direction vector is lexicographically positive. Return true if it
340// is positive, nullopt if it is "zero", otherwise false.
341// [Theorem] A permutation of the loops in a perfect nest is legal if and only
342// if the direction matrix, after the same permutation is applied to its
343// columns, has no ">" direction as the leftmost non-"=" direction in any row.
344static std::optional<bool>
345isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
346 for (unsigned char Direction : DV.slice(N: Begin, M: End - Begin)) {
347 if (Direction == '<')
348 return true;
349 if (Direction == '>' || Direction == '*')
350 return false;
351 }
352 return std::nullopt;
353}
354
355// Checks if it is legal to interchange 2 loops.
356static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
357 unsigned InnerLoopId,
358 unsigned OuterLoopId) {
359 unsigned NumRows = DepMatrix.size();
360 std::vector<char> Cur;
361 // For each row check if it is valid to interchange.
362 for (unsigned Row = 0; Row < NumRows; ++Row) {
363 // Create temporary DepVector check its lexicographical order
364 // before and after swapping OuterLoop vs InnerLoop
365 Cur = DepMatrix[Row];
366
367 // If the surrounding loops already ensure that the direction vector is
368 // lexicographically positive, nothing within the loop will be able to break
369 // the dependence. In such a case we can skip the subsequent check.
370 if (isLexicographicallyPositive(DV: Cur, Begin: 0, End: OuterLoopId) == true)
371 continue;
372
373 // Check if the direction vector is lexicographically positive (or zero)
374 // for both before/after exchanged. Ignore the last element because it
375 // doesn't affect the legality.
376 if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size() - 1) == false)
377 return false;
378 std::swap(a&: Cur[InnerLoopId], b&: Cur[OuterLoopId]);
379 if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size() - 1) == false)
380 return false;
381 }
382 return true;
383}
384
385static void populateWorklist(Loop &L, LoopVector &LoopList) {
386 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
387 << L.getHeader()->getParent()->getName() << " Loop: %"
388 << L.getHeader()->getName() << '\n');
389 assert(LoopList.empty() && "LoopList should initially be empty!");
390 Loop *CurrentLoop = &L;
391 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
392 while (!Vec->empty()) {
393 // The current loop has multiple subloops in it hence it is not tightly
394 // nested.
395 // Discard all loops above it added into Worklist.
396 if (Vec->size() != 1) {
397 LoopList = {};
398 return;
399 }
400
401 LoopList.push_back(Elt: CurrentLoop);
402 CurrentLoop = Vec->front();
403 Vec = &CurrentLoop->getSubLoops();
404 }
405 LoopList.push_back(Elt: CurrentLoop);
406}
407
408static bool hasSupportedLoopDepth(ArrayRef<Loop *> LoopList,
409 OptimizationRemarkEmitter &ORE) {
410 unsigned LoopNestDepth = LoopList.size();
411 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
412 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
413 << ", the supported range is [" << MinLoopNestDepth
414 << ", " << MaxLoopNestDepth << "].\n");
415 Loop *OuterLoop = LoopList.front();
416 ORE.emit(RemarkBuilder: [&]() {
417 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
418 OuterLoop->getStartLoc(),
419 OuterLoop->getHeader())
420 << "Unsupported depth of loop nest, the supported range is ["
421 << std::to_string(val: MinLoopNestDepth) << ", "
422 << std::to_string(val: MaxLoopNestDepth) << "].\n";
423 });
424 return false;
425 }
426 return true;
427}
428
429static bool isComputableLoopNest(ScalarEvolution *SE,
430 ArrayRef<Loop *> LoopList) {
431 for (Loop *L : LoopList) {
432 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
433 if (isa<SCEVCouldNotCompute>(Val: ExitCountOuter)) {
434 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
435 return false;
436 }
437 if (L->getNumBackEdges() != 1) {
438 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
439 return false;
440 }
441 if (!L->getExitingBlock()) {
442 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
443 return false;
444 }
445 }
446 return true;
447}
448
449namespace {
450
451/// LoopInterchangeLegality checks if it is legal to interchange the loop.
452class LoopInterchangeLegality {
453public:
454 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
455 OptimizationRemarkEmitter *ORE, DominatorTree *DT)
456 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
457
458 /// Check if the loops can be interchanged.
459 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
460 CharMatrix &DepMatrix);
461
462 /// Discover induction PHIs in the header of \p L. Induction
463 /// PHIs are added to \p Inductions.
464 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
465
466 /// Check if the loop structure is understood. We do not handle triangular
467 /// loops for now.
468 bool isLoopStructureUnderstood();
469
470 bool currentLimitations();
471
472 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
473 return OuterInnerReductions;
474 }
475
476 const ArrayRef<PHINode *> getInnerLoopInductions() const {
477 return InnerLoopInductions;
478 }
479
480 ArrayRef<Instruction *> getHasNoWrapReductions() const {
481 return HasNoWrapReductions;
482 }
483
484 /// Record reductions in the inner loop. Currently supported reductions:
485 /// - initialized from a constant.
486 /// - reduction PHI node has only one user.
487 /// - located in the innermost loop.
488 struct InnerReduction {
489 /// The reduction itself.
490 PHINode *Reduction;
491 Value *Init;
492 Value *Next;
493 /// The Lcssa PHI.
494 PHINode *LcssaPhi;
495 /// Store reduction result into memory object.
496 StoreInst *LcssaStore;
497 /// The memory Location.
498 Value *MemRef;
499 Type *ElemTy;
500 };
501
502 ArrayRef<InnerReduction> getInnerReductions() const {
503 return InnerReductions;
504 }
505
506private:
507 bool tightlyNested(Loop *Outer, Loop *Inner);
508 bool containsUnsafeInstructions(BasicBlock *BB, Instruction *Skip);
509
510 /// Discover induction and reduction PHIs in the header of \p L. Induction
511 /// PHIs are added to \p Inductions, reductions are added to
512 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
513 /// to be passed as \p InnerLoop.
514 bool findInductionAndReductions(Loop *L,
515 SmallVector<PHINode *, 8> &Inductions,
516 Loop *InnerLoop);
517
518 /// Detect and record the reduction of the inner loop. Add them to
519 /// InnerReductions.
520 ///
521 /// innerloop:
522 /// Re = phi<0.0, Next>
523 /// Next = Re op ...
524 /// OuterLoopLatch:
525 /// Lcssa = phi<Next> ; lcssa phi
526 /// store Lcssa, MemRef ; LcssaStore
527 ///
528 bool isInnerReduction(Loop *L, PHINode *Phi,
529 SmallVectorImpl<Instruction *> &HasNoWrapInsts);
530
531 Loop *OuterLoop;
532 Loop *InnerLoop;
533
534 ScalarEvolution *SE;
535 DominatorTree *DT;
536
537 /// Interface to emit optimization remarks.
538 OptimizationRemarkEmitter *ORE;
539
540 /// Set of reduction PHIs taking part of a reduction across the inner and
541 /// outer loop.
542 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
543
544 /// Set of inner loop induction PHIs
545 SmallVector<PHINode *, 8> InnerLoopInductions;
546
547 /// Hold instructions that have nuw/nsw flags and involved in reductions,
548 /// like integer addition/multiplication. Those flags must be dropped when
549 /// interchanging the loops.
550 SmallVector<Instruction *, 4> HasNoWrapReductions;
551
552 /// Vector of reductions in the inner loop.
553 SmallVector<InnerReduction, 8> InnerReductions;
554};
555
556/// Manages information utilized by the profitability check for cache. The main
557/// purpose of this class is to delay the computation of CacheCost until it is
558/// actually needed.
559class CacheCostManager {
560 Loop *OutermostLoop;
561 LoopStandardAnalysisResults *AR;
562 DependenceInfo *DI;
563
564 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
565 /// that the result can be nullptr.
566 std::optional<std::unique_ptr<CacheCost>> CC;
567
568 /// Maps each loop to an index representing the optimal position within the
569 /// loop-nest, as determined by the cache cost analysis.
570 DenseMap<const Loop *, unsigned> CostMap;
571
572 void computeIfUnitinialized();
573
574public:
575 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
576 DependenceInfo *DI)
577 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
578 CacheCost *getCacheCost();
579 const DenseMap<const Loop *, unsigned> &getCostMap();
580};
581
582/// LoopInterchangeProfitability checks if it is profitable to interchange the
583/// loop.
584class LoopInterchangeProfitability {
585public:
586 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
587 OptimizationRemarkEmitter *ORE)
588 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
589
590 /// Check if the loop interchange is profitable.
591 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
592 unsigned InnerLoopId, unsigned OuterLoopId,
593 CharMatrix &DepMatrix, CacheCostManager &CCM);
594
595private:
596 int getInstrOrderCost();
597 std::optional<bool> isProfitablePerLoopCacheAnalysis(
598 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
599 std::optional<bool> isProfitablePerInstrOrderCost();
600 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
601 unsigned OuterLoopId,
602 CharMatrix &DepMatrix);
603 Loop *OuterLoop;
604 Loop *InnerLoop;
605
606 /// Scev analysis.
607 ScalarEvolution *SE;
608
609 /// Interface to emit optimization remarks.
610 OptimizationRemarkEmitter *ORE;
611};
612
613/// LoopInterchangeTransform interchanges the loop.
614class LoopInterchangeTransform {
615public:
616 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
617 LoopInfo *LI, DominatorTree *DT,
618 const LoopInterchangeLegality &LIL)
619 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
620
621 /// Interchange OuterLoop and InnerLoop.
622 bool transform(ArrayRef<Instruction *> DropNoWrapInsts);
623 void reduction2Memory();
624 void restructureLoops(Loop *NewInner, Loop *NewOuter,
625 BasicBlock *OrigInnerPreHeader,
626 BasicBlock *OrigOuterPreHeader);
627 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
628
629private:
630 bool adjustLoopLinks();
631 bool adjustLoopBranches();
632
633 Loop *OuterLoop;
634 Loop *InnerLoop;
635
636 /// Scev analysis.
637 ScalarEvolution *SE;
638
639 LoopInfo *LI;
640 DominatorTree *DT;
641
642 const LoopInterchangeLegality &LIL;
643};
644
645struct LoopInterchange {
646 ScalarEvolution *SE = nullptr;
647 LoopInfo *LI = nullptr;
648 DependenceInfo *DI = nullptr;
649 DominatorTree *DT = nullptr;
650 LoopStandardAnalysisResults *AR = nullptr;
651
652 /// Interface to emit optimization remarks.
653 OptimizationRemarkEmitter *ORE;
654
655 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
656 DominatorTree *DT, LoopStandardAnalysisResults *AR,
657 OptimizationRemarkEmitter *ORE)
658 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
659
660 bool run(Loop *L) {
661 if (L->getParentLoop())
662 return false;
663 SmallVector<Loop *, 8> LoopList;
664 populateWorklist(L&: *L, LoopList);
665 return processLoopList(LoopList);
666 }
667
668 bool run(LoopNest &LN) {
669 SmallVector<Loop *, 8> LoopList(LN.getLoops());
670 for (unsigned I = 1; I < LoopList.size(); ++I)
671 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
672 return false;
673 return processLoopList(LoopList);
674 }
675
676 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
677 // TODO: Add a better heuristic to select the loop to be interchanged based
678 // on the dependence matrix. Currently we select the innermost loop.
679 return LoopList.size() - 1;
680 }
681
682 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
683 bool Changed = false;
684
685 // Ensure proper loop nest depth.
686 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
687 "Unsupported depth of loop nest.");
688
689 unsigned LoopNestDepth = LoopList.size();
690
691 LLVM_DEBUG({
692 dbgs() << "Processing LoopList of size = " << LoopNestDepth
693 << " containing the following loops:\n";
694 for (auto *L : LoopList) {
695 dbgs() << " - ";
696 L->print(dbgs());
697 }
698 });
699
700 CharMatrix DependencyMatrix;
701 Loop *OuterMostLoop = *(LoopList.begin());
702 if (!populateDependencyMatrix(DepMatrix&: DependencyMatrix, Level: LoopNestDepth,
703 L: OuterMostLoop, DI, SE, ORE)) {
704 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
705 return false;
706 }
707
708 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
709 printDepMatrix(DependencyMatrix));
710
711 // Get the Outermost loop exit.
712 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
713 if (!LoopNestExit) {
714 LLVM_DEBUG(dbgs() << "OuterMostLoop '" << OuterMostLoop->getName()
715 << "' needs an unique exit block");
716 return false;
717 }
718
719 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
720 CacheCostManager CCM(LoopList[0], AR, DI);
721 // We try to achieve the globally optimal memory access for the loopnest,
722 // and do interchange based on a bubble-sort fasion. We start from
723 // the innermost loop, move it outwards to the best possible position
724 // and repeat this process.
725 for (unsigned j = SelecLoopId; j > 0; j--) {
726 bool ChangedPerIter = false;
727 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
728 bool Interchanged =
729 processLoop(LoopList, InnerLoopId: i, OuterLoopId: i - 1, DependencyMatrix, CCM);
730 ChangedPerIter |= Interchanged;
731 Changed |= Interchanged;
732 }
733 // Early abort if there was no interchange during an entire round of
734 // moving loops outwards.
735 if (!ChangedPerIter)
736 break;
737 }
738 return Changed;
739 }
740
741 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
742 unsigned OuterLoopId,
743 std::vector<std::vector<char>> &DependencyMatrix,
744 CacheCostManager &CCM) {
745 Loop *OuterLoop = LoopList[OuterLoopId];
746 Loop *InnerLoop = LoopList[InnerLoopId];
747 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
748 << " and OuterLoopId = " << OuterLoopId << "\n");
749 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
750 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DepMatrix&: DependencyMatrix)) {
751 LLVM_DEBUG(dbgs() << "Cannot prove legality, not interchanging loops '"
752 << OuterLoop->getName() << "' and '"
753 << InnerLoop->getName() << "'\n");
754 return false;
755 }
756 LLVM_DEBUG(dbgs() << "Loops '" << OuterLoop->getName() << "' and '"
757 << InnerLoop->getName()
758 << "' are legal to interchange\n");
759 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
760 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
761 DepMatrix&: DependencyMatrix, CCM)) {
762 LLVM_DEBUG(dbgs() << "Interchanging loops '" << OuterLoop->getName()
763 << "' and '" << InnerLoop->getName()
764 << "' not profitable.\n");
765 return false;
766 }
767
768 ORE->emit(RemarkBuilder: [&]() {
769 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
770 InnerLoop->getStartLoc(),
771 InnerLoop->getHeader())
772 << "Loop interchanged with enclosing loop.";
773 });
774
775 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
776 LIT.transform(DropNoWrapInsts: LIL.getHasNoWrapReductions());
777 LLVM_DEBUG(dbgs() << "Loops interchanged: outer loop '"
778 << OuterLoop->getName() << "' and inner loop '"
779 << InnerLoop->getName() << "'\n");
780 LoopsInterchanged++;
781
782 llvm::formLCSSARecursively(L&: *OuterLoop, DT: *DT, LI, SE);
783
784 // Loops interchanged, update LoopList accordingly.
785 std::swap(a&: LoopList[OuterLoopId], b&: LoopList[InnerLoopId]);
786 // Update the DependencyMatrix
787 interChangeDependencies(DepMatrix&: DependencyMatrix, FromIndx: InnerLoopId, ToIndx: OuterLoopId);
788
789 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
790 printDepMatrix(DependencyMatrix));
791
792 return true;
793 }
794};
795
796} // end anonymous namespace
797
798bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
799 Instruction *Skip) {
800 return any_of(Range&: *BB, P: [Skip](const Instruction &I) {
801 if (&I == Skip)
802 return false;
803 return I.mayHaveSideEffects() || I.mayReadFromMemory();
804 });
805}
806
807bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
808 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
809 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
810 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
811
812 LLVM_DEBUG(dbgs() << "Checking if loops '" << OuterLoop->getName()
813 << "' and '" << InnerLoop->getName()
814 << "' are tightly nested\n");
815
816 // A perfectly nested loop will not have any branch in between the outer and
817 // inner block i.e. outer header will branch to either inner preheader and
818 // outerloop latch.
819 for (BasicBlock *Succ : successors(BB: OuterLoopHeader))
820 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
821 Succ != OuterLoopLatch)
822 return false;
823
824 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
825
826 // The inner loop reduction pattern requires storing the LCSSA PHI in
827 // the OuterLoop Latch. Therefore, when reduction2Memory is enabled, skip
828 // that store during checks.
829 Instruction *Skip = nullptr;
830 assert(InnerReductions.size() <= 1 &&
831 "So far we only support at most one reduction.");
832 if (InnerReductions.size() == 1)
833 Skip = InnerReductions[0].LcssaStore;
834
835 // We do not have any basic block in between now make sure the outer header
836 // and outer loop latch doesn't contain any unsafe instructions.
837 if (containsUnsafeInstructions(BB: OuterLoopHeader, Skip) ||
838 containsUnsafeInstructions(BB: OuterLoopLatch, Skip))
839 return false;
840
841 // Also make sure the inner loop preheader does not contain any unsafe
842 // instructions. Note that all instructions in the preheader will be moved to
843 // the outer loop header when interchanging.
844 if (InnerLoopPreHeader != OuterLoopHeader &&
845 containsUnsafeInstructions(BB: InnerLoopPreHeader, Skip))
846 return false;
847
848 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
849 // Ensure the inner loop exit block flows to the outer loop latch possibly
850 // through empty blocks.
851 const BasicBlock &SuccInner =
852 LoopNest::skipEmptyBlockUntil(From: InnerLoopExit, End: OuterLoopLatch);
853 if (&SuccInner != OuterLoopLatch) {
854 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
855 << " does not lead to the outer loop latch.\n";);
856 return false;
857 }
858 // The inner loop exit block does flow to the outer loop latch and not some
859 // other BBs, now make sure it contains safe instructions, since it will be
860 // moved into the (new) inner loop after interchange.
861 if (containsUnsafeInstructions(BB: InnerLoopExit, Skip))
862 return false;
863
864 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
865 // We have a perfect loop nest.
866 return true;
867}
868
869bool LoopInterchangeLegality::isLoopStructureUnderstood() {
870 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
871 for (PHINode *InnerInduction : InnerLoopInductions) {
872 unsigned Num = InnerInduction->getNumOperands();
873 for (unsigned i = 0; i < Num; ++i) {
874 Value *Val = InnerInduction->getOperand(i_nocapture: i);
875 if (isa<Constant>(Val))
876 continue;
877 Instruction *I = dyn_cast<Instruction>(Val);
878 if (!I)
879 return false;
880 // TODO: Handle triangular loops.
881 // e.g. for(int i=0;i<N;i++)
882 // for(int j=i;j<N;j++)
883 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
884 if (InnerInduction->getIncomingBlock(i: IncomBlockIndx) ==
885 InnerLoopPreheader &&
886 !OuterLoop->isLoopInvariant(V: I)) {
887 return false;
888 }
889 }
890 }
891
892 // TODO: Handle triangular loops of another form.
893 // e.g. for(int i=0;i<N;i++)
894 // for(int j=0;j<i;j++)
895 // or,
896 // for(int i=0;i<N;i++)
897 // for(int j=0;j*i<N;j++)
898 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
899 CondBrInst *InnerLoopLatchBI =
900 dyn_cast<CondBrInst>(Val: InnerLoopLatch->getTerminator());
901 if (!InnerLoopLatchBI)
902 return false;
903 if (CmpInst *InnerLoopCmp =
904 dyn_cast<CmpInst>(Val: InnerLoopLatchBI->getCondition())) {
905 Value *Op0 = InnerLoopCmp->getOperand(i_nocapture: 0);
906 Value *Op1 = InnerLoopCmp->getOperand(i_nocapture: 1);
907
908 // LHS and RHS of the inner loop exit condition, e.g.,
909 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
910 Value *Left = nullptr;
911 Value *Right = nullptr;
912
913 // Check if V only involves inner loop induction variable.
914 // Return true if V is InnerInduction, or a cast from
915 // InnerInduction, or a binary operator that involves
916 // InnerInduction and a constant.
917 std::function<bool(Value *)> IsPathToInnerIndVar;
918 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
919 if (llvm::is_contained(Range&: InnerLoopInductions, Element: V))
920 return true;
921 if (isa<Constant>(Val: V))
922 return true;
923 const Instruction *I = dyn_cast<Instruction>(Val: V);
924 if (!I)
925 return false;
926 if (isa<CastInst>(Val: I))
927 return IsPathToInnerIndVar(I->getOperand(i: 0));
928 if (isa<BinaryOperator>(Val: I))
929 return IsPathToInnerIndVar(I->getOperand(i: 0)) &&
930 IsPathToInnerIndVar(I->getOperand(i: 1));
931 return false;
932 };
933
934 // In case of multiple inner loop indvars, it is okay if LHS and RHS
935 // are both inner indvar related variables.
936 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
937 return true;
938
939 // Otherwise we check if the cmp instruction compares an inner indvar
940 // related variable (Left) with a outer loop invariant (Right).
941 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Val: Op0)) {
942 Left = Op0;
943 Right = Op1;
944 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Val: Op1)) {
945 Left = Op1;
946 Right = Op0;
947 }
948
949 if (Left == nullptr)
950 return false;
951
952 const SCEV *S = SE->getSCEV(V: Right);
953 if (!SE->isLoopInvariant(S, L: OuterLoop))
954 return false;
955 }
956
957 return true;
958}
959
960// If SV is a LCSSA PHI node with a single incoming value, return the incoming
961// value.
962static Value *followLCSSA(Value *SV) {
963 PHINode *PHI = dyn_cast<PHINode>(Val: SV);
964 if (!PHI)
965 return SV;
966
967 if (PHI->getNumIncomingValues() != 1)
968 return SV;
969 return followLCSSA(SV: PHI->getIncomingValue(i: 0));
970}
971
972static bool checkReductionKind(Loop *L, PHINode *PHI,
973 SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
974 RecurrenceDescriptor RD;
975 if (RecurrenceDescriptor::isReductionPHI(Phi: PHI, TheLoop: L, RedDes&: RD)) {
976 // Detect floating point reduction only when it can be reordered.
977 if (RD.getExactFPMathInst() != nullptr)
978 return false;
979
980 RecurKind RK = RD.getRecurrenceKind();
981 switch (RK) {
982 case RecurKind::Or:
983 case RecurKind::And:
984 case RecurKind::Xor:
985 case RecurKind::SMin:
986 case RecurKind::SMax:
987 case RecurKind::UMin:
988 case RecurKind::UMax:
989 case RecurKind::FAdd:
990 case RecurKind::FMul:
991 case RecurKind::FMin:
992 case RecurKind::FMax:
993 case RecurKind::FMinimum:
994 case RecurKind::FMaximum:
995 case RecurKind::FMinimumNum:
996 case RecurKind::FMaximumNum:
997 case RecurKind::FMulAdd:
998 case RecurKind::AnyOf:
999 return true;
1000
1001 // Change the order of integer addition/multiplication may change the
1002 // semantics. Consider the following case:
1003 //
1004 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
1005 // int sum = 0;
1006 // for (int i = 0; i < 2; i++)
1007 // for (int j = 0; j < 2; j++)
1008 // sum += A[j][i];
1009 //
1010 // If the above loops are exchanged, the addition will cause an
1011 // overflow. To prevent this, we must drop the nuw/nsw flags from the
1012 // addition/multiplication instructions when we actually exchanges the
1013 // loops.
1014 case RecurKind::Add:
1015 case RecurKind::Mul: {
1016 unsigned OpCode = RecurrenceDescriptor::getOpcode(Kind: RK);
1017 SmallVector<Instruction *, 4> Ops = RD.getReductionOpChain(Phi: PHI, L);
1018
1019 // Bail out when we fail to collect reduction instructions chain.
1020 if (Ops.empty())
1021 return false;
1022
1023 for (Instruction *I : Ops) {
1024 assert(I->getOpcode() == OpCode &&
1025 "Expected the instruction to be the reduction operation");
1026 (void)OpCode;
1027
1028 // If the instruction has nuw/nsw flags, we must drop them when the
1029 // transformation is actually performed.
1030 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
1031 HasNoWrapInsts.push_back(Elt: I);
1032 }
1033 return true;
1034 }
1035
1036 default:
1037 return false;
1038 }
1039 } else
1040 return false;
1041}
1042
1043// Check V's users to see if it is involved in a reduction in L.
1044static PHINode *
1045findInnerReductionPhi(Loop *L, Value *V,
1046 SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1047 // Reduction variables cannot be constants.
1048 if (isa<Constant>(Val: V))
1049 return nullptr;
1050
1051 for (Value *User : V->users()) {
1052 if (PHINode *PHI = dyn_cast<PHINode>(Val: User)) {
1053 if (PHI->getNumIncomingValues() == 1)
1054 continue;
1055
1056 if (checkReductionKind(L, PHI, HasNoWrapInsts))
1057 return PHI;
1058 else
1059 return nullptr;
1060 }
1061 }
1062
1063 return nullptr;
1064}
1065
1066bool LoopInterchangeLegality::isInnerReduction(
1067 Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1068
1069 // Only support reduction2Mem when the loop nest to be interchanged is
1070 // the innermost two loops.
1071 if (!L->isInnermost()) {
1072 LLVM_DEBUG(dbgs() << "Only supported when the loop is the innermost.\n");
1073 ORE->emit(RemarkBuilder: [&]() {
1074 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1075 L->getStartLoc(), L->getHeader())
1076 << "Only supported when the loop is the innermost.";
1077 });
1078 return false;
1079 }
1080
1081 if (Phi->getNumIncomingValues() != 2)
1082 return false;
1083
1084 Value *Init = Phi->getIncomingValueForBlock(BB: L->getLoopPreheader());
1085 Value *Next = Phi->getIncomingValueForBlock(BB: L->getLoopLatch());
1086
1087 // So far only supports constant initial value.
1088 if (!isa<Constant>(Val: Init)) {
1089 LLVM_DEBUG(
1090 dbgs()
1091 << "Only supported for the reduction with a constant initial value.\n");
1092 ORE->emit(RemarkBuilder: [&]() {
1093 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1094 L->getStartLoc(), L->getHeader())
1095 << "Only supported for the reduction with a constant initial "
1096 "value.";
1097 });
1098 return false;
1099 }
1100
1101 // The reduction result must live in the inner loop.
1102 if (Instruction *I = dyn_cast<Instruction>(Val: Next)) {
1103 BasicBlock *BB = I->getParent();
1104 if (!L->contains(BB))
1105 return false;
1106 }
1107
1108 // The reduction should have only one user.
1109 if (!Phi->hasOneUser())
1110 return false;
1111
1112 // Check the reduction kind.
1113 if (!checkReductionKind(L, PHI: Phi, HasNoWrapInsts))
1114 return false;
1115
1116 // Find lcssa_phi in OuterLoop's Latch
1117 BasicBlock *ExitBlock = L->getExitBlock();
1118 if (!ExitBlock)
1119 return false;
1120
1121 PHINode *Lcssa = NULL;
1122 for (auto *U : Next->users()) {
1123 if (auto *P = dyn_cast<PHINode>(Val: U)) {
1124 if (P == Phi)
1125 continue;
1126
1127 if (Lcssa == NULL && P->getParent() == ExitBlock &&
1128 P->getIncomingValueForBlock(BB: L->getLoopLatch()) == Next)
1129 Lcssa = P;
1130 else
1131 return false;
1132 } else
1133 return false;
1134 }
1135 if (!Lcssa)
1136 return false;
1137
1138 if (!Lcssa->hasOneUser()) {
1139 LLVM_DEBUG(dbgs() << "Only supported when the reduction is used once in "
1140 "the outer loop.\n");
1141 ORE->emit(RemarkBuilder: [&]() {
1142 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1143 L->getStartLoc(), L->getHeader())
1144 << "Only supported when the reduction is used once in the outer "
1145 "loop.";
1146 });
1147 return false;
1148 }
1149
1150 StoreInst *LcssaStore =
1151 dyn_cast<StoreInst>(Val: Lcssa->getUniqueUndroppableUser());
1152 if (!LcssaStore || LcssaStore->getParent() != ExitBlock)
1153 return false;
1154
1155 Value *MemRef = LcssaStore->getOperand(i_nocapture: 1);
1156 Type *ElemTy = LcssaStore->getOperand(i_nocapture: 0)->getType();
1157
1158 // LcssaStore stores the reduction result in BB.
1159 // When the reduction is initialized from a constant value, we need to load
1160 // from the memory object into the target basic block of the inner loop. This
1161 // means the memory reference was used prematurely. So we must ensure that the
1162 // memory reference does not dominate the target basic block.
1163 // TODO: Move the memory reference definition into the loop header.
1164 if (!DT->dominates(Def: dyn_cast<Instruction>(Val: MemRef), BB: L->getHeader())) {
1165 LLVM_DEBUG(dbgs() << "Only supported when memory reference dominate "
1166 "the inner loop.\n");
1167 ORE->emit(RemarkBuilder: [&]() {
1168 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1169 L->getStartLoc(), L->getHeader())
1170 << "Only supported when memory reference dominate the inner "
1171 "loop.";
1172 });
1173 return false;
1174 }
1175
1176 // Found a reduction in the inner loop.
1177 InnerReduction SR;
1178 SR.Reduction = Phi;
1179 SR.Init = Init;
1180 SR.Next = Next;
1181 SR.LcssaPhi = Lcssa;
1182 SR.LcssaStore = LcssaStore;
1183 SR.MemRef = MemRef;
1184 SR.ElemTy = ElemTy;
1185
1186 InnerReductions.push_back(Elt: SR);
1187 return true;
1188}
1189
1190bool LoopInterchangeLegality::findInductionAndReductions(
1191 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
1192 if (!L->getLoopLatch() || !L->getLoopPredecessor())
1193 return false;
1194 for (PHINode &PHI : L->getHeader()->phis()) {
1195 InductionDescriptor ID;
1196 if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID))
1197 Inductions.push_back(Elt: &PHI);
1198 else {
1199 // PHIs in inner loops need to be part of a reduction in the outer loop,
1200 // discovered when checking the PHIs of the outer loop earlier.
1201 if (!InnerLoop) {
1202 if (OuterInnerReductions.count(Ptr: &PHI)) {
1203 LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
1204 } else if (EnableReduction2Memory &&
1205 isInnerReduction(L, Phi: &PHI, HasNoWrapInsts&: HasNoWrapReductions)) {
1206 LLVM_DEBUG(dbgs() << "Found a reduction in the inner loop: \n"
1207 << PHI << '\n');
1208 } else
1209 return false;
1210 } else {
1211 assert(PHI.getNumIncomingValues() == 2 &&
1212 "Phis in loop header should have exactly 2 incoming values");
1213 // Check if we have a PHI node in the outer loop that has a reduction
1214 // result from the inner loop as an incoming value.
1215 Value *V = followLCSSA(SV: PHI.getIncomingValueForBlock(BB: L->getLoopLatch()));
1216 PHINode *InnerRedPhi =
1217 findInnerReductionPhi(L: InnerLoop, V, HasNoWrapInsts&: HasNoWrapReductions);
1218 if (!InnerRedPhi ||
1219 !llvm::is_contained(Range: InnerRedPhi->incoming_values(), Element: &PHI)) {
1220 LLVM_DEBUG(
1221 dbgs()
1222 << "Failed to recognize PHI as an induction or reduction.\n");
1223 return false;
1224 }
1225 OuterInnerReductions.insert(Ptr: &PHI);
1226 OuterInnerReductions.insert(Ptr: InnerRedPhi);
1227 }
1228 }
1229 }
1230
1231 // For now we only support at most one reduction.
1232 if (InnerReductions.size() > 1) {
1233 LLVM_DEBUG(dbgs() << "Only supports at most one reduction.\n");
1234 ORE->emit(RemarkBuilder: [&]() {
1235 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1236 L->getStartLoc(), L->getHeader())
1237 << "Only supports at most one reduction.";
1238 });
1239 return false;
1240 }
1241
1242 return true;
1243}
1244
1245// This function indicates the current limitations in the transform as a result
1246// of which we do not proceed.
1247bool LoopInterchangeLegality::currentLimitations() {
1248 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1249
1250 // transform currently expects the loop latches to also be the exiting
1251 // blocks.
1252 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
1253 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
1254 !isa<CondBrInst>(Val: InnerLoopLatch->getTerminator()) ||
1255 !isa<CondBrInst>(Val: OuterLoop->getLoopLatch()->getTerminator())) {
1256 LLVM_DEBUG(
1257 dbgs() << "Loops where the latch is not the exiting block are not"
1258 << " supported currently.\n");
1259 ORE->emit(RemarkBuilder: [&]() {
1260 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
1261 OuterLoop->getStartLoc(),
1262 OuterLoop->getHeader())
1263 << "Loops where the latch is not the exiting block cannot be"
1264 " interchange currently.";
1265 });
1266 return true;
1267 }
1268
1269 SmallVector<PHINode *, 8> Inductions;
1270 if (!findInductionAndReductions(L: OuterLoop, Inductions, InnerLoop)) {
1271 LLVM_DEBUG(
1272 dbgs() << "Only outer loops with induction or reduction PHI nodes "
1273 << "are supported currently.\n");
1274 ORE->emit(RemarkBuilder: [&]() {
1275 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
1276 OuterLoop->getStartLoc(),
1277 OuterLoop->getHeader())
1278 << "Only outer loops with induction or reduction PHI nodes can be"
1279 " interchanged currently.";
1280 });
1281 return true;
1282 }
1283
1284 Inductions.clear();
1285 // For multi-level loop nests, make sure that all phi nodes for inner loops
1286 // at all levels can be recognized as a induction or reduction phi. Bail out
1287 // if a phi node at a certain nesting level cannot be properly recognized.
1288 Loop *CurLevelLoop = OuterLoop;
1289 while (!CurLevelLoop->getSubLoops().empty()) {
1290 // We already made sure that the loop nest is tightly nested.
1291 CurLevelLoop = CurLevelLoop->getSubLoops().front();
1292 if (!findInductionAndReductions(L: CurLevelLoop, Inductions, InnerLoop: nullptr)) {
1293 LLVM_DEBUG(
1294 dbgs() << "Only inner loops with induction or reduction PHI nodes "
1295 << "are supported currently.\n");
1296 ORE->emit(RemarkBuilder: [&]() {
1297 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
1298 CurLevelLoop->getStartLoc(),
1299 CurLevelLoop->getHeader())
1300 << "Only inner loops with induction or reduction PHI nodes can be"
1301 " interchange currently.";
1302 });
1303 return true;
1304 }
1305 }
1306
1307 // TODO: Triangular loops are not handled for now.
1308 if (!isLoopStructureUnderstood()) {
1309 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1310 ORE->emit(RemarkBuilder: [&]() {
1311 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
1312 InnerLoop->getStartLoc(),
1313 InnerLoop->getHeader())
1314 << "Inner loop structure not understood currently.";
1315 });
1316 return true;
1317 }
1318
1319 return false;
1320}
1321
1322bool LoopInterchangeLegality::findInductions(
1323 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
1324 for (PHINode &PHI : L->getHeader()->phis()) {
1325 InductionDescriptor ID;
1326 if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID))
1327 Inductions.push_back(Elt: &PHI);
1328 }
1329 return !Inductions.empty();
1330}
1331
1332// We currently only support LCSSA PHI nodes in the inner loop exit, if their
1333// users are either reduction PHIs or PHIs outside the outer loop (which means
1334// the we are only interested in the final value after the loop).
1335static bool
1336areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
1337 SmallPtrSetImpl<PHINode *> &Reductions,
1338 PHINode *LcssaReduction) {
1339 BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
1340 for (PHINode &PHI : InnerExit->phis()) {
1341 // The reduction LCSSA PHI will have only one incoming block, which comes
1342 // from the loop latch.
1343 if (PHI.getNumIncomingValues() > 1)
1344 return false;
1345 if (&PHI == LcssaReduction)
1346 return true;
1347 if (any_of(Range: PHI.users(), P: [&Reductions, OuterL](User *U) {
1348 PHINode *PN = dyn_cast<PHINode>(Val: U);
1349 return !PN ||
1350 (!Reductions.count(Ptr: PN) && OuterL->contains(BB: PN->getParent()));
1351 })) {
1352 return false;
1353 }
1354 }
1355 return true;
1356}
1357
1358// We currently support LCSSA PHI nodes in the outer loop exit, if their
1359// incoming values do not come from the outer loop latch or if the
1360// outer loop latch has a single predecessor. In that case, the value will
1361// be available if both the inner and outer loop conditions are true, which
1362// will still be true after interchanging. If we have multiple predecessor,
1363// that may not be the case, e.g. because the outer loop latch may be executed
1364// if the inner loop is not executed.
1365static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1366 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1367 for (PHINode &PHI : LoopNestExit->phis()) {
1368 for (Value *Incoming : PHI.incoming_values()) {
1369 Instruction *IncomingI = dyn_cast<Instruction>(Val: Incoming);
1370 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1371 continue;
1372
1373 // The incoming value is defined in the outer loop latch. Currently we
1374 // only support that in case the outer loop latch has a single predecessor.
1375 // This guarantees that the outer loop latch is executed if and only if
1376 // the inner loop is executed (because tightlyNested() guarantees that the
1377 // outer loop header only branches to the inner loop or the outer loop
1378 // latch).
1379 // FIXME: We could weaken this logic and allow multiple predecessors,
1380 // if the values are produced outside the loop latch. We would need
1381 // additional logic to update the PHI nodes in the exit block as
1382 // well.
1383 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1384 return false;
1385 }
1386 }
1387 return true;
1388}
1389
1390// In case of multi-level nested loops, it may occur that lcssa phis exist in
1391// the latch of InnerLoop, i.e., when defs of the incoming values are further
1392// inside the loopnest. Sometimes those incoming values are not available
1393// after interchange, since the original inner latch will become the new outer
1394// latch which may have predecessor paths that do not include those incoming
1395// values.
1396// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1397// multi-level loop nests.
1398static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1399 if (InnerLoop->getSubLoops().empty())
1400 return true;
1401 // If the original outer latch has only one predecessor, then values defined
1402 // further inside the looploop, e.g., in the innermost loop, will be available
1403 // at the new outer latch after interchange.
1404 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
1405 return true;
1406
1407 // The outer latch has more than one predecessors, i.e., the inner
1408 // exit and the inner header.
1409 // PHI nodes in the inner latch are lcssa phis where the incoming values
1410 // are defined further inside the loopnest. Check if those phis are used
1411 // in the original inner latch. If that is the case then bail out since
1412 // those incoming values may not be available at the new outer latch.
1413 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1414 for (PHINode &PHI : InnerLoopLatch->phis()) {
1415 for (auto *U : PHI.users()) {
1416 Instruction *UI = cast<Instruction>(Val: U);
1417 if (InnerLoopLatch == UI->getParent())
1418 return false;
1419 }
1420 }
1421 return true;
1422}
1423
1424bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1425 unsigned OuterLoopId,
1426 CharMatrix &DepMatrix) {
1427 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1428 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1429 << " and OuterLoopId = " << OuterLoopId
1430 << " due to dependence\n");
1431 ORE->emit(RemarkBuilder: [&]() {
1432 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1433 InnerLoop->getStartLoc(),
1434 InnerLoop->getHeader())
1435 << "Cannot interchange loops due to dependences.";
1436 });
1437 return false;
1438 }
1439 // Check if outer and inner loop contain legal instructions only.
1440 for (auto *BB : OuterLoop->blocks())
1441 for (Instruction &I : BB->instructionsWithoutDebug())
1442 if (CallInst *CI = dyn_cast<CallInst>(Val: &I)) {
1443 // readnone functions do not prevent interchanging.
1444 if (CI->onlyWritesMemory())
1445 continue;
1446 LLVM_DEBUG(
1447 dbgs() << "Loops with call instructions cannot be interchanged "
1448 << "safely.");
1449 ORE->emit(RemarkBuilder: [&]() {
1450 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
1451 CI->getDebugLoc(),
1452 CI->getParent())
1453 << "Cannot interchange loops due to call instruction.";
1454 });
1455
1456 return false;
1457 }
1458
1459 if (!findInductions(L: InnerLoop, Inductions&: InnerLoopInductions)) {
1460 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
1461 return false;
1462 }
1463
1464 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1465 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1466 ORE->emit(RemarkBuilder: [&]() {
1467 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1468 InnerLoop->getStartLoc(),
1469 InnerLoop->getHeader())
1470 << "Cannot interchange loops because unsupported PHI nodes found "
1471 "in inner loop latch.";
1472 });
1473 return false;
1474 }
1475
1476 // TODO: The loops could not be interchanged due to current limitations in the
1477 // transform module.
1478 if (currentLimitations()) {
1479 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1480 return false;
1481 }
1482
1483 // Check if the loops are tightly nested.
1484 if (!tightlyNested(OuterLoop, InnerLoop)) {
1485 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1486 ORE->emit(RemarkBuilder: [&]() {
1487 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1488 InnerLoop->getStartLoc(),
1489 InnerLoop->getHeader())
1490 << "Cannot interchange loops because they are not tightly "
1491 "nested.";
1492 });
1493 return false;
1494 }
1495
1496 // The LCSSA PHI for the reduction has passed checks before; its user
1497 // is a store instruction.
1498 PHINode *LcssaReduction = nullptr;
1499 assert(InnerReductions.size() <= 1 &&
1500 "So far we only support at most one reduction.");
1501 if (InnerReductions.size() == 1)
1502 LcssaReduction = InnerReductions[0].LcssaPhi;
1503
1504 if (!areInnerLoopExitPHIsSupported(InnerL: OuterLoop, OuterL: InnerLoop, Reductions&: OuterInnerReductions,
1505 LcssaReduction)) {
1506 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1507 ORE->emit(RemarkBuilder: [&]() {
1508 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1509 InnerLoop->getStartLoc(),
1510 InnerLoop->getHeader())
1511 << "Found unsupported PHI node in loop exit.";
1512 });
1513 return false;
1514 }
1515
1516 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1517 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1518 ORE->emit(RemarkBuilder: [&]() {
1519 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1520 OuterLoop->getStartLoc(),
1521 OuterLoop->getHeader())
1522 << "Found unsupported PHI node in loop exit.";
1523 });
1524 return false;
1525 }
1526
1527 return true;
1528}
1529
1530void CacheCostManager::computeIfUnitinialized() {
1531 if (CC.has_value())
1532 return;
1533
1534 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1535 CC = CacheCost::getCacheCost(Root&: *OutermostLoop, AR&: *AR, DI&: *DI);
1536 // Obtain the loop vector returned from loop cache analysis beforehand,
1537 // and put each <Loop, index> pair into a map for constant time query
1538 // later. Indices in loop vector reprsent the optimal order of the
1539 // corresponding loop, e.g., given a loopnest with depth N, index 0
1540 // indicates the loop should be placed as the outermost loop and index N
1541 // indicates the loop should be placed as the innermost loop.
1542 //
1543 // For the old pass manager CacheCost would be null.
1544 if (*CC != nullptr)
1545 for (const auto &[Idx, Cost] : enumerate(First: (*CC)->getLoopCosts()))
1546 CostMap[Cost.first] = Idx;
1547}
1548
1549CacheCost *CacheCostManager::getCacheCost() {
1550 computeIfUnitinialized();
1551 return CC->get();
1552}
1553
1554const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1555 computeIfUnitinialized();
1556 return CostMap;
1557}
1558
1559int LoopInterchangeProfitability::getInstrOrderCost() {
1560 unsigned GoodOrder, BadOrder;
1561 BadOrder = GoodOrder = 0;
1562 for (BasicBlock *BB : InnerLoop->blocks()) {
1563 for (Instruction &Ins : *BB) {
1564 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &Ins)) {
1565 bool FoundInnerInduction = false;
1566 bool FoundOuterInduction = false;
1567 for (Value *Op : GEP->operands()) {
1568 // Skip operands that are not SCEV-able.
1569 if (!SE->isSCEVable(Ty: Op->getType()))
1570 continue;
1571
1572 const SCEV *OperandVal = SE->getSCEV(V: Op);
1573 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: OperandVal);
1574 if (!AR)
1575 continue;
1576
1577 // If we find the inner induction after an outer induction e.g.
1578 // for(int i=0;i<N;i++)
1579 // for(int j=0;j<N;j++)
1580 // A[i][j] = A[i-1][j-1]+k;
1581 // then it is a good order.
1582 if (AR->getLoop() == InnerLoop) {
1583 // We found an InnerLoop induction after OuterLoop induction. It is
1584 // a good order.
1585 FoundInnerInduction = true;
1586 if (FoundOuterInduction) {
1587 GoodOrder++;
1588 break;
1589 }
1590 }
1591 // If we find the outer induction after an inner induction e.g.
1592 // for(int i=0;i<N;i++)
1593 // for(int j=0;j<N;j++)
1594 // A[j][i] = A[j-1][i-1]+k;
1595 // then it is a bad order.
1596 if (AR->getLoop() == OuterLoop) {
1597 // We found an OuterLoop induction after InnerLoop induction. It is
1598 // a bad order.
1599 FoundOuterInduction = true;
1600 if (FoundInnerInduction) {
1601 BadOrder++;
1602 break;
1603 }
1604 }
1605 }
1606 }
1607 }
1608 }
1609 return GoodOrder - BadOrder;
1610}
1611
1612std::optional<bool>
1613LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1614 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
1615 // This is the new cost model returned from loop cache analysis.
1616 // A smaller index means the loop should be placed an outer loop, and vice
1617 // versa.
1618 auto InnerLoopIt = CostMap.find(Val: InnerLoop);
1619 if (InnerLoopIt == CostMap.end())
1620 return std::nullopt;
1621 auto OuterLoopIt = CostMap.find(Val: OuterLoop);
1622 if (OuterLoopIt == CostMap.end())
1623 return std::nullopt;
1624
1625 if (CC->getLoopCost(L: *OuterLoop) == CC->getLoopCost(L: *InnerLoop))
1626 return std::nullopt;
1627 unsigned InnerIndex = InnerLoopIt->second;
1628 unsigned OuterIndex = OuterLoopIt->second;
1629 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1630 << ", OuterIndex = " << OuterIndex << "\n");
1631 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1632 "numbers to each loop");
1633 return std::optional<bool>(InnerIndex < OuterIndex);
1634}
1635
1636std::optional<bool>
1637LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1638 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1639 // good and bad order of induction variables in the instruction and allows
1640 // reordering if number of bad orders is more than good.
1641 int Cost = getInstrOrderCost();
1642 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1643 if (Cost < 0 && Cost < LoopInterchangeCostThreshold)
1644 return std::optional<bool>(true);
1645
1646 return std::nullopt;
1647}
1648
1649/// Return true if we can vectorize the loop specified by \p LoopId.
1650static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1651 for (const auto &Dep : DepMatrix) {
1652 char Dir = Dep[LoopId];
1653 char DepType = Dep.back();
1654 assert((DepType == '<' || DepType == '*') &&
1655 "Unexpected element in dependency vector");
1656
1657 // There are no loop-carried dependencies.
1658 if (Dir == '=' || Dir == 'I')
1659 continue;
1660
1661 // DepType being '<' means that this direction vector represents a forward
1662 // dependency. In principle, a loop with '<' direction can be vectorized in
1663 // this case.
1664 if (Dir == '<' && DepType == '<')
1665 continue;
1666
1667 // We cannot prove that the loop is vectorizable.
1668 return false;
1669 }
1670 return true;
1671}
1672
1673std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1674 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1675 // If the outer loop cannot be vectorized, it is not profitable to move this
1676 // to inner position.
1677 if (!canVectorize(DepMatrix, LoopId: OuterLoopId))
1678 return false;
1679
1680 // If the inner loop cannot be vectorized but the outer loop can be, then it
1681 // is profitable to interchange to enable inner loop parallelism.
1682 if (!canVectorize(DepMatrix, LoopId: InnerLoopId))
1683 return true;
1684
1685 // If both the inner and the outer loop can be vectorized, it is necessary to
1686 // check the cost of each vectorized loop for profitability decision. At this
1687 // time we do not have a cost model to estimate them, so return nullopt.
1688 // TODO: Estimate the cost of vectorized loop when both the outer and the
1689 // inner loop can be vectorized.
1690 return std::nullopt;
1691}
1692
1693bool LoopInterchangeProfitability::isProfitable(
1694 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1695 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1696 // Do not consider loops with a backedge that isn't taken, e.g. an
1697 // unconditional branch true/false, as candidates for interchange.
1698 // TODO: when interchange is forced, we should probably also allow
1699 // interchange for these loops, and thus this logic should be moved just
1700 // below the cost-model ignore check below. But this check is done first
1701 // to avoid the issue in #163954.
1702 const SCEV *InnerBTC = SE->getBackedgeTakenCount(L: InnerLoop);
1703 const SCEV *OuterBTC = SE->getBackedgeTakenCount(L: OuterLoop);
1704 if (InnerBTC && InnerBTC->isZero()) {
1705 LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1706 "single iteration loop\n");
1707 return false;
1708 }
1709 if (OuterBTC && OuterBTC->isZero()) {
1710 LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1711 "single iteration loop\n");
1712 return false;
1713 }
1714
1715 // Return true if interchange is forced and the cost-model ignored.
1716 if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore)
1717 return true;
1718 assert(noDuplicateRulesAndIgnore(Profitabilities) &&
1719 "Duplicate rules and option 'ignore' are not allowed");
1720
1721 // isProfitable() is structured to avoid endless loop interchange. If the
1722 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1723 // decide the profitability then, profitability check will stop and return the
1724 // analysis result. If it failed to determine it (e.g., cache analysis failed
1725 // to analyze the loopnest due to delinearization issues) then go ahead the
1726 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1727 // Likewise, if it failed to analysis the profitability then only, the last
1728 // rule (isProfitableForVectorization by default) will decide.
1729 std::optional<bool> shouldInterchange;
1730 for (RuleTy RT : Profitabilities) {
1731 switch (RT) {
1732 case RuleTy::PerLoopCacheAnalysis: {
1733 CacheCost *CC = CCM.getCacheCost();
1734 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
1735 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1736 break;
1737 }
1738 case RuleTy::PerInstrOrderCost:
1739 shouldInterchange = isProfitablePerInstrOrderCost();
1740 break;
1741 case RuleTy::ForVectorization:
1742 shouldInterchange =
1743 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1744 break;
1745 case RuleTy::Ignore:
1746 llvm_unreachable("Option 'ignore' is not supported with other options");
1747 break;
1748 }
1749
1750 // If this rule could determine the profitability, don't call subsequent
1751 // rules.
1752 if (shouldInterchange.has_value())
1753 break;
1754 }
1755
1756 if (!shouldInterchange.has_value()) {
1757 ORE->emit(RemarkBuilder: [&]() {
1758 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1759 InnerLoop->getStartLoc(),
1760 InnerLoop->getHeader())
1761 << "Insufficient information to calculate the cost of loop for "
1762 "interchange.";
1763 });
1764 return false;
1765 } else if (!shouldInterchange.value()) {
1766 ORE->emit(RemarkBuilder: [&]() {
1767 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1768 InnerLoop->getStartLoc(),
1769 InnerLoop->getHeader())
1770 << "Interchanging loops is not considered to improve cache "
1771 "locality nor vectorization.";
1772 });
1773 return false;
1774 }
1775 return true;
1776}
1777
1778void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1779 Loop *InnerLoop) {
1780 for (Loop *L : *OuterLoop)
1781 if (L == InnerLoop) {
1782 OuterLoop->removeChildLoop(Child: L);
1783 return;
1784 }
1785 llvm_unreachable("Couldn't find loop");
1786}
1787
1788/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1789/// new inner and outer loop after interchanging: NewInner is the original
1790/// outer loop and NewOuter is the original inner loop.
1791///
1792/// Before interchanging, we have the following structure
1793/// Outer preheader
1794// Outer header
1795// Inner preheader
1796// Inner header
1797// Inner body
1798// Inner latch
1799// outer bbs
1800// Outer latch
1801//
1802// After interchanging:
1803// Inner preheader
1804// Inner header
1805// Outer preheader
1806// Outer header
1807// Inner body
1808// outer bbs
1809// Outer latch
1810// Inner latch
1811void LoopInterchangeTransform::restructureLoops(
1812 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1813 BasicBlock *OrigOuterPreHeader) {
1814 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1815 // The original inner loop preheader moves from the new inner loop to
1816 // the parent loop, if there is one.
1817 NewInner->removeBlockFromLoop(BB: OrigInnerPreHeader);
1818 LI->changeLoopFor(BB: OrigInnerPreHeader, L: OuterLoopParent);
1819
1820 // Switch the loop levels.
1821 if (OuterLoopParent) {
1822 // Remove the loop from its parent loop.
1823 removeChildLoop(OuterLoop: OuterLoopParent, InnerLoop: NewInner);
1824 removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter);
1825 OuterLoopParent->addChildLoop(NewChild: NewOuter);
1826 } else {
1827 removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter);
1828 LI->changeTopLevelLoop(OldLoop: NewInner, NewLoop: NewOuter);
1829 }
1830 while (!NewOuter->isInnermost())
1831 NewInner->addChildLoop(NewChild: NewOuter->removeChildLoop(I: NewOuter->begin()));
1832 NewOuter->addChildLoop(NewChild: NewInner);
1833
1834 // BBs from the original inner loop.
1835 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1836
1837 // Add BBs from the original outer loop to the original inner loop (excluding
1838 // BBs already in inner loop)
1839 for (BasicBlock *BB : NewInner->blocks())
1840 if (LI->getLoopFor(BB) == NewInner)
1841 NewOuter->addBlockEntry(BB);
1842
1843 // Now remove inner loop header and latch from the new inner loop and move
1844 // other BBs (the loop body) to the new inner loop.
1845 BasicBlock *OuterHeader = NewOuter->getHeader();
1846 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1847 for (BasicBlock *BB : OrigInnerBBs) {
1848 // Nothing will change for BBs in child loops.
1849 if (LI->getLoopFor(BB) != NewOuter)
1850 continue;
1851 // Remove the new outer loop header and latch from the new inner loop.
1852 if (BB == OuterHeader || BB == OuterLatch)
1853 NewInner->removeBlockFromLoop(BB);
1854 else
1855 LI->changeLoopFor(BB, L: NewInner);
1856 }
1857
1858 // The preheader of the original outer loop becomes part of the new
1859 // outer loop.
1860 NewOuter->addBlockEntry(BB: OrigOuterPreHeader);
1861 LI->changeLoopFor(BB: OrigOuterPreHeader, L: NewOuter);
1862
1863 // Tell SE that we move the loops around.
1864 SE->forgetLoop(L: NewOuter);
1865}
1866
1867/// User can write, or optimizers can generate the reduction for inner loop.
1868/// To make the interchange valid, apply Reduction2Mem by moving the
1869/// initializer and store instructions into the inner loop. So far we only
1870/// handle cases where the reduction variable is initialized to a constant.
1871/// For example, below code:
1872///
1873/// loop:
1874/// re = phi<0.0, next>
1875/// next = re op ...
1876/// endloop
1877/// reduc_sum = phi<next> // lcssa phi
1878/// MEM_REF[idx] = reduc_sum // LcssaStore
1879///
1880/// is transformed into:
1881///
1882/// loop:
1883/// tmp = MEM_REF[idx];
1884/// new_var = !first_iteration ? tmp : 0.0;
1885/// next = new_var op ...
1886/// MEM_REF[idx] = next; // after moving
1887/// endloop
1888///
1889/// In this way the initial const is used in the first iteration of loop.
1890void LoopInterchangeTransform::reduction2Memory() {
1891 ArrayRef<LoopInterchangeLegality::InnerReduction> InnerReductions =
1892 LIL.getInnerReductions();
1893
1894 assert(InnerReductions.size() == 1 &&
1895 "So far we only support at most one reduction.");
1896
1897 LoopInterchangeLegality::InnerReduction SR = InnerReductions[0];
1898 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1899 IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
1900
1901 // Check if it's the first iteration.
1902 LLVMContext &Context = InnerLoopHeader->getContext();
1903 PHINode *FirstIter =
1904 Builder.CreatePHI(Ty: Type::getInt1Ty(C&: Context), NumReservedValues: 2, Name: "first.iter");
1905 FirstIter->addIncoming(V: ConstantInt::get(Ty: Type::getInt1Ty(C&: Context), V: 1),
1906 BB: InnerLoop->getLoopPreheader());
1907 FirstIter->addIncoming(V: ConstantInt::get(Ty: Type::getInt1Ty(C&: Context), V: 0),
1908 BB: InnerLoop->getLoopLatch());
1909 assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
1910
1911 // When the reduction is initialized from a constant value, we need to add
1912 // a stmt loading from the memory object to target basic block in inner
1913 // loop.
1914 Instruction *LoadMem = Builder.CreateLoad(Ty: SR.ElemTy, Ptr: SR.MemRef);
1915
1916 // Init new_var to MEM_REF or CONST depending on if it is the first iteration.
1917 Value *NewVar = Builder.CreateSelect(C: FirstIter, True: SR.Init, False: LoadMem, Name: "new.var");
1918
1919 // Replace all uses of the reduction variable with a new variable.
1920 SR.Reduction->replaceAllUsesWith(V: NewVar);
1921
1922 // Move store instruction into inner loop, just after reduction next's
1923 // definition.
1924 SR.LcssaStore->setOperand(i_nocapture: 0, Val_nocapture: SR.Next);
1925 SR.LcssaStore->moveAfter(MovePos: dyn_cast<Instruction>(Val: SR.Next));
1926}
1927
1928bool LoopInterchangeTransform::transform(
1929 ArrayRef<Instruction *> DropNoWrapInsts) {
1930 bool Transformed = false;
1931
1932 ArrayRef<LoopInterchangeLegality::InnerReduction> InnerReductions =
1933 LIL.getInnerReductions();
1934 if (InnerReductions.size() == 1)
1935 reduction2Memory();
1936
1937 if (InnerLoop->getSubLoops().empty()) {
1938 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1939 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1940 auto &InductionPHIs = LIL.getInnerLoopInductions();
1941 if (InductionPHIs.empty()) {
1942 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1943 return false;
1944 }
1945
1946 SmallVector<Instruction *, 8> InnerIndexVarList;
1947 for (PHINode *CurInductionPHI : InductionPHIs) {
1948 if (CurInductionPHI->getIncomingBlock(i: 0) == InnerLoopPreHeader)
1949 InnerIndexVarList.push_back(
1950 Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: 1)));
1951 else
1952 InnerIndexVarList.push_back(
1953 Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: 0)));
1954 }
1955
1956 // Create a new latch block for the inner loop. We split at the
1957 // current latch's terminator and then move the condition and all
1958 // operands that are not either loop-invariant or the induction PHI into the
1959 // new latch block.
1960 BasicBlock *NewLatch =
1961 SplitBlock(Old: InnerLoop->getLoopLatch(),
1962 SplitPt: InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1963
1964 SmallSetVector<Instruction *, 4> WorkList;
1965 unsigned i = 0;
1966 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1967 for (; i < WorkList.size(); i++) {
1968 // Duplicate instruction and move it the new latch. Update uses that
1969 // have been moved.
1970 Instruction *NewI = WorkList[i]->clone();
1971 NewI->insertBefore(InsertPos: NewLatch->getFirstNonPHIIt());
1972 assert(!NewI->mayHaveSideEffects() &&
1973 "Moving instructions with side-effects may change behavior of "
1974 "the loop nest!");
1975 for (Use &U : llvm::make_early_inc_range(Range: WorkList[i]->uses())) {
1976 Instruction *UserI = cast<Instruction>(Val: U.getUser());
1977 if (!InnerLoop->contains(BB: UserI->getParent()) ||
1978 UserI->getParent() == NewLatch ||
1979 llvm::is_contained(Range: InductionPHIs, Element: UserI))
1980 U.set(NewI);
1981 }
1982 // Add operands of moved instruction to the worklist, except if they are
1983 // outside the inner loop or are the induction PHI.
1984 for (Value *Op : WorkList[i]->operands()) {
1985 Instruction *OpI = dyn_cast<Instruction>(Val: Op);
1986 if (!OpI ||
1987 this->LI->getLoopFor(BB: OpI->getParent()) != this->InnerLoop ||
1988 llvm::is_contained(Range: InductionPHIs, Element: OpI))
1989 continue;
1990 WorkList.insert(X: OpI);
1991 }
1992 }
1993 };
1994
1995 // FIXME: Should we interchange when we have a constant condition?
1996 Instruction *CondI = dyn_cast<Instruction>(
1997 Val: cast<CondBrInst>(Val: InnerLoop->getLoopLatch()->getTerminator())
1998 ->getCondition());
1999 if (CondI)
2000 WorkList.insert(X: CondI);
2001 MoveInstructions();
2002 for (Instruction *InnerIndexVar : InnerIndexVarList)
2003 WorkList.insert(X: cast<Instruction>(Val: InnerIndexVar));
2004 MoveInstructions();
2005 }
2006
2007 // Ensure the inner loop phi nodes have a separate basic block.
2008 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2009 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
2010 InnerLoopHeader->getTerminator()) {
2011 SplitBlock(Old: InnerLoopHeader, SplitPt: InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
2012 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
2013 }
2014
2015 // Instructions in the original inner loop preheader may depend on values
2016 // defined in the outer loop header. Move them there, because the original
2017 // inner loop preheader will become the entry into the interchanged loop nest.
2018 // Currently we move all instructions and rely on LICM to move invariant
2019 // instructions outside the loop nest.
2020 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2021 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2022 if (InnerLoopPreHeader != OuterLoopHeader) {
2023 for (Instruction &I :
2024 make_early_inc_range(Range: make_range(x: InnerLoopPreHeader->begin(),
2025 y: std::prev(x: InnerLoopPreHeader->end()))))
2026 I.moveBeforePreserving(MovePos: OuterLoopHeader->getTerminator()->getIterator());
2027 }
2028
2029 Transformed |= adjustLoopLinks();
2030 if (!Transformed) {
2031 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
2032 return false;
2033 }
2034
2035 // Finally, drop the nsw/nuw flags from the instructions for reduction
2036 // calculations.
2037 for (Instruction *Reduction : DropNoWrapInsts) {
2038 Reduction->setHasNoSignedWrap(false);
2039 Reduction->setHasNoUnsignedWrap(false);
2040 }
2041
2042 return true;
2043}
2044
2045/// \brief Move all instructions except the terminator from FromBB right before
2046/// InsertBefore
2047static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
2048 BasicBlock *ToBB = InsertBefore->getParent();
2049
2050 ToBB->splice(ToIt: InsertBefore->getIterator(), FromBB, FromBeginIt: FromBB->begin(),
2051 FromEndIt: FromBB->getTerminator()->getIterator());
2052}
2053
2054/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
2055static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
2056 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
2057 // from BB1 afterwards.
2058 auto Iter = map_range(C&: *BB1, F: [](Instruction &I) { return &I; });
2059 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(x: Iter.end()));
2060 for (Instruction *I : TempInstrs)
2061 I->removeFromParent();
2062
2063 // Move instructions from BB2 to BB1.
2064 moveBBContents(FromBB: BB2, InsertBefore: BB1->getTerminator());
2065
2066 // Move instructions from TempInstrs to BB2.
2067 for (Instruction *I : TempInstrs)
2068 I->insertBefore(InsertPos: BB2->getTerminator()->getIterator());
2069}
2070
2071// Update BI to jump to NewBB instead of OldBB. Records updates to the
2072// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
2073// \p OldBB is exactly once in BI's successor list.
2074static void updateSuccessor(Instruction *Term, BasicBlock *OldBB,
2075 BasicBlock *NewBB,
2076 std::vector<DominatorTree::UpdateType> &DTUpdates,
2077 bool MustUpdateOnce = true) {
2078 assert((!MustUpdateOnce || llvm::count(successors(Term), OldBB) == 1) &&
2079 "BI must jump to OldBB exactly once.");
2080 bool Changed = false;
2081 for (Use &Op : Term->operands())
2082 if (Op == OldBB) {
2083 Op.set(NewBB);
2084 Changed = true;
2085 }
2086
2087 if (Changed) {
2088 DTUpdates.push_back(
2089 x: {DominatorTree::UpdateKind::Insert, Term->getParent(), NewBB});
2090 DTUpdates.push_back(
2091 x: {DominatorTree::UpdateKind::Delete, Term->getParent(), OldBB});
2092 }
2093 assert(Changed && "Expected a successor to be updated");
2094}
2095
2096// Move Lcssa PHIs to the right place.
2097static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
2098 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
2099 BasicBlock *OuterLatch, BasicBlock *OuterExit,
2100 Loop *InnerLoop, LoopInfo *LI) {
2101
2102 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
2103 // defined either in the header or latch. Those blocks will become header and
2104 // latch of the new outer loop, and the only possible users can PHI nodes
2105 // in the exit block of the loop nest or the outer loop header (reduction
2106 // PHIs, in that case, the incoming value must be defined in the inner loop
2107 // header). We can just substitute the user with the incoming value and remove
2108 // the PHI.
2109 for (PHINode &P : make_early_inc_range(Range: InnerExit->phis())) {
2110 assert(P.getNumIncomingValues() == 1 &&
2111 "Only loops with a single exit are supported!");
2112
2113 // Incoming values are guaranteed be instructions currently.
2114 auto IncI = cast<Instruction>(Val: P.getIncomingValueForBlock(BB: InnerLatch));
2115 // In case of multi-level nested loops, follow LCSSA to find the incoming
2116 // value defined from the innermost loop.
2117 auto IncIInnerMost = cast<Instruction>(Val: followLCSSA(SV: IncI));
2118 // Skip phis with incoming values from the inner loop body, excluding the
2119 // header and latch.
2120 if (IncIInnerMost->getParent() != InnerLatch &&
2121 IncIInnerMost->getParent() != InnerHeader)
2122 continue;
2123
2124 assert(all_of(P.users(),
2125 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
2126 return (cast<PHINode>(U)->getParent() == OuterHeader &&
2127 IncI->getParent() == InnerHeader) ||
2128 cast<PHINode>(U)->getParent() == OuterExit;
2129 }) &&
2130 "Can only replace phis iff the uses are in the loop nest exit or "
2131 "the incoming value is defined in the inner header (it will "
2132 "dominate all loop blocks after interchanging)");
2133 P.replaceAllUsesWith(V: IncI);
2134 P.eraseFromParent();
2135 }
2136
2137 SmallVector<PHINode *, 8> LcssaInnerExit(
2138 llvm::make_pointer_range(Range: InnerExit->phis()));
2139
2140 SmallVector<PHINode *, 8> LcssaInnerLatch(
2141 llvm::make_pointer_range(Range: InnerLatch->phis()));
2142
2143 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
2144 // If a PHI node has users outside of InnerExit, it has a use outside the
2145 // interchanged loop and we have to preserve it. We move these to
2146 // InnerLatch, which will become the new exit block for the innermost
2147 // loop after interchanging.
2148 for (PHINode *P : LcssaInnerExit)
2149 P->moveBefore(InsertPos: InnerLatch->getFirstNonPHIIt());
2150
2151 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
2152 // and we have to move them to the new inner latch.
2153 for (PHINode *P : LcssaInnerLatch)
2154 P->moveBefore(InsertPos: InnerExit->getFirstNonPHIIt());
2155
2156 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
2157 // incoming values defined in the outer loop, we have to add a new PHI
2158 // in the inner loop latch, which became the exit block of the outer loop,
2159 // after interchanging.
2160 if (OuterExit) {
2161 for (PHINode &P : OuterExit->phis()) {
2162 if (P.getNumIncomingValues() != 1)
2163 continue;
2164 // Skip Phis with incoming values defined in the inner loop. Those should
2165 // already have been updated.
2166 auto I = dyn_cast<Instruction>(Val: P.getIncomingValue(i: 0));
2167 if (!I || LI->getLoopFor(BB: I->getParent()) == InnerLoop)
2168 continue;
2169
2170 PHINode *NewPhi = dyn_cast<PHINode>(Val: P.clone());
2171 NewPhi->setIncomingValue(i: 0, V: P.getIncomingValue(i: 0));
2172 NewPhi->setIncomingBlock(i: 0, BB: OuterLatch);
2173 // We might have incoming edges from other BBs, i.e., the original outer
2174 // header.
2175 for (auto *Pred : predecessors(BB: InnerLatch)) {
2176 if (Pred == OuterLatch)
2177 continue;
2178 NewPhi->addIncoming(V: P.getIncomingValue(i: 0), BB: Pred);
2179 }
2180 NewPhi->insertBefore(InsertPos: InnerLatch->getFirstNonPHIIt());
2181 P.setIncomingValue(i: 0, V: NewPhi);
2182 }
2183 }
2184
2185 // Now adjust the incoming blocks for the LCSSA PHIs.
2186 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
2187 // with the new latch.
2188 InnerLatch->replacePhiUsesWith(Old: InnerLatch, New: OuterLatch);
2189}
2190
2191/// This deals with a corner case when a LCSSA phi node appears in a non-exit
2192/// block: the outer loop latch block does not need to be exit block of the
2193/// inner loop. Consider a loop that was in LCSSA form, but then some
2194/// transformation like loop-unswitch comes along and creates an empty block,
2195/// where BB5 in this example is the outer loop latch block:
2196///
2197/// BB4:
2198/// br label %BB5
2199/// BB5:
2200/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
2201/// br outer.header
2202///
2203/// Interchange then brings it in LCSSA form again resulting in this chain of
2204/// single-input phi nodes:
2205///
2206/// BB4:
2207/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
2208/// br label %BB5
2209/// BB5:
2210/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
2211///
2212/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
2213/// use before the def if we don't check this. The solution is to simplify
2214/// lcssa phi nodes (remove) if they appear in non-exit blocks.
2215///
2216static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
2217 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
2218 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2219
2220 // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
2221 if (OuterLoopLatch == InnerLoopExit)
2222 return;
2223
2224 // Collect and remove phis in non-exit blocks if they have 1 input.
2225 SmallVector<PHINode *, 8> Phis(
2226 llvm::make_pointer_range(Range: OuterLoopLatch->phis()));
2227 for (PHINode *Phi : Phis) {
2228 assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
2229 LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
2230 << "\n");
2231 Phi->replaceAllUsesWith(V: Phi->getIncomingValue(i: 0));
2232 Phi->eraseFromParent();
2233 }
2234}
2235
2236bool LoopInterchangeTransform::adjustLoopBranches() {
2237 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
2238 std::vector<DominatorTree::UpdateType> DTUpdates;
2239
2240 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2241 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2242
2243 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
2244 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
2245 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
2246
2247 simplifyLCSSAPhis(OuterLoop, InnerLoop);
2248
2249 // Ensure that both preheaders do not contain PHI nodes and have single
2250 // predecessors. This allows us to move them easily. We use
2251 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
2252 // preheaders do not satisfy those conditions.
2253 if (isa<PHINode>(Val: OuterLoopPreHeader->begin()) ||
2254 !OuterLoopPreHeader->getUniquePredecessor())
2255 OuterLoopPreHeader =
2256 InsertPreheaderForLoop(L: OuterLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true);
2257 if (InnerLoopPreHeader == OuterLoop->getHeader())
2258 InnerLoopPreHeader =
2259 InsertPreheaderForLoop(L: InnerLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true);
2260
2261 // Adjust the loop preheader
2262 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2263 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2264 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
2265 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2266 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
2267 BasicBlock *InnerLoopLatchPredecessor =
2268 InnerLoopLatch->getUniquePredecessor();
2269 BasicBlock *InnerLoopLatchSuccessor;
2270 BasicBlock *OuterLoopLatchSuccessor;
2271
2272 CondBrInst *OuterLoopLatchBI =
2273 dyn_cast<CondBrInst>(Val: OuterLoopLatch->getTerminator());
2274 CondBrInst *InnerLoopLatchBI =
2275 dyn_cast<CondBrInst>(Val: InnerLoopLatch->getTerminator());
2276 Instruction *OuterLoopHeaderBI = OuterLoopHeader->getTerminator();
2277 Instruction *InnerLoopHeaderBI = InnerLoopHeader->getTerminator();
2278
2279 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
2280 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
2281 !InnerLoopHeaderBI)
2282 return false;
2283
2284 Instruction *InnerLoopLatchPredecessorBI =
2285 InnerLoopLatchPredecessor->getTerminator();
2286 Instruction *OuterLoopPredecessorBI = OuterLoopPredecessor->getTerminator();
2287
2288 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
2289 return false;
2290 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
2291 if (!InnerLoopHeaderSuccessor)
2292 return false;
2293
2294 // Adjust Loop Preheader and headers.
2295 // The branches in the outer loop predecessor and the outer loop header can
2296 // be unconditional branches or conditional branches with duplicates. Consider
2297 // this when updating the successors.
2298 updateSuccessor(Term: OuterLoopPredecessorBI, OldBB: OuterLoopPreHeader,
2299 NewBB: InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
2300 // The outer loop header might or might not branch to the outer latch.
2301 // We are guaranteed to branch to the inner loop preheader.
2302 if (llvm::is_contained(Range: successors(I: OuterLoopHeaderBI), Element: OuterLoopLatch)) {
2303 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
2304 updateSuccessor(Term: OuterLoopHeaderBI, OldBB: OuterLoopLatch, NewBB: InnerLoopLatch,
2305 DTUpdates,
2306 /*MustUpdateOnce=*/false);
2307 }
2308 updateSuccessor(Term: OuterLoopHeaderBI, OldBB: InnerLoopPreHeader,
2309 NewBB: InnerLoopHeaderSuccessor, DTUpdates,
2310 /*MustUpdateOnce=*/false);
2311
2312 // Adjust reduction PHI's now that the incoming block has changed.
2313 InnerLoopHeaderSuccessor->replacePhiUsesWith(Old: InnerLoopHeader,
2314 New: OuterLoopHeader);
2315
2316 updateSuccessor(Term: InnerLoopHeaderBI, OldBB: InnerLoopHeaderSuccessor,
2317 NewBB: OuterLoopPreHeader, DTUpdates);
2318
2319 // -------------Adjust loop latches-----------
2320 if (InnerLoopLatchBI->getSuccessor(i: 0) == InnerLoopHeader)
2321 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: 1);
2322 else
2323 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: 0);
2324
2325 updateSuccessor(Term: InnerLoopLatchPredecessorBI, OldBB: InnerLoopLatch,
2326 NewBB: InnerLoopLatchSuccessor, DTUpdates);
2327
2328 if (OuterLoopLatchBI->getSuccessor(i: 0) == OuterLoopHeader)
2329 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: 1);
2330 else
2331 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: 0);
2332
2333 updateSuccessor(Term: InnerLoopLatchBI, OldBB: InnerLoopLatchSuccessor,
2334 NewBB: OuterLoopLatchSuccessor, DTUpdates);
2335 updateSuccessor(Term: OuterLoopLatchBI, OldBB: OuterLoopLatchSuccessor, NewBB: InnerLoopLatch,
2336 DTUpdates);
2337
2338 DT->applyUpdates(Updates: DTUpdates);
2339 restructureLoops(NewInner: OuterLoop, NewOuter: InnerLoop, OrigInnerPreHeader: InnerLoopPreHeader,
2340 OrigOuterPreHeader: OuterLoopPreHeader);
2341
2342 moveLCSSAPhis(InnerExit: InnerLoopLatchSuccessor, InnerHeader: InnerLoopHeader, InnerLatch: InnerLoopLatch,
2343 OuterHeader: OuterLoopHeader, OuterLatch: OuterLoopLatch, OuterExit: InnerLoop->getExitBlock(),
2344 InnerLoop, LI);
2345 // For PHIs in the exit block of the outer loop, outer's latch has been
2346 // replaced by Inners'.
2347 OuterLoopLatchSuccessor->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch);
2348
2349 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2350 // Now update the reduction PHIs in the inner and outer loop headers.
2351 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
2352 for (PHINode &PHI : InnerLoopHeader->phis())
2353 if (OuterInnerReductions.contains(Ptr: &PHI))
2354 InnerLoopPHIs.push_back(Elt: &PHI);
2355
2356 for (PHINode &PHI : OuterLoopHeader->phis())
2357 if (OuterInnerReductions.contains(Ptr: &PHI))
2358 OuterLoopPHIs.push_back(Elt: &PHI);
2359
2360 // Now move the remaining reduction PHIs from outer to inner loop header and
2361 // vice versa. The PHI nodes must be part of a reduction across the inner and
2362 // outer loop and all the remains to do is and updating the incoming blocks.
2363 for (PHINode *PHI : OuterLoopPHIs) {
2364 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2365 PHI->moveBefore(InsertPos: InnerLoopHeader->getFirstNonPHIIt());
2366 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2367 }
2368 for (PHINode *PHI : InnerLoopPHIs) {
2369 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2370 PHI->moveBefore(InsertPos: OuterLoopHeader->getFirstNonPHIIt());
2371 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2372 }
2373
2374 // Update the incoming blocks for moved PHI nodes.
2375 OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopPreHeader, New: OuterLoopPreHeader);
2376 OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopLatch, New: OuterLoopLatch);
2377 InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopPreHeader, New: InnerLoopPreHeader);
2378 InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch);
2379
2380 // Values defined in the outer loop header could be used in the inner loop
2381 // latch. In that case, we need to create LCSSA phis for them, because after
2382 // interchanging they will be defined in the new inner loop and used in the
2383 // new outer loop.
2384 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
2385 for (Instruction &I :
2386 make_range(x: OuterLoopHeader->begin(), y: std::prev(x: OuterLoopHeader->end())))
2387 MayNeedLCSSAPhis.push_back(Elt: &I);
2388 formLCSSAForInstructions(Worklist&: MayNeedLCSSAPhis, DT: *DT, LI: *LI, SE);
2389
2390 return true;
2391}
2392
2393bool LoopInterchangeTransform::adjustLoopLinks() {
2394 // Adjust all branches in the inner and outer loop.
2395 bool Changed = adjustLoopBranches();
2396 if (Changed) {
2397 // We have interchanged the preheaders so we need to interchange the data in
2398 // the preheaders as well. This is because the content of the inner
2399 // preheader was previously executed inside the outer loop.
2400 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2401 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2402 swapBBContents(BB1: OuterLoopPreHeader, BB2: InnerLoopPreHeader);
2403 }
2404 return Changed;
2405}
2406
2407PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
2408 LoopAnalysisManager &AM,
2409 LoopStandardAnalysisResults &AR,
2410 LPMUpdater &U) {
2411 Function &F = *LN.getParent();
2412 SmallVector<Loop *, 8> LoopList(LN.getLoops());
2413
2414 if (MaxMemInstrCount < 1) {
2415 LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
2416 return PreservedAnalyses::all();
2417 }
2418 OptimizationRemarkEmitter ORE(&F);
2419
2420 // Ensure minimum depth of the loop nest to do the interchange.
2421 if (!hasSupportedLoopDepth(LoopList, ORE))
2422 return PreservedAnalyses::all();
2423 // Ensure computable loop nest.
2424 if (!isComputableLoopNest(SE: &AR.SE, LoopList)) {
2425 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2426 return PreservedAnalyses::all();
2427 }
2428
2429 ORE.emit(RemarkBuilder: [&]() {
2430 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
2431 LN.getOutermostLoop().getStartLoc(),
2432 LN.getOutermostLoop().getHeader())
2433 << "Computed dependence info, invoking the transform.";
2434 });
2435
2436 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2437 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2438 return PreservedAnalyses::all();
2439 U.markLoopNestChanged(Changed: true);
2440 return getLoopPassPreservedAnalyses();
2441}
2442