1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Transforms/Scalar/LoopInterchange.h"
16#include "llvm/ADT/STLExtras.h"
17#include "llvm/ADT/SmallSet.h"
18#include "llvm/ADT/SmallVector.h"
19#include "llvm/ADT/Statistic.h"
20#include "llvm/ADT/StringMap.h"
21#include "llvm/ADT/StringRef.h"
22#include "llvm/Analysis/DependenceAnalysis.h"
23#include "llvm/Analysis/LoopCacheAnalysis.h"
24#include "llvm/Analysis/LoopInfo.h"
25#include "llvm/Analysis/LoopNestAnalysis.h"
26#include "llvm/Analysis/LoopPass.h"
27#include "llvm/Analysis/OptimizationRemarkEmitter.h"
28#include "llvm/Analysis/ScalarEvolution.h"
29#include "llvm/Analysis/ScalarEvolutionExpressions.h"
30#include "llvm/IR/BasicBlock.h"
31#include "llvm/IR/DiagnosticInfo.h"
32#include "llvm/IR/Dominators.h"
33#include "llvm/IR/Function.h"
34#include "llvm/IR/IRBuilder.h"
35#include "llvm/IR/InstrTypes.h"
36#include "llvm/IR/Instruction.h"
37#include "llvm/IR/Instructions.h"
38#include "llvm/IR/User.h"
39#include "llvm/IR/Value.h"
40#include "llvm/Support/Casting.h"
41#include "llvm/Support/CommandLine.h"
42#include "llvm/Support/Debug.h"
43#include "llvm/Support/ErrorHandling.h"
44#include "llvm/Support/raw_ostream.h"
45#include "llvm/Transforms/Scalar/LoopPassManager.h"
46#include "llvm/Transforms/Utils/BasicBlockUtils.h"
47#include "llvm/Transforms/Utils/Local.h"
48#include "llvm/Transforms/Utils/LoopUtils.h"
49#include <cassert>
50#include <utility>
51#include <vector>
52
53using namespace llvm;
54
55#define DEBUG_TYPE "loop-interchange"
56
57STATISTIC(LoopsInterchanged, "Number of loops interchanged");
58
59static cl::opt<int> LoopInterchangeCostThreshold(
60 "loop-interchange-threshold", cl::init(Val: 0), cl::Hidden,
61 cl::desc("Interchange if you gain more than this number"));
62
63static cl::opt<unsigned int> MaxMemInstrRatio(
64 "loop-interchange-max-mem-instr-ratio", cl::init(Val: 4), cl::Hidden,
65 cl::desc("Maximum number of load/store instructions squared in relation to "
66 "the total number of instructions. Higher value may lead to more "
67 "interchanges at the cost of compile-time"));
68
69namespace {
70
71using LoopVector = SmallVector<Loop *, 8>;
72
73/// A list of direction vectors. Each entry represents a direction vector
74/// corresponding to one or more dependencies existing in the loop nest. The
75/// length of all direction vectors is equal and is N + 1, where N is the depth
76/// of the loop nest. The first N elements correspond to the dependency
77/// direction of each N loops. The last one indicates whether this entry is
78/// forward dependency ('<') or not ('*'). The term "forward" aligns with what
79/// is defined in LoopAccessAnalysis.
80// TODO: Check if we can use a sparse matrix here.
81using CharMatrix = std::vector<std::vector<char>>;
82
83/// Types of rules used in profitability check.
84enum class RuleTy {
85 PerLoopCacheAnalysis,
86 PerInstrOrderCost,
87 ForVectorization,
88 Ignore
89};
90
91} // end anonymous namespace
92
93// Minimum loop depth supported.
94static cl::opt<unsigned int> MinLoopNestDepth(
95 "loop-interchange-min-loop-nest-depth", cl::init(Val: 2), cl::Hidden,
96 cl::desc("Minimum depth of loop nest considered for the transform"));
97
98// Maximum loop depth supported.
99static cl::opt<unsigned int> MaxLoopNestDepth(
100 "loop-interchange-max-loop-nest-depth", cl::init(Val: 10), cl::Hidden,
101 cl::desc("Maximum depth of loop nest considered for the transform"));
102
103// We prefer cache cost to vectorization by default.
104static cl::list<RuleTy> Profitabilities(
105 "loop-interchange-profitabilities", cl::MiscFlags::CommaSeparated,
106 cl::Hidden,
107 cl::desc("List of profitability heuristics to be used. They are applied in "
108 "the given order"),
109 cl::list_init<RuleTy>(Vals: {RuleTy::PerInstrOrderCost,
110 RuleTy::ForVectorization}),
111 cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache",
112 "Prioritize loop cache cost"),
113 clEnumValN(RuleTy::PerInstrOrderCost, "instorder",
114 "Prioritize the IVs order of each instruction"),
115 clEnumValN(RuleTy::ForVectorization, "vectorize",
116 "Prioritize vectorization"),
117 clEnumValN(RuleTy::Ignore, "ignore",
118 "Ignore profitability, force interchange (does not "
119 "work with other options)")));
120
121// Support for the inner-loop reduction pattern.
122static cl::opt<bool> EnableReduction2Memory(
123 "loop-interchange-reduction-to-mem", cl::init(Val: false), cl::Hidden,
124 cl::desc("Support for the inner-loop reduction pattern."));
125
126#ifndef NDEBUG
127static bool noDuplicateRulesAndIgnore(ArrayRef<RuleTy> Rules) {
128 SmallSet<RuleTy, 4> Set;
129 for (RuleTy Rule : Rules) {
130 if (!Set.insert(Rule).second)
131 return false;
132 if (Rule == RuleTy::Ignore)
133 return false;
134 }
135 return true;
136}
137
138static void printDepMatrix(CharMatrix &DepMatrix) {
139 for (auto &Row : DepMatrix) {
140 // Drop the last element because it is a flag indicating whether this is
141 // forward dependency or not, which doesn't affect the legality check.
142 for (char D : drop_end(Row))
143 LLVM_DEBUG(dbgs() << D << " ");
144 LLVM_DEBUG(dbgs() << "\n");
145 }
146}
147
148/// Return true if \p Src appears before \p Dst in the same basic block.
149/// Precondition: \p Src and \Dst are distinct instructions within the same
150/// basic block.
151static bool inThisOrder(const Instruction *Src, const Instruction *Dst) {
152 assert(Src->getParent() == Dst->getParent() && Src != Dst &&
153 "Expected Src and Dst to be different instructions in the same BB");
154
155 bool FoundSrc = false;
156 for (const Instruction &I : *(Src->getParent())) {
157 if (&I == Src) {
158 FoundSrc = true;
159 continue;
160 }
161 if (&I == Dst)
162 return FoundSrc;
163 }
164
165 llvm_unreachable("Dst not found");
166}
167#endif
168
169static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
170 Loop *L, DependenceInfo *DI,
171 ScalarEvolution *SE,
172 OptimizationRemarkEmitter *ORE) {
173 using ValueVector = SmallVector<Value *, 16>;
174
175 ValueVector MemInstr;
176 unsigned NumInsts = 0;
177
178 // For each block.
179 for (BasicBlock *BB : L->blocks()) {
180 // Scan the BB and collect legal loads and stores.
181 for (Instruction &I : *BB) {
182 NumInsts++;
183 if (auto *Ld = dyn_cast<LoadInst>(Val: &I)) {
184 if (!Ld->isSimple())
185 return false;
186 MemInstr.push_back(Elt: &I);
187 } else if (auto *St = dyn_cast<StoreInst>(Val: &I)) {
188 if (!St->isSimple())
189 return false;
190 MemInstr.push_back(Elt: &I);
191 }
192 }
193 }
194
195 // To populate the dependence matrix, we perform dependence test for each pair
196 // of memory instructions, which has O(NumMemInstr^2) complexity. This implies
197 // that even if the number of memory instructions is small, the analysis can
198 // still be expensive if the most of the instructions in the loop are memory
199 // instructions. On the other hand, if the number of memory instructions is
200 // not small, but the loop is large (i.e., it contains many non-memory
201 // instructions), the analysis can still be affordable.
202 unsigned NumMemInstr = MemInstr.size();
203 LLVM_DEBUG(dbgs() << "Found " << NumMemInstr
204 << " Loads and Stores to analyze\n");
205 if (MaxMemInstrRatio * NumInsts < NumMemInstr * NumMemInstr) {
206 ORE->emit(RemarkBuilder: [&]() {
207 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
208 L->getStartLoc(), L->getHeader())
209 << "Number of loads/stores exceeded, the supported maximum can be "
210 "increased with option -loop-interchange-max-mem-instr-ratio.";
211 });
212 return false;
213 }
214 ValueVector::iterator I, IE, J, JE;
215
216 // Manage direction vectors that are already seen. Map each direction vector
217 // to an index of DepMatrix at which it is stored.
218 StringMap<unsigned> Seen;
219
220 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
221 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
222 std::vector<char> Dep;
223 Instruction *Src = cast<Instruction>(Val: *I);
224 Instruction *Dst = cast<Instruction>(Val: *J);
225 // Ignore Input dependencies.
226 if (isa<LoadInst>(Val: Src) && isa<LoadInst>(Val: Dst))
227 continue;
228 // Track Output, Flow, and Anti dependencies.
229 if (auto D = DI->depends(Src, Dst)) {
230 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
231 // If the direction vector is negative, normalize it to
232 // make it non-negative.
233 if (D->normalize(SE))
234 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
235 LLVM_DEBUG(StringRef DepType =
236 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
237 dbgs() << "Found " << DepType
238 << " dependency between Src and Dst\n"
239 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
240 unsigned Levels = D->getLevels();
241 char Direction;
242 for (unsigned II = 1; II <= Levels; ++II) {
243 // `DVEntry::LE` is converted to `*`. This is because `LE` means `<`
244 // or `=`, for which we don't have an equivalent representation, so
245 // that the conservative approximation is necessary. The same goes for
246 // `DVEntry::GE`.
247 // TODO: Use of fine-grained expressions allows for more accurate
248 // analysis.
249 unsigned Dir = D->getDirection(Level: II);
250 if (Dir == Dependence::DVEntry::LT)
251 Direction = '<';
252 else if (Dir == Dependence::DVEntry::GT)
253 Direction = '>';
254 else if (Dir == Dependence::DVEntry::EQ)
255 Direction = '=';
256 else
257 Direction = '*';
258 Dep.push_back(x: Direction);
259 }
260
261 // If the Dependence object doesn't have any information, fill the
262 // dependency vector with '*'.
263 if (D->isConfused()) {
264 assert(Dep.empty() && "Expected empty dependency vector");
265 Dep.assign(n: Level, val: '*');
266 }
267
268 while (Dep.size() != Level) {
269 Dep.push_back(x: 'I');
270 }
271
272 // If all the elements of any direction vector have only '*', legality
273 // can't be proven. Exit early to save compile time.
274 if (all_of(Range&: Dep, P: equal_to(Arg: '*'))) {
275 ORE->emit(RemarkBuilder: [&]() {
276 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
277 L->getStartLoc(), L->getHeader())
278 << "All loops have dependencies in all directions.";
279 });
280 return false;
281 }
282
283 // Test whether the dependency is forward or not.
284 bool IsKnownForward = true;
285 if (Src->getParent() != Dst->getParent()) {
286 // In general, when Src and Dst are in different BBs, the execution
287 // order of them within a single iteration is not guaranteed. Treat
288 // conservatively as not-forward dependency in this case.
289 IsKnownForward = false;
290 } else {
291 // Src and Dst are in the same BB. If they are the different
292 // instructions, Src should appear before Dst in the BB as they are
293 // stored to MemInstr in that order.
294 assert((Src == Dst || inThisOrder(Src, Dst)) &&
295 "Unexpected instructions");
296
297 // If the Dependence object is reversed (due to normalization), it
298 // represents the dependency from Dst to Src, meaning it is a backward
299 // dependency. Otherwise it should be a forward dependency.
300 bool IsReversed = D->getSrc() != Src;
301 if (IsReversed)
302 IsKnownForward = false;
303 }
304
305 // Initialize the last element. Assume forward dependencies only; it
306 // will be updated later if there is any non-forward dependency.
307 Dep.push_back(x: '<');
308
309 // The last element should express the "summary" among one or more
310 // direction vectors whose first N elements are the same (where N is
311 // the depth of the loop nest). Hence we exclude the last element from
312 // the Seen map.
313 auto [Ite, Inserted] = Seen.try_emplace(
314 Key: StringRef(Dep.data(), Dep.size() - 1), Args: DepMatrix.size());
315
316 // Make sure we only add unique entries to the dependency matrix.
317 if (Inserted)
318 DepMatrix.push_back(x: Dep);
319
320 // If we cannot prove that this dependency is forward, change the last
321 // element of the corresponding entry. Since a `[... *]` dependency
322 // includes a `[... <]` dependency, we do not need to keep both and
323 // change the existing entry instead.
324 if (!IsKnownForward)
325 DepMatrix[Ite->second].back() = '*';
326 }
327 }
328 }
329
330 return true;
331}
332
333// A loop is moved from index 'from' to an index 'to'. Update the Dependence
334// matrix by exchanging the two columns.
335static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
336 unsigned ToIndx) {
337 for (auto &Row : DepMatrix)
338 std::swap(a&: Row[ToIndx], b&: Row[FromIndx]);
339}
340
341// Check if a direction vector is lexicographically positive. Return true if it
342// is positive, nullopt if it is "zero", otherwise false.
343// [Theorem] A permutation of the loops in a perfect nest is legal if and only
344// if the direction matrix, after the same permutation is applied to its
345// columns, has no ">" direction as the leftmost non-"=" direction in any row.
346static std::optional<bool>
347isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) {
348 for (unsigned char Direction : DV.slice(N: Begin, M: End - Begin)) {
349 if (Direction == '<')
350 return true;
351 if (Direction == '>' || Direction == '*')
352 return false;
353 }
354 return std::nullopt;
355}
356
357// Checks if it is legal to interchange 2 loops.
358static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
359 unsigned InnerLoopId,
360 unsigned OuterLoopId) {
361 unsigned NumRows = DepMatrix.size();
362 std::vector<char> Cur;
363 // For each row check if it is valid to interchange.
364 for (unsigned Row = 0; Row < NumRows; ++Row) {
365 // Create temporary DepVector check its lexicographical order
366 // before and after swapping OuterLoop vs InnerLoop
367 Cur = DepMatrix[Row];
368
369 // If the surrounding loops already ensure that the direction vector is
370 // lexicographically positive, nothing within the loop will be able to break
371 // the dependence. In such a case we can skip the subsequent check.
372 if (isLexicographicallyPositive(DV: Cur, Begin: 0, End: OuterLoopId) == true)
373 continue;
374
375 // Check if the direction vector is lexicographically positive (or zero)
376 // for both before/after exchanged. Ignore the last element because it
377 // doesn't affect the legality.
378 if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size() - 1) == false)
379 return false;
380 std::swap(a&: Cur[InnerLoopId], b&: Cur[OuterLoopId]);
381 if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size() - 1) == false)
382 return false;
383 }
384 return true;
385}
386
387static void populateWorklist(Loop &L, LoopVector &LoopList) {
388 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
389 << L.getHeader()->getParent()->getName() << " Loop: %"
390 << L.getHeader()->getName() << '\n');
391 assert(LoopList.empty() && "LoopList should initially be empty!");
392 Loop *CurrentLoop = &L;
393 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
394 while (!Vec->empty()) {
395 // The current loop has multiple subloops in it hence it is not tightly
396 // nested.
397 // Discard all loops above it added into Worklist.
398 if (Vec->size() != 1) {
399 LoopList = {};
400 return;
401 }
402
403 LoopList.push_back(Elt: CurrentLoop);
404 CurrentLoop = Vec->front();
405 Vec = &CurrentLoop->getSubLoops();
406 }
407 LoopList.push_back(Elt: CurrentLoop);
408}
409
410static bool hasSupportedLoopDepth(ArrayRef<Loop *> LoopList,
411 OptimizationRemarkEmitter &ORE) {
412 unsigned LoopNestDepth = LoopList.size();
413 if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) {
414 LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth
415 << ", the supported range is [" << MinLoopNestDepth
416 << ", " << MaxLoopNestDepth << "].\n");
417 Loop *OuterLoop = LoopList.front();
418 ORE.emit(RemarkBuilder: [&]() {
419 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth",
420 OuterLoop->getStartLoc(),
421 OuterLoop->getHeader())
422 << "Unsupported depth of loop nest, the supported range is ["
423 << std::to_string(val: MinLoopNestDepth) << ", "
424 << std::to_string(val: MaxLoopNestDepth) << "].\n";
425 });
426 return false;
427 }
428 return true;
429}
430
431static bool isComputableLoopNest(ScalarEvolution *SE,
432 ArrayRef<Loop *> LoopList) {
433 for (Loop *L : LoopList) {
434 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
435 if (isa<SCEVCouldNotCompute>(Val: ExitCountOuter)) {
436 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
437 return false;
438 }
439 if (L->getNumBackEdges() != 1) {
440 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
441 return false;
442 }
443 if (!L->getExitingBlock()) {
444 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
445 return false;
446 }
447 }
448 return true;
449}
450
451namespace {
452
453/// LoopInterchangeLegality checks if it is legal to interchange the loop.
454class LoopInterchangeLegality {
455public:
456 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
457 OptimizationRemarkEmitter *ORE, DominatorTree *DT)
458 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), DT(DT), ORE(ORE) {}
459
460 /// Check if the loops can be interchanged.
461 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
462 CharMatrix &DepMatrix);
463
464 /// Check if the loop structure is understood. We do not handle triangular
465 /// loops for now.
466 bool isLoopStructureUnderstood();
467
468 bool currentLimitations();
469
470 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
471 return OuterInnerReductions;
472 }
473
474 const ArrayRef<PHINode *> getInnerLoopInductions() const {
475 return InnerLoopInductions;
476 }
477
478 ArrayRef<Instruction *> getHasNoWrapReductions() const {
479 return HasNoWrapReductions;
480 }
481
482 ArrayRef<Instruction *> getHasNoInfInsts() const { return HasNoInfInsts; }
483
484 /// Record reductions in the inner loop. Currently supported reductions:
485 /// - initialized from a constant.
486 /// - reduction PHI node has only one user.
487 /// - located in the innermost loop.
488 struct InnerReduction {
489 /// The reduction itself.
490 PHINode *Reduction;
491 Value *Init;
492 Value *Next;
493 /// The Lcssa PHI.
494 PHINode *LcssaPhi;
495 /// Store reduction result into memory object.
496 StoreInst *LcssaStore;
497 /// The memory Location.
498 Value *MemRef;
499 Type *ElemTy;
500 };
501
502 ArrayRef<InnerReduction> getInnerReductions() const {
503 return InnerReductions;
504 }
505
506private:
507 bool tightlyNested(Loop *Outer, Loop *Inner);
508 bool containsUnsafeInstructions(BasicBlock *BB, Instruction *Skip);
509
510 /// Traverse all PHI nodes in the header of each loop in the loop nest
511 /// starting from \p OuterLoop, and perform the following checks:
512 ///
513 /// - Identify induction variables in the child loop of \p OuterLoop.
514 /// - Check for reductions across the inner loop and \p OuterLoop.
515 /// - Detect unsupported PHI nodes.
516 ///
517 /// Return false if any unsupported PHI node is found or if no induction
518 /// variable is found in the child loop of \p OuterLoop. Otherwise return
519 /// true.
520 bool checkInductionsAndReductions(Loop *OuterLoop);
521
522 /// Detect and record the reduction of the inner loop. Add them to
523 /// InnerReductions.
524 ///
525 /// innerloop:
526 /// Re = phi<0.0, Next>
527 /// Next = Re op ...
528 /// OuterLoopLatch:
529 /// Lcssa = phi<Next> ; lcssa phi
530 /// store Lcssa, MemRef ; LcssaStore
531 ///
532 bool isInnerReduction(Loop *L, PHINode *Phi,
533 SmallVectorImpl<Instruction *> &HasNoWrapInsts);
534
535 Loop *OuterLoop;
536 Loop *InnerLoop;
537
538 ScalarEvolution *SE;
539 DominatorTree *DT;
540
541 /// Interface to emit optimization remarks.
542 OptimizationRemarkEmitter *ORE;
543
544 /// Set of reduction PHIs taking part of a reduction across the inner and
545 /// outer loop.
546 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
547
548 /// Set of inner loop induction PHIs
549 SmallVector<PHINode *, 8> InnerLoopInductions;
550
551 /// Hold instructions that have nuw/nsw flags and involved in reductions,
552 /// like integer addition/multiplication. Those flags must be dropped when
553 /// interchanging the loops.
554 SmallVector<Instruction *, 4> HasNoWrapReductions;
555
556 /// Hold instructions that have ninf flags and involved in reductions. Those
557 /// flags must be dropped when interchanging the loops.
558 SmallVector<Instruction *, 4> HasNoInfInsts;
559
560 /// Vector of reductions in the inner loop.
561 SmallVector<InnerReduction, 8> InnerReductions;
562};
563
564/// Manages information utilized by the profitability check for cache. The main
565/// purpose of this class is to delay the computation of CacheCost until it is
566/// actually needed.
567class CacheCostManager {
568 Loop *OutermostLoop;
569 LoopStandardAnalysisResults *AR;
570 DependenceInfo *DI;
571
572 /// CacheCost for \ref OutermostLoop. Once it is computed, it is cached. Note
573 /// that the result can be nullptr.
574 std::optional<std::unique_ptr<CacheCost>> CC;
575
576 /// Maps each loop to an index representing the optimal position within the
577 /// loop-nest, as determined by the cache cost analysis.
578 DenseMap<const Loop *, unsigned> CostMap;
579
580 void computeIfUnitinialized();
581
582public:
583 CacheCostManager(Loop *OutermostLoop, LoopStandardAnalysisResults *AR,
584 DependenceInfo *DI)
585 : OutermostLoop(OutermostLoop), AR(AR), DI(DI) {}
586 CacheCost *getCacheCost();
587 const DenseMap<const Loop *, unsigned> &getCostMap();
588};
589
590/// LoopInterchangeProfitability checks if it is profitable to interchange the
591/// loop.
592class LoopInterchangeProfitability {
593public:
594 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
595 OptimizationRemarkEmitter *ORE)
596 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
597
598 /// Check if the loop interchange is profitable.
599 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
600 unsigned InnerLoopId, unsigned OuterLoopId,
601 CharMatrix &DepMatrix, CacheCostManager &CCM);
602
603private:
604 int getInstrOrderCost();
605 std::optional<bool> isProfitablePerLoopCacheAnalysis(
606 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC);
607 std::optional<bool> isProfitablePerInstrOrderCost();
608 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
609 unsigned OuterLoopId,
610 CharMatrix &DepMatrix);
611 Loop *OuterLoop;
612 Loop *InnerLoop;
613
614 /// Scev analysis.
615 ScalarEvolution *SE;
616
617 /// Interface to emit optimization remarks.
618 OptimizationRemarkEmitter *ORE;
619};
620
621/// LoopInterchangeTransform interchanges the loop.
622class LoopInterchangeTransform {
623public:
624 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
625 LoopInfo *LI, DominatorTree *DT,
626 const LoopInterchangeLegality &LIL)
627 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
628
629 /// Interchange OuterLoop and InnerLoop.
630 bool transform(ArrayRef<Instruction *> DropNoWrapInsts,
631 ArrayRef<Instruction *> DropNoInfInsts);
632 void reduction2Memory();
633 void restructureLoops(Loop *NewInner, Loop *NewOuter,
634 BasicBlock *OrigInnerPreHeader,
635 BasicBlock *OrigOuterPreHeader);
636 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
637
638private:
639 bool adjustLoopLinks();
640 bool adjustLoopBranches();
641
642 Loop *OuterLoop;
643 Loop *InnerLoop;
644
645 /// Scev analysis.
646 ScalarEvolution *SE;
647
648 LoopInfo *LI;
649 DominatorTree *DT;
650
651 const LoopInterchangeLegality &LIL;
652};
653
654struct LoopInterchange {
655 ScalarEvolution *SE = nullptr;
656 LoopInfo *LI = nullptr;
657 DependenceInfo *DI = nullptr;
658 DominatorTree *DT = nullptr;
659 LoopStandardAnalysisResults *AR = nullptr;
660
661 /// Interface to emit optimization remarks.
662 OptimizationRemarkEmitter *ORE;
663
664 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
665 DominatorTree *DT, LoopStandardAnalysisResults *AR,
666 OptimizationRemarkEmitter *ORE)
667 : SE(SE), LI(LI), DI(DI), DT(DT), AR(AR), ORE(ORE) {}
668
669 bool run(Loop *L) {
670 if (L->getParentLoop())
671 return false;
672 SmallVector<Loop *, 8> LoopList;
673 populateWorklist(L&: *L, LoopList);
674 return processLoopList(LoopList);
675 }
676
677 bool run(LoopNest &LN) {
678 SmallVector<Loop *, 8> LoopList(LN.getLoops());
679 for (unsigned I = 1; I < LoopList.size(); ++I)
680 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
681 return false;
682 return processLoopList(LoopList);
683 }
684
685 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
686 // TODO: Add a better heuristic to select the loop to be interchanged based
687 // on the dependence matrix. Currently we select the innermost loop.
688 return LoopList.size() - 1;
689 }
690
691 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
692 bool Changed = false;
693
694 // Ensure proper loop nest depth.
695 assert(hasSupportedLoopDepth(LoopList, *ORE) &&
696 "Unsupported depth of loop nest.");
697
698 unsigned LoopNestDepth = LoopList.size();
699
700 LLVM_DEBUG({
701 dbgs() << "Processing LoopList of size = " << LoopNestDepth
702 << " containing the following loops:\n";
703 for (auto *L : LoopList) {
704 dbgs() << " - ";
705 L->print(dbgs());
706 }
707 });
708
709 CharMatrix DependencyMatrix;
710 Loop *OuterMostLoop = *(LoopList.begin());
711 if (!populateDependencyMatrix(DepMatrix&: DependencyMatrix, Level: LoopNestDepth,
712 L: OuterMostLoop, DI, SE, ORE)) {
713 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
714 return false;
715 }
716
717 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
718 printDepMatrix(DependencyMatrix));
719
720 // Get the Outermost loop exit.
721 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
722 if (!LoopNestExit) {
723 LLVM_DEBUG(dbgs() << "OuterMostLoop '" << OuterMostLoop->getName()
724 << "' needs an unique exit block");
725 return false;
726 }
727
728 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
729 CacheCostManager CCM(LoopList[0], AR, DI);
730 // We try to achieve the globally optimal memory access for the loopnest,
731 // and do interchange based on a bubble-sort fasion. We start from
732 // the innermost loop, move it outwards to the best possible position
733 // and repeat this process.
734 for (unsigned j = SelecLoopId; j > 0; j--) {
735 bool ChangedPerIter = false;
736 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
737 bool Interchanged =
738 processLoop(LoopList, InnerLoopId: i, OuterLoopId: i - 1, DependencyMatrix, CCM);
739 ChangedPerIter |= Interchanged;
740 Changed |= Interchanged;
741 }
742 // Early abort if there was no interchange during an entire round of
743 // moving loops outwards.
744 if (!ChangedPerIter)
745 break;
746 }
747 return Changed;
748 }
749
750 bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId,
751 unsigned OuterLoopId,
752 std::vector<std::vector<char>> &DependencyMatrix,
753 CacheCostManager &CCM) {
754 Loop *OuterLoop = LoopList[OuterLoopId];
755 Loop *InnerLoop = LoopList[InnerLoopId];
756 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
757 << " and OuterLoopId = " << OuterLoopId << "\n");
758 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE, DT);
759 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DepMatrix&: DependencyMatrix)) {
760 LLVM_DEBUG(dbgs() << "Cannot prove legality, not interchanging loops '"
761 << OuterLoop->getName() << "' and '"
762 << InnerLoop->getName() << "'\n");
763 return false;
764 }
765 LLVM_DEBUG(dbgs() << "Loops '" << OuterLoop->getName() << "' and '"
766 << InnerLoop->getName()
767 << "' are legal to interchange\n");
768 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
769 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
770 DepMatrix&: DependencyMatrix, CCM)) {
771 LLVM_DEBUG(dbgs() << "Interchanging loops '" << OuterLoop->getName()
772 << "' and '" << InnerLoop->getName()
773 << "' not profitable.\n");
774 return false;
775 }
776
777 ORE->emit(RemarkBuilder: [&]() {
778 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
779 InnerLoop->getStartLoc(),
780 InnerLoop->getHeader())
781 << "Loop interchanged with enclosing loop.";
782 });
783
784 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
785 LIT.transform(DropNoWrapInsts: LIL.getHasNoWrapReductions(), DropNoInfInsts: LIL.getHasNoInfInsts());
786 LLVM_DEBUG(dbgs() << "Loops interchanged: outer loop '"
787 << OuterLoop->getName() << "' and inner loop '"
788 << InnerLoop->getName() << "'\n");
789 LoopsInterchanged++;
790
791 llvm::formLCSSARecursively(L&: *OuterLoop, DT: *DT, LI, SE);
792
793 // Loops interchanged, update LoopList accordingly.
794 std::swap(a&: LoopList[OuterLoopId], b&: LoopList[InnerLoopId]);
795 // Update the DependencyMatrix
796 interChangeDependencies(DepMatrix&: DependencyMatrix, FromIndx: InnerLoopId, ToIndx: OuterLoopId);
797
798 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
799 printDepMatrix(DependencyMatrix));
800
801 return true;
802 }
803};
804
805} // end anonymous namespace
806
807bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB,
808 Instruction *Skip) {
809 return any_of(Range&: *BB, P: [Skip](const Instruction &I) {
810 if (&I == Skip)
811 return false;
812 return I.mayHaveSideEffects() || I.mayReadFromMemory();
813 });
814}
815
816bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
817 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
818 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
819 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
820
821 LLVM_DEBUG(dbgs() << "Checking if loops '" << OuterLoop->getName()
822 << "' and '" << InnerLoop->getName()
823 << "' are tightly nested\n");
824
825 // In a perfectly nested loop the outer header branches only into the inner
826 // loop. If it can also reach the outer latch, it conditionally guards the
827 // inner loop (an imperfect nest), so the inner loop runs on only a subset of
828 // the outer iterations. Interchanging such a nest would run the inner loop on
829 // every outer iteration, including the guarded-off ones, which is illegal
830 // when the inner loop relies on the guard to terminate (e.g. an eq/ne exit
831 // whose trip count is degenerate once the guard is false). Reject by allowing
832 // the outer header to branch only into the inner loop.
833 //
834 // TODO: This is conservative. A guarded nest is still safe to interchange
835 // when the inner loop has a computable trip count that is empty exactly when
836 // the guard is false, e.g.:
837 // for (i = 0; i < N; i++)
838 // if (M > 0) // loop-invariant guard
839 // for (j = 0; j < M; j++) // empty when M <= 0
840 // A[j][i] = ...;
841 // Interchanging is legal here because the inner loop runs zero times on the
842 // guarded-off iterations.
843 for (BasicBlock *Succ : successors(BB: OuterLoopHeader))
844 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader())
845 return false;
846
847 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
848
849 // The inner loop reduction pattern requires storing the LCSSA PHI in
850 // the OuterLoop Latch. Therefore, when reduction2Memory is enabled, skip
851 // that store during checks.
852 Instruction *Skip = nullptr;
853 assert(InnerReductions.size() <= 1 &&
854 "So far we only support at most one reduction.");
855 if (InnerReductions.size() == 1)
856 Skip = InnerReductions[0].LcssaStore;
857
858 // We do not have any basic block in between now make sure the outer header
859 // and outer loop latch doesn't contain any unsafe instructions.
860 if (containsUnsafeInstructions(BB: OuterLoopHeader, Skip) ||
861 containsUnsafeInstructions(BB: OuterLoopLatch, Skip))
862 return false;
863
864 // Also make sure the inner loop preheader does not contain any unsafe
865 // instructions. Note that all instructions in the preheader will be moved to
866 // the outer loop header when interchanging.
867 if (InnerLoopPreHeader != OuterLoopHeader &&
868 containsUnsafeInstructions(BB: InnerLoopPreHeader, Skip))
869 return false;
870
871 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
872 // Ensure the inner loop exit block flows to the outer loop latch possibly
873 // through empty blocks.
874 const BasicBlock &SuccInner =
875 LoopNest::skipEmptyBlockUntil(From: InnerLoopExit, End: OuterLoopLatch);
876 if (&SuccInner != OuterLoopLatch) {
877 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
878 << " does not lead to the outer loop latch.\n";);
879 return false;
880 }
881 // The inner loop exit block does flow to the outer loop latch and not some
882 // other BBs, now make sure it contains safe instructions, since it will be
883 // moved into the (new) inner loop after interchange.
884 if (containsUnsafeInstructions(BB: InnerLoopExit, Skip))
885 return false;
886
887 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
888 // We have a perfect loop nest.
889 return true;
890}
891
892bool LoopInterchangeLegality::isLoopStructureUnderstood() {
893 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
894 for (PHINode *InnerInduction : InnerLoopInductions) {
895 unsigned Num = InnerInduction->getNumOperands();
896 for (unsigned i = 0; i < Num; ++i) {
897 Value *Val = InnerInduction->getOperand(i_nocapture: i);
898 if (isa<Constant>(Val))
899 continue;
900 Instruction *I = dyn_cast<Instruction>(Val);
901 if (!I)
902 return false;
903 // TODO: Handle triangular loops.
904 // e.g. for(int i=0;i<N;i++)
905 // for(int j=i;j<N;j++)
906 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
907 if (InnerInduction->getIncomingBlock(i: IncomBlockIndx) ==
908 InnerLoopPreheader &&
909 !OuterLoop->isLoopInvariant(V: I)) {
910 return false;
911 }
912 }
913 }
914
915 // TODO: Handle triangular loops of another form.
916 // e.g. for(int i=0;i<N;i++)
917 // for(int j=0;j<i;j++)
918 // or,
919 // for(int i=0;i<N;i++)
920 // for(int j=0;j*i<N;j++)
921 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
922 CondBrInst *InnerLoopLatchBI =
923 dyn_cast<CondBrInst>(Val: InnerLoopLatch->getTerminator());
924 if (!InnerLoopLatchBI)
925 return false;
926
927 CmpInst *InnerLoopCmp = dyn_cast<CmpInst>(Val: InnerLoopLatchBI->getCondition());
928 if (!InnerLoopCmp)
929 return false;
930
931 Value *Op0 = InnerLoopCmp->getOperand(i_nocapture: 0);
932 Value *Op1 = InnerLoopCmp->getOperand(i_nocapture: 1);
933
934 // LHS and RHS of the inner loop exit condition, e.g.,
935 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
936 Value *Left = nullptr;
937 Value *Right = nullptr;
938
939 // Check if V only involves inner loop induction variable.
940 // Return true if V is InnerInduction, or a cast from
941 // InnerInduction, or a binary operator that involves
942 // InnerInduction and a constant.
943 std::function<bool(Value *)> IsPathToInnerIndVar;
944 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
945 if (llvm::is_contained(Range&: InnerLoopInductions, Element: V))
946 return true;
947 if (isa<Constant>(Val: V))
948 return true;
949 const Instruction *I = dyn_cast<Instruction>(Val: V);
950 if (!I)
951 return false;
952 if (isa<CastInst>(Val: I))
953 return IsPathToInnerIndVar(I->getOperand(i: 0));
954 if (isa<BinaryOperator>(Val: I))
955 return IsPathToInnerIndVar(I->getOperand(i: 0)) &&
956 IsPathToInnerIndVar(I->getOperand(i: 1));
957 return false;
958 };
959
960 // In case of multiple inner loop indvars, it is okay if LHS and RHS
961 // are both inner indvar related variables.
962 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
963 return true;
964
965 // Otherwise we check if the cmp instruction compares an inner indvar
966 // related variable (Left) with a outer loop invariant (Right).
967 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Val: Op0)) {
968 Left = Op0;
969 Right = Op1;
970 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Val: Op1)) {
971 Left = Op1;
972 Right = Op0;
973 }
974
975 if (Left == nullptr)
976 return false;
977
978 const SCEV *S = SE->getSCEV(V: Right);
979 if (!SE->isLoopInvariant(S, L: OuterLoop))
980 return false;
981
982 return true;
983}
984
985// If SV is a LCSSA PHI node with a single incoming value, return the incoming
986// value.
987static Value *followLCSSA(Value *SV) {
988 PHINode *PHI = dyn_cast<PHINode>(Val: SV);
989 if (!PHI)
990 return SV;
991
992 if (PHI->getNumIncomingValues() != 1)
993 return SV;
994 return followLCSSA(SV: PHI->getIncomingValue(i: 0));
995}
996
997static bool checkReductionKind(Loop *L, PHINode *PHI,
998 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
999 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
1000 RecurrenceDescriptor RD;
1001 if (RecurrenceDescriptor::isReductionPHI(Phi: PHI, TheLoop: L, RedDes&: RD)) {
1002 // Detect floating point reduction only when it can be reordered.
1003 if (RD.getExactFPMathInst() != nullptr)
1004 return false;
1005
1006 RecurKind RK = RD.getRecurrenceKind();
1007 switch (RK) {
1008 case RecurKind::Or:
1009 case RecurKind::And:
1010 case RecurKind::Xor:
1011 case RecurKind::SMin:
1012 case RecurKind::SMax:
1013 case RecurKind::UMin:
1014 case RecurKind::UMax:
1015 return true;
1016
1017 // Interchanging the loops that contain AnyOf reduction is not always legal.
1018 // Especially, when the result value of the AnyOf is not loop-invariant with
1019 // respect to the outer loop, interchanging may change the semantics. The
1020 // following is an example of such case:
1021 // int A = {{ 1, 0 }, { 0, 1 }};
1022 // int red = 0;
1023 // for (int i = 0; i < 2; i++)
1024 // for (int j = 0; j < 2; j++)
1025 // red = (A[j][i] == 0) ? i + 1 : red;
1026 //
1027 // TODO: We may be able to support interchanging loops with AnyOf reduction
1028 // by checking the operand of the reduction is loop-invariant with respect
1029 // to the outer loop as well.
1030 case RecurKind::AnyOf:
1031 return false;
1032
1033 // Changing the order of floating-point operations may alter the results. If
1034 // a certain instruction has the ninf flag, it means that reordering can
1035 // produce a poison value, which may lead to undefined behavior. To prevent
1036 // this, we must drop the ninf flags if we decide to apply the
1037 // transformation.
1038 case RecurKind::FAdd:
1039 case RecurKind::FMul:
1040 case RecurKind::FMin:
1041 case RecurKind::FMax:
1042 case RecurKind::FMinimum:
1043 case RecurKind::FMaximum:
1044 case RecurKind::FMinimumNum:
1045 case RecurKind::FMaximumNum:
1046 case RecurKind::FMulAdd:
1047 for (Instruction *I : RD.getReductionOpChain(Phi: PHI, L))
1048 if (isa<FPMathOperator>(Val: I) && I->hasNoInfs())
1049 HasNoInfInsts.push_back(Elt: I);
1050 return true;
1051
1052 // Change the order of integer addition/multiplication may change the
1053 // semantics. Consider the following case:
1054 //
1055 // int A[2][2] = {{ INT_MAX, INT_MAX }, { INT_MIN, INT_MIN }};
1056 // int sum = 0;
1057 // for (int i = 0; i < 2; i++)
1058 // for (int j = 0; j < 2; j++)
1059 // sum += A[j][i];
1060 //
1061 // If the above loops are exchanged, the addition will cause an
1062 // overflow. To prevent this, we must drop the nuw/nsw flags from the
1063 // addition/multiplication instructions when we actually exchanges the
1064 // loops.
1065 case RecurKind::Add:
1066 case RecurKind::Mul: {
1067 unsigned OpCode = RecurrenceDescriptor::getOpcode(Kind: RK);
1068 SmallVector<Instruction *, 4> Ops = RD.getReductionOpChain(Phi: PHI, L);
1069
1070 // Bail out when we fail to collect reduction instructions chain.
1071 if (Ops.empty())
1072 return false;
1073
1074 for (Instruction *I : Ops) {
1075 assert(I->getOpcode() == OpCode &&
1076 "Expected the instruction to be the reduction operation");
1077 (void)OpCode;
1078
1079 // If the instruction has nuw/nsw flags, we must drop them when the
1080 // transformation is actually performed.
1081 if (I->hasNoSignedWrap() || I->hasNoUnsignedWrap())
1082 HasNoWrapInsts.push_back(Elt: I);
1083 }
1084 return true;
1085 }
1086
1087 default:
1088 return false;
1089 }
1090 } else
1091 return false;
1092}
1093
1094// Check V's users to see if it is involved in a reduction in L.
1095static PHINode *
1096findInnerReductionPhi(Loop *L, Value *V,
1097 SmallVectorImpl<Instruction *> &HasNoWrapInsts,
1098 SmallVectorImpl<Instruction *> &HasNoInfInsts) {
1099 // Reduction variables cannot be constants.
1100 if (isa<Constant>(Val: V))
1101 return nullptr;
1102
1103 for (Value *User : V->users()) {
1104 if (PHINode *PHI = dyn_cast<PHINode>(Val: User)) {
1105 if (PHI->getNumIncomingValues() == 1)
1106 continue;
1107
1108 if (checkReductionKind(L, PHI, HasNoWrapInsts, HasNoInfInsts))
1109 return PHI;
1110 else
1111 return nullptr;
1112 }
1113 }
1114
1115 return nullptr;
1116}
1117
1118bool LoopInterchangeLegality::isInnerReduction(
1119 Loop *L, PHINode *Phi, SmallVectorImpl<Instruction *> &HasNoWrapInsts) {
1120
1121 // Only support reduction2Mem when the loop nest to be interchanged is
1122 // the innermost two loops.
1123 if (!L->isInnermost()) {
1124 LLVM_DEBUG(dbgs() << "Only supported when the loop is the innermost.\n");
1125 ORE->emit(RemarkBuilder: [&]() {
1126 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1127 L->getStartLoc(), L->getHeader())
1128 << "Only supported when the loop is the innermost.";
1129 });
1130 return false;
1131 }
1132
1133 if (Phi->getNumIncomingValues() != 2)
1134 return false;
1135
1136 Value *Init = Phi->getIncomingValueForBlock(BB: L->getLoopPreheader());
1137 Value *Next = Phi->getIncomingValueForBlock(BB: L->getLoopLatch());
1138
1139 // So far only supports constant initial value.
1140 if (!isa<Constant>(Val: Init)) {
1141 LLVM_DEBUG(
1142 dbgs()
1143 << "Only supported for the reduction with a constant initial value.\n");
1144 ORE->emit(RemarkBuilder: [&]() {
1145 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1146 L->getStartLoc(), L->getHeader())
1147 << "Only supported for the reduction with a constant initial "
1148 "value.";
1149 });
1150 return false;
1151 }
1152
1153 // The reduction result must live in the inner loop.
1154 if (Instruction *I = dyn_cast<Instruction>(Val: Next)) {
1155 BasicBlock *BB = I->getParent();
1156 if (!L->contains(BB))
1157 return false;
1158 }
1159
1160 // The reduction should have only one user.
1161 if (!Phi->hasOneUser())
1162 return false;
1163
1164 // Check the reduction kind.
1165 if (!checkReductionKind(L, PHI: Phi, HasNoWrapInsts, HasNoInfInsts))
1166 return false;
1167
1168 // Find lcssa_phi in OuterLoop's Latch
1169 BasicBlock *ExitBlock = L->getExitBlock();
1170 if (!ExitBlock)
1171 return false;
1172
1173 PHINode *Lcssa = NULL;
1174 for (auto *U : Next->users()) {
1175 if (auto *P = dyn_cast<PHINode>(Val: U)) {
1176 if (P == Phi)
1177 continue;
1178
1179 if (Lcssa == NULL && P->getParent() == ExitBlock &&
1180 P->getIncomingValueForBlock(BB: L->getLoopLatch()) == Next)
1181 Lcssa = P;
1182 else
1183 return false;
1184 } else
1185 return false;
1186 }
1187 if (!Lcssa)
1188 return false;
1189
1190 if (!Lcssa->hasOneUser()) {
1191 LLVM_DEBUG(dbgs() << "Only supported when the reduction is used once in "
1192 "the outer loop.\n");
1193 ORE->emit(RemarkBuilder: [&]() {
1194 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1195 L->getStartLoc(), L->getHeader())
1196 << "Only supported when the reduction is used once in the outer "
1197 "loop.";
1198 });
1199 return false;
1200 }
1201
1202 StoreInst *LcssaStore =
1203 dyn_cast<StoreInst>(Val: Lcssa->getUniqueUndroppableUser());
1204 if (!LcssaStore || LcssaStore->getParent() != ExitBlock)
1205 return false;
1206
1207 Value *MemRef = LcssaStore->getOperand(i_nocapture: 1);
1208 Type *ElemTy = LcssaStore->getOperand(i_nocapture: 0)->getType();
1209
1210 // LcssaStore stores the reduction result in BB.
1211 // When the reduction is initialized from a constant value, we need to load
1212 // from the memory object into the target basic block of the inner loop. This
1213 // means the memory reference was used prematurely. So we must ensure that the
1214 // memory reference does not dominate the target basic block.
1215 // TODO: Move the memory reference definition into the loop header.
1216 if (!DT->dominates(Def: dyn_cast<Instruction>(Val: MemRef), BB: L->getHeader())) {
1217 LLVM_DEBUG(dbgs() << "Only supported when memory reference dominate "
1218 "the inner loop.\n");
1219 ORE->emit(RemarkBuilder: [&]() {
1220 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1221 L->getStartLoc(), L->getHeader())
1222 << "Only supported when memory reference dominate the inner "
1223 "loop.";
1224 });
1225 return false;
1226 }
1227
1228 // Found a reduction in the inner loop.
1229 InnerReduction SR;
1230 SR.Reduction = Phi;
1231 SR.Init = Init;
1232 SR.Next = Next;
1233 SR.LcssaPhi = Lcssa;
1234 SR.LcssaStore = LcssaStore;
1235 SR.MemRef = MemRef;
1236 SR.ElemTy = ElemTy;
1237
1238 InnerReductions.push_back(Elt: SR);
1239 return true;
1240}
1241
1242bool LoopInterchangeLegality::checkInductionsAndReductions(Loop *OuterLoop) {
1243 auto ChildLoop = [](Loop *L) {
1244 assert(L->getSubLoops().size() <= 1 &&
1245 "Expect at most one child loop for now.");
1246 return L->getSubLoops().empty() ? nullptr : L->getSubLoops().front();
1247 };
1248
1249 Loop *InnerLoop = ChildLoop(OuterLoop);
1250 for (Loop *CurLoop = OuterLoop; CurLoop; CurLoop = ChildLoop(CurLoop)) {
1251 for (PHINode &PHI : CurLoop->getHeader()->phis()) {
1252 InductionDescriptor ID;
1253 if (InductionDescriptor::isInductionPHI(Phi: &PHI, L: CurLoop, SE, D&: ID)) {
1254 if (CurLoop == InnerLoop) {
1255 const SCEV *Step = ID.getStep();
1256 if (!SE->isLoopInvariant(S: Step, L: OuterLoop))
1257 return false;
1258 InnerLoopInductions.push_back(Elt: &PHI);
1259 }
1260 continue;
1261 }
1262
1263 if (CurLoop == OuterLoop) {
1264 // PHIs in inner loops need to be part of a reduction in the outer loop,
1265 if (PHI.getNumIncomingValues() != 2) {
1266 LLVM_DEBUG(dbgs() << "Only PHI nodes in the outer loop header with 2 "
1267 "incoming values are supported.\n");
1268 return false;
1269 }
1270 // Check if we have a PHI node in the outer loop that has a reduction
1271 // result from the inner loop as an incoming value.
1272 Value *V = followLCSSA(
1273 SV: PHI.getIncomingValueForBlock(BB: OuterLoop->getLoopLatch()));
1274 PHINode *InnerRedPhi = findInnerReductionPhi(
1275 L: InnerLoop, V, HasNoWrapInsts&: HasNoWrapReductions, HasNoInfInsts);
1276
1277 // Reject if PHI has users other than InnerRedPhi. The typical case is
1278 // as follows:
1279 //
1280 // o.header:
1281 // %red.o = phi [ 0, ... ], [ %red.next, %o.latch ]
1282 // br label %i.header
1283 //
1284 // i.header:
1285 // %red.i = phi [ %red.o, %o.header ], [ %red.next, %i.latch ]
1286 // br label %i.body
1287 //
1288 // i.body:
1289 // store %red.o to %mem
1290 // ...
1291 //
1292 if (!InnerRedPhi ||
1293 !llvm::is_contained(Range: InnerRedPhi->incoming_values(), Element: &PHI) ||
1294 !all_of(Range: PHI.users(),
1295 P: [InnerRedPhi](User *U) { return U == InnerRedPhi; })) {
1296 LLVM_DEBUG(
1297 dbgs()
1298 << "Failed to recognize PHI as an induction or reduction.\n");
1299 ORE->emit(RemarkBuilder: [&]() {
1300 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
1301 OuterLoop->getStartLoc(),
1302 OuterLoop->getHeader())
1303 << "Only outer loops with induction or reduction PHI nodes "
1304 "can be interchanged currently.";
1305 });
1306 return false;
1307 }
1308
1309 OuterInnerReductions.insert(Ptr: &PHI);
1310 OuterInnerReductions.insert(Ptr: InnerRedPhi);
1311 } else {
1312 if (OuterInnerReductions.count(Ptr: &PHI)) {
1313 LLVM_DEBUG(dbgs() << "Found a reduction across the outer loop.\n");
1314 } else if (EnableReduction2Memory &&
1315 isInnerReduction(L: CurLoop, Phi: &PHI, HasNoWrapInsts&: HasNoWrapReductions)) {
1316 LLVM_DEBUG(dbgs() << "Found a reduction in the inner loop: \n"
1317 << PHI << '\n');
1318 } else {
1319 ORE->emit(RemarkBuilder: [&]() {
1320 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
1321 CurLoop->getStartLoc(),
1322 CurLoop->getHeader())
1323 << "Only inner loops with induction or reduction PHI nodes "
1324 "can be interchanged currently.";
1325 });
1326 return false;
1327 }
1328 }
1329 }
1330
1331 // For now we only support at most one reduction.
1332 if (InnerReductions.size() > 1) {
1333 LLVM_DEBUG(dbgs() << "Only supports at most one reduction.\n");
1334 ORE->emit(RemarkBuilder: [&]() {
1335 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerReduction",
1336 CurLoop->getStartLoc(),
1337 CurLoop->getHeader())
1338 << "Only supports at most one reduction.";
1339 });
1340 return false;
1341 }
1342 }
1343
1344 return !InnerLoopInductions.empty();
1345}
1346
1347// This function indicates the current limitations in the transform as a result
1348// of which we do not proceed.
1349bool LoopInterchangeLegality::currentLimitations() {
1350 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1351
1352 // transform currently expects the loop latches to also be the exiting
1353 // blocks.
1354 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
1355 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
1356 !isa<CondBrInst>(Val: InnerLoopLatch->getTerminator()) ||
1357 !isa<CondBrInst>(Val: OuterLoop->getLoopLatch()->getTerminator())) {
1358 LLVM_DEBUG(
1359 dbgs() << "Loops where the latch is not the exiting block are not"
1360 << " supported currently.\n");
1361 ORE->emit(RemarkBuilder: [&]() {
1362 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
1363 OuterLoop->getStartLoc(),
1364 OuterLoop->getHeader())
1365 << "Loops where the latch is not the exiting block cannot be"
1366 " interchange currently.";
1367 });
1368 return true;
1369 }
1370
1371 // TODO: Triangular loops are not handled for now.
1372 if (!isLoopStructureUnderstood()) {
1373 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
1374 ORE->emit(RemarkBuilder: [&]() {
1375 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
1376 InnerLoop->getStartLoc(),
1377 InnerLoop->getHeader())
1378 << "Inner loop structure not understood currently.";
1379 });
1380 return true;
1381 }
1382
1383 // Currently, we do not support loops that have a predecessor entering the
1384 // loop via an indirectbr.
1385 for (Loop *L : {OuterLoop, InnerLoop}) {
1386 BasicBlock *Header = L->getHeader();
1387 for (BasicBlock *Pred : predecessors(BB: Header)) {
1388 if (L->contains(BB: Pred))
1389 continue;
1390 if (isa<IndirectBrInst>(Val: Pred->getTerminator())) {
1391 LLVM_DEBUG(
1392 dbgs() << "Indirect branch found in the loop predecessor.\n");
1393 ORE->emit(RemarkBuilder: [&]() {
1394 return OptimizationRemarkMissed(DEBUG_TYPE, "IndirectBranchPreheader",
1395 L->getStartLoc(), L->getHeader())
1396 << "Indirect branch found in the loop predecessor.";
1397 });
1398 return true;
1399 }
1400 }
1401 }
1402
1403 // Currently, we do not support loops where the inner loop header has
1404 // duplicate successors.
1405 SmallPtrSet<BasicBlock *, 2> InnerLoopHeaderSuccs;
1406 for (BasicBlock *Succ : successors(BB: InnerLoop->getHeader()))
1407 if (!InnerLoopHeaderSuccs.insert(Ptr: Succ).second)
1408 return true;
1409
1410 return false;
1411}
1412
1413/// We currently only support LCSSA PHI nodes in the inner loop exit if their
1414/// users are either of the following:
1415///
1416/// - Reduction PHIs
1417/// - PHIs outside the outer loop
1418/// - PHIs belonging to the latch of the outer loop
1419///
1420/// These conditions mean that we are only interested in the final value after
1421/// the inner loop.
1422static bool
1423areInnerLoopExitPHIsSupported(Loop *OuterL, Loop *InnerL,
1424 SmallPtrSetImpl<PHINode *> &Reductions,
1425 PHINode *LcssaReduction) {
1426 BasicBlock *InnerExit = InnerL->getUniqueExitBlock();
1427 for (PHINode &PHI : InnerExit->phis()) {
1428 // The reduction LCSSA PHI will have only one incoming block, which comes
1429 // from the loop latch.
1430 if (PHI.getNumIncomingValues() > 1)
1431 return false;
1432 // The reduction LCSSA PHI's store user is rewritten by reduction2Memory();
1433 // skip its user-check but keep validating the remaining LCSSA PHIs.
1434 if (&PHI == LcssaReduction)
1435 continue;
1436 if (any_of(Range: PHI.users(), P: [&Reductions, OuterL](User *U) {
1437 PHINode *PN = dyn_cast<PHINode>(Val: U);
1438 if (!PN)
1439 return true;
1440 if (Reductions.count(Ptr: PN))
1441 return false;
1442 BasicBlock *PB = PN->getParent();
1443 if (!OuterL->contains(BB: PB))
1444 return false;
1445 return PB != OuterL->getLoopLatch();
1446 }))
1447 return false;
1448 }
1449 return true;
1450}
1451
1452// We currently support LCSSA PHI nodes in the outer loop exit, if their
1453// incoming values do not come from the outer loop latch or if the
1454// outer loop latch has a single predecessor. In that case, the value will
1455// be available if both the inner and outer loop conditions are true, which
1456// will still be true after interchanging. If we have multiple predecessor,
1457// that may not be the case, e.g. because the outer loop latch may be executed
1458// if the inner loop is not executed.
1459static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
1460 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
1461 for (PHINode &PHI : LoopNestExit->phis()) {
1462 for (Value *Incoming : PHI.incoming_values()) {
1463 Instruction *IncomingI = dyn_cast<Instruction>(Val: Incoming);
1464 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
1465 continue;
1466
1467 // The incoming value is defined in the outer loop latch. Currently we
1468 // only support that in case the outer loop latch has a single predecessor.
1469 // This guarantees that the outer loop latch is executed if and only if
1470 // the inner loop is executed (because tightlyNested() guarantees that the
1471 // outer loop header only branches to the inner loop or the outer loop
1472 // latch).
1473 // FIXME: We could weaken this logic and allow multiple predecessors,
1474 // if the values are produced outside the loop latch. We would need
1475 // additional logic to update the PHI nodes in the exit block as
1476 // well.
1477 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
1478 return false;
1479 }
1480 }
1481 return true;
1482}
1483
1484/// The transform clones the inner latch's exit condition into the new latch
1485/// (see MoveInstructions in LoopInterchangeTransform::transform), but it does
1486/// not relocate PHI nodes. So if a PHI in the inner latch feeds that condition,
1487/// a later interchange can leave the cloned PHI with a stale incoming block,
1488/// producing invalid IR. Reject that case here.
1489///
1490/// For example, %p is a PHI in the inner latch and the inner loop's exit test
1491/// reads %p, so %p feeds the condition that would be cloned:
1492///
1493/// inner.latch:
1494/// %p = phi i64 [ %v, %subloop.latch ]
1495/// %ec = icmp eq i64 %iv, %p ; inner exit test reads %p
1496/// br i1 %ec, label %exit, label %inner.header
1497///
1498/// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
1499/// multi-level loop nests.
1500static bool areInnerLoopLatchPHIsSupported(Loop *InnerLoop) {
1501 if (InnerLoop->getSubLoops().empty())
1502 return true;
1503
1504 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1505 auto *LatchBI = dyn_cast<CondBrInst>(Val: InnerLoopLatch->getTerminator());
1506 if (!LatchBI)
1507 return true;
1508 auto *CondI = dyn_cast<Instruction>(Val: LatchBI->getCondition());
1509 if (!CondI)
1510 return true;
1511
1512 // Bail if a phi in the inner latch feeds the exit condition, walking operands
1513 // within the inner loop.
1514 SmallSetVector<Instruction *, 8> Worklist;
1515 Worklist.insert(X: CondI);
1516 for (unsigned I = 0; I < Worklist.size(); ++I) {
1517 Instruction *Cur = Worklist[I];
1518 if (isa<PHINode>(Val: Cur) && Cur->getParent() == InnerLoopLatch)
1519 return false;
1520 for (Value *Op : Cur->operands())
1521 if (auto *OpI = dyn_cast<Instruction>(Val: Op))
1522 if (InnerLoop->contains(Inst: OpI))
1523 Worklist.insert(X: OpI);
1524 }
1525 return true;
1526}
1527
1528bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
1529 unsigned OuterLoopId,
1530 CharMatrix &DepMatrix) {
1531 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
1532 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
1533 << " and OuterLoopId = " << OuterLoopId
1534 << " due to dependence\n");
1535 ORE->emit(RemarkBuilder: [&]() {
1536 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
1537 InnerLoop->getStartLoc(),
1538 InnerLoop->getHeader())
1539 << "Cannot interchange loops due to dependences.";
1540 });
1541 return false;
1542 }
1543 // Check if outer and inner loop contain legal instructions only.
1544 for (auto *BB : OuterLoop->blocks())
1545 for (Instruction &I : *BB) {
1546 // Loads and stores are checked separately, so we can skip them here.
1547 if (isa<LoadInst, StoreInst, PseudoProbeInst>(Val: &I))
1548 continue;
1549
1550 // We cannot ignore potential memory reads, e.g., loads inside the called
1551 // function.
1552 if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())
1553 continue;
1554
1555 LLVM_DEBUG(
1556 dbgs()
1557 << "Loops contain instructions that cannot be safely interchanged\n");
1558 ORE->emit(RemarkBuilder: [&]() {
1559 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsafeInst",
1560 I.getDebugLoc(), I.getParent())
1561 << "Cannot interchange loops due to instruction that is "
1562 "potentially unsafe to interchange.";
1563 });
1564
1565 return false;
1566 }
1567
1568 if (!checkInductionsAndReductions(OuterLoop)) {
1569 LLVM_DEBUG(dbgs() << "Failed to find inner loop inductions or found "
1570 "unsupported reductions.\n");
1571 return false;
1572 }
1573
1574 if (!areInnerLoopLatchPHIsSupported(InnerLoop)) {
1575 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1576 ORE->emit(RemarkBuilder: [&]() {
1577 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1578 InnerLoop->getStartLoc(),
1579 InnerLoop->getHeader())
1580 << "Cannot interchange loops because unsupported PHI nodes found "
1581 "in inner loop latch.";
1582 });
1583 return false;
1584 }
1585
1586 // TODO: The loops could not be interchanged due to current limitations in the
1587 // transform module.
1588 if (currentLimitations()) {
1589 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1590 return false;
1591 }
1592
1593 // Check if the loops are tightly nested.
1594 if (!tightlyNested(OuterLoop, InnerLoop)) {
1595 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1596 ORE->emit(RemarkBuilder: [&]() {
1597 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1598 InnerLoop->getStartLoc(),
1599 InnerLoop->getHeader())
1600 << "Cannot interchange loops because they are not tightly "
1601 "nested.";
1602 });
1603 return false;
1604 }
1605
1606 // The LCSSA PHI for the reduction has passed checks before; its user
1607 // is a store instruction.
1608 PHINode *LcssaReduction = nullptr;
1609 assert(InnerReductions.size() <= 1 &&
1610 "So far we only support at most one reduction.");
1611 if (InnerReductions.size() == 1)
1612 LcssaReduction = InnerReductions[0].LcssaPhi;
1613
1614 if (!areInnerLoopExitPHIsSupported(OuterL: OuterLoop, InnerL: InnerLoop, Reductions&: OuterInnerReductions,
1615 LcssaReduction)) {
1616 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1617 ORE->emit(RemarkBuilder: [&]() {
1618 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1619 InnerLoop->getStartLoc(),
1620 InnerLoop->getHeader())
1621 << "Found unsupported PHI node in loop exit.";
1622 });
1623 return false;
1624 }
1625
1626 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1627 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1628 ORE->emit(RemarkBuilder: [&]() {
1629 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1630 OuterLoop->getStartLoc(),
1631 OuterLoop->getHeader())
1632 << "Found unsupported PHI node in loop exit.";
1633 });
1634 return false;
1635 }
1636
1637 if (any_of(Range: OuterLoop->getLoopLatch()->phis(),
1638 P: [](PHINode &PHI) { return PHI.getNumIncomingValues() != 1; })) {
1639 LLVM_DEBUG(dbgs() << "Only outer loop latch PHI nodes with one incoming "
1640 "value are supported.\n");
1641 ORE->emit(RemarkBuilder: [&]() {
1642 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLatchPHI",
1643 OuterLoop->getStartLoc(),
1644 OuterLoop->getHeader())
1645 << "Only outer loop latch PHI nodes with one incoming value are "
1646 "supported.";
1647 });
1648 return false;
1649 }
1650
1651 // Regarding def-use chains that begin at an LCSSA PHI in the inner loop exit
1652 // and end at any instruction in the outer loop latch, we currently support
1653 // only the case where the chain contains only PHI nodes. Since we already
1654 // call `tightlyNested()`, we know that if there is a def-use chain that we
1655 // don't support (i.e., a chain that contains a non-PHI user), then the
1656 // non-PHI user must be in the outer loop latch.
1657 if (InnerLoop->getExitBlock() != OuterLoop->getLoopLatch())
1658 for (PHINode &PHI : OuterLoop->getLoopLatch()->phis())
1659 if (any_of(Range: PHI.users(), P: [](const User *U) { return !isa<PHINode>(Val: U); })) {
1660 LLVM_DEBUG(dbgs() << "Outer loop latch PHI has a non-PHI user.\n");
1661 ORE->emit(RemarkBuilder: [&]() {
1662 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLatchPHI",
1663 OuterLoop->getStartLoc(),
1664 OuterLoop->getHeader())
1665 << "Cannot interchange loops because an outer loop latch PHI "
1666 "node has a non-PHI user.";
1667 });
1668 return false;
1669 }
1670
1671 return true;
1672}
1673
1674void CacheCostManager::computeIfUnitinialized() {
1675 if (CC.has_value())
1676 return;
1677
1678 LLVM_DEBUG(dbgs() << "Compute CacheCost.\n");
1679 CC = CacheCost::getCacheCost(Root&: *OutermostLoop, AR&: *AR, DI&: *DI);
1680 // Obtain the loop vector returned from loop cache analysis beforehand,
1681 // and put each <Loop, index> pair into a map for constant time query
1682 // later. Indices in loop vector reprsent the optimal order of the
1683 // corresponding loop, e.g., given a loopnest with depth N, index 0
1684 // indicates the loop should be placed as the outermost loop and index N
1685 // indicates the loop should be placed as the innermost loop.
1686 //
1687 // For the old pass manager CacheCost would be null.
1688 if (*CC != nullptr)
1689 for (const auto &[Idx, Cost] : enumerate(First: (*CC)->getLoopCosts()))
1690 CostMap[Cost.first] = Idx;
1691}
1692
1693CacheCost *CacheCostManager::getCacheCost() {
1694 computeIfUnitinialized();
1695 return CC->get();
1696}
1697
1698const DenseMap<const Loop *, unsigned> &CacheCostManager::getCostMap() {
1699 computeIfUnitinialized();
1700 return CostMap;
1701}
1702
1703/// If \S contains an affine addrec for \p L, return the step recurrence of it.
1704/// If \S is loop invariant with respect to \p L, return nullptr. Otherwise,
1705/// return std::nullopt, which indicates we cannot determine the coefficient of
1706/// the addrec for \p L in \S.
1707/// TODO: Handle more complex cases. Maybe using SCEVTraversal is a good way to
1708/// do that.
1709static std::optional<const SCEV *>
1710getAddRecCoefficient(ScalarEvolution &SE, const SCEV *S, const Loop *L) {
1711 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: S);
1712 if (!AR) {
1713 if (SE.isLoopInvariant(S, L))
1714 return nullptr;
1715 return std::nullopt;
1716 }
1717
1718 if (!AR->isAffine()) {
1719 LLVM_DEBUG(dbgs() << "Unexpected non-affine addrec\n");
1720 return std::nullopt;
1721 }
1722
1723 std::optional<const SCEV *> Coeff =
1724 getAddRecCoefficient(SE, S: AR->getStart(), L);
1725 if (!Coeff.has_value())
1726 return std::nullopt;
1727
1728 if (AR->getLoop() == L) {
1729 assert(!*Coeff && "Found more than one addrec for the same loop");
1730 Coeff = AR->getStepRecurrence(SE);
1731 }
1732 return Coeff;
1733}
1734
1735int LoopInterchangeProfitability::getInstrOrderCost() {
1736 SmallPtrSet<const SCEV *, 4> GoodBasePtrs, BadBasePtrs;
1737 for (BasicBlock *BB : InnerLoop->blocks()) {
1738 for (Instruction &Ins : *BB) {
1739 if (!isa<LoadInst, StoreInst>(Val: &Ins))
1740 continue;
1741 const SCEV *Access = SE->getSCEV(V: getLoadStorePointerOperand(V: &Ins));
1742 const SCEV *BasePtr = SE->getPointerBase(V: Access);
1743 std::optional<const SCEV *> OuterCoeff =
1744 getAddRecCoefficient(SE&: *SE, S: Access, L: OuterLoop);
1745 std::optional<const SCEV *> InnerCoeff =
1746 getAddRecCoefficient(SE&: *SE, S: Access, L: InnerLoop);
1747
1748 if (!OuterCoeff.has_value() || !*OuterCoeff || !InnerCoeff.has_value() ||
1749 !*InnerCoeff)
1750 continue;
1751
1752 // This heuristic assumes that a smaller step recurrence implies that the
1753 // induction variable corresponding to the loop is used in the inner
1754 // dimension of the array. Placing such a loop in the inner position would
1755 // be beneficial in terms of locality. If the array access is of the form
1756 // like `A[3*i + 2*j]`, this heuristic may lead to an unprofitable
1757 // interchange, but we expect such cases to be rare.
1758 const SCEV *OuterStep = SE->getAbsExpr(Op: *OuterCoeff, /*IsNSW=*/false);
1759 const SCEV *InnerStep = SE->getAbsExpr(Op: *InnerCoeff, /*IsNSW=*/false);
1760 // If we find the inner induction after an outer induction e.g.
1761 //
1762 // for(int i=0;i<N;i++)
1763 // for(int j=0;j<N;j++)
1764 // A[i][j] = A[i-1][j-1]+k;
1765 //
1766 //
1767 // then it is a good order. If we find the outer induction after an inner
1768 // induction e.g.
1769 //
1770 // for(int i=0;i<N;i++)
1771 // for(int j=0;j<N;j++)
1772 // A[j][i] = A[j-1][i-1]+k;
1773 //
1774 // then it is a bad order.
1775 //
1776 // To avoid counting the same base pointers multiple times, we deduplicate
1777 // them by using a set of base pointers.
1778 if (SE->isKnownPredicate(Pred: ICmpInst::ICMP_SLT, LHS: InnerStep, RHS: OuterStep))
1779 GoodBasePtrs.insert(Ptr: BasePtr);
1780 else if (SE->isKnownPredicate(Pred: ICmpInst::ICMP_SLT, LHS: OuterStep, RHS: InnerStep))
1781 BadBasePtrs.insert(Ptr: BasePtr);
1782 }
1783 }
1784
1785 int GoodOrder = GoodBasePtrs.size();
1786 int BadOrder = BadBasePtrs.size();
1787 return GoodOrder - BadOrder;
1788}
1789
1790std::optional<bool>
1791LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1792 const DenseMap<const Loop *, unsigned> &CostMap, CacheCost *CC) {
1793 // This is the new cost model returned from loop cache analysis.
1794 // A smaller index means the loop should be placed an outer loop, and vice
1795 // versa.
1796 auto InnerLoopIt = CostMap.find(Val: InnerLoop);
1797 if (InnerLoopIt == CostMap.end())
1798 return std::nullopt;
1799 auto OuterLoopIt = CostMap.find(Val: OuterLoop);
1800 if (OuterLoopIt == CostMap.end())
1801 return std::nullopt;
1802
1803 if (CC->getLoopCost(L: *OuterLoop) == CC->getLoopCost(L: *InnerLoop))
1804 return std::nullopt;
1805 unsigned InnerIndex = InnerLoopIt->second;
1806 unsigned OuterIndex = OuterLoopIt->second;
1807 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1808 << ", OuterIndex = " << OuterIndex << "\n");
1809 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1810 "numbers to each loop");
1811 return std::optional<bool>(InnerIndex < OuterIndex);
1812}
1813
1814std::optional<bool>
1815LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1816 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1817 // good and bad order of induction variables in the instruction and allows
1818 // reordering if number of bad orders is more than good.
1819 int Cost = getInstrOrderCost();
1820 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1821 if (Cost < 0 && Cost < LoopInterchangeCostThreshold)
1822 return std::optional<bool>(true);
1823
1824 return std::nullopt;
1825}
1826
1827/// Return true if we can vectorize the loop specified by \p LoopId.
1828static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) {
1829 for (const auto &Dep : DepMatrix) {
1830 char Dir = Dep[LoopId];
1831 char DepType = Dep.back();
1832 assert((DepType == '<' || DepType == '*') &&
1833 "Unexpected element in dependency vector");
1834
1835 // There are no loop-carried dependencies.
1836 if (Dir == '=' || Dir == 'I')
1837 continue;
1838
1839 // DepType being '<' means that this direction vector represents a forward
1840 // dependency. In principle, a loop with '<' direction can be vectorized in
1841 // this case.
1842 if (Dir == '<' && DepType == '<')
1843 continue;
1844
1845 // We cannot prove that the loop is vectorizable.
1846 return false;
1847 }
1848 return true;
1849}
1850
1851std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1852 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1853 // If the outer loop cannot be vectorized, it is not profitable to move this
1854 // to inner position.
1855 if (!canVectorize(DepMatrix, LoopId: OuterLoopId))
1856 return false;
1857
1858 // If the inner loop cannot be vectorized but the outer loop can be, then it
1859 // is profitable to interchange to enable inner loop parallelism.
1860 if (!canVectorize(DepMatrix, LoopId: InnerLoopId))
1861 return true;
1862
1863 // If both the inner and the outer loop can be vectorized, it is necessary to
1864 // check the cost of each vectorized loop for profitability decision. At this
1865 // time we do not have a cost model to estimate them, so return nullopt.
1866 // TODO: Estimate the cost of vectorized loop when both the outer and the
1867 // inner loop can be vectorized.
1868 return std::nullopt;
1869}
1870
1871bool LoopInterchangeProfitability::isProfitable(
1872 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1873 unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) {
1874 // Do not consider loops with a backedge that isn't taken, e.g. an
1875 // unconditional branch true/false, as candidates for interchange.
1876 // TODO: when interchange is forced, we should probably also allow
1877 // interchange for these loops, and thus this logic should be moved just
1878 // below the cost-model ignore check below. But this check is done first
1879 // to avoid the issue in #163954.
1880 const SCEV *InnerBTC = SE->getBackedgeTakenCount(L: InnerLoop);
1881 const SCEV *OuterBTC = SE->getBackedgeTakenCount(L: OuterLoop);
1882 if (InnerBTC && InnerBTC->isZero()) {
1883 LLVM_DEBUG(dbgs() << "Inner loop back-edge isn't taken, rejecting "
1884 "single iteration loop\n");
1885 return false;
1886 }
1887 if (OuterBTC && OuterBTC->isZero()) {
1888 LLVM_DEBUG(dbgs() << "Outer loop back-edge isn't taken, rejecting "
1889 "single iteration loop\n");
1890 return false;
1891 }
1892
1893 // Return true if interchange is forced and the cost-model ignored.
1894 if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore)
1895 return true;
1896 assert(noDuplicateRulesAndIgnore(Profitabilities) &&
1897 "Duplicate rules and option 'ignore' are not allowed");
1898
1899 // isProfitable() is structured to avoid endless loop interchange. If the
1900 // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could
1901 // decide the profitability then, profitability check will stop and return the
1902 // analysis result. If it failed to determine it (e.g., cache analysis failed
1903 // to analyze the loopnest due to delinearization issues) then go ahead the
1904 // second highest priority rule (isProfitablePerInstrOrderCost by default).
1905 // Likewise, if it failed to analysis the profitability then only, the last
1906 // rule (isProfitableForVectorization by default) will decide.
1907 std::optional<bool> shouldInterchange;
1908 for (RuleTy RT : Profitabilities) {
1909 switch (RT) {
1910 case RuleTy::PerLoopCacheAnalysis: {
1911 CacheCost *CC = CCM.getCacheCost();
1912 const DenseMap<const Loop *, unsigned> &CostMap = CCM.getCostMap();
1913 shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC);
1914 break;
1915 }
1916 case RuleTy::PerInstrOrderCost:
1917 shouldInterchange = isProfitablePerInstrOrderCost();
1918 break;
1919 case RuleTy::ForVectorization:
1920 shouldInterchange =
1921 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1922 break;
1923 case RuleTy::Ignore:
1924 llvm_unreachable("Option 'ignore' is not supported with other options");
1925 break;
1926 }
1927
1928 // If this rule could determine the profitability, don't call subsequent
1929 // rules.
1930 if (shouldInterchange.has_value())
1931 break;
1932 }
1933
1934 if (!shouldInterchange.has_value()) {
1935 ORE->emit(RemarkBuilder: [&]() {
1936 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1937 InnerLoop->getStartLoc(),
1938 InnerLoop->getHeader())
1939 << "Insufficient information to calculate the cost of loop for "
1940 "interchange.";
1941 });
1942 return false;
1943 } else if (!shouldInterchange.value()) {
1944 ORE->emit(RemarkBuilder: [&]() {
1945 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1946 InnerLoop->getStartLoc(),
1947 InnerLoop->getHeader())
1948 << "Interchanging loops is not considered to improve cache "
1949 "locality nor vectorization.";
1950 });
1951 return false;
1952 }
1953 return true;
1954}
1955
1956void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1957 Loop *InnerLoop) {
1958 for (Loop *L : *OuterLoop)
1959 if (L == InnerLoop) {
1960 OuterLoop->removeChildLoop(Child: L);
1961 return;
1962 }
1963 llvm_unreachable("Couldn't find loop");
1964}
1965
1966/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1967/// new inner and outer loop after interchanging: NewInner is the original
1968/// outer loop and NewOuter is the original inner loop.
1969///
1970/// Before interchanging, we have the following structure
1971/// Outer preheader
1972// Outer header
1973// Inner preheader
1974// Inner header
1975// Inner body
1976// Inner latch
1977// outer bbs
1978// Outer latch
1979//
1980// After interchanging:
1981// Inner preheader
1982// Inner header
1983// Outer preheader
1984// Outer header
1985// Inner body
1986// outer bbs
1987// Outer latch
1988// Inner latch
1989void LoopInterchangeTransform::restructureLoops(
1990 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1991 BasicBlock *OrigOuterPreHeader) {
1992 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1993 // The original inner loop preheader moves from the new inner loop to
1994 // the parent loop, if there is one.
1995 NewInner->removeBlockFromLoop(BB: OrigInnerPreHeader);
1996 LI->changeLoopFor(BB: OrigInnerPreHeader, L: OuterLoopParent);
1997
1998 // Switch the loop levels.
1999 if (OuterLoopParent) {
2000 // Remove the loop from its parent loop.
2001 removeChildLoop(OuterLoop: OuterLoopParent, InnerLoop: NewInner);
2002 removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter);
2003 OuterLoopParent->addChildLoop(NewChild: NewOuter);
2004 } else {
2005 removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter);
2006 LI->changeTopLevelLoop(OldLoop: NewInner, NewLoop: NewOuter);
2007 }
2008 while (!NewOuter->isInnermost())
2009 NewInner->addChildLoop(NewChild: NewOuter->removeChildLoop(I: NewOuter->begin()));
2010 NewOuter->addChildLoop(NewChild: NewInner);
2011
2012 // BBs from the original inner loop.
2013 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
2014
2015 // Add BBs from the original outer loop to the original inner loop (excluding
2016 // BBs already in inner loop)
2017 for (BasicBlock *BB : NewInner->blocks())
2018 if (LI->getLoopFor(BB) == NewInner)
2019 NewOuter->addBlockEntry(BB);
2020
2021 // Now remove inner loop header and latch from the new inner loop and move
2022 // other BBs (the loop body) to the new inner loop.
2023 BasicBlock *OuterHeader = NewOuter->getHeader();
2024 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
2025 for (BasicBlock *BB : OrigInnerBBs) {
2026 // Nothing will change for BBs in child loops.
2027 if (LI->getLoopFor(BB) != NewOuter)
2028 continue;
2029 // Remove the new outer loop header and latch from the new inner loop.
2030 if (BB == OuterHeader || BB == OuterLatch)
2031 NewInner->removeBlockFromLoop(BB);
2032 else
2033 LI->changeLoopFor(BB, L: NewInner);
2034 }
2035
2036 // The preheader of the original outer loop becomes part of the new
2037 // outer loop.
2038 NewOuter->addBlockEntry(BB: OrigOuterPreHeader);
2039 LI->changeLoopFor(BB: OrigOuterPreHeader, L: NewOuter);
2040
2041 // Tell SE that we move the loops around.
2042 SE->forgetLoop(L: NewOuter);
2043}
2044
2045/// User can write, or optimizers can generate the reduction for inner loop.
2046/// To make the interchange valid, apply Reduction2Mem by moving the
2047/// initializer and store instructions into the inner loop. So far we only
2048/// handle cases where the reduction variable is initialized to a constant.
2049/// For example, below code:
2050///
2051/// loop:
2052/// re = phi<0.0, next>
2053/// next = re op ...
2054/// endloop
2055/// reduc_sum = phi<next> // lcssa phi
2056/// MEM_REF[idx] = reduc_sum // LcssaStore
2057///
2058/// is transformed into:
2059///
2060/// loop:
2061/// tmp = MEM_REF[idx];
2062/// new_var = !first_iteration ? tmp : 0.0;
2063/// next = new_var op ...
2064/// MEM_REF[idx] = next; // after moving
2065/// endloop
2066///
2067/// In this way the initial const is used in the first iteration of loop.
2068void LoopInterchangeTransform::reduction2Memory() {
2069 ArrayRef<LoopInterchangeLegality::InnerReduction> InnerReductions =
2070 LIL.getInnerReductions();
2071
2072 assert(InnerReductions.size() == 1 &&
2073 "So far we only support at most one reduction.");
2074
2075 LoopInterchangeLegality::InnerReduction SR = InnerReductions[0];
2076 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2077 IRBuilder<> Builder(&*(InnerLoopHeader->getFirstNonPHIIt()));
2078
2079 // Check if it's the first iteration.
2080 LLVMContext &Context = InnerLoopHeader->getContext();
2081 PHINode *FirstIter =
2082 Builder.CreatePHI(Ty: Type::getInt1Ty(C&: Context), NumReservedValues: 2, Name: "first.iter");
2083 FirstIter->addIncoming(V: ConstantInt::get(Ty: Type::getInt1Ty(C&: Context), V: 1),
2084 BB: InnerLoop->getLoopPreheader());
2085 FirstIter->addIncoming(V: ConstantInt::get(Ty: Type::getInt1Ty(C&: Context), V: 0),
2086 BB: InnerLoop->getLoopLatch());
2087 assert(FirstIter->isComplete() && "The FirstIter PHI node is not complete.");
2088
2089 // When the reduction is initialized from a constant value, we need to add
2090 // a stmt loading from the memory object to target basic block in inner
2091 // loop.
2092 Instruction *LoadMem = Builder.CreateLoad(Ty: SR.ElemTy, Ptr: SR.MemRef);
2093
2094 // Init new_var to MEM_REF or CONST depending on if it is the first iteration.
2095 Value *NewVar = Builder.CreateSelect(C: FirstIter, True: SR.Init, False: LoadMem, Name: "new.var");
2096
2097 // Replace all uses of the reduction variable with a new variable.
2098 SR.Reduction->replaceAllUsesWith(V: NewVar);
2099
2100 // Move store instruction into inner loop, just after reduction next's
2101 // definition.
2102 SR.LcssaStore->setOperand(i_nocapture: 0, Val_nocapture: SR.Next);
2103 SR.LcssaStore->moveAfter(MovePos: dyn_cast<Instruction>(Val: SR.Next));
2104}
2105
2106bool LoopInterchangeTransform::transform(
2107 ArrayRef<Instruction *> DropNoWrapInsts,
2108 ArrayRef<Instruction *> DropNoInfInsts) {
2109 bool Transformed = false;
2110
2111 ArrayRef<LoopInterchangeLegality::InnerReduction> InnerReductions =
2112 LIL.getInnerReductions();
2113 if (InnerReductions.size() == 1)
2114 reduction2Memory();
2115
2116 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
2117 auto &InductionPHIs = LIL.getInnerLoopInductions();
2118 assert(!InductionPHIs.empty() &&
2119 "Expected at least one induction variable in the inner loop");
2120
2121 SmallVector<Instruction *, 8> InnerIndexVarList;
2122 for (PHINode *CurInductionPHI : InductionPHIs) {
2123 Instruction *IncomingValue = dyn_cast<Instruction>(
2124 Val: CurInductionPHI->getIncomingValueForBlock(BB: InnerLoop->getLoopLatch()));
2125 assert(IncomingValue &&
2126 "Incoming value from loop latch isn't an instruction");
2127 if (is_contained(Range: InductionPHIs, Element: IncomingValue))
2128 continue;
2129 InnerIndexVarList.push_back(Elt: IncomingValue);
2130 }
2131
2132 // Create a new latch block for the inner loop. We split at the
2133 // current latch's terminator and then move the condition and all
2134 // operands that are not either loop-invariant or the induction PHI into the
2135 // new latch block.
2136 BasicBlock *NewLatch =
2137 SplitBlock(Old: InnerLoop->getLoopLatch(),
2138 SplitPt: InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
2139
2140 SmallSetVector<Instruction *, 4> WorkList;
2141 unsigned i = 0;
2142 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
2143 for (; i < WorkList.size(); i++) {
2144 // PHI nodes cannot be cloned and moved here; the legality check
2145 // (areInnerLoopLatchPHIsSupported) ensures none reach the worklist.
2146 assert(!isa<PHINode>(WorkList[i]) &&
2147 "MoveInstructions does not support PHI nodes");
2148 // Duplicate instruction and move it to the new latch. Update uses that
2149 // have been moved.
2150 Instruction *NewI = WorkList[i]->clone();
2151 NewI->insertBefore(InsertPos: NewLatch->getFirstNonPHIIt());
2152 assert(!NewI->mayHaveSideEffects() &&
2153 "Moving instructions with side-effects may change behavior of "
2154 "the loop nest!");
2155 for (Use &U : llvm::make_early_inc_range(Range: WorkList[i]->uses())) {
2156 Instruction *UserI = cast<Instruction>(Val: U.getUser());
2157 if (!InnerLoop->contains(BB: UserI->getParent()) ||
2158 UserI->getParent() == NewLatch ||
2159 llvm::is_contained(Range: InductionPHIs, Element: UserI))
2160 U.set(NewI);
2161 }
2162 // Add operands of moved instruction to the worklist, except if they are
2163 // outside the inner loop or are the induction PHI.
2164 for (Value *Op : WorkList[i]->operands()) {
2165 Instruction *OpI = dyn_cast<Instruction>(Val: Op);
2166 if (!OpI || this->LI->getLoopFor(BB: OpI->getParent()) != this->InnerLoop ||
2167 llvm::is_contained(Range: InductionPHIs, Element: OpI))
2168 continue;
2169 WorkList.insert(X: OpI);
2170 }
2171 }
2172 };
2173
2174 // FIXME: Should we interchange when we have a constant condition?
2175 Instruction *CondI = dyn_cast<Instruction>(
2176 Val: cast<CondBrInst>(Val: InnerLoop->getLoopLatch()->getTerminator())
2177 ->getCondition());
2178 if (CondI)
2179 WorkList.insert(X: CondI);
2180 MoveInstructions();
2181 for (Instruction *InnerIndexVar : InnerIndexVarList)
2182 WorkList.insert(X: cast<Instruction>(Val: InnerIndexVar));
2183 MoveInstructions();
2184
2185 // Ensure the inner loop phi nodes have a separate basic block.
2186 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2187 if (&*InnerLoopHeader->getFirstNonPHIIt() !=
2188 InnerLoopHeader->getTerminator()) {
2189 SplitBlock(Old: InnerLoopHeader, SplitPt: InnerLoopHeader->getFirstNonPHIIt(), DT, LI);
2190 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
2191 }
2192
2193 // Instructions in the original inner loop preheader may depend on values
2194 // defined in the outer loop header. Move them there, because the original
2195 // inner loop preheader will become the entry into the interchanged loop nest.
2196 // Currently we move all instructions and rely on LICM to move invariant
2197 // instructions outside the loop nest.
2198 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2199 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2200
2201 if (InnerLoopPreHeader != OuterLoopHeader) {
2202 // Eliminate PHIs in the inner-loop preheader.
2203 for (PHINode &P : make_early_inc_range(Range: InnerLoopPreHeader->phis())) {
2204 assert(all_equal(P.incoming_values()) &&
2205 "Expected equivalent incoming values in inner loop preheader");
2206 P.replaceAllUsesWith(V: P.getIncomingValue(i: 0));
2207 P.eraseFromParent();
2208 }
2209 for (Instruction &I :
2210 make_early_inc_range(Range: make_range(x: InnerLoopPreHeader->begin(),
2211 y: std::prev(x: InnerLoopPreHeader->end()))))
2212 I.moveBeforePreserving(MovePos: OuterLoopHeader->getTerminator()->getIterator());
2213 }
2214
2215 Transformed |= adjustLoopLinks();
2216 if (!Transformed) {
2217 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
2218 return false;
2219 }
2220
2221 // Finally, drop the nsw/nuw/ninf flags from the instructions for reduction
2222 // calculations.
2223 for (Instruction *Reduction : DropNoWrapInsts) {
2224 Reduction->setHasNoSignedWrap(false);
2225 Reduction->setHasNoUnsignedWrap(false);
2226 }
2227 for (Instruction *I : DropNoInfInsts)
2228 I->setHasNoInfs(false);
2229
2230 return true;
2231}
2232
2233/// \brief Move all instructions except the terminator from FromBB right before
2234/// InsertBefore
2235static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
2236 BasicBlock *ToBB = InsertBefore->getParent();
2237
2238 ToBB->splice(ToIt: InsertBefore->getIterator(), FromBB, FromBeginIt: FromBB->begin(),
2239 FromEndIt: FromBB->getTerminator()->getIterator());
2240}
2241
2242/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
2243static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
2244 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
2245 // from BB1 afterwards.
2246 auto Iter = map_range(C&: *BB1, F: [](Instruction &I) { return &I; });
2247 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(x: Iter.end()));
2248 for (Instruction *I : TempInstrs)
2249 I->removeFromParent();
2250
2251 // Move instructions from BB2 to BB1.
2252 moveBBContents(FromBB: BB2, InsertBefore: BB1->getTerminator());
2253
2254 // Move instructions from TempInstrs to BB2.
2255 for (Instruction *I : TempInstrs)
2256 I->insertBefore(InsertPos: BB2->getTerminator()->getIterator());
2257}
2258
2259// Update BI to jump to NewBB instead of OldBB. Records updates to the
2260// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
2261// \p OldBB is exactly once in BI's successor list.
2262static void updateSuccessor(Instruction *Term, BasicBlock *OldBB,
2263 BasicBlock *NewBB,
2264 std::vector<DominatorTree::UpdateType> &DTUpdates,
2265 bool MustUpdateOnce = true) {
2266 assert((!MustUpdateOnce || llvm::count(successors(Term), OldBB) == 1) &&
2267 "BI must jump to OldBB exactly once.");
2268 bool Changed = false;
2269 for (Use &Op : Term->operands())
2270 if (Op == OldBB) {
2271 Op.set(NewBB);
2272 Changed = true;
2273 }
2274
2275 if (Changed) {
2276 DTUpdates.push_back(
2277 x: {DominatorTree::UpdateKind::Insert, Term->getParent(), NewBB});
2278 DTUpdates.push_back(
2279 x: {DominatorTree::UpdateKind::Delete, Term->getParent(), OldBB});
2280 }
2281 assert(Changed && "Expected a successor to be updated");
2282}
2283
2284// Move Lcssa PHIs to the right place.
2285static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
2286 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
2287 BasicBlock *OuterLatch, BasicBlock *OuterExit,
2288 Loop *InnerLoop, LoopInfo *LI) {
2289
2290 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
2291 // defined either in the header or latch. Those blocks will become header and
2292 // latch of the new outer loop, and the only possible users can PHI nodes
2293 // in the exit block of the loop nest or the outer loop header (reduction
2294 // PHIs, in that case, the incoming value must be defined in the inner loop
2295 // header). We can just substitute the user with the incoming value and remove
2296 // the PHI.
2297 for (PHINode &P : make_early_inc_range(Range: InnerExit->phis())) {
2298 assert(P.getNumIncomingValues() == 1 &&
2299 "Only loops with a single exit are supported!");
2300
2301 Value *IncomingValue = P.getIncomingValueForBlock(BB: InnerLatch);
2302 auto *IncI = dyn_cast<Instruction>(Val: IncomingValue);
2303 if (!IncI) {
2304 // If the incoming value is not an instruction, it must be loop invariant.
2305 // In that case, we can just replace the PHI with the incoming value and
2306 // remove the PHI.
2307 assert(InnerLoop->isLoopInvariant(IncomingValue) &&
2308 "Expected non-instruction incoming value to be loop invariant");
2309 P.replaceAllUsesWith(V: IncomingValue);
2310 P.eraseFromParent();
2311 continue;
2312 }
2313
2314 // In case of multi-level nested loops, follow LCSSA to find the incoming
2315 // value defined from the innermost loop.
2316 auto *IncIInnerMost = dyn_cast<Instruction>(Val: followLCSSA(SV: IncI));
2317 // Skip phis when:
2318 // - they are not an instruction, e.g. incoming values are constants.
2319 // - Incomming values from the inner loop body, excluding the header and
2320 // latch.
2321 if (!IncIInnerMost || (IncIInnerMost->getParent() != InnerLatch &&
2322 IncIInnerMost->getParent() != InnerHeader))
2323 continue;
2324
2325 assert(all_of(P.users(),
2326 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
2327 return (cast<PHINode>(U)->getParent() == OuterHeader &&
2328 IncI->getParent() == InnerHeader) ||
2329 cast<PHINode>(U)->getParent() == OuterExit;
2330 }) &&
2331 "Can only replace phis iff the uses are in the loop nest exit or "
2332 "the incoming value is defined in the inner header (it will "
2333 "dominate all loop blocks after interchanging)");
2334 P.replaceAllUsesWith(V: IncI);
2335 P.eraseFromParent();
2336 }
2337
2338 SmallVector<PHINode *, 8> LcssaInnerExit(
2339 llvm::make_pointer_range(Range: InnerExit->phis()));
2340
2341 SmallVector<PHINode *, 8> LcssaInnerLatch(
2342 llvm::make_pointer_range(Range: InnerLatch->phis()));
2343
2344 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
2345 // If a PHI node has users outside of InnerExit, it has a use outside the
2346 // interchanged loop and we have to preserve it. We move these to
2347 // InnerLatch, which will become the new exit block for the innermost
2348 // loop after interchanging.
2349 for (PHINode *P : LcssaInnerExit)
2350 P->moveBefore(InsertPos: InnerLatch->getFirstNonPHIIt());
2351
2352 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
2353 // and we have to move them to the new inner latch.
2354 for (PHINode *P : LcssaInnerLatch)
2355 P->moveBefore(InsertPos: InnerExit->getFirstNonPHIIt());
2356
2357 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
2358 // incoming values defined in the outer loop, we have to add a new PHI
2359 // in the inner loop latch, which became the exit block of the outer loop,
2360 // after interchanging.
2361 if (OuterExit) {
2362 for (PHINode &P : OuterExit->phis()) {
2363 if (P.getNumIncomingValues() != 1)
2364 continue;
2365 // Skip Phis with incoming values defined in the inner loop. Those should
2366 // already have been updated.
2367 auto I = dyn_cast<Instruction>(Val: P.getIncomingValue(i: 0));
2368 if (!I || LI->getLoopFor(BB: I->getParent()) == InnerLoop)
2369 continue;
2370
2371 PHINode *NewPhi = dyn_cast<PHINode>(Val: P.clone());
2372 NewPhi->setIncomingValue(i: 0, V: P.getIncomingValue(i: 0));
2373 NewPhi->setIncomingBlock(i: 0, BB: OuterLatch);
2374 // We might have incoming edges from other BBs, i.e., the original outer
2375 // header.
2376 for (auto *Pred : predecessors(BB: InnerLatch)) {
2377 if (Pred == OuterLatch)
2378 continue;
2379 NewPhi->addIncoming(V: P.getIncomingValue(i: 0), BB: Pred);
2380 }
2381 NewPhi->insertBefore(InsertPos: InnerLatch->getFirstNonPHIIt());
2382 P.setIncomingValue(i: 0, V: NewPhi);
2383 }
2384 }
2385
2386 // Now adjust the incoming blocks for the LCSSA PHIs.
2387 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
2388 // with the new latch.
2389 InnerLatch->replacePhiUsesWith(Old: InnerLatch, New: OuterLatch);
2390}
2391
2392/// This deals with a corner case when a LCSSA phi node appears in a non-exit
2393/// block: the outer loop latch block does not need to be exit block of the
2394/// inner loop. Consider a loop that was in LCSSA form, but then some
2395/// transformation like loop-unswitch comes along and creates an empty block,
2396/// where BB5 in this example is the outer loop latch block:
2397///
2398/// BB4:
2399/// br label %BB5
2400/// BB5:
2401/// %old.cond.lcssa = phi i16 [ %cond, %BB4 ]
2402/// br outer.header
2403///
2404/// Interchange then brings it in LCSSA form again resulting in this chain of
2405/// single-input phi nodes:
2406///
2407/// BB4:
2408/// %new.cond.lcssa = phi i16 [ %cond, %BB3 ]
2409/// br label %BB5
2410/// BB5:
2411/// %old.cond.lcssa = phi i16 [ %new.cond.lcssa, %BB4 ]
2412///
2413/// The problem is that interchange can reoder blocks BB4 and BB5 placing the
2414/// use before the def if we don't check this. The solution is to simplify
2415/// lcssa phi nodes (remove) if they appear in non-exit blocks.
2416///
2417static void simplifyLCSSAPhis(Loop *OuterLoop, Loop *InnerLoop) {
2418 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
2419 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2420
2421 // Do not modify lcssa phis where they actually belong, i.e. in exit blocks.
2422 if (OuterLoopLatch == InnerLoopExit)
2423 return;
2424
2425 // Collect and remove phis in non-exit blocks if they have 1 input.
2426 SmallVector<PHINode *, 8> Phis(
2427 llvm::make_pointer_range(Range: OuterLoopLatch->phis()));
2428 for (PHINode *Phi : Phis) {
2429 assert(Phi->getNumIncomingValues() == 1 && "Single input phi expected");
2430 LLVM_DEBUG(dbgs() << "Removing 1-input phi in non-exit block: " << *Phi
2431 << "\n");
2432 Phi->replaceAllUsesWith(V: Phi->getIncomingValue(i: 0));
2433 Phi->eraseFromParent();
2434 }
2435}
2436
2437bool LoopInterchangeTransform::adjustLoopBranches() {
2438 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
2439 std::vector<DominatorTree::UpdateType> DTUpdates;
2440
2441 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2442 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2443
2444 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
2445 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
2446 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
2447
2448 simplifyLCSSAPhis(OuterLoop, InnerLoop);
2449
2450 // Ensure that both preheaders do not contain PHI nodes and have single
2451 // predecessors. This allows us to move them easily. We use
2452 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
2453 // preheaders do not satisfy those conditions.
2454 if (isa<PHINode>(Val: OuterLoopPreHeader->begin()) ||
2455 !OuterLoopPreHeader->getUniquePredecessor())
2456 OuterLoopPreHeader =
2457 InsertPreheaderForLoop(L: OuterLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true);
2458 if (InnerLoopPreHeader == OuterLoop->getHeader())
2459 InnerLoopPreHeader =
2460 InsertPreheaderForLoop(L: InnerLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true);
2461
2462 // Adjust the loop preheader
2463 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
2464 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
2465 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
2466 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
2467 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
2468 BasicBlock *InnerLoopLatchPredecessor =
2469 InnerLoopLatch->getUniquePredecessor();
2470 BasicBlock *InnerLoopLatchSuccessor;
2471 BasicBlock *OuterLoopLatchSuccessor;
2472
2473 CondBrInst *OuterLoopLatchBI =
2474 dyn_cast<CondBrInst>(Val: OuterLoopLatch->getTerminator());
2475 CondBrInst *InnerLoopLatchBI =
2476 dyn_cast<CondBrInst>(Val: InnerLoopLatch->getTerminator());
2477 Instruction *OuterLoopHeaderBI = OuterLoopHeader->getTerminator();
2478 Instruction *InnerLoopHeaderBI = InnerLoopHeader->getTerminator();
2479
2480 assert(OuterLoopPredecessor && InnerLoopLatchPredecessor &&
2481 "Failed to find a unique predecessor");
2482 assert(OuterLoopLatchBI && InnerLoopLatchBI &&
2483 "Failed to find a conditional branch");
2484
2485 Instruction *InnerLoopLatchPredecessorBI =
2486 InnerLoopLatchPredecessor->getTerminator();
2487 Instruction *OuterLoopPredecessorBI = OuterLoopPredecessor->getTerminator();
2488
2489 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
2490
2491 // FIXME: IR modification should not stop partway through.
2492 if (!InnerLoopHeaderSuccessor) {
2493 LLVM_DEBUG(
2494 dbgs() << "Inner loop header does not have a unique successor\n");
2495 return false;
2496 }
2497
2498 // Adjust Loop Preheader and headers.
2499 // The branches in the outer loop predecessor and the outer loop header can
2500 // be unconditional branches or conditional branches with duplicates. Consider
2501 // this when updating the successors.
2502 updateSuccessor(Term: OuterLoopPredecessorBI, OldBB: OuterLoopPreHeader,
2503 NewBB: InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
2504 // The outer loop header might or might not branch to the outer latch.
2505 // We are guaranteed to branch to the inner loop preheader.
2506 if (llvm::is_contained(Range: successors(I: OuterLoopHeaderBI), Element: OuterLoopLatch)) {
2507 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
2508 updateSuccessor(Term: OuterLoopHeaderBI, OldBB: OuterLoopLatch, NewBB: InnerLoopLatch,
2509 DTUpdates,
2510 /*MustUpdateOnce=*/false);
2511 }
2512 updateSuccessor(Term: OuterLoopHeaderBI, OldBB: InnerLoopPreHeader,
2513 NewBB: InnerLoopHeaderSuccessor, DTUpdates,
2514 /*MustUpdateOnce=*/false);
2515
2516 // Adjust reduction PHI's now that the incoming block has changed.
2517 InnerLoopHeaderSuccessor->replacePhiUsesWith(Old: InnerLoopHeader,
2518 New: OuterLoopHeader);
2519
2520 updateSuccessor(Term: InnerLoopHeaderBI, OldBB: InnerLoopHeaderSuccessor,
2521 NewBB: OuterLoopPreHeader, DTUpdates);
2522
2523 // -------------Adjust loop latches-----------
2524 if (InnerLoopLatchBI->getSuccessor(i: 0) == InnerLoopHeader)
2525 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: 1);
2526 else
2527 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: 0);
2528
2529 updateSuccessor(Term: InnerLoopLatchPredecessorBI, OldBB: InnerLoopLatch,
2530 NewBB: InnerLoopLatchSuccessor, DTUpdates);
2531
2532 if (OuterLoopLatchBI->getSuccessor(i: 0) == OuterLoopHeader)
2533 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: 1);
2534 else
2535 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: 0);
2536
2537 updateSuccessor(Term: InnerLoopLatchBI, OldBB: InnerLoopLatchSuccessor,
2538 NewBB: OuterLoopLatchSuccessor, DTUpdates);
2539 updateSuccessor(Term: OuterLoopLatchBI, OldBB: OuterLoopLatchSuccessor, NewBB: InnerLoopLatch,
2540 DTUpdates);
2541
2542 DT->applyUpdates(Updates: DTUpdates);
2543 restructureLoops(NewInner: OuterLoop, NewOuter: InnerLoop, OrigInnerPreHeader: InnerLoopPreHeader,
2544 OrigOuterPreHeader: OuterLoopPreHeader);
2545
2546 moveLCSSAPhis(InnerExit: InnerLoopLatchSuccessor, InnerHeader: InnerLoopHeader, InnerLatch: InnerLoopLatch,
2547 OuterHeader: OuterLoopHeader, OuterLatch: OuterLoopLatch, OuterExit: InnerLoop->getExitBlock(),
2548 InnerLoop, LI);
2549 // For PHIs in the exit block of the outer loop, outer's latch has been
2550 // replaced by Inners'.
2551 OuterLoopLatchSuccessor->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch);
2552
2553 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
2554 // Now update the reduction PHIs in the inner and outer loop headers.
2555 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
2556 for (PHINode &PHI : InnerLoopHeader->phis())
2557 if (OuterInnerReductions.contains(Ptr: &PHI))
2558 InnerLoopPHIs.push_back(Elt: &PHI);
2559
2560 for (PHINode &PHI : OuterLoopHeader->phis())
2561 if (OuterInnerReductions.contains(Ptr: &PHI))
2562 OuterLoopPHIs.push_back(Elt: &PHI);
2563
2564 // Now move the remaining reduction PHIs from outer to inner loop header and
2565 // vice versa. The PHI nodes must be part of a reduction across the inner and
2566 // outer loop and all the remains to do is and updating the incoming blocks.
2567 for (PHINode *PHI : OuterLoopPHIs) {
2568 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
2569 PHI->moveBefore(InsertPos: InnerLoopHeader->getFirstNonPHIIt());
2570 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2571 }
2572 for (PHINode *PHI : InnerLoopPHIs) {
2573 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
2574 PHI->moveBefore(InsertPos: OuterLoopHeader->getFirstNonPHIIt());
2575 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
2576 }
2577
2578 // Update the incoming blocks for moved PHI nodes.
2579 OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopPreHeader, New: OuterLoopPreHeader);
2580 OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopLatch, New: OuterLoopLatch);
2581 InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopPreHeader, New: InnerLoopPreHeader);
2582 InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch);
2583
2584 // Values defined in the outer loop header could be used in the inner loop
2585 // latch. In that case, we need to create LCSSA phis for them, because after
2586 // interchanging they will be defined in the new inner loop and used in the
2587 // new outer loop.
2588 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
2589 for (Instruction &I :
2590 make_range(x: OuterLoopHeader->begin(), y: std::prev(x: OuterLoopHeader->end())))
2591 MayNeedLCSSAPhis.push_back(Elt: &I);
2592 formLCSSAForInstructions(Worklist&: MayNeedLCSSAPhis, DT: *DT, LI: *LI, SE);
2593
2594 return true;
2595}
2596
2597bool LoopInterchangeTransform::adjustLoopLinks() {
2598 // Adjust all branches in the inner and outer loop.
2599 bool Changed = adjustLoopBranches();
2600 if (Changed) {
2601 // We have interchanged the preheaders so we need to interchange the data in
2602 // the preheaders as well. This is because the content of the inner
2603 // preheader was previously executed inside the outer loop.
2604 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
2605 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
2606 swapBBContents(BB1: OuterLoopPreHeader, BB2: InnerLoopPreHeader);
2607 }
2608 return Changed;
2609}
2610
2611PreservedAnalyses LoopInterchangePass::run(LoopNest &LN,
2612 LoopAnalysisManager &AM,
2613 LoopStandardAnalysisResults &AR,
2614 LPMUpdater &U) {
2615 Function &F = *LN.getParent();
2616 SmallVector<Loop *, 8> LoopList(LN.getLoops());
2617
2618 OptimizationRemarkEmitter ORE(&F);
2619
2620 // Ensure minimum depth of the loop nest to do the interchange.
2621 if (!hasSupportedLoopDepth(LoopList, ORE))
2622 return PreservedAnalyses::all();
2623 // Ensure computable loop nest.
2624 if (!isComputableLoopNest(SE: &AR.SE, LoopList)) {
2625 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
2626 return PreservedAnalyses::all();
2627 }
2628
2629 ORE.emit(RemarkBuilder: [&]() {
2630 return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence",
2631 LN.getOutermostLoop().getStartLoc(),
2632 LN.getOutermostLoop().getHeader())
2633 << "Computed dependence info, invoking the transform.";
2634 });
2635
2636 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
2637 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, &AR, &ORE).run(LN))
2638 return PreservedAnalyses::all();
2639 U.markLoopNestChanged(Changed: true);
2640 return getLoopPassPreservedAnalyses();
2641}
2642