1 | //===- LoopInterchange.cpp - Loop interchange pass-------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This Pass handles loop interchange transform. |
10 | // This pass interchanges loops to provide a more cache-friendly memory access |
11 | // patterns. |
12 | // |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #include "llvm/Transforms/Scalar/LoopInterchange.h" |
16 | #include "llvm/ADT/STLExtras.h" |
17 | #include "llvm/ADT/SmallVector.h" |
18 | #include "llvm/ADT/Statistic.h" |
19 | #include "llvm/ADT/StringRef.h" |
20 | #include "llvm/Analysis/DependenceAnalysis.h" |
21 | #include "llvm/Analysis/LoopCacheAnalysis.h" |
22 | #include "llvm/Analysis/LoopInfo.h" |
23 | #include "llvm/Analysis/LoopNestAnalysis.h" |
24 | #include "llvm/Analysis/LoopPass.h" |
25 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
26 | #include "llvm/Analysis/ScalarEvolution.h" |
27 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
28 | #include "llvm/IR/BasicBlock.h" |
29 | #include "llvm/IR/Constants.h" |
30 | #include "llvm/IR/DiagnosticInfo.h" |
31 | #include "llvm/IR/Dominators.h" |
32 | #include "llvm/IR/Function.h" |
33 | #include "llvm/IR/InstrTypes.h" |
34 | #include "llvm/IR/Instruction.h" |
35 | #include "llvm/IR/Instructions.h" |
36 | #include "llvm/IR/User.h" |
37 | #include "llvm/IR/Value.h" |
38 | #include "llvm/Support/Casting.h" |
39 | #include "llvm/Support/CommandLine.h" |
40 | #include "llvm/Support/Debug.h" |
41 | #include "llvm/Support/ErrorHandling.h" |
42 | #include "llvm/Support/raw_ostream.h" |
43 | #include "llvm/Transforms/Scalar/LoopPassManager.h" |
44 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
45 | #include "llvm/Transforms/Utils/LoopUtils.h" |
46 | #include <cassert> |
47 | #include <utility> |
48 | #include <vector> |
49 | |
50 | using namespace llvm; |
51 | |
52 | #define DEBUG_TYPE "loop-interchange" |
53 | |
54 | STATISTIC(LoopsInterchanged, "Number of loops interchanged" ); |
55 | |
56 | static cl::opt<int> LoopInterchangeCostThreshold( |
57 | "loop-interchange-threshold" , cl::init(Val: 0), cl::Hidden, |
58 | cl::desc("Interchange if you gain more than this number" )); |
59 | |
60 | namespace { |
61 | |
62 | using LoopVector = SmallVector<Loop *, 8>; |
63 | |
64 | // TODO: Check if we can use a sparse matrix here. |
65 | using CharMatrix = std::vector<std::vector<char>>; |
66 | |
67 | } // end anonymous namespace |
68 | |
69 | // Maximum number of dependencies that can be handled in the dependency matrix. |
70 | static const unsigned MaxMemInstrCount = 100; |
71 | |
72 | // Maximum loop depth supported. |
73 | static const unsigned MaxLoopNestDepth = 10; |
74 | |
75 | #ifdef DUMP_DEP_MATRICIES |
76 | static void printDepMatrix(CharMatrix &DepMatrix) { |
77 | for (auto &Row : DepMatrix) { |
78 | for (auto D : Row) |
79 | LLVM_DEBUG(dbgs() << D << " " ); |
80 | LLVM_DEBUG(dbgs() << "\n" ); |
81 | } |
82 | } |
83 | #endif |
84 | |
85 | static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, |
86 | Loop *L, DependenceInfo *DI, |
87 | ScalarEvolution *SE) { |
88 | using ValueVector = SmallVector<Value *, 16>; |
89 | |
90 | ValueVector MemInstr; |
91 | |
92 | // For each block. |
93 | for (BasicBlock *BB : L->blocks()) { |
94 | // Scan the BB and collect legal loads and stores. |
95 | for (Instruction &I : *BB) { |
96 | if (!isa<Instruction>(Val: I)) |
97 | return false; |
98 | if (auto *Ld = dyn_cast<LoadInst>(Val: &I)) { |
99 | if (!Ld->isSimple()) |
100 | return false; |
101 | MemInstr.push_back(Elt: &I); |
102 | } else if (auto *St = dyn_cast<StoreInst>(Val: &I)) { |
103 | if (!St->isSimple()) |
104 | return false; |
105 | MemInstr.push_back(Elt: &I); |
106 | } |
107 | } |
108 | } |
109 | |
110 | LLVM_DEBUG(dbgs() << "Found " << MemInstr.size() |
111 | << " Loads and Stores to analyze\n" ); |
112 | |
113 | ValueVector::iterator I, IE, J, JE; |
114 | |
115 | for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { |
116 | for (J = I, JE = MemInstr.end(); J != JE; ++J) { |
117 | std::vector<char> Dep; |
118 | Instruction *Src = cast<Instruction>(Val: *I); |
119 | Instruction *Dst = cast<Instruction>(Val: *J); |
120 | // Ignore Input dependencies. |
121 | if (isa<LoadInst>(Val: Src) && isa<LoadInst>(Val: Dst)) |
122 | continue; |
123 | // Track Output, Flow, and Anti dependencies. |
124 | if (auto D = DI->depends(Src, Dst, PossiblyLoopIndependent: true)) { |
125 | assert(D->isOrdered() && "Expected an output, flow or anti dep." ); |
126 | // If the direction vector is negative, normalize it to |
127 | // make it non-negative. |
128 | if (D->normalize(SE)) |
129 | LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n" ); |
130 | LLVM_DEBUG(StringRef DepType = |
131 | D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output" ; |
132 | dbgs() << "Found " << DepType |
133 | << " dependency between Src and Dst\n" |
134 | << " Src:" << *Src << "\n Dst:" << *Dst << '\n'); |
135 | unsigned Levels = D->getLevels(); |
136 | char Direction; |
137 | for (unsigned II = 1; II <= Levels; ++II) { |
138 | if (D->isScalar(Level: II)) { |
139 | Direction = 'S'; |
140 | Dep.push_back(x: Direction); |
141 | } else { |
142 | unsigned Dir = D->getDirection(Level: II); |
143 | if (Dir == Dependence::DVEntry::LT || |
144 | Dir == Dependence::DVEntry::LE) |
145 | Direction = '<'; |
146 | else if (Dir == Dependence::DVEntry::GT || |
147 | Dir == Dependence::DVEntry::GE) |
148 | Direction = '>'; |
149 | else if (Dir == Dependence::DVEntry::EQ) |
150 | Direction = '='; |
151 | else |
152 | Direction = '*'; |
153 | Dep.push_back(x: Direction); |
154 | } |
155 | } |
156 | while (Dep.size() != Level) { |
157 | Dep.push_back(x: 'I'); |
158 | } |
159 | |
160 | DepMatrix.push_back(x: Dep); |
161 | if (DepMatrix.size() > MaxMemInstrCount) { |
162 | LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount |
163 | << " dependencies inside loop\n" ); |
164 | return false; |
165 | } |
166 | } |
167 | } |
168 | } |
169 | |
170 | return true; |
171 | } |
172 | |
173 | // A loop is moved from index 'from' to an index 'to'. Update the Dependence |
174 | // matrix by exchanging the two columns. |
175 | static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, |
176 | unsigned ToIndx) { |
177 | for (unsigned I = 0, E = DepMatrix.size(); I < E; ++I) |
178 | std::swap(a&: DepMatrix[I][ToIndx], b&: DepMatrix[I][FromIndx]); |
179 | } |
180 | |
181 | // After interchanging, check if the direction vector is valid. |
182 | // [Theorem] A permutation of the loops in a perfect nest is legal if and only |
183 | // if the direction matrix, after the same permutation is applied to its |
184 | // columns, has no ">" direction as the leftmost non-"=" direction in any row. |
185 | static bool isLexicographicallyPositive(std::vector<char> &DV) { |
186 | for (unsigned char Direction : DV) { |
187 | if (Direction == '<') |
188 | return true; |
189 | if (Direction == '>' || Direction == '*') |
190 | return false; |
191 | } |
192 | return true; |
193 | } |
194 | |
195 | // Checks if it is legal to interchange 2 loops. |
196 | static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, |
197 | unsigned InnerLoopId, |
198 | unsigned OuterLoopId) { |
199 | unsigned NumRows = DepMatrix.size(); |
200 | std::vector<char> Cur; |
201 | // For each row check if it is valid to interchange. |
202 | for (unsigned Row = 0; Row < NumRows; ++Row) { |
203 | // Create temporary DepVector check its lexicographical order |
204 | // before and after swapping OuterLoop vs InnerLoop |
205 | Cur = DepMatrix[Row]; |
206 | if (!isLexicographicallyPositive(DV&: Cur)) |
207 | return false; |
208 | std::swap(a&: Cur[InnerLoopId], b&: Cur[OuterLoopId]); |
209 | if (!isLexicographicallyPositive(DV&: Cur)) |
210 | return false; |
211 | } |
212 | return true; |
213 | } |
214 | |
215 | static void populateWorklist(Loop &L, LoopVector &LoopList) { |
216 | LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: " |
217 | << L.getHeader()->getParent()->getName() << " Loop: %" |
218 | << L.getHeader()->getName() << '\n'); |
219 | assert(LoopList.empty() && "LoopList should initially be empty!" ); |
220 | Loop *CurrentLoop = &L; |
221 | const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); |
222 | while (!Vec->empty()) { |
223 | // The current loop has multiple subloops in it hence it is not tightly |
224 | // nested. |
225 | // Discard all loops above it added into Worklist. |
226 | if (Vec->size() != 1) { |
227 | LoopList = {}; |
228 | return; |
229 | } |
230 | |
231 | LoopList.push_back(Elt: CurrentLoop); |
232 | CurrentLoop = Vec->front(); |
233 | Vec = &CurrentLoop->getSubLoops(); |
234 | } |
235 | LoopList.push_back(Elt: CurrentLoop); |
236 | } |
237 | |
238 | namespace { |
239 | |
240 | /// LoopInterchangeLegality checks if it is legal to interchange the loop. |
241 | class LoopInterchangeLegality { |
242 | public: |
243 | (Loop *Outer, Loop *Inner, ScalarEvolution *SE, |
244 | OptimizationRemarkEmitter *ORE) |
245 | : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} |
246 | |
247 | /// Check if the loops can be interchanged. |
248 | bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, |
249 | CharMatrix &DepMatrix); |
250 | |
251 | /// Discover induction PHIs in the header of \p L. Induction |
252 | /// PHIs are added to \p Inductions. |
253 | bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions); |
254 | |
255 | /// Check if the loop structure is understood. We do not handle triangular |
256 | /// loops for now. |
257 | bool isLoopStructureUnderstood(); |
258 | |
259 | bool currentLimitations(); |
260 | |
261 | const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const { |
262 | return OuterInnerReductions; |
263 | } |
264 | |
265 | const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const { |
266 | return InnerLoopInductions; |
267 | } |
268 | |
269 | private: |
270 | bool tightlyNested(Loop *Outer, Loop *Inner); |
271 | bool containsUnsafeInstructions(BasicBlock *BB); |
272 | |
273 | /// Discover induction and reduction PHIs in the header of \p L. Induction |
274 | /// PHIs are added to \p Inductions, reductions are added to |
275 | /// OuterInnerReductions. When the outer loop is passed, the inner loop needs |
276 | /// to be passed as \p InnerLoop. |
277 | bool findInductionAndReductions(Loop *L, |
278 | SmallVector<PHINode *, 8> &Inductions, |
279 | Loop *InnerLoop); |
280 | |
281 | Loop *OuterLoop; |
282 | Loop *InnerLoop; |
283 | |
284 | ScalarEvolution *SE; |
285 | |
286 | /// Interface to emit optimization remarks. |
287 | OptimizationRemarkEmitter *ORE; |
288 | |
289 | /// Set of reduction PHIs taking part of a reduction across the inner and |
290 | /// outer loop. |
291 | SmallPtrSet<PHINode *, 4> OuterInnerReductions; |
292 | |
293 | /// Set of inner loop induction PHIs |
294 | SmallVector<PHINode *, 8> InnerLoopInductions; |
295 | }; |
296 | |
297 | /// LoopInterchangeProfitability checks if it is profitable to interchange the |
298 | /// loop. |
299 | class LoopInterchangeProfitability { |
300 | public: |
301 | (Loop *Outer, Loop *Inner, ScalarEvolution *SE, |
302 | OptimizationRemarkEmitter *ORE) |
303 | : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} |
304 | |
305 | /// Check if the loop interchange is profitable. |
306 | bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, |
307 | unsigned InnerLoopId, unsigned OuterLoopId, |
308 | CharMatrix &DepMatrix, |
309 | const DenseMap<const Loop *, unsigned> &CostMap, |
310 | std::unique_ptr<CacheCost> &CC); |
311 | |
312 | private: |
313 | int getInstrOrderCost(); |
314 | std::optional<bool> isProfitablePerLoopCacheAnalysis( |
315 | const DenseMap<const Loop *, unsigned> &CostMap, |
316 | std::unique_ptr<CacheCost> &CC); |
317 | std::optional<bool> isProfitablePerInstrOrderCost(); |
318 | std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId, |
319 | unsigned OuterLoopId, |
320 | CharMatrix &DepMatrix); |
321 | Loop *OuterLoop; |
322 | Loop *InnerLoop; |
323 | |
324 | /// Scev analysis. |
325 | ScalarEvolution *SE; |
326 | |
327 | /// Interface to emit optimization remarks. |
328 | OptimizationRemarkEmitter *ORE; |
329 | }; |
330 | |
331 | /// LoopInterchangeTransform interchanges the loop. |
332 | class LoopInterchangeTransform { |
333 | public: |
334 | LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, |
335 | LoopInfo *LI, DominatorTree *DT, |
336 | const LoopInterchangeLegality &LIL) |
337 | : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {} |
338 | |
339 | /// Interchange OuterLoop and InnerLoop. |
340 | bool transform(); |
341 | void restructureLoops(Loop *NewInner, Loop *NewOuter, |
342 | BasicBlock *, |
343 | BasicBlock *); |
344 | void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); |
345 | |
346 | private: |
347 | bool adjustLoopLinks(); |
348 | bool adjustLoopBranches(); |
349 | |
350 | Loop *OuterLoop; |
351 | Loop *InnerLoop; |
352 | |
353 | /// Scev analysis. |
354 | ScalarEvolution *SE; |
355 | |
356 | LoopInfo *LI; |
357 | DominatorTree *DT; |
358 | |
359 | const LoopInterchangeLegality &LIL; |
360 | }; |
361 | |
362 | struct LoopInterchange { |
363 | ScalarEvolution *SE = nullptr; |
364 | LoopInfo *LI = nullptr; |
365 | DependenceInfo *DI = nullptr; |
366 | DominatorTree *DT = nullptr; |
367 | std::unique_ptr<CacheCost> CC = nullptr; |
368 | |
369 | /// Interface to emit optimization remarks. |
370 | OptimizationRemarkEmitter *ORE; |
371 | |
372 | LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI, |
373 | DominatorTree *DT, std::unique_ptr<CacheCost> &CC, |
374 | OptimizationRemarkEmitter *ORE) |
375 | : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {} |
376 | |
377 | bool run(Loop *L) { |
378 | if (L->getParentLoop()) |
379 | return false; |
380 | SmallVector<Loop *, 8> LoopList; |
381 | populateWorklist(L&: *L, LoopList); |
382 | return processLoopList(LoopList); |
383 | } |
384 | |
385 | bool run(LoopNest &LN) { |
386 | SmallVector<Loop *, 8> LoopList(LN.getLoops().begin(), LN.getLoops().end()); |
387 | for (unsigned I = 1; I < LoopList.size(); ++I) |
388 | if (LoopList[I]->getParentLoop() != LoopList[I - 1]) |
389 | return false; |
390 | return processLoopList(LoopList); |
391 | } |
392 | |
393 | bool isComputableLoopNest(ArrayRef<Loop *> LoopList) { |
394 | for (Loop *L : LoopList) { |
395 | const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); |
396 | if (isa<SCEVCouldNotCompute>(Val: ExitCountOuter)) { |
397 | LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n" ); |
398 | return false; |
399 | } |
400 | if (L->getNumBackEdges() != 1) { |
401 | LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n" ); |
402 | return false; |
403 | } |
404 | if (!L->getExitingBlock()) { |
405 | LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n" ); |
406 | return false; |
407 | } |
408 | } |
409 | return true; |
410 | } |
411 | |
412 | unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) { |
413 | // TODO: Add a better heuristic to select the loop to be interchanged based |
414 | // on the dependence matrix. Currently we select the innermost loop. |
415 | return LoopList.size() - 1; |
416 | } |
417 | |
418 | bool processLoopList(SmallVectorImpl<Loop *> &LoopList) { |
419 | bool Changed = false; |
420 | unsigned LoopNestDepth = LoopList.size(); |
421 | if (LoopNestDepth < 2) { |
422 | LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n" ); |
423 | return false; |
424 | } |
425 | if (LoopNestDepth > MaxLoopNestDepth) { |
426 | LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than " |
427 | << MaxLoopNestDepth << "\n" ); |
428 | return false; |
429 | } |
430 | if (!isComputableLoopNest(LoopList)) { |
431 | LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n" ); |
432 | return false; |
433 | } |
434 | |
435 | LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth |
436 | << "\n" ); |
437 | |
438 | CharMatrix DependencyMatrix; |
439 | Loop *OuterMostLoop = *(LoopList.begin()); |
440 | if (!populateDependencyMatrix(DepMatrix&: DependencyMatrix, Level: LoopNestDepth, |
441 | L: OuterMostLoop, DI, SE)) { |
442 | LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n" ); |
443 | return false; |
444 | } |
445 | #ifdef DUMP_DEP_MATRICIES |
446 | LLVM_DEBUG(dbgs() << "Dependence before interchange\n" ); |
447 | printDepMatrix(DependencyMatrix); |
448 | #endif |
449 | |
450 | // Get the Outermost loop exit. |
451 | BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock(); |
452 | if (!LoopNestExit) { |
453 | LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block" ); |
454 | return false; |
455 | } |
456 | |
457 | unsigned SelecLoopId = selectLoopForInterchange(LoopList); |
458 | // Obtain the loop vector returned from loop cache analysis beforehand, |
459 | // and put each <Loop, index> pair into a map for constant time query |
460 | // later. Indices in loop vector reprsent the optimal order of the |
461 | // corresponding loop, e.g., given a loopnest with depth N, index 0 |
462 | // indicates the loop should be placed as the outermost loop and index N |
463 | // indicates the loop should be placed as the innermost loop. |
464 | // |
465 | // For the old pass manager CacheCost would be null. |
466 | DenseMap<const Loop *, unsigned> CostMap; |
467 | if (CC != nullptr) { |
468 | const auto &LoopCosts = CC->getLoopCosts(); |
469 | for (unsigned i = 0; i < LoopCosts.size(); i++) { |
470 | CostMap[LoopCosts[i].first] = i; |
471 | } |
472 | } |
473 | // We try to achieve the globally optimal memory access for the loopnest, |
474 | // and do interchange based on a bubble-sort fasion. We start from |
475 | // the innermost loop, move it outwards to the best possible position |
476 | // and repeat this process. |
477 | for (unsigned j = SelecLoopId; j > 0; j--) { |
478 | bool ChangedPerIter = false; |
479 | for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) { |
480 | bool Interchanged = processLoop(InnerLoop: LoopList[i], OuterLoop: LoopList[i - 1], InnerLoopId: i, OuterLoopId: i - 1, |
481 | DependencyMatrix, CostMap); |
482 | if (!Interchanged) |
483 | continue; |
484 | // Loops interchanged, update LoopList accordingly. |
485 | std::swap(a&: LoopList[i - 1], b&: LoopList[i]); |
486 | // Update the DependencyMatrix |
487 | interChangeDependencies(DepMatrix&: DependencyMatrix, FromIndx: i, ToIndx: i - 1); |
488 | #ifdef DUMP_DEP_MATRICIES |
489 | LLVM_DEBUG(dbgs() << "Dependence after interchange\n" ); |
490 | printDepMatrix(DependencyMatrix); |
491 | #endif |
492 | ChangedPerIter |= Interchanged; |
493 | Changed |= Interchanged; |
494 | } |
495 | // Early abort if there was no interchange during an entire round of |
496 | // moving loops outwards. |
497 | if (!ChangedPerIter) |
498 | break; |
499 | } |
500 | return Changed; |
501 | } |
502 | |
503 | bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId, |
504 | unsigned OuterLoopId, |
505 | std::vector<std::vector<char>> &DependencyMatrix, |
506 | const DenseMap<const Loop *, unsigned> &CostMap) { |
507 | LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId |
508 | << " and OuterLoopId = " << OuterLoopId << "\n" ); |
509 | LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); |
510 | if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DepMatrix&: DependencyMatrix)) { |
511 | LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n" ); |
512 | return false; |
513 | } |
514 | LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n" ); |
515 | LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); |
516 | if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, |
517 | DepMatrix&: DependencyMatrix, CostMap, CC)) { |
518 | LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n" ); |
519 | return false; |
520 | } |
521 | |
522 | ORE->emit(RemarkBuilder: [&]() { |
523 | return OptimizationRemark(DEBUG_TYPE, "Interchanged" , |
524 | InnerLoop->getStartLoc(), |
525 | InnerLoop->getHeader()) |
526 | << "Loop interchanged with enclosing loop." ; |
527 | }); |
528 | |
529 | LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL); |
530 | LIT.transform(); |
531 | LLVM_DEBUG(dbgs() << "Loops interchanged.\n" ); |
532 | LoopsInterchanged++; |
533 | |
534 | llvm::formLCSSARecursively(L&: *OuterLoop, DT: *DT, LI, SE); |
535 | return true; |
536 | } |
537 | }; |
538 | |
539 | } // end anonymous namespace |
540 | |
541 | bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) { |
542 | return any_of(Range&: *BB, P: [](const Instruction &I) { |
543 | return I.mayHaveSideEffects() || I.mayReadFromMemory(); |
544 | }); |
545 | } |
546 | |
547 | bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { |
548 | BasicBlock * = OuterLoop->getHeader(); |
549 | BasicBlock * = InnerLoop->getLoopPreheader(); |
550 | BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); |
551 | |
552 | LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n" ); |
553 | |
554 | // A perfectly nested loop will not have any branch in between the outer and |
555 | // inner block i.e. outer header will branch to either inner preheader and |
556 | // outerloop latch. |
557 | BranchInst * = |
558 | dyn_cast<BranchInst>(Val: OuterLoopHeader->getTerminator()); |
559 | if (!OuterLoopHeaderBI) |
560 | return false; |
561 | |
562 | for (BasicBlock *Succ : successors(I: OuterLoopHeaderBI)) |
563 | if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() && |
564 | Succ != OuterLoopLatch) |
565 | return false; |
566 | |
567 | LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n" ); |
568 | // We do not have any basic block in between now make sure the outer header |
569 | // and outer loop latch doesn't contain any unsafe instructions. |
570 | if (containsUnsafeInstructions(BB: OuterLoopHeader) || |
571 | containsUnsafeInstructions(BB: OuterLoopLatch)) |
572 | return false; |
573 | |
574 | // Also make sure the inner loop preheader does not contain any unsafe |
575 | // instructions. Note that all instructions in the preheader will be moved to |
576 | // the outer loop header when interchanging. |
577 | if (InnerLoopPreHeader != OuterLoopHeader && |
578 | containsUnsafeInstructions(BB: InnerLoopPreHeader)) |
579 | return false; |
580 | |
581 | BasicBlock *InnerLoopExit = InnerLoop->getExitBlock(); |
582 | // Ensure the inner loop exit block flows to the outer loop latch possibly |
583 | // through empty blocks. |
584 | const BasicBlock &SuccInner = |
585 | LoopNest::skipEmptyBlockUntil(From: InnerLoopExit, End: OuterLoopLatch); |
586 | if (&SuccInner != OuterLoopLatch) { |
587 | LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit |
588 | << " does not lead to the outer loop latch.\n" ;); |
589 | return false; |
590 | } |
591 | // The inner loop exit block does flow to the outer loop latch and not some |
592 | // other BBs, now make sure it contains safe instructions, since it will be |
593 | // moved into the (new) inner loop after interchange. |
594 | if (containsUnsafeInstructions(BB: InnerLoopExit)) |
595 | return false; |
596 | |
597 | LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n" ); |
598 | // We have a perfect loop nest. |
599 | return true; |
600 | } |
601 | |
602 | bool LoopInterchangeLegality::isLoopStructureUnderstood() { |
603 | BasicBlock * = InnerLoop->getLoopPreheader(); |
604 | for (PHINode *InnerInduction : InnerLoopInductions) { |
605 | unsigned Num = InnerInduction->getNumOperands(); |
606 | for (unsigned i = 0; i < Num; ++i) { |
607 | Value *Val = InnerInduction->getOperand(i_nocapture: i); |
608 | if (isa<Constant>(Val)) |
609 | continue; |
610 | Instruction *I = dyn_cast<Instruction>(Val); |
611 | if (!I) |
612 | return false; |
613 | // TODO: Handle triangular loops. |
614 | // e.g. for(int i=0;i<N;i++) |
615 | // for(int j=i;j<N;j++) |
616 | unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i); |
617 | if (InnerInduction->getIncomingBlock(i: IncomBlockIndx) == |
618 | InnerLoopPreheader && |
619 | !OuterLoop->isLoopInvariant(V: I)) { |
620 | return false; |
621 | } |
622 | } |
623 | } |
624 | |
625 | // TODO: Handle triangular loops of another form. |
626 | // e.g. for(int i=0;i<N;i++) |
627 | // for(int j=0;j<i;j++) |
628 | // or, |
629 | // for(int i=0;i<N;i++) |
630 | // for(int j=0;j*i<N;j++) |
631 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); |
632 | BranchInst *InnerLoopLatchBI = |
633 | dyn_cast<BranchInst>(Val: InnerLoopLatch->getTerminator()); |
634 | if (!InnerLoopLatchBI->isConditional()) |
635 | return false; |
636 | if (CmpInst *InnerLoopCmp = |
637 | dyn_cast<CmpInst>(Val: InnerLoopLatchBI->getCondition())) { |
638 | Value *Op0 = InnerLoopCmp->getOperand(i_nocapture: 0); |
639 | Value *Op1 = InnerLoopCmp->getOperand(i_nocapture: 1); |
640 | |
641 | // LHS and RHS of the inner loop exit condition, e.g., |
642 | // in "for(int j=0;j<i;j++)", LHS is j and RHS is i. |
643 | Value *Left = nullptr; |
644 | Value *Right = nullptr; |
645 | |
646 | // Check if V only involves inner loop induction variable. |
647 | // Return true if V is InnerInduction, or a cast from |
648 | // InnerInduction, or a binary operator that involves |
649 | // InnerInduction and a constant. |
650 | std::function<bool(Value *)> IsPathToInnerIndVar; |
651 | IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool { |
652 | if (llvm::is_contained(Range&: InnerLoopInductions, Element: V)) |
653 | return true; |
654 | if (isa<Constant>(Val: V)) |
655 | return true; |
656 | const Instruction *I = dyn_cast<Instruction>(Val: V); |
657 | if (!I) |
658 | return false; |
659 | if (isa<CastInst>(Val: I)) |
660 | return IsPathToInnerIndVar(I->getOperand(i: 0)); |
661 | if (isa<BinaryOperator>(Val: I)) |
662 | return IsPathToInnerIndVar(I->getOperand(i: 0)) && |
663 | IsPathToInnerIndVar(I->getOperand(i: 1)); |
664 | return false; |
665 | }; |
666 | |
667 | // In case of multiple inner loop indvars, it is okay if LHS and RHS |
668 | // are both inner indvar related variables. |
669 | if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1)) |
670 | return true; |
671 | |
672 | // Otherwise we check if the cmp instruction compares an inner indvar |
673 | // related variable (Left) with a outer loop invariant (Right). |
674 | if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Val: Op0)) { |
675 | Left = Op0; |
676 | Right = Op1; |
677 | } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Val: Op1)) { |
678 | Left = Op1; |
679 | Right = Op0; |
680 | } |
681 | |
682 | if (Left == nullptr) |
683 | return false; |
684 | |
685 | const SCEV *S = SE->getSCEV(V: Right); |
686 | if (!SE->isLoopInvariant(S, L: OuterLoop)) |
687 | return false; |
688 | } |
689 | |
690 | return true; |
691 | } |
692 | |
693 | // If SV is a LCSSA PHI node with a single incoming value, return the incoming |
694 | // value. |
695 | static Value *followLCSSA(Value *SV) { |
696 | PHINode *PHI = dyn_cast<PHINode>(Val: SV); |
697 | if (!PHI) |
698 | return SV; |
699 | |
700 | if (PHI->getNumIncomingValues() != 1) |
701 | return SV; |
702 | return followLCSSA(SV: PHI->getIncomingValue(i: 0)); |
703 | } |
704 | |
705 | // Check V's users to see if it is involved in a reduction in L. |
706 | static PHINode *findInnerReductionPhi(Loop *L, Value *V) { |
707 | // Reduction variables cannot be constants. |
708 | if (isa<Constant>(Val: V)) |
709 | return nullptr; |
710 | |
711 | for (Value *User : V->users()) { |
712 | if (PHINode *PHI = dyn_cast<PHINode>(Val: User)) { |
713 | if (PHI->getNumIncomingValues() == 1) |
714 | continue; |
715 | RecurrenceDescriptor RD; |
716 | if (RecurrenceDescriptor::isReductionPHI(Phi: PHI, TheLoop: L, RedDes&: RD)) { |
717 | // Detect floating point reduction only when it can be reordered. |
718 | if (RD.getExactFPMathInst() != nullptr) |
719 | return nullptr; |
720 | return PHI; |
721 | } |
722 | return nullptr; |
723 | } |
724 | } |
725 | |
726 | return nullptr; |
727 | } |
728 | |
729 | bool LoopInterchangeLegality::findInductionAndReductions( |
730 | Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) { |
731 | if (!L->getLoopLatch() || !L->getLoopPredecessor()) |
732 | return false; |
733 | for (PHINode &PHI : L->getHeader()->phis()) { |
734 | InductionDescriptor ID; |
735 | if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID)) |
736 | Inductions.push_back(Elt: &PHI); |
737 | else { |
738 | // PHIs in inner loops need to be part of a reduction in the outer loop, |
739 | // discovered when checking the PHIs of the outer loop earlier. |
740 | if (!InnerLoop) { |
741 | if (!OuterInnerReductions.count(Ptr: &PHI)) { |
742 | LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions " |
743 | "across the outer loop.\n" ); |
744 | return false; |
745 | } |
746 | } else { |
747 | assert(PHI.getNumIncomingValues() == 2 && |
748 | "Phis in loop header should have exactly 2 incoming values" ); |
749 | // Check if we have a PHI node in the outer loop that has a reduction |
750 | // result from the inner loop as an incoming value. |
751 | Value *V = followLCSSA(SV: PHI.getIncomingValueForBlock(BB: L->getLoopLatch())); |
752 | PHINode *InnerRedPhi = findInnerReductionPhi(L: InnerLoop, V); |
753 | if (!InnerRedPhi || |
754 | !llvm::is_contained(Range: InnerRedPhi->incoming_values(), Element: &PHI)) { |
755 | LLVM_DEBUG( |
756 | dbgs() |
757 | << "Failed to recognize PHI as an induction or reduction.\n" ); |
758 | return false; |
759 | } |
760 | OuterInnerReductions.insert(Ptr: &PHI); |
761 | OuterInnerReductions.insert(Ptr: InnerRedPhi); |
762 | } |
763 | } |
764 | } |
765 | return true; |
766 | } |
767 | |
768 | // This function indicates the current limitations in the transform as a result |
769 | // of which we do not proceed. |
770 | bool LoopInterchangeLegality::currentLimitations() { |
771 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); |
772 | |
773 | // transform currently expects the loop latches to also be the exiting |
774 | // blocks. |
775 | if (InnerLoop->getExitingBlock() != InnerLoopLatch || |
776 | OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() || |
777 | !isa<BranchInst>(Val: InnerLoopLatch->getTerminator()) || |
778 | !isa<BranchInst>(Val: OuterLoop->getLoopLatch()->getTerminator())) { |
779 | LLVM_DEBUG( |
780 | dbgs() << "Loops where the latch is not the exiting block are not" |
781 | << " supported currently.\n" ); |
782 | ORE->emit(RemarkBuilder: [&]() { |
783 | return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch" , |
784 | OuterLoop->getStartLoc(), |
785 | OuterLoop->getHeader()) |
786 | << "Loops where the latch is not the exiting block cannot be" |
787 | " interchange currently." ; |
788 | }); |
789 | return true; |
790 | } |
791 | |
792 | SmallVector<PHINode *, 8> Inductions; |
793 | if (!findInductionAndReductions(L: OuterLoop, Inductions, InnerLoop)) { |
794 | LLVM_DEBUG( |
795 | dbgs() << "Only outer loops with induction or reduction PHI nodes " |
796 | << "are supported currently.\n" ); |
797 | ORE->emit(RemarkBuilder: [&]() { |
798 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter" , |
799 | OuterLoop->getStartLoc(), |
800 | OuterLoop->getHeader()) |
801 | << "Only outer loops with induction or reduction PHI nodes can be" |
802 | " interchanged currently." ; |
803 | }); |
804 | return true; |
805 | } |
806 | |
807 | Inductions.clear(); |
808 | // For multi-level loop nests, make sure that all phi nodes for inner loops |
809 | // at all levels can be recognized as a induction or reduction phi. Bail out |
810 | // if a phi node at a certain nesting level cannot be properly recognized. |
811 | Loop *CurLevelLoop = OuterLoop; |
812 | while (!CurLevelLoop->getSubLoops().empty()) { |
813 | // We already made sure that the loop nest is tightly nested. |
814 | CurLevelLoop = CurLevelLoop->getSubLoops().front(); |
815 | if (!findInductionAndReductions(L: CurLevelLoop, Inductions, InnerLoop: nullptr)) { |
816 | LLVM_DEBUG( |
817 | dbgs() << "Only inner loops with induction or reduction PHI nodes " |
818 | << "are supported currently.\n" ); |
819 | ORE->emit(RemarkBuilder: [&]() { |
820 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner" , |
821 | CurLevelLoop->getStartLoc(), |
822 | CurLevelLoop->getHeader()) |
823 | << "Only inner loops with induction or reduction PHI nodes can be" |
824 | " interchange currently." ; |
825 | }); |
826 | return true; |
827 | } |
828 | } |
829 | |
830 | // TODO: Triangular loops are not handled for now. |
831 | if (!isLoopStructureUnderstood()) { |
832 | LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n" ); |
833 | ORE->emit(RemarkBuilder: [&]() { |
834 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner" , |
835 | InnerLoop->getStartLoc(), |
836 | InnerLoop->getHeader()) |
837 | << "Inner loop structure not understood currently." ; |
838 | }); |
839 | return true; |
840 | } |
841 | |
842 | return false; |
843 | } |
844 | |
845 | bool LoopInterchangeLegality::findInductions( |
846 | Loop *L, SmallVectorImpl<PHINode *> &Inductions) { |
847 | for (PHINode &PHI : L->getHeader()->phis()) { |
848 | InductionDescriptor ID; |
849 | if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID)) |
850 | Inductions.push_back(Elt: &PHI); |
851 | } |
852 | return !Inductions.empty(); |
853 | } |
854 | |
855 | // We currently only support LCSSA PHI nodes in the inner loop exit, if their |
856 | // users are either reduction PHIs or PHIs outside the outer loop (which means |
857 | // the we are only interested in the final value after the loop). |
858 | static bool |
859 | areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, |
860 | SmallPtrSetImpl<PHINode *> &Reductions) { |
861 | BasicBlock *InnerExit = OuterL->getUniqueExitBlock(); |
862 | for (PHINode &PHI : InnerExit->phis()) { |
863 | // Reduction lcssa phi will have only 1 incoming block that from loop latch. |
864 | if (PHI.getNumIncomingValues() > 1) |
865 | return false; |
866 | if (any_of(Range: PHI.users(), P: [&Reductions, OuterL](User *U) { |
867 | PHINode *PN = dyn_cast<PHINode>(Val: U); |
868 | return !PN || |
869 | (!Reductions.count(Ptr: PN) && OuterL->contains(BB: PN->getParent())); |
870 | })) { |
871 | return false; |
872 | } |
873 | } |
874 | return true; |
875 | } |
876 | |
877 | // We currently support LCSSA PHI nodes in the outer loop exit, if their |
878 | // incoming values do not come from the outer loop latch or if the |
879 | // outer loop latch has a single predecessor. In that case, the value will |
880 | // be available if both the inner and outer loop conditions are true, which |
881 | // will still be true after interchanging. If we have multiple predecessor, |
882 | // that may not be the case, e.g. because the outer loop latch may be executed |
883 | // if the inner loop is not executed. |
884 | static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { |
885 | BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); |
886 | for (PHINode &PHI : LoopNestExit->phis()) { |
887 | for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) { |
888 | Instruction *IncomingI = dyn_cast<Instruction>(Val: PHI.getIncomingValue(i)); |
889 | if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) |
890 | continue; |
891 | |
892 | // The incoming value is defined in the outer loop latch. Currently we |
893 | // only support that in case the outer loop latch has a single predecessor. |
894 | // This guarantees that the outer loop latch is executed if and only if |
895 | // the inner loop is executed (because tightlyNested() guarantees that the |
896 | // outer loop header only branches to the inner loop or the outer loop |
897 | // latch). |
898 | // FIXME: We could weaken this logic and allow multiple predecessors, |
899 | // if the values are produced outside the loop latch. We would need |
900 | // additional logic to update the PHI nodes in the exit block as |
901 | // well. |
902 | if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) |
903 | return false; |
904 | } |
905 | } |
906 | return true; |
907 | } |
908 | |
909 | // In case of multi-level nested loops, it may occur that lcssa phis exist in |
910 | // the latch of InnerLoop, i.e., when defs of the incoming values are further |
911 | // inside the loopnest. Sometimes those incoming values are not available |
912 | // after interchange, since the original inner latch will become the new outer |
913 | // latch which may have predecessor paths that do not include those incoming |
914 | // values. |
915 | // TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of |
916 | // multi-level loop nests. |
917 | static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { |
918 | if (InnerLoop->getSubLoops().empty()) |
919 | return true; |
920 | // If the original outer latch has only one predecessor, then values defined |
921 | // further inside the looploop, e.g., in the innermost loop, will be available |
922 | // at the new outer latch after interchange. |
923 | if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr) |
924 | return true; |
925 | |
926 | // The outer latch has more than one predecessors, i.e., the inner |
927 | // exit and the inner header. |
928 | // PHI nodes in the inner latch are lcssa phis where the incoming values |
929 | // are defined further inside the loopnest. Check if those phis are used |
930 | // in the original inner latch. If that is the case then bail out since |
931 | // those incoming values may not be available at the new outer latch. |
932 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); |
933 | for (PHINode &PHI : InnerLoopLatch->phis()) { |
934 | for (auto *U : PHI.users()) { |
935 | Instruction *UI = cast<Instruction>(Val: U); |
936 | if (InnerLoopLatch == UI->getParent()) |
937 | return false; |
938 | } |
939 | } |
940 | return true; |
941 | } |
942 | |
943 | bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, |
944 | unsigned OuterLoopId, |
945 | CharMatrix &DepMatrix) { |
946 | if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { |
947 | LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId |
948 | << " and OuterLoopId = " << OuterLoopId |
949 | << " due to dependence\n" ); |
950 | ORE->emit(RemarkBuilder: [&]() { |
951 | return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence" , |
952 | InnerLoop->getStartLoc(), |
953 | InnerLoop->getHeader()) |
954 | << "Cannot interchange loops due to dependences." ; |
955 | }); |
956 | return false; |
957 | } |
958 | // Check if outer and inner loop contain legal instructions only. |
959 | for (auto *BB : OuterLoop->blocks()) |
960 | for (Instruction &I : BB->instructionsWithoutDebug()) |
961 | if (CallInst *CI = dyn_cast<CallInst>(Val: &I)) { |
962 | // readnone functions do not prevent interchanging. |
963 | if (CI->onlyWritesMemory()) |
964 | continue; |
965 | LLVM_DEBUG( |
966 | dbgs() << "Loops with call instructions cannot be interchanged " |
967 | << "safely." ); |
968 | ORE->emit(RemarkBuilder: [&]() { |
969 | return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst" , |
970 | CI->getDebugLoc(), |
971 | CI->getParent()) |
972 | << "Cannot interchange loops due to call instruction." ; |
973 | }); |
974 | |
975 | return false; |
976 | } |
977 | |
978 | if (!findInductions(L: InnerLoop, Inductions&: InnerLoopInductions)) { |
979 | LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n" ); |
980 | return false; |
981 | } |
982 | |
983 | if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) { |
984 | LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n" ); |
985 | ORE->emit(RemarkBuilder: [&]() { |
986 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI" , |
987 | InnerLoop->getStartLoc(), |
988 | InnerLoop->getHeader()) |
989 | << "Cannot interchange loops because unsupported PHI nodes found " |
990 | "in inner loop latch." ; |
991 | }); |
992 | return false; |
993 | } |
994 | |
995 | // TODO: The loops could not be interchanged due to current limitations in the |
996 | // transform module. |
997 | if (currentLimitations()) { |
998 | LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n" ); |
999 | return false; |
1000 | } |
1001 | |
1002 | // Check if the loops are tightly nested. |
1003 | if (!tightlyNested(OuterLoop, InnerLoop)) { |
1004 | LLVM_DEBUG(dbgs() << "Loops not tightly nested\n" ); |
1005 | ORE->emit(RemarkBuilder: [&]() { |
1006 | return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested" , |
1007 | InnerLoop->getStartLoc(), |
1008 | InnerLoop->getHeader()) |
1009 | << "Cannot interchange loops because they are not tightly " |
1010 | "nested." ; |
1011 | }); |
1012 | return false; |
1013 | } |
1014 | |
1015 | if (!areInnerLoopExitPHIsSupported(InnerL: OuterLoop, OuterL: InnerLoop, |
1016 | Reductions&: OuterInnerReductions)) { |
1017 | LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n" ); |
1018 | ORE->emit(RemarkBuilder: [&]() { |
1019 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI" , |
1020 | InnerLoop->getStartLoc(), |
1021 | InnerLoop->getHeader()) |
1022 | << "Found unsupported PHI node in loop exit." ; |
1023 | }); |
1024 | return false; |
1025 | } |
1026 | |
1027 | if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) { |
1028 | LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n" ); |
1029 | ORE->emit(RemarkBuilder: [&]() { |
1030 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI" , |
1031 | OuterLoop->getStartLoc(), |
1032 | OuterLoop->getHeader()) |
1033 | << "Found unsupported PHI node in loop exit." ; |
1034 | }); |
1035 | return false; |
1036 | } |
1037 | |
1038 | return true; |
1039 | } |
1040 | |
1041 | int LoopInterchangeProfitability::getInstrOrderCost() { |
1042 | unsigned GoodOrder, BadOrder; |
1043 | BadOrder = GoodOrder = 0; |
1044 | for (BasicBlock *BB : InnerLoop->blocks()) { |
1045 | for (Instruction &Ins : *BB) { |
1046 | if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &Ins)) { |
1047 | unsigned NumOp = GEP->getNumOperands(); |
1048 | bool FoundInnerInduction = false; |
1049 | bool FoundOuterInduction = false; |
1050 | for (unsigned i = 0; i < NumOp; ++i) { |
1051 | // Skip operands that are not SCEV-able. |
1052 | if (!SE->isSCEVable(Ty: GEP->getOperand(i_nocapture: i)->getType())) |
1053 | continue; |
1054 | |
1055 | const SCEV *OperandVal = SE->getSCEV(V: GEP->getOperand(i_nocapture: i)); |
1056 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: OperandVal); |
1057 | if (!AR) |
1058 | continue; |
1059 | |
1060 | // If we find the inner induction after an outer induction e.g. |
1061 | // for(int i=0;i<N;i++) |
1062 | // for(int j=0;j<N;j++) |
1063 | // A[i][j] = A[i-1][j-1]+k; |
1064 | // then it is a good order. |
1065 | if (AR->getLoop() == InnerLoop) { |
1066 | // We found an InnerLoop induction after OuterLoop induction. It is |
1067 | // a good order. |
1068 | FoundInnerInduction = true; |
1069 | if (FoundOuterInduction) { |
1070 | GoodOrder++; |
1071 | break; |
1072 | } |
1073 | } |
1074 | // If we find the outer induction after an inner induction e.g. |
1075 | // for(int i=0;i<N;i++) |
1076 | // for(int j=0;j<N;j++) |
1077 | // A[j][i] = A[j-1][i-1]+k; |
1078 | // then it is a bad order. |
1079 | if (AR->getLoop() == OuterLoop) { |
1080 | // We found an OuterLoop induction after InnerLoop induction. It is |
1081 | // a bad order. |
1082 | FoundOuterInduction = true; |
1083 | if (FoundInnerInduction) { |
1084 | BadOrder++; |
1085 | break; |
1086 | } |
1087 | } |
1088 | } |
1089 | } |
1090 | } |
1091 | } |
1092 | return GoodOrder - BadOrder; |
1093 | } |
1094 | |
1095 | std::optional<bool> |
1096 | LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis( |
1097 | const DenseMap<const Loop *, unsigned> &CostMap, |
1098 | std::unique_ptr<CacheCost> &CC) { |
1099 | // This is the new cost model returned from loop cache analysis. |
1100 | // A smaller index means the loop should be placed an outer loop, and vice |
1101 | // versa. |
1102 | if (CostMap.contains(Val: InnerLoop) && CostMap.contains(Val: OuterLoop)) { |
1103 | unsigned InnerIndex = 0, OuterIndex = 0; |
1104 | InnerIndex = CostMap.find(Val: InnerLoop)->second; |
1105 | OuterIndex = CostMap.find(Val: OuterLoop)->second; |
1106 | LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex |
1107 | << ", OuterIndex = " << OuterIndex << "\n" ); |
1108 | if (InnerIndex < OuterIndex) |
1109 | return std::optional<bool>(true); |
1110 | assert(InnerIndex != OuterIndex && "CostMap should assign unique " |
1111 | "numbers to each loop" ); |
1112 | if (CC->getLoopCost(L: *OuterLoop) == CC->getLoopCost(L: *InnerLoop)) |
1113 | return std::nullopt; |
1114 | return std::optional<bool>(false); |
1115 | } |
1116 | return std::nullopt; |
1117 | } |
1118 | |
1119 | std::optional<bool> |
1120 | LoopInterchangeProfitability::isProfitablePerInstrOrderCost() { |
1121 | // Legacy cost model: this is rough cost estimation algorithm. It counts the |
1122 | // good and bad order of induction variables in the instruction and allows |
1123 | // reordering if number of bad orders is more than good. |
1124 | int Cost = getInstrOrderCost(); |
1125 | LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n" ); |
1126 | if (Cost < 0 && Cost < LoopInterchangeCostThreshold) |
1127 | return std::optional<bool>(true); |
1128 | |
1129 | return std::nullopt; |
1130 | } |
1131 | |
1132 | std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization( |
1133 | unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { |
1134 | for (auto &Row : DepMatrix) { |
1135 | // If the inner loop is loop independent or doesn't carry any dependency |
1136 | // it is not profitable to move this to outer position, since we are |
1137 | // likely able to do inner loop vectorization already. |
1138 | if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=') |
1139 | return std::optional<bool>(false); |
1140 | |
1141 | // If the outer loop is not loop independent it is not profitable to move |
1142 | // this to inner position, since doing so would not enable inner loop |
1143 | // parallelism. |
1144 | if (Row[OuterLoopId] != 'I' && Row[OuterLoopId] != '=') |
1145 | return std::optional<bool>(false); |
1146 | } |
1147 | // If inner loop has dependence and outer loop is loop independent then it |
1148 | // is/ profitable to interchange to enable inner loop parallelism. |
1149 | // If there are no dependences, interchanging will not improve anything. |
1150 | return std::optional<bool>(!DepMatrix.empty()); |
1151 | } |
1152 | |
1153 | bool LoopInterchangeProfitability::isProfitable( |
1154 | const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, |
1155 | unsigned OuterLoopId, CharMatrix &DepMatrix, |
1156 | const DenseMap<const Loop *, unsigned> &CostMap, |
1157 | std::unique_ptr<CacheCost> &CC) { |
1158 | // isProfitable() is structured to avoid endless loop interchange. |
1159 | // If loop cache analysis could decide the profitability then, |
1160 | // profitability check will stop and return the analysis result. |
1161 | // If cache analysis failed to analyze the loopnest (e.g., |
1162 | // due to delinearization issues) then only check whether it is |
1163 | // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to |
1164 | // analysis the profitability then only, isProfitableForVectorization |
1165 | // will decide. |
1166 | std::optional<bool> shouldInterchange = |
1167 | isProfitablePerLoopCacheAnalysis(CostMap, CC); |
1168 | if (!shouldInterchange.has_value()) { |
1169 | shouldInterchange = isProfitablePerInstrOrderCost(); |
1170 | if (!shouldInterchange.has_value()) |
1171 | shouldInterchange = |
1172 | isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix); |
1173 | } |
1174 | if (!shouldInterchange.has_value()) { |
1175 | ORE->emit(RemarkBuilder: [&]() { |
1176 | return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable" , |
1177 | InnerLoop->getStartLoc(), |
1178 | InnerLoop->getHeader()) |
1179 | << "Insufficient information to calculate the cost of loop for " |
1180 | "interchange." ; |
1181 | }); |
1182 | return false; |
1183 | } else if (!shouldInterchange.value()) { |
1184 | ORE->emit(RemarkBuilder: [&]() { |
1185 | return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable" , |
1186 | InnerLoop->getStartLoc(), |
1187 | InnerLoop->getHeader()) |
1188 | << "Interchanging loops is not considered to improve cache " |
1189 | "locality nor vectorization." ; |
1190 | }); |
1191 | return false; |
1192 | } |
1193 | return true; |
1194 | } |
1195 | |
1196 | void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, |
1197 | Loop *InnerLoop) { |
1198 | for (Loop *L : *OuterLoop) |
1199 | if (L == InnerLoop) { |
1200 | OuterLoop->removeChildLoop(Child: L); |
1201 | return; |
1202 | } |
1203 | llvm_unreachable("Couldn't find loop" ); |
1204 | } |
1205 | |
1206 | /// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the |
1207 | /// new inner and outer loop after interchanging: NewInner is the original |
1208 | /// outer loop and NewOuter is the original inner loop. |
1209 | /// |
1210 | /// Before interchanging, we have the following structure |
1211 | /// Outer preheader |
1212 | // Outer header |
1213 | // Inner preheader |
1214 | // Inner header |
1215 | // Inner body |
1216 | // Inner latch |
1217 | // outer bbs |
1218 | // Outer latch |
1219 | // |
1220 | // After interchanging: |
1221 | // Inner preheader |
1222 | // Inner header |
1223 | // Outer preheader |
1224 | // Outer header |
1225 | // Inner body |
1226 | // outer bbs |
1227 | // Outer latch |
1228 | // Inner latch |
1229 | void LoopInterchangeTransform::restructureLoops( |
1230 | Loop *NewInner, Loop *NewOuter, BasicBlock *, |
1231 | BasicBlock *) { |
1232 | Loop *OuterLoopParent = OuterLoop->getParentLoop(); |
1233 | // The original inner loop preheader moves from the new inner loop to |
1234 | // the parent loop, if there is one. |
1235 | NewInner->removeBlockFromLoop(BB: OrigInnerPreHeader); |
1236 | LI->changeLoopFor(BB: OrigInnerPreHeader, L: OuterLoopParent); |
1237 | |
1238 | // Switch the loop levels. |
1239 | if (OuterLoopParent) { |
1240 | // Remove the loop from its parent loop. |
1241 | removeChildLoop(OuterLoop: OuterLoopParent, InnerLoop: NewInner); |
1242 | removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter); |
1243 | OuterLoopParent->addChildLoop(NewChild: NewOuter); |
1244 | } else { |
1245 | removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter); |
1246 | LI->changeTopLevelLoop(OldLoop: NewInner, NewLoop: NewOuter); |
1247 | } |
1248 | while (!NewOuter->isInnermost()) |
1249 | NewInner->addChildLoop(NewChild: NewOuter->removeChildLoop(I: NewOuter->begin())); |
1250 | NewOuter->addChildLoop(NewChild: NewInner); |
1251 | |
1252 | // BBs from the original inner loop. |
1253 | SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks()); |
1254 | |
1255 | // Add BBs from the original outer loop to the original inner loop (excluding |
1256 | // BBs already in inner loop) |
1257 | for (BasicBlock *BB : NewInner->blocks()) |
1258 | if (LI->getLoopFor(BB) == NewInner) |
1259 | NewOuter->addBlockEntry(BB); |
1260 | |
1261 | // Now remove inner loop header and latch from the new inner loop and move |
1262 | // other BBs (the loop body) to the new inner loop. |
1263 | BasicBlock * = NewOuter->getHeader(); |
1264 | BasicBlock *OuterLatch = NewOuter->getLoopLatch(); |
1265 | for (BasicBlock *BB : OrigInnerBBs) { |
1266 | // Nothing will change for BBs in child loops. |
1267 | if (LI->getLoopFor(BB) != NewOuter) |
1268 | continue; |
1269 | // Remove the new outer loop header and latch from the new inner loop. |
1270 | if (BB == OuterHeader || BB == OuterLatch) |
1271 | NewInner->removeBlockFromLoop(BB); |
1272 | else |
1273 | LI->changeLoopFor(BB, L: NewInner); |
1274 | } |
1275 | |
1276 | // The preheader of the original outer loop becomes part of the new |
1277 | // outer loop. |
1278 | NewOuter->addBlockEntry(BB: OrigOuterPreHeader); |
1279 | LI->changeLoopFor(BB: OrigOuterPreHeader, L: NewOuter); |
1280 | |
1281 | // Tell SE that we move the loops around. |
1282 | SE->forgetLoop(L: NewOuter); |
1283 | } |
1284 | |
1285 | bool LoopInterchangeTransform::transform() { |
1286 | bool Transformed = false; |
1287 | |
1288 | if (InnerLoop->getSubLoops().empty()) { |
1289 | BasicBlock * = InnerLoop->getLoopPreheader(); |
1290 | LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n" ); |
1291 | auto &InductionPHIs = LIL.getInnerLoopInductions(); |
1292 | if (InductionPHIs.empty()) { |
1293 | LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n" ); |
1294 | return false; |
1295 | } |
1296 | |
1297 | SmallVector<Instruction *, 8> InnerIndexVarList; |
1298 | for (PHINode *CurInductionPHI : InductionPHIs) { |
1299 | if (CurInductionPHI->getIncomingBlock(i: 0) == InnerLoopPreHeader) |
1300 | InnerIndexVarList.push_back( |
1301 | Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: 1))); |
1302 | else |
1303 | InnerIndexVarList.push_back( |
1304 | Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: 0))); |
1305 | } |
1306 | |
1307 | // Create a new latch block for the inner loop. We split at the |
1308 | // current latch's terminator and then move the condition and all |
1309 | // operands that are not either loop-invariant or the induction PHI into the |
1310 | // new latch block. |
1311 | BasicBlock *NewLatch = |
1312 | SplitBlock(Old: InnerLoop->getLoopLatch(), |
1313 | SplitPt: InnerLoop->getLoopLatch()->getTerminator(), DT, LI); |
1314 | |
1315 | SmallSetVector<Instruction *, 4> WorkList; |
1316 | unsigned i = 0; |
1317 | auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() { |
1318 | for (; i < WorkList.size(); i++) { |
1319 | // Duplicate instruction and move it the new latch. Update uses that |
1320 | // have been moved. |
1321 | Instruction *NewI = WorkList[i]->clone(); |
1322 | NewI->insertBefore(InsertPos: NewLatch->getFirstNonPHI()); |
1323 | assert(!NewI->mayHaveSideEffects() && |
1324 | "Moving instructions with side-effects may change behavior of " |
1325 | "the loop nest!" ); |
1326 | for (Use &U : llvm::make_early_inc_range(Range: WorkList[i]->uses())) { |
1327 | Instruction *UserI = cast<Instruction>(Val: U.getUser()); |
1328 | if (!InnerLoop->contains(BB: UserI->getParent()) || |
1329 | UserI->getParent() == NewLatch || |
1330 | llvm::is_contained(Range: InductionPHIs, Element: UserI)) |
1331 | U.set(NewI); |
1332 | } |
1333 | // Add operands of moved instruction to the worklist, except if they are |
1334 | // outside the inner loop or are the induction PHI. |
1335 | for (Value *Op : WorkList[i]->operands()) { |
1336 | Instruction *OpI = dyn_cast<Instruction>(Val: Op); |
1337 | if (!OpI || |
1338 | this->LI->getLoopFor(BB: OpI->getParent()) != this->InnerLoop || |
1339 | llvm::is_contained(Range: InductionPHIs, Element: OpI)) |
1340 | continue; |
1341 | WorkList.insert(X: OpI); |
1342 | } |
1343 | } |
1344 | }; |
1345 | |
1346 | // FIXME: Should we interchange when we have a constant condition? |
1347 | Instruction *CondI = dyn_cast<Instruction>( |
1348 | Val: cast<BranchInst>(Val: InnerLoop->getLoopLatch()->getTerminator()) |
1349 | ->getCondition()); |
1350 | if (CondI) |
1351 | WorkList.insert(X: CondI); |
1352 | MoveInstructions(); |
1353 | for (Instruction *InnerIndexVar : InnerIndexVarList) |
1354 | WorkList.insert(X: cast<Instruction>(Val: InnerIndexVar)); |
1355 | MoveInstructions(); |
1356 | } |
1357 | |
1358 | // Ensure the inner loop phi nodes have a separate basic block. |
1359 | BasicBlock * = InnerLoop->getHeader(); |
1360 | if (InnerLoopHeader->getFirstNonPHI() != InnerLoopHeader->getTerminator()) { |
1361 | SplitBlock(Old: InnerLoopHeader, SplitPt: InnerLoopHeader->getFirstNonPHI(), DT, LI); |
1362 | LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n" ); |
1363 | } |
1364 | |
1365 | // Instructions in the original inner loop preheader may depend on values |
1366 | // defined in the outer loop header. Move them there, because the original |
1367 | // inner loop preheader will become the entry into the interchanged loop nest. |
1368 | // Currently we move all instructions and rely on LICM to move invariant |
1369 | // instructions outside the loop nest. |
1370 | BasicBlock * = InnerLoop->getLoopPreheader(); |
1371 | BasicBlock * = OuterLoop->getHeader(); |
1372 | if (InnerLoopPreHeader != OuterLoopHeader) { |
1373 | SmallPtrSet<Instruction *, 4> NeedsMoving; |
1374 | for (Instruction &I : |
1375 | make_early_inc_range(Range: make_range(x: InnerLoopPreHeader->begin(), |
1376 | y: std::prev(x: InnerLoopPreHeader->end())))) |
1377 | I.moveBeforePreserving(MovePos: OuterLoopHeader->getTerminator()); |
1378 | } |
1379 | |
1380 | Transformed |= adjustLoopLinks(); |
1381 | if (!Transformed) { |
1382 | LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n" ); |
1383 | return false; |
1384 | } |
1385 | |
1386 | return true; |
1387 | } |
1388 | |
1389 | /// \brief Move all instructions except the terminator from FromBB right before |
1390 | /// InsertBefore |
1391 | static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { |
1392 | BasicBlock *ToBB = InsertBefore->getParent(); |
1393 | |
1394 | ToBB->splice(ToIt: InsertBefore->getIterator(), FromBB, FromBeginIt: FromBB->begin(), |
1395 | FromEndIt: FromBB->getTerminator()->getIterator()); |
1396 | } |
1397 | |
1398 | /// Swap instructions between \p BB1 and \p BB2 but keep terminators intact. |
1399 | static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) { |
1400 | // Save all non-terminator instructions of BB1 into TempInstrs and unlink them |
1401 | // from BB1 afterwards. |
1402 | auto Iter = map_range(C&: *BB1, F: [](Instruction &I) { return &I; }); |
1403 | SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(x: Iter.end())); |
1404 | for (Instruction *I : TempInstrs) |
1405 | I->removeFromParent(); |
1406 | |
1407 | // Move instructions from BB2 to BB1. |
1408 | moveBBContents(FromBB: BB2, InsertBefore: BB1->getTerminator()); |
1409 | |
1410 | // Move instructions from TempInstrs to BB2. |
1411 | for (Instruction *I : TempInstrs) |
1412 | I->insertBefore(InsertPos: BB2->getTerminator()); |
1413 | } |
1414 | |
1415 | // Update BI to jump to NewBB instead of OldBB. Records updates to the |
1416 | // dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that |
1417 | // \p OldBB is exactly once in BI's successor list. |
1418 | static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, |
1419 | BasicBlock *NewBB, |
1420 | std::vector<DominatorTree::UpdateType> &DTUpdates, |
1421 | bool MustUpdateOnce = true) { |
1422 | assert((!MustUpdateOnce || |
1423 | llvm::count_if(successors(BI), |
1424 | [OldBB](BasicBlock *BB) { |
1425 | return BB == OldBB; |
1426 | }) == 1) && "BI must jump to OldBB exactly once." ); |
1427 | bool Changed = false; |
1428 | for (Use &Op : BI->operands()) |
1429 | if (Op == OldBB) { |
1430 | Op.set(NewBB); |
1431 | Changed = true; |
1432 | } |
1433 | |
1434 | if (Changed) { |
1435 | DTUpdates.push_back( |
1436 | x: {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); |
1437 | DTUpdates.push_back( |
1438 | x: {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); |
1439 | } |
1440 | assert(Changed && "Expected a successor to be updated" ); |
1441 | } |
1442 | |
1443 | // Move Lcssa PHIs to the right place. |
1444 | static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *, |
1445 | BasicBlock *InnerLatch, BasicBlock *, |
1446 | BasicBlock *OuterLatch, BasicBlock *OuterExit, |
1447 | Loop *InnerLoop, LoopInfo *LI) { |
1448 | |
1449 | // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are |
1450 | // defined either in the header or latch. Those blocks will become header and |
1451 | // latch of the new outer loop, and the only possible users can PHI nodes |
1452 | // in the exit block of the loop nest or the outer loop header (reduction |
1453 | // PHIs, in that case, the incoming value must be defined in the inner loop |
1454 | // header). We can just substitute the user with the incoming value and remove |
1455 | // the PHI. |
1456 | for (PHINode &P : make_early_inc_range(Range: InnerExit->phis())) { |
1457 | assert(P.getNumIncomingValues() == 1 && |
1458 | "Only loops with a single exit are supported!" ); |
1459 | |
1460 | // Incoming values are guaranteed be instructions currently. |
1461 | auto IncI = cast<Instruction>(Val: P.getIncomingValueForBlock(BB: InnerLatch)); |
1462 | // In case of multi-level nested loops, follow LCSSA to find the incoming |
1463 | // value defined from the innermost loop. |
1464 | auto IncIInnerMost = cast<Instruction>(Val: followLCSSA(SV: IncI)); |
1465 | // Skip phis with incoming values from the inner loop body, excluding the |
1466 | // header and latch. |
1467 | if (IncIInnerMost->getParent() != InnerLatch && |
1468 | IncIInnerMost->getParent() != InnerHeader) |
1469 | continue; |
1470 | |
1471 | assert(all_of(P.users(), |
1472 | [OuterHeader, OuterExit, IncI, InnerHeader](User *U) { |
1473 | return (cast<PHINode>(U)->getParent() == OuterHeader && |
1474 | IncI->getParent() == InnerHeader) || |
1475 | cast<PHINode>(U)->getParent() == OuterExit; |
1476 | }) && |
1477 | "Can only replace phis iff the uses are in the loop nest exit or " |
1478 | "the incoming value is defined in the inner header (it will " |
1479 | "dominate all loop blocks after interchanging)" ); |
1480 | P.replaceAllUsesWith(V: IncI); |
1481 | P.eraseFromParent(); |
1482 | } |
1483 | |
1484 | SmallVector<PHINode *, 8> LcssaInnerExit; |
1485 | for (PHINode &P : InnerExit->phis()) |
1486 | LcssaInnerExit.push_back(Elt: &P); |
1487 | |
1488 | SmallVector<PHINode *, 8> LcssaInnerLatch; |
1489 | for (PHINode &P : InnerLatch->phis()) |
1490 | LcssaInnerLatch.push_back(Elt: &P); |
1491 | |
1492 | // Lcssa PHIs for values used outside the inner loop are in InnerExit. |
1493 | // If a PHI node has users outside of InnerExit, it has a use outside the |
1494 | // interchanged loop and we have to preserve it. We move these to |
1495 | // InnerLatch, which will become the new exit block for the innermost |
1496 | // loop after interchanging. |
1497 | for (PHINode *P : LcssaInnerExit) |
1498 | P->moveBefore(MovePos: InnerLatch->getFirstNonPHI()); |
1499 | |
1500 | // If the inner loop latch contains LCSSA PHIs, those come from a child loop |
1501 | // and we have to move them to the new inner latch. |
1502 | for (PHINode *P : LcssaInnerLatch) |
1503 | P->moveBefore(MovePos: InnerExit->getFirstNonPHI()); |
1504 | |
1505 | // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have |
1506 | // incoming values defined in the outer loop, we have to add a new PHI |
1507 | // in the inner loop latch, which became the exit block of the outer loop, |
1508 | // after interchanging. |
1509 | if (OuterExit) { |
1510 | for (PHINode &P : OuterExit->phis()) { |
1511 | if (P.getNumIncomingValues() != 1) |
1512 | continue; |
1513 | // Skip Phis with incoming values defined in the inner loop. Those should |
1514 | // already have been updated. |
1515 | auto I = dyn_cast<Instruction>(Val: P.getIncomingValue(i: 0)); |
1516 | if (!I || LI->getLoopFor(BB: I->getParent()) == InnerLoop) |
1517 | continue; |
1518 | |
1519 | PHINode *NewPhi = dyn_cast<PHINode>(Val: P.clone()); |
1520 | NewPhi->setIncomingValue(i: 0, V: P.getIncomingValue(i: 0)); |
1521 | NewPhi->setIncomingBlock(i: 0, BB: OuterLatch); |
1522 | // We might have incoming edges from other BBs, i.e., the original outer |
1523 | // header. |
1524 | for (auto *Pred : predecessors(BB: InnerLatch)) { |
1525 | if (Pred == OuterLatch) |
1526 | continue; |
1527 | NewPhi->addIncoming(V: P.getIncomingValue(i: 0), BB: Pred); |
1528 | } |
1529 | NewPhi->insertBefore(InsertPos: InnerLatch->getFirstNonPHI()); |
1530 | P.setIncomingValue(i: 0, V: NewPhi); |
1531 | } |
1532 | } |
1533 | |
1534 | // Now adjust the incoming blocks for the LCSSA PHIs. |
1535 | // For PHIs moved from Inner's exit block, we need to replace Inner's latch |
1536 | // with the new latch. |
1537 | InnerLatch->replacePhiUsesWith(Old: InnerLatch, New: OuterLatch); |
1538 | } |
1539 | |
1540 | bool LoopInterchangeTransform::adjustLoopBranches() { |
1541 | LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n" ); |
1542 | std::vector<DominatorTree::UpdateType> DTUpdates; |
1543 | |
1544 | BasicBlock * = OuterLoop->getLoopPreheader(); |
1545 | BasicBlock * = InnerLoop->getLoopPreheader(); |
1546 | |
1547 | assert(OuterLoopPreHeader != OuterLoop->getHeader() && |
1548 | InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader && |
1549 | InnerLoopPreHeader && "Guaranteed by loop-simplify form" ); |
1550 | // Ensure that both preheaders do not contain PHI nodes and have single |
1551 | // predecessors. This allows us to move them easily. We use |
1552 | // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing |
1553 | // preheaders do not satisfy those conditions. |
1554 | if (isa<PHINode>(Val: OuterLoopPreHeader->begin()) || |
1555 | !OuterLoopPreHeader->getUniquePredecessor()) |
1556 | OuterLoopPreHeader = |
1557 | InsertPreheaderForLoop(L: OuterLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true); |
1558 | if (InnerLoopPreHeader == OuterLoop->getHeader()) |
1559 | InnerLoopPreHeader = |
1560 | InsertPreheaderForLoop(L: InnerLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true); |
1561 | |
1562 | // Adjust the loop preheader |
1563 | BasicBlock * = InnerLoop->getHeader(); |
1564 | BasicBlock * = OuterLoop->getHeader(); |
1565 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); |
1566 | BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); |
1567 | BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor(); |
1568 | BasicBlock *InnerLoopLatchPredecessor = |
1569 | InnerLoopLatch->getUniquePredecessor(); |
1570 | BasicBlock *InnerLoopLatchSuccessor; |
1571 | BasicBlock *OuterLoopLatchSuccessor; |
1572 | |
1573 | BranchInst *OuterLoopLatchBI = |
1574 | dyn_cast<BranchInst>(Val: OuterLoopLatch->getTerminator()); |
1575 | BranchInst *InnerLoopLatchBI = |
1576 | dyn_cast<BranchInst>(Val: InnerLoopLatch->getTerminator()); |
1577 | BranchInst * = |
1578 | dyn_cast<BranchInst>(Val: OuterLoopHeader->getTerminator()); |
1579 | BranchInst * = |
1580 | dyn_cast<BranchInst>(Val: InnerLoopHeader->getTerminator()); |
1581 | |
1582 | if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor || |
1583 | !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI || |
1584 | !InnerLoopHeaderBI) |
1585 | return false; |
1586 | |
1587 | BranchInst *InnerLoopLatchPredecessorBI = |
1588 | dyn_cast<BranchInst>(Val: InnerLoopLatchPredecessor->getTerminator()); |
1589 | BranchInst *OuterLoopPredecessorBI = |
1590 | dyn_cast<BranchInst>(Val: OuterLoopPredecessor->getTerminator()); |
1591 | |
1592 | if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI) |
1593 | return false; |
1594 | BasicBlock * = InnerLoopHeader->getUniqueSuccessor(); |
1595 | if (!InnerLoopHeaderSuccessor) |
1596 | return false; |
1597 | |
1598 | // Adjust Loop Preheader and headers. |
1599 | // The branches in the outer loop predecessor and the outer loop header can |
1600 | // be unconditional branches or conditional branches with duplicates. Consider |
1601 | // this when updating the successors. |
1602 | updateSuccessor(BI: OuterLoopPredecessorBI, OldBB: OuterLoopPreHeader, |
1603 | NewBB: InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false); |
1604 | // The outer loop header might or might not branch to the outer latch. |
1605 | // We are guaranteed to branch to the inner loop preheader. |
1606 | if (llvm::is_contained(Range: OuterLoopHeaderBI->successors(), Element: OuterLoopLatch)) { |
1607 | // In this case the outerLoopHeader should branch to the InnerLoopLatch. |
1608 | updateSuccessor(BI: OuterLoopHeaderBI, OldBB: OuterLoopLatch, NewBB: InnerLoopLatch, |
1609 | DTUpdates, |
1610 | /*MustUpdateOnce=*/false); |
1611 | } |
1612 | updateSuccessor(BI: OuterLoopHeaderBI, OldBB: InnerLoopPreHeader, |
1613 | NewBB: InnerLoopHeaderSuccessor, DTUpdates, |
1614 | /*MustUpdateOnce=*/false); |
1615 | |
1616 | // Adjust reduction PHI's now that the incoming block has changed. |
1617 | InnerLoopHeaderSuccessor->replacePhiUsesWith(Old: InnerLoopHeader, |
1618 | New: OuterLoopHeader); |
1619 | |
1620 | updateSuccessor(BI: InnerLoopHeaderBI, OldBB: InnerLoopHeaderSuccessor, |
1621 | NewBB: OuterLoopPreHeader, DTUpdates); |
1622 | |
1623 | // -------------Adjust loop latches----------- |
1624 | if (InnerLoopLatchBI->getSuccessor(i: 0) == InnerLoopHeader) |
1625 | InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: 1); |
1626 | else |
1627 | InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: 0); |
1628 | |
1629 | updateSuccessor(BI: InnerLoopLatchPredecessorBI, OldBB: InnerLoopLatch, |
1630 | NewBB: InnerLoopLatchSuccessor, DTUpdates); |
1631 | |
1632 | if (OuterLoopLatchBI->getSuccessor(i: 0) == OuterLoopHeader) |
1633 | OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: 1); |
1634 | else |
1635 | OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: 0); |
1636 | |
1637 | updateSuccessor(BI: InnerLoopLatchBI, OldBB: InnerLoopLatchSuccessor, |
1638 | NewBB: OuterLoopLatchSuccessor, DTUpdates); |
1639 | updateSuccessor(BI: OuterLoopLatchBI, OldBB: OuterLoopLatchSuccessor, NewBB: InnerLoopLatch, |
1640 | DTUpdates); |
1641 | |
1642 | DT->applyUpdates(Updates: DTUpdates); |
1643 | restructureLoops(NewInner: OuterLoop, NewOuter: InnerLoop, OrigInnerPreHeader: InnerLoopPreHeader, |
1644 | OrigOuterPreHeader: OuterLoopPreHeader); |
1645 | |
1646 | moveLCSSAPhis(InnerExit: InnerLoopLatchSuccessor, InnerHeader: InnerLoopHeader, InnerLatch: InnerLoopLatch, |
1647 | OuterHeader: OuterLoopHeader, OuterLatch: OuterLoopLatch, OuterExit: InnerLoop->getExitBlock(), |
1648 | InnerLoop, LI); |
1649 | // For PHIs in the exit block of the outer loop, outer's latch has been |
1650 | // replaced by Inners'. |
1651 | OuterLoopLatchSuccessor->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch); |
1652 | |
1653 | auto &OuterInnerReductions = LIL.getOuterInnerReductions(); |
1654 | // Now update the reduction PHIs in the inner and outer loop headers. |
1655 | SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs; |
1656 | for (PHINode &PHI : InnerLoopHeader->phis()) |
1657 | if (OuterInnerReductions.contains(Ptr: &PHI)) |
1658 | InnerLoopPHIs.push_back(Elt: &PHI); |
1659 | |
1660 | for (PHINode &PHI : OuterLoopHeader->phis()) |
1661 | if (OuterInnerReductions.contains(Ptr: &PHI)) |
1662 | OuterLoopPHIs.push_back(Elt: &PHI); |
1663 | |
1664 | // Now move the remaining reduction PHIs from outer to inner loop header and |
1665 | // vice versa. The PHI nodes must be part of a reduction across the inner and |
1666 | // outer loop and all the remains to do is and updating the incoming blocks. |
1667 | for (PHINode *PHI : OuterLoopPHIs) { |
1668 | LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n" ; PHI->dump();); |
1669 | PHI->moveBefore(MovePos: InnerLoopHeader->getFirstNonPHI()); |
1670 | assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node" ); |
1671 | } |
1672 | for (PHINode *PHI : InnerLoopPHIs) { |
1673 | LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n" ; PHI->dump();); |
1674 | PHI->moveBefore(MovePos: OuterLoopHeader->getFirstNonPHI()); |
1675 | assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node" ); |
1676 | } |
1677 | |
1678 | // Update the incoming blocks for moved PHI nodes. |
1679 | OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopPreHeader, New: OuterLoopPreHeader); |
1680 | OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopLatch, New: OuterLoopLatch); |
1681 | InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopPreHeader, New: InnerLoopPreHeader); |
1682 | InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch); |
1683 | |
1684 | // Values defined in the outer loop header could be used in the inner loop |
1685 | // latch. In that case, we need to create LCSSA phis for them, because after |
1686 | // interchanging they will be defined in the new inner loop and used in the |
1687 | // new outer loop. |
1688 | SmallVector<Instruction *, 4> MayNeedLCSSAPhis; |
1689 | for (Instruction &I : |
1690 | make_range(x: OuterLoopHeader->begin(), y: std::prev(x: OuterLoopHeader->end()))) |
1691 | MayNeedLCSSAPhis.push_back(Elt: &I); |
1692 | formLCSSAForInstructions(Worklist&: MayNeedLCSSAPhis, DT: *DT, LI: *LI, SE); |
1693 | |
1694 | return true; |
1695 | } |
1696 | |
1697 | bool LoopInterchangeTransform::adjustLoopLinks() { |
1698 | // Adjust all branches in the inner and outer loop. |
1699 | bool Changed = adjustLoopBranches(); |
1700 | if (Changed) { |
1701 | // We have interchanged the preheaders so we need to interchange the data in |
1702 | // the preheaders as well. This is because the content of the inner |
1703 | // preheader was previously executed inside the outer loop. |
1704 | BasicBlock * = OuterLoop->getLoopPreheader(); |
1705 | BasicBlock * = InnerLoop->getLoopPreheader(); |
1706 | swapBBContents(BB1: OuterLoopPreHeader, BB2: InnerLoopPreHeader); |
1707 | } |
1708 | return Changed; |
1709 | } |
1710 | |
1711 | PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, |
1712 | LoopAnalysisManager &AM, |
1713 | LoopStandardAnalysisResults &AR, |
1714 | LPMUpdater &U) { |
1715 | Function &F = *LN.getParent(); |
1716 | |
1717 | DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); |
1718 | std::unique_ptr<CacheCost> CC = |
1719 | CacheCost::getCacheCost(Root&: LN.getOutermostLoop(), AR, DI); |
1720 | OptimizationRemarkEmitter ORE(&F); |
1721 | if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN)) |
1722 | return PreservedAnalyses::all(); |
1723 | U.markLoopNestChanged(Changed: true); |
1724 | return getLoopPassPreservedAnalyses(); |
1725 | } |
1726 | |