1 | //===- LoopInterchange.cpp - Loop interchange pass-------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This Pass handles loop interchange transform. |
10 | // This pass interchanges loops to provide a more cache-friendly memory access |
11 | // patterns. |
12 | // |
13 | //===----------------------------------------------------------------------===// |
14 | |
15 | #include "llvm/Transforms/Scalar/LoopInterchange.h" |
16 | #include "llvm/ADT/STLExtras.h" |
17 | #include "llvm/ADT/SmallSet.h" |
18 | #include "llvm/ADT/SmallVector.h" |
19 | #include "llvm/ADT/Statistic.h" |
20 | #include "llvm/ADT/StringRef.h" |
21 | #include "llvm/ADT/StringSet.h" |
22 | #include "llvm/Analysis/DependenceAnalysis.h" |
23 | #include "llvm/Analysis/LoopCacheAnalysis.h" |
24 | #include "llvm/Analysis/LoopInfo.h" |
25 | #include "llvm/Analysis/LoopNestAnalysis.h" |
26 | #include "llvm/Analysis/LoopPass.h" |
27 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
28 | #include "llvm/Analysis/ScalarEvolution.h" |
29 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
30 | #include "llvm/IR/BasicBlock.h" |
31 | #include "llvm/IR/DiagnosticInfo.h" |
32 | #include "llvm/IR/Dominators.h" |
33 | #include "llvm/IR/Function.h" |
34 | #include "llvm/IR/InstrTypes.h" |
35 | #include "llvm/IR/Instruction.h" |
36 | #include "llvm/IR/Instructions.h" |
37 | #include "llvm/IR/User.h" |
38 | #include "llvm/IR/Value.h" |
39 | #include "llvm/Support/Casting.h" |
40 | #include "llvm/Support/CommandLine.h" |
41 | #include "llvm/Support/Debug.h" |
42 | #include "llvm/Support/ErrorHandling.h" |
43 | #include "llvm/Support/raw_ostream.h" |
44 | #include "llvm/Transforms/Scalar/LoopPassManager.h" |
45 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
46 | #include "llvm/Transforms/Utils/LoopUtils.h" |
47 | #include <cassert> |
48 | #include <utility> |
49 | #include <vector> |
50 | |
51 | using namespace llvm; |
52 | |
53 | #define DEBUG_TYPE "loop-interchange" |
54 | |
55 | STATISTIC(LoopsInterchanged, "Number of loops interchanged" ); |
56 | |
57 | static cl::opt<int> LoopInterchangeCostThreshold( |
58 | "loop-interchange-threshold" , cl::init(Val: 0), cl::Hidden, |
59 | cl::desc("Interchange if you gain more than this number" )); |
60 | |
61 | // Maximum number of load-stores that can be handled in the dependency matrix. |
62 | static cl::opt<unsigned int> MaxMemInstrCount( |
63 | "loop-interchange-max-meminstr-count" , cl::init(Val: 64), cl::Hidden, |
64 | cl::desc( |
65 | "Maximum number of load-store instructions that should be handled " |
66 | "in the dependency matrix. Higher value may lead to more interchanges " |
67 | "at the cost of compile-time" )); |
68 | |
69 | namespace { |
70 | |
71 | using LoopVector = SmallVector<Loop *, 8>; |
72 | |
73 | // TODO: Check if we can use a sparse matrix here. |
74 | using CharMatrix = std::vector<std::vector<char>>; |
75 | |
76 | /// Types of rules used in profitability check. |
77 | enum class RuleTy { |
78 | PerLoopCacheAnalysis, |
79 | PerInstrOrderCost, |
80 | ForVectorization, |
81 | }; |
82 | |
83 | } // end anonymous namespace |
84 | |
85 | // Minimum loop depth supported. |
86 | static cl::opt<unsigned int> MinLoopNestDepth( |
87 | "loop-interchange-min-loop-nest-depth" , cl::init(Val: 2), cl::Hidden, |
88 | cl::desc("Minimum depth of loop nest considered for the transform" )); |
89 | |
90 | // Maximum loop depth supported. |
91 | static cl::opt<unsigned int> MaxLoopNestDepth( |
92 | "loop-interchange-max-loop-nest-depth" , cl::init(Val: 10), cl::Hidden, |
93 | cl::desc("Maximum depth of loop nest considered for the transform" )); |
94 | |
95 | // We prefer cache cost to vectorization by default. |
96 | static cl::list<RuleTy> Profitabilities( |
97 | "loop-interchange-profitabilities" , cl::ZeroOrMore, |
98 | cl::MiscFlags::CommaSeparated, cl::Hidden, |
99 | cl::desc("List of profitability heuristics to be used. They are applied in " |
100 | "the given order" ), |
101 | cl::list_init<RuleTy>(Vals: {RuleTy::PerLoopCacheAnalysis, |
102 | RuleTy::PerInstrOrderCost, |
103 | RuleTy::ForVectorization}), |
104 | cl::values(clEnumValN(RuleTy::PerLoopCacheAnalysis, "cache" , |
105 | "Prioritize loop cache cost" ), |
106 | clEnumValN(RuleTy::PerInstrOrderCost, "instorder" , |
107 | "Prioritize the IVs order of each instruction" ), |
108 | clEnumValN(RuleTy::ForVectorization, "vectorize" , |
109 | "Prioritize vectorization" ))); |
110 | |
111 | #ifndef NDEBUG |
112 | static bool noDuplicateRules(ArrayRef<RuleTy> Rules) { |
113 | SmallSet<RuleTy, 4> Set; |
114 | for (RuleTy Rule : Rules) |
115 | if (!Set.insert(Rule).second) |
116 | return false; |
117 | return true; |
118 | } |
119 | |
120 | static void printDepMatrix(CharMatrix &DepMatrix) { |
121 | for (auto &Row : DepMatrix) { |
122 | for (auto D : Row) |
123 | LLVM_DEBUG(dbgs() << D << " " ); |
124 | LLVM_DEBUG(dbgs() << "\n" ); |
125 | } |
126 | } |
127 | #endif |
128 | |
129 | static bool (CharMatrix &DepMatrix, unsigned Level, |
130 | Loop *L, DependenceInfo *DI, |
131 | ScalarEvolution *SE, |
132 | OptimizationRemarkEmitter *ORE) { |
133 | using ValueVector = SmallVector<Value *, 16>; |
134 | |
135 | ValueVector MemInstr; |
136 | |
137 | // For each block. |
138 | for (BasicBlock *BB : L->blocks()) { |
139 | // Scan the BB and collect legal loads and stores. |
140 | for (Instruction &I : *BB) { |
141 | if (!isa<Instruction>(Val: I)) |
142 | return false; |
143 | if (auto *Ld = dyn_cast<LoadInst>(Val: &I)) { |
144 | if (!Ld->isSimple()) |
145 | return false; |
146 | MemInstr.push_back(Elt: &I); |
147 | } else if (auto *St = dyn_cast<StoreInst>(Val: &I)) { |
148 | if (!St->isSimple()) |
149 | return false; |
150 | MemInstr.push_back(Elt: &I); |
151 | } |
152 | } |
153 | } |
154 | |
155 | LLVM_DEBUG(dbgs() << "Found " << MemInstr.size() |
156 | << " Loads and Stores to analyze\n" ); |
157 | if (MemInstr.size() > MaxMemInstrCount) { |
158 | LLVM_DEBUG(dbgs() << "The transform doesn't support more than " |
159 | << MaxMemInstrCount << " load/stores in a loop\n" ); |
160 | ORE->emit(RemarkBuilder: [&]() { |
161 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop" , |
162 | L->getStartLoc(), L->getHeader()) |
163 | << "Number of loads/stores exceeded, the supported maximum " |
164 | "can be increased with option " |
165 | "-loop-interchange-maxmeminstr-count." ; |
166 | }); |
167 | return false; |
168 | } |
169 | ValueVector::iterator I, IE, J, JE; |
170 | StringSet<> Seen; |
171 | |
172 | for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) { |
173 | for (J = I, JE = MemInstr.end(); J != JE; ++J) { |
174 | std::vector<char> Dep; |
175 | Instruction *Src = cast<Instruction>(Val: *I); |
176 | Instruction *Dst = cast<Instruction>(Val: *J); |
177 | // Ignore Input dependencies. |
178 | if (isa<LoadInst>(Val: Src) && isa<LoadInst>(Val: Dst)) |
179 | continue; |
180 | // Track Output, Flow, and Anti dependencies. |
181 | if (auto D = DI->depends(Src, Dst)) { |
182 | assert(D->isOrdered() && "Expected an output, flow or anti dep." ); |
183 | // If the direction vector is negative, normalize it to |
184 | // make it non-negative. |
185 | if (D->normalize(SE)) |
186 | LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n" ); |
187 | LLVM_DEBUG(StringRef DepType = |
188 | D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output" ; |
189 | dbgs() << "Found " << DepType |
190 | << " dependency between Src and Dst\n" |
191 | << " Src:" << *Src << "\n Dst:" << *Dst << '\n'); |
192 | unsigned Levels = D->getLevels(); |
193 | char Direction; |
194 | for (unsigned II = 1; II <= Levels; ++II) { |
195 | // `DVEntry::LE` is converted to `*`. This is because `LE` means `<` |
196 | // or `=`, for which we don't have an equivalent representation, so |
197 | // that the conservative approximation is necessary. The same goes for |
198 | // `DVEntry::GE`. |
199 | // TODO: Use of fine-grained expressions allows for more accurate |
200 | // analysis. |
201 | unsigned Dir = D->getDirection(Level: II); |
202 | if (Dir == Dependence::DVEntry::LT) |
203 | Direction = '<'; |
204 | else if (Dir == Dependence::DVEntry::GT) |
205 | Direction = '>'; |
206 | else if (Dir == Dependence::DVEntry::EQ) |
207 | Direction = '='; |
208 | else |
209 | Direction = '*'; |
210 | Dep.push_back(x: Direction); |
211 | } |
212 | |
213 | // If the Dependence object doesn't have any information, fill the |
214 | // dependency vector with '*'. |
215 | if (D->isConfused()) { |
216 | assert(Dep.empty() && "Expected empty dependency vector" ); |
217 | Dep.assign(n: Level, val: '*'); |
218 | } |
219 | |
220 | while (Dep.size() != Level) { |
221 | Dep.push_back(x: 'I'); |
222 | } |
223 | |
224 | // Make sure we only add unique entries to the dependency matrix. |
225 | if (Seen.insert(key: StringRef(Dep.data(), Dep.size())).second) |
226 | DepMatrix.push_back(x: Dep); |
227 | } |
228 | } |
229 | } |
230 | |
231 | return true; |
232 | } |
233 | |
234 | // A loop is moved from index 'from' to an index 'to'. Update the Dependence |
235 | // matrix by exchanging the two columns. |
236 | static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, |
237 | unsigned ToIndx) { |
238 | for (auto &Row : DepMatrix) |
239 | std::swap(a&: Row[ToIndx], b&: Row[FromIndx]); |
240 | } |
241 | |
242 | // Check if a direction vector is lexicographically positive. Return true if it |
243 | // is positive, nullopt if it is "zero", otherwise false. |
244 | // [Theorem] A permutation of the loops in a perfect nest is legal if and only |
245 | // if the direction matrix, after the same permutation is applied to its |
246 | // columns, has no ">" direction as the leftmost non-"=" direction in any row. |
247 | static std::optional<bool> |
248 | isLexicographicallyPositive(ArrayRef<char> DV, unsigned Begin, unsigned End) { |
249 | for (unsigned char Direction : DV.slice(N: Begin, M: End - Begin)) { |
250 | if (Direction == '<') |
251 | return true; |
252 | if (Direction == '>' || Direction == '*') |
253 | return false; |
254 | } |
255 | return std::nullopt; |
256 | } |
257 | |
258 | // Checks if it is legal to interchange 2 loops. |
259 | static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, |
260 | unsigned InnerLoopId, |
261 | unsigned OuterLoopId) { |
262 | unsigned NumRows = DepMatrix.size(); |
263 | std::vector<char> Cur; |
264 | // For each row check if it is valid to interchange. |
265 | for (unsigned Row = 0; Row < NumRows; ++Row) { |
266 | // Create temporary DepVector check its lexicographical order |
267 | // before and after swapping OuterLoop vs InnerLoop |
268 | Cur = DepMatrix[Row]; |
269 | |
270 | // If the surrounding loops already ensure that the direction vector is |
271 | // lexicographically positive, nothing within the loop will be able to break |
272 | // the dependence. In such a case we can skip the subsequent check. |
273 | if (isLexicographicallyPositive(DV: Cur, Begin: 0, End: OuterLoopId) == true) |
274 | continue; |
275 | |
276 | // Check if the direction vector is lexicographically positive (or zero) |
277 | // for both before/after exchanged. |
278 | if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size()) == false) |
279 | return false; |
280 | std::swap(a&: Cur[InnerLoopId], b&: Cur[OuterLoopId]); |
281 | if (isLexicographicallyPositive(DV: Cur, Begin: OuterLoopId, End: Cur.size()) == false) |
282 | return false; |
283 | } |
284 | return true; |
285 | } |
286 | |
287 | static void populateWorklist(Loop &L, LoopVector &LoopList) { |
288 | LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: " |
289 | << L.getHeader()->getParent()->getName() << " Loop: %" |
290 | << L.getHeader()->getName() << '\n'); |
291 | assert(LoopList.empty() && "LoopList should initially be empty!" ); |
292 | Loop *CurrentLoop = &L; |
293 | const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops(); |
294 | while (!Vec->empty()) { |
295 | // The current loop has multiple subloops in it hence it is not tightly |
296 | // nested. |
297 | // Discard all loops above it added into Worklist. |
298 | if (Vec->size() != 1) { |
299 | LoopList = {}; |
300 | return; |
301 | } |
302 | |
303 | LoopList.push_back(Elt: CurrentLoop); |
304 | CurrentLoop = Vec->front(); |
305 | Vec = &CurrentLoop->getSubLoops(); |
306 | } |
307 | LoopList.push_back(Elt: CurrentLoop); |
308 | } |
309 | |
310 | static bool (ArrayRef<Loop *> LoopList, |
311 | OptimizationRemarkEmitter &ORE) { |
312 | unsigned LoopNestDepth = LoopList.size(); |
313 | if (LoopNestDepth < MinLoopNestDepth || LoopNestDepth > MaxLoopNestDepth) { |
314 | LLVM_DEBUG(dbgs() << "Unsupported depth of loop nest " << LoopNestDepth |
315 | << ", the supported range is [" << MinLoopNestDepth |
316 | << ", " << MaxLoopNestDepth << "].\n" ); |
317 | Loop *OuterLoop = LoopList.front(); |
318 | ORE.emit(RemarkBuilder: [&]() { |
319 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoopNestDepth" , |
320 | OuterLoop->getStartLoc(), |
321 | OuterLoop->getHeader()) |
322 | << "Unsupported depth of loop nest, the supported range is [" |
323 | << std::to_string(val: MinLoopNestDepth) << ", " |
324 | << std::to_string(val: MaxLoopNestDepth) << "].\n" ; |
325 | }); |
326 | return false; |
327 | } |
328 | return true; |
329 | } |
330 | |
331 | static bool isComputableLoopNest(ScalarEvolution *SE, |
332 | ArrayRef<Loop *> LoopList) { |
333 | for (Loop *L : LoopList) { |
334 | const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L); |
335 | if (isa<SCEVCouldNotCompute>(Val: ExitCountOuter)) { |
336 | LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n" ); |
337 | return false; |
338 | } |
339 | if (L->getNumBackEdges() != 1) { |
340 | LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n" ); |
341 | return false; |
342 | } |
343 | if (!L->getExitingBlock()) { |
344 | LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n" ); |
345 | return false; |
346 | } |
347 | } |
348 | return true; |
349 | } |
350 | |
351 | namespace { |
352 | |
353 | /// LoopInterchangeLegality checks if it is legal to interchange the loop. |
354 | class LoopInterchangeLegality { |
355 | public: |
356 | (Loop *Outer, Loop *Inner, ScalarEvolution *SE, |
357 | OptimizationRemarkEmitter *ORE) |
358 | : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} |
359 | |
360 | /// Check if the loops can be interchanged. |
361 | bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, |
362 | CharMatrix &DepMatrix); |
363 | |
364 | /// Discover induction PHIs in the header of \p L. Induction |
365 | /// PHIs are added to \p Inductions. |
366 | bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions); |
367 | |
368 | /// Check if the loop structure is understood. We do not handle triangular |
369 | /// loops for now. |
370 | bool isLoopStructureUnderstood(); |
371 | |
372 | bool currentLimitations(); |
373 | |
374 | const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const { |
375 | return OuterInnerReductions; |
376 | } |
377 | |
378 | const ArrayRef<PHINode *> getInnerLoopInductions() const { |
379 | return InnerLoopInductions; |
380 | } |
381 | |
382 | private: |
383 | bool tightlyNested(Loop *Outer, Loop *Inner); |
384 | bool containsUnsafeInstructions(BasicBlock *BB); |
385 | |
386 | /// Discover induction and reduction PHIs in the header of \p L. Induction |
387 | /// PHIs are added to \p Inductions, reductions are added to |
388 | /// OuterInnerReductions. When the outer loop is passed, the inner loop needs |
389 | /// to be passed as \p InnerLoop. |
390 | bool findInductionAndReductions(Loop *L, |
391 | SmallVector<PHINode *, 8> &Inductions, |
392 | Loop *InnerLoop); |
393 | |
394 | Loop *OuterLoop; |
395 | Loop *InnerLoop; |
396 | |
397 | ScalarEvolution *SE; |
398 | |
399 | /// Interface to emit optimization remarks. |
400 | OptimizationRemarkEmitter *ORE; |
401 | |
402 | /// Set of reduction PHIs taking part of a reduction across the inner and |
403 | /// outer loop. |
404 | SmallPtrSet<PHINode *, 4> OuterInnerReductions; |
405 | |
406 | /// Set of inner loop induction PHIs |
407 | SmallVector<PHINode *, 8> InnerLoopInductions; |
408 | }; |
409 | |
410 | /// LoopInterchangeProfitability checks if it is profitable to interchange the |
411 | /// loop. |
412 | class LoopInterchangeProfitability { |
413 | public: |
414 | (Loop *Outer, Loop *Inner, ScalarEvolution *SE, |
415 | OptimizationRemarkEmitter *ORE) |
416 | : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} |
417 | |
418 | /// Check if the loop interchange is profitable. |
419 | bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop, |
420 | unsigned InnerLoopId, unsigned OuterLoopId, |
421 | CharMatrix &DepMatrix, |
422 | const DenseMap<const Loop *, unsigned> &CostMap, |
423 | std::unique_ptr<CacheCost> &CC); |
424 | |
425 | private: |
426 | int getInstrOrderCost(); |
427 | std::optional<bool> isProfitablePerLoopCacheAnalysis( |
428 | const DenseMap<const Loop *, unsigned> &CostMap, |
429 | std::unique_ptr<CacheCost> &CC); |
430 | std::optional<bool> isProfitablePerInstrOrderCost(); |
431 | std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId, |
432 | unsigned OuterLoopId, |
433 | CharMatrix &DepMatrix); |
434 | Loop *OuterLoop; |
435 | Loop *InnerLoop; |
436 | |
437 | /// Scev analysis. |
438 | ScalarEvolution *SE; |
439 | |
440 | /// Interface to emit optimization remarks. |
441 | OptimizationRemarkEmitter *ORE; |
442 | }; |
443 | |
444 | /// LoopInterchangeTransform interchanges the loop. |
445 | class LoopInterchangeTransform { |
446 | public: |
447 | LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE, |
448 | LoopInfo *LI, DominatorTree *DT, |
449 | const LoopInterchangeLegality &LIL) |
450 | : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {} |
451 | |
452 | /// Interchange OuterLoop and InnerLoop. |
453 | bool transform(); |
454 | void restructureLoops(Loop *NewInner, Loop *NewOuter, |
455 | BasicBlock *, |
456 | BasicBlock *); |
457 | void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop); |
458 | |
459 | private: |
460 | bool adjustLoopLinks(); |
461 | bool adjustLoopBranches(); |
462 | |
463 | Loop *OuterLoop; |
464 | Loop *InnerLoop; |
465 | |
466 | /// Scev analysis. |
467 | ScalarEvolution *SE; |
468 | |
469 | LoopInfo *LI; |
470 | DominatorTree *DT; |
471 | |
472 | const LoopInterchangeLegality &LIL; |
473 | }; |
474 | |
475 | struct LoopInterchange { |
476 | ScalarEvolution *SE = nullptr; |
477 | LoopInfo *LI = nullptr; |
478 | DependenceInfo *DI = nullptr; |
479 | DominatorTree *DT = nullptr; |
480 | std::unique_ptr<CacheCost> CC = nullptr; |
481 | |
482 | /// Interface to emit optimization remarks. |
483 | OptimizationRemarkEmitter *ORE; |
484 | |
485 | LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI, |
486 | DominatorTree *DT, std::unique_ptr<CacheCost> &CC, |
487 | OptimizationRemarkEmitter *ORE) |
488 | : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {} |
489 | |
490 | bool run(Loop *L) { |
491 | if (L->getParentLoop()) |
492 | return false; |
493 | SmallVector<Loop *, 8> LoopList; |
494 | populateWorklist(L&: *L, LoopList); |
495 | return processLoopList(LoopList); |
496 | } |
497 | |
498 | bool run(LoopNest &LN) { |
499 | SmallVector<Loop *, 8> LoopList(LN.getLoops()); |
500 | for (unsigned I = 1; I < LoopList.size(); ++I) |
501 | if (LoopList[I]->getParentLoop() != LoopList[I - 1]) |
502 | return false; |
503 | return processLoopList(LoopList); |
504 | } |
505 | |
506 | unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) { |
507 | // TODO: Add a better heuristic to select the loop to be interchanged based |
508 | // on the dependence matrix. Currently we select the innermost loop. |
509 | return LoopList.size() - 1; |
510 | } |
511 | |
512 | bool processLoopList(SmallVectorImpl<Loop *> &LoopList) { |
513 | bool Changed = false; |
514 | |
515 | // Ensure proper loop nest depth. |
516 | assert(hasSupportedLoopDepth(LoopList, *ORE) && |
517 | "Unsupported depth of loop nest." ); |
518 | |
519 | unsigned LoopNestDepth = LoopList.size(); |
520 | |
521 | LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth |
522 | << "\n" ); |
523 | |
524 | CharMatrix DependencyMatrix; |
525 | Loop *OuterMostLoop = *(LoopList.begin()); |
526 | if (!populateDependencyMatrix(DepMatrix&: DependencyMatrix, Level: LoopNestDepth, |
527 | L: OuterMostLoop, DI, SE, ORE)) { |
528 | LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n" ); |
529 | return false; |
530 | } |
531 | |
532 | LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n" ; |
533 | printDepMatrix(DependencyMatrix)); |
534 | |
535 | // Get the Outermost loop exit. |
536 | BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock(); |
537 | if (!LoopNestExit) { |
538 | LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block" ); |
539 | return false; |
540 | } |
541 | |
542 | unsigned SelecLoopId = selectLoopForInterchange(LoopList); |
543 | // Obtain the loop vector returned from loop cache analysis beforehand, |
544 | // and put each <Loop, index> pair into a map for constant time query |
545 | // later. Indices in loop vector reprsent the optimal order of the |
546 | // corresponding loop, e.g., given a loopnest with depth N, index 0 |
547 | // indicates the loop should be placed as the outermost loop and index N |
548 | // indicates the loop should be placed as the innermost loop. |
549 | // |
550 | // For the old pass manager CacheCost would be null. |
551 | DenseMap<const Loop *, unsigned> CostMap; |
552 | if (CC != nullptr) { |
553 | for (const auto &[Idx, Cost] : enumerate(First: CC->getLoopCosts())) |
554 | CostMap[Cost.first] = Idx; |
555 | } |
556 | // We try to achieve the globally optimal memory access for the loopnest, |
557 | // and do interchange based on a bubble-sort fasion. We start from |
558 | // the innermost loop, move it outwards to the best possible position |
559 | // and repeat this process. |
560 | for (unsigned j = SelecLoopId; j > 0; j--) { |
561 | bool ChangedPerIter = false; |
562 | for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) { |
563 | bool Interchanged = |
564 | processLoop(LoopList, InnerLoopId: i, OuterLoopId: i - 1, DependencyMatrix, CostMap); |
565 | ChangedPerIter |= Interchanged; |
566 | Changed |= Interchanged; |
567 | } |
568 | // Early abort if there was no interchange during an entire round of |
569 | // moving loops outwards. |
570 | if (!ChangedPerIter) |
571 | break; |
572 | } |
573 | return Changed; |
574 | } |
575 | |
576 | bool processLoop(SmallVectorImpl<Loop *> &LoopList, unsigned InnerLoopId, |
577 | unsigned OuterLoopId, |
578 | std::vector<std::vector<char>> &DependencyMatrix, |
579 | const DenseMap<const Loop *, unsigned> &CostMap) { |
580 | Loop *OuterLoop = LoopList[OuterLoopId]; |
581 | Loop *InnerLoop = LoopList[InnerLoopId]; |
582 | LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId |
583 | << " and OuterLoopId = " << OuterLoopId << "\n" ); |
584 | LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE); |
585 | if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DepMatrix&: DependencyMatrix)) { |
586 | LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n" ); |
587 | return false; |
588 | } |
589 | LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n" ); |
590 | LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); |
591 | if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId, |
592 | DepMatrix&: DependencyMatrix, CostMap, CC)) { |
593 | LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n" ); |
594 | return false; |
595 | } |
596 | |
597 | ORE->emit(RemarkBuilder: [&]() { |
598 | return OptimizationRemark(DEBUG_TYPE, "Interchanged" , |
599 | InnerLoop->getStartLoc(), |
600 | InnerLoop->getHeader()) |
601 | << "Loop interchanged with enclosing loop." ; |
602 | }); |
603 | |
604 | LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL); |
605 | LIT.transform(); |
606 | LLVM_DEBUG(dbgs() << "Loops interchanged.\n" ); |
607 | LoopsInterchanged++; |
608 | |
609 | llvm::formLCSSARecursively(L&: *OuterLoop, DT: *DT, LI, SE); |
610 | |
611 | // Loops interchanged, update LoopList accordingly. |
612 | std::swap(a&: LoopList[OuterLoopId], b&: LoopList[InnerLoopId]); |
613 | // Update the DependencyMatrix |
614 | interChangeDependencies(DepMatrix&: DependencyMatrix, FromIndx: InnerLoopId, ToIndx: OuterLoopId); |
615 | |
616 | LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n" ; |
617 | printDepMatrix(DependencyMatrix)); |
618 | |
619 | return true; |
620 | } |
621 | }; |
622 | |
623 | } // end anonymous namespace |
624 | |
625 | bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) { |
626 | return any_of(Range&: *BB, P: [](const Instruction &I) { |
627 | return I.mayHaveSideEffects() || I.mayReadFromMemory(); |
628 | }); |
629 | } |
630 | |
631 | bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) { |
632 | BasicBlock * = OuterLoop->getHeader(); |
633 | BasicBlock * = InnerLoop->getLoopPreheader(); |
634 | BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); |
635 | |
636 | LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n" ); |
637 | |
638 | // A perfectly nested loop will not have any branch in between the outer and |
639 | // inner block i.e. outer header will branch to either inner preheader and |
640 | // outerloop latch. |
641 | BranchInst * = |
642 | dyn_cast<BranchInst>(Val: OuterLoopHeader->getTerminator()); |
643 | if (!OuterLoopHeaderBI) |
644 | return false; |
645 | |
646 | for (BasicBlock *Succ : successors(I: OuterLoopHeaderBI)) |
647 | if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() && |
648 | Succ != OuterLoopLatch) |
649 | return false; |
650 | |
651 | LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n" ); |
652 | // We do not have any basic block in between now make sure the outer header |
653 | // and outer loop latch doesn't contain any unsafe instructions. |
654 | if (containsUnsafeInstructions(BB: OuterLoopHeader) || |
655 | containsUnsafeInstructions(BB: OuterLoopLatch)) |
656 | return false; |
657 | |
658 | // Also make sure the inner loop preheader does not contain any unsafe |
659 | // instructions. Note that all instructions in the preheader will be moved to |
660 | // the outer loop header when interchanging. |
661 | if (InnerLoopPreHeader != OuterLoopHeader && |
662 | containsUnsafeInstructions(BB: InnerLoopPreHeader)) |
663 | return false; |
664 | |
665 | BasicBlock *InnerLoopExit = InnerLoop->getExitBlock(); |
666 | // Ensure the inner loop exit block flows to the outer loop latch possibly |
667 | // through empty blocks. |
668 | const BasicBlock &SuccInner = |
669 | LoopNest::skipEmptyBlockUntil(From: InnerLoopExit, End: OuterLoopLatch); |
670 | if (&SuccInner != OuterLoopLatch) { |
671 | LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit |
672 | << " does not lead to the outer loop latch.\n" ;); |
673 | return false; |
674 | } |
675 | // The inner loop exit block does flow to the outer loop latch and not some |
676 | // other BBs, now make sure it contains safe instructions, since it will be |
677 | // moved into the (new) inner loop after interchange. |
678 | if (containsUnsafeInstructions(BB: InnerLoopExit)) |
679 | return false; |
680 | |
681 | LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n" ); |
682 | // We have a perfect loop nest. |
683 | return true; |
684 | } |
685 | |
686 | bool LoopInterchangeLegality::isLoopStructureUnderstood() { |
687 | BasicBlock * = InnerLoop->getLoopPreheader(); |
688 | for (PHINode *InnerInduction : InnerLoopInductions) { |
689 | unsigned Num = InnerInduction->getNumOperands(); |
690 | for (unsigned i = 0; i < Num; ++i) { |
691 | Value *Val = InnerInduction->getOperand(i_nocapture: i); |
692 | if (isa<Constant>(Val)) |
693 | continue; |
694 | Instruction *I = dyn_cast<Instruction>(Val); |
695 | if (!I) |
696 | return false; |
697 | // TODO: Handle triangular loops. |
698 | // e.g. for(int i=0;i<N;i++) |
699 | // for(int j=i;j<N;j++) |
700 | unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i); |
701 | if (InnerInduction->getIncomingBlock(i: IncomBlockIndx) == |
702 | InnerLoopPreheader && |
703 | !OuterLoop->isLoopInvariant(V: I)) { |
704 | return false; |
705 | } |
706 | } |
707 | } |
708 | |
709 | // TODO: Handle triangular loops of another form. |
710 | // e.g. for(int i=0;i<N;i++) |
711 | // for(int j=0;j<i;j++) |
712 | // or, |
713 | // for(int i=0;i<N;i++) |
714 | // for(int j=0;j*i<N;j++) |
715 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); |
716 | BranchInst *InnerLoopLatchBI = |
717 | dyn_cast<BranchInst>(Val: InnerLoopLatch->getTerminator()); |
718 | if (!InnerLoopLatchBI->isConditional()) |
719 | return false; |
720 | if (CmpInst *InnerLoopCmp = |
721 | dyn_cast<CmpInst>(Val: InnerLoopLatchBI->getCondition())) { |
722 | Value *Op0 = InnerLoopCmp->getOperand(i_nocapture: 0); |
723 | Value *Op1 = InnerLoopCmp->getOperand(i_nocapture: 1); |
724 | |
725 | // LHS and RHS of the inner loop exit condition, e.g., |
726 | // in "for(int j=0;j<i;j++)", LHS is j and RHS is i. |
727 | Value *Left = nullptr; |
728 | Value *Right = nullptr; |
729 | |
730 | // Check if V only involves inner loop induction variable. |
731 | // Return true if V is InnerInduction, or a cast from |
732 | // InnerInduction, or a binary operator that involves |
733 | // InnerInduction and a constant. |
734 | std::function<bool(Value *)> IsPathToInnerIndVar; |
735 | IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool { |
736 | if (llvm::is_contained(Range&: InnerLoopInductions, Element: V)) |
737 | return true; |
738 | if (isa<Constant>(Val: V)) |
739 | return true; |
740 | const Instruction *I = dyn_cast<Instruction>(Val: V); |
741 | if (!I) |
742 | return false; |
743 | if (isa<CastInst>(Val: I)) |
744 | return IsPathToInnerIndVar(I->getOperand(i: 0)); |
745 | if (isa<BinaryOperator>(Val: I)) |
746 | return IsPathToInnerIndVar(I->getOperand(i: 0)) && |
747 | IsPathToInnerIndVar(I->getOperand(i: 1)); |
748 | return false; |
749 | }; |
750 | |
751 | // In case of multiple inner loop indvars, it is okay if LHS and RHS |
752 | // are both inner indvar related variables. |
753 | if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1)) |
754 | return true; |
755 | |
756 | // Otherwise we check if the cmp instruction compares an inner indvar |
757 | // related variable (Left) with a outer loop invariant (Right). |
758 | if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Val: Op0)) { |
759 | Left = Op0; |
760 | Right = Op1; |
761 | } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Val: Op1)) { |
762 | Left = Op1; |
763 | Right = Op0; |
764 | } |
765 | |
766 | if (Left == nullptr) |
767 | return false; |
768 | |
769 | const SCEV *S = SE->getSCEV(V: Right); |
770 | if (!SE->isLoopInvariant(S, L: OuterLoop)) |
771 | return false; |
772 | } |
773 | |
774 | return true; |
775 | } |
776 | |
777 | // If SV is a LCSSA PHI node with a single incoming value, return the incoming |
778 | // value. |
779 | static Value *followLCSSA(Value *SV) { |
780 | PHINode *PHI = dyn_cast<PHINode>(Val: SV); |
781 | if (!PHI) |
782 | return SV; |
783 | |
784 | if (PHI->getNumIncomingValues() != 1) |
785 | return SV; |
786 | return followLCSSA(SV: PHI->getIncomingValue(i: 0)); |
787 | } |
788 | |
789 | // Check V's users to see if it is involved in a reduction in L. |
790 | static PHINode *findInnerReductionPhi(Loop *L, Value *V) { |
791 | // Reduction variables cannot be constants. |
792 | if (isa<Constant>(Val: V)) |
793 | return nullptr; |
794 | |
795 | for (Value *User : V->users()) { |
796 | if (PHINode *PHI = dyn_cast<PHINode>(Val: User)) { |
797 | if (PHI->getNumIncomingValues() == 1) |
798 | continue; |
799 | RecurrenceDescriptor RD; |
800 | if (RecurrenceDescriptor::isReductionPHI(Phi: PHI, TheLoop: L, RedDes&: RD)) { |
801 | // Detect floating point reduction only when it can be reordered. |
802 | if (RD.getExactFPMathInst() != nullptr) |
803 | return nullptr; |
804 | return PHI; |
805 | } |
806 | return nullptr; |
807 | } |
808 | } |
809 | |
810 | return nullptr; |
811 | } |
812 | |
813 | bool LoopInterchangeLegality::findInductionAndReductions( |
814 | Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) { |
815 | if (!L->getLoopLatch() || !L->getLoopPredecessor()) |
816 | return false; |
817 | for (PHINode &PHI : L->getHeader()->phis()) { |
818 | InductionDescriptor ID; |
819 | if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID)) |
820 | Inductions.push_back(Elt: &PHI); |
821 | else { |
822 | // PHIs in inner loops need to be part of a reduction in the outer loop, |
823 | // discovered when checking the PHIs of the outer loop earlier. |
824 | if (!InnerLoop) { |
825 | if (!OuterInnerReductions.count(Ptr: &PHI)) { |
826 | LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions " |
827 | "across the outer loop.\n" ); |
828 | return false; |
829 | } |
830 | } else { |
831 | assert(PHI.getNumIncomingValues() == 2 && |
832 | "Phis in loop header should have exactly 2 incoming values" ); |
833 | // Check if we have a PHI node in the outer loop that has a reduction |
834 | // result from the inner loop as an incoming value. |
835 | Value *V = followLCSSA(SV: PHI.getIncomingValueForBlock(BB: L->getLoopLatch())); |
836 | PHINode *InnerRedPhi = findInnerReductionPhi(L: InnerLoop, V); |
837 | if (!InnerRedPhi || |
838 | !llvm::is_contained(Range: InnerRedPhi->incoming_values(), Element: &PHI)) { |
839 | LLVM_DEBUG( |
840 | dbgs() |
841 | << "Failed to recognize PHI as an induction or reduction.\n" ); |
842 | return false; |
843 | } |
844 | OuterInnerReductions.insert(Ptr: &PHI); |
845 | OuterInnerReductions.insert(Ptr: InnerRedPhi); |
846 | } |
847 | } |
848 | } |
849 | return true; |
850 | } |
851 | |
852 | // This function indicates the current limitations in the transform as a result |
853 | // of which we do not proceed. |
854 | bool LoopInterchangeLegality::currentLimitations() { |
855 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); |
856 | |
857 | // transform currently expects the loop latches to also be the exiting |
858 | // blocks. |
859 | if (InnerLoop->getExitingBlock() != InnerLoopLatch || |
860 | OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() || |
861 | !isa<BranchInst>(Val: InnerLoopLatch->getTerminator()) || |
862 | !isa<BranchInst>(Val: OuterLoop->getLoopLatch()->getTerminator())) { |
863 | LLVM_DEBUG( |
864 | dbgs() << "Loops where the latch is not the exiting block are not" |
865 | << " supported currently.\n" ); |
866 | ORE->emit(RemarkBuilder: [&]() { |
867 | return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch" , |
868 | OuterLoop->getStartLoc(), |
869 | OuterLoop->getHeader()) |
870 | << "Loops where the latch is not the exiting block cannot be" |
871 | " interchange currently." ; |
872 | }); |
873 | return true; |
874 | } |
875 | |
876 | SmallVector<PHINode *, 8> Inductions; |
877 | if (!findInductionAndReductions(L: OuterLoop, Inductions, InnerLoop)) { |
878 | LLVM_DEBUG( |
879 | dbgs() << "Only outer loops with induction or reduction PHI nodes " |
880 | << "are supported currently.\n" ); |
881 | ORE->emit(RemarkBuilder: [&]() { |
882 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter" , |
883 | OuterLoop->getStartLoc(), |
884 | OuterLoop->getHeader()) |
885 | << "Only outer loops with induction or reduction PHI nodes can be" |
886 | " interchanged currently." ; |
887 | }); |
888 | return true; |
889 | } |
890 | |
891 | Inductions.clear(); |
892 | // For multi-level loop nests, make sure that all phi nodes for inner loops |
893 | // at all levels can be recognized as a induction or reduction phi. Bail out |
894 | // if a phi node at a certain nesting level cannot be properly recognized. |
895 | Loop *CurLevelLoop = OuterLoop; |
896 | while (!CurLevelLoop->getSubLoops().empty()) { |
897 | // We already made sure that the loop nest is tightly nested. |
898 | CurLevelLoop = CurLevelLoop->getSubLoops().front(); |
899 | if (!findInductionAndReductions(L: CurLevelLoop, Inductions, InnerLoop: nullptr)) { |
900 | LLVM_DEBUG( |
901 | dbgs() << "Only inner loops with induction or reduction PHI nodes " |
902 | << "are supported currently.\n" ); |
903 | ORE->emit(RemarkBuilder: [&]() { |
904 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner" , |
905 | CurLevelLoop->getStartLoc(), |
906 | CurLevelLoop->getHeader()) |
907 | << "Only inner loops with induction or reduction PHI nodes can be" |
908 | " interchange currently." ; |
909 | }); |
910 | return true; |
911 | } |
912 | } |
913 | |
914 | // TODO: Triangular loops are not handled for now. |
915 | if (!isLoopStructureUnderstood()) { |
916 | LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n" ); |
917 | ORE->emit(RemarkBuilder: [&]() { |
918 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner" , |
919 | InnerLoop->getStartLoc(), |
920 | InnerLoop->getHeader()) |
921 | << "Inner loop structure not understood currently." ; |
922 | }); |
923 | return true; |
924 | } |
925 | |
926 | return false; |
927 | } |
928 | |
929 | bool LoopInterchangeLegality::findInductions( |
930 | Loop *L, SmallVectorImpl<PHINode *> &Inductions) { |
931 | for (PHINode &PHI : L->getHeader()->phis()) { |
932 | InductionDescriptor ID; |
933 | if (InductionDescriptor::isInductionPHI(Phi: &PHI, L, SE, D&: ID)) |
934 | Inductions.push_back(Elt: &PHI); |
935 | } |
936 | return !Inductions.empty(); |
937 | } |
938 | |
939 | // We currently only support LCSSA PHI nodes in the inner loop exit, if their |
940 | // users are either reduction PHIs or PHIs outside the outer loop (which means |
941 | // the we are only interested in the final value after the loop). |
942 | static bool |
943 | areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, |
944 | SmallPtrSetImpl<PHINode *> &Reductions) { |
945 | BasicBlock *InnerExit = OuterL->getUniqueExitBlock(); |
946 | for (PHINode &PHI : InnerExit->phis()) { |
947 | // Reduction lcssa phi will have only 1 incoming block that from loop latch. |
948 | if (PHI.getNumIncomingValues() > 1) |
949 | return false; |
950 | if (any_of(Range: PHI.users(), P: [&Reductions, OuterL](User *U) { |
951 | PHINode *PN = dyn_cast<PHINode>(Val: U); |
952 | return !PN || |
953 | (!Reductions.count(Ptr: PN) && OuterL->contains(BB: PN->getParent())); |
954 | })) { |
955 | return false; |
956 | } |
957 | } |
958 | return true; |
959 | } |
960 | |
961 | // We currently support LCSSA PHI nodes in the outer loop exit, if their |
962 | // incoming values do not come from the outer loop latch or if the |
963 | // outer loop latch has a single predecessor. In that case, the value will |
964 | // be available if both the inner and outer loop conditions are true, which |
965 | // will still be true after interchanging. If we have multiple predecessor, |
966 | // that may not be the case, e.g. because the outer loop latch may be executed |
967 | // if the inner loop is not executed. |
968 | static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { |
969 | BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); |
970 | for (PHINode &PHI : LoopNestExit->phis()) { |
971 | for (Value *Incoming : PHI.incoming_values()) { |
972 | Instruction *IncomingI = dyn_cast<Instruction>(Val: Incoming); |
973 | if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch()) |
974 | continue; |
975 | |
976 | // The incoming value is defined in the outer loop latch. Currently we |
977 | // only support that in case the outer loop latch has a single predecessor. |
978 | // This guarantees that the outer loop latch is executed if and only if |
979 | // the inner loop is executed (because tightlyNested() guarantees that the |
980 | // outer loop header only branches to the inner loop or the outer loop |
981 | // latch). |
982 | // FIXME: We could weaken this logic and allow multiple predecessors, |
983 | // if the values are produced outside the loop latch. We would need |
984 | // additional logic to update the PHI nodes in the exit block as |
985 | // well. |
986 | if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr) |
987 | return false; |
988 | } |
989 | } |
990 | return true; |
991 | } |
992 | |
993 | // In case of multi-level nested loops, it may occur that lcssa phis exist in |
994 | // the latch of InnerLoop, i.e., when defs of the incoming values are further |
995 | // inside the loopnest. Sometimes those incoming values are not available |
996 | // after interchange, since the original inner latch will become the new outer |
997 | // latch which may have predecessor paths that do not include those incoming |
998 | // values. |
999 | // TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of |
1000 | // multi-level loop nests. |
1001 | static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { |
1002 | if (InnerLoop->getSubLoops().empty()) |
1003 | return true; |
1004 | // If the original outer latch has only one predecessor, then values defined |
1005 | // further inside the looploop, e.g., in the innermost loop, will be available |
1006 | // at the new outer latch after interchange. |
1007 | if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr) |
1008 | return true; |
1009 | |
1010 | // The outer latch has more than one predecessors, i.e., the inner |
1011 | // exit and the inner header. |
1012 | // PHI nodes in the inner latch are lcssa phis where the incoming values |
1013 | // are defined further inside the loopnest. Check if those phis are used |
1014 | // in the original inner latch. If that is the case then bail out since |
1015 | // those incoming values may not be available at the new outer latch. |
1016 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); |
1017 | for (PHINode &PHI : InnerLoopLatch->phis()) { |
1018 | for (auto *U : PHI.users()) { |
1019 | Instruction *UI = cast<Instruction>(Val: U); |
1020 | if (InnerLoopLatch == UI->getParent()) |
1021 | return false; |
1022 | } |
1023 | } |
1024 | return true; |
1025 | } |
1026 | |
1027 | bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, |
1028 | unsigned OuterLoopId, |
1029 | CharMatrix &DepMatrix) { |
1030 | if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) { |
1031 | LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId |
1032 | << " and OuterLoopId = " << OuterLoopId |
1033 | << " due to dependence\n" ); |
1034 | ORE->emit(RemarkBuilder: [&]() { |
1035 | return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence" , |
1036 | InnerLoop->getStartLoc(), |
1037 | InnerLoop->getHeader()) |
1038 | << "Cannot interchange loops due to dependences." ; |
1039 | }); |
1040 | return false; |
1041 | } |
1042 | // Check if outer and inner loop contain legal instructions only. |
1043 | for (auto *BB : OuterLoop->blocks()) |
1044 | for (Instruction &I : BB->instructionsWithoutDebug()) |
1045 | if (CallInst *CI = dyn_cast<CallInst>(Val: &I)) { |
1046 | // readnone functions do not prevent interchanging. |
1047 | if (CI->onlyWritesMemory()) |
1048 | continue; |
1049 | LLVM_DEBUG( |
1050 | dbgs() << "Loops with call instructions cannot be interchanged " |
1051 | << "safely." ); |
1052 | ORE->emit(RemarkBuilder: [&]() { |
1053 | return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst" , |
1054 | CI->getDebugLoc(), |
1055 | CI->getParent()) |
1056 | << "Cannot interchange loops due to call instruction." ; |
1057 | }); |
1058 | |
1059 | return false; |
1060 | } |
1061 | |
1062 | if (!findInductions(L: InnerLoop, Inductions&: InnerLoopInductions)) { |
1063 | LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n" ); |
1064 | return false; |
1065 | } |
1066 | |
1067 | if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) { |
1068 | LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n" ); |
1069 | ORE->emit(RemarkBuilder: [&]() { |
1070 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI" , |
1071 | InnerLoop->getStartLoc(), |
1072 | InnerLoop->getHeader()) |
1073 | << "Cannot interchange loops because unsupported PHI nodes found " |
1074 | "in inner loop latch." ; |
1075 | }); |
1076 | return false; |
1077 | } |
1078 | |
1079 | // TODO: The loops could not be interchanged due to current limitations in the |
1080 | // transform module. |
1081 | if (currentLimitations()) { |
1082 | LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n" ); |
1083 | return false; |
1084 | } |
1085 | |
1086 | // Check if the loops are tightly nested. |
1087 | if (!tightlyNested(OuterLoop, InnerLoop)) { |
1088 | LLVM_DEBUG(dbgs() << "Loops not tightly nested\n" ); |
1089 | ORE->emit(RemarkBuilder: [&]() { |
1090 | return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested" , |
1091 | InnerLoop->getStartLoc(), |
1092 | InnerLoop->getHeader()) |
1093 | << "Cannot interchange loops because they are not tightly " |
1094 | "nested." ; |
1095 | }); |
1096 | return false; |
1097 | } |
1098 | |
1099 | if (!areInnerLoopExitPHIsSupported(InnerL: OuterLoop, OuterL: InnerLoop, |
1100 | Reductions&: OuterInnerReductions)) { |
1101 | LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n" ); |
1102 | ORE->emit(RemarkBuilder: [&]() { |
1103 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI" , |
1104 | InnerLoop->getStartLoc(), |
1105 | InnerLoop->getHeader()) |
1106 | << "Found unsupported PHI node in loop exit." ; |
1107 | }); |
1108 | return false; |
1109 | } |
1110 | |
1111 | if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) { |
1112 | LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n" ); |
1113 | ORE->emit(RemarkBuilder: [&]() { |
1114 | return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI" , |
1115 | OuterLoop->getStartLoc(), |
1116 | OuterLoop->getHeader()) |
1117 | << "Found unsupported PHI node in loop exit." ; |
1118 | }); |
1119 | return false; |
1120 | } |
1121 | |
1122 | return true; |
1123 | } |
1124 | |
1125 | int LoopInterchangeProfitability::getInstrOrderCost() { |
1126 | unsigned GoodOrder, BadOrder; |
1127 | BadOrder = GoodOrder = 0; |
1128 | for (BasicBlock *BB : InnerLoop->blocks()) { |
1129 | for (Instruction &Ins : *BB) { |
1130 | if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: &Ins)) { |
1131 | bool FoundInnerInduction = false; |
1132 | bool FoundOuterInduction = false; |
1133 | for (Value *Op : GEP->operands()) { |
1134 | // Skip operands that are not SCEV-able. |
1135 | if (!SE->isSCEVable(Ty: Op->getType())) |
1136 | continue; |
1137 | |
1138 | const SCEV *OperandVal = SE->getSCEV(V: Op); |
1139 | const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Val: OperandVal); |
1140 | if (!AR) |
1141 | continue; |
1142 | |
1143 | // If we find the inner induction after an outer induction e.g. |
1144 | // for(int i=0;i<N;i++) |
1145 | // for(int j=0;j<N;j++) |
1146 | // A[i][j] = A[i-1][j-1]+k; |
1147 | // then it is a good order. |
1148 | if (AR->getLoop() == InnerLoop) { |
1149 | // We found an InnerLoop induction after OuterLoop induction. It is |
1150 | // a good order. |
1151 | FoundInnerInduction = true; |
1152 | if (FoundOuterInduction) { |
1153 | GoodOrder++; |
1154 | break; |
1155 | } |
1156 | } |
1157 | // If we find the outer induction after an inner induction e.g. |
1158 | // for(int i=0;i<N;i++) |
1159 | // for(int j=0;j<N;j++) |
1160 | // A[j][i] = A[j-1][i-1]+k; |
1161 | // then it is a bad order. |
1162 | if (AR->getLoop() == OuterLoop) { |
1163 | // We found an OuterLoop induction after InnerLoop induction. It is |
1164 | // a bad order. |
1165 | FoundOuterInduction = true; |
1166 | if (FoundInnerInduction) { |
1167 | BadOrder++; |
1168 | break; |
1169 | } |
1170 | } |
1171 | } |
1172 | } |
1173 | } |
1174 | } |
1175 | return GoodOrder - BadOrder; |
1176 | } |
1177 | |
1178 | std::optional<bool> |
1179 | LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis( |
1180 | const DenseMap<const Loop *, unsigned> &CostMap, |
1181 | std::unique_ptr<CacheCost> &CC) { |
1182 | // This is the new cost model returned from loop cache analysis. |
1183 | // A smaller index means the loop should be placed an outer loop, and vice |
1184 | // versa. |
1185 | auto InnerLoopIt = CostMap.find(Val: InnerLoop); |
1186 | if (InnerLoopIt == CostMap.end()) |
1187 | return std::nullopt; |
1188 | auto OuterLoopIt = CostMap.find(Val: OuterLoop); |
1189 | if (OuterLoopIt == CostMap.end()) |
1190 | return std::nullopt; |
1191 | |
1192 | if (CC->getLoopCost(L: *OuterLoop) == CC->getLoopCost(L: *InnerLoop)) |
1193 | return std::nullopt; |
1194 | unsigned InnerIndex = InnerLoopIt->second; |
1195 | unsigned OuterIndex = OuterLoopIt->second; |
1196 | LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex |
1197 | << ", OuterIndex = " << OuterIndex << "\n" ); |
1198 | assert(InnerIndex != OuterIndex && "CostMap should assign unique " |
1199 | "numbers to each loop" ); |
1200 | return std::optional<bool>(InnerIndex < OuterIndex); |
1201 | } |
1202 | |
1203 | std::optional<bool> |
1204 | LoopInterchangeProfitability::isProfitablePerInstrOrderCost() { |
1205 | // Legacy cost model: this is rough cost estimation algorithm. It counts the |
1206 | // good and bad order of induction variables in the instruction and allows |
1207 | // reordering if number of bad orders is more than good. |
1208 | int Cost = getInstrOrderCost(); |
1209 | LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n" ); |
1210 | if (Cost < 0 && Cost < LoopInterchangeCostThreshold) |
1211 | return std::optional<bool>(true); |
1212 | |
1213 | return std::nullopt; |
1214 | } |
1215 | |
1216 | /// Return true if we can vectorize the loop specified by \p LoopId. |
1217 | static bool canVectorize(const CharMatrix &DepMatrix, unsigned LoopId) { |
1218 | for (const auto &Dep : DepMatrix) { |
1219 | char Dir = Dep[LoopId]; |
1220 | if (Dir != 'I' && Dir != '=') |
1221 | return false; |
1222 | } |
1223 | return true; |
1224 | } |
1225 | |
1226 | std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization( |
1227 | unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) { |
1228 | // If the outer loop is not loop independent it is not profitable to move |
1229 | // this to inner position, since doing so would not enable inner loop |
1230 | // parallelism. |
1231 | if (!canVectorize(DepMatrix, LoopId: OuterLoopId)) |
1232 | return false; |
1233 | |
1234 | // If inner loop has dependence and outer loop is loop independent then it is |
1235 | // profitable to interchange to enable inner loop parallelism. |
1236 | if (!canVectorize(DepMatrix, LoopId: InnerLoopId)) |
1237 | return true; |
1238 | |
1239 | // If both the inner and the outer loop can be vectorized, it is necessary to |
1240 | // check the cost of each vectorized loop for profitability decision. At this |
1241 | // time we do not have a cost model to estimate them, so return nullopt. |
1242 | // TODO: Estimate the cost of vectorized loop when both the outer and the |
1243 | // inner loop can be vectorized. |
1244 | return std::nullopt; |
1245 | } |
1246 | |
1247 | bool LoopInterchangeProfitability::isProfitable( |
1248 | const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, |
1249 | unsigned OuterLoopId, CharMatrix &DepMatrix, |
1250 | const DenseMap<const Loop *, unsigned> &CostMap, |
1251 | std::unique_ptr<CacheCost> &CC) { |
1252 | // isProfitable() is structured to avoid endless loop interchange. If the |
1253 | // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could |
1254 | // decide the profitability then, profitability check will stop and return the |
1255 | // analysis result. If it failed to determine it (e.g., cache analysis failed |
1256 | // to analyze the loopnest due to delinearization issues) then go ahead the |
1257 | // second highest priority rule (isProfitablePerInstrOrderCost by default). |
1258 | // Likewise, if it failed to analysis the profitability then only, the last |
1259 | // rule (isProfitableForVectorization by default) will decide. |
1260 | assert(noDuplicateRules(Profitabilities) && "Detect duplicate rules" ); |
1261 | std::optional<bool> shouldInterchange; |
1262 | for (RuleTy RT : Profitabilities) { |
1263 | switch (RT) { |
1264 | case RuleTy::PerLoopCacheAnalysis: |
1265 | shouldInterchange = isProfitablePerLoopCacheAnalysis(CostMap, CC); |
1266 | break; |
1267 | case RuleTy::PerInstrOrderCost: |
1268 | shouldInterchange = isProfitablePerInstrOrderCost(); |
1269 | break; |
1270 | case RuleTy::ForVectorization: |
1271 | shouldInterchange = |
1272 | isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix); |
1273 | break; |
1274 | } |
1275 | |
1276 | // If this rule could determine the profitability, don't call subsequent |
1277 | // rules. |
1278 | if (shouldInterchange.has_value()) |
1279 | break; |
1280 | } |
1281 | |
1282 | if (!shouldInterchange.has_value()) { |
1283 | ORE->emit(RemarkBuilder: [&]() { |
1284 | return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable" , |
1285 | InnerLoop->getStartLoc(), |
1286 | InnerLoop->getHeader()) |
1287 | << "Insufficient information to calculate the cost of loop for " |
1288 | "interchange." ; |
1289 | }); |
1290 | return false; |
1291 | } else if (!shouldInterchange.value()) { |
1292 | ORE->emit(RemarkBuilder: [&]() { |
1293 | return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable" , |
1294 | InnerLoop->getStartLoc(), |
1295 | InnerLoop->getHeader()) |
1296 | << "Interchanging loops is not considered to improve cache " |
1297 | "locality nor vectorization." ; |
1298 | }); |
1299 | return false; |
1300 | } |
1301 | return true; |
1302 | } |
1303 | |
1304 | void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, |
1305 | Loop *InnerLoop) { |
1306 | for (Loop *L : *OuterLoop) |
1307 | if (L == InnerLoop) { |
1308 | OuterLoop->removeChildLoop(Child: L); |
1309 | return; |
1310 | } |
1311 | llvm_unreachable("Couldn't find loop" ); |
1312 | } |
1313 | |
1314 | /// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the |
1315 | /// new inner and outer loop after interchanging: NewInner is the original |
1316 | /// outer loop and NewOuter is the original inner loop. |
1317 | /// |
1318 | /// Before interchanging, we have the following structure |
1319 | /// Outer preheader |
1320 | // Outer header |
1321 | // Inner preheader |
1322 | // Inner header |
1323 | // Inner body |
1324 | // Inner latch |
1325 | // outer bbs |
1326 | // Outer latch |
1327 | // |
1328 | // After interchanging: |
1329 | // Inner preheader |
1330 | // Inner header |
1331 | // Outer preheader |
1332 | // Outer header |
1333 | // Inner body |
1334 | // outer bbs |
1335 | // Outer latch |
1336 | // Inner latch |
1337 | void LoopInterchangeTransform::restructureLoops( |
1338 | Loop *NewInner, Loop *NewOuter, BasicBlock *, |
1339 | BasicBlock *) { |
1340 | Loop *OuterLoopParent = OuterLoop->getParentLoop(); |
1341 | // The original inner loop preheader moves from the new inner loop to |
1342 | // the parent loop, if there is one. |
1343 | NewInner->removeBlockFromLoop(BB: OrigInnerPreHeader); |
1344 | LI->changeLoopFor(BB: OrigInnerPreHeader, L: OuterLoopParent); |
1345 | |
1346 | // Switch the loop levels. |
1347 | if (OuterLoopParent) { |
1348 | // Remove the loop from its parent loop. |
1349 | removeChildLoop(OuterLoop: OuterLoopParent, InnerLoop: NewInner); |
1350 | removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter); |
1351 | OuterLoopParent->addChildLoop(NewChild: NewOuter); |
1352 | } else { |
1353 | removeChildLoop(OuterLoop: NewInner, InnerLoop: NewOuter); |
1354 | LI->changeTopLevelLoop(OldLoop: NewInner, NewLoop: NewOuter); |
1355 | } |
1356 | while (!NewOuter->isInnermost()) |
1357 | NewInner->addChildLoop(NewChild: NewOuter->removeChildLoop(I: NewOuter->begin())); |
1358 | NewOuter->addChildLoop(NewChild: NewInner); |
1359 | |
1360 | // BBs from the original inner loop. |
1361 | SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks()); |
1362 | |
1363 | // Add BBs from the original outer loop to the original inner loop (excluding |
1364 | // BBs already in inner loop) |
1365 | for (BasicBlock *BB : NewInner->blocks()) |
1366 | if (LI->getLoopFor(BB) == NewInner) |
1367 | NewOuter->addBlockEntry(BB); |
1368 | |
1369 | // Now remove inner loop header and latch from the new inner loop and move |
1370 | // other BBs (the loop body) to the new inner loop. |
1371 | BasicBlock * = NewOuter->getHeader(); |
1372 | BasicBlock *OuterLatch = NewOuter->getLoopLatch(); |
1373 | for (BasicBlock *BB : OrigInnerBBs) { |
1374 | // Nothing will change for BBs in child loops. |
1375 | if (LI->getLoopFor(BB) != NewOuter) |
1376 | continue; |
1377 | // Remove the new outer loop header and latch from the new inner loop. |
1378 | if (BB == OuterHeader || BB == OuterLatch) |
1379 | NewInner->removeBlockFromLoop(BB); |
1380 | else |
1381 | LI->changeLoopFor(BB, L: NewInner); |
1382 | } |
1383 | |
1384 | // The preheader of the original outer loop becomes part of the new |
1385 | // outer loop. |
1386 | NewOuter->addBlockEntry(BB: OrigOuterPreHeader); |
1387 | LI->changeLoopFor(BB: OrigOuterPreHeader, L: NewOuter); |
1388 | |
1389 | // Tell SE that we move the loops around. |
1390 | SE->forgetLoop(L: NewOuter); |
1391 | } |
1392 | |
1393 | bool LoopInterchangeTransform::transform() { |
1394 | bool Transformed = false; |
1395 | |
1396 | if (InnerLoop->getSubLoops().empty()) { |
1397 | BasicBlock * = InnerLoop->getLoopPreheader(); |
1398 | LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n" ); |
1399 | auto &InductionPHIs = LIL.getInnerLoopInductions(); |
1400 | if (InductionPHIs.empty()) { |
1401 | LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n" ); |
1402 | return false; |
1403 | } |
1404 | |
1405 | SmallVector<Instruction *, 8> InnerIndexVarList; |
1406 | for (PHINode *CurInductionPHI : InductionPHIs) { |
1407 | if (CurInductionPHI->getIncomingBlock(i: 0) == InnerLoopPreHeader) |
1408 | InnerIndexVarList.push_back( |
1409 | Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: 1))); |
1410 | else |
1411 | InnerIndexVarList.push_back( |
1412 | Elt: dyn_cast<Instruction>(Val: CurInductionPHI->getIncomingValue(i: 0))); |
1413 | } |
1414 | |
1415 | // Create a new latch block for the inner loop. We split at the |
1416 | // current latch's terminator and then move the condition and all |
1417 | // operands that are not either loop-invariant or the induction PHI into the |
1418 | // new latch block. |
1419 | BasicBlock *NewLatch = |
1420 | SplitBlock(Old: InnerLoop->getLoopLatch(), |
1421 | SplitPt: InnerLoop->getLoopLatch()->getTerminator(), DT, LI); |
1422 | |
1423 | SmallSetVector<Instruction *, 4> WorkList; |
1424 | unsigned i = 0; |
1425 | auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() { |
1426 | for (; i < WorkList.size(); i++) { |
1427 | // Duplicate instruction and move it the new latch. Update uses that |
1428 | // have been moved. |
1429 | Instruction *NewI = WorkList[i]->clone(); |
1430 | NewI->insertBefore(InsertPos: NewLatch->getFirstNonPHIIt()); |
1431 | assert(!NewI->mayHaveSideEffects() && |
1432 | "Moving instructions with side-effects may change behavior of " |
1433 | "the loop nest!" ); |
1434 | for (Use &U : llvm::make_early_inc_range(Range: WorkList[i]->uses())) { |
1435 | Instruction *UserI = cast<Instruction>(Val: U.getUser()); |
1436 | if (!InnerLoop->contains(BB: UserI->getParent()) || |
1437 | UserI->getParent() == NewLatch || |
1438 | llvm::is_contained(Range: InductionPHIs, Element: UserI)) |
1439 | U.set(NewI); |
1440 | } |
1441 | // Add operands of moved instruction to the worklist, except if they are |
1442 | // outside the inner loop or are the induction PHI. |
1443 | for (Value *Op : WorkList[i]->operands()) { |
1444 | Instruction *OpI = dyn_cast<Instruction>(Val: Op); |
1445 | if (!OpI || |
1446 | this->LI->getLoopFor(BB: OpI->getParent()) != this->InnerLoop || |
1447 | llvm::is_contained(Range: InductionPHIs, Element: OpI)) |
1448 | continue; |
1449 | WorkList.insert(X: OpI); |
1450 | } |
1451 | } |
1452 | }; |
1453 | |
1454 | // FIXME: Should we interchange when we have a constant condition? |
1455 | Instruction *CondI = dyn_cast<Instruction>( |
1456 | Val: cast<BranchInst>(Val: InnerLoop->getLoopLatch()->getTerminator()) |
1457 | ->getCondition()); |
1458 | if (CondI) |
1459 | WorkList.insert(X: CondI); |
1460 | MoveInstructions(); |
1461 | for (Instruction *InnerIndexVar : InnerIndexVarList) |
1462 | WorkList.insert(X: cast<Instruction>(Val: InnerIndexVar)); |
1463 | MoveInstructions(); |
1464 | } |
1465 | |
1466 | // Ensure the inner loop phi nodes have a separate basic block. |
1467 | BasicBlock * = InnerLoop->getHeader(); |
1468 | if (&*InnerLoopHeader->getFirstNonPHIIt() != |
1469 | InnerLoopHeader->getTerminator()) { |
1470 | SplitBlock(Old: InnerLoopHeader, SplitPt: InnerLoopHeader->getFirstNonPHIIt(), DT, LI); |
1471 | LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n" ); |
1472 | } |
1473 | |
1474 | // Instructions in the original inner loop preheader may depend on values |
1475 | // defined in the outer loop header. Move them there, because the original |
1476 | // inner loop preheader will become the entry into the interchanged loop nest. |
1477 | // Currently we move all instructions and rely on LICM to move invariant |
1478 | // instructions outside the loop nest. |
1479 | BasicBlock * = InnerLoop->getLoopPreheader(); |
1480 | BasicBlock * = OuterLoop->getHeader(); |
1481 | if (InnerLoopPreHeader != OuterLoopHeader) { |
1482 | for (Instruction &I : |
1483 | make_early_inc_range(Range: make_range(x: InnerLoopPreHeader->begin(), |
1484 | y: std::prev(x: InnerLoopPreHeader->end())))) |
1485 | I.moveBeforePreserving(MovePos: OuterLoopHeader->getTerminator()->getIterator()); |
1486 | } |
1487 | |
1488 | Transformed |= adjustLoopLinks(); |
1489 | if (!Transformed) { |
1490 | LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n" ); |
1491 | return false; |
1492 | } |
1493 | |
1494 | return true; |
1495 | } |
1496 | |
1497 | /// \brief Move all instructions except the terminator from FromBB right before |
1498 | /// InsertBefore |
1499 | static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) { |
1500 | BasicBlock *ToBB = InsertBefore->getParent(); |
1501 | |
1502 | ToBB->splice(ToIt: InsertBefore->getIterator(), FromBB, FromBeginIt: FromBB->begin(), |
1503 | FromEndIt: FromBB->getTerminator()->getIterator()); |
1504 | } |
1505 | |
1506 | /// Swap instructions between \p BB1 and \p BB2 but keep terminators intact. |
1507 | static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) { |
1508 | // Save all non-terminator instructions of BB1 into TempInstrs and unlink them |
1509 | // from BB1 afterwards. |
1510 | auto Iter = map_range(C&: *BB1, F: [](Instruction &I) { return &I; }); |
1511 | SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(x: Iter.end())); |
1512 | for (Instruction *I : TempInstrs) |
1513 | I->removeFromParent(); |
1514 | |
1515 | // Move instructions from BB2 to BB1. |
1516 | moveBBContents(FromBB: BB2, InsertBefore: BB1->getTerminator()); |
1517 | |
1518 | // Move instructions from TempInstrs to BB2. |
1519 | for (Instruction *I : TempInstrs) |
1520 | I->insertBefore(InsertPos: BB2->getTerminator()->getIterator()); |
1521 | } |
1522 | |
1523 | // Update BI to jump to NewBB instead of OldBB. Records updates to the |
1524 | // dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that |
1525 | // \p OldBB is exactly once in BI's successor list. |
1526 | static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, |
1527 | BasicBlock *NewBB, |
1528 | std::vector<DominatorTree::UpdateType> &DTUpdates, |
1529 | bool MustUpdateOnce = true) { |
1530 | assert((!MustUpdateOnce || llvm::count(successors(BI), OldBB) == 1) && |
1531 | "BI must jump to OldBB exactly once." ); |
1532 | bool Changed = false; |
1533 | for (Use &Op : BI->operands()) |
1534 | if (Op == OldBB) { |
1535 | Op.set(NewBB); |
1536 | Changed = true; |
1537 | } |
1538 | |
1539 | if (Changed) { |
1540 | DTUpdates.push_back( |
1541 | x: {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB}); |
1542 | DTUpdates.push_back( |
1543 | x: {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB}); |
1544 | } |
1545 | assert(Changed && "Expected a successor to be updated" ); |
1546 | } |
1547 | |
1548 | // Move Lcssa PHIs to the right place. |
1549 | static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *, |
1550 | BasicBlock *InnerLatch, BasicBlock *, |
1551 | BasicBlock *OuterLatch, BasicBlock *OuterExit, |
1552 | Loop *InnerLoop, LoopInfo *LI) { |
1553 | |
1554 | // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are |
1555 | // defined either in the header or latch. Those blocks will become header and |
1556 | // latch of the new outer loop, and the only possible users can PHI nodes |
1557 | // in the exit block of the loop nest or the outer loop header (reduction |
1558 | // PHIs, in that case, the incoming value must be defined in the inner loop |
1559 | // header). We can just substitute the user with the incoming value and remove |
1560 | // the PHI. |
1561 | for (PHINode &P : make_early_inc_range(Range: InnerExit->phis())) { |
1562 | assert(P.getNumIncomingValues() == 1 && |
1563 | "Only loops with a single exit are supported!" ); |
1564 | |
1565 | // Incoming values are guaranteed be instructions currently. |
1566 | auto IncI = cast<Instruction>(Val: P.getIncomingValueForBlock(BB: InnerLatch)); |
1567 | // In case of multi-level nested loops, follow LCSSA to find the incoming |
1568 | // value defined from the innermost loop. |
1569 | auto IncIInnerMost = cast<Instruction>(Val: followLCSSA(SV: IncI)); |
1570 | // Skip phis with incoming values from the inner loop body, excluding the |
1571 | // header and latch. |
1572 | if (IncIInnerMost->getParent() != InnerLatch && |
1573 | IncIInnerMost->getParent() != InnerHeader) |
1574 | continue; |
1575 | |
1576 | assert(all_of(P.users(), |
1577 | [OuterHeader, OuterExit, IncI, InnerHeader](User *U) { |
1578 | return (cast<PHINode>(U)->getParent() == OuterHeader && |
1579 | IncI->getParent() == InnerHeader) || |
1580 | cast<PHINode>(U)->getParent() == OuterExit; |
1581 | }) && |
1582 | "Can only replace phis iff the uses are in the loop nest exit or " |
1583 | "the incoming value is defined in the inner header (it will " |
1584 | "dominate all loop blocks after interchanging)" ); |
1585 | P.replaceAllUsesWith(V: IncI); |
1586 | P.eraseFromParent(); |
1587 | } |
1588 | |
1589 | SmallVector<PHINode *, 8> LcssaInnerExit( |
1590 | llvm::make_pointer_range(Range: InnerExit->phis())); |
1591 | |
1592 | SmallVector<PHINode *, 8> LcssaInnerLatch( |
1593 | llvm::make_pointer_range(Range: InnerLatch->phis())); |
1594 | |
1595 | // Lcssa PHIs for values used outside the inner loop are in InnerExit. |
1596 | // If a PHI node has users outside of InnerExit, it has a use outside the |
1597 | // interchanged loop and we have to preserve it. We move these to |
1598 | // InnerLatch, which will become the new exit block for the innermost |
1599 | // loop after interchanging. |
1600 | for (PHINode *P : LcssaInnerExit) |
1601 | P->moveBefore(InsertPos: InnerLatch->getFirstNonPHIIt()); |
1602 | |
1603 | // If the inner loop latch contains LCSSA PHIs, those come from a child loop |
1604 | // and we have to move them to the new inner latch. |
1605 | for (PHINode *P : LcssaInnerLatch) |
1606 | P->moveBefore(InsertPos: InnerExit->getFirstNonPHIIt()); |
1607 | |
1608 | // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have |
1609 | // incoming values defined in the outer loop, we have to add a new PHI |
1610 | // in the inner loop latch, which became the exit block of the outer loop, |
1611 | // after interchanging. |
1612 | if (OuterExit) { |
1613 | for (PHINode &P : OuterExit->phis()) { |
1614 | if (P.getNumIncomingValues() != 1) |
1615 | continue; |
1616 | // Skip Phis with incoming values defined in the inner loop. Those should |
1617 | // already have been updated. |
1618 | auto I = dyn_cast<Instruction>(Val: P.getIncomingValue(i: 0)); |
1619 | if (!I || LI->getLoopFor(BB: I->getParent()) == InnerLoop) |
1620 | continue; |
1621 | |
1622 | PHINode *NewPhi = dyn_cast<PHINode>(Val: P.clone()); |
1623 | NewPhi->setIncomingValue(i: 0, V: P.getIncomingValue(i: 0)); |
1624 | NewPhi->setIncomingBlock(i: 0, BB: OuterLatch); |
1625 | // We might have incoming edges from other BBs, i.e., the original outer |
1626 | // header. |
1627 | for (auto *Pred : predecessors(BB: InnerLatch)) { |
1628 | if (Pred == OuterLatch) |
1629 | continue; |
1630 | NewPhi->addIncoming(V: P.getIncomingValue(i: 0), BB: Pred); |
1631 | } |
1632 | NewPhi->insertBefore(InsertPos: InnerLatch->getFirstNonPHIIt()); |
1633 | P.setIncomingValue(i: 0, V: NewPhi); |
1634 | } |
1635 | } |
1636 | |
1637 | // Now adjust the incoming blocks for the LCSSA PHIs. |
1638 | // For PHIs moved from Inner's exit block, we need to replace Inner's latch |
1639 | // with the new latch. |
1640 | InnerLatch->replacePhiUsesWith(Old: InnerLatch, New: OuterLatch); |
1641 | } |
1642 | |
1643 | bool LoopInterchangeTransform::adjustLoopBranches() { |
1644 | LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n" ); |
1645 | std::vector<DominatorTree::UpdateType> DTUpdates; |
1646 | |
1647 | BasicBlock * = OuterLoop->getLoopPreheader(); |
1648 | BasicBlock * = InnerLoop->getLoopPreheader(); |
1649 | |
1650 | assert(OuterLoopPreHeader != OuterLoop->getHeader() && |
1651 | InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader && |
1652 | InnerLoopPreHeader && "Guaranteed by loop-simplify form" ); |
1653 | // Ensure that both preheaders do not contain PHI nodes and have single |
1654 | // predecessors. This allows us to move them easily. We use |
1655 | // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing |
1656 | // preheaders do not satisfy those conditions. |
1657 | if (isa<PHINode>(Val: OuterLoopPreHeader->begin()) || |
1658 | !OuterLoopPreHeader->getUniquePredecessor()) |
1659 | OuterLoopPreHeader = |
1660 | InsertPreheaderForLoop(L: OuterLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true); |
1661 | if (InnerLoopPreHeader == OuterLoop->getHeader()) |
1662 | InnerLoopPreHeader = |
1663 | InsertPreheaderForLoop(L: InnerLoop, DT, LI, MSSAU: nullptr, PreserveLCSSA: true); |
1664 | |
1665 | // Adjust the loop preheader |
1666 | BasicBlock * = InnerLoop->getHeader(); |
1667 | BasicBlock * = OuterLoop->getHeader(); |
1668 | BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch(); |
1669 | BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch(); |
1670 | BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor(); |
1671 | BasicBlock *InnerLoopLatchPredecessor = |
1672 | InnerLoopLatch->getUniquePredecessor(); |
1673 | BasicBlock *InnerLoopLatchSuccessor; |
1674 | BasicBlock *OuterLoopLatchSuccessor; |
1675 | |
1676 | BranchInst *OuterLoopLatchBI = |
1677 | dyn_cast<BranchInst>(Val: OuterLoopLatch->getTerminator()); |
1678 | BranchInst *InnerLoopLatchBI = |
1679 | dyn_cast<BranchInst>(Val: InnerLoopLatch->getTerminator()); |
1680 | BranchInst * = |
1681 | dyn_cast<BranchInst>(Val: OuterLoopHeader->getTerminator()); |
1682 | BranchInst * = |
1683 | dyn_cast<BranchInst>(Val: InnerLoopHeader->getTerminator()); |
1684 | |
1685 | if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor || |
1686 | !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI || |
1687 | !InnerLoopHeaderBI) |
1688 | return false; |
1689 | |
1690 | BranchInst *InnerLoopLatchPredecessorBI = |
1691 | dyn_cast<BranchInst>(Val: InnerLoopLatchPredecessor->getTerminator()); |
1692 | BranchInst *OuterLoopPredecessorBI = |
1693 | dyn_cast<BranchInst>(Val: OuterLoopPredecessor->getTerminator()); |
1694 | |
1695 | if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI) |
1696 | return false; |
1697 | BasicBlock * = InnerLoopHeader->getUniqueSuccessor(); |
1698 | if (!InnerLoopHeaderSuccessor) |
1699 | return false; |
1700 | |
1701 | // Adjust Loop Preheader and headers. |
1702 | // The branches in the outer loop predecessor and the outer loop header can |
1703 | // be unconditional branches or conditional branches with duplicates. Consider |
1704 | // this when updating the successors. |
1705 | updateSuccessor(BI: OuterLoopPredecessorBI, OldBB: OuterLoopPreHeader, |
1706 | NewBB: InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false); |
1707 | // The outer loop header might or might not branch to the outer latch. |
1708 | // We are guaranteed to branch to the inner loop preheader. |
1709 | if (llvm::is_contained(Range: OuterLoopHeaderBI->successors(), Element: OuterLoopLatch)) { |
1710 | // In this case the outerLoopHeader should branch to the InnerLoopLatch. |
1711 | updateSuccessor(BI: OuterLoopHeaderBI, OldBB: OuterLoopLatch, NewBB: InnerLoopLatch, |
1712 | DTUpdates, |
1713 | /*MustUpdateOnce=*/false); |
1714 | } |
1715 | updateSuccessor(BI: OuterLoopHeaderBI, OldBB: InnerLoopPreHeader, |
1716 | NewBB: InnerLoopHeaderSuccessor, DTUpdates, |
1717 | /*MustUpdateOnce=*/false); |
1718 | |
1719 | // Adjust reduction PHI's now that the incoming block has changed. |
1720 | InnerLoopHeaderSuccessor->replacePhiUsesWith(Old: InnerLoopHeader, |
1721 | New: OuterLoopHeader); |
1722 | |
1723 | updateSuccessor(BI: InnerLoopHeaderBI, OldBB: InnerLoopHeaderSuccessor, |
1724 | NewBB: OuterLoopPreHeader, DTUpdates); |
1725 | |
1726 | // -------------Adjust loop latches----------- |
1727 | if (InnerLoopLatchBI->getSuccessor(i: 0) == InnerLoopHeader) |
1728 | InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: 1); |
1729 | else |
1730 | InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(i: 0); |
1731 | |
1732 | updateSuccessor(BI: InnerLoopLatchPredecessorBI, OldBB: InnerLoopLatch, |
1733 | NewBB: InnerLoopLatchSuccessor, DTUpdates); |
1734 | |
1735 | if (OuterLoopLatchBI->getSuccessor(i: 0) == OuterLoopHeader) |
1736 | OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: 1); |
1737 | else |
1738 | OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(i: 0); |
1739 | |
1740 | updateSuccessor(BI: InnerLoopLatchBI, OldBB: InnerLoopLatchSuccessor, |
1741 | NewBB: OuterLoopLatchSuccessor, DTUpdates); |
1742 | updateSuccessor(BI: OuterLoopLatchBI, OldBB: OuterLoopLatchSuccessor, NewBB: InnerLoopLatch, |
1743 | DTUpdates); |
1744 | |
1745 | DT->applyUpdates(Updates: DTUpdates); |
1746 | restructureLoops(NewInner: OuterLoop, NewOuter: InnerLoop, OrigInnerPreHeader: InnerLoopPreHeader, |
1747 | OrigOuterPreHeader: OuterLoopPreHeader); |
1748 | |
1749 | moveLCSSAPhis(InnerExit: InnerLoopLatchSuccessor, InnerHeader: InnerLoopHeader, InnerLatch: InnerLoopLatch, |
1750 | OuterHeader: OuterLoopHeader, OuterLatch: OuterLoopLatch, OuterExit: InnerLoop->getExitBlock(), |
1751 | InnerLoop, LI); |
1752 | // For PHIs in the exit block of the outer loop, outer's latch has been |
1753 | // replaced by Inners'. |
1754 | OuterLoopLatchSuccessor->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch); |
1755 | |
1756 | auto &OuterInnerReductions = LIL.getOuterInnerReductions(); |
1757 | // Now update the reduction PHIs in the inner and outer loop headers. |
1758 | SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs; |
1759 | for (PHINode &PHI : InnerLoopHeader->phis()) |
1760 | if (OuterInnerReductions.contains(Ptr: &PHI)) |
1761 | InnerLoopPHIs.push_back(Elt: &PHI); |
1762 | |
1763 | for (PHINode &PHI : OuterLoopHeader->phis()) |
1764 | if (OuterInnerReductions.contains(Ptr: &PHI)) |
1765 | OuterLoopPHIs.push_back(Elt: &PHI); |
1766 | |
1767 | // Now move the remaining reduction PHIs from outer to inner loop header and |
1768 | // vice versa. The PHI nodes must be part of a reduction across the inner and |
1769 | // outer loop and all the remains to do is and updating the incoming blocks. |
1770 | for (PHINode *PHI : OuterLoopPHIs) { |
1771 | LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n" ; PHI->dump();); |
1772 | PHI->moveBefore(InsertPos: InnerLoopHeader->getFirstNonPHIIt()); |
1773 | assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node" ); |
1774 | } |
1775 | for (PHINode *PHI : InnerLoopPHIs) { |
1776 | LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n" ; PHI->dump();); |
1777 | PHI->moveBefore(InsertPos: OuterLoopHeader->getFirstNonPHIIt()); |
1778 | assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node" ); |
1779 | } |
1780 | |
1781 | // Update the incoming blocks for moved PHI nodes. |
1782 | OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopPreHeader, New: OuterLoopPreHeader); |
1783 | OuterLoopHeader->replacePhiUsesWith(Old: InnerLoopLatch, New: OuterLoopLatch); |
1784 | InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopPreHeader, New: InnerLoopPreHeader); |
1785 | InnerLoopHeader->replacePhiUsesWith(Old: OuterLoopLatch, New: InnerLoopLatch); |
1786 | |
1787 | // Values defined in the outer loop header could be used in the inner loop |
1788 | // latch. In that case, we need to create LCSSA phis for them, because after |
1789 | // interchanging they will be defined in the new inner loop and used in the |
1790 | // new outer loop. |
1791 | SmallVector<Instruction *, 4> MayNeedLCSSAPhis; |
1792 | for (Instruction &I : |
1793 | make_range(x: OuterLoopHeader->begin(), y: std::prev(x: OuterLoopHeader->end()))) |
1794 | MayNeedLCSSAPhis.push_back(Elt: &I); |
1795 | formLCSSAForInstructions(Worklist&: MayNeedLCSSAPhis, DT: *DT, LI: *LI, SE); |
1796 | |
1797 | return true; |
1798 | } |
1799 | |
1800 | bool LoopInterchangeTransform::adjustLoopLinks() { |
1801 | // Adjust all branches in the inner and outer loop. |
1802 | bool Changed = adjustLoopBranches(); |
1803 | if (Changed) { |
1804 | // We have interchanged the preheaders so we need to interchange the data in |
1805 | // the preheaders as well. This is because the content of the inner |
1806 | // preheader was previously executed inside the outer loop. |
1807 | BasicBlock * = OuterLoop->getLoopPreheader(); |
1808 | BasicBlock * = InnerLoop->getLoopPreheader(); |
1809 | swapBBContents(BB1: OuterLoopPreHeader, BB2: InnerLoopPreHeader); |
1810 | } |
1811 | return Changed; |
1812 | } |
1813 | |
1814 | PreservedAnalyses LoopInterchangePass::run(LoopNest &LN, |
1815 | LoopAnalysisManager &AM, |
1816 | LoopStandardAnalysisResults &AR, |
1817 | LPMUpdater &U) { |
1818 | Function &F = *LN.getParent(); |
1819 | SmallVector<Loop *, 8> LoopList(LN.getLoops()); |
1820 | |
1821 | if (MaxMemInstrCount < 1) { |
1822 | LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1" ); |
1823 | return PreservedAnalyses::all(); |
1824 | } |
1825 | OptimizationRemarkEmitter ORE(&F); |
1826 | |
1827 | // Ensure minimum depth of the loop nest to do the interchange. |
1828 | if (!hasSupportedLoopDepth(LoopList, ORE)) |
1829 | return PreservedAnalyses::all(); |
1830 | // Ensure computable loop nest. |
1831 | if (!isComputableLoopNest(SE: &AR.SE, LoopList)) { |
1832 | LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n" ); |
1833 | return PreservedAnalyses::all(); |
1834 | } |
1835 | |
1836 | ORE.emit(RemarkBuilder: [&]() { |
1837 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "Dependence" , |
1838 | LN.getOutermostLoop().getStartLoc(), |
1839 | LN.getOutermostLoop().getHeader()) |
1840 | << "Computed dependence info, invoking the transform." ; |
1841 | }); |
1842 | |
1843 | DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI); |
1844 | std::unique_ptr<CacheCost> CC = |
1845 | CacheCost::getCacheCost(Root&: LN.getOutermostLoop(), AR, DI); |
1846 | |
1847 | if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN)) |
1848 | return PreservedAnalyses::all(); |
1849 | U.markLoopNestChanged(Changed: true); |
1850 | return getLoopPassPreservedAnalyses(); |
1851 | } |
1852 | |