1 | //===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass does the inverse transformation of what LICM does. |
10 | // It traverses all of the instructions in the loop's preheader and sinks |
11 | // them to the loop body where frequency is lower than the loop's preheader. |
12 | // This pass is a reverse-transformation of LICM. It differs from the Sink |
13 | // pass in the following ways: |
14 | // |
15 | // * It only handles sinking of instructions from the loop's preheader to the |
16 | // loop's body |
17 | // * It uses alias set tracker to get more accurate alias info |
18 | // * It uses block frequency info to find the optimal sinking locations |
19 | // |
20 | // Overall algorithm: |
21 | // |
22 | // For I in Preheader: |
23 | // InsertBBs = BBs that uses I |
24 | // For BB in sorted(LoopBBs): |
25 | // DomBBs = BBs in InsertBBs that are dominated by BB |
26 | // if freq(DomBBs) > freq(BB) |
27 | // InsertBBs = UseBBs - DomBBs + BB |
28 | // For BB in InsertBBs: |
29 | // Insert I at BB's beginning |
30 | // |
31 | //===----------------------------------------------------------------------===// |
32 | |
33 | #include "llvm/Transforms/Scalar/LoopSink.h" |
34 | #include "llvm/ADT/SetOperations.h" |
35 | #include "llvm/ADT/Statistic.h" |
36 | #include "llvm/Analysis/AliasAnalysis.h" |
37 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
38 | #include "llvm/Analysis/LoopInfo.h" |
39 | #include "llvm/Analysis/MemorySSA.h" |
40 | #include "llvm/Analysis/MemorySSAUpdater.h" |
41 | #include "llvm/Analysis/ScalarEvolution.h" |
42 | #include "llvm/IR/Dominators.h" |
43 | #include "llvm/IR/Instructions.h" |
44 | #include "llvm/Support/BranchProbability.h" |
45 | #include "llvm/Support/CommandLine.h" |
46 | #include "llvm/Transforms/Scalar.h" |
47 | #include "llvm/Transforms/Utils/Local.h" |
48 | #include "llvm/Transforms/Utils/LoopUtils.h" |
49 | using namespace llvm; |
50 | |
51 | #define DEBUG_TYPE "loopsink" |
52 | |
53 | STATISTIC(NumLoopSunk, "Number of instructions sunk into loop" ); |
54 | STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop" ); |
55 | |
56 | static cl::opt<unsigned> SinkFrequencyPercentThreshold( |
57 | "sink-freq-percent-threshold" , cl::Hidden, cl::init(Val: 90), |
58 | cl::desc("Do not sink instructions that require cloning unless they " |
59 | "execute less than this percent of the time." )); |
60 | |
61 | static cl::opt<unsigned> MaxNumberOfUseBBsForSinking( |
62 | "max-uses-for-sinking" , cl::Hidden, cl::init(Val: 30), |
63 | cl::desc("Do not sink instructions that have too many uses." )); |
64 | |
65 | /// Return adjusted total frequency of \p BBs. |
66 | /// |
67 | /// * If there is only one BB, sinking instruction will not introduce code |
68 | /// size increase. Thus there is no need to adjust the frequency. |
69 | /// * If there are more than one BB, sinking would lead to code size increase. |
70 | /// In this case, we add some "tax" to the total frequency to make it harder |
71 | /// to sink. E.g. |
72 | /// Freq(Preheader) = 100 |
73 | /// Freq(BBs) = sum(50, 49) = 99 |
74 | /// Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheade to |
75 | /// BBs as the difference is too small to justify the code size increase. |
76 | /// To model this, The adjusted Freq(BBs) will be: |
77 | /// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold% |
78 | static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs, |
79 | BlockFrequencyInfo &BFI) { |
80 | BlockFrequency T(0); |
81 | for (BasicBlock *B : BBs) |
82 | T += BFI.getBlockFreq(BB: B); |
83 | if (BBs.size() > 1) |
84 | T /= BranchProbability(SinkFrequencyPercentThreshold, 100); |
85 | return T; |
86 | } |
87 | |
88 | /// Return a set of basic blocks to insert sinked instructions. |
89 | /// |
90 | /// The returned set of basic blocks (BBsToSinkInto) should satisfy: |
91 | /// |
92 | /// * Inside the loop \p L |
93 | /// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto |
94 | /// that domintates the UseBB |
95 | /// * Has minimum total frequency that is no greater than preheader frequency |
96 | /// |
97 | /// The purpose of the function is to find the optimal sinking points to |
98 | /// minimize execution cost, which is defined as "sum of frequency of |
99 | /// BBsToSinkInto". |
100 | /// As a result, the returned BBsToSinkInto needs to have minimum total |
101 | /// frequency. |
102 | /// Additionally, if the total frequency of BBsToSinkInto exceeds preheader |
103 | /// frequency, the optimal solution is not sinking (return empty set). |
104 | /// |
105 | /// \p ColdLoopBBs is used to help find the optimal sinking locations. |
106 | /// It stores a list of BBs that is: |
107 | /// |
108 | /// * Inside the loop \p L |
109 | /// * Has a frequency no larger than the loop's preheader |
110 | /// * Sorted by BB frequency |
111 | /// |
112 | /// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()). |
113 | /// To avoid expensive computation, we cap the maximum UseBBs.size() in its |
114 | /// caller. |
115 | static SmallPtrSet<BasicBlock *, 2> |
116 | findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs, |
117 | const SmallVectorImpl<BasicBlock *> &ColdLoopBBs, |
118 | DominatorTree &DT, BlockFrequencyInfo &BFI) { |
119 | SmallPtrSet<BasicBlock *, 2> BBsToSinkInto; |
120 | if (UseBBs.size() == 0) |
121 | return BBsToSinkInto; |
122 | |
123 | BBsToSinkInto.insert_range(R: UseBBs); |
124 | SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB; |
125 | |
126 | // For every iteration: |
127 | // * Pick the ColdestBB from ColdLoopBBs |
128 | // * Find the set BBsDominatedByColdestBB that satisfy: |
129 | // - BBsDominatedByColdestBB is a subset of BBsToSinkInto |
130 | // - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB |
131 | // * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove |
132 | // BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to |
133 | // BBsToSinkInto |
134 | for (BasicBlock *ColdestBB : ColdLoopBBs) { |
135 | BBsDominatedByColdestBB.clear(); |
136 | for (BasicBlock *SinkedBB : BBsToSinkInto) |
137 | if (DT.dominates(A: ColdestBB, B: SinkedBB)) |
138 | BBsDominatedByColdestBB.insert(Ptr: SinkedBB); |
139 | if (BBsDominatedByColdestBB.size() == 0) |
140 | continue; |
141 | if (adjustedSumFreq(BBs&: BBsDominatedByColdestBB, BFI) > |
142 | BFI.getBlockFreq(BB: ColdestBB)) { |
143 | for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) { |
144 | BBsToSinkInto.erase(Ptr: DominatedBB); |
145 | } |
146 | BBsToSinkInto.insert(Ptr: ColdestBB); |
147 | continue; |
148 | } |
149 | // Otherwise, see if we can stop the search through the cold BBs early. |
150 | // Since the ColdLoopBBs list is sorted in increasing magnitude of |
151 | // frequency the cold BB frequencies can only get larger. The |
152 | // BBsToSinkInto set can only get smaller and have a smaller |
153 | // adjustedSumFreq, due to the earlier checking. So once we find a cold BB |
154 | // with a frequency at least as large as the adjustedSumFreq of the |
155 | // current BBsToSinkInto set, the earlier frequency check can never be |
156 | // true for a future iteration. Note we could do check this more |
157 | // aggressively earlier, but in practice this ended up being more |
158 | // expensive overall (added checking to the critical path through the loop |
159 | // that often ended up continuing early due to an empty |
160 | // BBsDominatedByColdestBB set, and the frequency check there was false |
161 | // most of the time anyway). |
162 | if (adjustedSumFreq(BBs&: BBsToSinkInto, BFI) <= BFI.getBlockFreq(BB: ColdestBB)) |
163 | break; |
164 | } |
165 | |
166 | // Can't sink into blocks that have no valid insertion point. |
167 | for (BasicBlock *BB : BBsToSinkInto) { |
168 | if (BB->getFirstInsertionPt() == BB->end()) { |
169 | BBsToSinkInto.clear(); |
170 | break; |
171 | } |
172 | } |
173 | |
174 | // If the total frequency of BBsToSinkInto is larger than preheader frequency, |
175 | // do not sink. |
176 | if (adjustedSumFreq(BBs&: BBsToSinkInto, BFI) > |
177 | BFI.getBlockFreq(BB: L.getLoopPreheader())) |
178 | BBsToSinkInto.clear(); |
179 | return BBsToSinkInto; |
180 | } |
181 | |
182 | // Sinks \p I from the loop \p L's preheader to its uses. Returns true if |
183 | // sinking is successful. |
184 | // \p LoopBlockNumber is used to sort the insertion blocks to ensure |
185 | // determinism. |
186 | static bool sinkInstruction( |
187 | Loop &L, Instruction &I, const SmallVectorImpl<BasicBlock *> &ColdLoopBBs, |
188 | const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber, LoopInfo &LI, |
189 | DominatorTree &DT, BlockFrequencyInfo &BFI, MemorySSAUpdater *MSSAU) { |
190 | // Compute the set of blocks in loop L which contain a use of I. |
191 | SmallPtrSet<BasicBlock *, 2> BBs; |
192 | for (auto &U : I.uses()) { |
193 | Instruction *UI = cast<Instruction>(Val: U.getUser()); |
194 | |
195 | // We cannot sink I if it has uses outside of the loop. |
196 | if (!L.contains(L: LI.getLoopFor(BB: UI->getParent()))) |
197 | return false; |
198 | |
199 | if (!isa<PHINode>(Val: UI)) { |
200 | BBs.insert(Ptr: UI->getParent()); |
201 | continue; |
202 | } |
203 | |
204 | // We cannot sink I to PHI-uses, try to look through PHI to find the incoming |
205 | // block of the value being used. |
206 | PHINode *PN = dyn_cast<PHINode>(Val: UI); |
207 | BasicBlock *PhiBB = PN->getIncomingBlock(U); |
208 | |
209 | // If value's incoming block is from loop preheader directly, there's no |
210 | // place to sink to, bailout. |
211 | if (L.getLoopPreheader() == PhiBB) |
212 | return false; |
213 | |
214 | BBs.insert(Ptr: PhiBB); |
215 | } |
216 | |
217 | // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max |
218 | // BBs.size() to avoid expensive computation. |
219 | // FIXME: Handle code size growth for min_size and opt_size. |
220 | if (BBs.size() > MaxNumberOfUseBBsForSinking) |
221 | return false; |
222 | |
223 | // Find the set of BBs that we should insert a copy of I. |
224 | SmallPtrSet<BasicBlock *, 2> BBsToSinkInto = |
225 | findBBsToSinkInto(L, UseBBs: BBs, ColdLoopBBs, DT, BFI); |
226 | if (BBsToSinkInto.empty()) |
227 | return false; |
228 | |
229 | // Return if any of the candidate blocks to sink into is non-cold. |
230 | if (BBsToSinkInto.size() > 1 && |
231 | !llvm::set_is_subset(S1: BBsToSinkInto, S2: LoopBlockNumber)) |
232 | return false; |
233 | |
234 | // Copy the final BBs into a vector and sort them using the total ordering |
235 | // of the loop block numbers as iterating the set doesn't give a useful |
236 | // order. No need to stable sort as the block numbers are a total ordering. |
237 | SmallVector<BasicBlock *, 2> SortedBBsToSinkInto; |
238 | llvm::append_range(C&: SortedBBsToSinkInto, R&: BBsToSinkInto); |
239 | if (SortedBBsToSinkInto.size() > 1) { |
240 | llvm::sort(C&: SortedBBsToSinkInto, Comp: [&](BasicBlock *A, BasicBlock *B) { |
241 | return LoopBlockNumber.find(Val: A)->second < LoopBlockNumber.find(Val: B)->second; |
242 | }); |
243 | } |
244 | |
245 | BasicBlock *MoveBB = *SortedBBsToSinkInto.begin(); |
246 | // FIXME: Optimize the efficiency for cloned value replacement. The current |
247 | // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()). |
248 | for (BasicBlock *N : ArrayRef(SortedBBsToSinkInto).drop_front(N: 1)) { |
249 | assert(LoopBlockNumber.find(N)->second > |
250 | LoopBlockNumber.find(MoveBB)->second && |
251 | "BBs not sorted!" ); |
252 | // Clone I and replace its uses. |
253 | Instruction *IC = I.clone(); |
254 | IC->setName(I.getName()); |
255 | IC->insertBefore(InsertPos: N->getFirstInsertionPt()); |
256 | |
257 | if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(I: &I)) { |
258 | // Create a new MemoryAccess and let MemorySSA set its defining access. |
259 | MemoryAccess *NewMemAcc = |
260 | MSSAU->createMemoryAccessInBB(I: IC, Definition: nullptr, BB: N, Point: MemorySSA::Beginning); |
261 | if (NewMemAcc) { |
262 | if (auto *MemDef = dyn_cast<MemoryDef>(Val: NewMemAcc)) |
263 | MSSAU->insertDef(Def: MemDef, /*RenameUses=*/true); |
264 | else { |
265 | auto *MemUse = cast<MemoryUse>(Val: NewMemAcc); |
266 | MSSAU->insertUse(Use: MemUse, /*RenameUses=*/true); |
267 | } |
268 | } |
269 | } |
270 | |
271 | // Replaces uses of I with IC in N, except PHI-use which is being taken |
272 | // care of by defs in PHI's incoming blocks. |
273 | I.replaceUsesWithIf(New: IC, ShouldReplace: [N](Use &U) { |
274 | Instruction *UIToReplace = cast<Instruction>(Val: U.getUser()); |
275 | return UIToReplace->getParent() == N && !isa<PHINode>(Val: UIToReplace); |
276 | }); |
277 | // Replaces uses of I with IC in blocks dominated by N |
278 | replaceDominatedUsesWith(From: &I, To: IC, DT, BB: N); |
279 | LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName() |
280 | << '\n'); |
281 | NumLoopSunkCloned++; |
282 | } |
283 | LLVM_DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n'); |
284 | NumLoopSunk++; |
285 | I.moveBefore(InsertPos: MoveBB->getFirstInsertionPt()); |
286 | |
287 | if (MSSAU) |
288 | if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>( |
289 | Val: MSSAU->getMemorySSA()->getMemoryAccess(I: &I))) |
290 | MSSAU->moveToPlace(What: OldMemAcc, BB: MoveBB, Where: MemorySSA::Beginning); |
291 | |
292 | return true; |
293 | } |
294 | |
295 | /// Sinks instructions from loop's preheader to the loop body if the |
296 | /// sum frequency of inserted copy is smaller than preheader's frequency. |
297 | static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, |
298 | DominatorTree &DT, |
299 | BlockFrequencyInfo &BFI, |
300 | MemorySSA &MSSA, |
301 | ScalarEvolution *SE) { |
302 | BasicBlock * = L.getLoopPreheader(); |
303 | assert(Preheader && "Expected loop to have preheader" ); |
304 | |
305 | assert(Preheader->getParent()->hasProfileData() && |
306 | "Unexpected call when profile data unavailable." ); |
307 | |
308 | const BlockFrequency = BFI.getBlockFreq(BB: Preheader); |
309 | // If there are no basic blocks with lower frequency than the preheader then |
310 | // we can avoid the detailed analysis as we will never find profitable sinking |
311 | // opportunities. |
312 | if (all_of(Range: L.blocks(), P: [&](const BasicBlock *BB) { |
313 | return BFI.getBlockFreq(BB) > PreheaderFreq; |
314 | })) |
315 | return false; |
316 | |
317 | MemorySSAUpdater MSSAU(&MSSA); |
318 | SinkAndHoistLICMFlags LICMFlags(/*IsSink=*/true, L, MSSA); |
319 | |
320 | bool Changed = false; |
321 | |
322 | // Sort loop's basic blocks by frequency |
323 | SmallVector<BasicBlock *, 10> ColdLoopBBs; |
324 | SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber; |
325 | int i = 0; |
326 | for (BasicBlock *B : L.blocks()) |
327 | if (BFI.getBlockFreq(BB: B) < BFI.getBlockFreq(BB: L.getLoopPreheader())) { |
328 | ColdLoopBBs.push_back(Elt: B); |
329 | LoopBlockNumber[B] = ++i; |
330 | } |
331 | llvm::stable_sort(Range&: ColdLoopBBs, C: [&](BasicBlock *A, BasicBlock *B) { |
332 | return BFI.getBlockFreq(BB: A) < BFI.getBlockFreq(BB: B); |
333 | }); |
334 | |
335 | // Traverse preheader's instructions in reverse order because if A depends |
336 | // on B (A appears after B), A needs to be sunk first before B can be |
337 | // sinked. |
338 | for (Instruction &I : llvm::make_early_inc_range(Range: llvm::reverse(C&: *Preheader))) { |
339 | if (isa<PHINode>(Val: &I)) |
340 | continue; |
341 | // No need to check for instruction's operands are loop invariant. |
342 | assert(L.hasLoopInvariantOperands(&I) && |
343 | "Insts in a loop's preheader should have loop invariant operands!" ); |
344 | if (!canSinkOrHoistInst(I, AA: &AA, DT: &DT, CurLoop: &L, MSSAU, TargetExecutesOncePerLoop: false, LICMFlags)) |
345 | continue; |
346 | if (sinkInstruction(L, I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI, |
347 | MSSAU: &MSSAU)) { |
348 | Changed = true; |
349 | if (SE) |
350 | SE->forgetBlockAndLoopDispositions(V: &I); |
351 | } |
352 | } |
353 | |
354 | return Changed; |
355 | } |
356 | |
357 | PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) { |
358 | // Enable LoopSink only when runtime profile is available. |
359 | // With static profile, the sinking decision may be sub-optimal. |
360 | if (!F.hasProfileData()) |
361 | return PreservedAnalyses::all(); |
362 | |
363 | LoopInfo &LI = FAM.getResult<LoopAnalysis>(IR&: F); |
364 | // Nothing to do if there are no loops. |
365 | if (LI.empty()) |
366 | return PreservedAnalyses::all(); |
367 | |
368 | AAResults &AA = FAM.getResult<AAManager>(IR&: F); |
369 | DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(IR&: F); |
370 | BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(IR&: F); |
371 | MemorySSA &MSSA = FAM.getResult<MemorySSAAnalysis>(IR&: F).getMSSA(); |
372 | |
373 | // We want to do a postorder walk over the loops. Since loops are a tree this |
374 | // is equivalent to a reversed preorder walk and preorder is easy to compute |
375 | // without recursion. Since we reverse the preorder, we will visit siblings |
376 | // in reverse program order. This isn't expected to matter at all but is more |
377 | // consistent with sinking algorithms which generally work bottom-up. |
378 | SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder(); |
379 | |
380 | bool Changed = false; |
381 | do { |
382 | Loop &L = *PreorderLoops.pop_back_val(); |
383 | |
384 | BasicBlock * = L.getLoopPreheader(); |
385 | if (!Preheader) |
386 | continue; |
387 | |
388 | // Note that we don't pass SCEV here because it is only used to invalidate |
389 | // loops in SCEV and we don't preserve (or request) SCEV at all making that |
390 | // unnecessary. |
391 | Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI, MSSA, |
392 | /*ScalarEvolution*/ SE: nullptr); |
393 | } while (!PreorderLoops.empty()); |
394 | |
395 | if (!Changed) |
396 | return PreservedAnalyses::all(); |
397 | |
398 | PreservedAnalyses PA; |
399 | PA.preserveSet<CFGAnalyses>(); |
400 | PA.preserve<MemorySSAAnalysis>(); |
401 | |
402 | if (VerifyMemorySSA) |
403 | MSSA.verifyMemorySSA(); |
404 | |
405 | return PA; |
406 | } |
407 | |