1 | ///===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" |
10 | #include "llvm/ADT/DenseMap.h" |
11 | #include "llvm/ADT/STLExtras.h" |
12 | #include "llvm/ADT/Sequence.h" |
13 | #include "llvm/ADT/SetVector.h" |
14 | #include "llvm/ADT/SmallPtrSet.h" |
15 | #include "llvm/ADT/SmallVector.h" |
16 | #include "llvm/ADT/Statistic.h" |
17 | #include "llvm/ADT/Twine.h" |
18 | #include "llvm/Analysis/AssumptionCache.h" |
19 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
20 | #include "llvm/Analysis/CFG.h" |
21 | #include "llvm/Analysis/CodeMetrics.h" |
22 | #include "llvm/Analysis/DomTreeUpdater.h" |
23 | #include "llvm/Analysis/GuardUtils.h" |
24 | #include "llvm/Analysis/LoopAnalysisManager.h" |
25 | #include "llvm/Analysis/LoopInfo.h" |
26 | #include "llvm/Analysis/LoopIterator.h" |
27 | #include "llvm/Analysis/MemorySSA.h" |
28 | #include "llvm/Analysis/MemorySSAUpdater.h" |
29 | #include "llvm/Analysis/MustExecute.h" |
30 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
31 | #include "llvm/Analysis/ScalarEvolution.h" |
32 | #include "llvm/Analysis/TargetTransformInfo.h" |
33 | #include "llvm/Analysis/ValueTracking.h" |
34 | #include "llvm/IR/BasicBlock.h" |
35 | #include "llvm/IR/Constant.h" |
36 | #include "llvm/IR/Constants.h" |
37 | #include "llvm/IR/Dominators.h" |
38 | #include "llvm/IR/Function.h" |
39 | #include "llvm/IR/IRBuilder.h" |
40 | #include "llvm/IR/InstrTypes.h" |
41 | #include "llvm/IR/Instruction.h" |
42 | #include "llvm/IR/Instructions.h" |
43 | #include "llvm/IR/IntrinsicInst.h" |
44 | #include "llvm/IR/Module.h" |
45 | #include "llvm/IR/PatternMatch.h" |
46 | #include "llvm/IR/ProfDataUtils.h" |
47 | #include "llvm/IR/Use.h" |
48 | #include "llvm/IR/Value.h" |
49 | #include "llvm/Support/Casting.h" |
50 | #include "llvm/Support/CommandLine.h" |
51 | #include "llvm/Support/Debug.h" |
52 | #include "llvm/Support/ErrorHandling.h" |
53 | #include "llvm/Support/GenericDomTree.h" |
54 | #include "llvm/Support/InstructionCost.h" |
55 | #include "llvm/Support/raw_ostream.h" |
56 | #include "llvm/Transforms/Scalar/LoopPassManager.h" |
57 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
58 | #include "llvm/Transforms/Utils/Cloning.h" |
59 | #include "llvm/Transforms/Utils/Local.h" |
60 | #include "llvm/Transforms/Utils/LoopUtils.h" |
61 | #include "llvm/Transforms/Utils/ValueMapper.h" |
62 | #include <algorithm> |
63 | #include <cassert> |
64 | #include <iterator> |
65 | #include <numeric> |
66 | #include <optional> |
67 | #include <utility> |
68 | |
69 | #define DEBUG_TYPE "simple-loop-unswitch" |
70 | |
71 | using namespace llvm; |
72 | using namespace llvm::PatternMatch; |
73 | |
74 | STATISTIC(NumBranches, "Number of branches unswitched" ); |
75 | STATISTIC(NumSwitches, "Number of switches unswitched" ); |
76 | STATISTIC(NumSelects, "Number of selects turned into branches for unswitching" ); |
77 | STATISTIC(NumGuards, "Number of guards turned into branches for unswitching" ); |
78 | STATISTIC(NumTrivial, "Number of unswitches that are trivial" ); |
79 | STATISTIC( |
80 | NumCostMultiplierSkipped, |
81 | "Number of unswitch candidates that had their cost multiplier skipped" ); |
82 | STATISTIC(NumInvariantConditionsInjected, |
83 | "Number of invariant conditions injected and unswitched" ); |
84 | |
85 | static cl::opt<bool> EnableNonTrivialUnswitch( |
86 | "enable-nontrivial-unswitch" , cl::init(Val: false), cl::Hidden, |
87 | cl::desc("Forcibly enables non-trivial loop unswitching rather than " |
88 | "following the configuration passed into the pass." )); |
89 | |
90 | static cl::opt<int> |
91 | UnswitchThreshold("unswitch-threshold" , cl::init(Val: 50), cl::Hidden, |
92 | cl::desc("The cost threshold for unswitching a loop." )); |
93 | |
94 | static cl::opt<bool> EnableUnswitchCostMultiplier( |
95 | "enable-unswitch-cost-multiplier" , cl::init(Val: true), cl::Hidden, |
96 | cl::desc("Enable unswitch cost multiplier that prohibits exponential " |
97 | "explosion in nontrivial unswitch." )); |
98 | static cl::opt<int> UnswitchSiblingsToplevelDiv( |
99 | "unswitch-siblings-toplevel-div" , cl::init(Val: 2), cl::Hidden, |
100 | cl::desc("Toplevel siblings divisor for cost multiplier." )); |
101 | static cl::opt<int> UnswitchNumInitialUnscaledCandidates( |
102 | "unswitch-num-initial-unscaled-candidates" , cl::init(Val: 8), cl::Hidden, |
103 | cl::desc("Number of unswitch candidates that are ignored when calculating " |
104 | "cost multiplier." )); |
105 | static cl::opt<bool> UnswitchGuards( |
106 | "simple-loop-unswitch-guards" , cl::init(Val: true), cl::Hidden, |
107 | cl::desc("If enabled, simple loop unswitching will also consider " |
108 | "llvm.experimental.guard intrinsics as unswitch candidates." )); |
109 | static cl::opt<bool> DropNonTrivialImplicitNullChecks( |
110 | "simple-loop-unswitch-drop-non-trivial-implicit-null-checks" , |
111 | cl::init(Val: false), cl::Hidden, |
112 | cl::desc("If enabled, drop make.implicit metadata in unswitched implicit " |
113 | "null checks to save time analyzing if we can keep it." )); |
114 | static cl::opt<unsigned> |
115 | MSSAThreshold("simple-loop-unswitch-memoryssa-threshold" , |
116 | cl::desc("Max number of memory uses to explore during " |
117 | "partial unswitching analysis" ), |
118 | cl::init(Val: 100), cl::Hidden); |
119 | static cl::opt<bool> FreezeLoopUnswitchCond( |
120 | "freeze-loop-unswitch-cond" , cl::init(Val: true), cl::Hidden, |
121 | cl::desc("If enabled, the freeze instruction will be added to condition " |
122 | "of loop unswitch to prevent miscompilation." )); |
123 | |
124 | static cl::opt<bool> InjectInvariantConditions( |
125 | "simple-loop-unswitch-inject-invariant-conditions" , cl::Hidden, |
126 | cl::desc("Whether we should inject new invariants and unswitch them to " |
127 | "eliminate some existing (non-invariant) conditions." ), |
128 | cl::init(Val: true)); |
129 | |
130 | static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold( |
131 | "simple-loop-unswitch-inject-invariant-condition-hotness-threshold" , |
132 | cl::Hidden, cl::desc("Only try to inject loop invariant conditions and " |
133 | "unswitch on them to eliminate branches that are " |
134 | "not-taken 1/<this option> times or less." ), |
135 | cl::init(Val: 16)); |
136 | |
137 | AnalysisKey ShouldRunExtraSimpleLoopUnswitch::; |
138 | namespace { |
139 | struct CompareDesc { |
140 | BranchInst *Term; |
141 | Value *Invariant; |
142 | BasicBlock *InLoopSucc; |
143 | |
144 | CompareDesc(BranchInst *Term, Value *Invariant, BasicBlock *InLoopSucc) |
145 | : Term(Term), Invariant(Invariant), InLoopSucc(InLoopSucc) {} |
146 | }; |
147 | |
148 | struct InjectedInvariant { |
149 | ICmpInst::Predicate Pred; |
150 | Value *LHS; |
151 | Value *RHS; |
152 | BasicBlock *InLoopSucc; |
153 | |
154 | InjectedInvariant(ICmpInst::Predicate Pred, Value *LHS, Value *RHS, |
155 | BasicBlock *InLoopSucc) |
156 | : Pred(Pred), LHS(LHS), RHS(RHS), InLoopSucc(InLoopSucc) {} |
157 | }; |
158 | |
159 | struct NonTrivialUnswitchCandidate { |
160 | Instruction *TI = nullptr; |
161 | TinyPtrVector<Value *> Invariants; |
162 | std::optional<InstructionCost> Cost; |
163 | std::optional<InjectedInvariant> PendingInjection; |
164 | NonTrivialUnswitchCandidate( |
165 | Instruction *TI, ArrayRef<Value *> Invariants, |
166 | std::optional<InstructionCost> Cost = std::nullopt, |
167 | std::optional<InjectedInvariant> PendingInjection = std::nullopt) |
168 | : TI(TI), Invariants(Invariants), Cost(Cost), |
169 | PendingInjection(PendingInjection) {}; |
170 | |
171 | bool hasPendingInjection() const { return PendingInjection.has_value(); } |
172 | }; |
173 | } // end anonymous namespace. |
174 | |
175 | // Helper to skip (select x, true, false), which matches both a logical AND and |
176 | // OR and can confuse code that tries to determine if \p Cond is either a |
177 | // logical AND or OR but not both. |
178 | static Value *skipTrivialSelect(Value *Cond) { |
179 | Value *CondNext; |
180 | while (match(V: Cond, P: m_Select(C: m_Value(V&: CondNext), L: m_One(), R: m_Zero()))) |
181 | Cond = CondNext; |
182 | return Cond; |
183 | } |
184 | |
185 | /// Collect all of the loop invariant input values transitively used by the |
186 | /// homogeneous instruction graph from a given root. |
187 | /// |
188 | /// This essentially walks from a root recursively through loop variant operands |
189 | /// which have perform the same logical operation (AND or OR) and finds all |
190 | /// inputs which are loop invariant. For some operations these can be |
191 | /// re-associated and unswitched out of the loop entirely. |
192 | static TinyPtrVector<Value *> |
193 | collectHomogenousInstGraphLoopInvariants(const Loop &L, Instruction &Root, |
194 | const LoopInfo &LI) { |
195 | assert(!L.isLoopInvariant(&Root) && |
196 | "Only need to walk the graph if root itself is not invariant." ); |
197 | TinyPtrVector<Value *> Invariants; |
198 | |
199 | bool IsRootAnd = match(V: &Root, P: m_LogicalAnd()); |
200 | bool IsRootOr = match(V: &Root, P: m_LogicalOr()); |
201 | |
202 | // Build a worklist and recurse through operators collecting invariants. |
203 | SmallVector<Instruction *, 4> Worklist; |
204 | SmallPtrSet<Instruction *, 8> Visited; |
205 | Worklist.push_back(Elt: &Root); |
206 | Visited.insert(Ptr: &Root); |
207 | do { |
208 | Instruction &I = *Worklist.pop_back_val(); |
209 | for (Value *OpV : I.operand_values()) { |
210 | // Skip constants as unswitching isn't interesting for them. |
211 | if (isa<Constant>(Val: OpV)) |
212 | continue; |
213 | |
214 | // Add it to our result if loop invariant. |
215 | if (L.isLoopInvariant(V: OpV)) { |
216 | Invariants.push_back(NewVal: OpV); |
217 | continue; |
218 | } |
219 | |
220 | // If not an instruction with the same opcode, nothing we can do. |
221 | Instruction *OpI = dyn_cast<Instruction>(Val: skipTrivialSelect(Cond: OpV)); |
222 | |
223 | if (OpI && ((IsRootAnd && match(V: OpI, P: m_LogicalAnd())) || |
224 | (IsRootOr && match(V: OpI, P: m_LogicalOr())))) { |
225 | // Visit this operand. |
226 | if (Visited.insert(Ptr: OpI).second) |
227 | Worklist.push_back(Elt: OpI); |
228 | } |
229 | } |
230 | } while (!Worklist.empty()); |
231 | |
232 | return Invariants; |
233 | } |
234 | |
235 | static void replaceLoopInvariantUses(const Loop &L, Value *Invariant, |
236 | Constant &Replacement) { |
237 | assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?" ); |
238 | |
239 | // Replace uses of LIC in the loop with the given constant. |
240 | // We use make_early_inc_range as set invalidates the iterator. |
241 | for (Use &U : llvm::make_early_inc_range(Range: Invariant->uses())) { |
242 | Instruction *UserI = dyn_cast<Instruction>(Val: U.getUser()); |
243 | |
244 | // Replace this use within the loop body. |
245 | if (UserI && L.contains(Inst: UserI)) |
246 | U.set(&Replacement); |
247 | } |
248 | } |
249 | |
250 | /// Check that all the LCSSA PHI nodes in the loop exit block have trivial |
251 | /// incoming values along this edge. |
252 | static bool areLoopExitPHIsLoopInvariant(const Loop &L, |
253 | const BasicBlock &ExitingBB, |
254 | const BasicBlock &ExitBB) { |
255 | for (const Instruction &I : ExitBB) { |
256 | auto *PN = dyn_cast<PHINode>(Val: &I); |
257 | if (!PN) |
258 | // No more PHIs to check. |
259 | return true; |
260 | |
261 | // If the incoming value for this edge isn't loop invariant the unswitch |
262 | // won't be trivial. |
263 | if (!L.isLoopInvariant(V: PN->getIncomingValueForBlock(BB: &ExitingBB))) |
264 | return false; |
265 | } |
266 | llvm_unreachable("Basic blocks should never be empty!" ); |
267 | } |
268 | |
269 | /// Copy a set of loop invariant values \p ToDuplicate and insert them at the |
270 | /// end of \p BB and conditionally branch on the copied condition. We only |
271 | /// branch on a single value. |
272 | static void buildPartialUnswitchConditionalBranch( |
273 | BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction, |
274 | BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, |
275 | const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) { |
276 | IRBuilder<> IRB(&BB); |
277 | |
278 | SmallVector<Value *> FrozenInvariants; |
279 | for (Value *Inv : Invariants) { |
280 | if (InsertFreeze && !isGuaranteedNotToBeUndefOrPoison(V: Inv, AC, CtxI: I, DT: &DT)) |
281 | Inv = IRB.CreateFreeze(V: Inv, Name: Inv->getName() + ".fr" ); |
282 | FrozenInvariants.push_back(Elt: Inv); |
283 | } |
284 | |
285 | Value *Cond = Direction ? IRB.CreateOr(Ops: FrozenInvariants) |
286 | : IRB.CreateAnd(Ops: FrozenInvariants); |
287 | IRB.CreateCondBr(Cond, True: Direction ? &UnswitchedSucc : &NormalSucc, |
288 | False: Direction ? &NormalSucc : &UnswitchedSucc); |
289 | } |
290 | |
291 | /// Copy a set of loop invariant values, and conditionally branch on them. |
292 | static void buildPartialInvariantUnswitchConditionalBranch( |
293 | BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction, |
294 | BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L, |
295 | MemorySSAUpdater *MSSAU) { |
296 | ValueToValueMapTy VMap; |
297 | for (auto *Val : reverse(C&: ToDuplicate)) { |
298 | Instruction *Inst = cast<Instruction>(Val); |
299 | Instruction *NewInst = Inst->clone(); |
300 | NewInst->insertInto(ParentBB: &BB, It: BB.end()); |
301 | RemapInstruction(I: NewInst, VM&: VMap, |
302 | Flags: RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); |
303 | VMap[Val] = NewInst; |
304 | |
305 | if (!MSSAU) |
306 | continue; |
307 | |
308 | MemorySSA *MSSA = MSSAU->getMemorySSA(); |
309 | if (auto *MemUse = |
310 | dyn_cast_or_null<MemoryUse>(Val: MSSA->getMemoryAccess(I: Inst))) { |
311 | auto *DefiningAccess = MemUse->getDefiningAccess(); |
312 | // Get the first defining access before the loop. |
313 | while (L.contains(BB: DefiningAccess->getBlock())) { |
314 | // If the defining access is a MemoryPhi, get the incoming |
315 | // value for the pre-header as defining access. |
316 | if (auto *MemPhi = dyn_cast<MemoryPhi>(Val: DefiningAccess)) |
317 | DefiningAccess = |
318 | MemPhi->getIncomingValueForBlock(BB: L.getLoopPreheader()); |
319 | else |
320 | DefiningAccess = cast<MemoryDef>(Val: DefiningAccess)->getDefiningAccess(); |
321 | } |
322 | MSSAU->createMemoryAccessInBB(I: NewInst, Definition: DefiningAccess, |
323 | BB: NewInst->getParent(), |
324 | Point: MemorySSA::BeforeTerminator); |
325 | } |
326 | } |
327 | |
328 | IRBuilder<> IRB(&BB); |
329 | Value *Cond = VMap[ToDuplicate[0]]; |
330 | IRB.CreateCondBr(Cond, True: Direction ? &UnswitchedSucc : &NormalSucc, |
331 | False: Direction ? &NormalSucc : &UnswitchedSucc); |
332 | } |
333 | |
334 | /// Rewrite the PHI nodes in an unswitched loop exit basic block. |
335 | /// |
336 | /// Requires that the loop exit and unswitched basic block are the same, and |
337 | /// that the exiting block was a unique predecessor of that block. Rewrites the |
338 | /// PHI nodes in that block such that what were LCSSA PHI nodes become trivial |
339 | /// PHI nodes from the old preheader that now contains the unswitched |
340 | /// terminator. |
341 | static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB, |
342 | BasicBlock &OldExitingBB, |
343 | BasicBlock &OldPH) { |
344 | for (PHINode &PN : UnswitchedBB.phis()) { |
345 | // When the loop exit is directly unswitched we just need to update the |
346 | // incoming basic block. We loop to handle weird cases with repeated |
347 | // incoming blocks, but expect to typically only have one operand here. |
348 | for (auto i : seq<int>(Begin: 0, End: PN.getNumOperands())) { |
349 | assert(PN.getIncomingBlock(i) == &OldExitingBB && |
350 | "Found incoming block different from unique predecessor!" ); |
351 | PN.setIncomingBlock(i, BB: &OldPH); |
352 | } |
353 | } |
354 | } |
355 | |
356 | /// Rewrite the PHI nodes in the loop exit basic block and the split off |
357 | /// unswitched block. |
358 | /// |
359 | /// Because the exit block remains an exit from the loop, this rewrites the |
360 | /// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI |
361 | /// nodes into the unswitched basic block to select between the value in the |
362 | /// old preheader and the loop exit. |
363 | static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, |
364 | BasicBlock &UnswitchedBB, |
365 | BasicBlock &OldExitingBB, |
366 | BasicBlock &OldPH, |
367 | bool FullUnswitch) { |
368 | assert(&ExitBB != &UnswitchedBB && |
369 | "Must have different loop exit and unswitched blocks!" ); |
370 | BasicBlock::iterator InsertPt = UnswitchedBB.begin(); |
371 | for (PHINode &PN : ExitBB.phis()) { |
372 | auto *NewPN = PHINode::Create(Ty: PN.getType(), /*NumReservedValues*/ 2, |
373 | NameStr: PN.getName() + ".split" ); |
374 | NewPN->insertBefore(InsertPos: InsertPt); |
375 | |
376 | // Walk backwards over the old PHI node's inputs to minimize the cost of |
377 | // removing each one. We have to do this weird loop manually so that we |
378 | // create the same number of new incoming edges in the new PHI as we expect |
379 | // each case-based edge to be included in the unswitched switch in some |
380 | // cases. |
381 | // FIXME: This is really, really gross. It would be much cleaner if LLVM |
382 | // allowed us to create a single entry for a predecessor block without |
383 | // having separate entries for each "edge" even though these edges are |
384 | // required to produce identical results. |
385 | for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) { |
386 | if (PN.getIncomingBlock(i) != &OldExitingBB) |
387 | continue; |
388 | |
389 | Value *Incoming = PN.getIncomingValue(i); |
390 | if (FullUnswitch) |
391 | // No more edge from the old exiting block to the exit block. |
392 | PN.removeIncomingValue(Idx: i); |
393 | |
394 | NewPN->addIncoming(V: Incoming, BB: &OldPH); |
395 | } |
396 | |
397 | // Now replace the old PHI with the new one and wire the old one in as an |
398 | // input to the new one. |
399 | PN.replaceAllUsesWith(V: NewPN); |
400 | NewPN->addIncoming(V: &PN, BB: &ExitBB); |
401 | } |
402 | } |
403 | |
404 | /// Hoist the current loop up to the innermost loop containing a remaining exit. |
405 | /// |
406 | /// Because we've removed an exit from the loop, we may have changed the set of |
407 | /// loops reachable and need to move the current loop up the loop nest or even |
408 | /// to an entirely separate nest. |
409 | static void hoistLoopToNewParent(Loop &L, BasicBlock &, |
410 | DominatorTree &DT, LoopInfo &LI, |
411 | MemorySSAUpdater *MSSAU, ScalarEvolution *SE) { |
412 | // If the loop is already at the top level, we can't hoist it anywhere. |
413 | Loop *OldParentL = L.getParentLoop(); |
414 | if (!OldParentL) |
415 | return; |
416 | |
417 | SmallVector<BasicBlock *, 4> Exits; |
418 | L.getExitBlocks(ExitBlocks&: Exits); |
419 | Loop *NewParentL = nullptr; |
420 | for (auto *ExitBB : Exits) |
421 | if (Loop *ExitL = LI.getLoopFor(BB: ExitBB)) |
422 | if (!NewParentL || NewParentL->contains(L: ExitL)) |
423 | NewParentL = ExitL; |
424 | |
425 | if (NewParentL == OldParentL) |
426 | return; |
427 | |
428 | // The new parent loop (if different) should always contain the old one. |
429 | if (NewParentL) |
430 | assert(NewParentL->contains(OldParentL) && |
431 | "Can only hoist this loop up the nest!" ); |
432 | |
433 | // The preheader will need to move with the body of this loop. However, |
434 | // because it isn't in this loop we also need to update the primary loop map. |
435 | assert(OldParentL == LI.getLoopFor(&Preheader) && |
436 | "Parent loop of this loop should contain this loop's preheader!" ); |
437 | LI.changeLoopFor(BB: &Preheader, L: NewParentL); |
438 | |
439 | // Remove this loop from its old parent. |
440 | OldParentL->removeChildLoop(Child: &L); |
441 | |
442 | // Add the loop either to the new parent or as a top-level loop. |
443 | if (NewParentL) |
444 | NewParentL->addChildLoop(NewChild: &L); |
445 | else |
446 | LI.addTopLevelLoop(New: &L); |
447 | |
448 | // Remove this loops blocks from the old parent and every other loop up the |
449 | // nest until reaching the new parent. Also update all of these |
450 | // no-longer-containing loops to reflect the nesting change. |
451 | for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL; |
452 | OldContainingL = OldContainingL->getParentLoop()) { |
453 | llvm::erase_if(C&: OldContainingL->getBlocksVector(), |
454 | P: [&](const BasicBlock *BB) { |
455 | return BB == &Preheader || L.contains(BB); |
456 | }); |
457 | |
458 | OldContainingL->getBlocksSet().erase(Ptr: &Preheader); |
459 | for (BasicBlock *BB : L.blocks()) |
460 | OldContainingL->getBlocksSet().erase(Ptr: BB); |
461 | |
462 | // Because we just hoisted a loop out of this one, we have essentially |
463 | // created new exit paths from it. That means we need to form LCSSA PHI |
464 | // nodes for values used in the no-longer-nested loop. |
465 | formLCSSA(L&: *OldContainingL, DT, LI: &LI, SE); |
466 | |
467 | // We shouldn't need to form dedicated exits because the exit introduced |
468 | // here is the (just split by unswitching) preheader. However, after trivial |
469 | // unswitching it is possible to get new non-dedicated exits out of parent |
470 | // loop so let's conservatively form dedicated exit blocks and figure out |
471 | // if we can optimize later. |
472 | formDedicatedExitBlocks(L: OldContainingL, DT: &DT, LI: &LI, MSSAU, |
473 | /*PreserveLCSSA*/ true); |
474 | } |
475 | } |
476 | |
477 | // Return the top-most loop containing ExitBB and having ExitBB as exiting block |
478 | // or the loop containing ExitBB, if there is no parent loop containing ExitBB |
479 | // as exiting block. |
480 | static Loop *getTopMostExitingLoop(const BasicBlock *ExitBB, |
481 | const LoopInfo &LI) { |
482 | Loop *TopMost = LI.getLoopFor(BB: ExitBB); |
483 | Loop *Current = TopMost; |
484 | while (Current) { |
485 | if (Current->isLoopExiting(BB: ExitBB)) |
486 | TopMost = Current; |
487 | Current = Current->getParentLoop(); |
488 | } |
489 | return TopMost; |
490 | } |
491 | |
492 | /// Unswitch a trivial branch if the condition is loop invariant. |
493 | /// |
494 | /// This routine should only be called when loop code leading to the branch has |
495 | /// been validated as trivial (no side effects). This routine checks if the |
496 | /// condition is invariant and one of the successors is a loop exit. This |
497 | /// allows us to unswitch without duplicating the loop, making it trivial. |
498 | /// |
499 | /// If this routine fails to unswitch the branch it returns false. |
500 | /// |
501 | /// If the branch can be unswitched, this routine splits the preheader and |
502 | /// hoists the branch above that split. Preserves loop simplified form |
503 | /// (splitting the exit block as necessary). It simplifies the branch within |
504 | /// the loop to an unconditional branch but doesn't remove it entirely. Further |
505 | /// cleanup can be done with some simplifycfg like pass. |
506 | /// |
507 | /// If `SE` is not null, it will be updated based on the potential loop SCEVs |
508 | /// invalidated by this. |
509 | static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, |
510 | LoopInfo &LI, ScalarEvolution *SE, |
511 | MemorySSAUpdater *MSSAU) { |
512 | assert(BI.isConditional() && "Can only unswitch a conditional branch!" ); |
513 | LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n" ); |
514 | |
515 | // The loop invariant values that we want to unswitch. |
516 | TinyPtrVector<Value *> Invariants; |
517 | |
518 | // When true, we're fully unswitching the branch rather than just unswitching |
519 | // some input conditions to the branch. |
520 | bool FullUnswitch = false; |
521 | |
522 | Value *Cond = skipTrivialSelect(Cond: BI.getCondition()); |
523 | if (L.isLoopInvariant(V: Cond)) { |
524 | Invariants.push_back(NewVal: Cond); |
525 | FullUnswitch = true; |
526 | } else { |
527 | if (auto *CondInst = dyn_cast<Instruction>(Val: Cond)) |
528 | Invariants = collectHomogenousInstGraphLoopInvariants(L, Root&: *CondInst, LI); |
529 | if (Invariants.empty()) { |
530 | LLVM_DEBUG(dbgs() << " Couldn't find invariant inputs!\n" ); |
531 | return false; |
532 | } |
533 | } |
534 | |
535 | // Check that one of the branch's successors exits, and which one. |
536 | bool ExitDirection = true; |
537 | int LoopExitSuccIdx = 0; |
538 | auto *LoopExitBB = BI.getSuccessor(i: 0); |
539 | if (L.contains(BB: LoopExitBB)) { |
540 | ExitDirection = false; |
541 | LoopExitSuccIdx = 1; |
542 | LoopExitBB = BI.getSuccessor(i: 1); |
543 | if (L.contains(BB: LoopExitBB)) { |
544 | LLVM_DEBUG(dbgs() << " Branch doesn't exit the loop!\n" ); |
545 | return false; |
546 | } |
547 | } |
548 | auto *ContinueBB = BI.getSuccessor(i: 1 - LoopExitSuccIdx); |
549 | auto *ParentBB = BI.getParent(); |
550 | if (!areLoopExitPHIsLoopInvariant(L, ExitingBB: *ParentBB, ExitBB: *LoopExitBB)) { |
551 | LLVM_DEBUG(dbgs() << " Loop exit PHI's aren't loop-invariant!\n" ); |
552 | return false; |
553 | } |
554 | |
555 | // When unswitching only part of the branch's condition, we need the exit |
556 | // block to be reached directly from the partially unswitched input. This can |
557 | // be done when the exit block is along the true edge and the branch condition |
558 | // is a graph of `or` operations, or the exit block is along the false edge |
559 | // and the condition is a graph of `and` operations. |
560 | if (!FullUnswitch) { |
561 | if (ExitDirection ? !match(V: Cond, P: m_LogicalOr()) |
562 | : !match(V: Cond, P: m_LogicalAnd())) { |
563 | LLVM_DEBUG(dbgs() << " Branch condition is in improper form for " |
564 | "non-full unswitch!\n" ); |
565 | return false; |
566 | } |
567 | } |
568 | |
569 | LLVM_DEBUG({ |
570 | dbgs() << " unswitching trivial invariant conditions for: " << BI |
571 | << "\n" ; |
572 | for (Value *Invariant : Invariants) { |
573 | dbgs() << " " << *Invariant << " == true" ; |
574 | if (Invariant != Invariants.back()) |
575 | dbgs() << " ||" ; |
576 | dbgs() << "\n" ; |
577 | } |
578 | }); |
579 | |
580 | // If we have scalar evolutions, we need to invalidate them including this |
581 | // loop, the loop containing the exit block and the topmost parent loop |
582 | // exiting via LoopExitBB. |
583 | if (SE) { |
584 | if (const Loop *ExitL = getTopMostExitingLoop(ExitBB: LoopExitBB, LI)) |
585 | SE->forgetLoop(L: ExitL); |
586 | else |
587 | // Forget the entire nest as this exits the entire nest. |
588 | SE->forgetTopmostLoop(L: &L); |
589 | SE->forgetBlockAndLoopDispositions(); |
590 | } |
591 | |
592 | if (MSSAU && VerifyMemorySSA) |
593 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
594 | |
595 | // Split the preheader, so that we know that there is a safe place to insert |
596 | // the conditional branch. We will change the preheader to have a conditional |
597 | // branch on LoopCond. |
598 | BasicBlock *OldPH = L.getLoopPreheader(); |
599 | BasicBlock *NewPH = SplitEdge(From: OldPH, To: L.getHeader(), DT: &DT, LI: &LI, MSSAU); |
600 | |
601 | // Now that we have a place to insert the conditional branch, create a place |
602 | // to branch to: this is the exit block out of the loop that we are |
603 | // unswitching. We need to split this if there are other loop predecessors. |
604 | // Because the loop is in simplified form, *any* other predecessor is enough. |
605 | BasicBlock *UnswitchedBB; |
606 | if (FullUnswitch && LoopExitBB->getUniquePredecessor()) { |
607 | assert(LoopExitBB->getUniquePredecessor() == BI.getParent() && |
608 | "A branch's parent isn't a predecessor!" ); |
609 | UnswitchedBB = LoopExitBB; |
610 | } else { |
611 | UnswitchedBB = |
612 | SplitBlock(Old: LoopExitBB, SplitPt: LoopExitBB->begin(), DT: &DT, LI: &LI, MSSAU, BBName: "" , Before: false); |
613 | } |
614 | |
615 | if (MSSAU && VerifyMemorySSA) |
616 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
617 | |
618 | // Actually move the invariant uses into the unswitched position. If possible, |
619 | // we do this by moving the instructions, but when doing partial unswitching |
620 | // we do it by building a new merge of the values in the unswitched position. |
621 | OldPH->getTerminator()->eraseFromParent(); |
622 | if (FullUnswitch) { |
623 | // If fully unswitching, we can use the existing branch instruction. |
624 | // Splice it into the old PH to gate reaching the new preheader and re-point |
625 | // its successors. |
626 | BI.moveBefore(BB&: *OldPH, I: OldPH->end()); |
627 | BI.setCondition(Cond); |
628 | if (MSSAU) { |
629 | // Temporarily clone the terminator, to make MSSA update cheaper by |
630 | // separating "insert edge" updates from "remove edge" ones. |
631 | BI.clone()->insertInto(ParentBB, It: ParentBB->end()); |
632 | } else { |
633 | // Create a new unconditional branch that will continue the loop as a new |
634 | // terminator. |
635 | Instruction *NewBI = BranchInst::Create(IfTrue: ContinueBB, InsertBefore: ParentBB); |
636 | NewBI->setDebugLoc(BI.getDebugLoc()); |
637 | } |
638 | BI.setSuccessor(idx: LoopExitSuccIdx, NewSucc: UnswitchedBB); |
639 | BI.setSuccessor(idx: 1 - LoopExitSuccIdx, NewSucc: NewPH); |
640 | } else { |
641 | // Only unswitching a subset of inputs to the condition, so we will need to |
642 | // build a new branch that merges the invariant inputs. |
643 | if (ExitDirection) |
644 | assert(match(skipTrivialSelect(BI.getCondition()), m_LogicalOr()) && |
645 | "Must have an `or` of `i1`s or `select i1 X, true, Y`s for the " |
646 | "condition!" ); |
647 | else |
648 | assert(match(skipTrivialSelect(BI.getCondition()), m_LogicalAnd()) && |
649 | "Must have an `and` of `i1`s or `select i1 X, Y, false`s for the" |
650 | " condition!" ); |
651 | buildPartialUnswitchConditionalBranch( |
652 | BB&: *OldPH, Invariants, Direction: ExitDirection, UnswitchedSucc&: *UnswitchedBB, NormalSucc&: *NewPH, |
653 | InsertFreeze: FreezeLoopUnswitchCond, I: OldPH->getTerminator(), AC: nullptr, DT); |
654 | } |
655 | |
656 | // Update the dominator tree with the added edge. |
657 | DT.insertEdge(From: OldPH, To: UnswitchedBB); |
658 | |
659 | // After the dominator tree was updated with the added edge, update MemorySSA |
660 | // if available. |
661 | if (MSSAU) { |
662 | SmallVector<CFGUpdate, 1> Updates; |
663 | Updates.push_back(Elt: {cfg::UpdateKind::Insert, OldPH, UnswitchedBB}); |
664 | MSSAU->applyInsertUpdates(Updates, DT); |
665 | } |
666 | |
667 | // Finish updating dominator tree and memory ssa for full unswitch. |
668 | if (FullUnswitch) { |
669 | if (MSSAU) { |
670 | Instruction *Term = ParentBB->getTerminator(); |
671 | // Remove the cloned branch instruction and create unconditional branch |
672 | // now. |
673 | Instruction *NewBI = BranchInst::Create(IfTrue: ContinueBB, InsertBefore: ParentBB); |
674 | NewBI->setDebugLoc(Term->getDebugLoc()); |
675 | Term->eraseFromParent(); |
676 | MSSAU->removeEdge(From: ParentBB, To: LoopExitBB); |
677 | } |
678 | DT.deleteEdge(From: ParentBB, To: LoopExitBB); |
679 | } |
680 | |
681 | if (MSSAU && VerifyMemorySSA) |
682 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
683 | |
684 | // Rewrite the relevant PHI nodes. |
685 | if (UnswitchedBB == LoopExitBB) |
686 | rewritePHINodesForUnswitchedExitBlock(UnswitchedBB&: *UnswitchedBB, OldExitingBB&: *ParentBB, OldPH&: *OldPH); |
687 | else |
688 | rewritePHINodesForExitAndUnswitchedBlocks(ExitBB&: *LoopExitBB, UnswitchedBB&: *UnswitchedBB, |
689 | OldExitingBB&: *ParentBB, OldPH&: *OldPH, FullUnswitch); |
690 | |
691 | // The constant we can replace all of our invariants with inside the loop |
692 | // body. If any of the invariants have a value other than this the loop won't |
693 | // be entered. |
694 | ConstantInt *Replacement = ExitDirection |
695 | ? ConstantInt::getFalse(Context&: BI.getContext()) |
696 | : ConstantInt::getTrue(Context&: BI.getContext()); |
697 | |
698 | // Since this is an i1 condition we can also trivially replace uses of it |
699 | // within the loop with a constant. |
700 | for (Value *Invariant : Invariants) |
701 | replaceLoopInvariantUses(L, Invariant, Replacement&: *Replacement); |
702 | |
703 | // If this was full unswitching, we may have changed the nesting relationship |
704 | // for this loop so hoist it to its correct parent if needed. |
705 | if (FullUnswitch) |
706 | hoistLoopToNewParent(L, Preheader&: *NewPH, DT, LI, MSSAU, SE); |
707 | |
708 | if (MSSAU && VerifyMemorySSA) |
709 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
710 | |
711 | LLVM_DEBUG(dbgs() << " done: unswitching trivial branch...\n" ); |
712 | ++NumTrivial; |
713 | ++NumBranches; |
714 | return true; |
715 | } |
716 | |
717 | /// Unswitch a trivial switch if the condition is loop invariant. |
718 | /// |
719 | /// This routine should only be called when loop code leading to the switch has |
720 | /// been validated as trivial (no side effects). This routine checks if the |
721 | /// condition is invariant and that at least one of the successors is a loop |
722 | /// exit. This allows us to unswitch without duplicating the loop, making it |
723 | /// trivial. |
724 | /// |
725 | /// If this routine fails to unswitch the switch it returns false. |
726 | /// |
727 | /// If the switch can be unswitched, this routine splits the preheader and |
728 | /// copies the switch above that split. If the default case is one of the |
729 | /// exiting cases, it copies the non-exiting cases and points them at the new |
730 | /// preheader. If the default case is not exiting, it copies the exiting cases |
731 | /// and points the default at the preheader. It preserves loop simplified form |
732 | /// (splitting the exit blocks as necessary). It simplifies the switch within |
733 | /// the loop by removing now-dead cases. If the default case is one of those |
734 | /// unswitched, it replaces its destination with a new basic block containing |
735 | /// only unreachable. Such basic blocks, while technically loop exits, are not |
736 | /// considered for unswitching so this is a stable transform and the same |
737 | /// switch will not be revisited. If after unswitching there is only a single |
738 | /// in-loop successor, the switch is further simplified to an unconditional |
739 | /// branch. Still more cleanup can be done with some simplifycfg like pass. |
740 | /// |
741 | /// If `SE` is not null, it will be updated based on the potential loop SCEVs |
742 | /// invalidated by this. |
743 | static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, |
744 | LoopInfo &LI, ScalarEvolution *SE, |
745 | MemorySSAUpdater *MSSAU) { |
746 | LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n" ); |
747 | Value *LoopCond = SI.getCondition(); |
748 | |
749 | // If this isn't switching on an invariant condition, we can't unswitch it. |
750 | if (!L.isLoopInvariant(V: LoopCond)) |
751 | return false; |
752 | |
753 | auto *ParentBB = SI.getParent(); |
754 | |
755 | // The same check must be used both for the default and the exit cases. We |
756 | // should never leave edges from the switch instruction to a basic block that |
757 | // we are unswitching, hence the condition used to determine the default case |
758 | // needs to also be used to populate ExitCaseIndices, which is then used to |
759 | // remove cases from the switch. |
760 | auto IsTriviallyUnswitchableExitBlock = [&](BasicBlock &BBToCheck) { |
761 | // BBToCheck is not an exit block if it is inside loop L. |
762 | if (L.contains(BB: &BBToCheck)) |
763 | return false; |
764 | // BBToCheck is not trivial to unswitch if its phis aren't loop invariant. |
765 | if (!areLoopExitPHIsLoopInvariant(L, ExitingBB: *ParentBB, ExitBB: BBToCheck)) |
766 | return false; |
767 | // We do not unswitch a block that only has an unreachable statement, as |
768 | // it's possible this is a previously unswitched block. Only unswitch if |
769 | // either the terminator is not unreachable, or, if it is, it's not the only |
770 | // instruction in the block. |
771 | auto *TI = BBToCheck.getTerminator(); |
772 | bool isUnreachable = isa<UnreachableInst>(Val: TI); |
773 | return !isUnreachable || |
774 | (isUnreachable && (BBToCheck.getFirstNonPHIOrDbg() != TI)); |
775 | }; |
776 | |
777 | SmallVector<int, 4> ExitCaseIndices; |
778 | for (auto Case : SI.cases()) |
779 | if (IsTriviallyUnswitchableExitBlock(*Case.getCaseSuccessor())) |
780 | ExitCaseIndices.push_back(Elt: Case.getCaseIndex()); |
781 | BasicBlock *DefaultExitBB = nullptr; |
782 | SwitchInstProfUpdateWrapper::CaseWeightOpt DefaultCaseWeight = |
783 | SwitchInstProfUpdateWrapper::getSuccessorWeight(SI, idx: 0); |
784 | if (IsTriviallyUnswitchableExitBlock(*SI.getDefaultDest())) { |
785 | DefaultExitBB = SI.getDefaultDest(); |
786 | } else if (ExitCaseIndices.empty()) |
787 | return false; |
788 | |
789 | LLVM_DEBUG(dbgs() << " unswitching trivial switch...\n" ); |
790 | |
791 | if (MSSAU && VerifyMemorySSA) |
792 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
793 | |
794 | // We may need to invalidate SCEVs for the outermost loop reached by any of |
795 | // the exits. |
796 | Loop *OuterL = &L; |
797 | |
798 | if (DefaultExitBB) { |
799 | // Check the loop containing this exit. |
800 | Loop *ExitL = getTopMostExitingLoop(ExitBB: DefaultExitBB, LI); |
801 | if (!ExitL || ExitL->contains(L: OuterL)) |
802 | OuterL = ExitL; |
803 | } |
804 | for (unsigned Index : ExitCaseIndices) { |
805 | auto CaseI = SI.case_begin() + Index; |
806 | // Compute the outer loop from this exit. |
807 | Loop *ExitL = getTopMostExitingLoop(ExitBB: CaseI->getCaseSuccessor(), LI); |
808 | if (!ExitL || ExitL->contains(L: OuterL)) |
809 | OuterL = ExitL; |
810 | } |
811 | |
812 | if (SE) { |
813 | if (OuterL) |
814 | SE->forgetLoop(L: OuterL); |
815 | else |
816 | SE->forgetTopmostLoop(L: &L); |
817 | } |
818 | |
819 | if (DefaultExitBB) { |
820 | // Clear out the default destination temporarily to allow accurate |
821 | // predecessor lists to be examined below. |
822 | SI.setDefaultDest(nullptr); |
823 | } |
824 | |
825 | // Store the exit cases into a separate data structure and remove them from |
826 | // the switch. |
827 | SmallVector<std::tuple<ConstantInt *, BasicBlock *, |
828 | SwitchInstProfUpdateWrapper::CaseWeightOpt>, |
829 | 4> ExitCases; |
830 | ExitCases.reserve(N: ExitCaseIndices.size()); |
831 | SwitchInstProfUpdateWrapper SIW(SI); |
832 | // We walk the case indices backwards so that we remove the last case first |
833 | // and don't disrupt the earlier indices. |
834 | for (unsigned Index : reverse(C&: ExitCaseIndices)) { |
835 | auto CaseI = SI.case_begin() + Index; |
836 | // Save the value of this case. |
837 | auto W = SIW.getSuccessorWeight(idx: CaseI->getSuccessorIndex()); |
838 | ExitCases.emplace_back(Args: CaseI->getCaseValue(), Args: CaseI->getCaseSuccessor(), Args&: W); |
839 | // Delete the unswitched cases. |
840 | SIW.removeCase(I: CaseI); |
841 | } |
842 | |
843 | // Check if after this all of the remaining cases point at the same |
844 | // successor. |
845 | BasicBlock *CommonSuccBB = nullptr; |
846 | if (SI.getNumCases() > 0 && |
847 | all_of(Range: drop_begin(RangeOrContainer: SI.cases()), P: [&SI](const SwitchInst::CaseHandle &Case) { |
848 | return Case.getCaseSuccessor() == SI.case_begin()->getCaseSuccessor(); |
849 | })) |
850 | CommonSuccBB = SI.case_begin()->getCaseSuccessor(); |
851 | if (!DefaultExitBB) { |
852 | // If we're not unswitching the default, we need it to match any cases to |
853 | // have a common successor or if we have no cases it is the common |
854 | // successor. |
855 | if (SI.getNumCases() == 0) |
856 | CommonSuccBB = SI.getDefaultDest(); |
857 | else if (SI.getDefaultDest() != CommonSuccBB) |
858 | CommonSuccBB = nullptr; |
859 | } |
860 | |
861 | // Split the preheader, so that we know that there is a safe place to insert |
862 | // the switch. |
863 | BasicBlock *OldPH = L.getLoopPreheader(); |
864 | BasicBlock *NewPH = SplitEdge(From: OldPH, To: L.getHeader(), DT: &DT, LI: &LI, MSSAU); |
865 | OldPH->getTerminator()->eraseFromParent(); |
866 | |
867 | // Now add the unswitched switch. This new switch instruction inherits the |
868 | // debug location of the old switch, because it semantically replace the old |
869 | // one. |
870 | auto *NewSI = SwitchInst::Create(Value: LoopCond, Default: NewPH, NumCases: ExitCases.size(), InsertBefore: OldPH); |
871 | NewSI->setDebugLoc(SIW->getDebugLoc()); |
872 | SwitchInstProfUpdateWrapper NewSIW(*NewSI); |
873 | |
874 | // Rewrite the IR for the unswitched basic blocks. This requires two steps. |
875 | // First, we split any exit blocks with remaining in-loop predecessors. Then |
876 | // we update the PHIs in one of two ways depending on if there was a split. |
877 | // We walk in reverse so that we split in the same order as the cases |
878 | // appeared. This is purely for convenience of reading the resulting IR, but |
879 | // it doesn't cost anything really. |
880 | SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs; |
881 | SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap; |
882 | // Handle the default exit if necessary. |
883 | // FIXME: It'd be great if we could merge this with the loop below but LLVM's |
884 | // ranges aren't quite powerful enough yet. |
885 | if (DefaultExitBB) { |
886 | if (pred_empty(BB: DefaultExitBB)) { |
887 | UnswitchedExitBBs.insert(Ptr: DefaultExitBB); |
888 | rewritePHINodesForUnswitchedExitBlock(UnswitchedBB&: *DefaultExitBB, OldExitingBB&: *ParentBB, OldPH&: *OldPH); |
889 | } else { |
890 | auto *SplitBB = |
891 | SplitBlock(Old: DefaultExitBB, SplitPt: DefaultExitBB->begin(), DT: &DT, LI: &LI, MSSAU); |
892 | rewritePHINodesForExitAndUnswitchedBlocks(ExitBB&: *DefaultExitBB, UnswitchedBB&: *SplitBB, |
893 | OldExitingBB&: *ParentBB, OldPH&: *OldPH, |
894 | /*FullUnswitch*/ true); |
895 | DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; |
896 | } |
897 | } |
898 | // Note that we must use a reference in the for loop so that we update the |
899 | // container. |
900 | for (auto &ExitCase : reverse(C&: ExitCases)) { |
901 | // Grab a reference to the exit block in the pair so that we can update it. |
902 | BasicBlock *ExitBB = std::get<1>(t&: ExitCase); |
903 | |
904 | // If this case is the last edge into the exit block, we can simply reuse it |
905 | // as it will no longer be a loop exit. No mapping necessary. |
906 | if (pred_empty(BB: ExitBB)) { |
907 | // Only rewrite once. |
908 | if (UnswitchedExitBBs.insert(Ptr: ExitBB).second) |
909 | rewritePHINodesForUnswitchedExitBlock(UnswitchedBB&: *ExitBB, OldExitingBB&: *ParentBB, OldPH&: *OldPH); |
910 | continue; |
911 | } |
912 | |
913 | // Otherwise we need to split the exit block so that we retain an exit |
914 | // block from the loop and a target for the unswitched condition. |
915 | BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB]; |
916 | if (!SplitExitBB) { |
917 | // If this is the first time we see this, do the split and remember it. |
918 | SplitExitBB = SplitBlock(Old: ExitBB, SplitPt: ExitBB->begin(), DT: &DT, LI: &LI, MSSAU); |
919 | rewritePHINodesForExitAndUnswitchedBlocks(ExitBB&: *ExitBB, UnswitchedBB&: *SplitExitBB, |
920 | OldExitingBB&: *ParentBB, OldPH&: *OldPH, |
921 | /*FullUnswitch*/ true); |
922 | } |
923 | // Update the case pair to point to the split block. |
924 | std::get<1>(t&: ExitCase) = SplitExitBB; |
925 | } |
926 | |
927 | // Now add the unswitched cases. We do this in reverse order as we built them |
928 | // in reverse order. |
929 | for (auto &ExitCase : reverse(C&: ExitCases)) { |
930 | ConstantInt *CaseVal = std::get<0>(t&: ExitCase); |
931 | BasicBlock *UnswitchedBB = std::get<1>(t&: ExitCase); |
932 | |
933 | NewSIW.addCase(OnVal: CaseVal, Dest: UnswitchedBB, W: std::get<2>(t&: ExitCase)); |
934 | } |
935 | |
936 | // If the default was unswitched, re-point it and add explicit cases for |
937 | // entering the loop. |
938 | if (DefaultExitBB) { |
939 | NewSIW->setDefaultDest(DefaultExitBB); |
940 | NewSIW.setSuccessorWeight(idx: 0, W: DefaultCaseWeight); |
941 | |
942 | // We removed all the exit cases, so we just copy the cases to the |
943 | // unswitched switch. |
944 | for (const auto &Case : SI.cases()) |
945 | NewSIW.addCase(OnVal: Case.getCaseValue(), Dest: NewPH, |
946 | W: SIW.getSuccessorWeight(idx: Case.getSuccessorIndex())); |
947 | } else if (DefaultCaseWeight) { |
948 | // We have to set branch weight of the default case. |
949 | uint64_t SW = *DefaultCaseWeight; |
950 | for (const auto &Case : SI.cases()) { |
951 | auto W = SIW.getSuccessorWeight(idx: Case.getSuccessorIndex()); |
952 | assert(W && |
953 | "case weight must be defined as default case weight is defined" ); |
954 | SW += *W; |
955 | } |
956 | NewSIW.setSuccessorWeight(idx: 0, W: SW); |
957 | } |
958 | |
959 | // If we ended up with a common successor for every path through the switch |
960 | // after unswitching, rewrite it to an unconditional branch to make it easy |
961 | // to recognize. Otherwise we potentially have to recognize the default case |
962 | // pointing at unreachable and other complexity. |
963 | if (CommonSuccBB) { |
964 | BasicBlock *BB = SI.getParent(); |
965 | // We may have had multiple edges to this common successor block, so remove |
966 | // them as predecessors. We skip the first one, either the default or the |
967 | // actual first case. |
968 | bool SkippedFirst = DefaultExitBB == nullptr; |
969 | for (auto Case : SI.cases()) { |
970 | assert(Case.getCaseSuccessor() == CommonSuccBB && |
971 | "Non-common successor!" ); |
972 | (void)Case; |
973 | if (!SkippedFirst) { |
974 | SkippedFirst = true; |
975 | continue; |
976 | } |
977 | CommonSuccBB->removePredecessor(Pred: BB, |
978 | /*KeepOneInputPHIs*/ true); |
979 | } |
980 | // Now nuke the switch and replace it with a direct branch. |
981 | Instruction *NewBI = BranchInst::Create(IfTrue: CommonSuccBB, InsertBefore: BB); |
982 | NewBI->setDebugLoc(SIW->getDebugLoc()); |
983 | SIW.eraseFromParent(); |
984 | } else if (DefaultExitBB) { |
985 | assert(SI.getNumCases() > 0 && |
986 | "If we had no cases we'd have a common successor!" ); |
987 | // Move the last case to the default successor. This is valid as if the |
988 | // default got unswitched it cannot be reached. This has the advantage of |
989 | // being simple and keeping the number of edges from this switch to |
990 | // successors the same, and avoiding any PHI update complexity. |
991 | auto LastCaseI = std::prev(x: SI.case_end()); |
992 | |
993 | SI.setDefaultDest(LastCaseI->getCaseSuccessor()); |
994 | SIW.setSuccessorWeight( |
995 | idx: 0, W: SIW.getSuccessorWeight(idx: LastCaseI->getSuccessorIndex())); |
996 | SIW.removeCase(I: LastCaseI); |
997 | } |
998 | |
999 | // Walk the unswitched exit blocks and the unswitched split blocks and update |
1000 | // the dominator tree based on the CFG edits. While we are walking unordered |
1001 | // containers here, the API for applyUpdates takes an unordered list of |
1002 | // updates and requires them to not contain duplicates. |
1003 | SmallVector<DominatorTree::UpdateType, 4> DTUpdates; |
1004 | for (auto *UnswitchedExitBB : UnswitchedExitBBs) { |
1005 | DTUpdates.push_back(Elt: {DT.Delete, ParentBB, UnswitchedExitBB}); |
1006 | DTUpdates.push_back(Elt: {DT.Insert, OldPH, UnswitchedExitBB}); |
1007 | } |
1008 | for (auto SplitUnswitchedPair : SplitExitBBMap) { |
1009 | DTUpdates.push_back(Elt: {DT.Delete, ParentBB, SplitUnswitchedPair.first}); |
1010 | DTUpdates.push_back(Elt: {DT.Insert, OldPH, SplitUnswitchedPair.second}); |
1011 | } |
1012 | |
1013 | if (MSSAU) { |
1014 | MSSAU->applyUpdates(Updates: DTUpdates, DT, /*UpdateDT=*/UpdateDTFirst: true); |
1015 | if (VerifyMemorySSA) |
1016 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
1017 | } else { |
1018 | DT.applyUpdates(Updates: DTUpdates); |
1019 | } |
1020 | |
1021 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
1022 | |
1023 | // We may have changed the nesting relationship for this loop so hoist it to |
1024 | // its correct parent if needed. |
1025 | hoistLoopToNewParent(L, Preheader&: *NewPH, DT, LI, MSSAU, SE); |
1026 | |
1027 | if (MSSAU && VerifyMemorySSA) |
1028 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
1029 | |
1030 | ++NumTrivial; |
1031 | ++NumSwitches; |
1032 | LLVM_DEBUG(dbgs() << " done: unswitching trivial switch...\n" ); |
1033 | return true; |
1034 | } |
1035 | |
1036 | /// This routine scans the loop to find a branch or switch which occurs before |
1037 | /// any side effects occur. These can potentially be unswitched without |
1038 | /// duplicating the loop. If a branch or switch is successfully unswitched the |
1039 | /// scanning continues to see if subsequent branches or switches have become |
1040 | /// trivial. Once all trivial candidates have been unswitched, this routine |
1041 | /// returns. |
1042 | /// |
1043 | /// The return value indicates whether anything was unswitched (and therefore |
1044 | /// changed). |
1045 | /// |
1046 | /// If `SE` is not null, it will be updated based on the potential loop SCEVs |
1047 | /// invalidated by this. |
1048 | static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, |
1049 | LoopInfo &LI, ScalarEvolution *SE, |
1050 | MemorySSAUpdater *MSSAU) { |
1051 | bool Changed = false; |
1052 | |
1053 | // If loop header has only one reachable successor we should keep looking for |
1054 | // trivial condition candidates in the successor as well. An alternative is |
1055 | // to constant fold conditions and merge successors into loop header (then we |
1056 | // only need to check header's terminator). The reason for not doing this in |
1057 | // LoopUnswitch pass is that it could potentially break LoopPassManager's |
1058 | // invariants. Folding dead branches could either eliminate the current loop |
1059 | // or make other loops unreachable. LCSSA form might also not be preserved |
1060 | // after deleting branches. The following code keeps traversing loop header's |
1061 | // successors until it finds the trivial condition candidate (condition that |
1062 | // is not a constant). Since unswitching generates branches with constant |
1063 | // conditions, this scenario could be very common in practice. |
1064 | BasicBlock *CurrentBB = L.getHeader(); |
1065 | SmallPtrSet<BasicBlock *, 8> Visited; |
1066 | Visited.insert(Ptr: CurrentBB); |
1067 | do { |
1068 | // Check if there are any side-effecting instructions (e.g. stores, calls, |
1069 | // volatile loads) in the part of the loop that the code *would* execute |
1070 | // without unswitching. |
1071 | if (MSSAU) // Possible early exit with MSSA |
1072 | if (auto *Defs = MSSAU->getMemorySSA()->getBlockDefs(BB: CurrentBB)) |
1073 | if (!isa<MemoryPhi>(Val: *Defs->begin()) || (++Defs->begin() != Defs->end())) |
1074 | return Changed; |
1075 | if (llvm::any_of(Range&: *CurrentBB, |
1076 | P: [](Instruction &I) { return I.mayHaveSideEffects(); })) |
1077 | return Changed; |
1078 | |
1079 | Instruction *CurrentTerm = CurrentBB->getTerminator(); |
1080 | |
1081 | if (auto *SI = dyn_cast<SwitchInst>(Val: CurrentTerm)) { |
1082 | // Don't bother trying to unswitch past a switch with a constant |
1083 | // condition. This should be removed prior to running this pass by |
1084 | // simplifycfg. |
1085 | if (isa<Constant>(Val: SI->getCondition())) |
1086 | return Changed; |
1087 | |
1088 | if (!unswitchTrivialSwitch(L, SI&: *SI, DT, LI, SE, MSSAU)) |
1089 | // Couldn't unswitch this one so we're done. |
1090 | return Changed; |
1091 | |
1092 | // Mark that we managed to unswitch something. |
1093 | Changed = true; |
1094 | |
1095 | // If unswitching turned the terminator into an unconditional branch then |
1096 | // we can continue. The unswitching logic specifically works to fold any |
1097 | // cases it can into an unconditional branch to make it easier to |
1098 | // recognize here. |
1099 | auto *BI = dyn_cast<BranchInst>(Val: CurrentBB->getTerminator()); |
1100 | if (!BI || BI->isConditional()) |
1101 | return Changed; |
1102 | |
1103 | CurrentBB = BI->getSuccessor(i: 0); |
1104 | continue; |
1105 | } |
1106 | |
1107 | auto *BI = dyn_cast<BranchInst>(Val: CurrentTerm); |
1108 | if (!BI) |
1109 | // We do not understand other terminator instructions. |
1110 | return Changed; |
1111 | |
1112 | // Don't bother trying to unswitch past an unconditional branch or a branch |
1113 | // with a constant value. These should be removed by simplifycfg prior to |
1114 | // running this pass. |
1115 | if (!BI->isConditional() || |
1116 | isa<Constant>(Val: skipTrivialSelect(Cond: BI->getCondition()))) |
1117 | return Changed; |
1118 | |
1119 | // Found a trivial condition candidate: non-foldable conditional branch. If |
1120 | // we fail to unswitch this, we can't do anything else that is trivial. |
1121 | if (!unswitchTrivialBranch(L, BI&: *BI, DT, LI, SE, MSSAU)) |
1122 | return Changed; |
1123 | |
1124 | // Mark that we managed to unswitch something. |
1125 | Changed = true; |
1126 | |
1127 | // If we only unswitched some of the conditions feeding the branch, we won't |
1128 | // have collapsed it to a single successor. |
1129 | BI = cast<BranchInst>(Val: CurrentBB->getTerminator()); |
1130 | if (BI->isConditional()) |
1131 | return Changed; |
1132 | |
1133 | // Follow the newly unconditional branch into its successor. |
1134 | CurrentBB = BI->getSuccessor(i: 0); |
1135 | |
1136 | // When continuing, if we exit the loop or reach a previous visited block, |
1137 | // then we can not reach any trivial condition candidates (unfoldable |
1138 | // branch instructions or switch instructions) and no unswitch can happen. |
1139 | } while (L.contains(BB: CurrentBB) && Visited.insert(Ptr: CurrentBB).second); |
1140 | |
1141 | return Changed; |
1142 | } |
1143 | |
1144 | /// Build the cloned blocks for an unswitched copy of the given loop. |
1145 | /// |
1146 | /// The cloned blocks are inserted before the loop preheader (`LoopPH`) and |
1147 | /// after the split block (`SplitBB`) that will be used to select between the |
1148 | /// cloned and original loop. |
1149 | /// |
1150 | /// This routine handles cloning all of the necessary loop blocks and exit |
1151 | /// blocks including rewriting their instructions and the relevant PHI nodes. |
1152 | /// Any loop blocks or exit blocks which are dominated by a different successor |
1153 | /// than the one for this clone of the loop blocks can be trivially skipped. We |
1154 | /// use the `DominatingSucc` map to determine whether a block satisfies that |
1155 | /// property with a simple map lookup. |
1156 | /// |
1157 | /// It also correctly creates the unconditional branch in the cloned |
1158 | /// unswitched parent block to only point at the unswitched successor. |
1159 | /// |
1160 | /// This does not handle most of the necessary updates to `LoopInfo`. Only exit |
1161 | /// block splitting is correctly reflected in `LoopInfo`, essentially all of |
1162 | /// the cloned blocks (and their loops) are left without full `LoopInfo` |
1163 | /// updates. This also doesn't fully update `DominatorTree`. It adds the cloned |
1164 | /// blocks to them but doesn't create the cloned `DominatorTree` structure and |
1165 | /// instead the caller must recompute an accurate DT. It *does* correctly |
1166 | /// update the `AssumptionCache` provided in `AC`. |
1167 | static BasicBlock *buildClonedLoopBlocks( |
1168 | Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB, |
1169 | ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB, |
1170 | BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB, |
1171 | const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc, |
1172 | ValueToValueMapTy &VMap, |
1173 | SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC, |
1174 | DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, |
1175 | ScalarEvolution *SE) { |
1176 | SmallVector<BasicBlock *, 4> NewBlocks; |
1177 | NewBlocks.reserve(N: L.getNumBlocks() + ExitBlocks.size()); |
1178 | |
1179 | // We will need to clone a bunch of blocks, wrap up the clone operation in |
1180 | // a helper. |
1181 | auto CloneBlock = [&](BasicBlock *OldBB) { |
1182 | // Clone the basic block and insert it before the new preheader. |
1183 | BasicBlock *NewBB = CloneBasicBlock(BB: OldBB, VMap, NameSuffix: ".us" , F: OldBB->getParent()); |
1184 | NewBB->moveBefore(MovePos: LoopPH); |
1185 | |
1186 | // Record this block and the mapping. |
1187 | NewBlocks.push_back(Elt: NewBB); |
1188 | VMap[OldBB] = NewBB; |
1189 | |
1190 | return NewBB; |
1191 | }; |
1192 | |
1193 | // We skip cloning blocks when they have a dominating succ that is not the |
1194 | // succ we are cloning for. |
1195 | auto SkipBlock = [&](BasicBlock *BB) { |
1196 | auto It = DominatingSucc.find(Val: BB); |
1197 | return It != DominatingSucc.end() && It->second != UnswitchedSuccBB; |
1198 | }; |
1199 | |
1200 | // First, clone the preheader. |
1201 | auto *ClonedPH = CloneBlock(LoopPH); |
1202 | |
1203 | // Then clone all the loop blocks, skipping the ones that aren't necessary. |
1204 | for (auto *LoopBB : L.blocks()) |
1205 | if (!SkipBlock(LoopBB)) |
1206 | CloneBlock(LoopBB); |
1207 | |
1208 | // Split all the loop exit edges so that when we clone the exit blocks, if |
1209 | // any of the exit blocks are *also* a preheader for some other loop, we |
1210 | // don't create multiple predecessors entering the loop header. |
1211 | for (auto *ExitBB : ExitBlocks) { |
1212 | if (SkipBlock(ExitBB)) |
1213 | continue; |
1214 | |
1215 | // When we are going to clone an exit, we don't need to clone all the |
1216 | // instructions in the exit block and we want to ensure we have an easy |
1217 | // place to merge the CFG, so split the exit first. This is always safe to |
1218 | // do because there cannot be any non-loop predecessors of a loop exit in |
1219 | // loop simplified form. |
1220 | auto *MergeBB = SplitBlock(Old: ExitBB, SplitPt: ExitBB->begin(), DT: &DT, LI: &LI, MSSAU); |
1221 | |
1222 | // Rearrange the names to make it easier to write test cases by having the |
1223 | // exit block carry the suffix rather than the merge block carrying the |
1224 | // suffix. |
1225 | MergeBB->takeName(V: ExitBB); |
1226 | ExitBB->setName(Twine(MergeBB->getName()) + ".split" ); |
1227 | |
1228 | // Now clone the original exit block. |
1229 | auto *ClonedExitBB = CloneBlock(ExitBB); |
1230 | assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 && |
1231 | "Exit block should have been split to have one successor!" ); |
1232 | assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB && |
1233 | "Cloned exit block has the wrong successor!" ); |
1234 | |
1235 | // Remap any cloned instructions and create a merge phi node for them. |
1236 | for (auto ZippedInsts : llvm::zip_first( |
1237 | t: llvm::make_range(x: ExitBB->begin(), y: std::prev(x: ExitBB->end())), |
1238 | u: llvm::make_range(x: ClonedExitBB->begin(), |
1239 | y: std::prev(x: ClonedExitBB->end())))) { |
1240 | Instruction &I = std::get<0>(t&: ZippedInsts); |
1241 | Instruction &ClonedI = std::get<1>(t&: ZippedInsts); |
1242 | |
1243 | // The only instructions in the exit block should be PHI nodes and |
1244 | // potentially a landing pad. |
1245 | assert( |
1246 | (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) && |
1247 | "Bad instruction in exit block!" ); |
1248 | // We should have a value map between the instruction and its clone. |
1249 | assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!" ); |
1250 | |
1251 | // Forget SCEVs based on exit phis in case SCEV looked through the phi. |
1252 | if (SE && isa<PHINode>(Val: I)) |
1253 | SE->forgetValue(V: &I); |
1254 | |
1255 | BasicBlock::iterator InsertPt = MergeBB->getFirstInsertionPt(); |
1256 | |
1257 | auto *MergePN = |
1258 | PHINode::Create(Ty: I.getType(), /*NumReservedValues*/ 2, NameStr: ".us-phi" ); |
1259 | MergePN->insertBefore(InsertPos: InsertPt); |
1260 | MergePN->setDebugLoc(InsertPt->getDebugLoc()); |
1261 | I.replaceAllUsesWith(V: MergePN); |
1262 | MergePN->addIncoming(V: &I, BB: ExitBB); |
1263 | MergePN->addIncoming(V: &ClonedI, BB: ClonedExitBB); |
1264 | } |
1265 | } |
1266 | |
1267 | // Rewrite the instructions in the cloned blocks to refer to the instructions |
1268 | // in the cloned blocks. We have to do this as a second pass so that we have |
1269 | // everything available. Also, we have inserted new instructions which may |
1270 | // include assume intrinsics, so we update the assumption cache while |
1271 | // processing this. |
1272 | Module *M = ClonedPH->getParent()->getParent(); |
1273 | for (auto *ClonedBB : NewBlocks) |
1274 | for (Instruction &I : *ClonedBB) { |
1275 | RemapDbgRecordRange(M, Range: I.getDbgRecordRange(), VM&: VMap, |
1276 | Flags: RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); |
1277 | RemapInstruction(I: &I, VM&: VMap, |
1278 | Flags: RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); |
1279 | if (auto *II = dyn_cast<AssumeInst>(Val: &I)) |
1280 | AC.registerAssumption(CI: II); |
1281 | } |
1282 | |
1283 | // Update any PHI nodes in the cloned successors of the skipped blocks to not |
1284 | // have spurious incoming values. |
1285 | for (auto *LoopBB : L.blocks()) |
1286 | if (SkipBlock(LoopBB)) |
1287 | for (auto *SuccBB : successors(BB: LoopBB)) |
1288 | if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: SuccBB))) |
1289 | for (PHINode &PN : ClonedSuccBB->phis()) |
1290 | PN.removeIncomingValue(BB: LoopBB, /*DeletePHIIfEmpty*/ false); |
1291 | |
1292 | // Remove the cloned parent as a predecessor of any successor we ended up |
1293 | // cloning other than the unswitched one. |
1294 | auto *ClonedParentBB = cast<BasicBlock>(Val: VMap.lookup(Val: ParentBB)); |
1295 | for (auto *SuccBB : successors(BB: ParentBB)) { |
1296 | if (SuccBB == UnswitchedSuccBB) |
1297 | continue; |
1298 | |
1299 | auto *ClonedSuccBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: SuccBB)); |
1300 | if (!ClonedSuccBB) |
1301 | continue; |
1302 | |
1303 | ClonedSuccBB->removePredecessor(Pred: ClonedParentBB, |
1304 | /*KeepOneInputPHIs*/ true); |
1305 | } |
1306 | |
1307 | // Replace the cloned branch with an unconditional branch to the cloned |
1308 | // unswitched successor. |
1309 | auto *ClonedSuccBB = cast<BasicBlock>(Val: VMap.lookup(Val: UnswitchedSuccBB)); |
1310 | Instruction *ClonedTerminator = ClonedParentBB->getTerminator(); |
1311 | // Trivial Simplification. If Terminator is a conditional branch and |
1312 | // condition becomes dead - erase it. |
1313 | Value *ClonedConditionToErase = nullptr; |
1314 | if (auto *BI = dyn_cast<BranchInst>(Val: ClonedTerminator)) |
1315 | ClonedConditionToErase = BI->getCondition(); |
1316 | else if (auto *SI = dyn_cast<SwitchInst>(Val: ClonedTerminator)) |
1317 | ClonedConditionToErase = SI->getCondition(); |
1318 | |
1319 | Instruction *BI = BranchInst::Create(IfTrue: ClonedSuccBB, InsertBefore: ClonedParentBB); |
1320 | BI->setDebugLoc(ClonedTerminator->getDebugLoc()); |
1321 | ClonedTerminator->eraseFromParent(); |
1322 | |
1323 | if (ClonedConditionToErase) |
1324 | RecursivelyDeleteTriviallyDeadInstructions(V: ClonedConditionToErase, TLI: nullptr, |
1325 | MSSAU); |
1326 | |
1327 | // If there are duplicate entries in the PHI nodes because of multiple edges |
1328 | // to the unswitched successor, we need to nuke all but one as we replaced it |
1329 | // with a direct branch. |
1330 | for (PHINode &PN : ClonedSuccBB->phis()) { |
1331 | bool Found = false; |
1332 | // Loop over the incoming operands backwards so we can easily delete as we |
1333 | // go without invalidating the index. |
1334 | for (int i = PN.getNumOperands() - 1; i >= 0; --i) { |
1335 | if (PN.getIncomingBlock(i) != ClonedParentBB) |
1336 | continue; |
1337 | if (!Found) { |
1338 | Found = true; |
1339 | continue; |
1340 | } |
1341 | PN.removeIncomingValue(Idx: i, /*DeletePHIIfEmpty*/ false); |
1342 | } |
1343 | } |
1344 | |
1345 | // Record the domtree updates for the new blocks. |
1346 | SmallPtrSet<BasicBlock *, 4> SuccSet; |
1347 | for (auto *ClonedBB : NewBlocks) { |
1348 | for (auto *SuccBB : successors(BB: ClonedBB)) |
1349 | if (SuccSet.insert(Ptr: SuccBB).second) |
1350 | DTUpdates.push_back(Elt: {DominatorTree::Insert, ClonedBB, SuccBB}); |
1351 | SuccSet.clear(); |
1352 | } |
1353 | |
1354 | return ClonedPH; |
1355 | } |
1356 | |
1357 | /// Recursively clone the specified loop and all of its children. |
1358 | /// |
1359 | /// The target parent loop for the clone should be provided, or can be null if |
1360 | /// the clone is a top-level loop. While cloning, all the blocks are mapped |
1361 | /// with the provided value map. The entire original loop must be present in |
1362 | /// the value map. The cloned loop is returned. |
1363 | static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL, |
1364 | const ValueToValueMapTy &VMap, LoopInfo &LI) { |
1365 | auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) { |
1366 | assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!" ); |
1367 | ClonedL.reserveBlocks(size: OrigL.getNumBlocks()); |
1368 | for (auto *BB : OrigL.blocks()) { |
1369 | auto *ClonedBB = cast<BasicBlock>(Val: VMap.lookup(Val: BB)); |
1370 | ClonedL.addBlockEntry(BB: ClonedBB); |
1371 | if (LI.getLoopFor(BB) == &OrigL) |
1372 | LI.changeLoopFor(BB: ClonedBB, L: &ClonedL); |
1373 | } |
1374 | }; |
1375 | |
1376 | // We specially handle the first loop because it may get cloned into |
1377 | // a different parent and because we most commonly are cloning leaf loops. |
1378 | Loop *ClonedRootL = LI.AllocateLoop(); |
1379 | if (RootParentL) |
1380 | RootParentL->addChildLoop(NewChild: ClonedRootL); |
1381 | else |
1382 | LI.addTopLevelLoop(New: ClonedRootL); |
1383 | AddClonedBlocksToLoop(OrigRootL, *ClonedRootL); |
1384 | |
1385 | if (OrigRootL.isInnermost()) |
1386 | return ClonedRootL; |
1387 | |
1388 | // If we have a nest, we can quickly clone the entire loop nest using an |
1389 | // iterative approach because it is a tree. We keep the cloned parent in the |
1390 | // data structure to avoid repeatedly querying through a map to find it. |
1391 | SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone; |
1392 | // Build up the loops to clone in reverse order as we'll clone them from the |
1393 | // back. |
1394 | for (Loop *ChildL : llvm::reverse(C&: OrigRootL)) |
1395 | LoopsToClone.push_back(Elt: {ClonedRootL, ChildL}); |
1396 | do { |
1397 | Loop *ClonedParentL, *L; |
1398 | std::tie(args&: ClonedParentL, args&: L) = LoopsToClone.pop_back_val(); |
1399 | Loop *ClonedL = LI.AllocateLoop(); |
1400 | ClonedParentL->addChildLoop(NewChild: ClonedL); |
1401 | AddClonedBlocksToLoop(*L, *ClonedL); |
1402 | for (Loop *ChildL : llvm::reverse(C&: *L)) |
1403 | LoopsToClone.push_back(Elt: {ClonedL, ChildL}); |
1404 | } while (!LoopsToClone.empty()); |
1405 | |
1406 | return ClonedRootL; |
1407 | } |
1408 | |
1409 | /// Build the cloned loops of an original loop from unswitching. |
1410 | /// |
1411 | /// Because unswitching simplifies the CFG of the loop, this isn't a trivial |
1412 | /// operation. We need to re-verify that there even is a loop (as the backedge |
1413 | /// may not have been cloned), and even if there are remaining backedges the |
1414 | /// backedge set may be different. However, we know that each child loop is |
1415 | /// undisturbed, we only need to find where to place each child loop within |
1416 | /// either any parent loop or within a cloned version of the original loop. |
1417 | /// |
1418 | /// Because child loops may end up cloned outside of any cloned version of the |
1419 | /// original loop, multiple cloned sibling loops may be created. All of them |
1420 | /// are returned so that the newly introduced loop nest roots can be |
1421 | /// identified. |
1422 | static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, |
1423 | const ValueToValueMapTy &VMap, LoopInfo &LI, |
1424 | SmallVectorImpl<Loop *> &NonChildClonedLoops) { |
1425 | Loop *ClonedL = nullptr; |
1426 | |
1427 | auto *OrigPH = OrigL.getLoopPreheader(); |
1428 | auto * = OrigL.getHeader(); |
1429 | |
1430 | auto *ClonedPH = cast<BasicBlock>(Val: VMap.lookup(Val: OrigPH)); |
1431 | auto * = cast<BasicBlock>(Val: VMap.lookup(Val: OrigHeader)); |
1432 | |
1433 | // We need to know the loops of the cloned exit blocks to even compute the |
1434 | // accurate parent loop. If we only clone exits to some parent of the |
1435 | // original parent, we want to clone into that outer loop. We also keep track |
1436 | // of the loops that our cloned exit blocks participate in. |
1437 | Loop *ParentL = nullptr; |
1438 | SmallVector<BasicBlock *, 4> ClonedExitsInLoops; |
1439 | SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap; |
1440 | ClonedExitsInLoops.reserve(N: ExitBlocks.size()); |
1441 | for (auto *ExitBB : ExitBlocks) |
1442 | if (auto *ClonedExitBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: ExitBB))) |
1443 | if (Loop *ExitL = LI.getLoopFor(BB: ExitBB)) { |
1444 | ExitLoopMap[ClonedExitBB] = ExitL; |
1445 | ClonedExitsInLoops.push_back(Elt: ClonedExitBB); |
1446 | if (!ParentL || (ParentL != ExitL && ParentL->contains(L: ExitL))) |
1447 | ParentL = ExitL; |
1448 | } |
1449 | assert((!ParentL || ParentL == OrigL.getParentLoop() || |
1450 | ParentL->contains(OrigL.getParentLoop())) && |
1451 | "The computed parent loop should always contain (or be) the parent of " |
1452 | "the original loop." ); |
1453 | |
1454 | // We build the set of blocks dominated by the cloned header from the set of |
1455 | // cloned blocks out of the original loop. While not all of these will |
1456 | // necessarily be in the cloned loop, it is enough to establish that they |
1457 | // aren't in unreachable cycles, etc. |
1458 | SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks; |
1459 | for (auto *BB : OrigL.blocks()) |
1460 | if (auto *ClonedBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: BB))) |
1461 | ClonedLoopBlocks.insert(X: ClonedBB); |
1462 | |
1463 | // Rebuild the set of blocks that will end up in the cloned loop. We may have |
1464 | // skipped cloning some region of this loop which can in turn skip some of |
1465 | // the backedges so we have to rebuild the blocks in the loop based on the |
1466 | // backedges that remain after cloning. |
1467 | SmallVector<BasicBlock *, 16> Worklist; |
1468 | SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop; |
1469 | for (auto *Pred : predecessors(BB: ClonedHeader)) { |
1470 | // The only possible non-loop header predecessor is the preheader because |
1471 | // we know we cloned the loop in simplified form. |
1472 | if (Pred == ClonedPH) |
1473 | continue; |
1474 | |
1475 | // Because the loop was in simplified form, the only non-loop predecessor |
1476 | // should be the preheader. |
1477 | assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop " |
1478 | "header other than the preheader " |
1479 | "that is not part of the loop!" ); |
1480 | |
1481 | // Insert this block into the loop set and on the first visit (and if it |
1482 | // isn't the header we're currently walking) put it into the worklist to |
1483 | // recurse through. |
1484 | if (BlocksInClonedLoop.insert(Ptr: Pred).second && Pred != ClonedHeader) |
1485 | Worklist.push_back(Elt: Pred); |
1486 | } |
1487 | |
1488 | // If we had any backedges then there *is* a cloned loop. Put the header into |
1489 | // the loop set and then walk the worklist backwards to find all the blocks |
1490 | // that remain within the loop after cloning. |
1491 | if (!BlocksInClonedLoop.empty()) { |
1492 | BlocksInClonedLoop.insert(Ptr: ClonedHeader); |
1493 | |
1494 | while (!Worklist.empty()) { |
1495 | BasicBlock *BB = Worklist.pop_back_val(); |
1496 | assert(BlocksInClonedLoop.count(BB) && |
1497 | "Didn't put block into the loop set!" ); |
1498 | |
1499 | // Insert any predecessors that are in the possible set into the cloned |
1500 | // set, and if the insert is successful, add them to the worklist. Note |
1501 | // that we filter on the blocks that are definitely reachable via the |
1502 | // backedge to the loop header so we may prune out dead code within the |
1503 | // cloned loop. |
1504 | for (auto *Pred : predecessors(BB)) |
1505 | if (ClonedLoopBlocks.count(key: Pred) && |
1506 | BlocksInClonedLoop.insert(Ptr: Pred).second) |
1507 | Worklist.push_back(Elt: Pred); |
1508 | } |
1509 | |
1510 | ClonedL = LI.AllocateLoop(); |
1511 | if (ParentL) { |
1512 | ParentL->addBasicBlockToLoop(NewBB: ClonedPH, LI); |
1513 | ParentL->addChildLoop(NewChild: ClonedL); |
1514 | } else { |
1515 | LI.addTopLevelLoop(New: ClonedL); |
1516 | } |
1517 | NonChildClonedLoops.push_back(Elt: ClonedL); |
1518 | |
1519 | ClonedL->reserveBlocks(size: BlocksInClonedLoop.size()); |
1520 | // We don't want to just add the cloned loop blocks based on how we |
1521 | // discovered them. The original order of blocks was carefully built in |
1522 | // a way that doesn't rely on predecessor ordering. Rather than re-invent |
1523 | // that logic, we just re-walk the original blocks (and those of the child |
1524 | // loops) and filter them as we add them into the cloned loop. |
1525 | for (auto *BB : OrigL.blocks()) { |
1526 | auto *ClonedBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: BB)); |
1527 | if (!ClonedBB || !BlocksInClonedLoop.count(Ptr: ClonedBB)) |
1528 | continue; |
1529 | |
1530 | // Directly add the blocks that are only in this loop. |
1531 | if (LI.getLoopFor(BB) == &OrigL) { |
1532 | ClonedL->addBasicBlockToLoop(NewBB: ClonedBB, LI); |
1533 | continue; |
1534 | } |
1535 | |
1536 | // We want to manually add it to this loop and parents. |
1537 | // Registering it with LoopInfo will happen when we clone the top |
1538 | // loop for this block. |
1539 | for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop()) |
1540 | PL->addBlockEntry(BB: ClonedBB); |
1541 | } |
1542 | |
1543 | // Now add each child loop whose header remains within the cloned loop. All |
1544 | // of the blocks within the loop must satisfy the same constraints as the |
1545 | // header so once we pass the header checks we can just clone the entire |
1546 | // child loop nest. |
1547 | for (Loop *ChildL : OrigL) { |
1548 | auto * = |
1549 | cast_or_null<BasicBlock>(Val: VMap.lookup(Val: ChildL->getHeader())); |
1550 | if (!ClonedChildHeader || !BlocksInClonedLoop.count(Ptr: ClonedChildHeader)) |
1551 | continue; |
1552 | |
1553 | #ifndef NDEBUG |
1554 | // We should never have a cloned child loop header but fail to have |
1555 | // all of the blocks for that child loop. |
1556 | for (auto *ChildLoopBB : ChildL->blocks()) |
1557 | assert(BlocksInClonedLoop.count( |
1558 | cast<BasicBlock>(VMap.lookup(ChildLoopBB))) && |
1559 | "Child cloned loop has a header within the cloned outer " |
1560 | "loop but not all of its blocks!" ); |
1561 | #endif |
1562 | |
1563 | cloneLoopNest(OrigRootL&: *ChildL, RootParentL: ClonedL, VMap, LI); |
1564 | } |
1565 | } |
1566 | |
1567 | // Now that we've handled all the components of the original loop that were |
1568 | // cloned into a new loop, we still need to handle anything from the original |
1569 | // loop that wasn't in a cloned loop. |
1570 | |
1571 | // Figure out what blocks are left to place within any loop nest containing |
1572 | // the unswitched loop. If we never formed a loop, the cloned PH is one of |
1573 | // them. |
1574 | SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet; |
1575 | if (BlocksInClonedLoop.empty()) |
1576 | UnloopedBlockSet.insert(Ptr: ClonedPH); |
1577 | for (auto *ClonedBB : ClonedLoopBlocks) |
1578 | if (!BlocksInClonedLoop.count(Ptr: ClonedBB)) |
1579 | UnloopedBlockSet.insert(Ptr: ClonedBB); |
1580 | |
1581 | // Copy the cloned exits and sort them in ascending loop depth, we'll work |
1582 | // backwards across these to process them inside out. The order shouldn't |
1583 | // matter as we're just trying to build up the map from inside-out; we use |
1584 | // the map in a more stably ordered way below. |
1585 | auto OrderedClonedExitsInLoops = ClonedExitsInLoops; |
1586 | llvm::sort(C&: OrderedClonedExitsInLoops, Comp: [&](BasicBlock *LHS, BasicBlock *RHS) { |
1587 | return ExitLoopMap.lookup(Val: LHS)->getLoopDepth() < |
1588 | ExitLoopMap.lookup(Val: RHS)->getLoopDepth(); |
1589 | }); |
1590 | |
1591 | // Populate the existing ExitLoopMap with everything reachable from each |
1592 | // exit, starting from the inner most exit. |
1593 | while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) { |
1594 | assert(Worklist.empty() && "Didn't clear worklist!" ); |
1595 | |
1596 | BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val(); |
1597 | Loop *ExitL = ExitLoopMap.lookup(Val: ExitBB); |
1598 | |
1599 | // Walk the CFG back until we hit the cloned PH adding everything reachable |
1600 | // and in the unlooped set to this exit block's loop. |
1601 | Worklist.push_back(Elt: ExitBB); |
1602 | do { |
1603 | BasicBlock *BB = Worklist.pop_back_val(); |
1604 | // We can stop recursing at the cloned preheader (if we get there). |
1605 | if (BB == ClonedPH) |
1606 | continue; |
1607 | |
1608 | for (BasicBlock *PredBB : predecessors(BB)) { |
1609 | // If this pred has already been moved to our set or is part of some |
1610 | // (inner) loop, no update needed. |
1611 | if (!UnloopedBlockSet.erase(Ptr: PredBB)) { |
1612 | assert( |
1613 | (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) && |
1614 | "Predecessor not mapped to a loop!" ); |
1615 | continue; |
1616 | } |
1617 | |
1618 | // We just insert into the loop set here. We'll add these blocks to the |
1619 | // exit loop after we build up the set in an order that doesn't rely on |
1620 | // predecessor order (which in turn relies on use list order). |
1621 | bool Inserted = ExitLoopMap.insert(KV: {PredBB, ExitL}).second; |
1622 | (void)Inserted; |
1623 | assert(Inserted && "Should only visit an unlooped block once!" ); |
1624 | |
1625 | // And recurse through to its predecessors. |
1626 | Worklist.push_back(Elt: PredBB); |
1627 | } |
1628 | } while (!Worklist.empty()); |
1629 | } |
1630 | |
1631 | // Now that the ExitLoopMap gives as mapping for all the non-looping cloned |
1632 | // blocks to their outer loops, walk the cloned blocks and the cloned exits |
1633 | // in their original order adding them to the correct loop. |
1634 | |
1635 | // We need a stable insertion order. We use the order of the original loop |
1636 | // order and map into the correct parent loop. |
1637 | for (auto *BB : llvm::concat<BasicBlock *const>( |
1638 | Ranges: ArrayRef(ClonedPH), Ranges&: ClonedLoopBlocks, Ranges&: ClonedExitsInLoops)) |
1639 | if (Loop *OuterL = ExitLoopMap.lookup(Val: BB)) |
1640 | OuterL->addBasicBlockToLoop(NewBB: BB, LI); |
1641 | |
1642 | #ifndef NDEBUG |
1643 | for (auto &BBAndL : ExitLoopMap) { |
1644 | auto *BB = BBAndL.first; |
1645 | auto *OuterL = BBAndL.second; |
1646 | assert(LI.getLoopFor(BB) == OuterL && |
1647 | "Failed to put all blocks into outer loops!" ); |
1648 | } |
1649 | #endif |
1650 | |
1651 | // Now that all the blocks are placed into the correct containing loop in the |
1652 | // absence of child loops, find all the potentially cloned child loops and |
1653 | // clone them into whatever outer loop we placed their header into. |
1654 | for (Loop *ChildL : OrigL) { |
1655 | auto * = |
1656 | cast_or_null<BasicBlock>(Val: VMap.lookup(Val: ChildL->getHeader())); |
1657 | if (!ClonedChildHeader || BlocksInClonedLoop.count(Ptr: ClonedChildHeader)) |
1658 | continue; |
1659 | |
1660 | #ifndef NDEBUG |
1661 | for (auto *ChildLoopBB : ChildL->blocks()) |
1662 | assert(VMap.count(ChildLoopBB) && |
1663 | "Cloned a child loop header but not all of that loops blocks!" ); |
1664 | #endif |
1665 | |
1666 | NonChildClonedLoops.push_back(Elt: cloneLoopNest( |
1667 | OrigRootL&: *ChildL, RootParentL: ExitLoopMap.lookup(Val: ClonedChildHeader), VMap, LI)); |
1668 | } |
1669 | } |
1670 | |
1671 | static void |
1672 | deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, |
1673 | ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps, |
1674 | DominatorTree &DT, MemorySSAUpdater *MSSAU) { |
1675 | // Find all the dead clones, and remove them from their successors. |
1676 | SmallVector<BasicBlock *, 16> DeadBlocks; |
1677 | for (BasicBlock *BB : llvm::concat<BasicBlock *const>(Ranges: L.blocks(), Ranges&: ExitBlocks)) |
1678 | for (const auto &VMap : VMaps) |
1679 | if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(Val: VMap->lookup(Val: BB))) |
1680 | if (!DT.isReachableFromEntry(A: ClonedBB)) { |
1681 | for (BasicBlock *SuccBB : successors(BB: ClonedBB)) |
1682 | SuccBB->removePredecessor(Pred: ClonedBB); |
1683 | DeadBlocks.push_back(Elt: ClonedBB); |
1684 | } |
1685 | |
1686 | // Remove all MemorySSA in the dead blocks |
1687 | if (MSSAU) { |
1688 | SmallSetVector<BasicBlock *, 8> DeadBlockSet(DeadBlocks.begin(), |
1689 | DeadBlocks.end()); |
1690 | MSSAU->removeBlocks(DeadBlocks: DeadBlockSet); |
1691 | } |
1692 | |
1693 | // Drop any remaining references to break cycles. |
1694 | for (BasicBlock *BB : DeadBlocks) |
1695 | BB->dropAllReferences(); |
1696 | // Erase them from the IR. |
1697 | for (BasicBlock *BB : DeadBlocks) |
1698 | BB->eraseFromParent(); |
1699 | } |
1700 | |
1701 | static void deleteDeadBlocksFromLoop(Loop &L, |
1702 | SmallVectorImpl<BasicBlock *> &ExitBlocks, |
1703 | DominatorTree &DT, LoopInfo &LI, |
1704 | MemorySSAUpdater *MSSAU, |
1705 | ScalarEvolution *SE, |
1706 | LPMUpdater &LoopUpdater) { |
1707 | // Find all the dead blocks tied to this loop, and remove them from their |
1708 | // successors. |
1709 | SmallSetVector<BasicBlock *, 8> DeadBlockSet; |
1710 | |
1711 | // Start with loop/exit blocks and get a transitive closure of reachable dead |
1712 | // blocks. |
1713 | SmallVector<BasicBlock *, 16> DeathCandidates(ExitBlocks.begin(), |
1714 | ExitBlocks.end()); |
1715 | DeathCandidates.append(in_start: L.blocks().begin(), in_end: L.blocks().end()); |
1716 | while (!DeathCandidates.empty()) { |
1717 | auto *BB = DeathCandidates.pop_back_val(); |
1718 | if (!DeadBlockSet.count(key: BB) && !DT.isReachableFromEntry(A: BB)) { |
1719 | for (BasicBlock *SuccBB : successors(BB)) { |
1720 | SuccBB->removePredecessor(Pred: BB); |
1721 | DeathCandidates.push_back(Elt: SuccBB); |
1722 | } |
1723 | DeadBlockSet.insert(X: BB); |
1724 | } |
1725 | } |
1726 | |
1727 | // Remove all MemorySSA in the dead blocks |
1728 | if (MSSAU) |
1729 | MSSAU->removeBlocks(DeadBlocks: DeadBlockSet); |
1730 | |
1731 | // Filter out the dead blocks from the exit blocks list so that it can be |
1732 | // used in the caller. |
1733 | llvm::erase_if(C&: ExitBlocks, |
1734 | P: [&](BasicBlock *BB) { return DeadBlockSet.count(key: BB); }); |
1735 | |
1736 | // Walk from this loop up through its parents removing all of the dead blocks. |
1737 | for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) { |
1738 | for (auto *BB : DeadBlockSet) |
1739 | ParentL->getBlocksSet().erase(Ptr: BB); |
1740 | llvm::erase_if(C&: ParentL->getBlocksVector(), |
1741 | P: [&](BasicBlock *BB) { return DeadBlockSet.count(key: BB); }); |
1742 | } |
1743 | |
1744 | // Now delete the dead child loops. This raw delete will clear them |
1745 | // recursively. |
1746 | llvm::erase_if(C&: L.getSubLoopsVector(), P: [&](Loop *ChildL) { |
1747 | if (!DeadBlockSet.count(key: ChildL->getHeader())) |
1748 | return false; |
1749 | |
1750 | assert(llvm::all_of(ChildL->blocks(), |
1751 | [&](BasicBlock *ChildBB) { |
1752 | return DeadBlockSet.count(ChildBB); |
1753 | }) && |
1754 | "If the child loop header is dead all blocks in the child loop must " |
1755 | "be dead as well!" ); |
1756 | LoopUpdater.markLoopAsDeleted(L&: *ChildL, Name: ChildL->getName()); |
1757 | if (SE) |
1758 | SE->forgetBlockAndLoopDispositions(); |
1759 | LI.destroy(L: ChildL); |
1760 | return true; |
1761 | }); |
1762 | |
1763 | // Remove the loop mappings for the dead blocks and drop all the references |
1764 | // from these blocks to others to handle cyclic references as we start |
1765 | // deleting the blocks themselves. |
1766 | for (auto *BB : DeadBlockSet) { |
1767 | // Check that the dominator tree has already been updated. |
1768 | assert(!DT.getNode(BB) && "Should already have cleared domtree!" ); |
1769 | LI.changeLoopFor(BB, L: nullptr); |
1770 | // Drop all uses of the instructions to make sure we won't have dangling |
1771 | // uses in other blocks. |
1772 | for (auto &I : *BB) |
1773 | if (!I.use_empty()) |
1774 | I.replaceAllUsesWith(V: PoisonValue::get(T: I.getType())); |
1775 | BB->dropAllReferences(); |
1776 | } |
1777 | |
1778 | // Actually delete the blocks now that they've been fully unhooked from the |
1779 | // IR. |
1780 | for (auto *BB : DeadBlockSet) |
1781 | BB->eraseFromParent(); |
1782 | } |
1783 | |
1784 | /// Recompute the set of blocks in a loop after unswitching. |
1785 | /// |
1786 | /// This walks from the original headers predecessors to rebuild the loop. We |
1787 | /// take advantage of the fact that new blocks can't have been added, and so we |
1788 | /// filter by the original loop's blocks. This also handles potentially |
1789 | /// unreachable code that we don't want to explore but might be found examining |
1790 | /// the predecessors of the header. |
1791 | /// |
1792 | /// If the original loop is no longer a loop, this will return an empty set. If |
1793 | /// it remains a loop, all the blocks within it will be added to the set |
1794 | /// (including those blocks in inner loops). |
1795 | static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L, |
1796 | LoopInfo &LI) { |
1797 | SmallPtrSet<const BasicBlock *, 16> LoopBlockSet; |
1798 | |
1799 | auto *PH = L.getLoopPreheader(); |
1800 | auto * = L.getHeader(); |
1801 | |
1802 | // A worklist to use while walking backwards from the header. |
1803 | SmallVector<BasicBlock *, 16> Worklist; |
1804 | |
1805 | // First walk the predecessors of the header to find the backedges. This will |
1806 | // form the basis of our walk. |
1807 | for (auto *Pred : predecessors(BB: Header)) { |
1808 | // Skip the preheader. |
1809 | if (Pred == PH) |
1810 | continue; |
1811 | |
1812 | // Because the loop was in simplified form, the only non-loop predecessor |
1813 | // is the preheader. |
1814 | assert(L.contains(Pred) && "Found a predecessor of the loop header other " |
1815 | "than the preheader that is not part of the " |
1816 | "loop!" ); |
1817 | |
1818 | // Insert this block into the loop set and on the first visit and, if it |
1819 | // isn't the header we're currently walking, put it into the worklist to |
1820 | // recurse through. |
1821 | if (LoopBlockSet.insert(Ptr: Pred).second && Pred != Header) |
1822 | Worklist.push_back(Elt: Pred); |
1823 | } |
1824 | |
1825 | // If no backedges were found, we're done. |
1826 | if (LoopBlockSet.empty()) |
1827 | return LoopBlockSet; |
1828 | |
1829 | // We found backedges, recurse through them to identify the loop blocks. |
1830 | while (!Worklist.empty()) { |
1831 | BasicBlock *BB = Worklist.pop_back_val(); |
1832 | assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!" ); |
1833 | |
1834 | // No need to walk past the header. |
1835 | if (BB == Header) |
1836 | continue; |
1837 | |
1838 | // Because we know the inner loop structure remains valid we can use the |
1839 | // loop structure to jump immediately across the entire nested loop. |
1840 | // Further, because it is in loop simplified form, we can directly jump |
1841 | // to its preheader afterward. |
1842 | if (Loop *InnerL = LI.getLoopFor(BB)) |
1843 | if (InnerL != &L) { |
1844 | assert(L.contains(InnerL) && |
1845 | "Should not reach a loop *outside* this loop!" ); |
1846 | // The preheader is the only possible predecessor of the loop so |
1847 | // insert it into the set and check whether it was already handled. |
1848 | auto *InnerPH = InnerL->getLoopPreheader(); |
1849 | assert(L.contains(InnerPH) && "Cannot contain an inner loop block " |
1850 | "but not contain the inner loop " |
1851 | "preheader!" ); |
1852 | if (!LoopBlockSet.insert(Ptr: InnerPH).second) |
1853 | // The only way to reach the preheader is through the loop body |
1854 | // itself so if it has been visited the loop is already handled. |
1855 | continue; |
1856 | |
1857 | // Insert all of the blocks (other than those already present) into |
1858 | // the loop set. We expect at least the block that led us to find the |
1859 | // inner loop to be in the block set, but we may also have other loop |
1860 | // blocks if they were already enqueued as predecessors of some other |
1861 | // outer loop block. |
1862 | for (auto *InnerBB : InnerL->blocks()) { |
1863 | if (InnerBB == BB) { |
1864 | assert(LoopBlockSet.count(InnerBB) && |
1865 | "Block should already be in the set!" ); |
1866 | continue; |
1867 | } |
1868 | |
1869 | LoopBlockSet.insert(Ptr: InnerBB); |
1870 | } |
1871 | |
1872 | // Add the preheader to the worklist so we will continue past the |
1873 | // loop body. |
1874 | Worklist.push_back(Elt: InnerPH); |
1875 | continue; |
1876 | } |
1877 | |
1878 | // Insert any predecessors that were in the original loop into the new |
1879 | // set, and if the insert is successful, add them to the worklist. |
1880 | for (auto *Pred : predecessors(BB)) |
1881 | if (L.contains(BB: Pred) && LoopBlockSet.insert(Ptr: Pred).second) |
1882 | Worklist.push_back(Elt: Pred); |
1883 | } |
1884 | |
1885 | assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!" ); |
1886 | |
1887 | // We've found all the blocks participating in the loop, return our completed |
1888 | // set. |
1889 | return LoopBlockSet; |
1890 | } |
1891 | |
1892 | /// Rebuild a loop after unswitching removes some subset of blocks and edges. |
1893 | /// |
1894 | /// The removal may have removed some child loops entirely but cannot have |
1895 | /// disturbed any remaining child loops. However, they may need to be hoisted |
1896 | /// to the parent loop (or to be top-level loops). The original loop may be |
1897 | /// completely removed. |
1898 | /// |
1899 | /// The sibling loops resulting from this update are returned. If the original |
1900 | /// loop remains a valid loop, it will be the first entry in this list with all |
1901 | /// of the newly sibling loops following it. |
1902 | /// |
1903 | /// Returns true if the loop remains a loop after unswitching, and false if it |
1904 | /// is no longer a loop after unswitching (and should not continue to be |
1905 | /// referenced). |
1906 | static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, |
1907 | LoopInfo &LI, |
1908 | SmallVectorImpl<Loop *> &HoistedLoops, |
1909 | ScalarEvolution *SE) { |
1910 | auto *PH = L.getLoopPreheader(); |
1911 | |
1912 | // Compute the actual parent loop from the exit blocks. Because we may have |
1913 | // pruned some exits the loop may be different from the original parent. |
1914 | Loop *ParentL = nullptr; |
1915 | SmallVector<Loop *, 4> ExitLoops; |
1916 | SmallVector<BasicBlock *, 4> ExitsInLoops; |
1917 | ExitsInLoops.reserve(N: ExitBlocks.size()); |
1918 | for (auto *ExitBB : ExitBlocks) |
1919 | if (Loop *ExitL = LI.getLoopFor(BB: ExitBB)) { |
1920 | ExitLoops.push_back(Elt: ExitL); |
1921 | ExitsInLoops.push_back(Elt: ExitBB); |
1922 | if (!ParentL || (ParentL != ExitL && ParentL->contains(L: ExitL))) |
1923 | ParentL = ExitL; |
1924 | } |
1925 | |
1926 | // Recompute the blocks participating in this loop. This may be empty if it |
1927 | // is no longer a loop. |
1928 | auto LoopBlockSet = recomputeLoopBlockSet(L, LI); |
1929 | |
1930 | // If we still have a loop, we need to re-set the loop's parent as the exit |
1931 | // block set changing may have moved it within the loop nest. Note that this |
1932 | // can only happen when this loop has a parent as it can only hoist the loop |
1933 | // *up* the nest. |
1934 | if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) { |
1935 | // Remove this loop's (original) blocks from all of the intervening loops. |
1936 | for (Loop *IL = L.getParentLoop(); IL != ParentL; |
1937 | IL = IL->getParentLoop()) { |
1938 | IL->getBlocksSet().erase(Ptr: PH); |
1939 | for (auto *BB : L.blocks()) |
1940 | IL->getBlocksSet().erase(Ptr: BB); |
1941 | llvm::erase_if(C&: IL->getBlocksVector(), P: [&](BasicBlock *BB) { |
1942 | return BB == PH || L.contains(BB); |
1943 | }); |
1944 | } |
1945 | |
1946 | LI.changeLoopFor(BB: PH, L: ParentL); |
1947 | L.getParentLoop()->removeChildLoop(Child: &L); |
1948 | if (ParentL) |
1949 | ParentL->addChildLoop(NewChild: &L); |
1950 | else |
1951 | LI.addTopLevelLoop(New: &L); |
1952 | } |
1953 | |
1954 | // Now we update all the blocks which are no longer within the loop. |
1955 | auto &Blocks = L.getBlocksVector(); |
1956 | auto BlocksSplitI = |
1957 | LoopBlockSet.empty() |
1958 | ? Blocks.begin() |
1959 | : std::stable_partition( |
1960 | first: Blocks.begin(), last: Blocks.end(), |
1961 | pred: [&](BasicBlock *BB) { return LoopBlockSet.count(Ptr: BB); }); |
1962 | |
1963 | // Before we erase the list of unlooped blocks, build a set of them. |
1964 | SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end()); |
1965 | if (LoopBlockSet.empty()) |
1966 | UnloopedBlocks.insert(Ptr: PH); |
1967 | |
1968 | // Now erase these blocks from the loop. |
1969 | for (auto *BB : make_range(x: BlocksSplitI, y: Blocks.end())) |
1970 | L.getBlocksSet().erase(Ptr: BB); |
1971 | Blocks.erase(first: BlocksSplitI, last: Blocks.end()); |
1972 | |
1973 | // Sort the exits in ascending loop depth, we'll work backwards across these |
1974 | // to process them inside out. |
1975 | llvm::stable_sort(Range&: ExitsInLoops, C: [&](BasicBlock *LHS, BasicBlock *RHS) { |
1976 | return LI.getLoopDepth(BB: LHS) < LI.getLoopDepth(BB: RHS); |
1977 | }); |
1978 | |
1979 | // We'll build up a set for each exit loop. |
1980 | SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks; |
1981 | Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop. |
1982 | |
1983 | auto RemoveUnloopedBlocksFromLoop = |
1984 | [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) { |
1985 | for (auto *BB : UnloopedBlocks) |
1986 | L.getBlocksSet().erase(Ptr: BB); |
1987 | llvm::erase_if(C&: L.getBlocksVector(), P: [&](BasicBlock *BB) { |
1988 | return UnloopedBlocks.count(Ptr: BB); |
1989 | }); |
1990 | }; |
1991 | |
1992 | SmallVector<BasicBlock *, 16> Worklist; |
1993 | while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) { |
1994 | assert(Worklist.empty() && "Didn't clear worklist!" ); |
1995 | assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!" ); |
1996 | |
1997 | // Grab the next exit block, in decreasing loop depth order. |
1998 | BasicBlock *ExitBB = ExitsInLoops.pop_back_val(); |
1999 | Loop &ExitL = *LI.getLoopFor(BB: ExitBB); |
2000 | assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!" ); |
2001 | |
2002 | // Erase all of the unlooped blocks from the loops between the previous |
2003 | // exit loop and this exit loop. This works because the ExitInLoops list is |
2004 | // sorted in increasing order of loop depth and thus we visit loops in |
2005 | // decreasing order of loop depth. |
2006 | for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop()) |
2007 | RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks); |
2008 | |
2009 | // Walk the CFG back until we hit the cloned PH adding everything reachable |
2010 | // and in the unlooped set to this exit block's loop. |
2011 | Worklist.push_back(Elt: ExitBB); |
2012 | do { |
2013 | BasicBlock *BB = Worklist.pop_back_val(); |
2014 | // We can stop recursing at the cloned preheader (if we get there). |
2015 | if (BB == PH) |
2016 | continue; |
2017 | |
2018 | for (BasicBlock *PredBB : predecessors(BB)) { |
2019 | // If this pred has already been moved to our set or is part of some |
2020 | // (inner) loop, no update needed. |
2021 | if (!UnloopedBlocks.erase(Ptr: PredBB)) { |
2022 | assert((NewExitLoopBlocks.count(PredBB) || |
2023 | ExitL.contains(LI.getLoopFor(PredBB))) && |
2024 | "Predecessor not in a nested loop (or already visited)!" ); |
2025 | continue; |
2026 | } |
2027 | |
2028 | // We just insert into the loop set here. We'll add these blocks to the |
2029 | // exit loop after we build up the set in a deterministic order rather |
2030 | // than the predecessor-influenced visit order. |
2031 | bool Inserted = NewExitLoopBlocks.insert(Ptr: PredBB).second; |
2032 | (void)Inserted; |
2033 | assert(Inserted && "Should only visit an unlooped block once!" ); |
2034 | |
2035 | // And recurse through to its predecessors. |
2036 | Worklist.push_back(Elt: PredBB); |
2037 | } |
2038 | } while (!Worklist.empty()); |
2039 | |
2040 | // If blocks in this exit loop were directly part of the original loop (as |
2041 | // opposed to a child loop) update the map to point to this exit loop. This |
2042 | // just updates a map and so the fact that the order is unstable is fine. |
2043 | for (auto *BB : NewExitLoopBlocks) |
2044 | if (Loop *BBL = LI.getLoopFor(BB)) |
2045 | if (BBL == &L || !L.contains(L: BBL)) |
2046 | LI.changeLoopFor(BB, L: &ExitL); |
2047 | |
2048 | // We will remove the remaining unlooped blocks from this loop in the next |
2049 | // iteration or below. |
2050 | NewExitLoopBlocks.clear(); |
2051 | } |
2052 | |
2053 | // Any remaining unlooped blocks are no longer part of any loop unless they |
2054 | // are part of some child loop. |
2055 | for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop()) |
2056 | RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks); |
2057 | for (auto *BB : UnloopedBlocks) |
2058 | if (Loop *BBL = LI.getLoopFor(BB)) |
2059 | if (BBL == &L || !L.contains(L: BBL)) |
2060 | LI.changeLoopFor(BB, L: nullptr); |
2061 | |
2062 | // Sink all the child loops whose headers are no longer in the loop set to |
2063 | // the parent (or to be top level loops). We reach into the loop and directly |
2064 | // update its subloop vector to make this batch update efficient. |
2065 | auto &SubLoops = L.getSubLoopsVector(); |
2066 | auto SubLoopsSplitI = |
2067 | LoopBlockSet.empty() |
2068 | ? SubLoops.begin() |
2069 | : std::stable_partition( |
2070 | first: SubLoops.begin(), last: SubLoops.end(), pred: [&](Loop *SubL) { |
2071 | return LoopBlockSet.count(Ptr: SubL->getHeader()); |
2072 | }); |
2073 | for (auto *HoistedL : make_range(x: SubLoopsSplitI, y: SubLoops.end())) { |
2074 | HoistedLoops.push_back(Elt: HoistedL); |
2075 | HoistedL->setParentLoop(nullptr); |
2076 | |
2077 | // To compute the new parent of this hoisted loop we look at where we |
2078 | // placed the preheader above. We can't lookup the header itself because we |
2079 | // retained the mapping from the header to the hoisted loop. But the |
2080 | // preheader and header should have the exact same new parent computed |
2081 | // based on the set of exit blocks from the original loop as the preheader |
2082 | // is a predecessor of the header and so reached in the reverse walk. And |
2083 | // because the loops were all in simplified form the preheader of the |
2084 | // hoisted loop can't be part of some *other* loop. |
2085 | if (auto *NewParentL = LI.getLoopFor(BB: HoistedL->getLoopPreheader())) |
2086 | NewParentL->addChildLoop(NewChild: HoistedL); |
2087 | else |
2088 | LI.addTopLevelLoop(New: HoistedL); |
2089 | } |
2090 | SubLoops.erase(first: SubLoopsSplitI, last: SubLoops.end()); |
2091 | |
2092 | // Actually delete the loop if nothing remained within it. |
2093 | if (Blocks.empty()) { |
2094 | assert(SubLoops.empty() && |
2095 | "Failed to remove all subloops from the original loop!" ); |
2096 | if (Loop *ParentL = L.getParentLoop()) |
2097 | ParentL->removeChildLoop(I: llvm::find(Range&: *ParentL, Val: &L)); |
2098 | else |
2099 | LI.removeLoop(I: llvm::find(Range&: LI, Val: &L)); |
2100 | // markLoopAsDeleted for L should be triggered by the caller (it is |
2101 | // typically done within postUnswitch). |
2102 | if (SE) |
2103 | SE->forgetBlockAndLoopDispositions(); |
2104 | LI.destroy(L: &L); |
2105 | return false; |
2106 | } |
2107 | |
2108 | return true; |
2109 | } |
2110 | |
2111 | /// Helper to visit a dominator subtree, invoking a callable on each node. |
2112 | /// |
2113 | /// Returning false at any point will stop walking past that node of the tree. |
2114 | template <typename CallableT> |
2115 | void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) { |
2116 | SmallVector<DomTreeNode *, 4> DomWorklist; |
2117 | DomWorklist.push_back(Elt: DT[BB]); |
2118 | #ifndef NDEBUG |
2119 | SmallPtrSet<DomTreeNode *, 4> Visited; |
2120 | Visited.insert(DT[BB]); |
2121 | #endif |
2122 | do { |
2123 | DomTreeNode *N = DomWorklist.pop_back_val(); |
2124 | |
2125 | // Visit this node. |
2126 | if (!Callable(N->getBlock())) |
2127 | continue; |
2128 | |
2129 | // Accumulate the child nodes. |
2130 | for (DomTreeNode *ChildN : *N) { |
2131 | assert(Visited.insert(ChildN).second && |
2132 | "Cannot visit a node twice when walking a tree!" ); |
2133 | DomWorklist.push_back(Elt: ChildN); |
2134 | } |
2135 | } while (!DomWorklist.empty()); |
2136 | } |
2137 | |
2138 | void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName, |
2139 | bool CurrentLoopValid, bool PartiallyInvariant, |
2140 | bool InjectedCondition, ArrayRef<Loop *> NewLoops) { |
2141 | // If we did a non-trivial unswitch, we have added new (cloned) loops. |
2142 | if (!NewLoops.empty()) |
2143 | U.addSiblingLoops(NewSibLoops: NewLoops); |
2144 | |
2145 | // If the current loop remains valid, we should revisit it to catch any |
2146 | // other unswitch opportunities. Otherwise, we need to mark it as deleted. |
2147 | if (CurrentLoopValid) { |
2148 | if (PartiallyInvariant) { |
2149 | // Mark the new loop as partially unswitched, to avoid unswitching on |
2150 | // the same condition again. |
2151 | auto &Context = L.getHeader()->getContext(); |
2152 | MDNode *DisableUnswitchMD = MDNode::get( |
2153 | Context, |
2154 | MDs: MDString::get(Context, Str: "llvm.loop.unswitch.partial.disable" )); |
2155 | MDNode *NewLoopID = makePostTransformationMetadata( |
2156 | Context, OrigLoopID: L.getLoopID(), RemovePrefixes: {"llvm.loop.unswitch.partial" }, |
2157 | AddAttrs: {DisableUnswitchMD}); |
2158 | L.setLoopID(NewLoopID); |
2159 | } else if (InjectedCondition) { |
2160 | // Do the same for injection of invariant conditions. |
2161 | auto &Context = L.getHeader()->getContext(); |
2162 | MDNode *DisableUnswitchMD = MDNode::get( |
2163 | Context, |
2164 | MDs: MDString::get(Context, Str: "llvm.loop.unswitch.injection.disable" )); |
2165 | MDNode *NewLoopID = makePostTransformationMetadata( |
2166 | Context, OrigLoopID: L.getLoopID(), RemovePrefixes: {"llvm.loop.unswitch.injection" }, |
2167 | AddAttrs: {DisableUnswitchMD}); |
2168 | L.setLoopID(NewLoopID); |
2169 | } else |
2170 | U.revisitCurrentLoop(); |
2171 | } else |
2172 | U.markLoopAsDeleted(L, Name: LoopName); |
2173 | } |
2174 | |
2175 | static void unswitchNontrivialInvariants( |
2176 | Loop &L, Instruction &TI, ArrayRef<Value *> Invariants, |
2177 | IVConditionInfo &PartialIVInfo, DominatorTree &DT, LoopInfo &LI, |
2178 | AssumptionCache &AC, ScalarEvolution *SE, MemorySSAUpdater *MSSAU, |
2179 | LPMUpdater &LoopUpdater, bool InsertFreeze, bool InjectedCondition) { |
2180 | auto *ParentBB = TI.getParent(); |
2181 | BranchInst *BI = dyn_cast<BranchInst>(Val: &TI); |
2182 | SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(Val: &TI); |
2183 | |
2184 | // Save the current loop name in a variable so that we can report it even |
2185 | // after it has been deleted. |
2186 | std::string LoopName(L.getName()); |
2187 | |
2188 | // We can only unswitch switches, conditional branches with an invariant |
2189 | // condition, or combining invariant conditions with an instruction or |
2190 | // partially invariant instructions. |
2191 | assert((SI || (BI && BI->isConditional())) && |
2192 | "Can only unswitch switches and conditional branch!" ); |
2193 | bool PartiallyInvariant = !PartialIVInfo.InstToDuplicate.empty(); |
2194 | bool FullUnswitch = |
2195 | SI || (skipTrivialSelect(Cond: BI->getCondition()) == Invariants[0] && |
2196 | !PartiallyInvariant); |
2197 | if (FullUnswitch) |
2198 | assert(Invariants.size() == 1 && |
2199 | "Cannot have other invariants with full unswitching!" ); |
2200 | else |
2201 | assert(isa<Instruction>(skipTrivialSelect(BI->getCondition())) && |
2202 | "Partial unswitching requires an instruction as the condition!" ); |
2203 | |
2204 | if (MSSAU && VerifyMemorySSA) |
2205 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2206 | |
2207 | // Constant and BBs tracking the cloned and continuing successor. When we are |
2208 | // unswitching the entire condition, this can just be trivially chosen to |
2209 | // unswitch towards `true`. However, when we are unswitching a set of |
2210 | // invariants combined with `and` or `or` or partially invariant instructions, |
2211 | // the combining operation determines the best direction to unswitch: we want |
2212 | // to unswitch the direction that will collapse the branch. |
2213 | bool Direction = true; |
2214 | int ClonedSucc = 0; |
2215 | if (!FullUnswitch) { |
2216 | Value *Cond = skipTrivialSelect(Cond: BI->getCondition()); |
2217 | (void)Cond; |
2218 | assert(((match(Cond, m_LogicalAnd()) ^ match(Cond, m_LogicalOr())) || |
2219 | PartiallyInvariant) && |
2220 | "Only `or`, `and`, an `select`, partially invariant instructions " |
2221 | "can combine invariants being unswitched." ); |
2222 | if (!match(V: Cond, P: m_LogicalOr())) { |
2223 | if (match(V: Cond, P: m_LogicalAnd()) || |
2224 | (PartiallyInvariant && !PartialIVInfo.KnownValue->isOneValue())) { |
2225 | Direction = false; |
2226 | ClonedSucc = 1; |
2227 | } |
2228 | } |
2229 | } |
2230 | |
2231 | BasicBlock *RetainedSuccBB = |
2232 | BI ? BI->getSuccessor(i: 1 - ClonedSucc) : SI->getDefaultDest(); |
2233 | SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs; |
2234 | if (BI) |
2235 | UnswitchedSuccBBs.insert(X: BI->getSuccessor(i: ClonedSucc)); |
2236 | else |
2237 | for (auto Case : SI->cases()) |
2238 | if (Case.getCaseSuccessor() != RetainedSuccBB) |
2239 | UnswitchedSuccBBs.insert(X: Case.getCaseSuccessor()); |
2240 | |
2241 | assert(!UnswitchedSuccBBs.count(RetainedSuccBB) && |
2242 | "Should not unswitch the same successor we are retaining!" ); |
2243 | |
2244 | // The branch should be in this exact loop. Any inner loop's invariant branch |
2245 | // should be handled by unswitching that inner loop. The caller of this |
2246 | // routine should filter out any candidates that remain (but were skipped for |
2247 | // whatever reason). |
2248 | assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!" ); |
2249 | |
2250 | // Compute the parent loop now before we start hacking on things. |
2251 | Loop *ParentL = L.getParentLoop(); |
2252 | // Get blocks in RPO order for MSSA update, before changing the CFG. |
2253 | LoopBlocksRPO LBRPO(&L); |
2254 | if (MSSAU) |
2255 | LBRPO.perform(LI: &LI); |
2256 | |
2257 | // Compute the outer-most loop containing one of our exit blocks. This is the |
2258 | // furthest up our loopnest which can be mutated, which we will use below to |
2259 | // update things. |
2260 | Loop *OuterExitL = &L; |
2261 | SmallVector<BasicBlock *, 4> ExitBlocks; |
2262 | L.getUniqueExitBlocks(ExitBlocks); |
2263 | for (auto *ExitBB : ExitBlocks) { |
2264 | // ExitBB can be an exit block for several levels in the loop nest. Make |
2265 | // sure we find the top most. |
2266 | Loop *NewOuterExitL = getTopMostExitingLoop(ExitBB, LI); |
2267 | if (!NewOuterExitL) { |
2268 | // We exited the entire nest with this block, so we're done. |
2269 | OuterExitL = nullptr; |
2270 | break; |
2271 | } |
2272 | if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(L: OuterExitL)) |
2273 | OuterExitL = NewOuterExitL; |
2274 | } |
2275 | |
2276 | // At this point, we're definitely going to unswitch something so invalidate |
2277 | // any cached information in ScalarEvolution for the outer most loop |
2278 | // containing an exit block and all nested loops. |
2279 | if (SE) { |
2280 | if (OuterExitL) |
2281 | SE->forgetLoop(L: OuterExitL); |
2282 | else |
2283 | SE->forgetTopmostLoop(L: &L); |
2284 | SE->forgetBlockAndLoopDispositions(); |
2285 | } |
2286 | |
2287 | // If the edge from this terminator to a successor dominates that successor, |
2288 | // store a map from each block in its dominator subtree to it. This lets us |
2289 | // tell when cloning for a particular successor if a block is dominated by |
2290 | // some *other* successor with a single data structure. We use this to |
2291 | // significantly reduce cloning. |
2292 | SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc; |
2293 | for (auto *SuccBB : llvm::concat<BasicBlock *const>(Ranges: ArrayRef(RetainedSuccBB), |
2294 | Ranges&: UnswitchedSuccBBs)) |
2295 | if (SuccBB->getUniquePredecessor() || |
2296 | llvm::all_of(Range: predecessors(BB: SuccBB), P: [&](BasicBlock *PredBB) { |
2297 | return PredBB == ParentBB || DT.dominates(A: SuccBB, B: PredBB); |
2298 | })) |
2299 | visitDomSubTree(DT, BB: SuccBB, Callable: [&](BasicBlock *BB) { |
2300 | DominatingSucc[BB] = SuccBB; |
2301 | return true; |
2302 | }); |
2303 | |
2304 | // Split the preheader, so that we know that there is a safe place to insert |
2305 | // the conditional branch. We will change the preheader to have a conditional |
2306 | // branch on LoopCond. The original preheader will become the split point |
2307 | // between the unswitched versions, and we will have a new preheader for the |
2308 | // original loop. |
2309 | BasicBlock *SplitBB = L.getLoopPreheader(); |
2310 | BasicBlock *LoopPH = SplitEdge(From: SplitBB, To: L.getHeader(), DT: &DT, LI: &LI, MSSAU); |
2311 | |
2312 | // Keep track of the dominator tree updates needed. |
2313 | SmallVector<DominatorTree::UpdateType, 4> DTUpdates; |
2314 | |
2315 | // Clone the loop for each unswitched successor. |
2316 | SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; |
2317 | VMaps.reserve(N: UnswitchedSuccBBs.size()); |
2318 | SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs; |
2319 | for (auto *SuccBB : UnswitchedSuccBBs) { |
2320 | VMaps.emplace_back(Args: new ValueToValueMapTy()); |
2321 | ClonedPHs[SuccBB] = buildClonedLoopBlocks( |
2322 | L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB: SuccBB, ContinueSuccBB: RetainedSuccBB, |
2323 | DominatingSucc, VMap&: *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU, SE); |
2324 | } |
2325 | |
2326 | // Drop metadata if we may break its semantics by moving this instr into the |
2327 | // split block. |
2328 | if (TI.getMetadata(KindID: LLVMContext::MD_make_implicit)) { |
2329 | if (DropNonTrivialImplicitNullChecks) |
2330 | // Do not spend time trying to understand if we can keep it, just drop it |
2331 | // to save compile time. |
2332 | TI.setMetadata(KindID: LLVMContext::MD_make_implicit, Node: nullptr); |
2333 | else { |
2334 | // It is only legal to preserve make.implicit metadata if we are |
2335 | // guaranteed no reach implicit null check after following this branch. |
2336 | ICFLoopSafetyInfo SafetyInfo; |
2337 | SafetyInfo.computeLoopSafetyInfo(CurLoop: &L); |
2338 | if (!SafetyInfo.isGuaranteedToExecute(Inst: TI, DT: &DT, CurLoop: &L)) |
2339 | TI.setMetadata(KindID: LLVMContext::MD_make_implicit, Node: nullptr); |
2340 | } |
2341 | } |
2342 | |
2343 | // The stitching of the branched code back together depends on whether we're |
2344 | // doing full unswitching or not with the exception that we always want to |
2345 | // nuke the initial terminator placed in the split block. |
2346 | SplitBB->getTerminator()->eraseFromParent(); |
2347 | if (FullUnswitch) { |
2348 | // Keep a clone of the terminator for MSSA updates. |
2349 | Instruction *NewTI = TI.clone(); |
2350 | NewTI->insertInto(ParentBB, It: ParentBB->end()); |
2351 | |
2352 | // Splice the terminator from the original loop and rewrite its |
2353 | // successors. |
2354 | TI.moveBefore(BB&: *SplitBB, I: SplitBB->end()); |
2355 | TI.dropLocation(); |
2356 | |
2357 | // First wire up the moved terminator to the preheaders. |
2358 | if (BI) { |
2359 | BasicBlock *ClonedPH = ClonedPHs.begin()->second; |
2360 | BI->setSuccessor(idx: ClonedSucc, NewSucc: ClonedPH); |
2361 | BI->setSuccessor(idx: 1 - ClonedSucc, NewSucc: LoopPH); |
2362 | Value *Cond = skipTrivialSelect(Cond: BI->getCondition()); |
2363 | if (InsertFreeze) { |
2364 | // We don't give any debug location to the new freeze, because the |
2365 | // BI (`dyn_cast<BranchInst>(TI)`) is an in-loop instruction hoisted |
2366 | // out of the loop. |
2367 | Cond = new FreezeInst(Cond, Cond->getName() + ".fr" , BI->getIterator()); |
2368 | } |
2369 | BI->setCondition(Cond); |
2370 | DTUpdates.push_back(Elt: {DominatorTree::Insert, SplitBB, ClonedPH}); |
2371 | } else { |
2372 | assert(SI && "Must either be a branch or switch!" ); |
2373 | |
2374 | // Walk the cases and directly update their successors. |
2375 | assert(SI->getDefaultDest() == RetainedSuccBB && |
2376 | "Not retaining default successor!" ); |
2377 | SI->setDefaultDest(LoopPH); |
2378 | for (const auto &Case : SI->cases()) |
2379 | if (Case.getCaseSuccessor() == RetainedSuccBB) |
2380 | Case.setSuccessor(LoopPH); |
2381 | else |
2382 | Case.setSuccessor(ClonedPHs.find(Val: Case.getCaseSuccessor())->second); |
2383 | |
2384 | if (InsertFreeze) |
2385 | SI->setCondition(new FreezeInst(SI->getCondition(), |
2386 | SI->getCondition()->getName() + ".fr" , |
2387 | SI->getIterator())); |
2388 | |
2389 | // We need to use the set to populate domtree updates as even when there |
2390 | // are multiple cases pointing at the same successor we only want to |
2391 | // remove and insert one edge in the domtree. |
2392 | for (BasicBlock *SuccBB : UnswitchedSuccBBs) |
2393 | DTUpdates.push_back( |
2394 | Elt: {DominatorTree::Insert, SplitBB, ClonedPHs.find(Val: SuccBB)->second}); |
2395 | } |
2396 | |
2397 | if (MSSAU) { |
2398 | DT.applyUpdates(Updates: DTUpdates); |
2399 | DTUpdates.clear(); |
2400 | |
2401 | // Remove all but one edge to the retained block and all unswitched |
2402 | // blocks. This is to avoid having duplicate entries in the cloned Phis, |
2403 | // when we know we only keep a single edge for each case. |
2404 | MSSAU->removeDuplicatePhiEdgesBetween(From: ParentBB, To: RetainedSuccBB); |
2405 | for (BasicBlock *SuccBB : UnswitchedSuccBBs) |
2406 | MSSAU->removeDuplicatePhiEdgesBetween(From: ParentBB, To: SuccBB); |
2407 | |
2408 | for (auto &VMap : VMaps) |
2409 | MSSAU->updateForClonedLoop(LoopBlocks: LBRPO, ExitBlocks, VM: *VMap, |
2410 | /*IgnoreIncomingWithNoClones=*/true); |
2411 | MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT); |
2412 | |
2413 | // Remove all edges to unswitched blocks. |
2414 | for (BasicBlock *SuccBB : UnswitchedSuccBBs) |
2415 | MSSAU->removeEdge(From: ParentBB, To: SuccBB); |
2416 | } |
2417 | |
2418 | // Now unhook the successor relationship as we'll be replacing |
2419 | // the terminator with a direct branch. This is much simpler for branches |
2420 | // than switches so we handle those first. |
2421 | if (BI) { |
2422 | // Remove the parent as a predecessor of the unswitched successor. |
2423 | assert(UnswitchedSuccBBs.size() == 1 && |
2424 | "Only one possible unswitched block for a branch!" ); |
2425 | BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin(); |
2426 | UnswitchedSuccBB->removePredecessor(Pred: ParentBB, |
2427 | /*KeepOneInputPHIs*/ true); |
2428 | DTUpdates.push_back(Elt: {DominatorTree::Delete, ParentBB, UnswitchedSuccBB}); |
2429 | } else { |
2430 | // Note that we actually want to remove the parent block as a predecessor |
2431 | // of *every* case successor. The case successor is either unswitched, |
2432 | // completely eliminating an edge from the parent to that successor, or it |
2433 | // is a duplicate edge to the retained successor as the retained successor |
2434 | // is always the default successor and as we'll replace this with a direct |
2435 | // branch we no longer need the duplicate entries in the PHI nodes. |
2436 | SwitchInst *NewSI = cast<SwitchInst>(Val: NewTI); |
2437 | assert(NewSI->getDefaultDest() == RetainedSuccBB && |
2438 | "Not retaining default successor!" ); |
2439 | for (const auto &Case : NewSI->cases()) |
2440 | Case.getCaseSuccessor()->removePredecessor( |
2441 | Pred: ParentBB, |
2442 | /*KeepOneInputPHIs*/ true); |
2443 | |
2444 | // We need to use the set to populate domtree updates as even when there |
2445 | // are multiple cases pointing at the same successor we only want to |
2446 | // remove and insert one edge in the domtree. |
2447 | for (BasicBlock *SuccBB : UnswitchedSuccBBs) |
2448 | DTUpdates.push_back(Elt: {DominatorTree::Delete, ParentBB, SuccBB}); |
2449 | } |
2450 | |
2451 | // Create a new unconditional branch to the continuing block (as opposed to |
2452 | // the one cloned). |
2453 | Instruction *NewBI = BranchInst::Create(IfTrue: RetainedSuccBB, InsertBefore: ParentBB); |
2454 | NewBI->setDebugLoc(NewTI->getDebugLoc()); |
2455 | |
2456 | // After MSSAU update, remove the cloned terminator instruction NewTI. |
2457 | NewTI->eraseFromParent(); |
2458 | } else { |
2459 | assert(BI && "Only branches have partial unswitching." ); |
2460 | assert(UnswitchedSuccBBs.size() == 1 && |
2461 | "Only one possible unswitched block for a branch!" ); |
2462 | BasicBlock *ClonedPH = ClonedPHs.begin()->second; |
2463 | // When doing a partial unswitch, we have to do a bit more work to build up |
2464 | // the branch in the split block. |
2465 | if (PartiallyInvariant) |
2466 | buildPartialInvariantUnswitchConditionalBranch( |
2467 | BB&: *SplitBB, ToDuplicate: Invariants, Direction, UnswitchedSucc&: *ClonedPH, NormalSucc&: *LoopPH, L, MSSAU); |
2468 | else { |
2469 | buildPartialUnswitchConditionalBranch( |
2470 | BB&: *SplitBB, Invariants, Direction, UnswitchedSucc&: *ClonedPH, NormalSucc&: *LoopPH, |
2471 | InsertFreeze: FreezeLoopUnswitchCond, I: BI, AC: &AC, DT); |
2472 | } |
2473 | DTUpdates.push_back(Elt: {DominatorTree::Insert, SplitBB, ClonedPH}); |
2474 | |
2475 | if (MSSAU) { |
2476 | DT.applyUpdates(Updates: DTUpdates); |
2477 | DTUpdates.clear(); |
2478 | |
2479 | // Perform MSSA cloning updates. |
2480 | for (auto &VMap : VMaps) |
2481 | MSSAU->updateForClonedLoop(LoopBlocks: LBRPO, ExitBlocks, VM: *VMap, |
2482 | /*IgnoreIncomingWithNoClones=*/true); |
2483 | MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT); |
2484 | } |
2485 | } |
2486 | |
2487 | // Apply the updates accumulated above to get an up-to-date dominator tree. |
2488 | DT.applyUpdates(Updates: DTUpdates); |
2489 | |
2490 | // Now that we have an accurate dominator tree, first delete the dead cloned |
2491 | // blocks so that we can accurately build any cloned loops. It is important to |
2492 | // not delete the blocks from the original loop yet because we still want to |
2493 | // reference the original loop to understand the cloned loop's structure. |
2494 | deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT, MSSAU); |
2495 | |
2496 | // Build the cloned loop structure itself. This may be substantially |
2497 | // different from the original structure due to the simplified CFG. This also |
2498 | // handles inserting all the cloned blocks into the correct loops. |
2499 | SmallVector<Loop *, 4> NonChildClonedLoops; |
2500 | for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps) |
2501 | buildClonedLoops(OrigL&: L, ExitBlocks, VMap: *VMap, LI, NonChildClonedLoops); |
2502 | |
2503 | // Now that our cloned loops have been built, we can update the original loop. |
2504 | // First we delete the dead blocks from it and then we rebuild the loop |
2505 | // structure taking these deletions into account. |
2506 | deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU, SE, LoopUpdater); |
2507 | |
2508 | if (MSSAU && VerifyMemorySSA) |
2509 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2510 | |
2511 | SmallVector<Loop *, 4> HoistedLoops; |
2512 | bool IsStillLoop = |
2513 | rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops, SE); |
2514 | |
2515 | if (MSSAU && VerifyMemorySSA) |
2516 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2517 | |
2518 | // This transformation has a high risk of corrupting the dominator tree, and |
2519 | // the below steps to rebuild loop structures will result in hard to debug |
2520 | // errors in that case so verify that the dominator tree is sane first. |
2521 | // FIXME: Remove this when the bugs stop showing up and rely on existing |
2522 | // verification steps. |
2523 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
2524 | |
2525 | if (BI && !PartiallyInvariant) { |
2526 | // If we unswitched a branch which collapses the condition to a known |
2527 | // constant we want to replace all the uses of the invariants within both |
2528 | // the original and cloned blocks. We do this here so that we can use the |
2529 | // now updated dominator tree to identify which side the users are on. |
2530 | assert(UnswitchedSuccBBs.size() == 1 && |
2531 | "Only one possible unswitched block for a branch!" ); |
2532 | BasicBlock *ClonedPH = ClonedPHs.begin()->second; |
2533 | |
2534 | // When considering multiple partially-unswitched invariants |
2535 | // we cant just go replace them with constants in both branches. |
2536 | // |
2537 | // For 'AND' we infer that true branch ("continue") means true |
2538 | // for each invariant operand. |
2539 | // For 'OR' we can infer that false branch ("continue") means false |
2540 | // for each invariant operand. |
2541 | // So it happens that for multiple-partial case we dont replace |
2542 | // in the unswitched branch. |
2543 | bool ReplaceUnswitched = |
2544 | FullUnswitch || (Invariants.size() == 1) || PartiallyInvariant; |
2545 | |
2546 | ConstantInt *UnswitchedReplacement = |
2547 | Direction ? ConstantInt::getTrue(Context&: BI->getContext()) |
2548 | : ConstantInt::getFalse(Context&: BI->getContext()); |
2549 | ConstantInt *ContinueReplacement = |
2550 | Direction ? ConstantInt::getFalse(Context&: BI->getContext()) |
2551 | : ConstantInt::getTrue(Context&: BI->getContext()); |
2552 | for (Value *Invariant : Invariants) { |
2553 | assert(!isa<Constant>(Invariant) && |
2554 | "Should not be replacing constant values!" ); |
2555 | // Use make_early_inc_range here as set invalidates the iterator. |
2556 | for (Use &U : llvm::make_early_inc_range(Range: Invariant->uses())) { |
2557 | Instruction *UserI = dyn_cast<Instruction>(Val: U.getUser()); |
2558 | if (!UserI) |
2559 | continue; |
2560 | |
2561 | // Replace it with the 'continue' side if in the main loop body, and the |
2562 | // unswitched if in the cloned blocks. |
2563 | if (DT.dominates(A: LoopPH, B: UserI->getParent())) |
2564 | U.set(ContinueReplacement); |
2565 | else if (ReplaceUnswitched && |
2566 | DT.dominates(A: ClonedPH, B: UserI->getParent())) |
2567 | U.set(UnswitchedReplacement); |
2568 | } |
2569 | } |
2570 | } |
2571 | |
2572 | // We can change which blocks are exit blocks of all the cloned sibling |
2573 | // loops, the current loop, and any parent loops which shared exit blocks |
2574 | // with the current loop. As a consequence, we need to re-form LCSSA for |
2575 | // them. But we shouldn't need to re-form LCSSA for any child loops. |
2576 | // FIXME: This could be made more efficient by tracking which exit blocks are |
2577 | // new, and focusing on them, but that isn't likely to be necessary. |
2578 | // |
2579 | // In order to reasonably rebuild LCSSA we need to walk inside-out across the |
2580 | // loop nest and update every loop that could have had its exits changed. We |
2581 | // also need to cover any intervening loops. We add all of these loops to |
2582 | // a list and sort them by loop depth to achieve this without updating |
2583 | // unnecessary loops. |
2584 | auto UpdateLoop = [&](Loop &UpdateL) { |
2585 | #ifndef NDEBUG |
2586 | UpdateL.verifyLoop(); |
2587 | for (Loop *ChildL : UpdateL) { |
2588 | ChildL->verifyLoop(); |
2589 | assert(ChildL->isRecursivelyLCSSAForm(DT, LI) && |
2590 | "Perturbed a child loop's LCSSA form!" ); |
2591 | } |
2592 | #endif |
2593 | // First build LCSSA for this loop so that we can preserve it when |
2594 | // forming dedicated exits. We don't want to perturb some other loop's |
2595 | // LCSSA while doing that CFG edit. |
2596 | formLCSSA(L&: UpdateL, DT, LI: &LI, SE); |
2597 | |
2598 | // For loops reached by this loop's original exit blocks we may |
2599 | // introduced new, non-dedicated exits. At least try to re-form dedicated |
2600 | // exits for these loops. This may fail if they couldn't have dedicated |
2601 | // exits to start with. |
2602 | formDedicatedExitBlocks(L: &UpdateL, DT: &DT, LI: &LI, MSSAU, /*PreserveLCSSA*/ true); |
2603 | }; |
2604 | |
2605 | // For non-child cloned loops and hoisted loops, we just need to update LCSSA |
2606 | // and we can do it in any order as they don't nest relative to each other. |
2607 | // |
2608 | // Also check if any of the loops we have updated have become top-level loops |
2609 | // as that will necessitate widening the outer loop scope. |
2610 | for (Loop *UpdatedL : |
2611 | llvm::concat<Loop *>(Ranges&: NonChildClonedLoops, Ranges&: HoistedLoops)) { |
2612 | UpdateLoop(*UpdatedL); |
2613 | if (UpdatedL->isOutermost()) |
2614 | OuterExitL = nullptr; |
2615 | } |
2616 | if (IsStillLoop) { |
2617 | UpdateLoop(L); |
2618 | if (L.isOutermost()) |
2619 | OuterExitL = nullptr; |
2620 | } |
2621 | |
2622 | // If the original loop had exit blocks, walk up through the outer most loop |
2623 | // of those exit blocks to update LCSSA and form updated dedicated exits. |
2624 | if (OuterExitL != &L) |
2625 | for (Loop *OuterL = ParentL; OuterL != OuterExitL; |
2626 | OuterL = OuterL->getParentLoop()) |
2627 | UpdateLoop(*OuterL); |
2628 | |
2629 | #ifndef NDEBUG |
2630 | // Verify the entire loop structure to catch any incorrect updates before we |
2631 | // progress in the pass pipeline. |
2632 | LI.verify(DT); |
2633 | #endif |
2634 | |
2635 | // Now that we've unswitched something, make callbacks to report the changes. |
2636 | // For that we need to merge together the updated loops and the cloned loops |
2637 | // and check whether the original loop survived. |
2638 | SmallVector<Loop *, 4> SibLoops; |
2639 | for (Loop *UpdatedL : llvm::concat<Loop *>(Ranges&: NonChildClonedLoops, Ranges&: HoistedLoops)) |
2640 | if (UpdatedL->getParentLoop() == ParentL) |
2641 | SibLoops.push_back(Elt: UpdatedL); |
2642 | postUnswitch(L, U&: LoopUpdater, LoopName, CurrentLoopValid: IsStillLoop, PartiallyInvariant, |
2643 | InjectedCondition, NewLoops: SibLoops); |
2644 | |
2645 | if (MSSAU && VerifyMemorySSA) |
2646 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2647 | |
2648 | if (BI) |
2649 | ++NumBranches; |
2650 | else |
2651 | ++NumSwitches; |
2652 | } |
2653 | |
2654 | /// Recursively compute the cost of a dominator subtree based on the per-block |
2655 | /// cost map provided. |
2656 | /// |
2657 | /// The recursive computation is memozied into the provided DT-indexed cost map |
2658 | /// to allow querying it for most nodes in the domtree without it becoming |
2659 | /// quadratic. |
2660 | static InstructionCost computeDomSubtreeCost( |
2661 | DomTreeNode &N, |
2662 | const SmallDenseMap<BasicBlock *, InstructionCost, 4> &BBCostMap, |
2663 | SmallDenseMap<DomTreeNode *, InstructionCost, 4> &DTCostMap) { |
2664 | // Don't accumulate cost (or recurse through) blocks not in our block cost |
2665 | // map and thus not part of the duplication cost being considered. |
2666 | auto BBCostIt = BBCostMap.find(Val: N.getBlock()); |
2667 | if (BBCostIt == BBCostMap.end()) |
2668 | return 0; |
2669 | |
2670 | // Lookup this node to see if we already computed its cost. |
2671 | auto DTCostIt = DTCostMap.find(Val: &N); |
2672 | if (DTCostIt != DTCostMap.end()) |
2673 | return DTCostIt->second; |
2674 | |
2675 | // If not, we have to compute it. We can't use insert above and update |
2676 | // because computing the cost may insert more things into the map. |
2677 | InstructionCost Cost = std::accumulate( |
2678 | first: N.begin(), last: N.end(), init: BBCostIt->second, |
2679 | binary_op: [&](InstructionCost Sum, DomTreeNode *ChildN) -> InstructionCost { |
2680 | return Sum + computeDomSubtreeCost(N&: *ChildN, BBCostMap, DTCostMap); |
2681 | }); |
2682 | bool Inserted = DTCostMap.insert(KV: {&N, Cost}).second; |
2683 | (void)Inserted; |
2684 | assert(Inserted && "Should not insert a node while visiting children!" ); |
2685 | return Cost; |
2686 | } |
2687 | |
2688 | /// Turns a select instruction into implicit control flow branch, |
2689 | /// making the following replacement: |
2690 | /// |
2691 | /// head: |
2692 | /// --code before select-- |
2693 | /// select %cond, %trueval, %falseval |
2694 | /// --code after select-- |
2695 | /// |
2696 | /// into |
2697 | /// |
2698 | /// head: |
2699 | /// --code before select-- |
2700 | /// br i1 %cond, label %then, label %tail |
2701 | /// |
2702 | /// then: |
2703 | /// br %tail |
2704 | /// |
2705 | /// tail: |
2706 | /// phi [ %trueval, %then ], [ %falseval, %head] |
2707 | /// unreachable |
2708 | /// |
2709 | /// It also makes all relevant DT and LI updates, so that all structures are in |
2710 | /// valid state after this transform. |
2711 | static BranchInst *turnSelectIntoBranch(SelectInst *SI, DominatorTree &DT, |
2712 | LoopInfo &LI, MemorySSAUpdater *MSSAU, |
2713 | AssumptionCache *AC) { |
2714 | LLVM_DEBUG(dbgs() << "Turning " << *SI << " into a branch.\n" ); |
2715 | BasicBlock *HeadBB = SI->getParent(); |
2716 | |
2717 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); |
2718 | SplitBlockAndInsertIfThen(Cond: SI->getCondition(), SplitBefore: SI, Unreachable: false, |
2719 | BranchWeights: SI->getMetadata(KindID: LLVMContext::MD_prof), DTU: &DTU, LI: &LI); |
2720 | auto *CondBr = cast<BranchInst>(Val: HeadBB->getTerminator()); |
2721 | BasicBlock *ThenBB = CondBr->getSuccessor(i: 0), |
2722 | *TailBB = CondBr->getSuccessor(i: 1); |
2723 | if (MSSAU) |
2724 | MSSAU->moveAllAfterSpliceBlocks(From: HeadBB, To: TailBB, Start: SI); |
2725 | |
2726 | PHINode *Phi = |
2727 | PHINode::Create(Ty: SI->getType(), NumReservedValues: 2, NameStr: "unswitched.select" , InsertBefore: SI->getIterator()); |
2728 | Phi->addIncoming(V: SI->getTrueValue(), BB: ThenBB); |
2729 | Phi->addIncoming(V: SI->getFalseValue(), BB: HeadBB); |
2730 | Phi->setDebugLoc(SI->getDebugLoc()); |
2731 | SI->replaceAllUsesWith(V: Phi); |
2732 | SI->eraseFromParent(); |
2733 | |
2734 | if (MSSAU && VerifyMemorySSA) |
2735 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2736 | |
2737 | ++NumSelects; |
2738 | return CondBr; |
2739 | } |
2740 | |
2741 | /// Turns a llvm.experimental.guard intrinsic into implicit control flow branch, |
2742 | /// making the following replacement: |
2743 | /// |
2744 | /// --code before guard-- |
2745 | /// call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] |
2746 | /// --code after guard-- |
2747 | /// |
2748 | /// into |
2749 | /// |
2750 | /// --code before guard-- |
2751 | /// br i1 %cond, label %guarded, label %deopt |
2752 | /// |
2753 | /// guarded: |
2754 | /// --code after guard-- |
2755 | /// |
2756 | /// deopt: |
2757 | /// call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] |
2758 | /// unreachable |
2759 | /// |
2760 | /// It also makes all relevant DT and LI updates, so that all structures are in |
2761 | /// valid state after this transform. |
2762 | static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, |
2763 | DominatorTree &DT, LoopInfo &LI, |
2764 | MemorySSAUpdater *MSSAU) { |
2765 | SmallVector<DominatorTree::UpdateType, 4> DTUpdates; |
2766 | LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n" ); |
2767 | BasicBlock *CheckBB = GI->getParent(); |
2768 | |
2769 | if (MSSAU && VerifyMemorySSA) |
2770 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2771 | |
2772 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); |
2773 | Instruction *DeoptBlockTerm = |
2774 | SplitBlockAndInsertIfThen(Cond: GI->getArgOperand(i: 0), SplitBefore: GI, Unreachable: true, |
2775 | BranchWeights: GI->getMetadata(KindID: LLVMContext::MD_prof), DTU: &DTU, LI: &LI); |
2776 | BranchInst *CheckBI = cast<BranchInst>(Val: CheckBB->getTerminator()); |
2777 | // SplitBlockAndInsertIfThen inserts control flow that branches to |
2778 | // DeoptBlockTerm if the condition is true. We want the opposite. |
2779 | CheckBI->swapSuccessors(); |
2780 | |
2781 | BasicBlock *GuardedBlock = CheckBI->getSuccessor(i: 0); |
2782 | GuardedBlock->setName("guarded" ); |
2783 | CheckBI->getSuccessor(i: 1)->setName("deopt" ); |
2784 | BasicBlock *DeoptBlock = CheckBI->getSuccessor(i: 1); |
2785 | |
2786 | if (MSSAU) |
2787 | MSSAU->moveAllAfterSpliceBlocks(From: CheckBB, To: GuardedBlock, Start: GI); |
2788 | |
2789 | GI->moveBefore(MovePos: DeoptBlockTerm); |
2790 | GI->setArgOperand(i: 0, v: ConstantInt::getFalse(Context&: GI->getContext())); |
2791 | |
2792 | if (MSSAU) { |
2793 | MemoryDef *MD = cast<MemoryDef>(Val: MSSAU->getMemorySSA()->getMemoryAccess(I: GI)); |
2794 | MSSAU->moveToPlace(What: MD, BB: DeoptBlock, Where: MemorySSA::BeforeTerminator); |
2795 | if (VerifyMemorySSA) |
2796 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2797 | } |
2798 | |
2799 | if (VerifyLoopInfo) |
2800 | LI.verify(DomTree: DT); |
2801 | ++NumGuards; |
2802 | return CheckBI; |
2803 | } |
2804 | |
2805 | /// Cost multiplier is a way to limit potentially exponential behavior |
2806 | /// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch |
2807 | /// candidates available. Also accounting for the number of "sibling" loops with |
2808 | /// the idea to account for previous unswitches that already happened on this |
2809 | /// cluster of loops. There was an attempt to keep this formula simple, |
2810 | /// just enough to limit the worst case behavior. Even if it is not that simple |
2811 | /// now it is still not an attempt to provide a detailed heuristic size |
2812 | /// prediction. |
2813 | /// |
2814 | /// TODO: Make a proper accounting of "explosion" effect for all kinds of |
2815 | /// unswitch candidates, making adequate predictions instead of wild guesses. |
2816 | /// That requires knowing not just the number of "remaining" candidates but |
2817 | /// also costs of unswitching for each of these candidates. |
2818 | static int CalculateUnswitchCostMultiplier( |
2819 | const Instruction &TI, const Loop &L, const LoopInfo &LI, |
2820 | const DominatorTree &DT, |
2821 | ArrayRef<NonTrivialUnswitchCandidate> UnswitchCandidates) { |
2822 | |
2823 | // Guards and other exiting conditions do not contribute to exponential |
2824 | // explosion as soon as they dominate the latch (otherwise there might be |
2825 | // another path to the latch remaining that does not allow to eliminate the |
2826 | // loop copy on unswitch). |
2827 | const BasicBlock *Latch = L.getLoopLatch(); |
2828 | const BasicBlock *CondBlock = TI.getParent(); |
2829 | if (DT.dominates(A: CondBlock, B: Latch) && |
2830 | (isGuard(U: &TI) || |
2831 | (TI.isTerminator() && |
2832 | llvm::count_if(Range: successors(I: &TI), P: [&L](const BasicBlock *SuccBB) { |
2833 | return L.contains(BB: SuccBB); |
2834 | }) <= 1))) { |
2835 | NumCostMultiplierSkipped++; |
2836 | return 1; |
2837 | } |
2838 | |
2839 | auto *ParentL = L.getParentLoop(); |
2840 | int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size() |
2841 | : std::distance(first: LI.begin(), last: LI.end())); |
2842 | // Count amount of clones that all the candidates might cause during |
2843 | // unswitching. Branch/guard/select counts as 1, switch counts as log2 of its |
2844 | // cases. |
2845 | int UnswitchedClones = 0; |
2846 | for (const auto &Candidate : UnswitchCandidates) { |
2847 | const Instruction *CI = Candidate.TI; |
2848 | const BasicBlock *CondBlock = CI->getParent(); |
2849 | bool SkipExitingSuccessors = DT.dominates(A: CondBlock, B: Latch); |
2850 | if (isa<SelectInst>(Val: CI)) { |
2851 | UnswitchedClones++; |
2852 | continue; |
2853 | } |
2854 | if (isGuard(U: CI)) { |
2855 | if (!SkipExitingSuccessors) |
2856 | UnswitchedClones++; |
2857 | continue; |
2858 | } |
2859 | int NonExitingSuccessors = |
2860 | llvm::count_if(Range: successors(BB: CondBlock), |
2861 | P: [SkipExitingSuccessors, &L](const BasicBlock *SuccBB) { |
2862 | return !SkipExitingSuccessors || L.contains(BB: SuccBB); |
2863 | }); |
2864 | UnswitchedClones += Log2_32(Value: NonExitingSuccessors); |
2865 | } |
2866 | |
2867 | // Ignore up to the "unscaled candidates" number of unswitch candidates |
2868 | // when calculating the power-of-two scaling of the cost. The main idea |
2869 | // with this control is to allow a small number of unswitches to happen |
2870 | // and rely more on siblings multiplier (see below) when the number |
2871 | // of candidates is small. |
2872 | unsigned ClonesPower = |
2873 | std::max(a: UnswitchedClones - (int)UnswitchNumInitialUnscaledCandidates, b: 0); |
2874 | |
2875 | // Allowing top-level loops to spread a bit more than nested ones. |
2876 | int SiblingsMultiplier = |
2877 | std::max(a: (ParentL ? SiblingsCount |
2878 | : SiblingsCount / (int)UnswitchSiblingsToplevelDiv), |
2879 | b: 1); |
2880 | // Compute the cost multiplier in a way that won't overflow by saturating |
2881 | // at an upper bound. |
2882 | int CostMultiplier; |
2883 | if (ClonesPower > Log2_32(Value: UnswitchThreshold) || |
2884 | SiblingsMultiplier > UnswitchThreshold) |
2885 | CostMultiplier = UnswitchThreshold; |
2886 | else |
2887 | CostMultiplier = std::min(a: SiblingsMultiplier * (1 << ClonesPower), |
2888 | b: (int)UnswitchThreshold); |
2889 | |
2890 | LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier |
2891 | << " (siblings " << SiblingsMultiplier << " * clones " |
2892 | << (1 << ClonesPower) << ")" |
2893 | << " for unswitch candidate: " << TI << "\n" ); |
2894 | return CostMultiplier; |
2895 | } |
2896 | |
2897 | static bool collectUnswitchCandidates( |
2898 | SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, |
2899 | IVConditionInfo &PartialIVInfo, Instruction *&PartialIVCondBranch, |
2900 | const Loop &L, const LoopInfo &LI, AAResults &AA, |
2901 | const MemorySSAUpdater *MSSAU) { |
2902 | assert(UnswitchCandidates.empty() && "Should be!" ); |
2903 | |
2904 | auto AddUnswitchCandidatesForInst = [&](Instruction *I, Value *Cond) { |
2905 | Cond = skipTrivialSelect(Cond); |
2906 | if (isa<Constant>(Val: Cond)) |
2907 | return; |
2908 | if (L.isLoopInvariant(V: Cond)) { |
2909 | UnswitchCandidates.push_back(Elt: {I, {Cond}}); |
2910 | return; |
2911 | } |
2912 | if (match(V: Cond, P: m_CombineOr(L: m_LogicalAnd(), R: m_LogicalOr()))) { |
2913 | TinyPtrVector<Value *> Invariants = |
2914 | collectHomogenousInstGraphLoopInvariants( |
2915 | L, Root&: *static_cast<Instruction *>(Cond), LI); |
2916 | if (!Invariants.empty()) |
2917 | UnswitchCandidates.push_back(Elt: {I, std::move(Invariants)}); |
2918 | } |
2919 | }; |
2920 | |
2921 | // Whether or not we should also collect guards in the loop. |
2922 | bool CollectGuards = false; |
2923 | if (UnswitchGuards) { |
2924 | auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction( |
2925 | Name: Intrinsic::getName(id: Intrinsic::experimental_guard)); |
2926 | if (GuardDecl && !GuardDecl->use_empty()) |
2927 | CollectGuards = true; |
2928 | } |
2929 | |
2930 | for (auto *BB : L.blocks()) { |
2931 | if (LI.getLoopFor(BB) != &L) |
2932 | continue; |
2933 | |
2934 | for (auto &I : *BB) { |
2935 | if (auto *SI = dyn_cast<SelectInst>(Val: &I)) { |
2936 | auto *Cond = SI->getCondition(); |
2937 | // Do not unswitch vector selects and logical and/or selects |
2938 | if (Cond->getType()->isIntegerTy(Bitwidth: 1) && !SI->getType()->isIntegerTy(Bitwidth: 1)) |
2939 | AddUnswitchCandidatesForInst(SI, Cond); |
2940 | } else if (CollectGuards && isGuard(U: &I)) { |
2941 | auto *Cond = |
2942 | skipTrivialSelect(Cond: cast<IntrinsicInst>(Val: &I)->getArgOperand(i: 0)); |
2943 | // TODO: Support AND, OR conditions and partial unswitching. |
2944 | if (!isa<Constant>(Val: Cond) && L.isLoopInvariant(V: Cond)) |
2945 | UnswitchCandidates.push_back(Elt: {&I, {Cond}}); |
2946 | } |
2947 | } |
2948 | |
2949 | if (auto *SI = dyn_cast<SwitchInst>(Val: BB->getTerminator())) { |
2950 | // We can only consider fully loop-invariant switch conditions as we need |
2951 | // to completely eliminate the switch after unswitching. |
2952 | if (!isa<Constant>(Val: SI->getCondition()) && |
2953 | L.isLoopInvariant(V: SI->getCondition()) && !BB->getUniqueSuccessor()) |
2954 | UnswitchCandidates.push_back(Elt: {SI, {SI->getCondition()}}); |
2955 | continue; |
2956 | } |
2957 | |
2958 | auto *BI = dyn_cast<BranchInst>(Val: BB->getTerminator()); |
2959 | if (!BI || !BI->isConditional() || |
2960 | BI->getSuccessor(i: 0) == BI->getSuccessor(i: 1)) |
2961 | continue; |
2962 | |
2963 | AddUnswitchCandidatesForInst(BI, BI->getCondition()); |
2964 | } |
2965 | |
2966 | if (MSSAU && !findOptionMDForLoop(TheLoop: &L, Name: "llvm.loop.unswitch.partial.disable" ) && |
2967 | !any_of(Range&: UnswitchCandidates, P: [&L](auto &TerminatorAndInvariants) { |
2968 | return TerminatorAndInvariants.TI == L.getHeader()->getTerminator(); |
2969 | })) { |
2970 | MemorySSA *MSSA = MSSAU->getMemorySSA(); |
2971 | if (auto Info = hasPartialIVCondition(L, MSSAThreshold, MSSA: *MSSA, AA)) { |
2972 | LLVM_DEBUG( |
2973 | dbgs() << "simple-loop-unswitch: Found partially invariant condition " |
2974 | << *Info->InstToDuplicate[0] << "\n" ); |
2975 | PartialIVInfo = *Info; |
2976 | PartialIVCondBranch = L.getHeader()->getTerminator(); |
2977 | TinyPtrVector<Value *> ValsToDuplicate; |
2978 | llvm::append_range(C&: ValsToDuplicate, R&: Info->InstToDuplicate); |
2979 | UnswitchCandidates.push_back( |
2980 | Elt: {L.getHeader()->getTerminator(), std::move(ValsToDuplicate)}); |
2981 | } |
2982 | } |
2983 | return !UnswitchCandidates.empty(); |
2984 | } |
2985 | |
2986 | /// Tries to canonicalize condition described by: |
2987 | /// |
2988 | /// br (LHS pred RHS), label IfTrue, label IfFalse |
2989 | /// |
2990 | /// into its equivalent where `Pred` is something that we support for injected |
2991 | /// invariants (so far it is limited to ult), LHS in canonicalized form is |
2992 | /// non-invariant and RHS is an invariant. |
2993 | static void canonicalizeForInvariantConditionInjection( |
2994 | ICmpInst::Predicate &Pred, Value *&LHS, Value *&RHS, BasicBlock *&IfTrue, |
2995 | BasicBlock *&IfFalse, const Loop &L) { |
2996 | if (!L.contains(BB: IfTrue)) { |
2997 | Pred = ICmpInst::getInversePredicate(pred: Pred); |
2998 | std::swap(a&: IfTrue, b&: IfFalse); |
2999 | } |
3000 | |
3001 | // Move loop-invariant argument to RHS position. |
3002 | if (L.isLoopInvariant(V: LHS)) { |
3003 | Pred = ICmpInst::getSwappedPredicate(pred: Pred); |
3004 | std::swap(a&: LHS, b&: RHS); |
3005 | } |
3006 | |
3007 | if (Pred == ICmpInst::ICMP_SGE && match(V: RHS, P: m_Zero())) { |
3008 | // Turn "x >=s 0" into "x <u UMIN_INT" |
3009 | Pred = ICmpInst::ICMP_ULT; |
3010 | RHS = ConstantInt::get( |
3011 | Context&: RHS->getContext(), |
3012 | V: APInt::getSignedMinValue(numBits: RHS->getType()->getIntegerBitWidth())); |
3013 | } |
3014 | } |
3015 | |
3016 | /// Returns true, if predicate described by ( \p Pred, \p LHS, \p RHS ) |
3017 | /// succeeding into blocks ( \p IfTrue, \p IfFalse) can be optimized by |
3018 | /// injecting a loop-invariant condition. |
3019 | static bool shouldTryInjectInvariantCondition( |
3020 | const ICmpInst::Predicate Pred, const Value *LHS, const Value *RHS, |
3021 | const BasicBlock *IfTrue, const BasicBlock *IfFalse, const Loop &L) { |
3022 | if (L.isLoopInvariant(V: LHS) || !L.isLoopInvariant(V: RHS)) |
3023 | return false; |
3024 | // TODO: Support other predicates. |
3025 | if (Pred != ICmpInst::ICMP_ULT) |
3026 | return false; |
3027 | // TODO: Support non-loop-exiting branches? |
3028 | if (!L.contains(BB: IfTrue) || L.contains(BB: IfFalse)) |
3029 | return false; |
3030 | // FIXME: For some reason this causes problems with MSSA updates, need to |
3031 | // investigate why. So far, just don't unswitch latch. |
3032 | if (L.getHeader() == IfTrue) |
3033 | return false; |
3034 | return true; |
3035 | } |
3036 | |
3037 | /// Returns true, if metadata on \p BI allows us to optimize branching into \p |
3038 | /// TakenSucc via injection of invariant conditions. The branch should be not |
3039 | /// enough and not previously unswitched, the information about this comes from |
3040 | /// the metadata. |
3041 | bool shouldTryInjectBasingOnMetadata(const BranchInst *BI, |
3042 | const BasicBlock *TakenSucc) { |
3043 | SmallVector<uint32_t> Weights; |
3044 | if (!extractBranchWeights(I: *BI, Weights)) |
3045 | return false; |
3046 | unsigned T = InjectInvariantConditionHotnesThreshold; |
3047 | BranchProbability LikelyTaken(T - 1, T); |
3048 | |
3049 | assert(Weights.size() == 2 && "Unexpected profile data!" ); |
3050 | size_t Idx = BI->getSuccessor(i: 0) == TakenSucc ? 0 : 1; |
3051 | auto Num = Weights[Idx]; |
3052 | auto Denom = Weights[0] + Weights[1]; |
3053 | // Degenerate or overflowed metadata. |
3054 | if (Denom == 0 || Num > Denom) |
3055 | return false; |
3056 | BranchProbability ActualTaken(Num, Denom); |
3057 | if (LikelyTaken > ActualTaken) |
3058 | return false; |
3059 | return true; |
3060 | } |
3061 | |
3062 | /// Materialize pending invariant condition of the given candidate into IR. The |
3063 | /// injected loop-invariant condition implies the original loop-variant branch |
3064 | /// condition, so the materialization turns |
3065 | /// |
3066 | /// loop_block: |
3067 | /// ... |
3068 | /// br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc |
3069 | /// |
3070 | /// into |
3071 | /// |
3072 | /// preheader: |
3073 | /// %invariant_cond = LHS pred RHS |
3074 | /// ... |
3075 | /// loop_block: |
3076 | /// br i1 %invariant_cond, label InLoopSucc, label OriginalCheck |
3077 | /// OriginalCheck: |
3078 | /// br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc |
3079 | /// ... |
3080 | static NonTrivialUnswitchCandidate |
3081 | injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, |
3082 | DominatorTree &DT, LoopInfo &LI, |
3083 | AssumptionCache &AC, MemorySSAUpdater *MSSAU) { |
3084 | assert(Candidate.hasPendingInjection() && "Nothing to inject!" ); |
3085 | BasicBlock * = L.getLoopPreheader(); |
3086 | assert(Preheader && "Loop is not in simplified form?" ); |
3087 | assert(LI.getLoopFor(Candidate.TI->getParent()) == &L && |
3088 | "Unswitching branch of inner loop!" ); |
3089 | |
3090 | auto Pred = Candidate.PendingInjection->Pred; |
3091 | auto *LHS = Candidate.PendingInjection->LHS; |
3092 | auto *RHS = Candidate.PendingInjection->RHS; |
3093 | auto *InLoopSucc = Candidate.PendingInjection->InLoopSucc; |
3094 | auto *TI = cast<BranchInst>(Val: Candidate.TI); |
3095 | auto *BB = Candidate.TI->getParent(); |
3096 | auto *OutOfLoopSucc = InLoopSucc == TI->getSuccessor(i: 0) ? TI->getSuccessor(i: 1) |
3097 | : TI->getSuccessor(i: 0); |
3098 | // FIXME: Remove this once limitation on successors is lifted. |
3099 | assert(L.contains(InLoopSucc) && "Not supported yet!" ); |
3100 | assert(!L.contains(OutOfLoopSucc) && "Not supported yet!" ); |
3101 | auto &Ctx = BB->getContext(); |
3102 | |
3103 | IRBuilder<> Builder(Preheader->getTerminator()); |
3104 | assert(ICmpInst::isUnsigned(Pred) && "Not supported yet!" ); |
3105 | if (LHS->getType() != RHS->getType()) { |
3106 | if (LHS->getType()->getIntegerBitWidth() < |
3107 | RHS->getType()->getIntegerBitWidth()) |
3108 | LHS = Builder.CreateZExt(V: LHS, DestTy: RHS->getType(), Name: LHS->getName() + ".wide" ); |
3109 | else |
3110 | RHS = Builder.CreateZExt(V: RHS, DestTy: LHS->getType(), Name: RHS->getName() + ".wide" ); |
3111 | } |
3112 | // Do not use builder here: CreateICmp may simplify this into a constant and |
3113 | // unswitching will break. Better optimize it away later. |
3114 | auto *InjectedCond = |
3115 | ICmpInst::Create(Op: Instruction::ICmp, Pred, S1: LHS, S2: RHS, Name: "injected.cond" , |
3116 | InsertBefore: Preheader->getTerminator()->getIterator()); |
3117 | |
3118 | BasicBlock *CheckBlock = BasicBlock::Create(Context&: Ctx, Name: BB->getName() + ".check" , |
3119 | Parent: BB->getParent(), InsertBefore: InLoopSucc); |
3120 | Builder.SetInsertPoint(TI); |
3121 | auto *InvariantBr = |
3122 | Builder.CreateCondBr(Cond: InjectedCond, True: InLoopSucc, False: CheckBlock); |
3123 | |
3124 | Builder.SetInsertPoint(CheckBlock); |
3125 | Builder.CreateCondBr(Cond: TI->getCondition(), True: TI->getSuccessor(i: 0), |
3126 | False: TI->getSuccessor(i: 1)); |
3127 | TI->eraseFromParent(); |
3128 | |
3129 | // Fixup phis. |
3130 | for (auto &I : *InLoopSucc) { |
3131 | auto *PN = dyn_cast<PHINode>(Val: &I); |
3132 | if (!PN) |
3133 | break; |
3134 | auto *Inc = PN->getIncomingValueForBlock(BB); |
3135 | PN->addIncoming(V: Inc, BB: CheckBlock); |
3136 | } |
3137 | OutOfLoopSucc->replacePhiUsesWith(Old: BB, New: CheckBlock); |
3138 | |
3139 | SmallVector<DominatorTree::UpdateType, 4> DTUpdates = { |
3140 | { DominatorTree::Insert, BB, CheckBlock }, |
3141 | { DominatorTree::Insert, CheckBlock, InLoopSucc }, |
3142 | { DominatorTree::Insert, CheckBlock, OutOfLoopSucc }, |
3143 | { DominatorTree::Delete, BB, OutOfLoopSucc } |
3144 | }; |
3145 | |
3146 | DT.applyUpdates(Updates: DTUpdates); |
3147 | if (MSSAU) |
3148 | MSSAU->applyUpdates(Updates: DTUpdates, DT); |
3149 | L.addBasicBlockToLoop(NewBB: CheckBlock, LI); |
3150 | |
3151 | #ifndef NDEBUG |
3152 | DT.verify(); |
3153 | LI.verify(DT); |
3154 | if (MSSAU && VerifyMemorySSA) |
3155 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
3156 | #endif |
3157 | |
3158 | // TODO: In fact, cost of unswitching a new invariant candidate is *slightly* |
3159 | // higher because we have just inserted a new block. Need to think how to |
3160 | // adjust the cost of injected candidates when it was first computed. |
3161 | LLVM_DEBUG(dbgs() << "Injected a new loop-invariant branch " << *InvariantBr |
3162 | << " and considering it for unswitching." ); |
3163 | ++NumInvariantConditionsInjected; |
3164 | return NonTrivialUnswitchCandidate(InvariantBr, { InjectedCond }, |
3165 | Candidate.Cost); |
3166 | } |
3167 | |
3168 | /// Given chain of loop branch conditions looking like: |
3169 | /// br (Variant < Invariant1) |
3170 | /// br (Variant < Invariant2) |
3171 | /// br (Variant < Invariant3) |
3172 | /// ... |
3173 | /// collect set of invariant conditions on which we want to unswitch, which |
3174 | /// look like: |
3175 | /// Invariant1 <= Invariant2 |
3176 | /// Invariant2 <= Invariant3 |
3177 | /// ... |
3178 | /// Though they might not immediately exist in the IR, we can still inject them. |
3179 | static bool insertCandidatesWithPendingInjections( |
3180 | SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, Loop &L, |
3181 | ICmpInst::Predicate Pred, ArrayRef<CompareDesc> Compares, |
3182 | const DominatorTree &DT) { |
3183 | |
3184 | assert(ICmpInst::isRelational(Pred)); |
3185 | assert(ICmpInst::isStrictPredicate(Pred)); |
3186 | if (Compares.size() < 2) |
3187 | return false; |
3188 | ICmpInst::Predicate NonStrictPred = ICmpInst::getNonStrictPredicate(pred: Pred); |
3189 | for (auto Prev = Compares.begin(), Next = Compares.begin() + 1; |
3190 | Next != Compares.end(); ++Prev, ++Next) { |
3191 | Value *LHS = Next->Invariant; |
3192 | Value *RHS = Prev->Invariant; |
3193 | BasicBlock *InLoopSucc = Prev->InLoopSucc; |
3194 | InjectedInvariant ToInject(NonStrictPred, LHS, RHS, InLoopSucc); |
3195 | NonTrivialUnswitchCandidate Candidate(Prev->Term, { LHS, RHS }, |
3196 | std::nullopt, std::move(ToInject)); |
3197 | UnswitchCandidates.push_back(Elt: std::move(Candidate)); |
3198 | } |
3199 | return true; |
3200 | } |
3201 | |
3202 | /// Collect unswitch candidates by invariant conditions that are not immediately |
3203 | /// present in the loop. However, they can be injected into the code if we |
3204 | /// decide it's profitable. |
3205 | /// An example of such conditions is following: |
3206 | /// |
3207 | /// for (...) { |
3208 | /// x = load ... |
3209 | /// if (! x <u C1) break; |
3210 | /// if (! x <u C2) break; |
3211 | /// <do something> |
3212 | /// } |
3213 | /// |
3214 | /// We can unswitch by condition "C1 <=u C2". If that is true, then "x <u C1 <= |
3215 | /// C2" automatically implies "x <u C2", so we can get rid of one of |
3216 | /// loop-variant checks in unswitched loop version. |
3217 | static bool collectUnswitchCandidatesWithInjections( |
3218 | SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, |
3219 | IVConditionInfo &PartialIVInfo, Instruction *&PartialIVCondBranch, Loop &L, |
3220 | const DominatorTree &DT, const LoopInfo &LI, AAResults &AA, |
3221 | const MemorySSAUpdater *MSSAU) { |
3222 | if (!InjectInvariantConditions) |
3223 | return false; |
3224 | |
3225 | if (!DT.isReachableFromEntry(A: L.getHeader())) |
3226 | return false; |
3227 | auto *Latch = L.getLoopLatch(); |
3228 | // Need to have a single latch and a preheader. |
3229 | if (!Latch) |
3230 | return false; |
3231 | assert(L.getLoopPreheader() && "Must have a preheader!" ); |
3232 | |
3233 | DenseMap<Value *, SmallVector<CompareDesc, 4> > CandidatesULT; |
3234 | // Traverse the conditions that dominate latch (and therefore dominate each |
3235 | // other). |
3236 | for (auto *DTN = DT.getNode(BB: Latch); L.contains(BB: DTN->getBlock()); |
3237 | DTN = DTN->getIDom()) { |
3238 | ICmpInst::Predicate Pred; |
3239 | Value *LHS = nullptr, *RHS = nullptr; |
3240 | BasicBlock *IfTrue = nullptr, *IfFalse = nullptr; |
3241 | auto *BB = DTN->getBlock(); |
3242 | // Ignore inner loops. |
3243 | if (LI.getLoopFor(BB) != &L) |
3244 | continue; |
3245 | auto *Term = BB->getTerminator(); |
3246 | if (!match(V: Term, P: m_Br(C: m_ICmp(Pred, L: m_Value(V&: LHS), R: m_Value(V&: RHS)), |
3247 | T: m_BasicBlock(V&: IfTrue), F: m_BasicBlock(V&: IfFalse)))) |
3248 | continue; |
3249 | if (!LHS->getType()->isIntegerTy()) |
3250 | continue; |
3251 | canonicalizeForInvariantConditionInjection(Pred, LHS, RHS, IfTrue, IfFalse, |
3252 | L); |
3253 | if (!shouldTryInjectInvariantCondition(Pred, LHS, RHS, IfTrue, IfFalse, L)) |
3254 | continue; |
3255 | if (!shouldTryInjectBasingOnMetadata(BI: cast<BranchInst>(Val: Term), TakenSucc: IfTrue)) |
3256 | continue; |
3257 | // Strip ZEXT for unsigned predicate. |
3258 | // TODO: once signed predicates are supported, also strip SEXT. |
3259 | CompareDesc Desc(cast<BranchInst>(Val: Term), RHS, IfTrue); |
3260 | while (auto *Zext = dyn_cast<ZExtInst>(Val: LHS)) |
3261 | LHS = Zext->getOperand(i_nocapture: 0); |
3262 | CandidatesULT[LHS].push_back(Elt: Desc); |
3263 | } |
3264 | |
3265 | bool Found = false; |
3266 | for (auto &It : CandidatesULT) |
3267 | Found |= insertCandidatesWithPendingInjections( |
3268 | UnswitchCandidates, L, Pred: ICmpInst::ICMP_ULT, Compares: It.second, DT); |
3269 | return Found; |
3270 | } |
3271 | |
3272 | static bool isSafeForNoNTrivialUnswitching(Loop &L, LoopInfo &LI) { |
3273 | if (!L.isSafeToClone()) |
3274 | return false; |
3275 | for (auto *BB : L.blocks()) |
3276 | for (auto &I : *BB) { |
3277 | if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) |
3278 | return false; |
3279 | if (auto *CB = dyn_cast<CallBase>(Val: &I)) { |
3280 | assert(!CB->cannotDuplicate() && "Checked by L.isSafeToClone()." ); |
3281 | if (CB->isConvergent()) |
3282 | return false; |
3283 | } |
3284 | } |
3285 | |
3286 | // Check if there are irreducible CFG cycles in this loop. If so, we cannot |
3287 | // easily unswitch non-trivial edges out of the loop. Doing so might turn the |
3288 | // irreducible control flow into reducible control flow and introduce new |
3289 | // loops "out of thin air". If we ever discover important use cases for doing |
3290 | // this, we can add support to loop unswitch, but it is a lot of complexity |
3291 | // for what seems little or no real world benefit. |
3292 | LoopBlocksRPO RPOT(&L); |
3293 | RPOT.perform(LI: &LI); |
3294 | if (containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI)) |
3295 | return false; |
3296 | |
3297 | SmallVector<BasicBlock *, 4> ExitBlocks; |
3298 | L.getUniqueExitBlocks(ExitBlocks); |
3299 | // We cannot unswitch if exit blocks contain a cleanuppad/catchswitch |
3300 | // instruction as we don't know how to split those exit blocks. |
3301 | // FIXME: We should teach SplitBlock to handle this and remove this |
3302 | // restriction. |
3303 | for (auto *ExitBB : ExitBlocks) { |
3304 | auto *I = ExitBB->getFirstNonPHI(); |
3305 | if (isa<CleanupPadInst>(Val: I) || isa<CatchSwitchInst>(Val: I)) { |
3306 | LLVM_DEBUG(dbgs() << "Cannot unswitch because of cleanuppad/catchswitch " |
3307 | "in exit block\n" ); |
3308 | return false; |
3309 | } |
3310 | } |
3311 | |
3312 | return true; |
3313 | } |
3314 | |
3315 | static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate( |
3316 | ArrayRef<NonTrivialUnswitchCandidate> UnswitchCandidates, const Loop &L, |
3317 | const DominatorTree &DT, const LoopInfo &LI, AssumptionCache &AC, |
3318 | const TargetTransformInfo &TTI, const IVConditionInfo &PartialIVInfo) { |
3319 | // Given that unswitching these terminators will require duplicating parts of |
3320 | // the loop, so we need to be able to model that cost. Compute the ephemeral |
3321 | // values and set up a data structure to hold per-BB costs. We cache each |
3322 | // block's cost so that we don't recompute this when considering different |
3323 | // subsets of the loop for duplication during unswitching. |
3324 | SmallPtrSet<const Value *, 4> EphValues; |
3325 | CodeMetrics::collectEphemeralValues(L: &L, AC: &AC, EphValues); |
3326 | SmallDenseMap<BasicBlock *, InstructionCost, 4> BBCostMap; |
3327 | |
3328 | // Compute the cost of each block, as well as the total loop cost. Also, bail |
3329 | // out if we see instructions which are incompatible with loop unswitching |
3330 | // (convergent, noduplicate, or cross-basic-block tokens). |
3331 | // FIXME: We might be able to safely handle some of these in non-duplicated |
3332 | // regions. |
3333 | TargetTransformInfo::TargetCostKind CostKind = |
3334 | L.getHeader()->getParent()->hasMinSize() |
3335 | ? TargetTransformInfo::TCK_CodeSize |
3336 | : TargetTransformInfo::TCK_SizeAndLatency; |
3337 | InstructionCost LoopCost = 0; |
3338 | for (auto *BB : L.blocks()) { |
3339 | InstructionCost Cost = 0; |
3340 | for (auto &I : *BB) { |
3341 | if (EphValues.count(Ptr: &I)) |
3342 | continue; |
3343 | Cost += TTI.getInstructionCost(U: &I, CostKind); |
3344 | } |
3345 | assert(Cost >= 0 && "Must not have negative costs!" ); |
3346 | LoopCost += Cost; |
3347 | assert(LoopCost >= 0 && "Must not have negative loop costs!" ); |
3348 | BBCostMap[BB] = Cost; |
3349 | } |
3350 | LLVM_DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n" ); |
3351 | |
3352 | // Now we find the best candidate by searching for the one with the following |
3353 | // properties in order: |
3354 | // |
3355 | // 1) An unswitching cost below the threshold |
3356 | // 2) The smallest number of duplicated unswitch candidates (to avoid |
3357 | // creating redundant subsequent unswitching) |
3358 | // 3) The smallest cost after unswitching. |
3359 | // |
3360 | // We prioritize reducing fanout of unswitch candidates provided the cost |
3361 | // remains below the threshold because this has a multiplicative effect. |
3362 | // |
3363 | // This requires memoizing each dominator subtree to avoid redundant work. |
3364 | // |
3365 | // FIXME: Need to actually do the number of candidates part above. |
3366 | SmallDenseMap<DomTreeNode *, InstructionCost, 4> DTCostMap; |
3367 | // Given a terminator which might be unswitched, computes the non-duplicated |
3368 | // cost for that terminator. |
3369 | auto ComputeUnswitchedCost = [&](Instruction &TI, |
3370 | bool FullUnswitch) -> InstructionCost { |
3371 | // Unswitching selects unswitches the entire loop. |
3372 | if (isa<SelectInst>(Val: TI)) |
3373 | return LoopCost; |
3374 | |
3375 | BasicBlock &BB = *TI.getParent(); |
3376 | SmallPtrSet<BasicBlock *, 4> Visited; |
3377 | |
3378 | InstructionCost Cost = 0; |
3379 | for (BasicBlock *SuccBB : successors(BB: &BB)) { |
3380 | // Don't count successors more than once. |
3381 | if (!Visited.insert(Ptr: SuccBB).second) |
3382 | continue; |
3383 | |
3384 | // If this is a partial unswitch candidate, then it must be a conditional |
3385 | // branch with a condition of either `or`, `and`, their corresponding |
3386 | // select forms or partially invariant instructions. In that case, one of |
3387 | // the successors is necessarily duplicated, so don't even try to remove |
3388 | // its cost. |
3389 | if (!FullUnswitch) { |
3390 | auto &BI = cast<BranchInst>(Val&: TI); |
3391 | Value *Cond = skipTrivialSelect(Cond: BI.getCondition()); |
3392 | if (match(V: Cond, P: m_LogicalAnd())) { |
3393 | if (SuccBB == BI.getSuccessor(i: 1)) |
3394 | continue; |
3395 | } else if (match(V: Cond, P: m_LogicalOr())) { |
3396 | if (SuccBB == BI.getSuccessor(i: 0)) |
3397 | continue; |
3398 | } else if ((PartialIVInfo.KnownValue->isOneValue() && |
3399 | SuccBB == BI.getSuccessor(i: 0)) || |
3400 | (!PartialIVInfo.KnownValue->isOneValue() && |
3401 | SuccBB == BI.getSuccessor(i: 1))) |
3402 | continue; |
3403 | } |
3404 | |
3405 | // This successor's domtree will not need to be duplicated after |
3406 | // unswitching if the edge to the successor dominates it (and thus the |
3407 | // entire tree). This essentially means there is no other path into this |
3408 | // subtree and so it will end up live in only one clone of the loop. |
3409 | if (SuccBB->getUniquePredecessor() || |
3410 | llvm::all_of(Range: predecessors(BB: SuccBB), P: [&](BasicBlock *PredBB) { |
3411 | return PredBB == &BB || DT.dominates(A: SuccBB, B: PredBB); |
3412 | })) { |
3413 | Cost += computeDomSubtreeCost(N&: *DT[SuccBB], BBCostMap, DTCostMap); |
3414 | assert(Cost <= LoopCost && |
3415 | "Non-duplicated cost should never exceed total loop cost!" ); |
3416 | } |
3417 | } |
3418 | |
3419 | // Now scale the cost by the number of unique successors minus one. We |
3420 | // subtract one because there is already at least one copy of the entire |
3421 | // loop. This is computing the new cost of unswitching a condition. |
3422 | // Note that guards always have 2 unique successors that are implicit and |
3423 | // will be materialized if we decide to unswitch it. |
3424 | int SuccessorsCount = isGuard(U: &TI) ? 2 : Visited.size(); |
3425 | assert(SuccessorsCount > 1 && |
3426 | "Cannot unswitch a condition without multiple distinct successors!" ); |
3427 | return (LoopCost - Cost) * (SuccessorsCount - 1); |
3428 | }; |
3429 | |
3430 | std::optional<NonTrivialUnswitchCandidate> Best; |
3431 | for (auto &Candidate : UnswitchCandidates) { |
3432 | Instruction &TI = *Candidate.TI; |
3433 | ArrayRef<Value *> Invariants = Candidate.Invariants; |
3434 | BranchInst *BI = dyn_cast<BranchInst>(Val: &TI); |
3435 | bool FullUnswitch = |
3436 | !BI || Candidate.hasPendingInjection() || |
3437 | (Invariants.size() == 1 && |
3438 | Invariants[0] == skipTrivialSelect(Cond: BI->getCondition())); |
3439 | InstructionCost CandidateCost = ComputeUnswitchedCost(TI, FullUnswitch); |
3440 | // Calculate cost multiplier which is a tool to limit potentially |
3441 | // exponential behavior of loop-unswitch. |
3442 | if (EnableUnswitchCostMultiplier) { |
3443 | int CostMultiplier = |
3444 | CalculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates); |
3445 | assert( |
3446 | (CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) && |
3447 | "cost multiplier needs to be in the range of 1..UnswitchThreshold" ); |
3448 | CandidateCost *= CostMultiplier; |
3449 | LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost |
3450 | << " (multiplier: " << CostMultiplier << ")" |
3451 | << " for unswitch candidate: " << TI << "\n" ); |
3452 | } else { |
3453 | LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost |
3454 | << " for unswitch candidate: " << TI << "\n" ); |
3455 | } |
3456 | |
3457 | if (!Best || CandidateCost < Best->Cost) { |
3458 | Best = Candidate; |
3459 | Best->Cost = CandidateCost; |
3460 | } |
3461 | } |
3462 | assert(Best && "Must be!" ); |
3463 | return *Best; |
3464 | } |
3465 | |
3466 | // Insert a freeze on an unswitched branch if all is true: |
3467 | // 1. freeze-loop-unswitch-cond option is true |
3468 | // 2. The branch may not execute in the loop pre-transformation. If a branch may |
3469 | // not execute and could cause UB, it would always cause UB if it is hoisted outside |
3470 | // of the loop. Insert a freeze to prevent this case. |
3471 | // 3. The branch condition may be poison or undef |
3472 | static bool shouldInsertFreeze(Loop &L, Instruction &TI, DominatorTree &DT, |
3473 | AssumptionCache &AC) { |
3474 | assert(isa<BranchInst>(TI) || isa<SwitchInst>(TI)); |
3475 | if (!FreezeLoopUnswitchCond) |
3476 | return false; |
3477 | |
3478 | ICFLoopSafetyInfo SafetyInfo; |
3479 | SafetyInfo.computeLoopSafetyInfo(CurLoop: &L); |
3480 | if (SafetyInfo.isGuaranteedToExecute(Inst: TI, DT: &DT, CurLoop: &L)) |
3481 | return false; |
3482 | |
3483 | Value *Cond; |
3484 | if (BranchInst *BI = dyn_cast<BranchInst>(Val: &TI)) |
3485 | Cond = skipTrivialSelect(Cond: BI->getCondition()); |
3486 | else |
3487 | Cond = skipTrivialSelect(Cond: cast<SwitchInst>(Val: &TI)->getCondition()); |
3488 | return !isGuaranteedNotToBeUndefOrPoison( |
3489 | V: Cond, AC: &AC, CtxI: L.getLoopPreheader()->getTerminator(), DT: &DT); |
3490 | } |
3491 | |
3492 | static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, |
3493 | AssumptionCache &AC, AAResults &AA, |
3494 | TargetTransformInfo &TTI, ScalarEvolution *SE, |
3495 | MemorySSAUpdater *MSSAU, |
3496 | LPMUpdater &LoopUpdater) { |
3497 | // Collect all invariant conditions within this loop (as opposed to an inner |
3498 | // loop which would be handled when visiting that inner loop). |
3499 | SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates; |
3500 | IVConditionInfo PartialIVInfo; |
3501 | Instruction *PartialIVCondBranch = nullptr; |
3502 | collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, |
3503 | PartialIVCondBranch, L, LI, AA, MSSAU); |
3504 | if (!findOptionMDForLoop(TheLoop: &L, Name: "llvm.loop.unswitch.injection.disable" )) |
3505 | collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo, |
3506 | PartialIVCondBranch, L, DT, LI, AA, |
3507 | MSSAU); |
3508 | // If we didn't find any candidates, we're done. |
3509 | if (UnswitchCandidates.empty()) |
3510 | return false; |
3511 | |
3512 | LLVM_DEBUG( |
3513 | dbgs() << "Considering " << UnswitchCandidates.size() |
3514 | << " non-trivial loop invariant conditions for unswitching.\n" ); |
3515 | |
3516 | NonTrivialUnswitchCandidate Best = findBestNonTrivialUnswitchCandidate( |
3517 | UnswitchCandidates, L, DT, LI, AC, TTI, PartialIVInfo); |
3518 | |
3519 | assert(Best.TI && "Failed to find loop unswitch candidate" ); |
3520 | assert(Best.Cost && "Failed to compute cost" ); |
3521 | |
3522 | if (*Best.Cost >= UnswitchThreshold) { |
3523 | LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << *Best.Cost |
3524 | << "\n" ); |
3525 | return false; |
3526 | } |
3527 | |
3528 | bool InjectedCondition = false; |
3529 | if (Best.hasPendingInjection()) { |
3530 | Best = injectPendingInvariantConditions(Candidate: Best, L, DT, LI, AC, MSSAU); |
3531 | InjectedCondition = true; |
3532 | } |
3533 | assert(!Best.hasPendingInjection() && |
3534 | "All injections should have been done by now!" ); |
3535 | |
3536 | if (Best.TI != PartialIVCondBranch) |
3537 | PartialIVInfo.InstToDuplicate.clear(); |
3538 | |
3539 | bool InsertFreeze; |
3540 | if (auto *SI = dyn_cast<SelectInst>(Val: Best.TI)) { |
3541 | // If the best candidate is a select, turn it into a branch. Select |
3542 | // instructions with a poison conditional do not propagate poison, but |
3543 | // branching on poison causes UB. Insert a freeze on the select |
3544 | // conditional to prevent UB after turning the select into a branch. |
3545 | InsertFreeze = !isGuaranteedNotToBeUndefOrPoison( |
3546 | V: SI->getCondition(), AC: &AC, CtxI: L.getLoopPreheader()->getTerminator(), DT: &DT); |
3547 | Best.TI = turnSelectIntoBranch(SI, DT, LI, MSSAU, AC: &AC); |
3548 | } else { |
3549 | // If the best candidate is a guard, turn it into a branch. |
3550 | if (isGuard(U: Best.TI)) |
3551 | Best.TI = |
3552 | turnGuardIntoBranch(GI: cast<IntrinsicInst>(Val: Best.TI), L, DT, LI, MSSAU); |
3553 | InsertFreeze = shouldInsertFreeze(L, TI&: *Best.TI, DT, AC); |
3554 | } |
3555 | |
3556 | LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = " << Best.Cost |
3557 | << ") terminator: " << *Best.TI << "\n" ); |
3558 | unswitchNontrivialInvariants(L, TI&: *Best.TI, Invariants: Best.Invariants, PartialIVInfo, DT, |
3559 | LI, AC, SE, MSSAU, LoopUpdater, InsertFreeze, |
3560 | InjectedCondition); |
3561 | return true; |
3562 | } |
3563 | |
3564 | /// Unswitch control flow predicated on loop invariant conditions. |
3565 | /// |
3566 | /// This first hoists all branches or switches which are trivial (IE, do not |
3567 | /// require duplicating any part of the loop) out of the loop body. It then |
3568 | /// looks at other loop invariant control flows and tries to unswitch those as |
3569 | /// well by cloning the loop if the result is small enough. |
3570 | /// |
3571 | /// The `DT`, `LI`, `AC`, `AA`, `TTI` parameters are required analyses that are |
3572 | /// also updated based on the unswitch. The `MSSA` analysis is also updated if |
3573 | /// valid (i.e. its use is enabled). |
3574 | /// |
3575 | /// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is |
3576 | /// true, we will attempt to do non-trivial unswitching as well as trivial |
3577 | /// unswitching. |
3578 | /// |
3579 | /// The `postUnswitch` function will be run after unswitching is complete |
3580 | /// with information on whether or not the provided loop remains a loop and |
3581 | /// a list of new sibling loops created. |
3582 | /// |
3583 | /// If `SE` is non-null, we will update that analysis based on the unswitching |
3584 | /// done. |
3585 | static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, |
3586 | AssumptionCache &AC, AAResults &AA, |
3587 | TargetTransformInfo &TTI, bool Trivial, |
3588 | bool NonTrivial, ScalarEvolution *SE, |
3589 | MemorySSAUpdater *MSSAU, ProfileSummaryInfo *PSI, |
3590 | BlockFrequencyInfo *BFI, LPMUpdater &LoopUpdater) { |
3591 | assert(L.isRecursivelyLCSSAForm(DT, LI) && |
3592 | "Loops must be in LCSSA form before unswitching." ); |
3593 | |
3594 | // Must be in loop simplified form: we need a preheader and dedicated exits. |
3595 | if (!L.isLoopSimplifyForm()) |
3596 | return false; |
3597 | |
3598 | // Try trivial unswitch first before loop over other basic blocks in the loop. |
3599 | if (Trivial && unswitchAllTrivialConditions(L, DT, LI, SE, MSSAU)) { |
3600 | // If we unswitched successfully we will want to clean up the loop before |
3601 | // processing it further so just mark it as unswitched and return. |
3602 | postUnswitch(L, U&: LoopUpdater, LoopName: L.getName(), |
3603 | /*CurrentLoopValid*/ true, /*PartiallyInvariant*/ false, |
3604 | /*InjectedCondition*/ false, NewLoops: {}); |
3605 | return true; |
3606 | } |
3607 | |
3608 | const Function *F = L.getHeader()->getParent(); |
3609 | |
3610 | // Check whether we should continue with non-trivial conditions. |
3611 | // EnableNonTrivialUnswitch: Global variable that forces non-trivial |
3612 | // unswitching for testing and debugging. |
3613 | // NonTrivial: Parameter that enables non-trivial unswitching for this |
3614 | // invocation of the transform. But this should be allowed only |
3615 | // for targets without branch divergence. |
3616 | // |
3617 | // FIXME: If divergence analysis becomes available to a loop |
3618 | // transform, we should allow unswitching for non-trivial uniform |
3619 | // branches even on targets that have divergence. |
3620 | // https://bugs.llvm.org/show_bug.cgi?id=48819 |
3621 | bool ContinueWithNonTrivial = |
3622 | EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence(F)); |
3623 | if (!ContinueWithNonTrivial) |
3624 | return false; |
3625 | |
3626 | // Skip non-trivial unswitching for optsize functions. |
3627 | if (F->hasOptSize()) |
3628 | return false; |
3629 | |
3630 | // Returns true if Loop L's loop nest is cold, i.e. if the headers of L, |
3631 | // of the loops L is nested in, and of the loops nested in L are all cold. |
3632 | auto IsLoopNestCold = [&](const Loop *L) { |
3633 | // Check L and all of its parent loops. |
3634 | auto *Parent = L; |
3635 | while (Parent) { |
3636 | if (!PSI->isColdBlock(BB: Parent->getHeader(), BFI)) |
3637 | return false; |
3638 | Parent = Parent->getParentLoop(); |
3639 | } |
3640 | // Next check all loops nested within L. |
3641 | SmallVector<const Loop *, 4> Worklist; |
3642 | Worklist.insert(I: Worklist.end(), From: L->getSubLoops().begin(), |
3643 | To: L->getSubLoops().end()); |
3644 | while (!Worklist.empty()) { |
3645 | auto *CurLoop = Worklist.pop_back_val(); |
3646 | if (!PSI->isColdBlock(BB: CurLoop->getHeader(), BFI)) |
3647 | return false; |
3648 | Worklist.insert(I: Worklist.end(), From: CurLoop->getSubLoops().begin(), |
3649 | To: CurLoop->getSubLoops().end()); |
3650 | } |
3651 | return true; |
3652 | }; |
3653 | |
3654 | // Skip cold loops in cold loop nests, as unswitching them brings little |
3655 | // benefit but increases the code size |
3656 | if (PSI && PSI->hasProfileSummary() && BFI && IsLoopNestCold(&L)) { |
3657 | LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n" ); |
3658 | return false; |
3659 | } |
3660 | |
3661 | // Perform legality checks. |
3662 | if (!isSafeForNoNTrivialUnswitching(L, LI)) |
3663 | return false; |
3664 | |
3665 | // For non-trivial unswitching, because it often creates new loops, we rely on |
3666 | // the pass manager to iterate on the loops rather than trying to immediately |
3667 | // reach a fixed point. There is no substantial advantage to iterating |
3668 | // internally, and if any of the new loops are simplified enough to contain |
3669 | // trivial unswitching we want to prefer those. |
3670 | |
3671 | // Try to unswitch the best invariant condition. We prefer this full unswitch to |
3672 | // a partial unswitch when possible below the threshold. |
3673 | if (unswitchBestCondition(L, DT, LI, AC, AA, TTI, SE, MSSAU, LoopUpdater)) |
3674 | return true; |
3675 | |
3676 | // No other opportunities to unswitch. |
3677 | return false; |
3678 | } |
3679 | |
3680 | PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, |
3681 | LoopStandardAnalysisResults &AR, |
3682 | LPMUpdater &U) { |
3683 | Function &F = *L.getHeader()->getParent(); |
3684 | (void)F; |
3685 | ProfileSummaryInfo *PSI = nullptr; |
3686 | if (auto OuterProxy = |
3687 | AM.getResult<FunctionAnalysisManagerLoopProxy>(IR&: L, ExtraArgs&: AR) |
3688 | .getCachedResult<ModuleAnalysisManagerFunctionProxy>(IR&: F)) |
3689 | PSI = OuterProxy->getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent()); |
3690 | LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L |
3691 | << "\n" ); |
3692 | |
3693 | std::optional<MemorySSAUpdater> MSSAU; |
3694 | if (AR.MSSA) { |
3695 | MSSAU = MemorySSAUpdater(AR.MSSA); |
3696 | if (VerifyMemorySSA) |
3697 | AR.MSSA->verifyMemorySSA(); |
3698 | } |
3699 | if (!unswitchLoop(L, DT&: AR.DT, LI&: AR.LI, AC&: AR.AC, AA&: AR.AA, TTI&: AR.TTI, Trivial, NonTrivial, |
3700 | SE: &AR.SE, MSSAU: MSSAU ? &*MSSAU : nullptr, PSI, BFI: AR.BFI, LoopUpdater&: U)) |
3701 | return PreservedAnalyses::all(); |
3702 | |
3703 | if (AR.MSSA && VerifyMemorySSA) |
3704 | AR.MSSA->verifyMemorySSA(); |
3705 | |
3706 | // Historically this pass has had issues with the dominator tree so verify it |
3707 | // in asserts builds. |
3708 | assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast)); |
3709 | |
3710 | auto PA = getLoopPassPreservedAnalyses(); |
3711 | if (AR.MSSA) |
3712 | PA.preserve<MemorySSAAnalysis>(); |
3713 | return PA; |
3714 | } |
3715 | |
3716 | void SimpleLoopUnswitchPass::printPipeline( |
3717 | raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { |
3718 | static_cast<PassInfoMixin<SimpleLoopUnswitchPass> *>(this)->printPipeline( |
3719 | OS, MapClassName2PassName); |
3720 | |
3721 | OS << '<'; |
3722 | OS << (NonTrivial ? "" : "no-" ) << "nontrivial;" ; |
3723 | OS << (Trivial ? "" : "no-" ) << "trivial" ; |
3724 | OS << '>'; |
3725 | } |
3726 | |