1 | ///===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" |
10 | #include "llvm/ADT/DenseMap.h" |
11 | #include "llvm/ADT/STLExtras.h" |
12 | #include "llvm/ADT/Sequence.h" |
13 | #include "llvm/ADT/SetVector.h" |
14 | #include "llvm/ADT/SmallPtrSet.h" |
15 | #include "llvm/ADT/SmallVector.h" |
16 | #include "llvm/ADT/Statistic.h" |
17 | #include "llvm/ADT/Twine.h" |
18 | #include "llvm/Analysis/AssumptionCache.h" |
19 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
20 | #include "llvm/Analysis/CFG.h" |
21 | #include "llvm/Analysis/CodeMetrics.h" |
22 | #include "llvm/Analysis/DomTreeUpdater.h" |
23 | #include "llvm/Analysis/GuardUtils.h" |
24 | #include "llvm/Analysis/LoopAnalysisManager.h" |
25 | #include "llvm/Analysis/LoopInfo.h" |
26 | #include "llvm/Analysis/LoopIterator.h" |
27 | #include "llvm/Analysis/MemorySSA.h" |
28 | #include "llvm/Analysis/MemorySSAUpdater.h" |
29 | #include "llvm/Analysis/MustExecute.h" |
30 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
31 | #include "llvm/Analysis/ScalarEvolution.h" |
32 | #include "llvm/Analysis/TargetTransformInfo.h" |
33 | #include "llvm/Analysis/ValueTracking.h" |
34 | #include "llvm/IR/BasicBlock.h" |
35 | #include "llvm/IR/Constant.h" |
36 | #include "llvm/IR/Constants.h" |
37 | #include "llvm/IR/Dominators.h" |
38 | #include "llvm/IR/Function.h" |
39 | #include "llvm/IR/IRBuilder.h" |
40 | #include "llvm/IR/InstrTypes.h" |
41 | #include "llvm/IR/Instruction.h" |
42 | #include "llvm/IR/Instructions.h" |
43 | #include "llvm/IR/IntrinsicInst.h" |
44 | #include "llvm/IR/Module.h" |
45 | #include "llvm/IR/PatternMatch.h" |
46 | #include "llvm/IR/ProfDataUtils.h" |
47 | #include "llvm/IR/Use.h" |
48 | #include "llvm/IR/Value.h" |
49 | #include "llvm/Support/Casting.h" |
50 | #include "llvm/Support/CommandLine.h" |
51 | #include "llvm/Support/Debug.h" |
52 | #include "llvm/Support/ErrorHandling.h" |
53 | #include "llvm/Support/GenericDomTree.h" |
54 | #include "llvm/Support/InstructionCost.h" |
55 | #include "llvm/Support/raw_ostream.h" |
56 | #include "llvm/Transforms/Scalar/LoopPassManager.h" |
57 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
58 | #include "llvm/Transforms/Utils/Cloning.h" |
59 | #include "llvm/Transforms/Utils/Local.h" |
60 | #include "llvm/Transforms/Utils/LoopUtils.h" |
61 | #include "llvm/Transforms/Utils/ValueMapper.h" |
62 | #include <algorithm> |
63 | #include <cassert> |
64 | #include <iterator> |
65 | #include <numeric> |
66 | #include <optional> |
67 | #include <utility> |
68 | |
69 | #define DEBUG_TYPE "simple-loop-unswitch" |
70 | |
71 | using namespace llvm; |
72 | using namespace llvm::PatternMatch; |
73 | |
74 | STATISTIC(NumBranches, "Number of branches unswitched" ); |
75 | STATISTIC(NumSwitches, "Number of switches unswitched" ); |
76 | STATISTIC(NumSelects, "Number of selects turned into branches for unswitching" ); |
77 | STATISTIC(NumGuards, "Number of guards turned into branches for unswitching" ); |
78 | STATISTIC(NumTrivial, "Number of unswitches that are trivial" ); |
79 | STATISTIC( |
80 | NumCostMultiplierSkipped, |
81 | "Number of unswitch candidates that had their cost multiplier skipped" ); |
82 | STATISTIC(NumInvariantConditionsInjected, |
83 | "Number of invariant conditions injected and unswitched" ); |
84 | |
85 | static cl::opt<bool> EnableNonTrivialUnswitch( |
86 | "enable-nontrivial-unswitch" , cl::init(Val: false), cl::Hidden, |
87 | cl::desc("Forcibly enables non-trivial loop unswitching rather than " |
88 | "following the configuration passed into the pass." )); |
89 | |
90 | static cl::opt<int> |
91 | UnswitchThreshold("unswitch-threshold" , cl::init(Val: 50), cl::Hidden, |
92 | cl::desc("The cost threshold for unswitching a loop." )); |
93 | |
94 | static cl::opt<bool> EnableUnswitchCostMultiplier( |
95 | "enable-unswitch-cost-multiplier" , cl::init(Val: true), cl::Hidden, |
96 | cl::desc("Enable unswitch cost multiplier that prohibits exponential " |
97 | "explosion in nontrivial unswitch." )); |
98 | static cl::opt<int> UnswitchSiblingsToplevelDiv( |
99 | "unswitch-siblings-toplevel-div" , cl::init(Val: 2), cl::Hidden, |
100 | cl::desc("Toplevel siblings divisor for cost multiplier." )); |
101 | static cl::opt<int> UnswitchNumInitialUnscaledCandidates( |
102 | "unswitch-num-initial-unscaled-candidates" , cl::init(Val: 8), cl::Hidden, |
103 | cl::desc("Number of unswitch candidates that are ignored when calculating " |
104 | "cost multiplier." )); |
105 | static cl::opt<bool> UnswitchGuards( |
106 | "simple-loop-unswitch-guards" , cl::init(Val: true), cl::Hidden, |
107 | cl::desc("If enabled, simple loop unswitching will also consider " |
108 | "llvm.experimental.guard intrinsics as unswitch candidates." )); |
109 | static cl::opt<bool> DropNonTrivialImplicitNullChecks( |
110 | "simple-loop-unswitch-drop-non-trivial-implicit-null-checks" , |
111 | cl::init(Val: false), cl::Hidden, |
112 | cl::desc("If enabled, drop make.implicit metadata in unswitched implicit " |
113 | "null checks to save time analyzing if we can keep it." )); |
114 | static cl::opt<unsigned> |
115 | MSSAThreshold("simple-loop-unswitch-memoryssa-threshold" , |
116 | cl::desc("Max number of memory uses to explore during " |
117 | "partial unswitching analysis" ), |
118 | cl::init(Val: 100), cl::Hidden); |
119 | static cl::opt<bool> FreezeLoopUnswitchCond( |
120 | "freeze-loop-unswitch-cond" , cl::init(Val: true), cl::Hidden, |
121 | cl::desc("If enabled, the freeze instruction will be added to condition " |
122 | "of loop unswitch to prevent miscompilation." )); |
123 | |
124 | static cl::opt<bool> InjectInvariantConditions( |
125 | "simple-loop-unswitch-inject-invariant-conditions" , cl::Hidden, |
126 | cl::desc("Whether we should inject new invariants and unswitch them to " |
127 | "eliminate some existing (non-invariant) conditions." ), |
128 | cl::init(Val: true)); |
129 | |
130 | static cl::opt<unsigned> InjectInvariantConditionHotnesThreshold( |
131 | "simple-loop-unswitch-inject-invariant-condition-hotness-threshold" , |
132 | cl::Hidden, cl::desc("Only try to inject loop invariant conditions and " |
133 | "unswitch on them to eliminate branches that are " |
134 | "not-taken 1/<this option> times or less." ), |
135 | cl::init(Val: 16)); |
136 | |
137 | AnalysisKey ShouldRunExtraSimpleLoopUnswitch::; |
138 | namespace { |
139 | struct CompareDesc { |
140 | BranchInst *Term; |
141 | Value *Invariant; |
142 | BasicBlock *InLoopSucc; |
143 | |
144 | CompareDesc(BranchInst *Term, Value *Invariant, BasicBlock *InLoopSucc) |
145 | : Term(Term), Invariant(Invariant), InLoopSucc(InLoopSucc) {} |
146 | }; |
147 | |
148 | struct InjectedInvariant { |
149 | ICmpInst::Predicate Pred; |
150 | Value *LHS; |
151 | Value *RHS; |
152 | BasicBlock *InLoopSucc; |
153 | |
154 | InjectedInvariant(ICmpInst::Predicate Pred, Value *LHS, Value *RHS, |
155 | BasicBlock *InLoopSucc) |
156 | : Pred(Pred), LHS(LHS), RHS(RHS), InLoopSucc(InLoopSucc) {} |
157 | }; |
158 | |
159 | struct NonTrivialUnswitchCandidate { |
160 | Instruction *TI = nullptr; |
161 | TinyPtrVector<Value *> Invariants; |
162 | std::optional<InstructionCost> Cost; |
163 | std::optional<InjectedInvariant> PendingInjection; |
164 | NonTrivialUnswitchCandidate( |
165 | Instruction *TI, ArrayRef<Value *> Invariants, |
166 | std::optional<InstructionCost> Cost = std::nullopt, |
167 | std::optional<InjectedInvariant> PendingInjection = std::nullopt) |
168 | : TI(TI), Invariants(Invariants), Cost(Cost), |
169 | PendingInjection(PendingInjection) {}; |
170 | |
171 | bool hasPendingInjection() const { return PendingInjection.has_value(); } |
172 | }; |
173 | } // end anonymous namespace. |
174 | |
175 | // Helper to skip (select x, true, false), which matches both a logical AND and |
176 | // OR and can confuse code that tries to determine if \p Cond is either a |
177 | // logical AND or OR but not both. |
178 | static Value *skipTrivialSelect(Value *Cond) { |
179 | Value *CondNext; |
180 | while (match(V: Cond, P: m_Select(C: m_Value(V&: CondNext), L: m_One(), R: m_Zero()))) |
181 | Cond = CondNext; |
182 | return Cond; |
183 | } |
184 | |
185 | /// Collect all of the loop invariant input values transitively used by the |
186 | /// homogeneous instruction graph from a given root. |
187 | /// |
188 | /// This essentially walks from a root recursively through loop variant operands |
189 | /// which have perform the same logical operation (AND or OR) and finds all |
190 | /// inputs which are loop invariant. For some operations these can be |
191 | /// re-associated and unswitched out of the loop entirely. |
192 | static TinyPtrVector<Value *> |
193 | collectHomogenousInstGraphLoopInvariants(const Loop &L, Instruction &Root, |
194 | const LoopInfo &LI) { |
195 | assert(!L.isLoopInvariant(&Root) && |
196 | "Only need to walk the graph if root itself is not invariant." ); |
197 | TinyPtrVector<Value *> Invariants; |
198 | |
199 | bool IsRootAnd = match(V: &Root, P: m_LogicalAnd()); |
200 | bool IsRootOr = match(V: &Root, P: m_LogicalOr()); |
201 | |
202 | // Build a worklist and recurse through operators collecting invariants. |
203 | SmallVector<Instruction *, 4> Worklist; |
204 | SmallPtrSet<Instruction *, 8> Visited; |
205 | Worklist.push_back(Elt: &Root); |
206 | Visited.insert(Ptr: &Root); |
207 | do { |
208 | Instruction &I = *Worklist.pop_back_val(); |
209 | for (Value *OpV : I.operand_values()) { |
210 | // Skip constants as unswitching isn't interesting for them. |
211 | if (isa<Constant>(Val: OpV)) |
212 | continue; |
213 | |
214 | // Add it to our result if loop invariant. |
215 | if (L.isLoopInvariant(V: OpV)) { |
216 | Invariants.push_back(NewVal: OpV); |
217 | continue; |
218 | } |
219 | |
220 | // If not an instruction with the same opcode, nothing we can do. |
221 | Instruction *OpI = dyn_cast<Instruction>(Val: skipTrivialSelect(Cond: OpV)); |
222 | |
223 | if (OpI && ((IsRootAnd && match(V: OpI, P: m_LogicalAnd())) || |
224 | (IsRootOr && match(V: OpI, P: m_LogicalOr())))) { |
225 | // Visit this operand. |
226 | if (Visited.insert(Ptr: OpI).second) |
227 | Worklist.push_back(Elt: OpI); |
228 | } |
229 | } |
230 | } while (!Worklist.empty()); |
231 | |
232 | return Invariants; |
233 | } |
234 | |
235 | static void replaceLoopInvariantUses(const Loop &L, Value *Invariant, |
236 | Constant &Replacement) { |
237 | assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?" ); |
238 | |
239 | // Replace uses of LIC in the loop with the given constant. |
240 | // We use make_early_inc_range as set invalidates the iterator. |
241 | for (Use &U : llvm::make_early_inc_range(Range: Invariant->uses())) { |
242 | Instruction *UserI = dyn_cast<Instruction>(Val: U.getUser()); |
243 | |
244 | // Replace this use within the loop body. |
245 | if (UserI && L.contains(Inst: UserI)) |
246 | U.set(&Replacement); |
247 | } |
248 | } |
249 | |
250 | /// Check that all the LCSSA PHI nodes in the loop exit block have trivial |
251 | /// incoming values along this edge. |
252 | static bool areLoopExitPHIsLoopInvariant(const Loop &L, |
253 | const BasicBlock &ExitingBB, |
254 | const BasicBlock &ExitBB) { |
255 | for (const Instruction &I : ExitBB) { |
256 | auto *PN = dyn_cast<PHINode>(Val: &I); |
257 | if (!PN) |
258 | // No more PHIs to check. |
259 | return true; |
260 | |
261 | // If the incoming value for this edge isn't loop invariant the unswitch |
262 | // won't be trivial. |
263 | if (!L.isLoopInvariant(V: PN->getIncomingValueForBlock(BB: &ExitingBB))) |
264 | return false; |
265 | } |
266 | llvm_unreachable("Basic blocks should never be empty!" ); |
267 | } |
268 | |
269 | /// Copy a set of loop invariant values \p ToDuplicate and insert them at the |
270 | /// end of \p BB and conditionally branch on the copied condition. We only |
271 | /// branch on a single value. |
272 | static void buildPartialUnswitchConditionalBranch( |
273 | BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction, |
274 | BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, |
275 | const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) { |
276 | IRBuilder<> IRB(&BB); |
277 | IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); |
278 | |
279 | SmallVector<Value *> FrozenInvariants; |
280 | for (Value *Inv : Invariants) { |
281 | if (InsertFreeze && !isGuaranteedNotToBeUndefOrPoison(V: Inv, AC, CtxI: I, DT: &DT)) |
282 | Inv = IRB.CreateFreeze(V: Inv, Name: Inv->getName() + ".fr" ); |
283 | FrozenInvariants.push_back(Elt: Inv); |
284 | } |
285 | |
286 | Value *Cond = Direction ? IRB.CreateOr(Ops: FrozenInvariants) |
287 | : IRB.CreateAnd(Ops: FrozenInvariants); |
288 | IRB.CreateCondBr(Cond, True: Direction ? &UnswitchedSucc : &NormalSucc, |
289 | False: Direction ? &NormalSucc : &UnswitchedSucc); |
290 | } |
291 | |
292 | /// Copy a set of loop invariant values, and conditionally branch on them. |
293 | static void buildPartialInvariantUnswitchConditionalBranch( |
294 | BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction, |
295 | BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L, |
296 | MemorySSAUpdater *MSSAU) { |
297 | ValueToValueMapTy VMap; |
298 | for (auto *Val : reverse(C&: ToDuplicate)) { |
299 | Instruction *Inst = cast<Instruction>(Val); |
300 | Instruction *NewInst = Inst->clone(); |
301 | |
302 | if (const DebugLoc &DL = Inst->getDebugLoc()) |
303 | mapAtomInstance(DL, VMap); |
304 | |
305 | NewInst->insertInto(ParentBB: &BB, It: BB.end()); |
306 | RemapInstruction(I: NewInst, VM&: VMap, |
307 | Flags: RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); |
308 | VMap[Val] = NewInst; |
309 | |
310 | if (!MSSAU) |
311 | continue; |
312 | |
313 | MemorySSA *MSSA = MSSAU->getMemorySSA(); |
314 | if (auto *MemUse = |
315 | dyn_cast_or_null<MemoryUse>(Val: MSSA->getMemoryAccess(I: Inst))) { |
316 | auto *DefiningAccess = MemUse->getDefiningAccess(); |
317 | // Get the first defining access before the loop. |
318 | while (L.contains(BB: DefiningAccess->getBlock())) { |
319 | // If the defining access is a MemoryPhi, get the incoming |
320 | // value for the pre-header as defining access. |
321 | if (auto *MemPhi = dyn_cast<MemoryPhi>(Val: DefiningAccess)) |
322 | DefiningAccess = |
323 | MemPhi->getIncomingValueForBlock(BB: L.getLoopPreheader()); |
324 | else |
325 | DefiningAccess = cast<MemoryDef>(Val: DefiningAccess)->getDefiningAccess(); |
326 | } |
327 | MSSAU->createMemoryAccessInBB(I: NewInst, Definition: DefiningAccess, |
328 | BB: NewInst->getParent(), |
329 | Point: MemorySSA::BeforeTerminator); |
330 | } |
331 | } |
332 | |
333 | IRBuilder<> IRB(&BB); |
334 | IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); |
335 | Value *Cond = VMap[ToDuplicate[0]]; |
336 | IRB.CreateCondBr(Cond, True: Direction ? &UnswitchedSucc : &NormalSucc, |
337 | False: Direction ? &NormalSucc : &UnswitchedSucc); |
338 | } |
339 | |
340 | /// Rewrite the PHI nodes in an unswitched loop exit basic block. |
341 | /// |
342 | /// Requires that the loop exit and unswitched basic block are the same, and |
343 | /// that the exiting block was a unique predecessor of that block. Rewrites the |
344 | /// PHI nodes in that block such that what were LCSSA PHI nodes become trivial |
345 | /// PHI nodes from the old preheader that now contains the unswitched |
346 | /// terminator. |
347 | static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB, |
348 | BasicBlock &OldExitingBB, |
349 | BasicBlock &OldPH) { |
350 | for (PHINode &PN : UnswitchedBB.phis()) { |
351 | // When the loop exit is directly unswitched we just need to update the |
352 | // incoming basic block. We loop to handle weird cases with repeated |
353 | // incoming blocks, but expect to typically only have one operand here. |
354 | for (auto i : seq<int>(Begin: 0, End: PN.getNumOperands())) { |
355 | assert(PN.getIncomingBlock(i) == &OldExitingBB && |
356 | "Found incoming block different from unique predecessor!" ); |
357 | PN.setIncomingBlock(i, BB: &OldPH); |
358 | } |
359 | } |
360 | } |
361 | |
362 | /// Rewrite the PHI nodes in the loop exit basic block and the split off |
363 | /// unswitched block. |
364 | /// |
365 | /// Because the exit block remains an exit from the loop, this rewrites the |
366 | /// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI |
367 | /// nodes into the unswitched basic block to select between the value in the |
368 | /// old preheader and the loop exit. |
369 | static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, |
370 | BasicBlock &UnswitchedBB, |
371 | BasicBlock &OldExitingBB, |
372 | BasicBlock &OldPH, |
373 | bool FullUnswitch) { |
374 | assert(&ExitBB != &UnswitchedBB && |
375 | "Must have different loop exit and unswitched blocks!" ); |
376 | BasicBlock::iterator InsertPt = UnswitchedBB.begin(); |
377 | for (PHINode &PN : ExitBB.phis()) { |
378 | auto *NewPN = PHINode::Create(Ty: PN.getType(), /*NumReservedValues*/ 2, |
379 | NameStr: PN.getName() + ".split" ); |
380 | NewPN->insertBefore(InsertPos: InsertPt); |
381 | |
382 | // Walk backwards over the old PHI node's inputs to minimize the cost of |
383 | // removing each one. We have to do this weird loop manually so that we |
384 | // create the same number of new incoming edges in the new PHI as we expect |
385 | // each case-based edge to be included in the unswitched switch in some |
386 | // cases. |
387 | // FIXME: This is really, really gross. It would be much cleaner if LLVM |
388 | // allowed us to create a single entry for a predecessor block without |
389 | // having separate entries for each "edge" even though these edges are |
390 | // required to produce identical results. |
391 | for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) { |
392 | if (PN.getIncomingBlock(i) != &OldExitingBB) |
393 | continue; |
394 | |
395 | Value *Incoming = PN.getIncomingValue(i); |
396 | if (FullUnswitch) |
397 | // No more edge from the old exiting block to the exit block. |
398 | PN.removeIncomingValue(Idx: i); |
399 | |
400 | NewPN->addIncoming(V: Incoming, BB: &OldPH); |
401 | } |
402 | |
403 | // Now replace the old PHI with the new one and wire the old one in as an |
404 | // input to the new one. |
405 | PN.replaceAllUsesWith(V: NewPN); |
406 | NewPN->addIncoming(V: &PN, BB: &ExitBB); |
407 | } |
408 | } |
409 | |
410 | /// Hoist the current loop up to the innermost loop containing a remaining exit. |
411 | /// |
412 | /// Because we've removed an exit from the loop, we may have changed the set of |
413 | /// loops reachable and need to move the current loop up the loop nest or even |
414 | /// to an entirely separate nest. |
415 | static void hoistLoopToNewParent(Loop &L, BasicBlock &, |
416 | DominatorTree &DT, LoopInfo &LI, |
417 | MemorySSAUpdater *MSSAU, ScalarEvolution *SE) { |
418 | // If the loop is already at the top level, we can't hoist it anywhere. |
419 | Loop *OldParentL = L.getParentLoop(); |
420 | if (!OldParentL) |
421 | return; |
422 | |
423 | SmallVector<BasicBlock *, 4> Exits; |
424 | L.getExitBlocks(ExitBlocks&: Exits); |
425 | Loop *NewParentL = nullptr; |
426 | for (auto *ExitBB : Exits) |
427 | if (Loop *ExitL = LI.getLoopFor(BB: ExitBB)) |
428 | if (!NewParentL || NewParentL->contains(L: ExitL)) |
429 | NewParentL = ExitL; |
430 | |
431 | if (NewParentL == OldParentL) |
432 | return; |
433 | |
434 | // The new parent loop (if different) should always contain the old one. |
435 | if (NewParentL) |
436 | assert(NewParentL->contains(OldParentL) && |
437 | "Can only hoist this loop up the nest!" ); |
438 | |
439 | // The preheader will need to move with the body of this loop. However, |
440 | // because it isn't in this loop we also need to update the primary loop map. |
441 | assert(OldParentL == LI.getLoopFor(&Preheader) && |
442 | "Parent loop of this loop should contain this loop's preheader!" ); |
443 | LI.changeLoopFor(BB: &Preheader, L: NewParentL); |
444 | |
445 | // Remove this loop from its old parent. |
446 | OldParentL->removeChildLoop(Child: &L); |
447 | |
448 | // Add the loop either to the new parent or as a top-level loop. |
449 | if (NewParentL) |
450 | NewParentL->addChildLoop(NewChild: &L); |
451 | else |
452 | LI.addTopLevelLoop(New: &L); |
453 | |
454 | // Remove this loops blocks from the old parent and every other loop up the |
455 | // nest until reaching the new parent. Also update all of these |
456 | // no-longer-containing loops to reflect the nesting change. |
457 | for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL; |
458 | OldContainingL = OldContainingL->getParentLoop()) { |
459 | llvm::erase_if(C&: OldContainingL->getBlocksVector(), |
460 | P: [&](const BasicBlock *BB) { |
461 | return BB == &Preheader || L.contains(BB); |
462 | }); |
463 | |
464 | OldContainingL->getBlocksSet().erase(Ptr: &Preheader); |
465 | for (BasicBlock *BB : L.blocks()) |
466 | OldContainingL->getBlocksSet().erase(Ptr: BB); |
467 | |
468 | // Because we just hoisted a loop out of this one, we have essentially |
469 | // created new exit paths from it. That means we need to form LCSSA PHI |
470 | // nodes for values used in the no-longer-nested loop. |
471 | formLCSSA(L&: *OldContainingL, DT, LI: &LI, SE); |
472 | |
473 | // We shouldn't need to form dedicated exits because the exit introduced |
474 | // here is the (just split by unswitching) preheader. However, after trivial |
475 | // unswitching it is possible to get new non-dedicated exits out of parent |
476 | // loop so let's conservatively form dedicated exit blocks and figure out |
477 | // if we can optimize later. |
478 | formDedicatedExitBlocks(L: OldContainingL, DT: &DT, LI: &LI, MSSAU, |
479 | /*PreserveLCSSA*/ true); |
480 | } |
481 | } |
482 | |
483 | // Return the top-most loop containing ExitBB and having ExitBB as exiting block |
484 | // or the loop containing ExitBB, if there is no parent loop containing ExitBB |
485 | // as exiting block. |
486 | static Loop *getTopMostExitingLoop(const BasicBlock *ExitBB, |
487 | const LoopInfo &LI) { |
488 | Loop *TopMost = LI.getLoopFor(BB: ExitBB); |
489 | Loop *Current = TopMost; |
490 | while (Current) { |
491 | if (Current->isLoopExiting(BB: ExitBB)) |
492 | TopMost = Current; |
493 | Current = Current->getParentLoop(); |
494 | } |
495 | return TopMost; |
496 | } |
497 | |
498 | /// Unswitch a trivial branch if the condition is loop invariant. |
499 | /// |
500 | /// This routine should only be called when loop code leading to the branch has |
501 | /// been validated as trivial (no side effects). This routine checks if the |
502 | /// condition is invariant and one of the successors is a loop exit. This |
503 | /// allows us to unswitch without duplicating the loop, making it trivial. |
504 | /// |
505 | /// If this routine fails to unswitch the branch it returns false. |
506 | /// |
507 | /// If the branch can be unswitched, this routine splits the preheader and |
508 | /// hoists the branch above that split. Preserves loop simplified form |
509 | /// (splitting the exit block as necessary). It simplifies the branch within |
510 | /// the loop to an unconditional branch but doesn't remove it entirely. Further |
511 | /// cleanup can be done with some simplifycfg like pass. |
512 | /// |
513 | /// If `SE` is not null, it will be updated based on the potential loop SCEVs |
514 | /// invalidated by this. |
515 | static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, |
516 | LoopInfo &LI, ScalarEvolution *SE, |
517 | MemorySSAUpdater *MSSAU) { |
518 | assert(BI.isConditional() && "Can only unswitch a conditional branch!" ); |
519 | LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n" ); |
520 | |
521 | // The loop invariant values that we want to unswitch. |
522 | TinyPtrVector<Value *> Invariants; |
523 | |
524 | // When true, we're fully unswitching the branch rather than just unswitching |
525 | // some input conditions to the branch. |
526 | bool FullUnswitch = false; |
527 | |
528 | Value *Cond = skipTrivialSelect(Cond: BI.getCondition()); |
529 | if (L.isLoopInvariant(V: Cond)) { |
530 | Invariants.push_back(NewVal: Cond); |
531 | FullUnswitch = true; |
532 | } else { |
533 | if (auto *CondInst = dyn_cast<Instruction>(Val: Cond)) |
534 | Invariants = collectHomogenousInstGraphLoopInvariants(L, Root&: *CondInst, LI); |
535 | if (Invariants.empty()) { |
536 | LLVM_DEBUG(dbgs() << " Couldn't find invariant inputs!\n" ); |
537 | return false; |
538 | } |
539 | } |
540 | |
541 | // Check that one of the branch's successors exits, and which one. |
542 | bool ExitDirection = true; |
543 | int LoopExitSuccIdx = 0; |
544 | auto *LoopExitBB = BI.getSuccessor(i: 0); |
545 | if (L.contains(BB: LoopExitBB)) { |
546 | ExitDirection = false; |
547 | LoopExitSuccIdx = 1; |
548 | LoopExitBB = BI.getSuccessor(i: 1); |
549 | if (L.contains(BB: LoopExitBB)) { |
550 | LLVM_DEBUG(dbgs() << " Branch doesn't exit the loop!\n" ); |
551 | return false; |
552 | } |
553 | } |
554 | auto *ContinueBB = BI.getSuccessor(i: 1 - LoopExitSuccIdx); |
555 | auto *ParentBB = BI.getParent(); |
556 | if (!areLoopExitPHIsLoopInvariant(L, ExitingBB: *ParentBB, ExitBB: *LoopExitBB)) { |
557 | LLVM_DEBUG(dbgs() << " Loop exit PHI's aren't loop-invariant!\n" ); |
558 | return false; |
559 | } |
560 | |
561 | // When unswitching only part of the branch's condition, we need the exit |
562 | // block to be reached directly from the partially unswitched input. This can |
563 | // be done when the exit block is along the true edge and the branch condition |
564 | // is a graph of `or` operations, or the exit block is along the false edge |
565 | // and the condition is a graph of `and` operations. |
566 | if (!FullUnswitch) { |
567 | if (ExitDirection ? !match(V: Cond, P: m_LogicalOr()) |
568 | : !match(V: Cond, P: m_LogicalAnd())) { |
569 | LLVM_DEBUG(dbgs() << " Branch condition is in improper form for " |
570 | "non-full unswitch!\n" ); |
571 | return false; |
572 | } |
573 | } |
574 | |
575 | LLVM_DEBUG({ |
576 | dbgs() << " unswitching trivial invariant conditions for: " << BI |
577 | << "\n" ; |
578 | for (Value *Invariant : Invariants) { |
579 | dbgs() << " " << *Invariant << " == true" ; |
580 | if (Invariant != Invariants.back()) |
581 | dbgs() << " ||" ; |
582 | dbgs() << "\n" ; |
583 | } |
584 | }); |
585 | |
586 | // If we have scalar evolutions, we need to invalidate them including this |
587 | // loop, the loop containing the exit block and the topmost parent loop |
588 | // exiting via LoopExitBB. |
589 | if (SE) { |
590 | if (const Loop *ExitL = getTopMostExitingLoop(ExitBB: LoopExitBB, LI)) |
591 | SE->forgetLoop(L: ExitL); |
592 | else |
593 | // Forget the entire nest as this exits the entire nest. |
594 | SE->forgetTopmostLoop(L: &L); |
595 | SE->forgetBlockAndLoopDispositions(); |
596 | } |
597 | |
598 | if (MSSAU && VerifyMemorySSA) |
599 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
600 | |
601 | // Split the preheader, so that we know that there is a safe place to insert |
602 | // the conditional branch. We will change the preheader to have a conditional |
603 | // branch on LoopCond. |
604 | BasicBlock *OldPH = L.getLoopPreheader(); |
605 | BasicBlock *NewPH = SplitEdge(From: OldPH, To: L.getHeader(), DT: &DT, LI: &LI, MSSAU); |
606 | |
607 | // Now that we have a place to insert the conditional branch, create a place |
608 | // to branch to: this is the exit block out of the loop that we are |
609 | // unswitching. We need to split this if there are other loop predecessors. |
610 | // Because the loop is in simplified form, *any* other predecessor is enough. |
611 | BasicBlock *UnswitchedBB; |
612 | if (FullUnswitch && LoopExitBB->getUniquePredecessor()) { |
613 | assert(LoopExitBB->getUniquePredecessor() == BI.getParent() && |
614 | "A branch's parent isn't a predecessor!" ); |
615 | UnswitchedBB = LoopExitBB; |
616 | } else { |
617 | UnswitchedBB = |
618 | SplitBlock(Old: LoopExitBB, SplitPt: LoopExitBB->begin(), DT: &DT, LI: &LI, MSSAU, BBName: "" , Before: false); |
619 | } |
620 | |
621 | if (MSSAU && VerifyMemorySSA) |
622 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
623 | |
624 | // Actually move the invariant uses into the unswitched position. If possible, |
625 | // we do this by moving the instructions, but when doing partial unswitching |
626 | // we do it by building a new merge of the values in the unswitched position. |
627 | OldPH->getTerminator()->eraseFromParent(); |
628 | if (FullUnswitch) { |
629 | // If fully unswitching, we can use the existing branch instruction. |
630 | // Splice it into the old PH to gate reaching the new preheader and re-point |
631 | // its successors. |
632 | BI.moveBefore(BB&: *OldPH, I: OldPH->end()); |
633 | BI.setCondition(Cond); |
634 | if (MSSAU) { |
635 | // Temporarily clone the terminator, to make MSSA update cheaper by |
636 | // separating "insert edge" updates from "remove edge" ones. |
637 | BI.clone()->insertInto(ParentBB, It: ParentBB->end()); |
638 | } else { |
639 | // Create a new unconditional branch that will continue the loop as a new |
640 | // terminator. |
641 | Instruction *NewBI = BranchInst::Create(IfTrue: ContinueBB, InsertBefore: ParentBB); |
642 | NewBI->setDebugLoc(BI.getDebugLoc()); |
643 | } |
644 | BI.setSuccessor(idx: LoopExitSuccIdx, NewSucc: UnswitchedBB); |
645 | BI.setSuccessor(idx: 1 - LoopExitSuccIdx, NewSucc: NewPH); |
646 | } else { |
647 | // Only unswitching a subset of inputs to the condition, so we will need to |
648 | // build a new branch that merges the invariant inputs. |
649 | if (ExitDirection) |
650 | assert(match(skipTrivialSelect(BI.getCondition()), m_LogicalOr()) && |
651 | "Must have an `or` of `i1`s or `select i1 X, true, Y`s for the " |
652 | "condition!" ); |
653 | else |
654 | assert(match(skipTrivialSelect(BI.getCondition()), m_LogicalAnd()) && |
655 | "Must have an `and` of `i1`s or `select i1 X, Y, false`s for the" |
656 | " condition!" ); |
657 | buildPartialUnswitchConditionalBranch( |
658 | BB&: *OldPH, Invariants, Direction: ExitDirection, UnswitchedSucc&: *UnswitchedBB, NormalSucc&: *NewPH, |
659 | InsertFreeze: FreezeLoopUnswitchCond, I: OldPH->getTerminator(), AC: nullptr, DT); |
660 | } |
661 | |
662 | // Update the dominator tree with the added edge. |
663 | DT.insertEdge(From: OldPH, To: UnswitchedBB); |
664 | |
665 | // After the dominator tree was updated with the added edge, update MemorySSA |
666 | // if available. |
667 | if (MSSAU) { |
668 | SmallVector<CFGUpdate, 1> Updates; |
669 | Updates.push_back(Elt: {cfg::UpdateKind::Insert, OldPH, UnswitchedBB}); |
670 | MSSAU->applyInsertUpdates(Updates, DT); |
671 | } |
672 | |
673 | // Finish updating dominator tree and memory ssa for full unswitch. |
674 | if (FullUnswitch) { |
675 | if (MSSAU) { |
676 | Instruction *Term = ParentBB->getTerminator(); |
677 | // Remove the cloned branch instruction and create unconditional branch |
678 | // now. |
679 | Instruction *NewBI = BranchInst::Create(IfTrue: ContinueBB, InsertBefore: ParentBB); |
680 | NewBI->setDebugLoc(Term->getDebugLoc()); |
681 | Term->eraseFromParent(); |
682 | MSSAU->removeEdge(From: ParentBB, To: LoopExitBB); |
683 | } |
684 | DT.deleteEdge(From: ParentBB, To: LoopExitBB); |
685 | } |
686 | |
687 | if (MSSAU && VerifyMemorySSA) |
688 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
689 | |
690 | // Rewrite the relevant PHI nodes. |
691 | if (UnswitchedBB == LoopExitBB) |
692 | rewritePHINodesForUnswitchedExitBlock(UnswitchedBB&: *UnswitchedBB, OldExitingBB&: *ParentBB, OldPH&: *OldPH); |
693 | else |
694 | rewritePHINodesForExitAndUnswitchedBlocks(ExitBB&: *LoopExitBB, UnswitchedBB&: *UnswitchedBB, |
695 | OldExitingBB&: *ParentBB, OldPH&: *OldPH, FullUnswitch); |
696 | |
697 | // The constant we can replace all of our invariants with inside the loop |
698 | // body. If any of the invariants have a value other than this the loop won't |
699 | // be entered. |
700 | ConstantInt *Replacement = ExitDirection |
701 | ? ConstantInt::getFalse(Context&: BI.getContext()) |
702 | : ConstantInt::getTrue(Context&: BI.getContext()); |
703 | |
704 | // Since this is an i1 condition we can also trivially replace uses of it |
705 | // within the loop with a constant. |
706 | for (Value *Invariant : Invariants) |
707 | replaceLoopInvariantUses(L, Invariant, Replacement&: *Replacement); |
708 | |
709 | // If this was full unswitching, we may have changed the nesting relationship |
710 | // for this loop so hoist it to its correct parent if needed. |
711 | if (FullUnswitch) |
712 | hoistLoopToNewParent(L, Preheader&: *NewPH, DT, LI, MSSAU, SE); |
713 | |
714 | if (MSSAU && VerifyMemorySSA) |
715 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
716 | |
717 | LLVM_DEBUG(dbgs() << " done: unswitching trivial branch...\n" ); |
718 | ++NumTrivial; |
719 | ++NumBranches; |
720 | return true; |
721 | } |
722 | |
723 | /// Unswitch a trivial switch if the condition is loop invariant. |
724 | /// |
725 | /// This routine should only be called when loop code leading to the switch has |
726 | /// been validated as trivial (no side effects). This routine checks if the |
727 | /// condition is invariant and that at least one of the successors is a loop |
728 | /// exit. This allows us to unswitch without duplicating the loop, making it |
729 | /// trivial. |
730 | /// |
731 | /// If this routine fails to unswitch the switch it returns false. |
732 | /// |
733 | /// If the switch can be unswitched, this routine splits the preheader and |
734 | /// copies the switch above that split. If the default case is one of the |
735 | /// exiting cases, it copies the non-exiting cases and points them at the new |
736 | /// preheader. If the default case is not exiting, it copies the exiting cases |
737 | /// and points the default at the preheader. It preserves loop simplified form |
738 | /// (splitting the exit blocks as necessary). It simplifies the switch within |
739 | /// the loop by removing now-dead cases. If the default case is one of those |
740 | /// unswitched, it replaces its destination with a new basic block containing |
741 | /// only unreachable. Such basic blocks, while technically loop exits, are not |
742 | /// considered for unswitching so this is a stable transform and the same |
743 | /// switch will not be revisited. If after unswitching there is only a single |
744 | /// in-loop successor, the switch is further simplified to an unconditional |
745 | /// branch. Still more cleanup can be done with some simplifycfg like pass. |
746 | /// |
747 | /// If `SE` is not null, it will be updated based on the potential loop SCEVs |
748 | /// invalidated by this. |
749 | static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, |
750 | LoopInfo &LI, ScalarEvolution *SE, |
751 | MemorySSAUpdater *MSSAU) { |
752 | LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n" ); |
753 | Value *LoopCond = SI.getCondition(); |
754 | |
755 | // If this isn't switching on an invariant condition, we can't unswitch it. |
756 | if (!L.isLoopInvariant(V: LoopCond)) |
757 | return false; |
758 | |
759 | auto *ParentBB = SI.getParent(); |
760 | |
761 | // The same check must be used both for the default and the exit cases. We |
762 | // should never leave edges from the switch instruction to a basic block that |
763 | // we are unswitching, hence the condition used to determine the default case |
764 | // needs to also be used to populate ExitCaseIndices, which is then used to |
765 | // remove cases from the switch. |
766 | auto IsTriviallyUnswitchableExitBlock = [&](BasicBlock &BBToCheck) { |
767 | // BBToCheck is not an exit block if it is inside loop L. |
768 | if (L.contains(BB: &BBToCheck)) |
769 | return false; |
770 | // BBToCheck is not trivial to unswitch if its phis aren't loop invariant. |
771 | if (!areLoopExitPHIsLoopInvariant(L, ExitingBB: *ParentBB, ExitBB: BBToCheck)) |
772 | return false; |
773 | // We do not unswitch a block that only has an unreachable statement, as |
774 | // it's possible this is a previously unswitched block. Only unswitch if |
775 | // either the terminator is not unreachable, or, if it is, it's not the only |
776 | // instruction in the block. |
777 | auto *TI = BBToCheck.getTerminator(); |
778 | bool isUnreachable = isa<UnreachableInst>(Val: TI); |
779 | return !isUnreachable || &*BBToCheck.getFirstNonPHIOrDbg() != TI; |
780 | }; |
781 | |
782 | SmallVector<int, 4> ExitCaseIndices; |
783 | for (auto Case : SI.cases()) |
784 | if (IsTriviallyUnswitchableExitBlock(*Case.getCaseSuccessor())) |
785 | ExitCaseIndices.push_back(Elt: Case.getCaseIndex()); |
786 | BasicBlock *DefaultExitBB = nullptr; |
787 | SwitchInstProfUpdateWrapper::CaseWeightOpt DefaultCaseWeight = |
788 | SwitchInstProfUpdateWrapper::getSuccessorWeight(SI, idx: 0); |
789 | if (IsTriviallyUnswitchableExitBlock(*SI.getDefaultDest())) { |
790 | DefaultExitBB = SI.getDefaultDest(); |
791 | } else if (ExitCaseIndices.empty()) |
792 | return false; |
793 | |
794 | LLVM_DEBUG(dbgs() << " unswitching trivial switch...\n" ); |
795 | |
796 | if (MSSAU && VerifyMemorySSA) |
797 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
798 | |
799 | // We may need to invalidate SCEVs for the outermost loop reached by any of |
800 | // the exits. |
801 | Loop *OuterL = &L; |
802 | |
803 | if (DefaultExitBB) { |
804 | // Check the loop containing this exit. |
805 | Loop *ExitL = getTopMostExitingLoop(ExitBB: DefaultExitBB, LI); |
806 | if (!ExitL || ExitL->contains(L: OuterL)) |
807 | OuterL = ExitL; |
808 | } |
809 | for (unsigned Index : ExitCaseIndices) { |
810 | auto CaseI = SI.case_begin() + Index; |
811 | // Compute the outer loop from this exit. |
812 | Loop *ExitL = getTopMostExitingLoop(ExitBB: CaseI->getCaseSuccessor(), LI); |
813 | if (!ExitL || ExitL->contains(L: OuterL)) |
814 | OuterL = ExitL; |
815 | } |
816 | |
817 | if (SE) { |
818 | if (OuterL) |
819 | SE->forgetLoop(L: OuterL); |
820 | else |
821 | SE->forgetTopmostLoop(L: &L); |
822 | } |
823 | |
824 | if (DefaultExitBB) { |
825 | // Clear out the default destination temporarily to allow accurate |
826 | // predecessor lists to be examined below. |
827 | SI.setDefaultDest(nullptr); |
828 | } |
829 | |
830 | // Store the exit cases into a separate data structure and remove them from |
831 | // the switch. |
832 | SmallVector<std::tuple<ConstantInt *, BasicBlock *, |
833 | SwitchInstProfUpdateWrapper::CaseWeightOpt>, |
834 | 4> ExitCases; |
835 | ExitCases.reserve(N: ExitCaseIndices.size()); |
836 | SwitchInstProfUpdateWrapper SIW(SI); |
837 | // We walk the case indices backwards so that we remove the last case first |
838 | // and don't disrupt the earlier indices. |
839 | for (unsigned Index : reverse(C&: ExitCaseIndices)) { |
840 | auto CaseI = SI.case_begin() + Index; |
841 | // Save the value of this case. |
842 | auto W = SIW.getSuccessorWeight(idx: CaseI->getSuccessorIndex()); |
843 | ExitCases.emplace_back(Args: CaseI->getCaseValue(), Args: CaseI->getCaseSuccessor(), Args&: W); |
844 | // Delete the unswitched cases. |
845 | SIW.removeCase(I: CaseI); |
846 | } |
847 | |
848 | // Check if after this all of the remaining cases point at the same |
849 | // successor. |
850 | BasicBlock *CommonSuccBB = nullptr; |
851 | if (SI.getNumCases() > 0 && |
852 | all_of(Range: drop_begin(RangeOrContainer: SI.cases()), P: [&SI](const SwitchInst::CaseHandle &Case) { |
853 | return Case.getCaseSuccessor() == SI.case_begin()->getCaseSuccessor(); |
854 | })) |
855 | CommonSuccBB = SI.case_begin()->getCaseSuccessor(); |
856 | if (!DefaultExitBB) { |
857 | // If we're not unswitching the default, we need it to match any cases to |
858 | // have a common successor or if we have no cases it is the common |
859 | // successor. |
860 | if (SI.getNumCases() == 0) |
861 | CommonSuccBB = SI.getDefaultDest(); |
862 | else if (SI.getDefaultDest() != CommonSuccBB) |
863 | CommonSuccBB = nullptr; |
864 | } |
865 | |
866 | // Split the preheader, so that we know that there is a safe place to insert |
867 | // the switch. |
868 | BasicBlock *OldPH = L.getLoopPreheader(); |
869 | BasicBlock *NewPH = SplitEdge(From: OldPH, To: L.getHeader(), DT: &DT, LI: &LI, MSSAU); |
870 | OldPH->getTerminator()->eraseFromParent(); |
871 | |
872 | // Now add the unswitched switch. This new switch instruction inherits the |
873 | // debug location of the old switch, because it semantically replace the old |
874 | // one. |
875 | auto *NewSI = SwitchInst::Create(Value: LoopCond, Default: NewPH, NumCases: ExitCases.size(), InsertBefore: OldPH); |
876 | NewSI->setDebugLoc(SIW->getDebugLoc()); |
877 | SwitchInstProfUpdateWrapper NewSIW(*NewSI); |
878 | |
879 | // Rewrite the IR for the unswitched basic blocks. This requires two steps. |
880 | // First, we split any exit blocks with remaining in-loop predecessors. Then |
881 | // we update the PHIs in one of two ways depending on if there was a split. |
882 | // We walk in reverse so that we split in the same order as the cases |
883 | // appeared. This is purely for convenience of reading the resulting IR, but |
884 | // it doesn't cost anything really. |
885 | SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs; |
886 | SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap; |
887 | // Handle the default exit if necessary. |
888 | // FIXME: It'd be great if we could merge this with the loop below but LLVM's |
889 | // ranges aren't quite powerful enough yet. |
890 | if (DefaultExitBB) { |
891 | if (pred_empty(BB: DefaultExitBB)) { |
892 | UnswitchedExitBBs.insert(Ptr: DefaultExitBB); |
893 | rewritePHINodesForUnswitchedExitBlock(UnswitchedBB&: *DefaultExitBB, OldExitingBB&: *ParentBB, OldPH&: *OldPH); |
894 | } else { |
895 | auto *SplitBB = |
896 | SplitBlock(Old: DefaultExitBB, SplitPt: DefaultExitBB->begin(), DT: &DT, LI: &LI, MSSAU); |
897 | rewritePHINodesForExitAndUnswitchedBlocks(ExitBB&: *DefaultExitBB, UnswitchedBB&: *SplitBB, |
898 | OldExitingBB&: *ParentBB, OldPH&: *OldPH, |
899 | /*FullUnswitch*/ true); |
900 | DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; |
901 | } |
902 | } |
903 | // Note that we must use a reference in the for loop so that we update the |
904 | // container. |
905 | for (auto &ExitCase : reverse(C&: ExitCases)) { |
906 | // Grab a reference to the exit block in the pair so that we can update it. |
907 | BasicBlock *ExitBB = std::get<1>(t&: ExitCase); |
908 | |
909 | // If this case is the last edge into the exit block, we can simply reuse it |
910 | // as it will no longer be a loop exit. No mapping necessary. |
911 | if (pred_empty(BB: ExitBB)) { |
912 | // Only rewrite once. |
913 | if (UnswitchedExitBBs.insert(Ptr: ExitBB).second) |
914 | rewritePHINodesForUnswitchedExitBlock(UnswitchedBB&: *ExitBB, OldExitingBB&: *ParentBB, OldPH&: *OldPH); |
915 | continue; |
916 | } |
917 | |
918 | // Otherwise we need to split the exit block so that we retain an exit |
919 | // block from the loop and a target for the unswitched condition. |
920 | BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB]; |
921 | if (!SplitExitBB) { |
922 | // If this is the first time we see this, do the split and remember it. |
923 | SplitExitBB = SplitBlock(Old: ExitBB, SplitPt: ExitBB->begin(), DT: &DT, LI: &LI, MSSAU); |
924 | rewritePHINodesForExitAndUnswitchedBlocks(ExitBB&: *ExitBB, UnswitchedBB&: *SplitExitBB, |
925 | OldExitingBB&: *ParentBB, OldPH&: *OldPH, |
926 | /*FullUnswitch*/ true); |
927 | } |
928 | // Update the case pair to point to the split block. |
929 | std::get<1>(t&: ExitCase) = SplitExitBB; |
930 | } |
931 | |
932 | // Now add the unswitched cases. We do this in reverse order as we built them |
933 | // in reverse order. |
934 | for (auto &ExitCase : reverse(C&: ExitCases)) { |
935 | ConstantInt *CaseVal = std::get<0>(t&: ExitCase); |
936 | BasicBlock *UnswitchedBB = std::get<1>(t&: ExitCase); |
937 | |
938 | NewSIW.addCase(OnVal: CaseVal, Dest: UnswitchedBB, W: std::get<2>(t&: ExitCase)); |
939 | } |
940 | |
941 | // If the default was unswitched, re-point it and add explicit cases for |
942 | // entering the loop. |
943 | if (DefaultExitBB) { |
944 | NewSIW->setDefaultDest(DefaultExitBB); |
945 | NewSIW.setSuccessorWeight(idx: 0, W: DefaultCaseWeight); |
946 | |
947 | // We removed all the exit cases, so we just copy the cases to the |
948 | // unswitched switch. |
949 | for (const auto &Case : SI.cases()) |
950 | NewSIW.addCase(OnVal: Case.getCaseValue(), Dest: NewPH, |
951 | W: SIW.getSuccessorWeight(idx: Case.getSuccessorIndex())); |
952 | } else if (DefaultCaseWeight) { |
953 | // We have to set branch weight of the default case. |
954 | uint64_t SW = *DefaultCaseWeight; |
955 | for (const auto &Case : SI.cases()) { |
956 | auto W = SIW.getSuccessorWeight(idx: Case.getSuccessorIndex()); |
957 | assert(W && |
958 | "case weight must be defined as default case weight is defined" ); |
959 | SW += *W; |
960 | } |
961 | NewSIW.setSuccessorWeight(idx: 0, W: SW); |
962 | } |
963 | |
964 | // If we ended up with a common successor for every path through the switch |
965 | // after unswitching, rewrite it to an unconditional branch to make it easy |
966 | // to recognize. Otherwise we potentially have to recognize the default case |
967 | // pointing at unreachable and other complexity. |
968 | if (CommonSuccBB) { |
969 | BasicBlock *BB = SI.getParent(); |
970 | // We may have had multiple edges to this common successor block, so remove |
971 | // them as predecessors. We skip the first one, either the default or the |
972 | // actual first case. |
973 | bool SkippedFirst = DefaultExitBB == nullptr; |
974 | for (auto Case : SI.cases()) { |
975 | assert(Case.getCaseSuccessor() == CommonSuccBB && |
976 | "Non-common successor!" ); |
977 | (void)Case; |
978 | if (!SkippedFirst) { |
979 | SkippedFirst = true; |
980 | continue; |
981 | } |
982 | CommonSuccBB->removePredecessor(Pred: BB, |
983 | /*KeepOneInputPHIs*/ true); |
984 | } |
985 | // Now nuke the switch and replace it with a direct branch. |
986 | Instruction *NewBI = BranchInst::Create(IfTrue: CommonSuccBB, InsertBefore: BB); |
987 | NewBI->setDebugLoc(SIW->getDebugLoc()); |
988 | SIW.eraseFromParent(); |
989 | } else if (DefaultExitBB) { |
990 | assert(SI.getNumCases() > 0 && |
991 | "If we had no cases we'd have a common successor!" ); |
992 | // Move the last case to the default successor. This is valid as if the |
993 | // default got unswitched it cannot be reached. This has the advantage of |
994 | // being simple and keeping the number of edges from this switch to |
995 | // successors the same, and avoiding any PHI update complexity. |
996 | auto LastCaseI = std::prev(x: SI.case_end()); |
997 | |
998 | SI.setDefaultDest(LastCaseI->getCaseSuccessor()); |
999 | SIW.setSuccessorWeight( |
1000 | idx: 0, W: SIW.getSuccessorWeight(idx: LastCaseI->getSuccessorIndex())); |
1001 | SIW.removeCase(I: LastCaseI); |
1002 | } |
1003 | |
1004 | // Walk the unswitched exit blocks and the unswitched split blocks and update |
1005 | // the dominator tree based on the CFG edits. While we are walking unordered |
1006 | // containers here, the API for applyUpdates takes an unordered list of |
1007 | // updates and requires them to not contain duplicates. |
1008 | SmallVector<DominatorTree::UpdateType, 4> DTUpdates; |
1009 | for (auto *UnswitchedExitBB : UnswitchedExitBBs) { |
1010 | DTUpdates.push_back(Elt: {DT.Delete, ParentBB, UnswitchedExitBB}); |
1011 | DTUpdates.push_back(Elt: {DT.Insert, OldPH, UnswitchedExitBB}); |
1012 | } |
1013 | for (auto SplitUnswitchedPair : SplitExitBBMap) { |
1014 | DTUpdates.push_back(Elt: {DT.Delete, ParentBB, SplitUnswitchedPair.first}); |
1015 | DTUpdates.push_back(Elt: {DT.Insert, OldPH, SplitUnswitchedPair.second}); |
1016 | } |
1017 | |
1018 | if (MSSAU) { |
1019 | MSSAU->applyUpdates(Updates: DTUpdates, DT, /*UpdateDT=*/UpdateDTFirst: true); |
1020 | if (VerifyMemorySSA) |
1021 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
1022 | } else { |
1023 | DT.applyUpdates(Updates: DTUpdates); |
1024 | } |
1025 | |
1026 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
1027 | |
1028 | // We may have changed the nesting relationship for this loop so hoist it to |
1029 | // its correct parent if needed. |
1030 | hoistLoopToNewParent(L, Preheader&: *NewPH, DT, LI, MSSAU, SE); |
1031 | |
1032 | if (MSSAU && VerifyMemorySSA) |
1033 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
1034 | |
1035 | ++NumTrivial; |
1036 | ++NumSwitches; |
1037 | LLVM_DEBUG(dbgs() << " done: unswitching trivial switch...\n" ); |
1038 | return true; |
1039 | } |
1040 | |
1041 | /// This routine scans the loop to find a branch or switch which occurs before |
1042 | /// any side effects occur. These can potentially be unswitched without |
1043 | /// duplicating the loop. If a branch or switch is successfully unswitched the |
1044 | /// scanning continues to see if subsequent branches or switches have become |
1045 | /// trivial. Once all trivial candidates have been unswitched, this routine |
1046 | /// returns. |
1047 | /// |
1048 | /// The return value indicates whether anything was unswitched (and therefore |
1049 | /// changed). |
1050 | /// |
1051 | /// If `SE` is not null, it will be updated based on the potential loop SCEVs |
1052 | /// invalidated by this. |
1053 | static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, |
1054 | LoopInfo &LI, ScalarEvolution *SE, |
1055 | MemorySSAUpdater *MSSAU) { |
1056 | bool Changed = false; |
1057 | |
1058 | // If loop header has only one reachable successor we should keep looking for |
1059 | // trivial condition candidates in the successor as well. An alternative is |
1060 | // to constant fold conditions and merge successors into loop header (then we |
1061 | // only need to check header's terminator). The reason for not doing this in |
1062 | // LoopUnswitch pass is that it could potentially break LoopPassManager's |
1063 | // invariants. Folding dead branches could either eliminate the current loop |
1064 | // or make other loops unreachable. LCSSA form might also not be preserved |
1065 | // after deleting branches. The following code keeps traversing loop header's |
1066 | // successors until it finds the trivial condition candidate (condition that |
1067 | // is not a constant). Since unswitching generates branches with constant |
1068 | // conditions, this scenario could be very common in practice. |
1069 | BasicBlock *CurrentBB = L.getHeader(); |
1070 | SmallPtrSet<BasicBlock *, 8> Visited; |
1071 | Visited.insert(Ptr: CurrentBB); |
1072 | do { |
1073 | // Check if there are any side-effecting instructions (e.g. stores, calls, |
1074 | // volatile loads) in the part of the loop that the code *would* execute |
1075 | // without unswitching. |
1076 | if (MSSAU) // Possible early exit with MSSA |
1077 | if (auto *Defs = MSSAU->getMemorySSA()->getBlockDefs(BB: CurrentBB)) |
1078 | if (!isa<MemoryPhi>(Val: *Defs->begin()) || (++Defs->begin() != Defs->end())) |
1079 | return Changed; |
1080 | if (llvm::any_of(Range&: *CurrentBB, |
1081 | P: [](Instruction &I) { return I.mayHaveSideEffects(); })) |
1082 | return Changed; |
1083 | |
1084 | Instruction *CurrentTerm = CurrentBB->getTerminator(); |
1085 | |
1086 | if (auto *SI = dyn_cast<SwitchInst>(Val: CurrentTerm)) { |
1087 | // Don't bother trying to unswitch past a switch with a constant |
1088 | // condition. This should be removed prior to running this pass by |
1089 | // simplifycfg. |
1090 | if (isa<Constant>(Val: SI->getCondition())) |
1091 | return Changed; |
1092 | |
1093 | if (!unswitchTrivialSwitch(L, SI&: *SI, DT, LI, SE, MSSAU)) |
1094 | // Couldn't unswitch this one so we're done. |
1095 | return Changed; |
1096 | |
1097 | // Mark that we managed to unswitch something. |
1098 | Changed = true; |
1099 | |
1100 | // If unswitching turned the terminator into an unconditional branch then |
1101 | // we can continue. The unswitching logic specifically works to fold any |
1102 | // cases it can into an unconditional branch to make it easier to |
1103 | // recognize here. |
1104 | auto *BI = dyn_cast<BranchInst>(Val: CurrentBB->getTerminator()); |
1105 | if (!BI || BI->isConditional()) |
1106 | return Changed; |
1107 | |
1108 | CurrentBB = BI->getSuccessor(i: 0); |
1109 | continue; |
1110 | } |
1111 | |
1112 | auto *BI = dyn_cast<BranchInst>(Val: CurrentTerm); |
1113 | if (!BI) |
1114 | // We do not understand other terminator instructions. |
1115 | return Changed; |
1116 | |
1117 | // Don't bother trying to unswitch past an unconditional branch or a branch |
1118 | // with a constant value. These should be removed by simplifycfg prior to |
1119 | // running this pass. |
1120 | if (!BI->isConditional() || |
1121 | isa<Constant>(Val: skipTrivialSelect(Cond: BI->getCondition()))) |
1122 | return Changed; |
1123 | |
1124 | // Found a trivial condition candidate: non-foldable conditional branch. If |
1125 | // we fail to unswitch this, we can't do anything else that is trivial. |
1126 | if (!unswitchTrivialBranch(L, BI&: *BI, DT, LI, SE, MSSAU)) |
1127 | return Changed; |
1128 | |
1129 | // Mark that we managed to unswitch something. |
1130 | Changed = true; |
1131 | |
1132 | // If we only unswitched some of the conditions feeding the branch, we won't |
1133 | // have collapsed it to a single successor. |
1134 | BI = cast<BranchInst>(Val: CurrentBB->getTerminator()); |
1135 | if (BI->isConditional()) |
1136 | return Changed; |
1137 | |
1138 | // Follow the newly unconditional branch into its successor. |
1139 | CurrentBB = BI->getSuccessor(i: 0); |
1140 | |
1141 | // When continuing, if we exit the loop or reach a previous visited block, |
1142 | // then we can not reach any trivial condition candidates (unfoldable |
1143 | // branch instructions or switch instructions) and no unswitch can happen. |
1144 | } while (L.contains(BB: CurrentBB) && Visited.insert(Ptr: CurrentBB).second); |
1145 | |
1146 | return Changed; |
1147 | } |
1148 | |
1149 | /// Build the cloned blocks for an unswitched copy of the given loop. |
1150 | /// |
1151 | /// The cloned blocks are inserted before the loop preheader (`LoopPH`) and |
1152 | /// after the split block (`SplitBB`) that will be used to select between the |
1153 | /// cloned and original loop. |
1154 | /// |
1155 | /// This routine handles cloning all of the necessary loop blocks and exit |
1156 | /// blocks including rewriting their instructions and the relevant PHI nodes. |
1157 | /// Any loop blocks or exit blocks which are dominated by a different successor |
1158 | /// than the one for this clone of the loop blocks can be trivially skipped. We |
1159 | /// use the `DominatingSucc` map to determine whether a block satisfies that |
1160 | /// property with a simple map lookup. |
1161 | /// |
1162 | /// It also correctly creates the unconditional branch in the cloned |
1163 | /// unswitched parent block to only point at the unswitched successor. |
1164 | /// |
1165 | /// This does not handle most of the necessary updates to `LoopInfo`. Only exit |
1166 | /// block splitting is correctly reflected in `LoopInfo`, essentially all of |
1167 | /// the cloned blocks (and their loops) are left without full `LoopInfo` |
1168 | /// updates. This also doesn't fully update `DominatorTree`. It adds the cloned |
1169 | /// blocks to them but doesn't create the cloned `DominatorTree` structure and |
1170 | /// instead the caller must recompute an accurate DT. It *does* correctly |
1171 | /// update the `AssumptionCache` provided in `AC`. |
1172 | static BasicBlock *buildClonedLoopBlocks( |
1173 | Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB, |
1174 | ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB, |
1175 | BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB, |
1176 | const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc, |
1177 | ValueToValueMapTy &VMap, |
1178 | SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC, |
1179 | DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, |
1180 | ScalarEvolution *SE) { |
1181 | SmallVector<BasicBlock *, 4> NewBlocks; |
1182 | NewBlocks.reserve(N: L.getNumBlocks() + ExitBlocks.size()); |
1183 | |
1184 | // We will need to clone a bunch of blocks, wrap up the clone operation in |
1185 | // a helper. |
1186 | auto CloneBlock = [&](BasicBlock *OldBB) { |
1187 | // Clone the basic block and insert it before the new preheader. |
1188 | BasicBlock *NewBB = CloneBasicBlock(BB: OldBB, VMap, NameSuffix: ".us" , F: OldBB->getParent()); |
1189 | NewBB->moveBefore(MovePos: LoopPH); |
1190 | |
1191 | // Record this block and the mapping. |
1192 | NewBlocks.push_back(Elt: NewBB); |
1193 | VMap[OldBB] = NewBB; |
1194 | |
1195 | return NewBB; |
1196 | }; |
1197 | |
1198 | // We skip cloning blocks when they have a dominating succ that is not the |
1199 | // succ we are cloning for. |
1200 | auto SkipBlock = [&](BasicBlock *BB) { |
1201 | auto It = DominatingSucc.find(Val: BB); |
1202 | return It != DominatingSucc.end() && It->second != UnswitchedSuccBB; |
1203 | }; |
1204 | |
1205 | // First, clone the preheader. |
1206 | auto *ClonedPH = CloneBlock(LoopPH); |
1207 | |
1208 | // Then clone all the loop blocks, skipping the ones that aren't necessary. |
1209 | for (auto *LoopBB : L.blocks()) |
1210 | if (!SkipBlock(LoopBB)) |
1211 | CloneBlock(LoopBB); |
1212 | |
1213 | // Split all the loop exit edges so that when we clone the exit blocks, if |
1214 | // any of the exit blocks are *also* a preheader for some other loop, we |
1215 | // don't create multiple predecessors entering the loop header. |
1216 | for (auto *ExitBB : ExitBlocks) { |
1217 | if (SkipBlock(ExitBB)) |
1218 | continue; |
1219 | |
1220 | // When we are going to clone an exit, we don't need to clone all the |
1221 | // instructions in the exit block and we want to ensure we have an easy |
1222 | // place to merge the CFG, so split the exit first. This is always safe to |
1223 | // do because there cannot be any non-loop predecessors of a loop exit in |
1224 | // loop simplified form. |
1225 | auto *MergeBB = SplitBlock(Old: ExitBB, SplitPt: ExitBB->begin(), DT: &DT, LI: &LI, MSSAU); |
1226 | |
1227 | // Rearrange the names to make it easier to write test cases by having the |
1228 | // exit block carry the suffix rather than the merge block carrying the |
1229 | // suffix. |
1230 | MergeBB->takeName(V: ExitBB); |
1231 | ExitBB->setName(Twine(MergeBB->getName()) + ".split" ); |
1232 | |
1233 | // Now clone the original exit block. |
1234 | auto *ClonedExitBB = CloneBlock(ExitBB); |
1235 | assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 && |
1236 | "Exit block should have been split to have one successor!" ); |
1237 | assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB && |
1238 | "Cloned exit block has the wrong successor!" ); |
1239 | |
1240 | // Remap any cloned instructions and create a merge phi node for them. |
1241 | for (auto ZippedInsts : llvm::zip_first( |
1242 | t: llvm::make_range(x: ExitBB->begin(), y: std::prev(x: ExitBB->end())), |
1243 | u: llvm::make_range(x: ClonedExitBB->begin(), |
1244 | y: std::prev(x: ClonedExitBB->end())))) { |
1245 | Instruction &I = std::get<0>(t&: ZippedInsts); |
1246 | Instruction &ClonedI = std::get<1>(t&: ZippedInsts); |
1247 | |
1248 | // The only instructions in the exit block should be PHI nodes and |
1249 | // potentially a landing pad. |
1250 | assert( |
1251 | (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) && |
1252 | "Bad instruction in exit block!" ); |
1253 | // We should have a value map between the instruction and its clone. |
1254 | assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!" ); |
1255 | |
1256 | // Forget SCEVs based on exit phis in case SCEV looked through the phi. |
1257 | if (SE) |
1258 | if (auto *PN = dyn_cast<PHINode>(Val: &I)) |
1259 | SE->forgetLcssaPhiWithNewPredecessor(L: &L, V: PN); |
1260 | |
1261 | BasicBlock::iterator InsertPt = MergeBB->getFirstInsertionPt(); |
1262 | |
1263 | auto *MergePN = |
1264 | PHINode::Create(Ty: I.getType(), /*NumReservedValues*/ 2, NameStr: ".us-phi" ); |
1265 | MergePN->insertBefore(InsertPos: InsertPt); |
1266 | MergePN->setDebugLoc(InsertPt->getDebugLoc()); |
1267 | I.replaceAllUsesWith(V: MergePN); |
1268 | MergePN->addIncoming(V: &I, BB: ExitBB); |
1269 | MergePN->addIncoming(V: &ClonedI, BB: ClonedExitBB); |
1270 | } |
1271 | } |
1272 | |
1273 | // Rewrite the instructions in the cloned blocks to refer to the instructions |
1274 | // in the cloned blocks. We have to do this as a second pass so that we have |
1275 | // everything available. Also, we have inserted new instructions which may |
1276 | // include assume intrinsics, so we update the assumption cache while |
1277 | // processing this. |
1278 | Module *M = ClonedPH->getParent()->getParent(); |
1279 | for (auto *ClonedBB : NewBlocks) |
1280 | for (Instruction &I : *ClonedBB) { |
1281 | RemapDbgRecordRange(M, Range: I.getDbgRecordRange(), VM&: VMap, |
1282 | Flags: RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); |
1283 | RemapInstruction(I: &I, VM&: VMap, |
1284 | Flags: RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); |
1285 | if (auto *II = dyn_cast<AssumeInst>(Val: &I)) |
1286 | AC.registerAssumption(CI: II); |
1287 | } |
1288 | |
1289 | // Update any PHI nodes in the cloned successors of the skipped blocks to not |
1290 | // have spurious incoming values. |
1291 | for (auto *LoopBB : L.blocks()) |
1292 | if (SkipBlock(LoopBB)) |
1293 | for (auto *SuccBB : successors(BB: LoopBB)) |
1294 | if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: SuccBB))) |
1295 | for (PHINode &PN : ClonedSuccBB->phis()) |
1296 | PN.removeIncomingValue(BB: LoopBB, /*DeletePHIIfEmpty*/ false); |
1297 | |
1298 | // Remove the cloned parent as a predecessor of any successor we ended up |
1299 | // cloning other than the unswitched one. |
1300 | auto *ClonedParentBB = cast<BasicBlock>(Val: VMap.lookup(Val: ParentBB)); |
1301 | for (auto *SuccBB : successors(BB: ParentBB)) { |
1302 | if (SuccBB == UnswitchedSuccBB) |
1303 | continue; |
1304 | |
1305 | auto *ClonedSuccBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: SuccBB)); |
1306 | if (!ClonedSuccBB) |
1307 | continue; |
1308 | |
1309 | ClonedSuccBB->removePredecessor(Pred: ClonedParentBB, |
1310 | /*KeepOneInputPHIs*/ true); |
1311 | } |
1312 | |
1313 | // Replace the cloned branch with an unconditional branch to the cloned |
1314 | // unswitched successor. |
1315 | auto *ClonedSuccBB = cast<BasicBlock>(Val: VMap.lookup(Val: UnswitchedSuccBB)); |
1316 | Instruction *ClonedTerminator = ClonedParentBB->getTerminator(); |
1317 | // Trivial Simplification. If Terminator is a conditional branch and |
1318 | // condition becomes dead - erase it. |
1319 | Value *ClonedConditionToErase = nullptr; |
1320 | if (auto *BI = dyn_cast<BranchInst>(Val: ClonedTerminator)) |
1321 | ClonedConditionToErase = BI->getCondition(); |
1322 | else if (auto *SI = dyn_cast<SwitchInst>(Val: ClonedTerminator)) |
1323 | ClonedConditionToErase = SI->getCondition(); |
1324 | |
1325 | Instruction *BI = BranchInst::Create(IfTrue: ClonedSuccBB, InsertBefore: ClonedParentBB); |
1326 | BI->setDebugLoc(ClonedTerminator->getDebugLoc()); |
1327 | ClonedTerminator->eraseFromParent(); |
1328 | |
1329 | if (ClonedConditionToErase) |
1330 | RecursivelyDeleteTriviallyDeadInstructions(V: ClonedConditionToErase, TLI: nullptr, |
1331 | MSSAU); |
1332 | |
1333 | // If there are duplicate entries in the PHI nodes because of multiple edges |
1334 | // to the unswitched successor, we need to nuke all but one as we replaced it |
1335 | // with a direct branch. |
1336 | for (PHINode &PN : ClonedSuccBB->phis()) { |
1337 | bool Found = false; |
1338 | // Loop over the incoming operands backwards so we can easily delete as we |
1339 | // go without invalidating the index. |
1340 | for (int i = PN.getNumOperands() - 1; i >= 0; --i) { |
1341 | if (PN.getIncomingBlock(i) != ClonedParentBB) |
1342 | continue; |
1343 | if (!Found) { |
1344 | Found = true; |
1345 | continue; |
1346 | } |
1347 | PN.removeIncomingValue(Idx: i, /*DeletePHIIfEmpty*/ false); |
1348 | } |
1349 | } |
1350 | |
1351 | // Record the domtree updates for the new blocks. |
1352 | SmallPtrSet<BasicBlock *, 4> SuccSet; |
1353 | for (auto *ClonedBB : NewBlocks) { |
1354 | for (auto *SuccBB : successors(BB: ClonedBB)) |
1355 | if (SuccSet.insert(Ptr: SuccBB).second) |
1356 | DTUpdates.push_back(Elt: {DominatorTree::Insert, ClonedBB, SuccBB}); |
1357 | SuccSet.clear(); |
1358 | } |
1359 | |
1360 | return ClonedPH; |
1361 | } |
1362 | |
1363 | /// Recursively clone the specified loop and all of its children. |
1364 | /// |
1365 | /// The target parent loop for the clone should be provided, or can be null if |
1366 | /// the clone is a top-level loop. While cloning, all the blocks are mapped |
1367 | /// with the provided value map. The entire original loop must be present in |
1368 | /// the value map. The cloned loop is returned. |
1369 | static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL, |
1370 | const ValueToValueMapTy &VMap, LoopInfo &LI) { |
1371 | auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) { |
1372 | assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!" ); |
1373 | ClonedL.reserveBlocks(size: OrigL.getNumBlocks()); |
1374 | for (auto *BB : OrigL.blocks()) { |
1375 | auto *ClonedBB = cast<BasicBlock>(Val: VMap.lookup(Val: BB)); |
1376 | ClonedL.addBlockEntry(BB: ClonedBB); |
1377 | if (LI.getLoopFor(BB) == &OrigL) |
1378 | LI.changeLoopFor(BB: ClonedBB, L: &ClonedL); |
1379 | } |
1380 | }; |
1381 | |
1382 | // We specially handle the first loop because it may get cloned into |
1383 | // a different parent and because we most commonly are cloning leaf loops. |
1384 | Loop *ClonedRootL = LI.AllocateLoop(); |
1385 | if (RootParentL) |
1386 | RootParentL->addChildLoop(NewChild: ClonedRootL); |
1387 | else |
1388 | LI.addTopLevelLoop(New: ClonedRootL); |
1389 | AddClonedBlocksToLoop(OrigRootL, *ClonedRootL); |
1390 | |
1391 | if (OrigRootL.isInnermost()) |
1392 | return ClonedRootL; |
1393 | |
1394 | // If we have a nest, we can quickly clone the entire loop nest using an |
1395 | // iterative approach because it is a tree. We keep the cloned parent in the |
1396 | // data structure to avoid repeatedly querying through a map to find it. |
1397 | SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone; |
1398 | // Build up the loops to clone in reverse order as we'll clone them from the |
1399 | // back. |
1400 | for (Loop *ChildL : llvm::reverse(C&: OrigRootL)) |
1401 | LoopsToClone.push_back(Elt: {ClonedRootL, ChildL}); |
1402 | do { |
1403 | Loop *ClonedParentL, *L; |
1404 | std::tie(args&: ClonedParentL, args&: L) = LoopsToClone.pop_back_val(); |
1405 | Loop *ClonedL = LI.AllocateLoop(); |
1406 | ClonedParentL->addChildLoop(NewChild: ClonedL); |
1407 | AddClonedBlocksToLoop(*L, *ClonedL); |
1408 | for (Loop *ChildL : llvm::reverse(C&: *L)) |
1409 | LoopsToClone.push_back(Elt: {ClonedL, ChildL}); |
1410 | } while (!LoopsToClone.empty()); |
1411 | |
1412 | return ClonedRootL; |
1413 | } |
1414 | |
1415 | /// Build the cloned loops of an original loop from unswitching. |
1416 | /// |
1417 | /// Because unswitching simplifies the CFG of the loop, this isn't a trivial |
1418 | /// operation. We need to re-verify that there even is a loop (as the backedge |
1419 | /// may not have been cloned), and even if there are remaining backedges the |
1420 | /// backedge set may be different. However, we know that each child loop is |
1421 | /// undisturbed, we only need to find where to place each child loop within |
1422 | /// either any parent loop or within a cloned version of the original loop. |
1423 | /// |
1424 | /// Because child loops may end up cloned outside of any cloned version of the |
1425 | /// original loop, multiple cloned sibling loops may be created. All of them |
1426 | /// are returned so that the newly introduced loop nest roots can be |
1427 | /// identified. |
1428 | static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks, |
1429 | const ValueToValueMapTy &VMap, LoopInfo &LI, |
1430 | SmallVectorImpl<Loop *> &NonChildClonedLoops) { |
1431 | Loop *ClonedL = nullptr; |
1432 | |
1433 | auto *OrigPH = OrigL.getLoopPreheader(); |
1434 | auto * = OrigL.getHeader(); |
1435 | |
1436 | auto *ClonedPH = cast<BasicBlock>(Val: VMap.lookup(Val: OrigPH)); |
1437 | auto * = cast<BasicBlock>(Val: VMap.lookup(Val: OrigHeader)); |
1438 | |
1439 | // We need to know the loops of the cloned exit blocks to even compute the |
1440 | // accurate parent loop. If we only clone exits to some parent of the |
1441 | // original parent, we want to clone into that outer loop. We also keep track |
1442 | // of the loops that our cloned exit blocks participate in. |
1443 | Loop *ParentL = nullptr; |
1444 | SmallVector<BasicBlock *, 4> ClonedExitsInLoops; |
1445 | SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap; |
1446 | ClonedExitsInLoops.reserve(N: ExitBlocks.size()); |
1447 | for (auto *ExitBB : ExitBlocks) |
1448 | if (auto *ClonedExitBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: ExitBB))) |
1449 | if (Loop *ExitL = LI.getLoopFor(BB: ExitBB)) { |
1450 | ExitLoopMap[ClonedExitBB] = ExitL; |
1451 | ClonedExitsInLoops.push_back(Elt: ClonedExitBB); |
1452 | if (!ParentL || (ParentL != ExitL && ParentL->contains(L: ExitL))) |
1453 | ParentL = ExitL; |
1454 | } |
1455 | assert((!ParentL || ParentL == OrigL.getParentLoop() || |
1456 | ParentL->contains(OrigL.getParentLoop())) && |
1457 | "The computed parent loop should always contain (or be) the parent of " |
1458 | "the original loop." ); |
1459 | |
1460 | // We build the set of blocks dominated by the cloned header from the set of |
1461 | // cloned blocks out of the original loop. While not all of these will |
1462 | // necessarily be in the cloned loop, it is enough to establish that they |
1463 | // aren't in unreachable cycles, etc. |
1464 | SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks; |
1465 | for (auto *BB : OrigL.blocks()) |
1466 | if (auto *ClonedBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: BB))) |
1467 | ClonedLoopBlocks.insert(X: ClonedBB); |
1468 | |
1469 | // Rebuild the set of blocks that will end up in the cloned loop. We may have |
1470 | // skipped cloning some region of this loop which can in turn skip some of |
1471 | // the backedges so we have to rebuild the blocks in the loop based on the |
1472 | // backedges that remain after cloning. |
1473 | SmallVector<BasicBlock *, 16> Worklist; |
1474 | SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop; |
1475 | for (auto *Pred : predecessors(BB: ClonedHeader)) { |
1476 | // The only possible non-loop header predecessor is the preheader because |
1477 | // we know we cloned the loop in simplified form. |
1478 | if (Pred == ClonedPH) |
1479 | continue; |
1480 | |
1481 | // Because the loop was in simplified form, the only non-loop predecessor |
1482 | // should be the preheader. |
1483 | assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop " |
1484 | "header other than the preheader " |
1485 | "that is not part of the loop!" ); |
1486 | |
1487 | // Insert this block into the loop set and on the first visit (and if it |
1488 | // isn't the header we're currently walking) put it into the worklist to |
1489 | // recurse through. |
1490 | if (BlocksInClonedLoop.insert(Ptr: Pred).second && Pred != ClonedHeader) |
1491 | Worklist.push_back(Elt: Pred); |
1492 | } |
1493 | |
1494 | // If we had any backedges then there *is* a cloned loop. Put the header into |
1495 | // the loop set and then walk the worklist backwards to find all the blocks |
1496 | // that remain within the loop after cloning. |
1497 | if (!BlocksInClonedLoop.empty()) { |
1498 | BlocksInClonedLoop.insert(Ptr: ClonedHeader); |
1499 | |
1500 | while (!Worklist.empty()) { |
1501 | BasicBlock *BB = Worklist.pop_back_val(); |
1502 | assert(BlocksInClonedLoop.count(BB) && |
1503 | "Didn't put block into the loop set!" ); |
1504 | |
1505 | // Insert any predecessors that are in the possible set into the cloned |
1506 | // set, and if the insert is successful, add them to the worklist. Note |
1507 | // that we filter on the blocks that are definitely reachable via the |
1508 | // backedge to the loop header so we may prune out dead code within the |
1509 | // cloned loop. |
1510 | for (auto *Pred : predecessors(BB)) |
1511 | if (ClonedLoopBlocks.count(key: Pred) && |
1512 | BlocksInClonedLoop.insert(Ptr: Pred).second) |
1513 | Worklist.push_back(Elt: Pred); |
1514 | } |
1515 | |
1516 | ClonedL = LI.AllocateLoop(); |
1517 | if (ParentL) { |
1518 | ParentL->addBasicBlockToLoop(NewBB: ClonedPH, LI); |
1519 | ParentL->addChildLoop(NewChild: ClonedL); |
1520 | } else { |
1521 | LI.addTopLevelLoop(New: ClonedL); |
1522 | } |
1523 | NonChildClonedLoops.push_back(Elt: ClonedL); |
1524 | |
1525 | ClonedL->reserveBlocks(size: BlocksInClonedLoop.size()); |
1526 | // We don't want to just add the cloned loop blocks based on how we |
1527 | // discovered them. The original order of blocks was carefully built in |
1528 | // a way that doesn't rely on predecessor ordering. Rather than re-invent |
1529 | // that logic, we just re-walk the original blocks (and those of the child |
1530 | // loops) and filter them as we add them into the cloned loop. |
1531 | for (auto *BB : OrigL.blocks()) { |
1532 | auto *ClonedBB = cast_or_null<BasicBlock>(Val: VMap.lookup(Val: BB)); |
1533 | if (!ClonedBB || !BlocksInClonedLoop.count(Ptr: ClonedBB)) |
1534 | continue; |
1535 | |
1536 | // Directly add the blocks that are only in this loop. |
1537 | if (LI.getLoopFor(BB) == &OrigL) { |
1538 | ClonedL->addBasicBlockToLoop(NewBB: ClonedBB, LI); |
1539 | continue; |
1540 | } |
1541 | |
1542 | // We want to manually add it to this loop and parents. |
1543 | // Registering it with LoopInfo will happen when we clone the top |
1544 | // loop for this block. |
1545 | for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop()) |
1546 | PL->addBlockEntry(BB: ClonedBB); |
1547 | } |
1548 | |
1549 | // Now add each child loop whose header remains within the cloned loop. All |
1550 | // of the blocks within the loop must satisfy the same constraints as the |
1551 | // header so once we pass the header checks we can just clone the entire |
1552 | // child loop nest. |
1553 | for (Loop *ChildL : OrigL) { |
1554 | auto * = |
1555 | cast_or_null<BasicBlock>(Val: VMap.lookup(Val: ChildL->getHeader())); |
1556 | if (!ClonedChildHeader || !BlocksInClonedLoop.count(Ptr: ClonedChildHeader)) |
1557 | continue; |
1558 | |
1559 | #ifndef NDEBUG |
1560 | // We should never have a cloned child loop header but fail to have |
1561 | // all of the blocks for that child loop. |
1562 | for (auto *ChildLoopBB : ChildL->blocks()) |
1563 | assert(BlocksInClonedLoop.count( |
1564 | cast<BasicBlock>(VMap.lookup(ChildLoopBB))) && |
1565 | "Child cloned loop has a header within the cloned outer " |
1566 | "loop but not all of its blocks!" ); |
1567 | #endif |
1568 | |
1569 | cloneLoopNest(OrigRootL&: *ChildL, RootParentL: ClonedL, VMap, LI); |
1570 | } |
1571 | } |
1572 | |
1573 | // Now that we've handled all the components of the original loop that were |
1574 | // cloned into a new loop, we still need to handle anything from the original |
1575 | // loop that wasn't in a cloned loop. |
1576 | |
1577 | // Figure out what blocks are left to place within any loop nest containing |
1578 | // the unswitched loop. If we never formed a loop, the cloned PH is one of |
1579 | // them. |
1580 | SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet; |
1581 | if (BlocksInClonedLoop.empty()) |
1582 | UnloopedBlockSet.insert(Ptr: ClonedPH); |
1583 | for (auto *ClonedBB : ClonedLoopBlocks) |
1584 | if (!BlocksInClonedLoop.count(Ptr: ClonedBB)) |
1585 | UnloopedBlockSet.insert(Ptr: ClonedBB); |
1586 | |
1587 | // Copy the cloned exits and sort them in ascending loop depth, we'll work |
1588 | // backwards across these to process them inside out. The order shouldn't |
1589 | // matter as we're just trying to build up the map from inside-out; we use |
1590 | // the map in a more stably ordered way below. |
1591 | auto OrderedClonedExitsInLoops = ClonedExitsInLoops; |
1592 | llvm::sort(C&: OrderedClonedExitsInLoops, Comp: [&](BasicBlock *LHS, BasicBlock *RHS) { |
1593 | return ExitLoopMap.lookup(Val: LHS)->getLoopDepth() < |
1594 | ExitLoopMap.lookup(Val: RHS)->getLoopDepth(); |
1595 | }); |
1596 | |
1597 | // Populate the existing ExitLoopMap with everything reachable from each |
1598 | // exit, starting from the inner most exit. |
1599 | while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) { |
1600 | assert(Worklist.empty() && "Didn't clear worklist!" ); |
1601 | |
1602 | BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val(); |
1603 | Loop *ExitL = ExitLoopMap.lookup(Val: ExitBB); |
1604 | |
1605 | // Walk the CFG back until we hit the cloned PH adding everything reachable |
1606 | // and in the unlooped set to this exit block's loop. |
1607 | Worklist.push_back(Elt: ExitBB); |
1608 | do { |
1609 | BasicBlock *BB = Worklist.pop_back_val(); |
1610 | // We can stop recursing at the cloned preheader (if we get there). |
1611 | if (BB == ClonedPH) |
1612 | continue; |
1613 | |
1614 | for (BasicBlock *PredBB : predecessors(BB)) { |
1615 | // If this pred has already been moved to our set or is part of some |
1616 | // (inner) loop, no update needed. |
1617 | if (!UnloopedBlockSet.erase(Ptr: PredBB)) { |
1618 | assert( |
1619 | (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) && |
1620 | "Predecessor not mapped to a loop!" ); |
1621 | continue; |
1622 | } |
1623 | |
1624 | // We just insert into the loop set here. We'll add these blocks to the |
1625 | // exit loop after we build up the set in an order that doesn't rely on |
1626 | // predecessor order (which in turn relies on use list order). |
1627 | bool Inserted = ExitLoopMap.insert(KV: {PredBB, ExitL}).second; |
1628 | (void)Inserted; |
1629 | assert(Inserted && "Should only visit an unlooped block once!" ); |
1630 | |
1631 | // And recurse through to its predecessors. |
1632 | Worklist.push_back(Elt: PredBB); |
1633 | } |
1634 | } while (!Worklist.empty()); |
1635 | } |
1636 | |
1637 | // Now that the ExitLoopMap gives as mapping for all the non-looping cloned |
1638 | // blocks to their outer loops, walk the cloned blocks and the cloned exits |
1639 | // in their original order adding them to the correct loop. |
1640 | |
1641 | // We need a stable insertion order. We use the order of the original loop |
1642 | // order and map into the correct parent loop. |
1643 | for (auto *BB : llvm::concat<BasicBlock *const>( |
1644 | Ranges: ArrayRef(ClonedPH), Ranges&: ClonedLoopBlocks, Ranges&: ClonedExitsInLoops)) |
1645 | if (Loop *OuterL = ExitLoopMap.lookup(Val: BB)) |
1646 | OuterL->addBasicBlockToLoop(NewBB: BB, LI); |
1647 | |
1648 | #ifndef NDEBUG |
1649 | for (auto &BBAndL : ExitLoopMap) { |
1650 | auto *BB = BBAndL.first; |
1651 | auto *OuterL = BBAndL.second; |
1652 | assert(LI.getLoopFor(BB) == OuterL && |
1653 | "Failed to put all blocks into outer loops!" ); |
1654 | } |
1655 | #endif |
1656 | |
1657 | // Now that all the blocks are placed into the correct containing loop in the |
1658 | // absence of child loops, find all the potentially cloned child loops and |
1659 | // clone them into whatever outer loop we placed their header into. |
1660 | for (Loop *ChildL : OrigL) { |
1661 | auto * = |
1662 | cast_or_null<BasicBlock>(Val: VMap.lookup(Val: ChildL->getHeader())); |
1663 | if (!ClonedChildHeader || BlocksInClonedLoop.count(Ptr: ClonedChildHeader)) |
1664 | continue; |
1665 | |
1666 | #ifndef NDEBUG |
1667 | for (auto *ChildLoopBB : ChildL->blocks()) |
1668 | assert(VMap.count(ChildLoopBB) && |
1669 | "Cloned a child loop header but not all of that loops blocks!" ); |
1670 | #endif |
1671 | |
1672 | NonChildClonedLoops.push_back(Elt: cloneLoopNest( |
1673 | OrigRootL&: *ChildL, RootParentL: ExitLoopMap.lookup(Val: ClonedChildHeader), VMap, LI)); |
1674 | } |
1675 | } |
1676 | |
1677 | static void |
1678 | deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, |
1679 | ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps, |
1680 | DominatorTree &DT, MemorySSAUpdater *MSSAU) { |
1681 | // Find all the dead clones, and remove them from their successors. |
1682 | SmallVector<BasicBlock *, 16> DeadBlocks; |
1683 | for (BasicBlock *BB : llvm::concat<BasicBlock *const>(Ranges: L.blocks(), Ranges&: ExitBlocks)) |
1684 | for (const auto &VMap : VMaps) |
1685 | if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(Val: VMap->lookup(Val: BB))) |
1686 | if (!DT.isReachableFromEntry(A: ClonedBB)) { |
1687 | for (BasicBlock *SuccBB : successors(BB: ClonedBB)) |
1688 | SuccBB->removePredecessor(Pred: ClonedBB); |
1689 | DeadBlocks.push_back(Elt: ClonedBB); |
1690 | } |
1691 | |
1692 | // Remove all MemorySSA in the dead blocks |
1693 | if (MSSAU) { |
1694 | SmallSetVector<BasicBlock *, 8> DeadBlockSet(DeadBlocks.begin(), |
1695 | DeadBlocks.end()); |
1696 | MSSAU->removeBlocks(DeadBlocks: DeadBlockSet); |
1697 | } |
1698 | |
1699 | // Drop any remaining references to break cycles. |
1700 | for (BasicBlock *BB : DeadBlocks) |
1701 | BB->dropAllReferences(); |
1702 | // Erase them from the IR. |
1703 | for (BasicBlock *BB : DeadBlocks) |
1704 | BB->eraseFromParent(); |
1705 | } |
1706 | |
1707 | static void deleteDeadBlocksFromLoop(Loop &L, |
1708 | SmallVectorImpl<BasicBlock *> &ExitBlocks, |
1709 | DominatorTree &DT, LoopInfo &LI, |
1710 | MemorySSAUpdater *MSSAU, |
1711 | ScalarEvolution *SE, |
1712 | LPMUpdater &LoopUpdater) { |
1713 | // Find all the dead blocks tied to this loop, and remove them from their |
1714 | // successors. |
1715 | SmallSetVector<BasicBlock *, 8> DeadBlockSet; |
1716 | |
1717 | // Start with loop/exit blocks and get a transitive closure of reachable dead |
1718 | // blocks. |
1719 | SmallVector<BasicBlock *, 16> DeathCandidates(ExitBlocks.begin(), |
1720 | ExitBlocks.end()); |
1721 | DeathCandidates.append(in_start: L.blocks().begin(), in_end: L.blocks().end()); |
1722 | while (!DeathCandidates.empty()) { |
1723 | auto *BB = DeathCandidates.pop_back_val(); |
1724 | if (!DeadBlockSet.count(key: BB) && !DT.isReachableFromEntry(A: BB)) { |
1725 | for (BasicBlock *SuccBB : successors(BB)) { |
1726 | SuccBB->removePredecessor(Pred: BB); |
1727 | DeathCandidates.push_back(Elt: SuccBB); |
1728 | } |
1729 | DeadBlockSet.insert(X: BB); |
1730 | } |
1731 | } |
1732 | |
1733 | // Remove all MemorySSA in the dead blocks |
1734 | if (MSSAU) |
1735 | MSSAU->removeBlocks(DeadBlocks: DeadBlockSet); |
1736 | |
1737 | // Filter out the dead blocks from the exit blocks list so that it can be |
1738 | // used in the caller. |
1739 | llvm::erase_if(C&: ExitBlocks, |
1740 | P: [&](BasicBlock *BB) { return DeadBlockSet.count(key: BB); }); |
1741 | |
1742 | // Walk from this loop up through its parents removing all of the dead blocks. |
1743 | for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) { |
1744 | for (auto *BB : DeadBlockSet) |
1745 | ParentL->getBlocksSet().erase(Ptr: BB); |
1746 | llvm::erase_if(C&: ParentL->getBlocksVector(), |
1747 | P: [&](BasicBlock *BB) { return DeadBlockSet.count(key: BB); }); |
1748 | } |
1749 | |
1750 | // Now delete the dead child loops. This raw delete will clear them |
1751 | // recursively. |
1752 | llvm::erase_if(C&: L.getSubLoopsVector(), P: [&](Loop *ChildL) { |
1753 | if (!DeadBlockSet.count(key: ChildL->getHeader())) |
1754 | return false; |
1755 | |
1756 | assert(llvm::all_of(ChildL->blocks(), |
1757 | [&](BasicBlock *ChildBB) { |
1758 | return DeadBlockSet.count(ChildBB); |
1759 | }) && |
1760 | "If the child loop header is dead all blocks in the child loop must " |
1761 | "be dead as well!" ); |
1762 | LoopUpdater.markLoopAsDeleted(L&: *ChildL, Name: ChildL->getName()); |
1763 | if (SE) |
1764 | SE->forgetBlockAndLoopDispositions(); |
1765 | LI.destroy(L: ChildL); |
1766 | return true; |
1767 | }); |
1768 | |
1769 | // Remove the loop mappings for the dead blocks and drop all the references |
1770 | // from these blocks to others to handle cyclic references as we start |
1771 | // deleting the blocks themselves. |
1772 | for (auto *BB : DeadBlockSet) { |
1773 | // Check that the dominator tree has already been updated. |
1774 | assert(!DT.getNode(BB) && "Should already have cleared domtree!" ); |
1775 | LI.changeLoopFor(BB, L: nullptr); |
1776 | // Drop all uses of the instructions to make sure we won't have dangling |
1777 | // uses in other blocks. |
1778 | for (auto &I : *BB) |
1779 | if (!I.use_empty()) |
1780 | I.replaceAllUsesWith(V: PoisonValue::get(T: I.getType())); |
1781 | BB->dropAllReferences(); |
1782 | } |
1783 | |
1784 | // Actually delete the blocks now that they've been fully unhooked from the |
1785 | // IR. |
1786 | for (auto *BB : DeadBlockSet) |
1787 | BB->eraseFromParent(); |
1788 | } |
1789 | |
1790 | /// Recompute the set of blocks in a loop after unswitching. |
1791 | /// |
1792 | /// This walks from the original headers predecessors to rebuild the loop. We |
1793 | /// take advantage of the fact that new blocks can't have been added, and so we |
1794 | /// filter by the original loop's blocks. This also handles potentially |
1795 | /// unreachable code that we don't want to explore but might be found examining |
1796 | /// the predecessors of the header. |
1797 | /// |
1798 | /// If the original loop is no longer a loop, this will return an empty set. If |
1799 | /// it remains a loop, all the blocks within it will be added to the set |
1800 | /// (including those blocks in inner loops). |
1801 | static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L, |
1802 | LoopInfo &LI) { |
1803 | SmallPtrSet<const BasicBlock *, 16> LoopBlockSet; |
1804 | |
1805 | auto *PH = L.getLoopPreheader(); |
1806 | auto * = L.getHeader(); |
1807 | |
1808 | // A worklist to use while walking backwards from the header. |
1809 | SmallVector<BasicBlock *, 16> Worklist; |
1810 | |
1811 | // First walk the predecessors of the header to find the backedges. This will |
1812 | // form the basis of our walk. |
1813 | for (auto *Pred : predecessors(BB: Header)) { |
1814 | // Skip the preheader. |
1815 | if (Pred == PH) |
1816 | continue; |
1817 | |
1818 | // Because the loop was in simplified form, the only non-loop predecessor |
1819 | // is the preheader. |
1820 | assert(L.contains(Pred) && "Found a predecessor of the loop header other " |
1821 | "than the preheader that is not part of the " |
1822 | "loop!" ); |
1823 | |
1824 | // Insert this block into the loop set and on the first visit and, if it |
1825 | // isn't the header we're currently walking, put it into the worklist to |
1826 | // recurse through. |
1827 | if (LoopBlockSet.insert(Ptr: Pred).second && Pred != Header) |
1828 | Worklist.push_back(Elt: Pred); |
1829 | } |
1830 | |
1831 | // If no backedges were found, we're done. |
1832 | if (LoopBlockSet.empty()) |
1833 | return LoopBlockSet; |
1834 | |
1835 | // We found backedges, recurse through them to identify the loop blocks. |
1836 | while (!Worklist.empty()) { |
1837 | BasicBlock *BB = Worklist.pop_back_val(); |
1838 | assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!" ); |
1839 | |
1840 | // No need to walk past the header. |
1841 | if (BB == Header) |
1842 | continue; |
1843 | |
1844 | // Because we know the inner loop structure remains valid we can use the |
1845 | // loop structure to jump immediately across the entire nested loop. |
1846 | // Further, because it is in loop simplified form, we can directly jump |
1847 | // to its preheader afterward. |
1848 | if (Loop *InnerL = LI.getLoopFor(BB)) |
1849 | if (InnerL != &L) { |
1850 | assert(L.contains(InnerL) && |
1851 | "Should not reach a loop *outside* this loop!" ); |
1852 | // The preheader is the only possible predecessor of the loop so |
1853 | // insert it into the set and check whether it was already handled. |
1854 | auto *InnerPH = InnerL->getLoopPreheader(); |
1855 | assert(L.contains(InnerPH) && "Cannot contain an inner loop block " |
1856 | "but not contain the inner loop " |
1857 | "preheader!" ); |
1858 | if (!LoopBlockSet.insert(Ptr: InnerPH).second) |
1859 | // The only way to reach the preheader is through the loop body |
1860 | // itself so if it has been visited the loop is already handled. |
1861 | continue; |
1862 | |
1863 | // Insert all of the blocks (other than those already present) into |
1864 | // the loop set. We expect at least the block that led us to find the |
1865 | // inner loop to be in the block set, but we may also have other loop |
1866 | // blocks if they were already enqueued as predecessors of some other |
1867 | // outer loop block. |
1868 | for (auto *InnerBB : InnerL->blocks()) { |
1869 | if (InnerBB == BB) { |
1870 | assert(LoopBlockSet.count(InnerBB) && |
1871 | "Block should already be in the set!" ); |
1872 | continue; |
1873 | } |
1874 | |
1875 | LoopBlockSet.insert(Ptr: InnerBB); |
1876 | } |
1877 | |
1878 | // Add the preheader to the worklist so we will continue past the |
1879 | // loop body. |
1880 | Worklist.push_back(Elt: InnerPH); |
1881 | continue; |
1882 | } |
1883 | |
1884 | // Insert any predecessors that were in the original loop into the new |
1885 | // set, and if the insert is successful, add them to the worklist. |
1886 | for (auto *Pred : predecessors(BB)) |
1887 | if (L.contains(BB: Pred) && LoopBlockSet.insert(Ptr: Pred).second) |
1888 | Worklist.push_back(Elt: Pred); |
1889 | } |
1890 | |
1891 | assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!" ); |
1892 | |
1893 | // We've found all the blocks participating in the loop, return our completed |
1894 | // set. |
1895 | return LoopBlockSet; |
1896 | } |
1897 | |
1898 | /// Rebuild a loop after unswitching removes some subset of blocks and edges. |
1899 | /// |
1900 | /// The removal may have removed some child loops entirely but cannot have |
1901 | /// disturbed any remaining child loops. However, they may need to be hoisted |
1902 | /// to the parent loop (or to be top-level loops). The original loop may be |
1903 | /// completely removed. |
1904 | /// |
1905 | /// The sibling loops resulting from this update are returned. If the original |
1906 | /// loop remains a valid loop, it will be the first entry in this list with all |
1907 | /// of the newly sibling loops following it. |
1908 | /// |
1909 | /// Returns true if the loop remains a loop after unswitching, and false if it |
1910 | /// is no longer a loop after unswitching (and should not continue to be |
1911 | /// referenced). |
1912 | static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks, |
1913 | LoopInfo &LI, |
1914 | SmallVectorImpl<Loop *> &HoistedLoops, |
1915 | ScalarEvolution *SE) { |
1916 | auto *PH = L.getLoopPreheader(); |
1917 | |
1918 | // Compute the actual parent loop from the exit blocks. Because we may have |
1919 | // pruned some exits the loop may be different from the original parent. |
1920 | Loop *ParentL = nullptr; |
1921 | SmallVector<Loop *, 4> ExitLoops; |
1922 | SmallVector<BasicBlock *, 4> ExitsInLoops; |
1923 | ExitsInLoops.reserve(N: ExitBlocks.size()); |
1924 | for (auto *ExitBB : ExitBlocks) |
1925 | if (Loop *ExitL = LI.getLoopFor(BB: ExitBB)) { |
1926 | ExitLoops.push_back(Elt: ExitL); |
1927 | ExitsInLoops.push_back(Elt: ExitBB); |
1928 | if (!ParentL || (ParentL != ExitL && ParentL->contains(L: ExitL))) |
1929 | ParentL = ExitL; |
1930 | } |
1931 | |
1932 | // Recompute the blocks participating in this loop. This may be empty if it |
1933 | // is no longer a loop. |
1934 | auto LoopBlockSet = recomputeLoopBlockSet(L, LI); |
1935 | |
1936 | // If we still have a loop, we need to re-set the loop's parent as the exit |
1937 | // block set changing may have moved it within the loop nest. Note that this |
1938 | // can only happen when this loop has a parent as it can only hoist the loop |
1939 | // *up* the nest. |
1940 | if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) { |
1941 | // Remove this loop's (original) blocks from all of the intervening loops. |
1942 | for (Loop *IL = L.getParentLoop(); IL != ParentL; |
1943 | IL = IL->getParentLoop()) { |
1944 | IL->getBlocksSet().erase(Ptr: PH); |
1945 | for (auto *BB : L.blocks()) |
1946 | IL->getBlocksSet().erase(Ptr: BB); |
1947 | llvm::erase_if(C&: IL->getBlocksVector(), P: [&](BasicBlock *BB) { |
1948 | return BB == PH || L.contains(BB); |
1949 | }); |
1950 | } |
1951 | |
1952 | LI.changeLoopFor(BB: PH, L: ParentL); |
1953 | L.getParentLoop()->removeChildLoop(Child: &L); |
1954 | if (ParentL) |
1955 | ParentL->addChildLoop(NewChild: &L); |
1956 | else |
1957 | LI.addTopLevelLoop(New: &L); |
1958 | } |
1959 | |
1960 | // Now we update all the blocks which are no longer within the loop. |
1961 | auto &Blocks = L.getBlocksVector(); |
1962 | auto BlocksSplitI = |
1963 | LoopBlockSet.empty() |
1964 | ? Blocks.begin() |
1965 | : std::stable_partition( |
1966 | first: Blocks.begin(), last: Blocks.end(), |
1967 | pred: [&](BasicBlock *BB) { return LoopBlockSet.count(Ptr: BB); }); |
1968 | |
1969 | // Before we erase the list of unlooped blocks, build a set of them. |
1970 | SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end()); |
1971 | if (LoopBlockSet.empty()) |
1972 | UnloopedBlocks.insert(Ptr: PH); |
1973 | |
1974 | // Now erase these blocks from the loop. |
1975 | for (auto *BB : make_range(x: BlocksSplitI, y: Blocks.end())) |
1976 | L.getBlocksSet().erase(Ptr: BB); |
1977 | Blocks.erase(first: BlocksSplitI, last: Blocks.end()); |
1978 | |
1979 | // Sort the exits in ascending loop depth, we'll work backwards across these |
1980 | // to process them inside out. |
1981 | llvm::stable_sort(Range&: ExitsInLoops, C: [&](BasicBlock *LHS, BasicBlock *RHS) { |
1982 | return LI.getLoopDepth(BB: LHS) < LI.getLoopDepth(BB: RHS); |
1983 | }); |
1984 | |
1985 | // We'll build up a set for each exit loop. |
1986 | SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks; |
1987 | Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop. |
1988 | |
1989 | auto RemoveUnloopedBlocksFromLoop = |
1990 | [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) { |
1991 | for (auto *BB : UnloopedBlocks) |
1992 | L.getBlocksSet().erase(Ptr: BB); |
1993 | llvm::erase_if(C&: L.getBlocksVector(), P: [&](BasicBlock *BB) { |
1994 | return UnloopedBlocks.count(Ptr: BB); |
1995 | }); |
1996 | }; |
1997 | |
1998 | SmallVector<BasicBlock *, 16> Worklist; |
1999 | while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) { |
2000 | assert(Worklist.empty() && "Didn't clear worklist!" ); |
2001 | assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!" ); |
2002 | |
2003 | // Grab the next exit block, in decreasing loop depth order. |
2004 | BasicBlock *ExitBB = ExitsInLoops.pop_back_val(); |
2005 | Loop &ExitL = *LI.getLoopFor(BB: ExitBB); |
2006 | assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!" ); |
2007 | |
2008 | // Erase all of the unlooped blocks from the loops between the previous |
2009 | // exit loop and this exit loop. This works because the ExitInLoops list is |
2010 | // sorted in increasing order of loop depth and thus we visit loops in |
2011 | // decreasing order of loop depth. |
2012 | for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop()) |
2013 | RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks); |
2014 | |
2015 | // Walk the CFG back until we hit the cloned PH adding everything reachable |
2016 | // and in the unlooped set to this exit block's loop. |
2017 | Worklist.push_back(Elt: ExitBB); |
2018 | do { |
2019 | BasicBlock *BB = Worklist.pop_back_val(); |
2020 | // We can stop recursing at the cloned preheader (if we get there). |
2021 | if (BB == PH) |
2022 | continue; |
2023 | |
2024 | for (BasicBlock *PredBB : predecessors(BB)) { |
2025 | // If this pred has already been moved to our set or is part of some |
2026 | // (inner) loop, no update needed. |
2027 | if (!UnloopedBlocks.erase(Ptr: PredBB)) { |
2028 | assert((NewExitLoopBlocks.count(PredBB) || |
2029 | ExitL.contains(LI.getLoopFor(PredBB))) && |
2030 | "Predecessor not in a nested loop (or already visited)!" ); |
2031 | continue; |
2032 | } |
2033 | |
2034 | // We just insert into the loop set here. We'll add these blocks to the |
2035 | // exit loop after we build up the set in a deterministic order rather |
2036 | // than the predecessor-influenced visit order. |
2037 | bool Inserted = NewExitLoopBlocks.insert(Ptr: PredBB).second; |
2038 | (void)Inserted; |
2039 | assert(Inserted && "Should only visit an unlooped block once!" ); |
2040 | |
2041 | // And recurse through to its predecessors. |
2042 | Worklist.push_back(Elt: PredBB); |
2043 | } |
2044 | } while (!Worklist.empty()); |
2045 | |
2046 | // If blocks in this exit loop were directly part of the original loop (as |
2047 | // opposed to a child loop) update the map to point to this exit loop. This |
2048 | // just updates a map and so the fact that the order is unstable is fine. |
2049 | for (auto *BB : NewExitLoopBlocks) |
2050 | if (Loop *BBL = LI.getLoopFor(BB)) |
2051 | if (BBL == &L || !L.contains(L: BBL)) |
2052 | LI.changeLoopFor(BB, L: &ExitL); |
2053 | |
2054 | // We will remove the remaining unlooped blocks from this loop in the next |
2055 | // iteration or below. |
2056 | NewExitLoopBlocks.clear(); |
2057 | } |
2058 | |
2059 | // Any remaining unlooped blocks are no longer part of any loop unless they |
2060 | // are part of some child loop. |
2061 | for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop()) |
2062 | RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks); |
2063 | for (auto *BB : UnloopedBlocks) |
2064 | if (Loop *BBL = LI.getLoopFor(BB)) |
2065 | if (BBL == &L || !L.contains(L: BBL)) |
2066 | LI.changeLoopFor(BB, L: nullptr); |
2067 | |
2068 | // Sink all the child loops whose headers are no longer in the loop set to |
2069 | // the parent (or to be top level loops). We reach into the loop and directly |
2070 | // update its subloop vector to make this batch update efficient. |
2071 | auto &SubLoops = L.getSubLoopsVector(); |
2072 | auto SubLoopsSplitI = |
2073 | LoopBlockSet.empty() |
2074 | ? SubLoops.begin() |
2075 | : std::stable_partition( |
2076 | first: SubLoops.begin(), last: SubLoops.end(), pred: [&](Loop *SubL) { |
2077 | return LoopBlockSet.count(Ptr: SubL->getHeader()); |
2078 | }); |
2079 | for (auto *HoistedL : make_range(x: SubLoopsSplitI, y: SubLoops.end())) { |
2080 | HoistedLoops.push_back(Elt: HoistedL); |
2081 | HoistedL->setParentLoop(nullptr); |
2082 | |
2083 | // To compute the new parent of this hoisted loop we look at where we |
2084 | // placed the preheader above. We can't lookup the header itself because we |
2085 | // retained the mapping from the header to the hoisted loop. But the |
2086 | // preheader and header should have the exact same new parent computed |
2087 | // based on the set of exit blocks from the original loop as the preheader |
2088 | // is a predecessor of the header and so reached in the reverse walk. And |
2089 | // because the loops were all in simplified form the preheader of the |
2090 | // hoisted loop can't be part of some *other* loop. |
2091 | if (auto *NewParentL = LI.getLoopFor(BB: HoistedL->getLoopPreheader())) |
2092 | NewParentL->addChildLoop(NewChild: HoistedL); |
2093 | else |
2094 | LI.addTopLevelLoop(New: HoistedL); |
2095 | } |
2096 | SubLoops.erase(first: SubLoopsSplitI, last: SubLoops.end()); |
2097 | |
2098 | // Actually delete the loop if nothing remained within it. |
2099 | if (Blocks.empty()) { |
2100 | assert(SubLoops.empty() && |
2101 | "Failed to remove all subloops from the original loop!" ); |
2102 | if (Loop *ParentL = L.getParentLoop()) |
2103 | ParentL->removeChildLoop(I: llvm::find(Range&: *ParentL, Val: &L)); |
2104 | else |
2105 | LI.removeLoop(I: llvm::find(Range&: LI, Val: &L)); |
2106 | // markLoopAsDeleted for L should be triggered by the caller (it is |
2107 | // typically done within postUnswitch). |
2108 | if (SE) |
2109 | SE->forgetBlockAndLoopDispositions(); |
2110 | LI.destroy(L: &L); |
2111 | return false; |
2112 | } |
2113 | |
2114 | return true; |
2115 | } |
2116 | |
2117 | /// Helper to visit a dominator subtree, invoking a callable on each node. |
2118 | /// |
2119 | /// Returning false at any point will stop walking past that node of the tree. |
2120 | template <typename CallableT> |
2121 | void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) { |
2122 | SmallVector<DomTreeNode *, 4> DomWorklist; |
2123 | DomWorklist.push_back(Elt: DT[BB]); |
2124 | #ifndef NDEBUG |
2125 | SmallPtrSet<DomTreeNode *, 4> Visited; |
2126 | Visited.insert(DT[BB]); |
2127 | #endif |
2128 | do { |
2129 | DomTreeNode *N = DomWorklist.pop_back_val(); |
2130 | |
2131 | // Visit this node. |
2132 | if (!Callable(N->getBlock())) |
2133 | continue; |
2134 | |
2135 | // Accumulate the child nodes. |
2136 | for (DomTreeNode *ChildN : *N) { |
2137 | assert(Visited.insert(ChildN).second && |
2138 | "Cannot visit a node twice when walking a tree!" ); |
2139 | DomWorklist.push_back(Elt: ChildN); |
2140 | } |
2141 | } while (!DomWorklist.empty()); |
2142 | } |
2143 | |
2144 | void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName, |
2145 | bool CurrentLoopValid, bool PartiallyInvariant, |
2146 | bool InjectedCondition, ArrayRef<Loop *> NewLoops) { |
2147 | // If we did a non-trivial unswitch, we have added new (cloned) loops. |
2148 | if (!NewLoops.empty()) |
2149 | U.addSiblingLoops(NewSibLoops: NewLoops); |
2150 | |
2151 | // If the current loop remains valid, we should revisit it to catch any |
2152 | // other unswitch opportunities. Otherwise, we need to mark it as deleted. |
2153 | if (CurrentLoopValid) { |
2154 | if (PartiallyInvariant) { |
2155 | // Mark the new loop as partially unswitched, to avoid unswitching on |
2156 | // the same condition again. |
2157 | auto &Context = L.getHeader()->getContext(); |
2158 | MDNode *DisableUnswitchMD = MDNode::get( |
2159 | Context, |
2160 | MDs: MDString::get(Context, Str: "llvm.loop.unswitch.partial.disable" )); |
2161 | MDNode *NewLoopID = makePostTransformationMetadata( |
2162 | Context, OrigLoopID: L.getLoopID(), RemovePrefixes: {"llvm.loop.unswitch.partial" }, |
2163 | AddAttrs: {DisableUnswitchMD}); |
2164 | L.setLoopID(NewLoopID); |
2165 | } else if (InjectedCondition) { |
2166 | // Do the same for injection of invariant conditions. |
2167 | auto &Context = L.getHeader()->getContext(); |
2168 | MDNode *DisableUnswitchMD = MDNode::get( |
2169 | Context, |
2170 | MDs: MDString::get(Context, Str: "llvm.loop.unswitch.injection.disable" )); |
2171 | MDNode *NewLoopID = makePostTransformationMetadata( |
2172 | Context, OrigLoopID: L.getLoopID(), RemovePrefixes: {"llvm.loop.unswitch.injection" }, |
2173 | AddAttrs: {DisableUnswitchMD}); |
2174 | L.setLoopID(NewLoopID); |
2175 | } else |
2176 | U.revisitCurrentLoop(); |
2177 | } else |
2178 | U.markLoopAsDeleted(L, Name: LoopName); |
2179 | } |
2180 | |
2181 | static void unswitchNontrivialInvariants( |
2182 | Loop &L, Instruction &TI, ArrayRef<Value *> Invariants, |
2183 | IVConditionInfo &PartialIVInfo, DominatorTree &DT, LoopInfo &LI, |
2184 | AssumptionCache &AC, ScalarEvolution *SE, MemorySSAUpdater *MSSAU, |
2185 | LPMUpdater &LoopUpdater, bool InsertFreeze, bool InjectedCondition) { |
2186 | auto *ParentBB = TI.getParent(); |
2187 | BranchInst *BI = dyn_cast<BranchInst>(Val: &TI); |
2188 | SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(Val: &TI); |
2189 | |
2190 | // Save the current loop name in a variable so that we can report it even |
2191 | // after it has been deleted. |
2192 | std::string LoopName(L.getName()); |
2193 | |
2194 | // We can only unswitch switches, conditional branches with an invariant |
2195 | // condition, or combining invariant conditions with an instruction or |
2196 | // partially invariant instructions. |
2197 | assert((SI || (BI && BI->isConditional())) && |
2198 | "Can only unswitch switches and conditional branch!" ); |
2199 | bool PartiallyInvariant = !PartialIVInfo.InstToDuplicate.empty(); |
2200 | bool FullUnswitch = |
2201 | SI || (skipTrivialSelect(Cond: BI->getCondition()) == Invariants[0] && |
2202 | !PartiallyInvariant); |
2203 | if (FullUnswitch) |
2204 | assert(Invariants.size() == 1 && |
2205 | "Cannot have other invariants with full unswitching!" ); |
2206 | else |
2207 | assert(isa<Instruction>(skipTrivialSelect(BI->getCondition())) && |
2208 | "Partial unswitching requires an instruction as the condition!" ); |
2209 | |
2210 | if (MSSAU && VerifyMemorySSA) |
2211 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2212 | |
2213 | // Constant and BBs tracking the cloned and continuing successor. When we are |
2214 | // unswitching the entire condition, this can just be trivially chosen to |
2215 | // unswitch towards `true`. However, when we are unswitching a set of |
2216 | // invariants combined with `and` or `or` or partially invariant instructions, |
2217 | // the combining operation determines the best direction to unswitch: we want |
2218 | // to unswitch the direction that will collapse the branch. |
2219 | bool Direction = true; |
2220 | int ClonedSucc = 0; |
2221 | if (!FullUnswitch) { |
2222 | Value *Cond = skipTrivialSelect(Cond: BI->getCondition()); |
2223 | (void)Cond; |
2224 | assert(((match(Cond, m_LogicalAnd()) ^ match(Cond, m_LogicalOr())) || |
2225 | PartiallyInvariant) && |
2226 | "Only `or`, `and`, an `select`, partially invariant instructions " |
2227 | "can combine invariants being unswitched." ); |
2228 | if (!match(V: Cond, P: m_LogicalOr())) { |
2229 | if (match(V: Cond, P: m_LogicalAnd()) || |
2230 | (PartiallyInvariant && !PartialIVInfo.KnownValue->isOneValue())) { |
2231 | Direction = false; |
2232 | ClonedSucc = 1; |
2233 | } |
2234 | } |
2235 | } |
2236 | |
2237 | BasicBlock *RetainedSuccBB = |
2238 | BI ? BI->getSuccessor(i: 1 - ClonedSucc) : SI->getDefaultDest(); |
2239 | SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs; |
2240 | if (BI) |
2241 | UnswitchedSuccBBs.insert(X: BI->getSuccessor(i: ClonedSucc)); |
2242 | else |
2243 | for (auto Case : SI->cases()) |
2244 | if (Case.getCaseSuccessor() != RetainedSuccBB) |
2245 | UnswitchedSuccBBs.insert(X: Case.getCaseSuccessor()); |
2246 | |
2247 | assert(!UnswitchedSuccBBs.count(RetainedSuccBB) && |
2248 | "Should not unswitch the same successor we are retaining!" ); |
2249 | |
2250 | // The branch should be in this exact loop. Any inner loop's invariant branch |
2251 | // should be handled by unswitching that inner loop. The caller of this |
2252 | // routine should filter out any candidates that remain (but were skipped for |
2253 | // whatever reason). |
2254 | assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!" ); |
2255 | |
2256 | // Compute the parent loop now before we start hacking on things. |
2257 | Loop *ParentL = L.getParentLoop(); |
2258 | // Get blocks in RPO order for MSSA update, before changing the CFG. |
2259 | LoopBlocksRPO LBRPO(&L); |
2260 | if (MSSAU) |
2261 | LBRPO.perform(LI: &LI); |
2262 | |
2263 | // Compute the outer-most loop containing one of our exit blocks. This is the |
2264 | // furthest up our loopnest which can be mutated, which we will use below to |
2265 | // update things. |
2266 | Loop *OuterExitL = &L; |
2267 | SmallVector<BasicBlock *, 4> ExitBlocks; |
2268 | L.getUniqueExitBlocks(ExitBlocks); |
2269 | for (auto *ExitBB : ExitBlocks) { |
2270 | // ExitBB can be an exit block for several levels in the loop nest. Make |
2271 | // sure we find the top most. |
2272 | Loop *NewOuterExitL = getTopMostExitingLoop(ExitBB, LI); |
2273 | if (!NewOuterExitL) { |
2274 | // We exited the entire nest with this block, so we're done. |
2275 | OuterExitL = nullptr; |
2276 | break; |
2277 | } |
2278 | if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(L: OuterExitL)) |
2279 | OuterExitL = NewOuterExitL; |
2280 | } |
2281 | |
2282 | // At this point, we're definitely going to unswitch something so invalidate |
2283 | // any cached information in ScalarEvolution for the outer most loop |
2284 | // containing an exit block and all nested loops. |
2285 | if (SE) { |
2286 | if (OuterExitL) |
2287 | SE->forgetLoop(L: OuterExitL); |
2288 | else |
2289 | SE->forgetTopmostLoop(L: &L); |
2290 | SE->forgetBlockAndLoopDispositions(); |
2291 | } |
2292 | |
2293 | // If the edge from this terminator to a successor dominates that successor, |
2294 | // store a map from each block in its dominator subtree to it. This lets us |
2295 | // tell when cloning for a particular successor if a block is dominated by |
2296 | // some *other* successor with a single data structure. We use this to |
2297 | // significantly reduce cloning. |
2298 | SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc; |
2299 | for (auto *SuccBB : llvm::concat<BasicBlock *const>(Ranges: ArrayRef(RetainedSuccBB), |
2300 | Ranges&: UnswitchedSuccBBs)) |
2301 | if (SuccBB->getUniquePredecessor() || |
2302 | llvm::all_of(Range: predecessors(BB: SuccBB), P: [&](BasicBlock *PredBB) { |
2303 | return PredBB == ParentBB || DT.dominates(A: SuccBB, B: PredBB); |
2304 | })) |
2305 | visitDomSubTree(DT, BB: SuccBB, Callable: [&](BasicBlock *BB) { |
2306 | DominatingSucc[BB] = SuccBB; |
2307 | return true; |
2308 | }); |
2309 | |
2310 | // Split the preheader, so that we know that there is a safe place to insert |
2311 | // the conditional branch. We will change the preheader to have a conditional |
2312 | // branch on LoopCond. The original preheader will become the split point |
2313 | // between the unswitched versions, and we will have a new preheader for the |
2314 | // original loop. |
2315 | BasicBlock *SplitBB = L.getLoopPreheader(); |
2316 | BasicBlock *LoopPH = SplitEdge(From: SplitBB, To: L.getHeader(), DT: &DT, LI: &LI, MSSAU); |
2317 | |
2318 | // Keep track of the dominator tree updates needed. |
2319 | SmallVector<DominatorTree::UpdateType, 4> DTUpdates; |
2320 | |
2321 | // Clone the loop for each unswitched successor. |
2322 | SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; |
2323 | VMaps.reserve(N: UnswitchedSuccBBs.size()); |
2324 | SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs; |
2325 | for (auto *SuccBB : UnswitchedSuccBBs) { |
2326 | VMaps.emplace_back(Args: new ValueToValueMapTy()); |
2327 | ClonedPHs[SuccBB] = buildClonedLoopBlocks( |
2328 | L, LoopPH, SplitBB, ExitBlocks, ParentBB, UnswitchedSuccBB: SuccBB, ContinueSuccBB: RetainedSuccBB, |
2329 | DominatingSucc, VMap&: *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU, SE); |
2330 | } |
2331 | |
2332 | // Drop metadata if we may break its semantics by moving this instr into the |
2333 | // split block. |
2334 | if (TI.getMetadata(KindID: LLVMContext::MD_make_implicit)) { |
2335 | if (DropNonTrivialImplicitNullChecks) |
2336 | // Do not spend time trying to understand if we can keep it, just drop it |
2337 | // to save compile time. |
2338 | TI.setMetadata(KindID: LLVMContext::MD_make_implicit, Node: nullptr); |
2339 | else { |
2340 | // It is only legal to preserve make.implicit metadata if we are |
2341 | // guaranteed no reach implicit null check after following this branch. |
2342 | ICFLoopSafetyInfo SafetyInfo; |
2343 | SafetyInfo.computeLoopSafetyInfo(CurLoop: &L); |
2344 | if (!SafetyInfo.isGuaranteedToExecute(Inst: TI, DT: &DT, CurLoop: &L)) |
2345 | TI.setMetadata(KindID: LLVMContext::MD_make_implicit, Node: nullptr); |
2346 | } |
2347 | } |
2348 | |
2349 | // The stitching of the branched code back together depends on whether we're |
2350 | // doing full unswitching or not with the exception that we always want to |
2351 | // nuke the initial terminator placed in the split block. |
2352 | SplitBB->getTerminator()->eraseFromParent(); |
2353 | if (FullUnswitch) { |
2354 | // Keep a clone of the terminator for MSSA updates. |
2355 | Instruction *NewTI = TI.clone(); |
2356 | NewTI->insertInto(ParentBB, It: ParentBB->end()); |
2357 | |
2358 | // Splice the terminator from the original loop and rewrite its |
2359 | // successors. |
2360 | TI.moveBefore(BB&: *SplitBB, I: SplitBB->end()); |
2361 | TI.dropLocation(); |
2362 | |
2363 | // First wire up the moved terminator to the preheaders. |
2364 | if (BI) { |
2365 | BasicBlock *ClonedPH = ClonedPHs.begin()->second; |
2366 | BI->setSuccessor(idx: ClonedSucc, NewSucc: ClonedPH); |
2367 | BI->setSuccessor(idx: 1 - ClonedSucc, NewSucc: LoopPH); |
2368 | Value *Cond = skipTrivialSelect(Cond: BI->getCondition()); |
2369 | if (InsertFreeze) { |
2370 | // We don't give any debug location to the new freeze, because the |
2371 | // BI (`dyn_cast<BranchInst>(TI)`) is an in-loop instruction hoisted |
2372 | // out of the loop. |
2373 | Cond = new FreezeInst(Cond, Cond->getName() + ".fr" , BI->getIterator()); |
2374 | cast<Instruction>(Val: Cond)->setDebugLoc(DebugLoc::getDropped()); |
2375 | } |
2376 | BI->setCondition(Cond); |
2377 | DTUpdates.push_back(Elt: {DominatorTree::Insert, SplitBB, ClonedPH}); |
2378 | } else { |
2379 | assert(SI && "Must either be a branch or switch!" ); |
2380 | |
2381 | // Walk the cases and directly update their successors. |
2382 | assert(SI->getDefaultDest() == RetainedSuccBB && |
2383 | "Not retaining default successor!" ); |
2384 | SI->setDefaultDest(LoopPH); |
2385 | for (const auto &Case : SI->cases()) |
2386 | if (Case.getCaseSuccessor() == RetainedSuccBB) |
2387 | Case.setSuccessor(LoopPH); |
2388 | else |
2389 | Case.setSuccessor(ClonedPHs.find(Val: Case.getCaseSuccessor())->second); |
2390 | |
2391 | if (InsertFreeze) |
2392 | SI->setCondition(new FreezeInst(SI->getCondition(), |
2393 | SI->getCondition()->getName() + ".fr" , |
2394 | SI->getIterator())); |
2395 | |
2396 | // We need to use the set to populate domtree updates as even when there |
2397 | // are multiple cases pointing at the same successor we only want to |
2398 | // remove and insert one edge in the domtree. |
2399 | for (BasicBlock *SuccBB : UnswitchedSuccBBs) |
2400 | DTUpdates.push_back( |
2401 | Elt: {DominatorTree::Insert, SplitBB, ClonedPHs.find(Val: SuccBB)->second}); |
2402 | } |
2403 | |
2404 | if (MSSAU) { |
2405 | DT.applyUpdates(Updates: DTUpdates); |
2406 | DTUpdates.clear(); |
2407 | |
2408 | // Remove all but one edge to the retained block and all unswitched |
2409 | // blocks. This is to avoid having duplicate entries in the cloned Phis, |
2410 | // when we know we only keep a single edge for each case. |
2411 | MSSAU->removeDuplicatePhiEdgesBetween(From: ParentBB, To: RetainedSuccBB); |
2412 | for (BasicBlock *SuccBB : UnswitchedSuccBBs) |
2413 | MSSAU->removeDuplicatePhiEdgesBetween(From: ParentBB, To: SuccBB); |
2414 | |
2415 | for (auto &VMap : VMaps) |
2416 | MSSAU->updateForClonedLoop(LoopBlocks: LBRPO, ExitBlocks, VM: *VMap, |
2417 | /*IgnoreIncomingWithNoClones=*/true); |
2418 | MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT); |
2419 | |
2420 | // Remove all edges to unswitched blocks. |
2421 | for (BasicBlock *SuccBB : UnswitchedSuccBBs) |
2422 | MSSAU->removeEdge(From: ParentBB, To: SuccBB); |
2423 | } |
2424 | |
2425 | // Now unhook the successor relationship as we'll be replacing |
2426 | // the terminator with a direct branch. This is much simpler for branches |
2427 | // than switches so we handle those first. |
2428 | if (BI) { |
2429 | // Remove the parent as a predecessor of the unswitched successor. |
2430 | assert(UnswitchedSuccBBs.size() == 1 && |
2431 | "Only one possible unswitched block for a branch!" ); |
2432 | BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin(); |
2433 | UnswitchedSuccBB->removePredecessor(Pred: ParentBB, |
2434 | /*KeepOneInputPHIs*/ true); |
2435 | DTUpdates.push_back(Elt: {DominatorTree::Delete, ParentBB, UnswitchedSuccBB}); |
2436 | } else { |
2437 | // Note that we actually want to remove the parent block as a predecessor |
2438 | // of *every* case successor. The case successor is either unswitched, |
2439 | // completely eliminating an edge from the parent to that successor, or it |
2440 | // is a duplicate edge to the retained successor as the retained successor |
2441 | // is always the default successor and as we'll replace this with a direct |
2442 | // branch we no longer need the duplicate entries in the PHI nodes. |
2443 | SwitchInst *NewSI = cast<SwitchInst>(Val: NewTI); |
2444 | assert(NewSI->getDefaultDest() == RetainedSuccBB && |
2445 | "Not retaining default successor!" ); |
2446 | for (const auto &Case : NewSI->cases()) |
2447 | Case.getCaseSuccessor()->removePredecessor( |
2448 | Pred: ParentBB, |
2449 | /*KeepOneInputPHIs*/ true); |
2450 | |
2451 | // We need to use the set to populate domtree updates as even when there |
2452 | // are multiple cases pointing at the same successor we only want to |
2453 | // remove and insert one edge in the domtree. |
2454 | for (BasicBlock *SuccBB : UnswitchedSuccBBs) |
2455 | DTUpdates.push_back(Elt: {DominatorTree::Delete, ParentBB, SuccBB}); |
2456 | } |
2457 | |
2458 | // Create a new unconditional branch to the continuing block (as opposed to |
2459 | // the one cloned). |
2460 | Instruction *NewBI = BranchInst::Create(IfTrue: RetainedSuccBB, InsertBefore: ParentBB); |
2461 | NewBI->setDebugLoc(NewTI->getDebugLoc()); |
2462 | |
2463 | // After MSSAU update, remove the cloned terminator instruction NewTI. |
2464 | NewTI->eraseFromParent(); |
2465 | } else { |
2466 | assert(BI && "Only branches have partial unswitching." ); |
2467 | assert(UnswitchedSuccBBs.size() == 1 && |
2468 | "Only one possible unswitched block for a branch!" ); |
2469 | BasicBlock *ClonedPH = ClonedPHs.begin()->second; |
2470 | // When doing a partial unswitch, we have to do a bit more work to build up |
2471 | // the branch in the split block. |
2472 | if (PartiallyInvariant) |
2473 | buildPartialInvariantUnswitchConditionalBranch( |
2474 | BB&: *SplitBB, ToDuplicate: Invariants, Direction, UnswitchedSucc&: *ClonedPH, NormalSucc&: *LoopPH, L, MSSAU); |
2475 | else { |
2476 | buildPartialUnswitchConditionalBranch( |
2477 | BB&: *SplitBB, Invariants, Direction, UnswitchedSucc&: *ClonedPH, NormalSucc&: *LoopPH, |
2478 | InsertFreeze: FreezeLoopUnswitchCond, I: BI, AC: &AC, DT); |
2479 | } |
2480 | DTUpdates.push_back(Elt: {DominatorTree::Insert, SplitBB, ClonedPH}); |
2481 | |
2482 | if (MSSAU) { |
2483 | DT.applyUpdates(Updates: DTUpdates); |
2484 | DTUpdates.clear(); |
2485 | |
2486 | // Perform MSSA cloning updates. |
2487 | for (auto &VMap : VMaps) |
2488 | MSSAU->updateForClonedLoop(LoopBlocks: LBRPO, ExitBlocks, VM: *VMap, |
2489 | /*IgnoreIncomingWithNoClones=*/true); |
2490 | MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT); |
2491 | } |
2492 | } |
2493 | |
2494 | // Apply the updates accumulated above to get an up-to-date dominator tree. |
2495 | DT.applyUpdates(Updates: DTUpdates); |
2496 | |
2497 | // Now that we have an accurate dominator tree, first delete the dead cloned |
2498 | // blocks so that we can accurately build any cloned loops. It is important to |
2499 | // not delete the blocks from the original loop yet because we still want to |
2500 | // reference the original loop to understand the cloned loop's structure. |
2501 | deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT, MSSAU); |
2502 | |
2503 | // Build the cloned loop structure itself. This may be substantially |
2504 | // different from the original structure due to the simplified CFG. This also |
2505 | // handles inserting all the cloned blocks into the correct loops. |
2506 | SmallVector<Loop *, 4> NonChildClonedLoops; |
2507 | for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps) |
2508 | buildClonedLoops(OrigL&: L, ExitBlocks, VMap: *VMap, LI, NonChildClonedLoops); |
2509 | |
2510 | // Now that our cloned loops have been built, we can update the original loop. |
2511 | // First we delete the dead blocks from it and then we rebuild the loop |
2512 | // structure taking these deletions into account. |
2513 | deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU, SE, LoopUpdater); |
2514 | |
2515 | if (MSSAU && VerifyMemorySSA) |
2516 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2517 | |
2518 | SmallVector<Loop *, 4> HoistedLoops; |
2519 | bool IsStillLoop = |
2520 | rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops, SE); |
2521 | |
2522 | if (MSSAU && VerifyMemorySSA) |
2523 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2524 | |
2525 | // This transformation has a high risk of corrupting the dominator tree, and |
2526 | // the below steps to rebuild loop structures will result in hard to debug |
2527 | // errors in that case so verify that the dominator tree is sane first. |
2528 | // FIXME: Remove this when the bugs stop showing up and rely on existing |
2529 | // verification steps. |
2530 | assert(DT.verify(DominatorTree::VerificationLevel::Fast)); |
2531 | |
2532 | if (BI && !PartiallyInvariant) { |
2533 | // If we unswitched a branch which collapses the condition to a known |
2534 | // constant we want to replace all the uses of the invariants within both |
2535 | // the original and cloned blocks. We do this here so that we can use the |
2536 | // now updated dominator tree to identify which side the users are on. |
2537 | assert(UnswitchedSuccBBs.size() == 1 && |
2538 | "Only one possible unswitched block for a branch!" ); |
2539 | BasicBlock *ClonedPH = ClonedPHs.begin()->second; |
2540 | |
2541 | // When considering multiple partially-unswitched invariants |
2542 | // we cant just go replace them with constants in both branches. |
2543 | // |
2544 | // For 'AND' we infer that true branch ("continue") means true |
2545 | // for each invariant operand. |
2546 | // For 'OR' we can infer that false branch ("continue") means false |
2547 | // for each invariant operand. |
2548 | // So it happens that for multiple-partial case we dont replace |
2549 | // in the unswitched branch. |
2550 | bool ReplaceUnswitched = |
2551 | FullUnswitch || (Invariants.size() == 1) || PartiallyInvariant; |
2552 | |
2553 | ConstantInt *UnswitchedReplacement = |
2554 | Direction ? ConstantInt::getTrue(Context&: BI->getContext()) |
2555 | : ConstantInt::getFalse(Context&: BI->getContext()); |
2556 | ConstantInt *ContinueReplacement = |
2557 | Direction ? ConstantInt::getFalse(Context&: BI->getContext()) |
2558 | : ConstantInt::getTrue(Context&: BI->getContext()); |
2559 | for (Value *Invariant : Invariants) { |
2560 | assert(!isa<Constant>(Invariant) && |
2561 | "Should not be replacing constant values!" ); |
2562 | // Use make_early_inc_range here as set invalidates the iterator. |
2563 | for (Use &U : llvm::make_early_inc_range(Range: Invariant->uses())) { |
2564 | Instruction *UserI = dyn_cast<Instruction>(Val: U.getUser()); |
2565 | if (!UserI) |
2566 | continue; |
2567 | |
2568 | // Replace it with the 'continue' side if in the main loop body, and the |
2569 | // unswitched if in the cloned blocks. |
2570 | if (DT.dominates(A: LoopPH, B: UserI->getParent())) |
2571 | U.set(ContinueReplacement); |
2572 | else if (ReplaceUnswitched && |
2573 | DT.dominates(A: ClonedPH, B: UserI->getParent())) |
2574 | U.set(UnswitchedReplacement); |
2575 | } |
2576 | } |
2577 | } |
2578 | |
2579 | // We can change which blocks are exit blocks of all the cloned sibling |
2580 | // loops, the current loop, and any parent loops which shared exit blocks |
2581 | // with the current loop. As a consequence, we need to re-form LCSSA for |
2582 | // them. But we shouldn't need to re-form LCSSA for any child loops. |
2583 | // FIXME: This could be made more efficient by tracking which exit blocks are |
2584 | // new, and focusing on them, but that isn't likely to be necessary. |
2585 | // |
2586 | // In order to reasonably rebuild LCSSA we need to walk inside-out across the |
2587 | // loop nest and update every loop that could have had its exits changed. We |
2588 | // also need to cover any intervening loops. We add all of these loops to |
2589 | // a list and sort them by loop depth to achieve this without updating |
2590 | // unnecessary loops. |
2591 | auto UpdateLoop = [&](Loop &UpdateL) { |
2592 | #ifndef NDEBUG |
2593 | UpdateL.verifyLoop(); |
2594 | for (Loop *ChildL : UpdateL) { |
2595 | ChildL->verifyLoop(); |
2596 | assert(ChildL->isRecursivelyLCSSAForm(DT, LI) && |
2597 | "Perturbed a child loop's LCSSA form!" ); |
2598 | } |
2599 | #endif |
2600 | // First build LCSSA for this loop so that we can preserve it when |
2601 | // forming dedicated exits. We don't want to perturb some other loop's |
2602 | // LCSSA while doing that CFG edit. |
2603 | formLCSSA(L&: UpdateL, DT, LI: &LI, SE); |
2604 | |
2605 | // For loops reached by this loop's original exit blocks we may |
2606 | // introduced new, non-dedicated exits. At least try to re-form dedicated |
2607 | // exits for these loops. This may fail if they couldn't have dedicated |
2608 | // exits to start with. |
2609 | formDedicatedExitBlocks(L: &UpdateL, DT: &DT, LI: &LI, MSSAU, /*PreserveLCSSA*/ true); |
2610 | }; |
2611 | |
2612 | // For non-child cloned loops and hoisted loops, we just need to update LCSSA |
2613 | // and we can do it in any order as they don't nest relative to each other. |
2614 | // |
2615 | // Also check if any of the loops we have updated have become top-level loops |
2616 | // as that will necessitate widening the outer loop scope. |
2617 | for (Loop *UpdatedL : |
2618 | llvm::concat<Loop *>(Ranges&: NonChildClonedLoops, Ranges&: HoistedLoops)) { |
2619 | UpdateLoop(*UpdatedL); |
2620 | if (UpdatedL->isOutermost()) |
2621 | OuterExitL = nullptr; |
2622 | } |
2623 | if (IsStillLoop) { |
2624 | UpdateLoop(L); |
2625 | if (L.isOutermost()) |
2626 | OuterExitL = nullptr; |
2627 | } |
2628 | |
2629 | // If the original loop had exit blocks, walk up through the outer most loop |
2630 | // of those exit blocks to update LCSSA and form updated dedicated exits. |
2631 | if (OuterExitL != &L) |
2632 | for (Loop *OuterL = ParentL; OuterL != OuterExitL; |
2633 | OuterL = OuterL->getParentLoop()) |
2634 | UpdateLoop(*OuterL); |
2635 | |
2636 | #ifndef NDEBUG |
2637 | // Verify the entire loop structure to catch any incorrect updates before we |
2638 | // progress in the pass pipeline. |
2639 | LI.verify(DT); |
2640 | #endif |
2641 | |
2642 | // Now that we've unswitched something, make callbacks to report the changes. |
2643 | // For that we need to merge together the updated loops and the cloned loops |
2644 | // and check whether the original loop survived. |
2645 | SmallVector<Loop *, 4> SibLoops; |
2646 | for (Loop *UpdatedL : llvm::concat<Loop *>(Ranges&: NonChildClonedLoops, Ranges&: HoistedLoops)) |
2647 | if (UpdatedL->getParentLoop() == ParentL) |
2648 | SibLoops.push_back(Elt: UpdatedL); |
2649 | postUnswitch(L, U&: LoopUpdater, LoopName, CurrentLoopValid: IsStillLoop, PartiallyInvariant, |
2650 | InjectedCondition, NewLoops: SibLoops); |
2651 | |
2652 | if (MSSAU && VerifyMemorySSA) |
2653 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2654 | |
2655 | if (BI) |
2656 | ++NumBranches; |
2657 | else |
2658 | ++NumSwitches; |
2659 | } |
2660 | |
2661 | /// Recursively compute the cost of a dominator subtree based on the per-block |
2662 | /// cost map provided. |
2663 | /// |
2664 | /// The recursive computation is memozied into the provided DT-indexed cost map |
2665 | /// to allow querying it for most nodes in the domtree without it becoming |
2666 | /// quadratic. |
2667 | static InstructionCost computeDomSubtreeCost( |
2668 | DomTreeNode &N, |
2669 | const SmallDenseMap<BasicBlock *, InstructionCost, 4> &BBCostMap, |
2670 | SmallDenseMap<DomTreeNode *, InstructionCost, 4> &DTCostMap) { |
2671 | // Don't accumulate cost (or recurse through) blocks not in our block cost |
2672 | // map and thus not part of the duplication cost being considered. |
2673 | auto BBCostIt = BBCostMap.find(Val: N.getBlock()); |
2674 | if (BBCostIt == BBCostMap.end()) |
2675 | return 0; |
2676 | |
2677 | // Lookup this node to see if we already computed its cost. |
2678 | auto DTCostIt = DTCostMap.find(Val: &N); |
2679 | if (DTCostIt != DTCostMap.end()) |
2680 | return DTCostIt->second; |
2681 | |
2682 | // If not, we have to compute it. We can't use insert above and update |
2683 | // because computing the cost may insert more things into the map. |
2684 | InstructionCost Cost = std::accumulate( |
2685 | first: N.begin(), last: N.end(), init: BBCostIt->second, |
2686 | binary_op: [&](InstructionCost Sum, DomTreeNode *ChildN) -> InstructionCost { |
2687 | return Sum + computeDomSubtreeCost(N&: *ChildN, BBCostMap, DTCostMap); |
2688 | }); |
2689 | bool Inserted = DTCostMap.insert(KV: {&N, Cost}).second; |
2690 | (void)Inserted; |
2691 | assert(Inserted && "Should not insert a node while visiting children!" ); |
2692 | return Cost; |
2693 | } |
2694 | |
2695 | /// Turns a select instruction into implicit control flow branch, |
2696 | /// making the following replacement: |
2697 | /// |
2698 | /// head: |
2699 | /// --code before select-- |
2700 | /// select %cond, %trueval, %falseval |
2701 | /// --code after select-- |
2702 | /// |
2703 | /// into |
2704 | /// |
2705 | /// head: |
2706 | /// --code before select-- |
2707 | /// br i1 %cond, label %then, label %tail |
2708 | /// |
2709 | /// then: |
2710 | /// br %tail |
2711 | /// |
2712 | /// tail: |
2713 | /// phi [ %trueval, %then ], [ %falseval, %head] |
2714 | /// unreachable |
2715 | /// |
2716 | /// It also makes all relevant DT and LI updates, so that all structures are in |
2717 | /// valid state after this transform. |
2718 | static BranchInst *turnSelectIntoBranch(SelectInst *SI, DominatorTree &DT, |
2719 | LoopInfo &LI, MemorySSAUpdater *MSSAU, |
2720 | AssumptionCache *AC) { |
2721 | LLVM_DEBUG(dbgs() << "Turning " << *SI << " into a branch.\n" ); |
2722 | BasicBlock *HeadBB = SI->getParent(); |
2723 | |
2724 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); |
2725 | SplitBlockAndInsertIfThen(Cond: SI->getCondition(), SplitBefore: SI, Unreachable: false, |
2726 | BranchWeights: SI->getMetadata(KindID: LLVMContext::MD_prof), DTU: &DTU, LI: &LI); |
2727 | auto *CondBr = cast<BranchInst>(Val: HeadBB->getTerminator()); |
2728 | BasicBlock *ThenBB = CondBr->getSuccessor(i: 0), |
2729 | *TailBB = CondBr->getSuccessor(i: 1); |
2730 | if (MSSAU) |
2731 | MSSAU->moveAllAfterSpliceBlocks(From: HeadBB, To: TailBB, Start: SI); |
2732 | |
2733 | PHINode *Phi = |
2734 | PHINode::Create(Ty: SI->getType(), NumReservedValues: 2, NameStr: "unswitched.select" , InsertBefore: SI->getIterator()); |
2735 | Phi->addIncoming(V: SI->getTrueValue(), BB: ThenBB); |
2736 | Phi->addIncoming(V: SI->getFalseValue(), BB: HeadBB); |
2737 | Phi->setDebugLoc(SI->getDebugLoc()); |
2738 | SI->replaceAllUsesWith(V: Phi); |
2739 | SI->eraseFromParent(); |
2740 | |
2741 | if (MSSAU && VerifyMemorySSA) |
2742 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2743 | |
2744 | ++NumSelects; |
2745 | return CondBr; |
2746 | } |
2747 | |
2748 | /// Turns a llvm.experimental.guard intrinsic into implicit control flow branch, |
2749 | /// making the following replacement: |
2750 | /// |
2751 | /// --code before guard-- |
2752 | /// call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] |
2753 | /// --code after guard-- |
2754 | /// |
2755 | /// into |
2756 | /// |
2757 | /// --code before guard-- |
2758 | /// br i1 %cond, label %guarded, label %deopt |
2759 | /// |
2760 | /// guarded: |
2761 | /// --code after guard-- |
2762 | /// |
2763 | /// deopt: |
2764 | /// call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] |
2765 | /// unreachable |
2766 | /// |
2767 | /// It also makes all relevant DT and LI updates, so that all structures are in |
2768 | /// valid state after this transform. |
2769 | static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, |
2770 | DominatorTree &DT, LoopInfo &LI, |
2771 | MemorySSAUpdater *MSSAU) { |
2772 | LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n" ); |
2773 | BasicBlock *CheckBB = GI->getParent(); |
2774 | |
2775 | if (MSSAU && VerifyMemorySSA) |
2776 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2777 | |
2778 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); |
2779 | Instruction *DeoptBlockTerm = |
2780 | SplitBlockAndInsertIfThen(Cond: GI->getArgOperand(i: 0), SplitBefore: GI, Unreachable: true, |
2781 | BranchWeights: GI->getMetadata(KindID: LLVMContext::MD_prof), DTU: &DTU, LI: &LI); |
2782 | BranchInst *CheckBI = cast<BranchInst>(Val: CheckBB->getTerminator()); |
2783 | // SplitBlockAndInsertIfThen inserts control flow that branches to |
2784 | // DeoptBlockTerm if the condition is true. We want the opposite. |
2785 | CheckBI->swapSuccessors(); |
2786 | |
2787 | BasicBlock *GuardedBlock = CheckBI->getSuccessor(i: 0); |
2788 | GuardedBlock->setName("guarded" ); |
2789 | CheckBI->getSuccessor(i: 1)->setName("deopt" ); |
2790 | BasicBlock *DeoptBlock = CheckBI->getSuccessor(i: 1); |
2791 | |
2792 | if (MSSAU) |
2793 | MSSAU->moveAllAfterSpliceBlocks(From: CheckBB, To: GuardedBlock, Start: GI); |
2794 | |
2795 | GI->moveBefore(InsertPos: DeoptBlockTerm->getIterator()); |
2796 | GI->setArgOperand(i: 0, v: ConstantInt::getFalse(Context&: GI->getContext())); |
2797 | |
2798 | if (MSSAU) { |
2799 | MemoryDef *MD = cast<MemoryDef>(Val: MSSAU->getMemorySSA()->getMemoryAccess(I: GI)); |
2800 | MSSAU->moveToPlace(What: MD, BB: DeoptBlock, Where: MemorySSA::BeforeTerminator); |
2801 | if (VerifyMemorySSA) |
2802 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
2803 | } |
2804 | |
2805 | if (VerifyLoopInfo) |
2806 | LI.verify(DomTree: DT); |
2807 | ++NumGuards; |
2808 | return CheckBI; |
2809 | } |
2810 | |
2811 | /// Cost multiplier is a way to limit potentially exponential behavior |
2812 | /// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch |
2813 | /// candidates available. Also accounting for the number of "sibling" loops with |
2814 | /// the idea to account for previous unswitches that already happened on this |
2815 | /// cluster of loops. There was an attempt to keep this formula simple, |
2816 | /// just enough to limit the worst case behavior. Even if it is not that simple |
2817 | /// now it is still not an attempt to provide a detailed heuristic size |
2818 | /// prediction. |
2819 | /// |
2820 | /// TODO: Make a proper accounting of "explosion" effect for all kinds of |
2821 | /// unswitch candidates, making adequate predictions instead of wild guesses. |
2822 | /// That requires knowing not just the number of "remaining" candidates but |
2823 | /// also costs of unswitching for each of these candidates. |
2824 | static int CalculateUnswitchCostMultiplier( |
2825 | const Instruction &TI, const Loop &L, const LoopInfo &LI, |
2826 | const DominatorTree &DT, |
2827 | ArrayRef<NonTrivialUnswitchCandidate> UnswitchCandidates) { |
2828 | |
2829 | // Guards and other exiting conditions do not contribute to exponential |
2830 | // explosion as soon as they dominate the latch (otherwise there might be |
2831 | // another path to the latch remaining that does not allow to eliminate the |
2832 | // loop copy on unswitch). |
2833 | const BasicBlock *Latch = L.getLoopLatch(); |
2834 | const BasicBlock *CondBlock = TI.getParent(); |
2835 | if (DT.dominates(A: CondBlock, B: Latch) && |
2836 | (isGuard(U: &TI) || |
2837 | (TI.isTerminator() && |
2838 | llvm::count_if(Range: successors(I: &TI), P: [&L](const BasicBlock *SuccBB) { |
2839 | return L.contains(BB: SuccBB); |
2840 | }) <= 1))) { |
2841 | NumCostMultiplierSkipped++; |
2842 | return 1; |
2843 | } |
2844 | |
2845 | auto *ParentL = L.getParentLoop(); |
2846 | int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size() |
2847 | : std::distance(first: LI.begin(), last: LI.end())); |
2848 | // Count amount of clones that all the candidates might cause during |
2849 | // unswitching. Branch/guard/select counts as 1, switch counts as log2 of its |
2850 | // cases. |
2851 | int UnswitchedClones = 0; |
2852 | for (const auto &Candidate : UnswitchCandidates) { |
2853 | const Instruction *CI = Candidate.TI; |
2854 | const BasicBlock *CondBlock = CI->getParent(); |
2855 | bool SkipExitingSuccessors = DT.dominates(A: CondBlock, B: Latch); |
2856 | if (isa<SelectInst>(Val: CI)) { |
2857 | UnswitchedClones++; |
2858 | continue; |
2859 | } |
2860 | if (isGuard(U: CI)) { |
2861 | if (!SkipExitingSuccessors) |
2862 | UnswitchedClones++; |
2863 | continue; |
2864 | } |
2865 | int NonExitingSuccessors = |
2866 | llvm::count_if(Range: successors(BB: CondBlock), |
2867 | P: [SkipExitingSuccessors, &L](const BasicBlock *SuccBB) { |
2868 | return !SkipExitingSuccessors || L.contains(BB: SuccBB); |
2869 | }); |
2870 | UnswitchedClones += Log2_32(Value: NonExitingSuccessors); |
2871 | } |
2872 | |
2873 | // Ignore up to the "unscaled candidates" number of unswitch candidates |
2874 | // when calculating the power-of-two scaling of the cost. The main idea |
2875 | // with this control is to allow a small number of unswitches to happen |
2876 | // and rely more on siblings multiplier (see below) when the number |
2877 | // of candidates is small. |
2878 | unsigned ClonesPower = |
2879 | std::max(a: UnswitchedClones - (int)UnswitchNumInitialUnscaledCandidates, b: 0); |
2880 | |
2881 | // Allowing top-level loops to spread a bit more than nested ones. |
2882 | int SiblingsMultiplier = |
2883 | std::max(a: (ParentL ? SiblingsCount |
2884 | : SiblingsCount / (int)UnswitchSiblingsToplevelDiv), |
2885 | b: 1); |
2886 | // Compute the cost multiplier in a way that won't overflow by saturating |
2887 | // at an upper bound. |
2888 | int CostMultiplier; |
2889 | if (ClonesPower > Log2_32(Value: UnswitchThreshold) || |
2890 | SiblingsMultiplier > UnswitchThreshold) |
2891 | CostMultiplier = UnswitchThreshold; |
2892 | else |
2893 | CostMultiplier = std::min(a: SiblingsMultiplier * (1 << ClonesPower), |
2894 | b: (int)UnswitchThreshold); |
2895 | |
2896 | LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier |
2897 | << " (siblings " << SiblingsMultiplier << " * clones " |
2898 | << (1 << ClonesPower) << ")" |
2899 | << " for unswitch candidate: " << TI << "\n" ); |
2900 | return CostMultiplier; |
2901 | } |
2902 | |
2903 | static bool collectUnswitchCandidates( |
2904 | SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, |
2905 | IVConditionInfo &PartialIVInfo, Instruction *&PartialIVCondBranch, |
2906 | const Loop &L, const LoopInfo &LI, AAResults &AA, |
2907 | const MemorySSAUpdater *MSSAU) { |
2908 | assert(UnswitchCandidates.empty() && "Should be!" ); |
2909 | |
2910 | auto AddUnswitchCandidatesForInst = [&](Instruction *I, Value *Cond) { |
2911 | Cond = skipTrivialSelect(Cond); |
2912 | if (isa<Constant>(Val: Cond)) |
2913 | return; |
2914 | if (L.isLoopInvariant(V: Cond)) { |
2915 | UnswitchCandidates.push_back(Elt: {I, {Cond}}); |
2916 | return; |
2917 | } |
2918 | if (match(V: Cond, P: m_CombineOr(L: m_LogicalAnd(), R: m_LogicalOr()))) { |
2919 | TinyPtrVector<Value *> Invariants = |
2920 | collectHomogenousInstGraphLoopInvariants( |
2921 | L, Root&: *static_cast<Instruction *>(Cond), LI); |
2922 | if (!Invariants.empty()) |
2923 | UnswitchCandidates.push_back(Elt: {I, std::move(Invariants)}); |
2924 | } |
2925 | }; |
2926 | |
2927 | // Whether or not we should also collect guards in the loop. |
2928 | bool CollectGuards = false; |
2929 | if (UnswitchGuards) { |
2930 | auto *GuardDecl = Intrinsic::getDeclarationIfExists( |
2931 | M: L.getHeader()->getParent()->getParent(), id: Intrinsic::experimental_guard); |
2932 | if (GuardDecl && !GuardDecl->use_empty()) |
2933 | CollectGuards = true; |
2934 | } |
2935 | |
2936 | for (auto *BB : L.blocks()) { |
2937 | if (LI.getLoopFor(BB) != &L) |
2938 | continue; |
2939 | |
2940 | for (auto &I : *BB) { |
2941 | if (auto *SI = dyn_cast<SelectInst>(Val: &I)) { |
2942 | auto *Cond = SI->getCondition(); |
2943 | // Do not unswitch vector selects and logical and/or selects |
2944 | if (Cond->getType()->isIntegerTy(Bitwidth: 1) && !SI->getType()->isIntegerTy(Bitwidth: 1)) |
2945 | AddUnswitchCandidatesForInst(SI, Cond); |
2946 | } else if (CollectGuards && isGuard(U: &I)) { |
2947 | auto *Cond = |
2948 | skipTrivialSelect(Cond: cast<IntrinsicInst>(Val: &I)->getArgOperand(i: 0)); |
2949 | // TODO: Support AND, OR conditions and partial unswitching. |
2950 | if (!isa<Constant>(Val: Cond) && L.isLoopInvariant(V: Cond)) |
2951 | UnswitchCandidates.push_back(Elt: {&I, {Cond}}); |
2952 | } |
2953 | } |
2954 | |
2955 | if (auto *SI = dyn_cast<SwitchInst>(Val: BB->getTerminator())) { |
2956 | // We can only consider fully loop-invariant switch conditions as we need |
2957 | // to completely eliminate the switch after unswitching. |
2958 | if (!isa<Constant>(Val: SI->getCondition()) && |
2959 | L.isLoopInvariant(V: SI->getCondition()) && !BB->getUniqueSuccessor()) |
2960 | UnswitchCandidates.push_back(Elt: {SI, {SI->getCondition()}}); |
2961 | continue; |
2962 | } |
2963 | |
2964 | auto *BI = dyn_cast<BranchInst>(Val: BB->getTerminator()); |
2965 | if (!BI || !BI->isConditional() || |
2966 | BI->getSuccessor(i: 0) == BI->getSuccessor(i: 1)) |
2967 | continue; |
2968 | |
2969 | AddUnswitchCandidatesForInst(BI, BI->getCondition()); |
2970 | } |
2971 | |
2972 | if (MSSAU && !findOptionMDForLoop(TheLoop: &L, Name: "llvm.loop.unswitch.partial.disable" ) && |
2973 | !any_of(Range&: UnswitchCandidates, P: [&L](auto &TerminatorAndInvariants) { |
2974 | return TerminatorAndInvariants.TI == L.getHeader()->getTerminator(); |
2975 | })) { |
2976 | MemorySSA *MSSA = MSSAU->getMemorySSA(); |
2977 | if (auto Info = hasPartialIVCondition(L, MSSAThreshold, MSSA: *MSSA, AA)) { |
2978 | LLVM_DEBUG( |
2979 | dbgs() << "simple-loop-unswitch: Found partially invariant condition " |
2980 | << *Info->InstToDuplicate[0] << "\n" ); |
2981 | PartialIVInfo = *Info; |
2982 | PartialIVCondBranch = L.getHeader()->getTerminator(); |
2983 | TinyPtrVector<Value *> ValsToDuplicate; |
2984 | llvm::append_range(C&: ValsToDuplicate, R&: Info->InstToDuplicate); |
2985 | UnswitchCandidates.push_back( |
2986 | Elt: {L.getHeader()->getTerminator(), std::move(ValsToDuplicate)}); |
2987 | } |
2988 | } |
2989 | return !UnswitchCandidates.empty(); |
2990 | } |
2991 | |
2992 | /// Tries to canonicalize condition described by: |
2993 | /// |
2994 | /// br (LHS pred RHS), label IfTrue, label IfFalse |
2995 | /// |
2996 | /// into its equivalent where `Pred` is something that we support for injected |
2997 | /// invariants (so far it is limited to ult), LHS in canonicalized form is |
2998 | /// non-invariant and RHS is an invariant. |
2999 | static void canonicalizeForInvariantConditionInjection(CmpPredicate &Pred, |
3000 | Value *&LHS, Value *&RHS, |
3001 | BasicBlock *&IfTrue, |
3002 | BasicBlock *&IfFalse, |
3003 | const Loop &L) { |
3004 | if (!L.contains(BB: IfTrue)) { |
3005 | Pred = ICmpInst::getInversePredicate(pred: Pred); |
3006 | std::swap(a&: IfTrue, b&: IfFalse); |
3007 | } |
3008 | |
3009 | // Move loop-invariant argument to RHS position. |
3010 | if (L.isLoopInvariant(V: LHS)) { |
3011 | Pred = ICmpInst::getSwappedPredicate(pred: Pred); |
3012 | std::swap(a&: LHS, b&: RHS); |
3013 | } |
3014 | |
3015 | if (Pred == ICmpInst::ICMP_SGE && match(V: RHS, P: m_Zero())) { |
3016 | // Turn "x >=s 0" into "x <u UMIN_INT" |
3017 | Pred = ICmpInst::ICMP_ULT; |
3018 | RHS = ConstantInt::get( |
3019 | Context&: RHS->getContext(), |
3020 | V: APInt::getSignedMinValue(numBits: RHS->getType()->getIntegerBitWidth())); |
3021 | } |
3022 | } |
3023 | |
3024 | /// Returns true, if predicate described by ( \p Pred, \p LHS, \p RHS ) |
3025 | /// succeeding into blocks ( \p IfTrue, \p IfFalse) can be optimized by |
3026 | /// injecting a loop-invariant condition. |
3027 | static bool shouldTryInjectInvariantCondition( |
3028 | const ICmpInst::Predicate Pred, const Value *LHS, const Value *RHS, |
3029 | const BasicBlock *IfTrue, const BasicBlock *IfFalse, const Loop &L) { |
3030 | if (L.isLoopInvariant(V: LHS) || !L.isLoopInvariant(V: RHS)) |
3031 | return false; |
3032 | // TODO: Support other predicates. |
3033 | if (Pred != ICmpInst::ICMP_ULT) |
3034 | return false; |
3035 | // TODO: Support non-loop-exiting branches? |
3036 | if (!L.contains(BB: IfTrue) || L.contains(BB: IfFalse)) |
3037 | return false; |
3038 | // FIXME: For some reason this causes problems with MSSA updates, need to |
3039 | // investigate why. So far, just don't unswitch latch. |
3040 | if (L.getHeader() == IfTrue) |
3041 | return false; |
3042 | return true; |
3043 | } |
3044 | |
3045 | /// Returns true, if metadata on \p BI allows us to optimize branching into \p |
3046 | /// TakenSucc via injection of invariant conditions. The branch should be not |
3047 | /// enough and not previously unswitched, the information about this comes from |
3048 | /// the metadata. |
3049 | bool shouldTryInjectBasingOnMetadata(const BranchInst *BI, |
3050 | const BasicBlock *TakenSucc) { |
3051 | SmallVector<uint32_t> Weights; |
3052 | if (!extractBranchWeights(I: *BI, Weights)) |
3053 | return false; |
3054 | unsigned T = InjectInvariantConditionHotnesThreshold; |
3055 | BranchProbability LikelyTaken(T - 1, T); |
3056 | |
3057 | assert(Weights.size() == 2 && "Unexpected profile data!" ); |
3058 | size_t Idx = BI->getSuccessor(i: 0) == TakenSucc ? 0 : 1; |
3059 | auto Num = Weights[Idx]; |
3060 | auto Denom = Weights[0] + Weights[1]; |
3061 | // Degenerate or overflowed metadata. |
3062 | if (Denom == 0 || Num > Denom) |
3063 | return false; |
3064 | BranchProbability ActualTaken(Num, Denom); |
3065 | if (LikelyTaken > ActualTaken) |
3066 | return false; |
3067 | return true; |
3068 | } |
3069 | |
3070 | /// Materialize pending invariant condition of the given candidate into IR. The |
3071 | /// injected loop-invariant condition implies the original loop-variant branch |
3072 | /// condition, so the materialization turns |
3073 | /// |
3074 | /// loop_block: |
3075 | /// ... |
3076 | /// br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc |
3077 | /// |
3078 | /// into |
3079 | /// |
3080 | /// preheader: |
3081 | /// %invariant_cond = LHS pred RHS |
3082 | /// ... |
3083 | /// loop_block: |
3084 | /// br i1 %invariant_cond, label InLoopSucc, label OriginalCheck |
3085 | /// OriginalCheck: |
3086 | /// br i1 %variant_cond, label InLoopSucc, label OutOfLoopSucc |
3087 | /// ... |
3088 | static NonTrivialUnswitchCandidate |
3089 | injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, |
3090 | DominatorTree &DT, LoopInfo &LI, |
3091 | AssumptionCache &AC, MemorySSAUpdater *MSSAU) { |
3092 | assert(Candidate.hasPendingInjection() && "Nothing to inject!" ); |
3093 | BasicBlock * = L.getLoopPreheader(); |
3094 | assert(Preheader && "Loop is not in simplified form?" ); |
3095 | assert(LI.getLoopFor(Candidate.TI->getParent()) == &L && |
3096 | "Unswitching branch of inner loop!" ); |
3097 | |
3098 | auto Pred = Candidate.PendingInjection->Pred; |
3099 | auto *LHS = Candidate.PendingInjection->LHS; |
3100 | auto *RHS = Candidate.PendingInjection->RHS; |
3101 | auto *InLoopSucc = Candidate.PendingInjection->InLoopSucc; |
3102 | auto *TI = cast<BranchInst>(Val: Candidate.TI); |
3103 | auto *BB = Candidate.TI->getParent(); |
3104 | auto *OutOfLoopSucc = InLoopSucc == TI->getSuccessor(i: 0) ? TI->getSuccessor(i: 1) |
3105 | : TI->getSuccessor(i: 0); |
3106 | // FIXME: Remove this once limitation on successors is lifted. |
3107 | assert(L.contains(InLoopSucc) && "Not supported yet!" ); |
3108 | assert(!L.contains(OutOfLoopSucc) && "Not supported yet!" ); |
3109 | auto &Ctx = BB->getContext(); |
3110 | |
3111 | IRBuilder<> Builder(Preheader->getTerminator()); |
3112 | assert(ICmpInst::isUnsigned(Pred) && "Not supported yet!" ); |
3113 | if (LHS->getType() != RHS->getType()) { |
3114 | if (LHS->getType()->getIntegerBitWidth() < |
3115 | RHS->getType()->getIntegerBitWidth()) |
3116 | LHS = Builder.CreateZExt(V: LHS, DestTy: RHS->getType(), Name: LHS->getName() + ".wide" ); |
3117 | else |
3118 | RHS = Builder.CreateZExt(V: RHS, DestTy: LHS->getType(), Name: RHS->getName() + ".wide" ); |
3119 | } |
3120 | // Do not use builder here: CreateICmp may simplify this into a constant and |
3121 | // unswitching will break. Better optimize it away later. |
3122 | auto *InjectedCond = |
3123 | ICmpInst::Create(Op: Instruction::ICmp, Pred, S1: LHS, S2: RHS, Name: "injected.cond" , |
3124 | InsertBefore: Preheader->getTerminator()->getIterator()); |
3125 | |
3126 | BasicBlock *CheckBlock = BasicBlock::Create(Context&: Ctx, Name: BB->getName() + ".check" , |
3127 | Parent: BB->getParent(), InsertBefore: InLoopSucc); |
3128 | Builder.SetInsertPoint(TI); |
3129 | auto *InvariantBr = |
3130 | Builder.CreateCondBr(Cond: InjectedCond, True: InLoopSucc, False: CheckBlock); |
3131 | |
3132 | Builder.SetInsertPoint(CheckBlock); |
3133 | Builder.CreateCondBr(Cond: TI->getCondition(), True: TI->getSuccessor(i: 0), |
3134 | False: TI->getSuccessor(i: 1)); |
3135 | TI->eraseFromParent(); |
3136 | |
3137 | // Fixup phis. |
3138 | for (auto &I : *InLoopSucc) { |
3139 | auto *PN = dyn_cast<PHINode>(Val: &I); |
3140 | if (!PN) |
3141 | break; |
3142 | auto *Inc = PN->getIncomingValueForBlock(BB); |
3143 | PN->addIncoming(V: Inc, BB: CheckBlock); |
3144 | } |
3145 | OutOfLoopSucc->replacePhiUsesWith(Old: BB, New: CheckBlock); |
3146 | |
3147 | SmallVector<DominatorTree::UpdateType, 4> DTUpdates = { |
3148 | { DominatorTree::Insert, BB, CheckBlock }, |
3149 | { DominatorTree::Insert, CheckBlock, InLoopSucc }, |
3150 | { DominatorTree::Insert, CheckBlock, OutOfLoopSucc }, |
3151 | { DominatorTree::Delete, BB, OutOfLoopSucc } |
3152 | }; |
3153 | |
3154 | DT.applyUpdates(Updates: DTUpdates); |
3155 | if (MSSAU) |
3156 | MSSAU->applyUpdates(Updates: DTUpdates, DT); |
3157 | L.addBasicBlockToLoop(NewBB: CheckBlock, LI); |
3158 | |
3159 | #ifndef NDEBUG |
3160 | DT.verify(); |
3161 | LI.verify(DT); |
3162 | if (MSSAU && VerifyMemorySSA) |
3163 | MSSAU->getMemorySSA()->verifyMemorySSA(); |
3164 | #endif |
3165 | |
3166 | // TODO: In fact, cost of unswitching a new invariant candidate is *slightly* |
3167 | // higher because we have just inserted a new block. Need to think how to |
3168 | // adjust the cost of injected candidates when it was first computed. |
3169 | LLVM_DEBUG(dbgs() << "Injected a new loop-invariant branch " << *InvariantBr |
3170 | << " and considering it for unswitching." ); |
3171 | ++NumInvariantConditionsInjected; |
3172 | return NonTrivialUnswitchCandidate(InvariantBr, { InjectedCond }, |
3173 | Candidate.Cost); |
3174 | } |
3175 | |
3176 | /// Given chain of loop branch conditions looking like: |
3177 | /// br (Variant < Invariant1) |
3178 | /// br (Variant < Invariant2) |
3179 | /// br (Variant < Invariant3) |
3180 | /// ... |
3181 | /// collect set of invariant conditions on which we want to unswitch, which |
3182 | /// look like: |
3183 | /// Invariant1 <= Invariant2 |
3184 | /// Invariant2 <= Invariant3 |
3185 | /// ... |
3186 | /// Though they might not immediately exist in the IR, we can still inject them. |
3187 | static bool insertCandidatesWithPendingInjections( |
3188 | SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, Loop &L, |
3189 | ICmpInst::Predicate Pred, ArrayRef<CompareDesc> Compares, |
3190 | const DominatorTree &DT) { |
3191 | |
3192 | assert(ICmpInst::isRelational(Pred)); |
3193 | assert(ICmpInst::isStrictPredicate(Pred)); |
3194 | if (Compares.size() < 2) |
3195 | return false; |
3196 | ICmpInst::Predicate NonStrictPred = ICmpInst::getNonStrictPredicate(pred: Pred); |
3197 | for (auto Prev = Compares.begin(), Next = Compares.begin() + 1; |
3198 | Next != Compares.end(); ++Prev, ++Next) { |
3199 | Value *LHS = Next->Invariant; |
3200 | Value *RHS = Prev->Invariant; |
3201 | BasicBlock *InLoopSucc = Prev->InLoopSucc; |
3202 | InjectedInvariant ToInject(NonStrictPred, LHS, RHS, InLoopSucc); |
3203 | NonTrivialUnswitchCandidate Candidate(Prev->Term, { LHS, RHS }, |
3204 | std::nullopt, std::move(ToInject)); |
3205 | UnswitchCandidates.push_back(Elt: std::move(Candidate)); |
3206 | } |
3207 | return true; |
3208 | } |
3209 | |
3210 | /// Collect unswitch candidates by invariant conditions that are not immediately |
3211 | /// present in the loop. However, they can be injected into the code if we |
3212 | /// decide it's profitable. |
3213 | /// An example of such conditions is following: |
3214 | /// |
3215 | /// for (...) { |
3216 | /// x = load ... |
3217 | /// if (! x <u C1) break; |
3218 | /// if (! x <u C2) break; |
3219 | /// <do something> |
3220 | /// } |
3221 | /// |
3222 | /// We can unswitch by condition "C1 <=u C2". If that is true, then "x <u C1 <= |
3223 | /// C2" automatically implies "x <u C2", so we can get rid of one of |
3224 | /// loop-variant checks in unswitched loop version. |
3225 | static bool collectUnswitchCandidatesWithInjections( |
3226 | SmallVectorImpl<NonTrivialUnswitchCandidate> &UnswitchCandidates, |
3227 | IVConditionInfo &PartialIVInfo, Instruction *&PartialIVCondBranch, Loop &L, |
3228 | const DominatorTree &DT, const LoopInfo &LI, AAResults &AA, |
3229 | const MemorySSAUpdater *MSSAU) { |
3230 | if (!InjectInvariantConditions) |
3231 | return false; |
3232 | |
3233 | if (!DT.isReachableFromEntry(A: L.getHeader())) |
3234 | return false; |
3235 | auto *Latch = L.getLoopLatch(); |
3236 | // Need to have a single latch and a preheader. |
3237 | if (!Latch) |
3238 | return false; |
3239 | assert(L.getLoopPreheader() && "Must have a preheader!" ); |
3240 | |
3241 | DenseMap<Value *, SmallVector<CompareDesc, 4> > CandidatesULT; |
3242 | // Traverse the conditions that dominate latch (and therefore dominate each |
3243 | // other). |
3244 | for (auto *DTN = DT.getNode(BB: Latch); L.contains(BB: DTN->getBlock()); |
3245 | DTN = DTN->getIDom()) { |
3246 | CmpPredicate Pred; |
3247 | Value *LHS = nullptr, *RHS = nullptr; |
3248 | BasicBlock *IfTrue = nullptr, *IfFalse = nullptr; |
3249 | auto *BB = DTN->getBlock(); |
3250 | // Ignore inner loops. |
3251 | if (LI.getLoopFor(BB) != &L) |
3252 | continue; |
3253 | auto *Term = BB->getTerminator(); |
3254 | if (!match(V: Term, P: m_Br(C: m_ICmp(Pred, L: m_Value(V&: LHS), R: m_Value(V&: RHS)), |
3255 | T: m_BasicBlock(V&: IfTrue), F: m_BasicBlock(V&: IfFalse)))) |
3256 | continue; |
3257 | if (!LHS->getType()->isIntegerTy()) |
3258 | continue; |
3259 | canonicalizeForInvariantConditionInjection(Pred, LHS, RHS, IfTrue, IfFalse, |
3260 | L); |
3261 | if (!shouldTryInjectInvariantCondition(Pred, LHS, RHS, IfTrue, IfFalse, L)) |
3262 | continue; |
3263 | if (!shouldTryInjectBasingOnMetadata(BI: cast<BranchInst>(Val: Term), TakenSucc: IfTrue)) |
3264 | continue; |
3265 | // Strip ZEXT for unsigned predicate. |
3266 | // TODO: once signed predicates are supported, also strip SEXT. |
3267 | CompareDesc Desc(cast<BranchInst>(Val: Term), RHS, IfTrue); |
3268 | while (auto *Zext = dyn_cast<ZExtInst>(Val: LHS)) |
3269 | LHS = Zext->getOperand(i_nocapture: 0); |
3270 | CandidatesULT[LHS].push_back(Elt: Desc); |
3271 | } |
3272 | |
3273 | bool Found = false; |
3274 | for (auto &It : CandidatesULT) |
3275 | Found |= insertCandidatesWithPendingInjections( |
3276 | UnswitchCandidates, L, Pred: ICmpInst::ICMP_ULT, Compares: It.second, DT); |
3277 | return Found; |
3278 | } |
3279 | |
3280 | static bool isSafeForNoNTrivialUnswitching(Loop &L, LoopInfo &LI) { |
3281 | if (!L.isSafeToClone()) |
3282 | return false; |
3283 | for (auto *BB : L.blocks()) |
3284 | for (auto &I : *BB) { |
3285 | if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) |
3286 | return false; |
3287 | if (auto *CB = dyn_cast<CallBase>(Val: &I)) { |
3288 | assert(!CB->cannotDuplicate() && "Checked by L.isSafeToClone()." ); |
3289 | if (CB->isConvergent()) |
3290 | return false; |
3291 | } |
3292 | } |
3293 | |
3294 | // Check if there are irreducible CFG cycles in this loop. If so, we cannot |
3295 | // easily unswitch non-trivial edges out of the loop. Doing so might turn the |
3296 | // irreducible control flow into reducible control flow and introduce new |
3297 | // loops "out of thin air". If we ever discover important use cases for doing |
3298 | // this, we can add support to loop unswitch, but it is a lot of complexity |
3299 | // for what seems little or no real world benefit. |
3300 | LoopBlocksRPO RPOT(&L); |
3301 | RPOT.perform(LI: &LI); |
3302 | if (containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI)) |
3303 | return false; |
3304 | |
3305 | SmallVector<BasicBlock *, 4> ExitBlocks; |
3306 | L.getUniqueExitBlocks(ExitBlocks); |
3307 | // We cannot unswitch if exit blocks contain a cleanuppad/catchswitch |
3308 | // instruction as we don't know how to split those exit blocks. |
3309 | // FIXME: We should teach SplitBlock to handle this and remove this |
3310 | // restriction. |
3311 | for (auto *ExitBB : ExitBlocks) { |
3312 | auto It = ExitBB->getFirstNonPHIIt(); |
3313 | if (isa<CleanupPadInst>(Val: It) || isa<CatchSwitchInst>(Val: It)) { |
3314 | LLVM_DEBUG(dbgs() << "Cannot unswitch because of cleanuppad/catchswitch " |
3315 | "in exit block\n" ); |
3316 | return false; |
3317 | } |
3318 | } |
3319 | |
3320 | return true; |
3321 | } |
3322 | |
3323 | static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate( |
3324 | ArrayRef<NonTrivialUnswitchCandidate> UnswitchCandidates, const Loop &L, |
3325 | const DominatorTree &DT, const LoopInfo &LI, AssumptionCache &AC, |
3326 | const TargetTransformInfo &TTI, const IVConditionInfo &PartialIVInfo) { |
3327 | // Given that unswitching these terminators will require duplicating parts of |
3328 | // the loop, so we need to be able to model that cost. Compute the ephemeral |
3329 | // values and set up a data structure to hold per-BB costs. We cache each |
3330 | // block's cost so that we don't recompute this when considering different |
3331 | // subsets of the loop for duplication during unswitching. |
3332 | SmallPtrSet<const Value *, 4> EphValues; |
3333 | CodeMetrics::collectEphemeralValues(L: &L, AC: &AC, EphValues); |
3334 | SmallDenseMap<BasicBlock *, InstructionCost, 4> BBCostMap; |
3335 | |
3336 | // Compute the cost of each block, as well as the total loop cost. Also, bail |
3337 | // out if we see instructions which are incompatible with loop unswitching |
3338 | // (convergent, noduplicate, or cross-basic-block tokens). |
3339 | // FIXME: We might be able to safely handle some of these in non-duplicated |
3340 | // regions. |
3341 | TargetTransformInfo::TargetCostKind CostKind = |
3342 | L.getHeader()->getParent()->hasMinSize() |
3343 | ? TargetTransformInfo::TCK_CodeSize |
3344 | : TargetTransformInfo::TCK_SizeAndLatency; |
3345 | InstructionCost LoopCost = 0; |
3346 | for (auto *BB : L.blocks()) { |
3347 | InstructionCost Cost = 0; |
3348 | for (auto &I : *BB) { |
3349 | if (EphValues.count(Ptr: &I)) |
3350 | continue; |
3351 | Cost += TTI.getInstructionCost(U: &I, CostKind); |
3352 | } |
3353 | assert(Cost >= 0 && "Must not have negative costs!" ); |
3354 | LoopCost += Cost; |
3355 | assert(LoopCost >= 0 && "Must not have negative loop costs!" ); |
3356 | BBCostMap[BB] = Cost; |
3357 | } |
3358 | LLVM_DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n" ); |
3359 | |
3360 | // Now we find the best candidate by searching for the one with the following |
3361 | // properties in order: |
3362 | // |
3363 | // 1) An unswitching cost below the threshold |
3364 | // 2) The smallest number of duplicated unswitch candidates (to avoid |
3365 | // creating redundant subsequent unswitching) |
3366 | // 3) The smallest cost after unswitching. |
3367 | // |
3368 | // We prioritize reducing fanout of unswitch candidates provided the cost |
3369 | // remains below the threshold because this has a multiplicative effect. |
3370 | // |
3371 | // This requires memoizing each dominator subtree to avoid redundant work. |
3372 | // |
3373 | // FIXME: Need to actually do the number of candidates part above. |
3374 | SmallDenseMap<DomTreeNode *, InstructionCost, 4> DTCostMap; |
3375 | // Given a terminator which might be unswitched, computes the non-duplicated |
3376 | // cost for that terminator. |
3377 | auto ComputeUnswitchedCost = [&](Instruction &TI, |
3378 | bool FullUnswitch) -> InstructionCost { |
3379 | // Unswitching selects unswitches the entire loop. |
3380 | if (isa<SelectInst>(Val: TI)) |
3381 | return LoopCost; |
3382 | |
3383 | BasicBlock &BB = *TI.getParent(); |
3384 | SmallPtrSet<BasicBlock *, 4> Visited; |
3385 | |
3386 | InstructionCost Cost = 0; |
3387 | for (BasicBlock *SuccBB : successors(BB: &BB)) { |
3388 | // Don't count successors more than once. |
3389 | if (!Visited.insert(Ptr: SuccBB).second) |
3390 | continue; |
3391 | |
3392 | // If this is a partial unswitch candidate, then it must be a conditional |
3393 | // branch with a condition of either `or`, `and`, their corresponding |
3394 | // select forms or partially invariant instructions. In that case, one of |
3395 | // the successors is necessarily duplicated, so don't even try to remove |
3396 | // its cost. |
3397 | if (!FullUnswitch) { |
3398 | auto &BI = cast<BranchInst>(Val&: TI); |
3399 | Value *Cond = skipTrivialSelect(Cond: BI.getCondition()); |
3400 | if (match(V: Cond, P: m_LogicalAnd())) { |
3401 | if (SuccBB == BI.getSuccessor(i: 1)) |
3402 | continue; |
3403 | } else if (match(V: Cond, P: m_LogicalOr())) { |
3404 | if (SuccBB == BI.getSuccessor(i: 0)) |
3405 | continue; |
3406 | } else if ((PartialIVInfo.KnownValue->isOneValue() && |
3407 | SuccBB == BI.getSuccessor(i: 0)) || |
3408 | (!PartialIVInfo.KnownValue->isOneValue() && |
3409 | SuccBB == BI.getSuccessor(i: 1))) |
3410 | continue; |
3411 | } |
3412 | |
3413 | // This successor's domtree will not need to be duplicated after |
3414 | // unswitching if the edge to the successor dominates it (and thus the |
3415 | // entire tree). This essentially means there is no other path into this |
3416 | // subtree and so it will end up live in only one clone of the loop. |
3417 | if (SuccBB->getUniquePredecessor() || |
3418 | llvm::all_of(Range: predecessors(BB: SuccBB), P: [&](BasicBlock *PredBB) { |
3419 | return PredBB == &BB || DT.dominates(A: SuccBB, B: PredBB); |
3420 | })) { |
3421 | Cost += computeDomSubtreeCost(N&: *DT[SuccBB], BBCostMap, DTCostMap); |
3422 | assert(Cost <= LoopCost && |
3423 | "Non-duplicated cost should never exceed total loop cost!" ); |
3424 | } |
3425 | } |
3426 | |
3427 | // Now scale the cost by the number of unique successors minus one. We |
3428 | // subtract one because there is already at least one copy of the entire |
3429 | // loop. This is computing the new cost of unswitching a condition. |
3430 | // Note that guards always have 2 unique successors that are implicit and |
3431 | // will be materialized if we decide to unswitch it. |
3432 | int SuccessorsCount = isGuard(U: &TI) ? 2 : Visited.size(); |
3433 | assert(SuccessorsCount > 1 && |
3434 | "Cannot unswitch a condition without multiple distinct successors!" ); |
3435 | return (LoopCost - Cost) * (SuccessorsCount - 1); |
3436 | }; |
3437 | |
3438 | std::optional<NonTrivialUnswitchCandidate> Best; |
3439 | for (auto &Candidate : UnswitchCandidates) { |
3440 | Instruction &TI = *Candidate.TI; |
3441 | ArrayRef<Value *> Invariants = Candidate.Invariants; |
3442 | BranchInst *BI = dyn_cast<BranchInst>(Val: &TI); |
3443 | bool FullUnswitch = |
3444 | !BI || Candidate.hasPendingInjection() || |
3445 | (Invariants.size() == 1 && |
3446 | Invariants[0] == skipTrivialSelect(Cond: BI->getCondition())); |
3447 | InstructionCost CandidateCost = ComputeUnswitchedCost(TI, FullUnswitch); |
3448 | // Calculate cost multiplier which is a tool to limit potentially |
3449 | // exponential behavior of loop-unswitch. |
3450 | if (EnableUnswitchCostMultiplier) { |
3451 | int CostMultiplier = |
3452 | CalculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates); |
3453 | assert( |
3454 | (CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) && |
3455 | "cost multiplier needs to be in the range of 1..UnswitchThreshold" ); |
3456 | CandidateCost *= CostMultiplier; |
3457 | LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost |
3458 | << " (multiplier: " << CostMultiplier << ")" |
3459 | << " for unswitch candidate: " << TI << "\n" ); |
3460 | } else { |
3461 | LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost |
3462 | << " for unswitch candidate: " << TI << "\n" ); |
3463 | } |
3464 | |
3465 | if (!Best || CandidateCost < Best->Cost) { |
3466 | Best = Candidate; |
3467 | Best->Cost = CandidateCost; |
3468 | } |
3469 | } |
3470 | assert(Best && "Must be!" ); |
3471 | return *Best; |
3472 | } |
3473 | |
3474 | // Insert a freeze on an unswitched branch if all is true: |
3475 | // 1. freeze-loop-unswitch-cond option is true |
3476 | // 2. The branch may not execute in the loop pre-transformation. If a branch may |
3477 | // not execute and could cause UB, it would always cause UB if it is hoisted outside |
3478 | // of the loop. Insert a freeze to prevent this case. |
3479 | // 3. The branch condition may be poison or undef |
3480 | static bool shouldInsertFreeze(Loop &L, Instruction &TI, DominatorTree &DT, |
3481 | AssumptionCache &AC) { |
3482 | assert(isa<BranchInst>(TI) || isa<SwitchInst>(TI)); |
3483 | if (!FreezeLoopUnswitchCond) |
3484 | return false; |
3485 | |
3486 | ICFLoopSafetyInfo SafetyInfo; |
3487 | SafetyInfo.computeLoopSafetyInfo(CurLoop: &L); |
3488 | if (SafetyInfo.isGuaranteedToExecute(Inst: TI, DT: &DT, CurLoop: &L)) |
3489 | return false; |
3490 | |
3491 | Value *Cond; |
3492 | if (BranchInst *BI = dyn_cast<BranchInst>(Val: &TI)) |
3493 | Cond = skipTrivialSelect(Cond: BI->getCondition()); |
3494 | else |
3495 | Cond = skipTrivialSelect(Cond: cast<SwitchInst>(Val: &TI)->getCondition()); |
3496 | return !isGuaranteedNotToBeUndefOrPoison( |
3497 | V: Cond, AC: &AC, CtxI: L.getLoopPreheader()->getTerminator(), DT: &DT); |
3498 | } |
3499 | |
3500 | static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, |
3501 | AssumptionCache &AC, AAResults &AA, |
3502 | TargetTransformInfo &TTI, ScalarEvolution *SE, |
3503 | MemorySSAUpdater *MSSAU, |
3504 | LPMUpdater &LoopUpdater) { |
3505 | // Collect all invariant conditions within this loop (as opposed to an inner |
3506 | // loop which would be handled when visiting that inner loop). |
3507 | SmallVector<NonTrivialUnswitchCandidate, 4> UnswitchCandidates; |
3508 | IVConditionInfo PartialIVInfo; |
3509 | Instruction *PartialIVCondBranch = nullptr; |
3510 | collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, |
3511 | PartialIVCondBranch, L, LI, AA, MSSAU); |
3512 | if (!findOptionMDForLoop(TheLoop: &L, Name: "llvm.loop.unswitch.injection.disable" )) |
3513 | collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo, |
3514 | PartialIVCondBranch, L, DT, LI, AA, |
3515 | MSSAU); |
3516 | // If we didn't find any candidates, we're done. |
3517 | if (UnswitchCandidates.empty()) |
3518 | return false; |
3519 | |
3520 | LLVM_DEBUG( |
3521 | dbgs() << "Considering " << UnswitchCandidates.size() |
3522 | << " non-trivial loop invariant conditions for unswitching.\n" ); |
3523 | |
3524 | NonTrivialUnswitchCandidate Best = findBestNonTrivialUnswitchCandidate( |
3525 | UnswitchCandidates, L, DT, LI, AC, TTI, PartialIVInfo); |
3526 | |
3527 | assert(Best.TI && "Failed to find loop unswitch candidate" ); |
3528 | assert(Best.Cost && "Failed to compute cost" ); |
3529 | |
3530 | if (*Best.Cost >= UnswitchThreshold) { |
3531 | LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: " << *Best.Cost |
3532 | << "\n" ); |
3533 | return false; |
3534 | } |
3535 | |
3536 | bool InjectedCondition = false; |
3537 | if (Best.hasPendingInjection()) { |
3538 | Best = injectPendingInvariantConditions(Candidate: Best, L, DT, LI, AC, MSSAU); |
3539 | InjectedCondition = true; |
3540 | } |
3541 | assert(!Best.hasPendingInjection() && |
3542 | "All injections should have been done by now!" ); |
3543 | |
3544 | if (Best.TI != PartialIVCondBranch) |
3545 | PartialIVInfo.InstToDuplicate.clear(); |
3546 | |
3547 | bool InsertFreeze; |
3548 | if (auto *SI = dyn_cast<SelectInst>(Val: Best.TI)) { |
3549 | // If the best candidate is a select, turn it into a branch. Select |
3550 | // instructions with a poison conditional do not propagate poison, but |
3551 | // branching on poison causes UB. Insert a freeze on the select |
3552 | // conditional to prevent UB after turning the select into a branch. |
3553 | InsertFreeze = !isGuaranteedNotToBeUndefOrPoison( |
3554 | V: SI->getCondition(), AC: &AC, CtxI: L.getLoopPreheader()->getTerminator(), DT: &DT); |
3555 | Best.TI = turnSelectIntoBranch(SI, DT, LI, MSSAU, AC: &AC); |
3556 | } else { |
3557 | // If the best candidate is a guard, turn it into a branch. |
3558 | if (isGuard(U: Best.TI)) |
3559 | Best.TI = |
3560 | turnGuardIntoBranch(GI: cast<IntrinsicInst>(Val: Best.TI), L, DT, LI, MSSAU); |
3561 | InsertFreeze = shouldInsertFreeze(L, TI&: *Best.TI, DT, AC); |
3562 | } |
3563 | |
3564 | LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = " << Best.Cost |
3565 | << ") terminator: " << *Best.TI << "\n" ); |
3566 | unswitchNontrivialInvariants(L, TI&: *Best.TI, Invariants: Best.Invariants, PartialIVInfo, DT, |
3567 | LI, AC, SE, MSSAU, LoopUpdater, InsertFreeze, |
3568 | InjectedCondition); |
3569 | return true; |
3570 | } |
3571 | |
3572 | /// Unswitch control flow predicated on loop invariant conditions. |
3573 | /// |
3574 | /// This first hoists all branches or switches which are trivial (IE, do not |
3575 | /// require duplicating any part of the loop) out of the loop body. It then |
3576 | /// looks at other loop invariant control flows and tries to unswitch those as |
3577 | /// well by cloning the loop if the result is small enough. |
3578 | /// |
3579 | /// The `DT`, `LI`, `AC`, `AA`, `TTI` parameters are required analyses that are |
3580 | /// also updated based on the unswitch. The `MSSA` analysis is also updated if |
3581 | /// valid (i.e. its use is enabled). |
3582 | /// |
3583 | /// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is |
3584 | /// true, we will attempt to do non-trivial unswitching as well as trivial |
3585 | /// unswitching. |
3586 | /// |
3587 | /// The `postUnswitch` function will be run after unswitching is complete |
3588 | /// with information on whether or not the provided loop remains a loop and |
3589 | /// a list of new sibling loops created. |
3590 | /// |
3591 | /// If `SE` is non-null, we will update that analysis based on the unswitching |
3592 | /// done. |
3593 | static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, |
3594 | AssumptionCache &AC, AAResults &AA, |
3595 | TargetTransformInfo &TTI, bool Trivial, |
3596 | bool NonTrivial, ScalarEvolution *SE, |
3597 | MemorySSAUpdater *MSSAU, ProfileSummaryInfo *PSI, |
3598 | BlockFrequencyInfo *BFI, LPMUpdater &LoopUpdater) { |
3599 | assert(L.isRecursivelyLCSSAForm(DT, LI) && |
3600 | "Loops must be in LCSSA form before unswitching." ); |
3601 | |
3602 | // Must be in loop simplified form: we need a preheader and dedicated exits. |
3603 | if (!L.isLoopSimplifyForm()) |
3604 | return false; |
3605 | |
3606 | // Try trivial unswitch first before loop over other basic blocks in the loop. |
3607 | if (Trivial && unswitchAllTrivialConditions(L, DT, LI, SE, MSSAU)) { |
3608 | // If we unswitched successfully we will want to clean up the loop before |
3609 | // processing it further so just mark it as unswitched and return. |
3610 | postUnswitch(L, U&: LoopUpdater, LoopName: L.getName(), |
3611 | /*CurrentLoopValid*/ true, /*PartiallyInvariant*/ false, |
3612 | /*InjectedCondition*/ false, NewLoops: {}); |
3613 | return true; |
3614 | } |
3615 | |
3616 | const Function *F = L.getHeader()->getParent(); |
3617 | |
3618 | // Check whether we should continue with non-trivial conditions. |
3619 | // EnableNonTrivialUnswitch: Global variable that forces non-trivial |
3620 | // unswitching for testing and debugging. |
3621 | // NonTrivial: Parameter that enables non-trivial unswitching for this |
3622 | // invocation of the transform. But this should be allowed only |
3623 | // for targets without branch divergence. |
3624 | // |
3625 | // FIXME: If divergence analysis becomes available to a loop |
3626 | // transform, we should allow unswitching for non-trivial uniform |
3627 | // branches even on targets that have divergence. |
3628 | // https://bugs.llvm.org/show_bug.cgi?id=48819 |
3629 | bool ContinueWithNonTrivial = |
3630 | EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence(F)); |
3631 | if (!ContinueWithNonTrivial) |
3632 | return false; |
3633 | |
3634 | // Skip non-trivial unswitching for optsize functions. |
3635 | if (F->hasOptSize()) |
3636 | return false; |
3637 | |
3638 | // Returns true if Loop L's loop nest is cold, i.e. if the headers of L, |
3639 | // of the loops L is nested in, and of the loops nested in L are all cold. |
3640 | auto IsLoopNestCold = [&](const Loop *L) { |
3641 | // Check L and all of its parent loops. |
3642 | auto *Parent = L; |
3643 | while (Parent) { |
3644 | if (!PSI->isColdBlock(BB: Parent->getHeader(), BFI)) |
3645 | return false; |
3646 | Parent = Parent->getParentLoop(); |
3647 | } |
3648 | // Next check all loops nested within L. |
3649 | SmallVector<const Loop *, 4> Worklist; |
3650 | llvm::append_range(C&: Worklist, R: L->getSubLoops()); |
3651 | while (!Worklist.empty()) { |
3652 | auto *CurLoop = Worklist.pop_back_val(); |
3653 | if (!PSI->isColdBlock(BB: CurLoop->getHeader(), BFI)) |
3654 | return false; |
3655 | llvm::append_range(C&: Worklist, R: CurLoop->getSubLoops()); |
3656 | } |
3657 | return true; |
3658 | }; |
3659 | |
3660 | // Skip cold loops in cold loop nests, as unswitching them brings little |
3661 | // benefit but increases the code size |
3662 | if (PSI && PSI->hasProfileSummary() && BFI && IsLoopNestCold(&L)) { |
3663 | LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n" ); |
3664 | return false; |
3665 | } |
3666 | |
3667 | // Perform legality checks. |
3668 | if (!isSafeForNoNTrivialUnswitching(L, LI)) |
3669 | return false; |
3670 | |
3671 | // For non-trivial unswitching, because it often creates new loops, we rely on |
3672 | // the pass manager to iterate on the loops rather than trying to immediately |
3673 | // reach a fixed point. There is no substantial advantage to iterating |
3674 | // internally, and if any of the new loops are simplified enough to contain |
3675 | // trivial unswitching we want to prefer those. |
3676 | |
3677 | // Try to unswitch the best invariant condition. We prefer this full unswitch to |
3678 | // a partial unswitch when possible below the threshold. |
3679 | if (unswitchBestCondition(L, DT, LI, AC, AA, TTI, SE, MSSAU, LoopUpdater)) |
3680 | return true; |
3681 | |
3682 | // No other opportunities to unswitch. |
3683 | return false; |
3684 | } |
3685 | |
3686 | PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, |
3687 | LoopStandardAnalysisResults &AR, |
3688 | LPMUpdater &U) { |
3689 | Function &F = *L.getHeader()->getParent(); |
3690 | (void)F; |
3691 | ProfileSummaryInfo *PSI = nullptr; |
3692 | if (auto OuterProxy = |
3693 | AM.getResult<FunctionAnalysisManagerLoopProxy>(IR&: L, ExtraArgs&: AR) |
3694 | .getCachedResult<ModuleAnalysisManagerFunctionProxy>(IR&: F)) |
3695 | PSI = OuterProxy->getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent()); |
3696 | LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L |
3697 | << "\n" ); |
3698 | |
3699 | std::optional<MemorySSAUpdater> MSSAU; |
3700 | if (AR.MSSA) { |
3701 | MSSAU = MemorySSAUpdater(AR.MSSA); |
3702 | if (VerifyMemorySSA) |
3703 | AR.MSSA->verifyMemorySSA(); |
3704 | } |
3705 | if (!unswitchLoop(L, DT&: AR.DT, LI&: AR.LI, AC&: AR.AC, AA&: AR.AA, TTI&: AR.TTI, Trivial, NonTrivial, |
3706 | SE: &AR.SE, MSSAU: MSSAU ? &*MSSAU : nullptr, PSI, BFI: AR.BFI, LoopUpdater&: U)) |
3707 | return PreservedAnalyses::all(); |
3708 | |
3709 | if (AR.MSSA && VerifyMemorySSA) |
3710 | AR.MSSA->verifyMemorySSA(); |
3711 | |
3712 | // Historically this pass has had issues with the dominator tree so verify it |
3713 | // in asserts builds. |
3714 | assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast)); |
3715 | |
3716 | auto PA = getLoopPassPreservedAnalyses(); |
3717 | if (AR.MSSA) |
3718 | PA.preserve<MemorySSAAnalysis>(); |
3719 | return PA; |
3720 | } |
3721 | |
3722 | void SimpleLoopUnswitchPass::printPipeline( |
3723 | raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { |
3724 | static_cast<PassInfoMixin<SimpleLoopUnswitchPass> *>(this)->printPipeline( |
3725 | OS, MapClassName2PassName); |
3726 | |
3727 | OS << '<'; |
3728 | OS << (NonTrivial ? "" : "no-" ) << "nontrivial;" ; |
3729 | OS << (Trivial ? "" : "no-" ) << "trivial" ; |
3730 | OS << '>'; |
3731 | } |
3732 | |