1 | //===- DFAJumpThreading.cpp - Threads a switch statement inside a loop ----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Transform each threading path to effectively jump thread the DFA. For |
10 | // example, the CFG below could be transformed as follows, where the cloned |
11 | // blocks unconditionally branch to the next correct case based on what is |
12 | // identified in the analysis. |
13 | // |
14 | // sw.bb sw.bb |
15 | // / | \ / | \ |
16 | // case1 case2 case3 case1 case2 case3 |
17 | // \ | / | | | |
18 | // determinator det.2 det.3 det.1 |
19 | // br sw.bb / | \ |
20 | // sw.bb.2 sw.bb.3 sw.bb.1 |
21 | // br case2 br case3 br case1ยง |
22 | // |
23 | // Definitions and Terminology: |
24 | // |
25 | // * Threading path: |
26 | // a list of basic blocks, the exit state, and the block that determines |
27 | // the next state, for which the following notation will be used: |
28 | // < path of BBs that form a cycle > [ state, determinator ] |
29 | // |
30 | // * Predictable switch: |
31 | // The switch variable is always a known constant so that all conditional |
32 | // jumps based on switch variable can be converted to unconditional jump. |
33 | // |
34 | // * Determinator: |
35 | // The basic block that determines the next state of the DFA. |
36 | // |
37 | // Representing the optimization in C-like pseudocode: the code pattern on the |
38 | // left could functionally be transformed to the right pattern if the switch |
39 | // condition is predictable. |
40 | // |
41 | // X = A goto A |
42 | // for (...) A: |
43 | // switch (X) ... |
44 | // case A goto B |
45 | // X = B B: |
46 | // case B ... |
47 | // X = C goto C |
48 | // |
49 | // The pass first checks that switch variable X is decided by the control flow |
50 | // path taken in the loop; for example, in case B, the next value of X is |
51 | // decided to be C. It then enumerates through all paths in the loop and labels |
52 | // the basic blocks where the next state is decided. |
53 | // |
54 | // Using this information it creates new paths that unconditionally branch to |
55 | // the next case. This involves cloning code, so it only gets triggered if the |
56 | // amount of code duplicated is below a threshold. |
57 | // |
58 | //===----------------------------------------------------------------------===// |
59 | |
60 | #include "llvm/Transforms/Scalar/DFAJumpThreading.h" |
61 | #include "llvm/ADT/APInt.h" |
62 | #include "llvm/ADT/DenseMap.h" |
63 | #include "llvm/ADT/SmallSet.h" |
64 | #include "llvm/ADT/Statistic.h" |
65 | #include "llvm/Analysis/AssumptionCache.h" |
66 | #include "llvm/Analysis/CodeMetrics.h" |
67 | #include "llvm/Analysis/DomTreeUpdater.h" |
68 | #include "llvm/Analysis/LoopInfo.h" |
69 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
70 | #include "llvm/Analysis/TargetTransformInfo.h" |
71 | #include "llvm/IR/CFG.h" |
72 | #include "llvm/IR/Constants.h" |
73 | #include "llvm/IR/IntrinsicInst.h" |
74 | #include "llvm/Support/CommandLine.h" |
75 | #include "llvm/Support/Debug.h" |
76 | #include "llvm/Transforms/Utils/Cloning.h" |
77 | #include "llvm/Transforms/Utils/SSAUpdaterBulk.h" |
78 | #include "llvm/Transforms/Utils/ValueMapper.h" |
79 | #include <algorithm> |
80 | #include <deque> |
81 | |
82 | #ifdef EXPENSIVE_CHECKS |
83 | #include "llvm/IR/Verifier.h" |
84 | #endif |
85 | |
86 | using namespace llvm; |
87 | |
88 | #define DEBUG_TYPE "dfa-jump-threading" |
89 | |
90 | STATISTIC(NumTransforms, "Number of transformations done" ); |
91 | STATISTIC(NumCloned, "Number of blocks cloned" ); |
92 | STATISTIC(NumPaths, "Number of individual paths threaded" ); |
93 | |
94 | static cl::opt<bool> |
95 | ClViewCfgBefore("dfa-jump-view-cfg-before" , |
96 | cl::desc("View the CFG before DFA Jump Threading" ), |
97 | cl::Hidden, cl::init(Val: false)); |
98 | |
99 | static cl::opt<bool> EarlyExitHeuristic( |
100 | "dfa-early-exit-heuristic" , |
101 | cl::desc("Exit early if an unpredictable value come from the same loop" ), |
102 | cl::Hidden, cl::init(Val: true)); |
103 | |
104 | static cl::opt<unsigned> MaxPathLength( |
105 | "dfa-max-path-length" , |
106 | cl::desc("Max number of blocks searched to find a threading path" ), |
107 | cl::Hidden, cl::init(Val: 20)); |
108 | |
109 | static cl::opt<unsigned> |
110 | MaxNumPaths("dfa-max-num-paths" , |
111 | cl::desc("Max number of paths enumerated around a switch" ), |
112 | cl::Hidden, cl::init(Val: 200)); |
113 | |
114 | static cl::opt<unsigned> |
115 | CostThreshold("dfa-cost-threshold" , |
116 | cl::desc("Maximum cost accepted for the transformation" ), |
117 | cl::Hidden, cl::init(Val: 50)); |
118 | |
119 | namespace { |
120 | |
121 | class SelectInstToUnfold { |
122 | SelectInst *SI; |
123 | PHINode *SIUse; |
124 | |
125 | public: |
126 | SelectInstToUnfold(SelectInst *SI, PHINode *SIUse) : SI(SI), SIUse(SIUse) {} |
127 | |
128 | SelectInst *getInst() { return SI; } |
129 | PHINode *getUse() { return SIUse; } |
130 | |
131 | explicit operator bool() const { return SI && SIUse; } |
132 | }; |
133 | |
134 | void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold, |
135 | std::vector<SelectInstToUnfold> *NewSIsToUnfold, |
136 | std::vector<BasicBlock *> *NewBBs); |
137 | |
138 | class DFAJumpThreading { |
139 | public: |
140 | (AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI, |
141 | TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) |
142 | : AC(AC), DT(DT), LI(LI), TTI(TTI), ORE(ORE) {} |
143 | |
144 | bool run(Function &F); |
145 | bool LoopInfoBroken; |
146 | |
147 | private: |
148 | void |
149 | unfoldSelectInstrs(DominatorTree *DT, |
150 | const SmallVector<SelectInstToUnfold, 4> &SelectInsts) { |
151 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); |
152 | SmallVector<SelectInstToUnfold, 4> Stack; |
153 | for (SelectInstToUnfold SIToUnfold : SelectInsts) |
154 | Stack.push_back(Elt: SIToUnfold); |
155 | |
156 | while (!Stack.empty()) { |
157 | SelectInstToUnfold SIToUnfold = Stack.pop_back_val(); |
158 | |
159 | std::vector<SelectInstToUnfold> NewSIsToUnfold; |
160 | std::vector<BasicBlock *> NewBBs; |
161 | unfold(DTU: &DTU, LI, SIToUnfold, NewSIsToUnfold: &NewSIsToUnfold, NewBBs: &NewBBs); |
162 | |
163 | // Put newly discovered select instructions into the work list. |
164 | for (const SelectInstToUnfold &NewSIToUnfold : NewSIsToUnfold) |
165 | Stack.push_back(Elt: NewSIToUnfold); |
166 | } |
167 | } |
168 | |
169 | AssumptionCache *AC; |
170 | DominatorTree *DT; |
171 | LoopInfo *LI; |
172 | TargetTransformInfo *TTI; |
173 | OptimizationRemarkEmitter *ORE; |
174 | }; |
175 | |
176 | } // end anonymous namespace |
177 | |
178 | namespace { |
179 | |
180 | /// Create a new basic block and sink \p SIToSink into it. |
181 | void createBasicBlockAndSinkSelectInst( |
182 | DomTreeUpdater *DTU, SelectInst *SI, PHINode *SIUse, SelectInst *SIToSink, |
183 | BasicBlock *EndBlock, StringRef NewBBName, BasicBlock **NewBlock, |
184 | BranchInst **NewBranch, std::vector<SelectInstToUnfold> *NewSIsToUnfold, |
185 | std::vector<BasicBlock *> *NewBBs) { |
186 | assert(SIToSink->hasOneUse()); |
187 | assert(NewBlock); |
188 | assert(NewBranch); |
189 | *NewBlock = BasicBlock::Create(Context&: SI->getContext(), Name: NewBBName, |
190 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
191 | NewBBs->push_back(x: *NewBlock); |
192 | *NewBranch = BranchInst::Create(IfTrue: EndBlock, InsertBefore: *NewBlock); |
193 | SIToSink->moveBefore(MovePos: *NewBranch); |
194 | NewSIsToUnfold->push_back(x: SelectInstToUnfold(SIToSink, SIUse)); |
195 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, *NewBlock, EndBlock}}); |
196 | } |
197 | |
198 | /// Unfold the select instruction held in \p SIToUnfold by replacing it with |
199 | /// control flow. |
200 | /// |
201 | /// Put newly discovered select instructions into \p NewSIsToUnfold. Put newly |
202 | /// created basic blocks into \p NewBBs. |
203 | /// |
204 | /// TODO: merge it with CodeGenPrepare::optimizeSelectInst() if possible. |
205 | void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold, |
206 | std::vector<SelectInstToUnfold> *NewSIsToUnfold, |
207 | std::vector<BasicBlock *> *NewBBs) { |
208 | SelectInst *SI = SIToUnfold.getInst(); |
209 | PHINode *SIUse = SIToUnfold.getUse(); |
210 | BasicBlock *StartBlock = SI->getParent(); |
211 | BasicBlock *EndBlock = SIUse->getParent(); |
212 | BranchInst *StartBlockTerm = |
213 | dyn_cast<BranchInst>(Val: StartBlock->getTerminator()); |
214 | |
215 | assert(StartBlockTerm && StartBlockTerm->isUnconditional()); |
216 | assert(SI->hasOneUse()); |
217 | |
218 | // These are the new basic blocks for the conditional branch. |
219 | // At least one will become an actual new basic block. |
220 | BasicBlock *TrueBlock = nullptr; |
221 | BasicBlock *FalseBlock = nullptr; |
222 | BranchInst *TrueBranch = nullptr; |
223 | BranchInst *FalseBranch = nullptr; |
224 | |
225 | // Sink select instructions to be able to unfold them later. |
226 | if (SelectInst *SIOp = dyn_cast<SelectInst>(Val: SI->getTrueValue())) { |
227 | createBasicBlockAndSinkSelectInst(DTU, SI, SIUse, SIToSink: SIOp, EndBlock, |
228 | NewBBName: "si.unfold.true" , NewBlock: &TrueBlock, NewBranch: &TrueBranch, |
229 | NewSIsToUnfold, NewBBs); |
230 | } |
231 | if (SelectInst *SIOp = dyn_cast<SelectInst>(Val: SI->getFalseValue())) { |
232 | createBasicBlockAndSinkSelectInst(DTU, SI, SIUse, SIToSink: SIOp, EndBlock, |
233 | NewBBName: "si.unfold.false" , NewBlock: &FalseBlock, |
234 | NewBranch: &FalseBranch, NewSIsToUnfold, NewBBs); |
235 | } |
236 | |
237 | // If there was nothing to sink, then arbitrarily choose the 'false' side |
238 | // for a new input value to the PHI. |
239 | if (!TrueBlock && !FalseBlock) { |
240 | FalseBlock = BasicBlock::Create(Context&: SI->getContext(), Name: "si.unfold.false" , |
241 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
242 | NewBBs->push_back(x: FalseBlock); |
243 | BranchInst::Create(IfTrue: EndBlock, InsertBefore: FalseBlock); |
244 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, FalseBlock, EndBlock}}); |
245 | } |
246 | |
247 | // Insert the real conditional branch based on the original condition. |
248 | // If we did not create a new block for one of the 'true' or 'false' paths |
249 | // of the condition, it means that side of the branch goes to the end block |
250 | // directly and the path originates from the start block from the point of |
251 | // view of the new PHI. |
252 | BasicBlock *TT = EndBlock; |
253 | BasicBlock *FT = EndBlock; |
254 | if (TrueBlock && FalseBlock) { |
255 | // A diamond. |
256 | TT = TrueBlock; |
257 | FT = FalseBlock; |
258 | |
259 | // Update the phi node of SI. |
260 | SIUse->addIncoming(V: SI->getTrueValue(), BB: TrueBlock); |
261 | SIUse->addIncoming(V: SI->getFalseValue(), BB: FalseBlock); |
262 | |
263 | // Update any other PHI nodes in EndBlock. |
264 | for (PHINode &Phi : EndBlock->phis()) { |
265 | if (&Phi != SIUse) { |
266 | Value *OrigValue = Phi.getIncomingValueForBlock(BB: StartBlock); |
267 | Phi.addIncoming(V: OrigValue, BB: TrueBlock); |
268 | Phi.addIncoming(V: OrigValue, BB: FalseBlock); |
269 | } |
270 | |
271 | // Remove incoming place of original StartBlock, which comes in a indirect |
272 | // way (through TrueBlock and FalseBlock) now. |
273 | Phi.removeIncomingValue(BB: StartBlock, /* DeletePHIIfEmpty = */ false); |
274 | } |
275 | } else { |
276 | BasicBlock *NewBlock = nullptr; |
277 | Value *SIOp1 = SI->getTrueValue(); |
278 | Value *SIOp2 = SI->getFalseValue(); |
279 | |
280 | // A triangle pointing right. |
281 | if (!TrueBlock) { |
282 | NewBlock = FalseBlock; |
283 | FT = FalseBlock; |
284 | } |
285 | // A triangle pointing left. |
286 | else { |
287 | NewBlock = TrueBlock; |
288 | TT = TrueBlock; |
289 | std::swap(a&: SIOp1, b&: SIOp2); |
290 | } |
291 | |
292 | // Update the phi node of SI. |
293 | for (unsigned Idx = 0; Idx < SIUse->getNumIncomingValues(); ++Idx) { |
294 | if (SIUse->getIncomingBlock(i: Idx) == StartBlock) |
295 | SIUse->setIncomingValue(i: Idx, V: SIOp1); |
296 | } |
297 | SIUse->addIncoming(V: SIOp2, BB: NewBlock); |
298 | |
299 | // Update any other PHI nodes in EndBlock. |
300 | for (auto II = EndBlock->begin(); PHINode *Phi = dyn_cast<PHINode>(Val&: II); |
301 | ++II) { |
302 | if (Phi != SIUse) |
303 | Phi->addIncoming(V: Phi->getIncomingValueForBlock(BB: StartBlock), BB: NewBlock); |
304 | } |
305 | } |
306 | StartBlockTerm->eraseFromParent(); |
307 | BranchInst::Create(IfTrue: TT, IfFalse: FT, Cond: SI->getCondition(), InsertBefore: StartBlock); |
308 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, StartBlock, TT}, |
309 | {DominatorTree::Insert, StartBlock, FT}}); |
310 | |
311 | // Preserve loop info |
312 | if (Loop *L = LI->getLoopFor(BB: SI->getParent())) { |
313 | for (BasicBlock *NewBB : *NewBBs) |
314 | L->addBasicBlockToLoop(NewBB, LI&: *LI); |
315 | } |
316 | |
317 | // The select is now dead. |
318 | assert(SI->use_empty() && "Select must be dead now" ); |
319 | SI->eraseFromParent(); |
320 | } |
321 | |
322 | struct ClonedBlock { |
323 | BasicBlock *BB; |
324 | APInt State; ///< \p State corresponds to the next value of a switch stmnt. |
325 | }; |
326 | |
327 | typedef std::deque<BasicBlock *> PathType; |
328 | typedef std::vector<PathType> PathsType; |
329 | typedef SmallPtrSet<const BasicBlock *, 8> VisitedBlocks; |
330 | typedef std::vector<ClonedBlock> CloneList; |
331 | |
332 | // This data structure keeps track of all blocks that have been cloned. If two |
333 | // different ThreadingPaths clone the same block for a certain state it should |
334 | // be reused, and it can be looked up in this map. |
335 | typedef DenseMap<BasicBlock *, CloneList> DuplicateBlockMap; |
336 | |
337 | // This map keeps track of all the new definitions for an instruction. This |
338 | // information is needed when restoring SSA form after cloning blocks. |
339 | typedef MapVector<Instruction *, std::vector<Instruction *>> DefMap; |
340 | |
341 | inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) { |
342 | OS << "< " ; |
343 | for (const BasicBlock *BB : Path) { |
344 | std::string BBName; |
345 | if (BB->hasName()) |
346 | raw_string_ostream(BBName) << BB->getName(); |
347 | else |
348 | raw_string_ostream(BBName) << BB; |
349 | OS << BBName << " " ; |
350 | } |
351 | OS << ">" ; |
352 | return OS; |
353 | } |
354 | |
355 | /// ThreadingPath is a path in the control flow of a loop that can be threaded |
356 | /// by cloning necessary basic blocks and replacing conditional branches with |
357 | /// unconditional ones. A threading path includes a list of basic blocks, the |
358 | /// exit state, and the block that determines the next state. |
359 | struct ThreadingPath { |
360 | /// Exit value is DFA's exit state for the given path. |
361 | APInt getExitValue() const { return ExitVal; } |
362 | void setExitValue(const ConstantInt *V) { |
363 | ExitVal = V->getValue(); |
364 | IsExitValSet = true; |
365 | } |
366 | bool isExitValueSet() const { return IsExitValSet; } |
367 | |
368 | /// Determinator is the basic block that determines the next state of the DFA. |
369 | const BasicBlock *getDeterminatorBB() const { return DBB; } |
370 | void setDeterminator(const BasicBlock *BB) { DBB = BB; } |
371 | |
372 | /// Path is a list of basic blocks. |
373 | const PathType &getPath() const { return Path; } |
374 | void setPath(const PathType &NewPath) { Path = NewPath; } |
375 | |
376 | void print(raw_ostream &OS) const { |
377 | OS << Path << " [ " << ExitVal << ", " << DBB->getName() << " ]" ; |
378 | } |
379 | |
380 | private: |
381 | PathType Path; |
382 | APInt ExitVal; |
383 | const BasicBlock *DBB = nullptr; |
384 | bool IsExitValSet = false; |
385 | }; |
386 | |
387 | #ifndef NDEBUG |
388 | inline raw_ostream &operator<<(raw_ostream &OS, const ThreadingPath &TPath) { |
389 | TPath.print(OS); |
390 | return OS; |
391 | } |
392 | #endif |
393 | |
394 | struct MainSwitch { |
395 | MainSwitch(SwitchInst *SI, LoopInfo *LI, OptimizationRemarkEmitter *ORE) |
396 | : LI(LI) { |
397 | if (isCandidate(SI)) { |
398 | Instr = SI; |
399 | } else { |
400 | ORE->emit(RemarkBuilder: [&]() { |
401 | return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable" , SI) |
402 | << "Switch instruction is not predictable." ; |
403 | }); |
404 | } |
405 | } |
406 | |
407 | virtual ~MainSwitch() = default; |
408 | |
409 | SwitchInst *getInstr() const { return Instr; } |
410 | const SmallVector<SelectInstToUnfold, 4> getSelectInsts() { |
411 | return SelectInsts; |
412 | } |
413 | |
414 | private: |
415 | /// Do a use-def chain traversal starting from the switch condition to see if |
416 | /// \p SI is a potential condidate. |
417 | /// |
418 | /// Also, collect select instructions to unfold. |
419 | bool isCandidate(const SwitchInst *SI) { |
420 | std::deque<std::pair<Value *, BasicBlock *>> Q; |
421 | SmallSet<Value *, 16> SeenValues; |
422 | SelectInsts.clear(); |
423 | |
424 | Value *SICond = SI->getCondition(); |
425 | LLVM_DEBUG(dbgs() << "\tSICond: " << *SICond << "\n" ); |
426 | if (!isa<PHINode>(Val: SICond)) |
427 | return false; |
428 | |
429 | // The switch must be in a loop. |
430 | const Loop *L = LI->getLoopFor(BB: SI->getParent()); |
431 | if (!L) |
432 | return false; |
433 | |
434 | addToQueue(Val: SICond, BB: nullptr, Q, SeenValues); |
435 | |
436 | while (!Q.empty()) { |
437 | Value *Current = Q.front().first; |
438 | BasicBlock *CurrentIncomingBB = Q.front().second; |
439 | Q.pop_front(); |
440 | |
441 | if (auto *Phi = dyn_cast<PHINode>(Val: Current)) { |
442 | for (BasicBlock *IncomingBB : Phi->blocks()) { |
443 | Value *Incoming = Phi->getIncomingValueForBlock(BB: IncomingBB); |
444 | addToQueue(Val: Incoming, BB: IncomingBB, Q, SeenValues); |
445 | } |
446 | LLVM_DEBUG(dbgs() << "\tphi: " << *Phi << "\n" ); |
447 | } else if (SelectInst *SelI = dyn_cast<SelectInst>(Val: Current)) { |
448 | if (!isValidSelectInst(SI: SelI)) |
449 | return false; |
450 | addToQueue(Val: SelI->getTrueValue(), BB: CurrentIncomingBB, Q, SeenValues); |
451 | addToQueue(Val: SelI->getFalseValue(), BB: CurrentIncomingBB, Q, SeenValues); |
452 | LLVM_DEBUG(dbgs() << "\tselect: " << *SelI << "\n" ); |
453 | if (auto *SelIUse = dyn_cast<PHINode>(Val: SelI->user_back())) |
454 | SelectInsts.push_back(Elt: SelectInstToUnfold(SelI, SelIUse)); |
455 | } else if (isa<Constant>(Val: Current)) { |
456 | LLVM_DEBUG(dbgs() << "\tconst: " << *Current << "\n" ); |
457 | continue; |
458 | } else { |
459 | LLVM_DEBUG(dbgs() << "\tother: " << *Current << "\n" ); |
460 | // Allow unpredictable values. The hope is that those will be the |
461 | // initial switch values that can be ignored (they will hit the |
462 | // unthreaded switch) but this assumption will get checked later after |
463 | // paths have been enumerated (in function getStateDefMap). |
464 | |
465 | // If the unpredictable value comes from the same inner loop it is |
466 | // likely that it will also be on the enumerated paths, causing us to |
467 | // exit after we have enumerated all the paths. This heuristic save |
468 | // compile time because a search for all the paths can become expensive. |
469 | if (EarlyExitHeuristic && |
470 | L->contains(L: LI->getLoopFor(BB: CurrentIncomingBB))) { |
471 | LLVM_DEBUG(dbgs() |
472 | << "\tExiting early due to unpredictability heuristic.\n" ); |
473 | return false; |
474 | } |
475 | |
476 | continue; |
477 | } |
478 | } |
479 | |
480 | return true; |
481 | } |
482 | |
483 | void addToQueue(Value *Val, BasicBlock *BB, |
484 | std::deque<std::pair<Value *, BasicBlock *>> &Q, |
485 | SmallSet<Value *, 16> &SeenValues) { |
486 | if (SeenValues.contains(Ptr: Val)) |
487 | return; |
488 | Q.push_back(x: {Val, BB}); |
489 | SeenValues.insert(Ptr: Val); |
490 | } |
491 | |
492 | bool isValidSelectInst(SelectInst *SI) { |
493 | if (!SI->hasOneUse()) |
494 | return false; |
495 | |
496 | Instruction *SIUse = dyn_cast<Instruction>(Val: SI->user_back()); |
497 | // The use of the select inst should be either a phi or another select. |
498 | if (!SIUse && !(isa<PHINode>(Val: SIUse) || isa<SelectInst>(Val: SIUse))) |
499 | return false; |
500 | |
501 | BasicBlock *SIBB = SI->getParent(); |
502 | |
503 | // Currently, we can only expand select instructions in basic blocks with |
504 | // one successor. |
505 | BranchInst *SITerm = dyn_cast<BranchInst>(Val: SIBB->getTerminator()); |
506 | if (!SITerm || !SITerm->isUnconditional()) |
507 | return false; |
508 | |
509 | // Only fold the select coming from directly where it is defined. |
510 | PHINode *PHIUser = dyn_cast<PHINode>(Val: SIUse); |
511 | if (PHIUser && PHIUser->getIncomingBlock(U: *SI->use_begin()) != SIBB) |
512 | return false; |
513 | |
514 | // If select will not be sunk during unfolding, and it is in the same basic |
515 | // block as another state defining select, then cannot unfold both. |
516 | for (SelectInstToUnfold SIToUnfold : SelectInsts) { |
517 | SelectInst *PrevSI = SIToUnfold.getInst(); |
518 | if (PrevSI->getTrueValue() != SI && PrevSI->getFalseValue() != SI && |
519 | PrevSI->getParent() == SI->getParent()) |
520 | return false; |
521 | } |
522 | |
523 | return true; |
524 | } |
525 | |
526 | LoopInfo *LI; |
527 | SwitchInst *Instr = nullptr; |
528 | SmallVector<SelectInstToUnfold, 4> SelectInsts; |
529 | }; |
530 | |
531 | struct AllSwitchPaths { |
532 | AllSwitchPaths(const MainSwitch *MSwitch, OptimizationRemarkEmitter *ORE, |
533 | LoopInfo *LI) |
534 | : Switch(MSwitch->getInstr()), SwitchBlock(Switch->getParent()), ORE(ORE), |
535 | LI(LI) {} |
536 | |
537 | std::vector<ThreadingPath> &getThreadingPaths() { return TPaths; } |
538 | unsigned getNumThreadingPaths() { return TPaths.size(); } |
539 | SwitchInst *getSwitchInst() { return Switch; } |
540 | BasicBlock *getSwitchBlock() { return SwitchBlock; } |
541 | |
542 | void run() { |
543 | VisitedBlocks Visited; |
544 | PathsType LoopPaths = paths(BB: SwitchBlock, Visited, /* PathDepth = */ 1); |
545 | StateDefMap StateDef = getStateDefMap(LoopPaths); |
546 | |
547 | if (StateDef.empty()) { |
548 | ORE->emit(RemarkBuilder: [&]() { |
549 | return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable" , |
550 | Switch) |
551 | << "Switch instruction is not predictable." ; |
552 | }); |
553 | return; |
554 | } |
555 | |
556 | for (const PathType &Path : LoopPaths) { |
557 | ThreadingPath TPath; |
558 | |
559 | const BasicBlock *PrevBB = Path.back(); |
560 | for (const BasicBlock *BB : Path) { |
561 | if (StateDef.contains(Val: BB)) { |
562 | const PHINode *Phi = dyn_cast<PHINode>(Val: StateDef[BB]); |
563 | assert(Phi && "Expected a state-defining instr to be a phi node." ); |
564 | |
565 | const Value *V = Phi->getIncomingValueForBlock(BB: PrevBB); |
566 | if (const ConstantInt *C = dyn_cast<const ConstantInt>(Val: V)) { |
567 | TPath.setExitValue(C); |
568 | TPath.setDeterminator(BB); |
569 | TPath.setPath(Path); |
570 | } |
571 | } |
572 | |
573 | // Switch block is the determinator, this is the final exit value. |
574 | if (TPath.isExitValueSet() && BB == Path.front()) |
575 | break; |
576 | |
577 | PrevBB = BB; |
578 | } |
579 | |
580 | if (TPath.isExitValueSet() && isSupported(TPath)) |
581 | TPaths.push_back(x: TPath); |
582 | } |
583 | } |
584 | |
585 | private: |
586 | // Value: an instruction that defines a switch state; |
587 | // Key: the parent basic block of that instruction. |
588 | typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap; |
589 | |
590 | PathsType paths(BasicBlock *BB, VisitedBlocks &Visited, |
591 | unsigned PathDepth) const { |
592 | PathsType Res; |
593 | |
594 | // Stop exploring paths after visiting MaxPathLength blocks |
595 | if (PathDepth > MaxPathLength) { |
596 | ORE->emit(RemarkBuilder: [&]() { |
597 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "MaxPathLengthReached" , |
598 | Switch) |
599 | << "Exploration stopped after visiting MaxPathLength=" |
600 | << ore::NV("MaxPathLength" , MaxPathLength) << " blocks." ; |
601 | }); |
602 | return Res; |
603 | } |
604 | |
605 | Visited.insert(Ptr: BB); |
606 | |
607 | // Stop if we have reached the BB out of loop, since its successors have no |
608 | // impact on the DFA. |
609 | // TODO: Do we need to stop exploring if BB is the outer loop of the switch? |
610 | if (!LI->getLoopFor(BB)) |
611 | return Res; |
612 | |
613 | // Some blocks have multiple edges to the same successor, and this set |
614 | // is used to prevent a duplicate path from being generated |
615 | SmallSet<BasicBlock *, 4> Successors; |
616 | for (BasicBlock *Succ : successors(BB)) { |
617 | if (!Successors.insert(Ptr: Succ).second) |
618 | continue; |
619 | |
620 | // Found a cycle through the SwitchBlock |
621 | if (Succ == SwitchBlock) { |
622 | Res.push_back(x: {BB}); |
623 | continue; |
624 | } |
625 | |
626 | // We have encountered a cycle, do not get caught in it |
627 | if (Visited.contains(Ptr: Succ)) |
628 | continue; |
629 | |
630 | PathsType SuccPaths = paths(BB: Succ, Visited, PathDepth: PathDepth + 1); |
631 | for (const PathType &Path : SuccPaths) { |
632 | PathType NewPath(Path); |
633 | NewPath.push_front(x: BB); |
634 | Res.push_back(x: NewPath); |
635 | if (Res.size() >= MaxNumPaths) { |
636 | return Res; |
637 | } |
638 | } |
639 | } |
640 | // This block could now be visited again from a different predecessor. Note |
641 | // that this will result in exponential runtime. Subpaths could possibly be |
642 | // cached but it takes a lot of memory to store them. |
643 | Visited.erase(Ptr: BB); |
644 | return Res; |
645 | } |
646 | |
647 | /// Walk the use-def chain and collect all the state-defining instructions. |
648 | /// |
649 | /// Return an empty map if unpredictable values encountered inside the basic |
650 | /// blocks of \p LoopPaths. |
651 | StateDefMap getStateDefMap(const PathsType &LoopPaths) const { |
652 | StateDefMap Res; |
653 | |
654 | // Basic blocks belonging to any of the loops around the switch statement. |
655 | SmallPtrSet<BasicBlock *, 16> LoopBBs; |
656 | for (const PathType &Path : LoopPaths) { |
657 | for (BasicBlock *BB : Path) |
658 | LoopBBs.insert(Ptr: BB); |
659 | } |
660 | |
661 | Value *FirstDef = Switch->getOperand(i_nocapture: 0); |
662 | |
663 | assert(isa<PHINode>(FirstDef) && "The first definition must be a phi." ); |
664 | |
665 | SmallVector<PHINode *, 8> Stack; |
666 | Stack.push_back(Elt: dyn_cast<PHINode>(Val: FirstDef)); |
667 | SmallSet<Value *, 16> SeenValues; |
668 | |
669 | while (!Stack.empty()) { |
670 | PHINode *CurPhi = Stack.pop_back_val(); |
671 | |
672 | Res[CurPhi->getParent()] = CurPhi; |
673 | SeenValues.insert(Ptr: CurPhi); |
674 | |
675 | for (BasicBlock *IncomingBB : CurPhi->blocks()) { |
676 | Value *Incoming = CurPhi->getIncomingValueForBlock(BB: IncomingBB); |
677 | bool IsOutsideLoops = LoopBBs.count(Ptr: IncomingBB) == 0; |
678 | if (Incoming == FirstDef || isa<ConstantInt>(Val: Incoming) || |
679 | SeenValues.contains(Ptr: Incoming) || IsOutsideLoops) { |
680 | continue; |
681 | } |
682 | |
683 | // Any unpredictable value inside the loops means we must bail out. |
684 | if (!isa<PHINode>(Val: Incoming)) |
685 | return StateDefMap(); |
686 | |
687 | Stack.push_back(Elt: cast<PHINode>(Val: Incoming)); |
688 | } |
689 | } |
690 | |
691 | return Res; |
692 | } |
693 | |
694 | /// The determinator BB should precede the switch-defining BB. |
695 | /// |
696 | /// Otherwise, it is possible that the state defined in the determinator block |
697 | /// defines the state for the next iteration of the loop, rather than for the |
698 | /// current one. |
699 | /// |
700 | /// Currently supported paths: |
701 | /// \code |
702 | /// < switch bb1 determ def > [ 42, determ ] |
703 | /// < switch_and_def bb1 determ > [ 42, determ ] |
704 | /// < switch_and_def_and_determ bb1 > [ 42, switch_and_def_and_determ ] |
705 | /// \endcode |
706 | /// |
707 | /// Unsupported paths: |
708 | /// \code |
709 | /// < switch bb1 def determ > [ 43, determ ] |
710 | /// < switch_and_determ bb1 def > [ 43, switch_and_determ ] |
711 | /// \endcode |
712 | bool isSupported(const ThreadingPath &TPath) { |
713 | Instruction *SwitchCondI = dyn_cast<Instruction>(Val: Switch->getCondition()); |
714 | assert(SwitchCondI); |
715 | if (!SwitchCondI) |
716 | return false; |
717 | |
718 | const BasicBlock *SwitchCondDefBB = SwitchCondI->getParent(); |
719 | const BasicBlock *SwitchCondUseBB = Switch->getParent(); |
720 | const BasicBlock *DeterminatorBB = TPath.getDeterminatorBB(); |
721 | |
722 | assert( |
723 | SwitchCondUseBB == TPath.getPath().front() && |
724 | "The first BB in a threading path should have the switch instruction" ); |
725 | if (SwitchCondUseBB != TPath.getPath().front()) |
726 | return false; |
727 | |
728 | // Make DeterminatorBB the first element in Path. |
729 | PathType Path = TPath.getPath(); |
730 | auto ItDet = llvm::find(Range&: Path, Val: DeterminatorBB); |
731 | std::rotate(first: Path.begin(), middle: ItDet, last: Path.end()); |
732 | |
733 | bool IsDetBBSeen = false; |
734 | bool IsDefBBSeen = false; |
735 | bool IsUseBBSeen = false; |
736 | for (BasicBlock *BB : Path) { |
737 | if (BB == DeterminatorBB) |
738 | IsDetBBSeen = true; |
739 | if (BB == SwitchCondDefBB) |
740 | IsDefBBSeen = true; |
741 | if (BB == SwitchCondUseBB) |
742 | IsUseBBSeen = true; |
743 | if (IsDetBBSeen && IsUseBBSeen && !IsDefBBSeen) |
744 | return false; |
745 | } |
746 | |
747 | return true; |
748 | } |
749 | |
750 | SwitchInst *Switch; |
751 | BasicBlock *SwitchBlock; |
752 | OptimizationRemarkEmitter *ORE; |
753 | std::vector<ThreadingPath> TPaths; |
754 | LoopInfo *LI; |
755 | }; |
756 | |
757 | struct TransformDFA { |
758 | TransformDFA(AllSwitchPaths *SwitchPaths, DominatorTree *DT, |
759 | AssumptionCache *AC, TargetTransformInfo *TTI, |
760 | OptimizationRemarkEmitter *ORE, |
761 | SmallPtrSet<const Value *, 32> EphValues) |
762 | : SwitchPaths(SwitchPaths), DT(DT), AC(AC), TTI(TTI), ORE(ORE), |
763 | EphValues(EphValues) {} |
764 | |
765 | void run() { |
766 | if (isLegalAndProfitableToTransform()) { |
767 | createAllExitPaths(); |
768 | NumTransforms++; |
769 | } |
770 | } |
771 | |
772 | private: |
773 | /// This function performs both a legality check and profitability check at |
774 | /// the same time since it is convenient to do so. It iterates through all |
775 | /// blocks that will be cloned, and keeps track of the duplication cost. It |
776 | /// also returns false if it is illegal to clone some required block. |
777 | bool isLegalAndProfitableToTransform() { |
778 | CodeMetrics Metrics; |
779 | SwitchInst *Switch = SwitchPaths->getSwitchInst(); |
780 | |
781 | // Don't thread switch without multiple successors. |
782 | if (Switch->getNumSuccessors() <= 1) |
783 | return false; |
784 | |
785 | // Note that DuplicateBlockMap is not being used as intended here. It is |
786 | // just being used to ensure (BB, State) pairs are only counted once. |
787 | DuplicateBlockMap DuplicateMap; |
788 | |
789 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { |
790 | PathType PathBBs = TPath.getPath(); |
791 | APInt NextState = TPath.getExitValue(); |
792 | const BasicBlock *Determinator = TPath.getDeterminatorBB(); |
793 | |
794 | // Update Metrics for the Switch block, this is always cloned |
795 | BasicBlock *BB = SwitchPaths->getSwitchBlock(); |
796 | BasicBlock *VisitedBB = getClonedBB(BB, NextState, DuplicateMap); |
797 | if (!VisitedBB) { |
798 | Metrics.analyzeBasicBlock(BB, TTI: *TTI, EphValues); |
799 | DuplicateMap[BB].push_back(x: {.BB: BB, .State: NextState}); |
800 | } |
801 | |
802 | // If the Switch block is the Determinator, then we can continue since |
803 | // this is the only block that is cloned and we already counted for it. |
804 | if (PathBBs.front() == Determinator) |
805 | continue; |
806 | |
807 | // Otherwise update Metrics for all blocks that will be cloned. If any |
808 | // block is already cloned and would be reused, don't double count it. |
809 | auto DetIt = llvm::find(Range&: PathBBs, Val: Determinator); |
810 | for (auto BBIt = DetIt; BBIt != PathBBs.end(); BBIt++) { |
811 | BB = *BBIt; |
812 | VisitedBB = getClonedBB(BB, NextState, DuplicateMap); |
813 | if (VisitedBB) |
814 | continue; |
815 | Metrics.analyzeBasicBlock(BB, TTI: *TTI, EphValues); |
816 | DuplicateMap[BB].push_back(x: {.BB: BB, .State: NextState}); |
817 | } |
818 | |
819 | if (Metrics.notDuplicatable) { |
820 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " |
821 | << "non-duplicatable instructions.\n" ); |
822 | ORE->emit(RemarkBuilder: [&]() { |
823 | return OptimizationRemarkMissed(DEBUG_TYPE, "NonDuplicatableInst" , |
824 | Switch) |
825 | << "Contains non-duplicatable instructions." ; |
826 | }); |
827 | return false; |
828 | } |
829 | |
830 | // FIXME: Allow jump threading with controlled convergence. |
831 | if (Metrics.Convergence != ConvergenceKind::None) { |
832 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " |
833 | << "convergent instructions.\n" ); |
834 | ORE->emit(RemarkBuilder: [&]() { |
835 | return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst" , Switch) |
836 | << "Contains convergent instructions." ; |
837 | }); |
838 | return false; |
839 | } |
840 | |
841 | if (!Metrics.NumInsts.isValid()) { |
842 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " |
843 | << "instructions with invalid cost.\n" ); |
844 | ORE->emit(RemarkBuilder: [&]() { |
845 | return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst" , Switch) |
846 | << "Contains instructions with invalid cost." ; |
847 | }); |
848 | return false; |
849 | } |
850 | } |
851 | |
852 | InstructionCost DuplicationCost = 0; |
853 | |
854 | unsigned JumpTableSize = 0; |
855 | TTI->getEstimatedNumberOfCaseClusters(SI: *Switch, JTSize&: JumpTableSize, PSI: nullptr, |
856 | BFI: nullptr); |
857 | if (JumpTableSize == 0) { |
858 | // Factor in the number of conditional branches reduced from jump |
859 | // threading. Assume that lowering the switch block is implemented by |
860 | // using binary search, hence the LogBase2(). |
861 | unsigned CondBranches = |
862 | APInt(32, Switch->getNumSuccessors()).ceilLogBase2(); |
863 | assert(CondBranches > 0 && |
864 | "The threaded switch must have multiple branches" ); |
865 | DuplicationCost = Metrics.NumInsts / CondBranches; |
866 | } else { |
867 | // Compared with jump tables, the DFA optimizer removes an indirect branch |
868 | // on each loop iteration, thus making branch prediction more precise. The |
869 | // more branch targets there are, the more likely it is for the branch |
870 | // predictor to make a mistake, and the more benefit there is in the DFA |
871 | // optimizer. Thus, the more branch targets there are, the lower is the |
872 | // cost of the DFA opt. |
873 | DuplicationCost = Metrics.NumInsts / JumpTableSize; |
874 | } |
875 | |
876 | LLVM_DEBUG(dbgs() << "\nDFA Jump Threading: Cost to jump thread block " |
877 | << SwitchPaths->getSwitchBlock()->getName() |
878 | << " is: " << DuplicationCost << "\n\n" ); |
879 | |
880 | if (DuplicationCost > CostThreshold) { |
881 | LLVM_DEBUG(dbgs() << "Not jump threading, duplication cost exceeds the " |
882 | << "cost threshold.\n" ); |
883 | ORE->emit(RemarkBuilder: [&]() { |
884 | return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable" , Switch) |
885 | << "Duplication cost exceeds the cost threshold (cost=" |
886 | << ore::NV("Cost" , DuplicationCost) |
887 | << ", threshold=" << ore::NV("Threshold" , CostThreshold) << ")." ; |
888 | }); |
889 | return false; |
890 | } |
891 | |
892 | ORE->emit(RemarkBuilder: [&]() { |
893 | return OptimizationRemark(DEBUG_TYPE, "JumpThreaded" , Switch) |
894 | << "Switch statement jump-threaded." ; |
895 | }); |
896 | |
897 | return true; |
898 | } |
899 | |
900 | /// Transform each threading path to effectively jump thread the DFA. |
901 | void createAllExitPaths() { |
902 | DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Eager); |
903 | |
904 | // Move the switch block to the end of the path, since it will be duplicated |
905 | BasicBlock *SwitchBlock = SwitchPaths->getSwitchBlock(); |
906 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { |
907 | LLVM_DEBUG(dbgs() << TPath << "\n" ); |
908 | PathType NewPath(TPath.getPath()); |
909 | NewPath.push_back(x: SwitchBlock); |
910 | TPath.setPath(NewPath); |
911 | } |
912 | |
913 | // Transform the ThreadingPaths and keep track of the cloned values |
914 | DuplicateBlockMap DuplicateMap; |
915 | DefMap NewDefs; |
916 | |
917 | SmallSet<BasicBlock *, 16> BlocksToClean; |
918 | for (BasicBlock *BB : successors(BB: SwitchBlock)) |
919 | BlocksToClean.insert(Ptr: BB); |
920 | |
921 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { |
922 | createExitPath(NewDefs, Path&: TPath, DuplicateMap, BlocksToClean, DTU: &DTU); |
923 | NumPaths++; |
924 | } |
925 | |
926 | // After all paths are cloned, now update the last successor of the cloned |
927 | // path so it skips over the switch statement |
928 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) |
929 | updateLastSuccessor(TPath, DuplicateMap, DTU: &DTU); |
930 | |
931 | // For each instruction that was cloned and used outside, update its uses |
932 | updateSSA(NewDefs); |
933 | |
934 | // Clean PHI Nodes for the newly created blocks |
935 | for (BasicBlock *BB : BlocksToClean) |
936 | cleanPhiNodes(BB); |
937 | } |
938 | |
939 | /// For a specific ThreadingPath \p Path, create an exit path starting from |
940 | /// the determinator block. |
941 | /// |
942 | /// To remember the correct destination, we have to duplicate blocks |
943 | /// corresponding to each state. Also update the terminating instruction of |
944 | /// the predecessors, and phis in the successor blocks. |
945 | void createExitPath(DefMap &NewDefs, ThreadingPath &Path, |
946 | DuplicateBlockMap &DuplicateMap, |
947 | SmallSet<BasicBlock *, 16> &BlocksToClean, |
948 | DomTreeUpdater *DTU) { |
949 | APInt NextState = Path.getExitValue(); |
950 | const BasicBlock *Determinator = Path.getDeterminatorBB(); |
951 | PathType PathBBs = Path.getPath(); |
952 | |
953 | // Don't select the placeholder block in front |
954 | if (PathBBs.front() == Determinator) |
955 | PathBBs.pop_front(); |
956 | |
957 | auto DetIt = llvm::find(Range&: PathBBs, Val: Determinator); |
958 | // When there is only one BB in PathBBs, the determinator takes itself as a |
959 | // direct predecessor. |
960 | BasicBlock *PrevBB = PathBBs.size() == 1 ? *DetIt : *std::prev(x: DetIt); |
961 | for (auto BBIt = DetIt; BBIt != PathBBs.end(); BBIt++) { |
962 | BasicBlock *BB = *BBIt; |
963 | BlocksToClean.insert(Ptr: BB); |
964 | |
965 | // We already cloned BB for this NextState, now just update the branch |
966 | // and continue. |
967 | BasicBlock *NextBB = getClonedBB(BB, NextState, DuplicateMap); |
968 | if (NextBB) { |
969 | updatePredecessor(PrevBB, OldBB: BB, NewBB: NextBB, DTU); |
970 | PrevBB = NextBB; |
971 | continue; |
972 | } |
973 | |
974 | // Clone the BB and update the successor of Prev to jump to the new block |
975 | BasicBlock *NewBB = cloneBlockAndUpdatePredecessor( |
976 | BB, PrevBB, NextState, DuplicateMap, NewDefs, DTU); |
977 | DuplicateMap[BB].push_back(x: {.BB: NewBB, .State: NextState}); |
978 | BlocksToClean.insert(Ptr: NewBB); |
979 | PrevBB = NewBB; |
980 | } |
981 | } |
982 | |
983 | /// Restore SSA form after cloning blocks. |
984 | /// |
985 | /// Each cloned block creates new defs for a variable, and the uses need to be |
986 | /// updated to reflect this. The uses may be replaced with a cloned value, or |
987 | /// some derived phi instruction. Note that all uses of a value defined in the |
988 | /// same block were already remapped when cloning the block. |
989 | void updateSSA(DefMap &NewDefs) { |
990 | SSAUpdaterBulk SSAUpdate; |
991 | SmallVector<Use *, 16> UsesToRename; |
992 | |
993 | for (const auto &KV : NewDefs) { |
994 | Instruction *I = KV.first; |
995 | BasicBlock *BB = I->getParent(); |
996 | std::vector<Instruction *> Cloned = KV.second; |
997 | |
998 | // Scan all uses of this instruction to see if it is used outside of its |
999 | // block, and if so, record them in UsesToRename. |
1000 | for (Use &U : I->uses()) { |
1001 | Instruction *User = cast<Instruction>(Val: U.getUser()); |
1002 | if (PHINode *UserPN = dyn_cast<PHINode>(Val: User)) { |
1003 | if (UserPN->getIncomingBlock(U) == BB) |
1004 | continue; |
1005 | } else if (User->getParent() == BB) { |
1006 | continue; |
1007 | } |
1008 | |
1009 | UsesToRename.push_back(Elt: &U); |
1010 | } |
1011 | |
1012 | // If there are no uses outside the block, we're done with this |
1013 | // instruction. |
1014 | if (UsesToRename.empty()) |
1015 | continue; |
1016 | LLVM_DEBUG(dbgs() << "DFA-JT: Renaming non-local uses of: " << *I |
1017 | << "\n" ); |
1018 | |
1019 | // We found a use of I outside of BB. Rename all uses of I that are |
1020 | // outside its block to be uses of the appropriate PHI node etc. See |
1021 | // ValuesInBlocks with the values we know. |
1022 | unsigned VarNum = SSAUpdate.AddVariable(Name: I->getName(), Ty: I->getType()); |
1023 | SSAUpdate.AddAvailableValue(Var: VarNum, BB, V: I); |
1024 | for (Instruction *New : Cloned) |
1025 | SSAUpdate.AddAvailableValue(Var: VarNum, BB: New->getParent(), V: New); |
1026 | |
1027 | while (!UsesToRename.empty()) |
1028 | SSAUpdate.AddUse(Var: VarNum, U: UsesToRename.pop_back_val()); |
1029 | |
1030 | LLVM_DEBUG(dbgs() << "\n" ); |
1031 | } |
1032 | // SSAUpdater handles phi placement and renaming uses with the appropriate |
1033 | // value. |
1034 | SSAUpdate.RewriteAllUses(DT); |
1035 | } |
1036 | |
1037 | /// Clones a basic block, and adds it to the CFG. |
1038 | /// |
1039 | /// This function also includes updating phi nodes in the successors of the |
1040 | /// BB, and remapping uses that were defined locally in the cloned BB. |
1041 | BasicBlock *cloneBlockAndUpdatePredecessor(BasicBlock *BB, BasicBlock *PrevBB, |
1042 | const APInt &NextState, |
1043 | DuplicateBlockMap &DuplicateMap, |
1044 | DefMap &NewDefs, |
1045 | DomTreeUpdater *DTU) { |
1046 | ValueToValueMapTy VMap; |
1047 | BasicBlock *NewBB = CloneBasicBlock( |
1048 | BB, VMap, NameSuffix: ".jt" + std::to_string(val: NextState.getLimitedValue()), |
1049 | F: BB->getParent()); |
1050 | NewBB->moveAfter(MovePos: BB); |
1051 | NumCloned++; |
1052 | |
1053 | for (Instruction &I : *NewBB) { |
1054 | // Do not remap operands of PHINode in case a definition in BB is an |
1055 | // incoming value to a phi in the same block. This incoming value will |
1056 | // be renamed later while restoring SSA. |
1057 | if (isa<PHINode>(Val: &I)) |
1058 | continue; |
1059 | RemapInstruction(I: &I, VM&: VMap, |
1060 | Flags: RF_IgnoreMissingLocals | RF_NoModuleLevelChanges); |
1061 | if (AssumeInst *II = dyn_cast<AssumeInst>(Val: &I)) |
1062 | AC->registerAssumption(CI: II); |
1063 | } |
1064 | |
1065 | updateSuccessorPhis(BB, ClonedBB: NewBB, NextState, VMap, DuplicateMap); |
1066 | updatePredecessor(PrevBB, OldBB: BB, NewBB, DTU); |
1067 | updateDefMap(NewDefs, VMap); |
1068 | |
1069 | // Add all successors to the DominatorTree |
1070 | SmallPtrSet<BasicBlock *, 4> SuccSet; |
1071 | for (auto *SuccBB : successors(BB: NewBB)) { |
1072 | if (SuccSet.insert(Ptr: SuccBB).second) |
1073 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, NewBB, SuccBB}}); |
1074 | } |
1075 | SuccSet.clear(); |
1076 | return NewBB; |
1077 | } |
1078 | |
1079 | /// Update the phi nodes in BB's successors. |
1080 | /// |
1081 | /// This means creating a new incoming value from NewBB with the new |
1082 | /// instruction wherever there is an incoming value from BB. |
1083 | void updateSuccessorPhis(BasicBlock *BB, BasicBlock *ClonedBB, |
1084 | const APInt &NextState, ValueToValueMapTy &VMap, |
1085 | DuplicateBlockMap &DuplicateMap) { |
1086 | std::vector<BasicBlock *> BlocksToUpdate; |
1087 | |
1088 | // If BB is the last block in the path, we can simply update the one case |
1089 | // successor that will be reached. |
1090 | if (BB == SwitchPaths->getSwitchBlock()) { |
1091 | SwitchInst *Switch = SwitchPaths->getSwitchInst(); |
1092 | BasicBlock *NextCase = getNextCaseSuccessor(Switch, NextState); |
1093 | BlocksToUpdate.push_back(x: NextCase); |
1094 | BasicBlock *ClonedSucc = getClonedBB(BB: NextCase, NextState, DuplicateMap); |
1095 | if (ClonedSucc) |
1096 | BlocksToUpdate.push_back(x: ClonedSucc); |
1097 | } |
1098 | // Otherwise update phis in all successors. |
1099 | else { |
1100 | for (BasicBlock *Succ : successors(BB)) { |
1101 | BlocksToUpdate.push_back(x: Succ); |
1102 | |
1103 | // Check if a successor has already been cloned for the particular exit |
1104 | // value. In this case if a successor was already cloned, the phi nodes |
1105 | // in the cloned block should be updated directly. |
1106 | BasicBlock *ClonedSucc = getClonedBB(BB: Succ, NextState, DuplicateMap); |
1107 | if (ClonedSucc) |
1108 | BlocksToUpdate.push_back(x: ClonedSucc); |
1109 | } |
1110 | } |
1111 | |
1112 | // If there is a phi with an incoming value from BB, create a new incoming |
1113 | // value for the new predecessor ClonedBB. The value will either be the same |
1114 | // value from BB or a cloned value. |
1115 | for (BasicBlock *Succ : BlocksToUpdate) { |
1116 | for (auto II = Succ->begin(); PHINode *Phi = dyn_cast<PHINode>(Val&: II); |
1117 | ++II) { |
1118 | Value *Incoming = Phi->getIncomingValueForBlock(BB); |
1119 | if (Incoming) { |
1120 | if (isa<Constant>(Val: Incoming)) { |
1121 | Phi->addIncoming(V: Incoming, BB: ClonedBB); |
1122 | continue; |
1123 | } |
1124 | Value *ClonedVal = VMap[Incoming]; |
1125 | if (ClonedVal) |
1126 | Phi->addIncoming(V: ClonedVal, BB: ClonedBB); |
1127 | else |
1128 | Phi->addIncoming(V: Incoming, BB: ClonedBB); |
1129 | } |
1130 | } |
1131 | } |
1132 | } |
1133 | |
1134 | /// Sets the successor of PrevBB to be NewBB instead of OldBB. Note that all |
1135 | /// other successors are kept as well. |
1136 | void updatePredecessor(BasicBlock *PrevBB, BasicBlock *OldBB, |
1137 | BasicBlock *NewBB, DomTreeUpdater *DTU) { |
1138 | // When a path is reused, there is a chance that predecessors were already |
1139 | // updated before. Check if the predecessor needs to be updated first. |
1140 | if (!isPredecessor(BB: OldBB, IncomingBB: PrevBB)) |
1141 | return; |
1142 | |
1143 | Instruction *PrevTerm = PrevBB->getTerminator(); |
1144 | for (unsigned Idx = 0; Idx < PrevTerm->getNumSuccessors(); Idx++) { |
1145 | if (PrevTerm->getSuccessor(Idx) == OldBB) { |
1146 | OldBB->removePredecessor(Pred: PrevBB, /* KeepOneInputPHIs = */ true); |
1147 | PrevTerm->setSuccessor(Idx, BB: NewBB); |
1148 | } |
1149 | } |
1150 | DTU->applyUpdates(Updates: {{DominatorTree::Delete, PrevBB, OldBB}, |
1151 | {DominatorTree::Insert, PrevBB, NewBB}}); |
1152 | } |
1153 | |
1154 | /// Add new value mappings to the DefMap to keep track of all new definitions |
1155 | /// for a particular instruction. These will be used while updating SSA form. |
1156 | void updateDefMap(DefMap &NewDefs, ValueToValueMapTy &VMap) { |
1157 | SmallVector<std::pair<Instruction *, Instruction *>> NewDefsVector; |
1158 | NewDefsVector.reserve(N: VMap.size()); |
1159 | |
1160 | for (auto Entry : VMap) { |
1161 | Instruction *Inst = |
1162 | dyn_cast<Instruction>(Val: const_cast<Value *>(Entry.first)); |
1163 | if (!Inst || !Entry.second || isa<BranchInst>(Val: Inst) || |
1164 | isa<SwitchInst>(Val: Inst)) { |
1165 | continue; |
1166 | } |
1167 | |
1168 | Instruction *Cloned = dyn_cast<Instruction>(Val&: Entry.second); |
1169 | if (!Cloned) |
1170 | continue; |
1171 | |
1172 | NewDefsVector.push_back(Elt: {Inst, Cloned}); |
1173 | } |
1174 | |
1175 | // Sort the defs to get deterministic insertion order into NewDefs. |
1176 | sort(C&: NewDefsVector, Comp: [](const auto &LHS, const auto &RHS) { |
1177 | if (LHS.first == RHS.first) |
1178 | return LHS.second->comesBefore(RHS.second); |
1179 | return LHS.first->comesBefore(RHS.first); |
1180 | }); |
1181 | |
1182 | for (const auto &KV : NewDefsVector) |
1183 | NewDefs[KV.first].push_back(x: KV.second); |
1184 | } |
1185 | |
1186 | /// Update the last branch of a particular cloned path to point to the correct |
1187 | /// case successor. |
1188 | /// |
1189 | /// Note that this is an optional step and would have been done in later |
1190 | /// optimizations, but it makes the CFG significantly easier to work with. |
1191 | void updateLastSuccessor(ThreadingPath &TPath, |
1192 | DuplicateBlockMap &DuplicateMap, |
1193 | DomTreeUpdater *DTU) { |
1194 | APInt NextState = TPath.getExitValue(); |
1195 | BasicBlock *BB = TPath.getPath().back(); |
1196 | BasicBlock *LastBlock = getClonedBB(BB, NextState, DuplicateMap); |
1197 | |
1198 | // Note multiple paths can end at the same block so check that it is not |
1199 | // updated yet |
1200 | if (!isa<SwitchInst>(Val: LastBlock->getTerminator())) |
1201 | return; |
1202 | SwitchInst *Switch = cast<SwitchInst>(Val: LastBlock->getTerminator()); |
1203 | BasicBlock *NextCase = getNextCaseSuccessor(Switch, NextState); |
1204 | |
1205 | std::vector<DominatorTree::UpdateType> DTUpdates; |
1206 | SmallPtrSet<BasicBlock *, 4> SuccSet; |
1207 | for (BasicBlock *Succ : successors(BB: LastBlock)) { |
1208 | if (Succ != NextCase && SuccSet.insert(Ptr: Succ).second) |
1209 | DTUpdates.push_back(x: {DominatorTree::Delete, LastBlock, Succ}); |
1210 | } |
1211 | |
1212 | Switch->eraseFromParent(); |
1213 | BranchInst::Create(IfTrue: NextCase, InsertBefore: LastBlock); |
1214 | |
1215 | DTU->applyUpdates(Updates: DTUpdates); |
1216 | } |
1217 | |
1218 | /// After cloning blocks, some of the phi nodes have extra incoming values |
1219 | /// that are no longer used. This function removes them. |
1220 | void cleanPhiNodes(BasicBlock *BB) { |
1221 | // If BB is no longer reachable, remove any remaining phi nodes |
1222 | if (pred_empty(BB)) { |
1223 | std::vector<PHINode *> PhiToRemove; |
1224 | for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(Val&: II); ++II) { |
1225 | PhiToRemove.push_back(x: Phi); |
1226 | } |
1227 | for (PHINode *PN : PhiToRemove) { |
1228 | PN->replaceAllUsesWith(V: PoisonValue::get(T: PN->getType())); |
1229 | PN->eraseFromParent(); |
1230 | } |
1231 | return; |
1232 | } |
1233 | |
1234 | // Remove any incoming values that come from an invalid predecessor |
1235 | for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(Val&: II); ++II) { |
1236 | std::vector<BasicBlock *> BlocksToRemove; |
1237 | for (BasicBlock *IncomingBB : Phi->blocks()) { |
1238 | if (!isPredecessor(BB, IncomingBB)) |
1239 | BlocksToRemove.push_back(x: IncomingBB); |
1240 | } |
1241 | for (BasicBlock *BB : BlocksToRemove) |
1242 | Phi->removeIncomingValue(BB); |
1243 | } |
1244 | } |
1245 | |
1246 | /// Checks if BB was already cloned for a particular next state value. If it |
1247 | /// was then it returns this cloned block, and otherwise null. |
1248 | BasicBlock *getClonedBB(BasicBlock *BB, const APInt &NextState, |
1249 | DuplicateBlockMap &DuplicateMap) { |
1250 | CloneList ClonedBBs = DuplicateMap[BB]; |
1251 | |
1252 | // Find an entry in the CloneList with this NextState. If it exists then |
1253 | // return the corresponding BB |
1254 | auto It = llvm::find_if(Range&: ClonedBBs, P: [NextState](const ClonedBlock &C) { |
1255 | return C.State == NextState; |
1256 | }); |
1257 | return It != ClonedBBs.end() ? (*It).BB : nullptr; |
1258 | } |
1259 | |
1260 | /// Helper to get the successor corresponding to a particular case value for |
1261 | /// a switch statement. |
1262 | BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, const APInt &NextState) { |
1263 | BasicBlock *NextCase = nullptr; |
1264 | for (auto Case : Switch->cases()) { |
1265 | if (Case.getCaseValue()->getValue() == NextState) { |
1266 | NextCase = Case.getCaseSuccessor(); |
1267 | break; |
1268 | } |
1269 | } |
1270 | if (!NextCase) |
1271 | NextCase = Switch->getDefaultDest(); |
1272 | return NextCase; |
1273 | } |
1274 | |
1275 | /// Returns true if IncomingBB is a predecessor of BB. |
1276 | bool isPredecessor(BasicBlock *BB, BasicBlock *IncomingBB) { |
1277 | return llvm::is_contained(Range: predecessors(BB), Element: IncomingBB); |
1278 | } |
1279 | |
1280 | AllSwitchPaths *SwitchPaths; |
1281 | DominatorTree *DT; |
1282 | AssumptionCache *AC; |
1283 | TargetTransformInfo *TTI; |
1284 | OptimizationRemarkEmitter *ORE; |
1285 | SmallPtrSet<const Value *, 32> EphValues; |
1286 | std::vector<ThreadingPath> TPaths; |
1287 | }; |
1288 | |
1289 | bool DFAJumpThreading::run(Function &F) { |
1290 | LLVM_DEBUG(dbgs() << "\nDFA Jump threading: " << F.getName() << "\n" ); |
1291 | |
1292 | if (F.hasOptSize()) { |
1293 | LLVM_DEBUG(dbgs() << "Skipping due to the 'minsize' attribute\n" ); |
1294 | return false; |
1295 | } |
1296 | |
1297 | if (ClViewCfgBefore) |
1298 | F.viewCFG(); |
1299 | |
1300 | SmallVector<AllSwitchPaths, 2> ThreadableLoops; |
1301 | bool MadeChanges = false; |
1302 | LoopInfoBroken = false; |
1303 | |
1304 | for (BasicBlock &BB : F) { |
1305 | auto *SI = dyn_cast<SwitchInst>(Val: BB.getTerminator()); |
1306 | if (!SI) |
1307 | continue; |
1308 | |
1309 | LLVM_DEBUG(dbgs() << "\nCheck if SwitchInst in BB " << BB.getName() |
1310 | << " is a candidate\n" ); |
1311 | MainSwitch Switch(SI, LI, ORE); |
1312 | |
1313 | if (!Switch.getInstr()) |
1314 | continue; |
1315 | |
1316 | LLVM_DEBUG(dbgs() << "\nSwitchInst in BB " << BB.getName() << " is a " |
1317 | << "candidate for jump threading\n" ); |
1318 | LLVM_DEBUG(SI->dump()); |
1319 | |
1320 | unfoldSelectInstrs(DT, SelectInsts: Switch.getSelectInsts()); |
1321 | if (!Switch.getSelectInsts().empty()) |
1322 | MadeChanges = true; |
1323 | |
1324 | AllSwitchPaths SwitchPaths(&Switch, ORE, LI); |
1325 | SwitchPaths.run(); |
1326 | |
1327 | if (SwitchPaths.getNumThreadingPaths() > 0) { |
1328 | ThreadableLoops.push_back(Elt: SwitchPaths); |
1329 | |
1330 | // For the time being limit this optimization to occurring once in a |
1331 | // function since it can change the CFG significantly. This is not a |
1332 | // strict requirement but it can cause buggy behavior if there is an |
1333 | // overlap of blocks in different opportunities. There is a lot of room to |
1334 | // experiment with catching more opportunities here. |
1335 | // NOTE: To release this contraint, we must handle LoopInfo invalidation |
1336 | break; |
1337 | } |
1338 | } |
1339 | |
1340 | #ifdef NDEBUG |
1341 | LI->verify(DomTree: *DT); |
1342 | #endif |
1343 | |
1344 | SmallPtrSet<const Value *, 32> EphValues; |
1345 | if (ThreadableLoops.size() > 0) |
1346 | CodeMetrics::collectEphemeralValues(L: &F, AC, EphValues); |
1347 | |
1348 | for (AllSwitchPaths SwitchPaths : ThreadableLoops) { |
1349 | TransformDFA Transform(&SwitchPaths, DT, AC, TTI, ORE, EphValues); |
1350 | Transform.run(); |
1351 | MadeChanges = true; |
1352 | LoopInfoBroken = true; |
1353 | } |
1354 | |
1355 | #ifdef EXPENSIVE_CHECKS |
1356 | assert(DT->verify(DominatorTree::VerificationLevel::Full)); |
1357 | verifyFunction(F, &dbgs()); |
1358 | #endif |
1359 | |
1360 | return MadeChanges; |
1361 | } |
1362 | |
1363 | } // end anonymous namespace |
1364 | |
1365 | /// Integrate with the new Pass Manager |
1366 | PreservedAnalyses DFAJumpThreadingPass::run(Function &F, |
1367 | FunctionAnalysisManager &AM) { |
1368 | AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(IR&: F); |
1369 | DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
1370 | LoopInfo &LI = AM.getResult<LoopAnalysis>(IR&: F); |
1371 | TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
1372 | OptimizationRemarkEmitter ORE(&F); |
1373 | DFAJumpThreading ThreadImpl(&AC, &DT, &LI, &TTI, &ORE); |
1374 | if (!ThreadImpl.run(F)) |
1375 | return PreservedAnalyses::all(); |
1376 | |
1377 | PreservedAnalyses PA; |
1378 | PA.preserve<DominatorTreeAnalysis>(); |
1379 | if (!ThreadImpl.LoopInfoBroken) |
1380 | PA.preserve<LoopAnalysis>(); |
1381 | return PA; |
1382 | } |
1383 | |