1 | //===- DFAJumpThreading.cpp - Threads a switch statement inside a loop ----===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Transform each threading path to effectively jump thread the DFA. For |
10 | // example, the CFG below could be transformed as follows, where the cloned |
11 | // blocks unconditionally branch to the next correct case based on what is |
12 | // identified in the analysis. |
13 | // |
14 | // sw.bb sw.bb |
15 | // / | \ / | \ |
16 | // case1 case2 case3 case1 case2 case3 |
17 | // \ | / | | | |
18 | // determinator det.2 det.3 det.1 |
19 | // br sw.bb / | \ |
20 | // sw.bb.2 sw.bb.3 sw.bb.1 |
21 | // br case2 br case3 br case1ยง |
22 | // |
23 | // Definitions and Terminology: |
24 | // |
25 | // * Threading path: |
26 | // a list of basic blocks, the exit state, and the block that determines |
27 | // the next state, for which the following notation will be used: |
28 | // < path of BBs that form a cycle > [ state, determinator ] |
29 | // |
30 | // * Predictable switch: |
31 | // The switch variable is always a known constant so that all conditional |
32 | // jumps based on switch variable can be converted to unconditional jump. |
33 | // |
34 | // * Determinator: |
35 | // The basic block that determines the next state of the DFA. |
36 | // |
37 | // Representing the optimization in C-like pseudocode: the code pattern on the |
38 | // left could functionally be transformed to the right pattern if the switch |
39 | // condition is predictable. |
40 | // |
41 | // X = A goto A |
42 | // for (...) A: |
43 | // switch (X) ... |
44 | // case A goto B |
45 | // X = B B: |
46 | // case B ... |
47 | // X = C goto C |
48 | // |
49 | // The pass first checks that switch variable X is decided by the control flow |
50 | // path taken in the loop; for example, in case B, the next value of X is |
51 | // decided to be C. It then enumerates through all paths in the loop and labels |
52 | // the basic blocks where the next state is decided. |
53 | // |
54 | // Using this information it creates new paths that unconditionally branch to |
55 | // the next case. This involves cloning code, so it only gets triggered if the |
56 | // amount of code duplicated is below a threshold. |
57 | // |
58 | //===----------------------------------------------------------------------===// |
59 | |
60 | #include "llvm/Transforms/Scalar/DFAJumpThreading.h" |
61 | #include "llvm/ADT/APInt.h" |
62 | #include "llvm/ADT/DenseMap.h" |
63 | #include "llvm/ADT/SmallSet.h" |
64 | #include "llvm/ADT/Statistic.h" |
65 | #include "llvm/Analysis/AssumptionCache.h" |
66 | #include "llvm/Analysis/CodeMetrics.h" |
67 | #include "llvm/Analysis/DomTreeUpdater.h" |
68 | #include "llvm/Analysis/LoopInfo.h" |
69 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
70 | #include "llvm/Analysis/TargetTransformInfo.h" |
71 | #include "llvm/IR/CFG.h" |
72 | #include "llvm/IR/Constants.h" |
73 | #include "llvm/IR/IntrinsicInst.h" |
74 | #include "llvm/Support/CommandLine.h" |
75 | #include "llvm/Support/Debug.h" |
76 | #include "llvm/Transforms/Utils/Cloning.h" |
77 | #include "llvm/Transforms/Utils/SSAUpdaterBulk.h" |
78 | #include "llvm/Transforms/Utils/ValueMapper.h" |
79 | #include <deque> |
80 | |
81 | #ifdef EXPENSIVE_CHECKS |
82 | #include "llvm/IR/Verifier.h" |
83 | #endif |
84 | |
85 | using namespace llvm; |
86 | |
87 | #define DEBUG_TYPE "dfa-jump-threading" |
88 | |
89 | STATISTIC(NumTransforms, "Number of transformations done" ); |
90 | STATISTIC(NumCloned, "Number of blocks cloned" ); |
91 | STATISTIC(NumPaths, "Number of individual paths threaded" ); |
92 | |
93 | static cl::opt<bool> |
94 | ClViewCfgBefore("dfa-jump-view-cfg-before" , |
95 | cl::desc("View the CFG before DFA Jump Threading" ), |
96 | cl::Hidden, cl::init(Val: false)); |
97 | |
98 | static cl::opt<bool> EarlyExitHeuristic( |
99 | "dfa-early-exit-heuristic" , |
100 | cl::desc("Exit early if an unpredictable value come from the same loop" ), |
101 | cl::Hidden, cl::init(Val: true)); |
102 | |
103 | static cl::opt<unsigned> MaxPathLength( |
104 | "dfa-max-path-length" , |
105 | cl::desc("Max number of blocks searched to find a threading path" ), |
106 | cl::Hidden, cl::init(Val: 20)); |
107 | |
108 | static cl::opt<unsigned> MaxNumVisitiedPaths( |
109 | "dfa-max-num-visited-paths" , |
110 | cl::desc( |
111 | "Max number of blocks visited while enumerating paths around a switch" ), |
112 | cl::Hidden, cl::init(Val: 2500)); |
113 | |
114 | static cl::opt<unsigned> |
115 | MaxNumPaths("dfa-max-num-paths" , |
116 | cl::desc("Max number of paths enumerated around a switch" ), |
117 | cl::Hidden, cl::init(Val: 200)); |
118 | |
119 | static cl::opt<unsigned> |
120 | CostThreshold("dfa-cost-threshold" , |
121 | cl::desc("Maximum cost accepted for the transformation" ), |
122 | cl::Hidden, cl::init(Val: 50)); |
123 | |
124 | namespace { |
125 | |
126 | class SelectInstToUnfold { |
127 | SelectInst *SI; |
128 | PHINode *SIUse; |
129 | |
130 | public: |
131 | SelectInstToUnfold(SelectInst *SI, PHINode *SIUse) : SI(SI), SIUse(SIUse) {} |
132 | |
133 | SelectInst *getInst() { return SI; } |
134 | PHINode *getUse() { return SIUse; } |
135 | |
136 | explicit operator bool() const { return SI && SIUse; } |
137 | }; |
138 | |
139 | void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold, |
140 | std::vector<SelectInstToUnfold> *NewSIsToUnfold, |
141 | std::vector<BasicBlock *> *NewBBs); |
142 | |
143 | class DFAJumpThreading { |
144 | public: |
145 | (AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI, |
146 | TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) |
147 | : AC(AC), DT(DT), LI(LI), TTI(TTI), ORE(ORE) {} |
148 | |
149 | bool run(Function &F); |
150 | bool LoopInfoBroken; |
151 | |
152 | private: |
153 | void |
154 | unfoldSelectInstrs(DominatorTree *DT, |
155 | const SmallVector<SelectInstToUnfold, 4> &SelectInsts) { |
156 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); |
157 | SmallVector<SelectInstToUnfold, 4> Stack(SelectInsts); |
158 | |
159 | while (!Stack.empty()) { |
160 | SelectInstToUnfold SIToUnfold = Stack.pop_back_val(); |
161 | |
162 | std::vector<SelectInstToUnfold> NewSIsToUnfold; |
163 | std::vector<BasicBlock *> NewBBs; |
164 | unfold(DTU: &DTU, LI, SIToUnfold, NewSIsToUnfold: &NewSIsToUnfold, NewBBs: &NewBBs); |
165 | |
166 | // Put newly discovered select instructions into the work list. |
167 | llvm::append_range(C&: Stack, R&: NewSIsToUnfold); |
168 | } |
169 | } |
170 | |
171 | AssumptionCache *AC; |
172 | DominatorTree *DT; |
173 | LoopInfo *LI; |
174 | TargetTransformInfo *TTI; |
175 | OptimizationRemarkEmitter *ORE; |
176 | }; |
177 | |
178 | } // end anonymous namespace |
179 | |
180 | namespace { |
181 | |
182 | /// Unfold the select instruction held in \p SIToUnfold by replacing it with |
183 | /// control flow. |
184 | /// |
185 | /// Put newly discovered select instructions into \p NewSIsToUnfold. Put newly |
186 | /// created basic blocks into \p NewBBs. |
187 | /// |
188 | /// TODO: merge it with CodeGenPrepare::optimizeSelectInst() if possible. |
189 | void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold, |
190 | std::vector<SelectInstToUnfold> *NewSIsToUnfold, |
191 | std::vector<BasicBlock *> *NewBBs) { |
192 | SelectInst *SI = SIToUnfold.getInst(); |
193 | PHINode *SIUse = SIToUnfold.getUse(); |
194 | BasicBlock *StartBlock = SI->getParent(); |
195 | BranchInst *StartBlockTerm = |
196 | dyn_cast<BranchInst>(Val: StartBlock->getTerminator()); |
197 | |
198 | assert(StartBlockTerm); |
199 | assert(SI->hasOneUse()); |
200 | |
201 | if (StartBlockTerm->isUnconditional()) { |
202 | BasicBlock *EndBlock = StartBlock->getUniqueSuccessor(); |
203 | // Arbitrarily choose the 'false' side for a new input value to the PHI. |
204 | BasicBlock *NewBlock = BasicBlock::Create( |
205 | Context&: SI->getContext(), Name: Twine(SI->getName(), ".si.unfold.false" ), |
206 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
207 | NewBBs->push_back(x: NewBlock); |
208 | BranchInst::Create(IfTrue: EndBlock, InsertBefore: NewBlock); |
209 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, NewBlock, EndBlock}}); |
210 | |
211 | // StartBlock |
212 | // | \ |
213 | // | NewBlock |
214 | // | / |
215 | // EndBlock |
216 | Value *SIOp1 = SI->getTrueValue(); |
217 | Value *SIOp2 = SI->getFalseValue(); |
218 | |
219 | PHINode *NewPhi = PHINode::Create(Ty: SIUse->getType(), NumReservedValues: 1, |
220 | NameStr: Twine(SIOp2->getName(), ".si.unfold.phi" ), |
221 | InsertBefore: NewBlock->getFirstInsertionPt()); |
222 | NewPhi->addIncoming(V: SIOp2, BB: StartBlock); |
223 | |
224 | // Update any other PHI nodes in EndBlock. |
225 | for (PHINode &Phi : EndBlock->phis()) { |
226 | if (SIUse == &Phi) |
227 | continue; |
228 | Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: StartBlock), BB: NewBlock); |
229 | } |
230 | |
231 | // Update the phi node of SI, which is its only use. |
232 | if (EndBlock == SIUse->getParent()) { |
233 | SIUse->addIncoming(V: NewPhi, BB: NewBlock); |
234 | SIUse->replaceUsesOfWith(From: SI, To: SIOp1); |
235 | } else { |
236 | PHINode *EndPhi = PHINode::Create(Ty: SIUse->getType(), NumReservedValues: pred_size(BB: EndBlock), |
237 | NameStr: Twine(SI->getName(), ".si.unfold.phi" ), |
238 | InsertBefore: EndBlock->getFirstInsertionPt()); |
239 | for (BasicBlock *Pred : predecessors(BB: EndBlock)) { |
240 | if (Pred != StartBlock && Pred != NewBlock) |
241 | EndPhi->addIncoming(V: EndPhi, BB: Pred); |
242 | } |
243 | |
244 | EndPhi->addIncoming(V: SIOp1, BB: StartBlock); |
245 | EndPhi->addIncoming(V: NewPhi, BB: NewBlock); |
246 | SIUse->replaceUsesOfWith(From: SI, To: EndPhi); |
247 | SIUse = EndPhi; |
248 | } |
249 | |
250 | if (auto *OpSi = dyn_cast<SelectInst>(Val: SIOp1)) |
251 | NewSIsToUnfold->push_back(x: SelectInstToUnfold(OpSi, SIUse)); |
252 | if (auto *OpSi = dyn_cast<SelectInst>(Val: SIOp2)) |
253 | NewSIsToUnfold->push_back(x: SelectInstToUnfold(OpSi, NewPhi)); |
254 | |
255 | // Insert the real conditional branch based on the original condition. |
256 | StartBlockTerm->eraseFromParent(); |
257 | BranchInst::Create(IfTrue: EndBlock, IfFalse: NewBlock, Cond: SI->getCondition(), InsertBefore: StartBlock); |
258 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, StartBlock, EndBlock}, |
259 | {DominatorTree::Insert, StartBlock, NewBlock}}); |
260 | } else { |
261 | BasicBlock *EndBlock = SIUse->getParent(); |
262 | BasicBlock *NewBlockT = BasicBlock::Create( |
263 | Context&: SI->getContext(), Name: Twine(SI->getName(), ".si.unfold.true" ), |
264 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
265 | BasicBlock *NewBlockF = BasicBlock::Create( |
266 | Context&: SI->getContext(), Name: Twine(SI->getName(), ".si.unfold.false" ), |
267 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
268 | |
269 | NewBBs->push_back(x: NewBlockT); |
270 | NewBBs->push_back(x: NewBlockF); |
271 | |
272 | // Def only has one use in EndBlock. |
273 | // Before transformation: |
274 | // StartBlock(Def) |
275 | // | \ |
276 | // EndBlock OtherBlock |
277 | // (Use) |
278 | // |
279 | // After transformation: |
280 | // StartBlock(Def) |
281 | // | \ |
282 | // | OtherBlock |
283 | // NewBlockT |
284 | // | \ |
285 | // | NewBlockF |
286 | // | / |
287 | // | / |
288 | // EndBlock |
289 | // (Use) |
290 | BranchInst::Create(IfTrue: EndBlock, InsertBefore: NewBlockF); |
291 | // Insert the real conditional branch based on the original condition. |
292 | BranchInst::Create(IfTrue: EndBlock, IfFalse: NewBlockF, Cond: SI->getCondition(), InsertBefore: NewBlockT); |
293 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, NewBlockT, NewBlockF}, |
294 | {DominatorTree::Insert, NewBlockT, EndBlock}, |
295 | {DominatorTree::Insert, NewBlockF, EndBlock}}); |
296 | |
297 | Value *TrueVal = SI->getTrueValue(); |
298 | Value *FalseVal = SI->getFalseValue(); |
299 | |
300 | PHINode *NewPhiT = PHINode::Create( |
301 | Ty: SIUse->getType(), NumReservedValues: 1, NameStr: Twine(TrueVal->getName(), ".si.unfold.phi" ), |
302 | InsertBefore: NewBlockT->getFirstInsertionPt()); |
303 | PHINode *NewPhiF = PHINode::Create( |
304 | Ty: SIUse->getType(), NumReservedValues: 1, NameStr: Twine(FalseVal->getName(), ".si.unfold.phi" ), |
305 | InsertBefore: NewBlockF->getFirstInsertionPt()); |
306 | NewPhiT->addIncoming(V: TrueVal, BB: StartBlock); |
307 | NewPhiF->addIncoming(V: FalseVal, BB: NewBlockT); |
308 | |
309 | if (auto *TrueSI = dyn_cast<SelectInst>(Val: TrueVal)) |
310 | NewSIsToUnfold->push_back(x: SelectInstToUnfold(TrueSI, NewPhiT)); |
311 | if (auto *FalseSi = dyn_cast<SelectInst>(Val: FalseVal)) |
312 | NewSIsToUnfold->push_back(x: SelectInstToUnfold(FalseSi, NewPhiF)); |
313 | |
314 | SIUse->addIncoming(V: NewPhiT, BB: NewBlockT); |
315 | SIUse->addIncoming(V: NewPhiF, BB: NewBlockF); |
316 | SIUse->removeIncomingValue(BB: StartBlock); |
317 | |
318 | // Update any other PHI nodes in EndBlock. |
319 | for (PHINode &Phi : EndBlock->phis()) { |
320 | if (SIUse == &Phi) |
321 | continue; |
322 | Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: StartBlock), BB: NewBlockT); |
323 | Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: StartBlock), BB: NewBlockF); |
324 | Phi.removeIncomingValue(BB: StartBlock); |
325 | } |
326 | |
327 | // Update the appropriate successor of the start block to point to the new |
328 | // unfolded block. |
329 | unsigned SuccNum = StartBlockTerm->getSuccessor(i: 1) == EndBlock ? 1 : 0; |
330 | StartBlockTerm->setSuccessor(idx: SuccNum, NewSucc: NewBlockT); |
331 | DTU->applyUpdates(Updates: {{DominatorTree::Delete, StartBlock, EndBlock}, |
332 | {DominatorTree::Insert, StartBlock, NewBlockT}}); |
333 | } |
334 | |
335 | // Preserve loop info |
336 | if (Loop *L = LI->getLoopFor(BB: SI->getParent())) { |
337 | for (BasicBlock *NewBB : *NewBBs) |
338 | L->addBasicBlockToLoop(NewBB, LI&: *LI); |
339 | } |
340 | |
341 | // The select is now dead. |
342 | assert(SI->use_empty() && "Select must be dead now" ); |
343 | SI->eraseFromParent(); |
344 | } |
345 | |
346 | struct ClonedBlock { |
347 | BasicBlock *BB; |
348 | APInt State; ///< \p State corresponds to the next value of a switch stmnt. |
349 | }; |
350 | |
351 | typedef std::deque<BasicBlock *> PathType; |
352 | typedef std::vector<PathType> PathsType; |
353 | typedef SmallPtrSet<const BasicBlock *, 8> VisitedBlocks; |
354 | typedef std::vector<ClonedBlock> CloneList; |
355 | |
356 | // This data structure keeps track of all blocks that have been cloned. If two |
357 | // different ThreadingPaths clone the same block for a certain state it should |
358 | // be reused, and it can be looked up in this map. |
359 | typedef DenseMap<BasicBlock *, CloneList> DuplicateBlockMap; |
360 | |
361 | // This map keeps track of all the new definitions for an instruction. This |
362 | // information is needed when restoring SSA form after cloning blocks. |
363 | typedef MapVector<Instruction *, std::vector<Instruction *>> DefMap; |
364 | |
365 | inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) { |
366 | OS << "< " ; |
367 | for (const BasicBlock *BB : Path) { |
368 | std::string BBName; |
369 | if (BB->hasName()) |
370 | raw_string_ostream(BBName) << BB->getName(); |
371 | else |
372 | raw_string_ostream(BBName) << BB; |
373 | OS << BBName << " " ; |
374 | } |
375 | OS << ">" ; |
376 | return OS; |
377 | } |
378 | |
379 | /// ThreadingPath is a path in the control flow of a loop that can be threaded |
380 | /// by cloning necessary basic blocks and replacing conditional branches with |
381 | /// unconditional ones. A threading path includes a list of basic blocks, the |
382 | /// exit state, and the block that determines the next state. |
383 | struct ThreadingPath { |
384 | /// Exit value is DFA's exit state for the given path. |
385 | APInt getExitValue() const { return ExitVal; } |
386 | void setExitValue(const ConstantInt *V) { |
387 | ExitVal = V->getValue(); |
388 | IsExitValSet = true; |
389 | } |
390 | bool isExitValueSet() const { return IsExitValSet; } |
391 | |
392 | /// Determinator is the basic block that determines the next state of the DFA. |
393 | const BasicBlock *getDeterminatorBB() const { return DBB; } |
394 | void setDeterminator(const BasicBlock *BB) { DBB = BB; } |
395 | |
396 | /// Path is a list of basic blocks. |
397 | const PathType &getPath() const { return Path; } |
398 | void setPath(const PathType &NewPath) { Path = NewPath; } |
399 | void push_back(BasicBlock *BB) { Path.push_back(x: BB); } |
400 | void push_front(BasicBlock *BB) { Path.push_front(x: BB); } |
401 | void appendExcludingFirst(const PathType &OtherPath) { |
402 | llvm::append_range(C&: Path, R: llvm::drop_begin(RangeOrContainer: OtherPath)); |
403 | } |
404 | |
405 | void print(raw_ostream &OS) const { |
406 | OS << Path << " [ " << ExitVal << ", " << DBB->getName() << " ]" ; |
407 | } |
408 | |
409 | private: |
410 | PathType Path; |
411 | APInt ExitVal; |
412 | const BasicBlock *DBB = nullptr; |
413 | bool IsExitValSet = false; |
414 | }; |
415 | |
416 | #ifndef NDEBUG |
417 | inline raw_ostream &operator<<(raw_ostream &OS, const ThreadingPath &TPath) { |
418 | TPath.print(OS); |
419 | return OS; |
420 | } |
421 | #endif |
422 | |
423 | struct MainSwitch { |
424 | MainSwitch(SwitchInst *SI, LoopInfo *LI, OptimizationRemarkEmitter *ORE) |
425 | : LI(LI) { |
426 | if (isCandidate(SI)) { |
427 | Instr = SI; |
428 | } else { |
429 | ORE->emit(RemarkBuilder: [&]() { |
430 | return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable" , SI) |
431 | << "Switch instruction is not predictable." ; |
432 | }); |
433 | } |
434 | } |
435 | |
436 | virtual ~MainSwitch() = default; |
437 | |
438 | SwitchInst *getInstr() const { return Instr; } |
439 | const SmallVector<SelectInstToUnfold, 4> getSelectInsts() { |
440 | return SelectInsts; |
441 | } |
442 | |
443 | private: |
444 | /// Do a use-def chain traversal starting from the switch condition to see if |
445 | /// \p SI is a potential condidate. |
446 | /// |
447 | /// Also, collect select instructions to unfold. |
448 | bool isCandidate(const SwitchInst *SI) { |
449 | std::deque<std::pair<Value *, BasicBlock *>> Q; |
450 | SmallSet<Value *, 16> SeenValues; |
451 | SelectInsts.clear(); |
452 | |
453 | Value *SICond = SI->getCondition(); |
454 | LLVM_DEBUG(dbgs() << "\tSICond: " << *SICond << "\n" ); |
455 | if (!isa<PHINode>(Val: SICond)) |
456 | return false; |
457 | |
458 | // The switch must be in a loop. |
459 | const Loop *L = LI->getLoopFor(BB: SI->getParent()); |
460 | if (!L) |
461 | return false; |
462 | |
463 | addToQueue(Val: SICond, BB: nullptr, Q, SeenValues); |
464 | |
465 | while (!Q.empty()) { |
466 | Value *Current = Q.front().first; |
467 | BasicBlock *CurrentIncomingBB = Q.front().second; |
468 | Q.pop_front(); |
469 | |
470 | if (auto *Phi = dyn_cast<PHINode>(Val: Current)) { |
471 | for (BasicBlock *IncomingBB : Phi->blocks()) { |
472 | Value *Incoming = Phi->getIncomingValueForBlock(BB: IncomingBB); |
473 | addToQueue(Val: Incoming, BB: IncomingBB, Q, SeenValues); |
474 | } |
475 | LLVM_DEBUG(dbgs() << "\tphi: " << *Phi << "\n" ); |
476 | } else if (SelectInst *SelI = dyn_cast<SelectInst>(Val: Current)) { |
477 | if (!isValidSelectInst(SI: SelI)) |
478 | return false; |
479 | addToQueue(Val: SelI->getTrueValue(), BB: CurrentIncomingBB, Q, SeenValues); |
480 | addToQueue(Val: SelI->getFalseValue(), BB: CurrentIncomingBB, Q, SeenValues); |
481 | LLVM_DEBUG(dbgs() << "\tselect: " << *SelI << "\n" ); |
482 | if (auto *SelIUse = dyn_cast<PHINode>(Val: SelI->user_back())) |
483 | SelectInsts.push_back(Elt: SelectInstToUnfold(SelI, SelIUse)); |
484 | } else if (isa<Constant>(Val: Current)) { |
485 | LLVM_DEBUG(dbgs() << "\tconst: " << *Current << "\n" ); |
486 | continue; |
487 | } else { |
488 | LLVM_DEBUG(dbgs() << "\tother: " << *Current << "\n" ); |
489 | // Allow unpredictable values. The hope is that those will be the |
490 | // initial switch values that can be ignored (they will hit the |
491 | // unthreaded switch) but this assumption will get checked later after |
492 | // paths have been enumerated (in function getStateDefMap). |
493 | |
494 | // If the unpredictable value comes from the same inner loop it is |
495 | // likely that it will also be on the enumerated paths, causing us to |
496 | // exit after we have enumerated all the paths. This heuristic save |
497 | // compile time because a search for all the paths can become expensive. |
498 | if (EarlyExitHeuristic && |
499 | L->contains(L: LI->getLoopFor(BB: CurrentIncomingBB))) { |
500 | LLVM_DEBUG(dbgs() |
501 | << "\tExiting early due to unpredictability heuristic.\n" ); |
502 | return false; |
503 | } |
504 | |
505 | continue; |
506 | } |
507 | } |
508 | |
509 | return true; |
510 | } |
511 | |
512 | void addToQueue(Value *Val, BasicBlock *BB, |
513 | std::deque<std::pair<Value *, BasicBlock *>> &Q, |
514 | SmallSet<Value *, 16> &SeenValues) { |
515 | if (SeenValues.insert(Ptr: Val).second) |
516 | Q.push_back(x: {Val, BB}); |
517 | } |
518 | |
519 | bool isValidSelectInst(SelectInst *SI) { |
520 | if (!SI->hasOneUse()) |
521 | return false; |
522 | |
523 | Instruction *SIUse = dyn_cast<Instruction>(Val: SI->user_back()); |
524 | // The use of the select inst should be either a phi or another select. |
525 | if (!SIUse && !(isa<PHINode>(Val: SIUse) || isa<SelectInst>(Val: SIUse))) |
526 | return false; |
527 | |
528 | BasicBlock *SIBB = SI->getParent(); |
529 | |
530 | // Currently, we can only expand select instructions in basic blocks with |
531 | // one successor. |
532 | BranchInst *SITerm = dyn_cast<BranchInst>(Val: SIBB->getTerminator()); |
533 | if (!SITerm || !SITerm->isUnconditional()) |
534 | return false; |
535 | |
536 | // Only fold the select coming from directly where it is defined. |
537 | PHINode *PHIUser = dyn_cast<PHINode>(Val: SIUse); |
538 | if (PHIUser && PHIUser->getIncomingBlock(U: *SI->use_begin()) != SIBB) |
539 | return false; |
540 | |
541 | // If select will not be sunk during unfolding, and it is in the same basic |
542 | // block as another state defining select, then cannot unfold both. |
543 | for (SelectInstToUnfold SIToUnfold : SelectInsts) { |
544 | SelectInst *PrevSI = SIToUnfold.getInst(); |
545 | if (PrevSI->getTrueValue() != SI && PrevSI->getFalseValue() != SI && |
546 | PrevSI->getParent() == SI->getParent()) |
547 | return false; |
548 | } |
549 | |
550 | return true; |
551 | } |
552 | |
553 | LoopInfo *LI; |
554 | SwitchInst *Instr = nullptr; |
555 | SmallVector<SelectInstToUnfold, 4> SelectInsts; |
556 | }; |
557 | |
558 | struct AllSwitchPaths { |
559 | AllSwitchPaths(const MainSwitch *MSwitch, OptimizationRemarkEmitter *ORE, |
560 | LoopInfo *LI, Loop *L) |
561 | : Switch(MSwitch->getInstr()), SwitchBlock(Switch->getParent()), ORE(ORE), |
562 | LI(LI), SwitchOuterLoop(L) {} |
563 | |
564 | std::vector<ThreadingPath> &getThreadingPaths() { return TPaths; } |
565 | unsigned getNumThreadingPaths() { return TPaths.size(); } |
566 | SwitchInst *getSwitchInst() { return Switch; } |
567 | BasicBlock *getSwitchBlock() { return SwitchBlock; } |
568 | |
569 | void run() { |
570 | StateDefMap StateDef = getStateDefMap(); |
571 | if (StateDef.empty()) { |
572 | ORE->emit(RemarkBuilder: [&]() { |
573 | return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable" , |
574 | Switch) |
575 | << "Switch instruction is not predictable." ; |
576 | }); |
577 | return; |
578 | } |
579 | |
580 | auto *SwitchPhi = cast<PHINode>(Val: Switch->getOperand(i_nocapture: 0)); |
581 | auto *SwitchPhiDefBB = SwitchPhi->getParent(); |
582 | VisitedBlocks VB; |
583 | // Get paths from the determinator BBs to SwitchPhiDefBB |
584 | std::vector<ThreadingPath> PathsToPhiDef = |
585 | getPathsFromStateDefMap(StateDef, Phi: SwitchPhi, VB); |
586 | if (SwitchPhiDefBB == SwitchBlock) { |
587 | TPaths = std::move(PathsToPhiDef); |
588 | return; |
589 | } |
590 | |
591 | // Find and append paths from SwitchPhiDefBB to SwitchBlock. |
592 | PathsType PathsToSwitchBB = |
593 | paths(BB: SwitchPhiDefBB, ToBB: SwitchBlock, Visited&: VB, /* PathDepth = */ 1); |
594 | if (PathsToSwitchBB.empty()) |
595 | return; |
596 | |
597 | std::vector<ThreadingPath> TempList; |
598 | for (const ThreadingPath &Path : PathsToPhiDef) { |
599 | for (const PathType &PathToSw : PathsToSwitchBB) { |
600 | ThreadingPath PathCopy(Path); |
601 | PathCopy.appendExcludingFirst(OtherPath: PathToSw); |
602 | TempList.push_back(x: PathCopy); |
603 | } |
604 | } |
605 | TPaths = std::move(TempList); |
606 | } |
607 | |
608 | private: |
609 | // Value: an instruction that defines a switch state; |
610 | // Key: the parent basic block of that instruction. |
611 | typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap; |
612 | std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef, |
613 | PHINode *Phi, |
614 | VisitedBlocks &VB) { |
615 | std::vector<ThreadingPath> Res; |
616 | auto *PhiBB = Phi->getParent(); |
617 | VB.insert(Ptr: PhiBB); |
618 | |
619 | VisitedBlocks UniqueBlocks; |
620 | for (auto *IncomingBB : Phi->blocks()) { |
621 | if (!UniqueBlocks.insert(Ptr: IncomingBB).second) |
622 | continue; |
623 | if (!SwitchOuterLoop->contains(BB: IncomingBB)) |
624 | continue; |
625 | |
626 | Value *IncomingValue = Phi->getIncomingValueForBlock(BB: IncomingBB); |
627 | // We found the determinator. This is the start of our path. |
628 | if (auto *C = dyn_cast<ConstantInt>(Val: IncomingValue)) { |
629 | // SwitchBlock is the determinator, unsupported unless its also the def. |
630 | if (PhiBB == SwitchBlock && |
631 | SwitchBlock != cast<PHINode>(Val: Switch->getOperand(i_nocapture: 0))->getParent()) |
632 | continue; |
633 | ThreadingPath NewPath; |
634 | NewPath.setDeterminator(PhiBB); |
635 | NewPath.setExitValue(C); |
636 | // Don't add SwitchBlock at the start, this is handled later. |
637 | if (IncomingBB != SwitchBlock) |
638 | NewPath.push_back(BB: IncomingBB); |
639 | NewPath.push_back(BB: PhiBB); |
640 | Res.push_back(x: NewPath); |
641 | continue; |
642 | } |
643 | // Don't get into a cycle. |
644 | if (VB.contains(Ptr: IncomingBB) || IncomingBB == SwitchBlock) |
645 | continue; |
646 | // Recurse up the PHI chain. |
647 | auto *IncomingPhi = dyn_cast<PHINode>(Val: IncomingValue); |
648 | if (!IncomingPhi) |
649 | continue; |
650 | auto *IncomingPhiDefBB = IncomingPhi->getParent(); |
651 | if (!StateDef.contains(Val: IncomingPhiDefBB)) |
652 | continue; |
653 | |
654 | // Direct predecessor, just add to the path. |
655 | if (IncomingPhiDefBB == IncomingBB) { |
656 | std::vector<ThreadingPath> PredPaths = |
657 | getPathsFromStateDefMap(StateDef, Phi: IncomingPhi, VB); |
658 | for (ThreadingPath &Path : PredPaths) { |
659 | Path.push_back(BB: PhiBB); |
660 | Res.push_back(x: std::move(Path)); |
661 | } |
662 | continue; |
663 | } |
664 | // Not a direct predecessor, find intermediate paths to append to the |
665 | // existing path. |
666 | if (VB.contains(Ptr: IncomingPhiDefBB)) |
667 | continue; |
668 | |
669 | PathsType IntermediatePaths; |
670 | IntermediatePaths = |
671 | paths(BB: IncomingPhiDefBB, ToBB: IncomingBB, Visited&: VB, /* PathDepth = */ 1); |
672 | if (IntermediatePaths.empty()) |
673 | continue; |
674 | |
675 | std::vector<ThreadingPath> PredPaths = |
676 | getPathsFromStateDefMap(StateDef, Phi: IncomingPhi, VB); |
677 | for (const ThreadingPath &Path : PredPaths) { |
678 | for (const PathType &IPath : IntermediatePaths) { |
679 | ThreadingPath NewPath(Path); |
680 | NewPath.appendExcludingFirst(OtherPath: IPath); |
681 | NewPath.push_back(BB: PhiBB); |
682 | Res.push_back(x: NewPath); |
683 | } |
684 | } |
685 | } |
686 | VB.erase(Ptr: PhiBB); |
687 | return Res; |
688 | } |
689 | |
690 | PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited, |
691 | unsigned PathDepth) { |
692 | PathsType Res; |
693 | |
694 | // Stop exploring paths after visiting MaxPathLength blocks |
695 | if (PathDepth > MaxPathLength) { |
696 | ORE->emit(RemarkBuilder: [&]() { |
697 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "MaxPathLengthReached" , |
698 | Switch) |
699 | << "Exploration stopped after visiting MaxPathLength=" |
700 | << ore::NV("MaxPathLength" , MaxPathLength) << " blocks." ; |
701 | }); |
702 | return Res; |
703 | } |
704 | |
705 | Visited.insert(Ptr: BB); |
706 | if (++NumVisited > MaxNumVisitiedPaths) |
707 | return Res; |
708 | |
709 | // Stop if we have reached the BB out of loop, since its successors have no |
710 | // impact on the DFA. |
711 | if (!SwitchOuterLoop->contains(BB)) |
712 | return Res; |
713 | |
714 | // Some blocks have multiple edges to the same successor, and this set |
715 | // is used to prevent a duplicate path from being generated |
716 | SmallSet<BasicBlock *, 4> Successors; |
717 | for (BasicBlock *Succ : successors(BB)) { |
718 | if (!Successors.insert(Ptr: Succ).second) |
719 | continue; |
720 | |
721 | // Found a cycle through the final block. |
722 | if (Succ == ToBB) { |
723 | Res.push_back(x: {BB, ToBB}); |
724 | continue; |
725 | } |
726 | |
727 | // We have encountered a cycle, do not get caught in it |
728 | if (Visited.contains(Ptr: Succ)) |
729 | continue; |
730 | |
731 | auto *CurrLoop = LI->getLoopFor(BB); |
732 | // Unlikely to be beneficial. |
733 | if (Succ == CurrLoop->getHeader()) |
734 | continue; |
735 | // Skip for now, revisit this condition later to see the impact on |
736 | // coverage and compile time. |
737 | if (LI->getLoopFor(BB: Succ) != CurrLoop) |
738 | continue; |
739 | |
740 | PathsType SuccPaths = paths(BB: Succ, ToBB, Visited, PathDepth: PathDepth + 1); |
741 | for (PathType &Path : SuccPaths) { |
742 | Path.push_front(x: BB); |
743 | Res.push_back(x: Path); |
744 | if (Res.size() >= MaxNumPaths) { |
745 | return Res; |
746 | } |
747 | } |
748 | } |
749 | // This block could now be visited again from a different predecessor. Note |
750 | // that this will result in exponential runtime. Subpaths could possibly be |
751 | // cached but it takes a lot of memory to store them. |
752 | Visited.erase(Ptr: BB); |
753 | return Res; |
754 | } |
755 | |
756 | /// Walk the use-def chain and collect all the state-defining blocks and the |
757 | /// PHI nodes in those blocks that define the state. |
758 | StateDefMap getStateDefMap() const { |
759 | StateDefMap Res; |
760 | PHINode *FirstDef = dyn_cast<PHINode>(Val: Switch->getOperand(i_nocapture: 0)); |
761 | assert(FirstDef && "The first definition must be a phi." ); |
762 | |
763 | SmallVector<PHINode *, 8> Stack; |
764 | Stack.push_back(Elt: FirstDef); |
765 | SmallSet<Value *, 16> SeenValues; |
766 | |
767 | while (!Stack.empty()) { |
768 | PHINode *CurPhi = Stack.pop_back_val(); |
769 | |
770 | Res[CurPhi->getParent()] = CurPhi; |
771 | SeenValues.insert(Ptr: CurPhi); |
772 | |
773 | for (BasicBlock *IncomingBB : CurPhi->blocks()) { |
774 | PHINode *IncomingPhi = |
775 | dyn_cast<PHINode>(Val: CurPhi->getIncomingValueForBlock(BB: IncomingBB)); |
776 | if (!IncomingPhi) |
777 | continue; |
778 | bool IsOutsideLoops = !SwitchOuterLoop->contains(BB: IncomingBB); |
779 | if (SeenValues.contains(Ptr: IncomingPhi) || IsOutsideLoops) |
780 | continue; |
781 | |
782 | Stack.push_back(Elt: IncomingPhi); |
783 | } |
784 | } |
785 | |
786 | return Res; |
787 | } |
788 | |
789 | unsigned NumVisited = 0; |
790 | SwitchInst *Switch; |
791 | BasicBlock *SwitchBlock; |
792 | OptimizationRemarkEmitter *ORE; |
793 | std::vector<ThreadingPath> TPaths; |
794 | LoopInfo *LI; |
795 | Loop *SwitchOuterLoop; |
796 | }; |
797 | |
798 | struct TransformDFA { |
799 | TransformDFA(AllSwitchPaths *SwitchPaths, DominatorTree *DT, |
800 | AssumptionCache *AC, TargetTransformInfo *TTI, |
801 | OptimizationRemarkEmitter *ORE, |
802 | SmallPtrSet<const Value *, 32> EphValues) |
803 | : SwitchPaths(SwitchPaths), DT(DT), AC(AC), TTI(TTI), ORE(ORE), |
804 | EphValues(EphValues) {} |
805 | |
806 | void run() { |
807 | if (isLegalAndProfitableToTransform()) { |
808 | createAllExitPaths(); |
809 | NumTransforms++; |
810 | } |
811 | } |
812 | |
813 | private: |
814 | /// This function performs both a legality check and profitability check at |
815 | /// the same time since it is convenient to do so. It iterates through all |
816 | /// blocks that will be cloned, and keeps track of the duplication cost. It |
817 | /// also returns false if it is illegal to clone some required block. |
818 | bool isLegalAndProfitableToTransform() { |
819 | CodeMetrics Metrics; |
820 | SwitchInst *Switch = SwitchPaths->getSwitchInst(); |
821 | |
822 | // Don't thread switch without multiple successors. |
823 | if (Switch->getNumSuccessors() <= 1) |
824 | return false; |
825 | |
826 | // Note that DuplicateBlockMap is not being used as intended here. It is |
827 | // just being used to ensure (BB, State) pairs are only counted once. |
828 | DuplicateBlockMap DuplicateMap; |
829 | |
830 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { |
831 | PathType PathBBs = TPath.getPath(); |
832 | APInt NextState = TPath.getExitValue(); |
833 | const BasicBlock *Determinator = TPath.getDeterminatorBB(); |
834 | |
835 | // Update Metrics for the Switch block, this is always cloned |
836 | BasicBlock *BB = SwitchPaths->getSwitchBlock(); |
837 | BasicBlock *VisitedBB = getClonedBB(BB, NextState, DuplicateMap); |
838 | if (!VisitedBB) { |
839 | Metrics.analyzeBasicBlock(BB, TTI: *TTI, EphValues); |
840 | DuplicateMap[BB].push_back(x: {.BB: BB, .State: NextState}); |
841 | } |
842 | |
843 | // If the Switch block is the Determinator, then we can continue since |
844 | // this is the only block that is cloned and we already counted for it. |
845 | if (PathBBs.front() == Determinator) |
846 | continue; |
847 | |
848 | // Otherwise update Metrics for all blocks that will be cloned. If any |
849 | // block is already cloned and would be reused, don't double count it. |
850 | auto DetIt = llvm::find(Range&: PathBBs, Val: Determinator); |
851 | for (auto BBIt = DetIt; BBIt != PathBBs.end(); BBIt++) { |
852 | BB = *BBIt; |
853 | VisitedBB = getClonedBB(BB, NextState, DuplicateMap); |
854 | if (VisitedBB) |
855 | continue; |
856 | Metrics.analyzeBasicBlock(BB, TTI: *TTI, EphValues); |
857 | DuplicateMap[BB].push_back(x: {.BB: BB, .State: NextState}); |
858 | } |
859 | |
860 | if (Metrics.notDuplicatable) { |
861 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " |
862 | << "non-duplicatable instructions.\n" ); |
863 | ORE->emit(RemarkBuilder: [&]() { |
864 | return OptimizationRemarkMissed(DEBUG_TYPE, "NonDuplicatableInst" , |
865 | Switch) |
866 | << "Contains non-duplicatable instructions." ; |
867 | }); |
868 | return false; |
869 | } |
870 | |
871 | // FIXME: Allow jump threading with controlled convergence. |
872 | if (Metrics.Convergence != ConvergenceKind::None) { |
873 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " |
874 | << "convergent instructions.\n" ); |
875 | ORE->emit(RemarkBuilder: [&]() { |
876 | return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst" , Switch) |
877 | << "Contains convergent instructions." ; |
878 | }); |
879 | return false; |
880 | } |
881 | |
882 | if (!Metrics.NumInsts.isValid()) { |
883 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " |
884 | << "instructions with invalid cost.\n" ); |
885 | ORE->emit(RemarkBuilder: [&]() { |
886 | return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst" , Switch) |
887 | << "Contains instructions with invalid cost." ; |
888 | }); |
889 | return false; |
890 | } |
891 | } |
892 | |
893 | InstructionCost DuplicationCost = 0; |
894 | |
895 | unsigned JumpTableSize = 0; |
896 | TTI->getEstimatedNumberOfCaseClusters(SI: *Switch, JTSize&: JumpTableSize, PSI: nullptr, |
897 | BFI: nullptr); |
898 | if (JumpTableSize == 0) { |
899 | // Factor in the number of conditional branches reduced from jump |
900 | // threading. Assume that lowering the switch block is implemented by |
901 | // using binary search, hence the LogBase2(). |
902 | unsigned CondBranches = |
903 | APInt(32, Switch->getNumSuccessors()).ceilLogBase2(); |
904 | assert(CondBranches > 0 && |
905 | "The threaded switch must have multiple branches" ); |
906 | DuplicationCost = Metrics.NumInsts / CondBranches; |
907 | } else { |
908 | // Compared with jump tables, the DFA optimizer removes an indirect branch |
909 | // on each loop iteration, thus making branch prediction more precise. The |
910 | // more branch targets there are, the more likely it is for the branch |
911 | // predictor to make a mistake, and the more benefit there is in the DFA |
912 | // optimizer. Thus, the more branch targets there are, the lower is the |
913 | // cost of the DFA opt. |
914 | DuplicationCost = Metrics.NumInsts / JumpTableSize; |
915 | } |
916 | |
917 | LLVM_DEBUG(dbgs() << "\nDFA Jump Threading: Cost to jump thread block " |
918 | << SwitchPaths->getSwitchBlock()->getName() |
919 | << " is: " << DuplicationCost << "\n\n" ); |
920 | |
921 | if (DuplicationCost > CostThreshold) { |
922 | LLVM_DEBUG(dbgs() << "Not jump threading, duplication cost exceeds the " |
923 | << "cost threshold.\n" ); |
924 | ORE->emit(RemarkBuilder: [&]() { |
925 | return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable" , Switch) |
926 | << "Duplication cost exceeds the cost threshold (cost=" |
927 | << ore::NV("Cost" , DuplicationCost) |
928 | << ", threshold=" << ore::NV("Threshold" , CostThreshold) << ")." ; |
929 | }); |
930 | return false; |
931 | } |
932 | |
933 | ORE->emit(RemarkBuilder: [&]() { |
934 | return OptimizationRemark(DEBUG_TYPE, "JumpThreaded" , Switch) |
935 | << "Switch statement jump-threaded." ; |
936 | }); |
937 | |
938 | return true; |
939 | } |
940 | |
941 | /// Transform each threading path to effectively jump thread the DFA. |
942 | void createAllExitPaths() { |
943 | DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Eager); |
944 | |
945 | // Move the switch block to the end of the path, since it will be duplicated |
946 | BasicBlock *SwitchBlock = SwitchPaths->getSwitchBlock(); |
947 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { |
948 | LLVM_DEBUG(dbgs() << TPath << "\n" ); |
949 | // TODO: Fix exit path creation logic so that we dont need this |
950 | // placeholder. |
951 | TPath.push_front(BB: SwitchBlock); |
952 | } |
953 | |
954 | // Transform the ThreadingPaths and keep track of the cloned values |
955 | DuplicateBlockMap DuplicateMap; |
956 | DefMap NewDefs; |
957 | |
958 | SmallSet<BasicBlock *, 16> BlocksToClean; |
959 | BlocksToClean.insert_range(R: successors(BB: SwitchBlock)); |
960 | |
961 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { |
962 | createExitPath(NewDefs, Path&: TPath, DuplicateMap, BlocksToClean, DTU: &DTU); |
963 | NumPaths++; |
964 | } |
965 | |
966 | // After all paths are cloned, now update the last successor of the cloned |
967 | // path so it skips over the switch statement |
968 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) |
969 | updateLastSuccessor(TPath, DuplicateMap, DTU: &DTU); |
970 | |
971 | // For each instruction that was cloned and used outside, update its uses |
972 | updateSSA(NewDefs); |
973 | |
974 | // Clean PHI Nodes for the newly created blocks |
975 | for (BasicBlock *BB : BlocksToClean) |
976 | cleanPhiNodes(BB); |
977 | } |
978 | |
979 | /// For a specific ThreadingPath \p Path, create an exit path starting from |
980 | /// the determinator block. |
981 | /// |
982 | /// To remember the correct destination, we have to duplicate blocks |
983 | /// corresponding to each state. Also update the terminating instruction of |
984 | /// the predecessors, and phis in the successor blocks. |
985 | void createExitPath(DefMap &NewDefs, ThreadingPath &Path, |
986 | DuplicateBlockMap &DuplicateMap, |
987 | SmallSet<BasicBlock *, 16> &BlocksToClean, |
988 | DomTreeUpdater *DTU) { |
989 | APInt NextState = Path.getExitValue(); |
990 | const BasicBlock *Determinator = Path.getDeterminatorBB(); |
991 | PathType PathBBs = Path.getPath(); |
992 | |
993 | // Don't select the placeholder block in front |
994 | if (PathBBs.front() == Determinator) |
995 | PathBBs.pop_front(); |
996 | |
997 | auto DetIt = llvm::find(Range&: PathBBs, Val: Determinator); |
998 | // When there is only one BB in PathBBs, the determinator takes itself as a |
999 | // direct predecessor. |
1000 | BasicBlock *PrevBB = PathBBs.size() == 1 ? *DetIt : *std::prev(x: DetIt); |
1001 | for (auto BBIt = DetIt; BBIt != PathBBs.end(); BBIt++) { |
1002 | BasicBlock *BB = *BBIt; |
1003 | BlocksToClean.insert(Ptr: BB); |
1004 | |
1005 | // We already cloned BB for this NextState, now just update the branch |
1006 | // and continue. |
1007 | BasicBlock *NextBB = getClonedBB(BB, NextState, DuplicateMap); |
1008 | if (NextBB) { |
1009 | updatePredecessor(PrevBB, OldBB: BB, NewBB: NextBB, DTU); |
1010 | PrevBB = NextBB; |
1011 | continue; |
1012 | } |
1013 | |
1014 | // Clone the BB and update the successor of Prev to jump to the new block |
1015 | BasicBlock *NewBB = cloneBlockAndUpdatePredecessor( |
1016 | BB, PrevBB, NextState, DuplicateMap, NewDefs, DTU); |
1017 | DuplicateMap[BB].push_back(x: {.BB: NewBB, .State: NextState}); |
1018 | BlocksToClean.insert(Ptr: NewBB); |
1019 | PrevBB = NewBB; |
1020 | } |
1021 | } |
1022 | |
1023 | /// Restore SSA form after cloning blocks. |
1024 | /// |
1025 | /// Each cloned block creates new defs for a variable, and the uses need to be |
1026 | /// updated to reflect this. The uses may be replaced with a cloned value, or |
1027 | /// some derived phi instruction. Note that all uses of a value defined in the |
1028 | /// same block were already remapped when cloning the block. |
1029 | void updateSSA(DefMap &NewDefs) { |
1030 | SSAUpdaterBulk SSAUpdate; |
1031 | SmallVector<Use *, 16> UsesToRename; |
1032 | |
1033 | for (const auto &KV : NewDefs) { |
1034 | Instruction *I = KV.first; |
1035 | BasicBlock *BB = I->getParent(); |
1036 | std::vector<Instruction *> Cloned = KV.second; |
1037 | |
1038 | // Scan all uses of this instruction to see if it is used outside of its |
1039 | // block, and if so, record them in UsesToRename. |
1040 | for (Use &U : I->uses()) { |
1041 | Instruction *User = cast<Instruction>(Val: U.getUser()); |
1042 | if (PHINode *UserPN = dyn_cast<PHINode>(Val: User)) { |
1043 | if (UserPN->getIncomingBlock(U) == BB) |
1044 | continue; |
1045 | } else if (User->getParent() == BB) { |
1046 | continue; |
1047 | } |
1048 | |
1049 | UsesToRename.push_back(Elt: &U); |
1050 | } |
1051 | |
1052 | // If there are no uses outside the block, we're done with this |
1053 | // instruction. |
1054 | if (UsesToRename.empty()) |
1055 | continue; |
1056 | LLVM_DEBUG(dbgs() << "DFA-JT: Renaming non-local uses of: " << *I |
1057 | << "\n" ); |
1058 | |
1059 | // We found a use of I outside of BB. Rename all uses of I that are |
1060 | // outside its block to be uses of the appropriate PHI node etc. See |
1061 | // ValuesInBlocks with the values we know. |
1062 | unsigned VarNum = SSAUpdate.AddVariable(Name: I->getName(), Ty: I->getType()); |
1063 | SSAUpdate.AddAvailableValue(Var: VarNum, BB, V: I); |
1064 | for (Instruction *New : Cloned) |
1065 | SSAUpdate.AddAvailableValue(Var: VarNum, BB: New->getParent(), V: New); |
1066 | |
1067 | while (!UsesToRename.empty()) |
1068 | SSAUpdate.AddUse(Var: VarNum, U: UsesToRename.pop_back_val()); |
1069 | |
1070 | LLVM_DEBUG(dbgs() << "\n" ); |
1071 | } |
1072 | // SSAUpdater handles phi placement and renaming uses with the appropriate |
1073 | // value. |
1074 | SSAUpdate.RewriteAllUses(DT); |
1075 | } |
1076 | |
1077 | /// Clones a basic block, and adds it to the CFG. |
1078 | /// |
1079 | /// This function also includes updating phi nodes in the successors of the |
1080 | /// BB, and remapping uses that were defined locally in the cloned BB. |
1081 | BasicBlock *cloneBlockAndUpdatePredecessor(BasicBlock *BB, BasicBlock *PrevBB, |
1082 | const APInt &NextState, |
1083 | DuplicateBlockMap &DuplicateMap, |
1084 | DefMap &NewDefs, |
1085 | DomTreeUpdater *DTU) { |
1086 | ValueToValueMapTy VMap; |
1087 | BasicBlock *NewBB = CloneBasicBlock( |
1088 | BB, VMap, NameSuffix: ".jt" + std::to_string(val: NextState.getLimitedValue()), |
1089 | F: BB->getParent()); |
1090 | NewBB->moveAfter(MovePos: BB); |
1091 | NumCloned++; |
1092 | |
1093 | for (Instruction &I : *NewBB) { |
1094 | // Do not remap operands of PHINode in case a definition in BB is an |
1095 | // incoming value to a phi in the same block. This incoming value will |
1096 | // be renamed later while restoring SSA. |
1097 | if (isa<PHINode>(Val: &I)) |
1098 | continue; |
1099 | RemapInstruction(I: &I, VM&: VMap, |
1100 | Flags: RF_IgnoreMissingLocals | RF_NoModuleLevelChanges); |
1101 | if (AssumeInst *II = dyn_cast<AssumeInst>(Val: &I)) |
1102 | AC->registerAssumption(CI: II); |
1103 | } |
1104 | |
1105 | updateSuccessorPhis(BB, ClonedBB: NewBB, NextState, VMap, DuplicateMap); |
1106 | updatePredecessor(PrevBB, OldBB: BB, NewBB, DTU); |
1107 | updateDefMap(NewDefs, VMap); |
1108 | |
1109 | // Add all successors to the DominatorTree |
1110 | SmallPtrSet<BasicBlock *, 4> SuccSet; |
1111 | for (auto *SuccBB : successors(BB: NewBB)) { |
1112 | if (SuccSet.insert(Ptr: SuccBB).second) |
1113 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, NewBB, SuccBB}}); |
1114 | } |
1115 | SuccSet.clear(); |
1116 | return NewBB; |
1117 | } |
1118 | |
1119 | /// Update the phi nodes in BB's successors. |
1120 | /// |
1121 | /// This means creating a new incoming value from NewBB with the new |
1122 | /// instruction wherever there is an incoming value from BB. |
1123 | void updateSuccessorPhis(BasicBlock *BB, BasicBlock *ClonedBB, |
1124 | const APInt &NextState, ValueToValueMapTy &VMap, |
1125 | DuplicateBlockMap &DuplicateMap) { |
1126 | std::vector<BasicBlock *> BlocksToUpdate; |
1127 | |
1128 | // If BB is the last block in the path, we can simply update the one case |
1129 | // successor that will be reached. |
1130 | if (BB == SwitchPaths->getSwitchBlock()) { |
1131 | SwitchInst *Switch = SwitchPaths->getSwitchInst(); |
1132 | BasicBlock *NextCase = getNextCaseSuccessor(Switch, NextState); |
1133 | BlocksToUpdate.push_back(x: NextCase); |
1134 | BasicBlock *ClonedSucc = getClonedBB(BB: NextCase, NextState, DuplicateMap); |
1135 | if (ClonedSucc) |
1136 | BlocksToUpdate.push_back(x: ClonedSucc); |
1137 | } |
1138 | // Otherwise update phis in all successors. |
1139 | else { |
1140 | for (BasicBlock *Succ : successors(BB)) { |
1141 | BlocksToUpdate.push_back(x: Succ); |
1142 | |
1143 | // Check if a successor has already been cloned for the particular exit |
1144 | // value. In this case if a successor was already cloned, the phi nodes |
1145 | // in the cloned block should be updated directly. |
1146 | BasicBlock *ClonedSucc = getClonedBB(BB: Succ, NextState, DuplicateMap); |
1147 | if (ClonedSucc) |
1148 | BlocksToUpdate.push_back(x: ClonedSucc); |
1149 | } |
1150 | } |
1151 | |
1152 | // If there is a phi with an incoming value from BB, create a new incoming |
1153 | // value for the new predecessor ClonedBB. The value will either be the same |
1154 | // value from BB or a cloned value. |
1155 | for (BasicBlock *Succ : BlocksToUpdate) { |
1156 | for (auto II = Succ->begin(); PHINode *Phi = dyn_cast<PHINode>(Val&: II); |
1157 | ++II) { |
1158 | Value *Incoming = Phi->getIncomingValueForBlock(BB); |
1159 | if (Incoming) { |
1160 | if (isa<Constant>(Val: Incoming)) { |
1161 | Phi->addIncoming(V: Incoming, BB: ClonedBB); |
1162 | continue; |
1163 | } |
1164 | Value *ClonedVal = VMap[Incoming]; |
1165 | if (ClonedVal) |
1166 | Phi->addIncoming(V: ClonedVal, BB: ClonedBB); |
1167 | else |
1168 | Phi->addIncoming(V: Incoming, BB: ClonedBB); |
1169 | } |
1170 | } |
1171 | } |
1172 | } |
1173 | |
1174 | /// Sets the successor of PrevBB to be NewBB instead of OldBB. Note that all |
1175 | /// other successors are kept as well. |
1176 | void updatePredecessor(BasicBlock *PrevBB, BasicBlock *OldBB, |
1177 | BasicBlock *NewBB, DomTreeUpdater *DTU) { |
1178 | // When a path is reused, there is a chance that predecessors were already |
1179 | // updated before. Check if the predecessor needs to be updated first. |
1180 | if (!isPredecessor(BB: OldBB, IncomingBB: PrevBB)) |
1181 | return; |
1182 | |
1183 | Instruction *PrevTerm = PrevBB->getTerminator(); |
1184 | for (unsigned Idx = 0; Idx < PrevTerm->getNumSuccessors(); Idx++) { |
1185 | if (PrevTerm->getSuccessor(Idx) == OldBB) { |
1186 | OldBB->removePredecessor(Pred: PrevBB, /* KeepOneInputPHIs = */ true); |
1187 | PrevTerm->setSuccessor(Idx, BB: NewBB); |
1188 | } |
1189 | } |
1190 | DTU->applyUpdates(Updates: {{DominatorTree::Delete, PrevBB, OldBB}, |
1191 | {DominatorTree::Insert, PrevBB, NewBB}}); |
1192 | } |
1193 | |
1194 | /// Add new value mappings to the DefMap to keep track of all new definitions |
1195 | /// for a particular instruction. These will be used while updating SSA form. |
1196 | void updateDefMap(DefMap &NewDefs, ValueToValueMapTy &VMap) { |
1197 | SmallVector<std::pair<Instruction *, Instruction *>> NewDefsVector; |
1198 | NewDefsVector.reserve(N: VMap.size()); |
1199 | |
1200 | for (auto Entry : VMap) { |
1201 | Instruction *Inst = |
1202 | dyn_cast<Instruction>(Val: const_cast<Value *>(Entry.first)); |
1203 | if (!Inst || !Entry.second || isa<BranchInst>(Val: Inst) || |
1204 | isa<SwitchInst>(Val: Inst)) { |
1205 | continue; |
1206 | } |
1207 | |
1208 | Instruction *Cloned = dyn_cast<Instruction>(Val&: Entry.second); |
1209 | if (!Cloned) |
1210 | continue; |
1211 | |
1212 | NewDefsVector.push_back(Elt: {Inst, Cloned}); |
1213 | } |
1214 | |
1215 | // Sort the defs to get deterministic insertion order into NewDefs. |
1216 | sort(C&: NewDefsVector, Comp: [](const auto &LHS, const auto &RHS) { |
1217 | if (LHS.first == RHS.first) |
1218 | return LHS.second->comesBefore(RHS.second); |
1219 | return LHS.first->comesBefore(RHS.first); |
1220 | }); |
1221 | |
1222 | for (const auto &KV : NewDefsVector) |
1223 | NewDefs[KV.first].push_back(x: KV.second); |
1224 | } |
1225 | |
1226 | /// Update the last branch of a particular cloned path to point to the correct |
1227 | /// case successor. |
1228 | /// |
1229 | /// Note that this is an optional step and would have been done in later |
1230 | /// optimizations, but it makes the CFG significantly easier to work with. |
1231 | void updateLastSuccessor(ThreadingPath &TPath, |
1232 | DuplicateBlockMap &DuplicateMap, |
1233 | DomTreeUpdater *DTU) { |
1234 | APInt NextState = TPath.getExitValue(); |
1235 | BasicBlock *BB = TPath.getPath().back(); |
1236 | BasicBlock *LastBlock = getClonedBB(BB, NextState, DuplicateMap); |
1237 | |
1238 | // Note multiple paths can end at the same block so check that it is not |
1239 | // updated yet |
1240 | if (!isa<SwitchInst>(Val: LastBlock->getTerminator())) |
1241 | return; |
1242 | SwitchInst *Switch = cast<SwitchInst>(Val: LastBlock->getTerminator()); |
1243 | BasicBlock *NextCase = getNextCaseSuccessor(Switch, NextState); |
1244 | |
1245 | std::vector<DominatorTree::UpdateType> DTUpdates; |
1246 | SmallPtrSet<BasicBlock *, 4> SuccSet; |
1247 | for (BasicBlock *Succ : successors(BB: LastBlock)) { |
1248 | if (Succ != NextCase && SuccSet.insert(Ptr: Succ).second) |
1249 | DTUpdates.push_back(x: {DominatorTree::Delete, LastBlock, Succ}); |
1250 | } |
1251 | |
1252 | Switch->eraseFromParent(); |
1253 | BranchInst::Create(IfTrue: NextCase, InsertBefore: LastBlock); |
1254 | |
1255 | DTU->applyUpdates(Updates: DTUpdates); |
1256 | } |
1257 | |
1258 | /// After cloning blocks, some of the phi nodes have extra incoming values |
1259 | /// that are no longer used. This function removes them. |
1260 | void cleanPhiNodes(BasicBlock *BB) { |
1261 | // If BB is no longer reachable, remove any remaining phi nodes |
1262 | if (pred_empty(BB)) { |
1263 | std::vector<PHINode *> PhiToRemove; |
1264 | for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(Val&: II); ++II) { |
1265 | PhiToRemove.push_back(x: Phi); |
1266 | } |
1267 | for (PHINode *PN : PhiToRemove) { |
1268 | PN->replaceAllUsesWith(V: PoisonValue::get(T: PN->getType())); |
1269 | PN->eraseFromParent(); |
1270 | } |
1271 | return; |
1272 | } |
1273 | |
1274 | // Remove any incoming values that come from an invalid predecessor |
1275 | for (auto II = BB->begin(); PHINode *Phi = dyn_cast<PHINode>(Val&: II); ++II) { |
1276 | std::vector<BasicBlock *> BlocksToRemove; |
1277 | for (BasicBlock *IncomingBB : Phi->blocks()) { |
1278 | if (!isPredecessor(BB, IncomingBB)) |
1279 | BlocksToRemove.push_back(x: IncomingBB); |
1280 | } |
1281 | for (BasicBlock *BB : BlocksToRemove) |
1282 | Phi->removeIncomingValue(BB); |
1283 | } |
1284 | } |
1285 | |
1286 | /// Checks if BB was already cloned for a particular next state value. If it |
1287 | /// was then it returns this cloned block, and otherwise null. |
1288 | BasicBlock *getClonedBB(BasicBlock *BB, const APInt &NextState, |
1289 | DuplicateBlockMap &DuplicateMap) { |
1290 | CloneList ClonedBBs = DuplicateMap[BB]; |
1291 | |
1292 | // Find an entry in the CloneList with this NextState. If it exists then |
1293 | // return the corresponding BB |
1294 | auto It = llvm::find_if(Range&: ClonedBBs, P: [NextState](const ClonedBlock &C) { |
1295 | return C.State == NextState; |
1296 | }); |
1297 | return It != ClonedBBs.end() ? (*It).BB : nullptr; |
1298 | } |
1299 | |
1300 | /// Helper to get the successor corresponding to a particular case value for |
1301 | /// a switch statement. |
1302 | BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, const APInt &NextState) { |
1303 | BasicBlock *NextCase = nullptr; |
1304 | for (auto Case : Switch->cases()) { |
1305 | if (Case.getCaseValue()->getValue() == NextState) { |
1306 | NextCase = Case.getCaseSuccessor(); |
1307 | break; |
1308 | } |
1309 | } |
1310 | if (!NextCase) |
1311 | NextCase = Switch->getDefaultDest(); |
1312 | return NextCase; |
1313 | } |
1314 | |
1315 | /// Returns true if IncomingBB is a predecessor of BB. |
1316 | bool isPredecessor(BasicBlock *BB, BasicBlock *IncomingBB) { |
1317 | return llvm::is_contained(Range: predecessors(BB), Element: IncomingBB); |
1318 | } |
1319 | |
1320 | AllSwitchPaths *SwitchPaths; |
1321 | DominatorTree *DT; |
1322 | AssumptionCache *AC; |
1323 | TargetTransformInfo *TTI; |
1324 | OptimizationRemarkEmitter *ORE; |
1325 | SmallPtrSet<const Value *, 32> EphValues; |
1326 | std::vector<ThreadingPath> TPaths; |
1327 | }; |
1328 | |
1329 | bool DFAJumpThreading::run(Function &F) { |
1330 | LLVM_DEBUG(dbgs() << "\nDFA Jump threading: " << F.getName() << "\n" ); |
1331 | |
1332 | if (F.hasOptSize()) { |
1333 | LLVM_DEBUG(dbgs() << "Skipping due to the 'minsize' attribute\n" ); |
1334 | return false; |
1335 | } |
1336 | |
1337 | if (ClViewCfgBefore) |
1338 | F.viewCFG(); |
1339 | |
1340 | SmallVector<AllSwitchPaths, 2> ThreadableLoops; |
1341 | bool MadeChanges = false; |
1342 | LoopInfoBroken = false; |
1343 | |
1344 | for (BasicBlock &BB : F) { |
1345 | auto *SI = dyn_cast<SwitchInst>(Val: BB.getTerminator()); |
1346 | if (!SI) |
1347 | continue; |
1348 | |
1349 | LLVM_DEBUG(dbgs() << "\nCheck if SwitchInst in BB " << BB.getName() |
1350 | << " is a candidate\n" ); |
1351 | MainSwitch Switch(SI, LI, ORE); |
1352 | |
1353 | if (!Switch.getInstr()) { |
1354 | LLVM_DEBUG(dbgs() << "\nSwitchInst in BB " << BB.getName() << " is not a " |
1355 | << "candidate for jump threading\n" ); |
1356 | continue; |
1357 | } |
1358 | |
1359 | LLVM_DEBUG(dbgs() << "\nSwitchInst in BB " << BB.getName() << " is a " |
1360 | << "candidate for jump threading\n" ); |
1361 | LLVM_DEBUG(SI->dump()); |
1362 | |
1363 | unfoldSelectInstrs(DT, SelectInsts: Switch.getSelectInsts()); |
1364 | if (!Switch.getSelectInsts().empty()) |
1365 | MadeChanges = true; |
1366 | |
1367 | AllSwitchPaths SwitchPaths(&Switch, ORE, LI, |
1368 | LI->getLoopFor(BB: &BB)->getOutermostLoop()); |
1369 | SwitchPaths.run(); |
1370 | |
1371 | if (SwitchPaths.getNumThreadingPaths() > 0) { |
1372 | ThreadableLoops.push_back(Elt: SwitchPaths); |
1373 | |
1374 | // For the time being limit this optimization to occurring once in a |
1375 | // function since it can change the CFG significantly. This is not a |
1376 | // strict requirement but it can cause buggy behavior if there is an |
1377 | // overlap of blocks in different opportunities. There is a lot of room to |
1378 | // experiment with catching more opportunities here. |
1379 | // NOTE: To release this contraint, we must handle LoopInfo invalidation |
1380 | break; |
1381 | } |
1382 | } |
1383 | |
1384 | #ifdef NDEBUG |
1385 | LI->verify(DomTree: *DT); |
1386 | #endif |
1387 | |
1388 | SmallPtrSet<const Value *, 32> EphValues; |
1389 | if (ThreadableLoops.size() > 0) |
1390 | CodeMetrics::collectEphemeralValues(L: &F, AC, EphValues); |
1391 | |
1392 | for (AllSwitchPaths SwitchPaths : ThreadableLoops) { |
1393 | TransformDFA Transform(&SwitchPaths, DT, AC, TTI, ORE, EphValues); |
1394 | Transform.run(); |
1395 | MadeChanges = true; |
1396 | LoopInfoBroken = true; |
1397 | } |
1398 | |
1399 | #ifdef EXPENSIVE_CHECKS |
1400 | assert(DT->verify(DominatorTree::VerificationLevel::Full)); |
1401 | verifyFunction(F, &dbgs()); |
1402 | #endif |
1403 | |
1404 | return MadeChanges; |
1405 | } |
1406 | |
1407 | } // end anonymous namespace |
1408 | |
1409 | /// Integrate with the new Pass Manager |
1410 | PreservedAnalyses DFAJumpThreadingPass::run(Function &F, |
1411 | FunctionAnalysisManager &AM) { |
1412 | AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(IR&: F); |
1413 | DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
1414 | LoopInfo &LI = AM.getResult<LoopAnalysis>(IR&: F); |
1415 | TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
1416 | OptimizationRemarkEmitter ORE(&F); |
1417 | DFAJumpThreading ThreadImpl(&AC, &DT, &LI, &TTI, &ORE); |
1418 | if (!ThreadImpl.run(F)) |
1419 | return PreservedAnalyses::all(); |
1420 | |
1421 | PreservedAnalyses PA; |
1422 | PA.preserve<DominatorTreeAnalysis>(); |
1423 | if (!ThreadImpl.LoopInfoBroken) |
1424 | PA.preserve<LoopAnalysis>(); |
1425 | return PA; |
1426 | } |
1427 | |