| 1 | //===- DFAJumpThreading.cpp - Threads a switch statement inside a loop ----===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // Transform each threading path to effectively jump thread the DFA. For |
| 10 | // example, the CFG below could be transformed as follows, where the cloned |
| 11 | // blocks unconditionally branch to the next correct case based on what is |
| 12 | // identified in the analysis. |
| 13 | // |
| 14 | // sw.bb sw.bb |
| 15 | // / | \ / | \ |
| 16 | // case1 case2 case3 case1 case2 case3 |
| 17 | // \ | / | | | |
| 18 | // determinator det.2 det.3 det.1 |
| 19 | // br sw.bb / | \ |
| 20 | // sw.bb.2 sw.bb.3 sw.bb.1 |
| 21 | // br case2 br case3 br case1ยง |
| 22 | // |
| 23 | // Definitions and Terminology: |
| 24 | // |
| 25 | // * Threading path: |
| 26 | // a list of basic blocks, the exit state, and the block that determines |
| 27 | // the next state, for which the following notation will be used: |
| 28 | // < path of BBs that form a cycle > [ state, determinator ] |
| 29 | // |
| 30 | // * Predictable switch: |
| 31 | // The switch variable is always a known constant so that all conditional |
| 32 | // jumps based on switch variable can be converted to unconditional jump. |
| 33 | // |
| 34 | // * Determinator: |
| 35 | // The basic block that determines the next state of the DFA. |
| 36 | // |
| 37 | // Representing the optimization in C-like pseudocode: the code pattern on the |
| 38 | // left could functionally be transformed to the right pattern if the switch |
| 39 | // condition is predictable. |
| 40 | // |
| 41 | // X = A goto A |
| 42 | // for (...) A: |
| 43 | // switch (X) ... |
| 44 | // case A goto B |
| 45 | // X = B B: |
| 46 | // case B ... |
| 47 | // X = C goto C |
| 48 | // |
| 49 | // The pass first checks that switch variable X is decided by the control flow |
| 50 | // path taken in the loop; for example, in case B, the next value of X is |
| 51 | // decided to be C. It then enumerates through all paths in the loop and labels |
| 52 | // the basic blocks where the next state is decided. |
| 53 | // |
| 54 | // Using this information it creates new paths that unconditionally branch to |
| 55 | // the next case. This involves cloning code, so it only gets triggered if the |
| 56 | // amount of code duplicated is below a threshold. |
| 57 | // |
| 58 | //===----------------------------------------------------------------------===// |
| 59 | |
| 60 | #include "llvm/Transforms/Scalar/DFAJumpThreading.h" |
| 61 | #include "llvm/ADT/APInt.h" |
| 62 | #include "llvm/ADT/DenseMap.h" |
| 63 | #include "llvm/ADT/Statistic.h" |
| 64 | #include "llvm/ADT/StringExtras.h" |
| 65 | #include "llvm/Analysis/AssumptionCache.h" |
| 66 | #include "llvm/Analysis/CodeMetrics.h" |
| 67 | #include "llvm/Analysis/DomTreeUpdater.h" |
| 68 | #include "llvm/Analysis/LoopInfo.h" |
| 69 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| 70 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 71 | #include "llvm/IR/CFG.h" |
| 72 | #include "llvm/IR/Constants.h" |
| 73 | #include "llvm/IR/IntrinsicInst.h" |
| 74 | #include "llvm/Support/CommandLine.h" |
| 75 | #include "llvm/Support/Debug.h" |
| 76 | #include "llvm/Transforms/Utils/Cloning.h" |
| 77 | #include "llvm/Transforms/Utils/SSAUpdaterBulk.h" |
| 78 | #include "llvm/Transforms/Utils/ValueMapper.h" |
| 79 | #include <deque> |
| 80 | |
| 81 | #ifdef EXPENSIVE_CHECKS |
| 82 | #include "llvm/IR/Verifier.h" |
| 83 | #endif |
| 84 | |
| 85 | using namespace llvm; |
| 86 | |
| 87 | #define DEBUG_TYPE "dfa-jump-threading" |
| 88 | |
| 89 | STATISTIC(NumTransforms, "Number of transformations done" ); |
| 90 | STATISTIC(NumCloned, "Number of blocks cloned" ); |
| 91 | STATISTIC(NumPaths, "Number of individual paths threaded" ); |
| 92 | |
| 93 | namespace llvm { |
| 94 | static cl::opt<bool> |
| 95 | ClViewCfgBefore("dfa-jump-view-cfg-before" , |
| 96 | cl::desc("View the CFG before DFA Jump Threading" ), |
| 97 | cl::Hidden, cl::init(Val: false)); |
| 98 | |
| 99 | static cl::opt<bool> EarlyExitHeuristic( |
| 100 | "dfa-early-exit-heuristic" , |
| 101 | cl::desc("Exit early if an unpredictable value come from the same loop" ), |
| 102 | cl::Hidden, cl::init(Val: true)); |
| 103 | |
| 104 | static cl::opt<unsigned> MaxPathLength( |
| 105 | "dfa-max-path-length" , |
| 106 | cl::desc("Max number of blocks searched to find a threading path" ), |
| 107 | cl::Hidden, cl::init(Val: 20)); |
| 108 | |
| 109 | static cl::opt<unsigned> MaxNumVisitiedPaths( |
| 110 | "dfa-max-num-visited-paths" , |
| 111 | cl::desc( |
| 112 | "Max number of blocks visited while enumerating paths around a switch" ), |
| 113 | cl::Hidden, cl::init(Val: 2500)); |
| 114 | |
| 115 | static cl::opt<unsigned> |
| 116 | MaxNumPaths("dfa-max-num-paths" , |
| 117 | cl::desc("Max number of paths enumerated around a switch" ), |
| 118 | cl::Hidden, cl::init(Val: 200)); |
| 119 | |
| 120 | static cl::opt<unsigned> |
| 121 | CostThreshold("dfa-cost-threshold" , |
| 122 | cl::desc("Maximum cost accepted for the transformation" ), |
| 123 | cl::Hidden, cl::init(Val: 50)); |
| 124 | |
| 125 | static cl::opt<double> MaxClonedRate( |
| 126 | "dfa-max-cloned-rate" , |
| 127 | cl::desc( |
| 128 | "Maximum cloned instructions rate accepted for the transformation" ), |
| 129 | cl::Hidden, cl::init(Val: 7.5)); |
| 130 | |
| 131 | static cl::opt<unsigned> |
| 132 | MaxOuterUseBlocks("dfa-max-out-use-blocks" , |
| 133 | cl::desc("Maximum unduplicated blocks with outer uses " |
| 134 | "accepted for the transformation" ), |
| 135 | cl::Hidden, cl::init(Val: 40)); |
| 136 | |
| 137 | extern cl::opt<bool> ProfcheckDisableMetadataFixes; |
| 138 | |
| 139 | } // namespace llvm |
| 140 | |
| 141 | namespace { |
| 142 | class SelectInstToUnfold { |
| 143 | SelectInst *SI; |
| 144 | PHINode *SIUse; |
| 145 | |
| 146 | public: |
| 147 | SelectInstToUnfold(SelectInst *SI, PHINode *SIUse) : SI(SI), SIUse(SIUse) {} |
| 148 | |
| 149 | SelectInst *getInst() { return SI; } |
| 150 | PHINode *getUse() { return SIUse; } |
| 151 | |
| 152 | explicit operator bool() const { return SI && SIUse; } |
| 153 | }; |
| 154 | |
| 155 | class DFAJumpThreading { |
| 156 | public: |
| 157 | (AssumptionCache *AC, DomTreeUpdater *DTU, LoopInfo *LI, |
| 158 | TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) |
| 159 | : AC(AC), DTU(DTU), LI(LI), TTI(TTI), ORE(ORE) {} |
| 160 | |
| 161 | bool run(Function &F); |
| 162 | bool LoopInfoBroken; |
| 163 | |
| 164 | private: |
| 165 | void |
| 166 | unfoldSelectInstrs(const SmallVector<SelectInstToUnfold, 4> &SelectInsts) { |
| 167 | SmallVector<SelectInstToUnfold, 4> Stack(SelectInsts); |
| 168 | |
| 169 | while (!Stack.empty()) { |
| 170 | SelectInstToUnfold SIToUnfold = Stack.pop_back_val(); |
| 171 | |
| 172 | std::vector<SelectInstToUnfold> NewSIsToUnfold; |
| 173 | std::vector<BasicBlock *> NewBBs; |
| 174 | unfold(DTU, LI, SIToUnfold, NewSIsToUnfold: &NewSIsToUnfold, NewBBs: &NewBBs); |
| 175 | |
| 176 | // Put newly discovered select instructions into the work list. |
| 177 | llvm::append_range(C&: Stack, R&: NewSIsToUnfold); |
| 178 | } |
| 179 | } |
| 180 | |
| 181 | static void unfold(DomTreeUpdater *DTU, LoopInfo *LI, |
| 182 | SelectInstToUnfold SIToUnfold, |
| 183 | std::vector<SelectInstToUnfold> *NewSIsToUnfold, |
| 184 | std::vector<BasicBlock *> *NewBBs); |
| 185 | |
| 186 | AssumptionCache *AC; |
| 187 | DomTreeUpdater *DTU; |
| 188 | LoopInfo *LI; |
| 189 | TargetTransformInfo *TTI; |
| 190 | OptimizationRemarkEmitter *ORE; |
| 191 | }; |
| 192 | } // namespace |
| 193 | |
| 194 | /// Unfold the select instruction held in \p SIToUnfold by replacing it with |
| 195 | /// control flow. |
| 196 | /// |
| 197 | /// Put newly discovered select instructions into \p NewSIsToUnfold. Put newly |
| 198 | /// created basic blocks into \p NewBBs. |
| 199 | /// |
| 200 | /// TODO: merge it with CodeGenPrepare::optimizeSelectInst() if possible. |
| 201 | void DFAJumpThreading::unfold(DomTreeUpdater *DTU, LoopInfo *LI, |
| 202 | SelectInstToUnfold SIToUnfold, |
| 203 | std::vector<SelectInstToUnfold> *NewSIsToUnfold, |
| 204 | std::vector<BasicBlock *> *NewBBs) { |
| 205 | SelectInst *SI = SIToUnfold.getInst(); |
| 206 | PHINode *SIUse = SIToUnfold.getUse(); |
| 207 | assert(SI->hasOneUse()); |
| 208 | // The select may come indirectly, instead of from where it is defined. |
| 209 | BasicBlock *StartBlock = SIUse->getIncomingBlock(U: *SI->use_begin()); |
| 210 | BranchInst *StartBlockTerm = |
| 211 | dyn_cast<BranchInst>(Val: StartBlock->getTerminator()); |
| 212 | assert(StartBlockTerm); |
| 213 | |
| 214 | if (StartBlockTerm->isUnconditional()) { |
| 215 | BasicBlock *EndBlock = StartBlock->getUniqueSuccessor(); |
| 216 | // Arbitrarily choose the 'false' side for a new input value to the PHI. |
| 217 | BasicBlock *NewBlock = BasicBlock::Create( |
| 218 | Context&: SI->getContext(), Name: Twine(SI->getName(), ".si.unfold.false" ), |
| 219 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
| 220 | NewBBs->push_back(x: NewBlock); |
| 221 | BranchInst::Create(IfTrue: EndBlock, InsertBefore: NewBlock); |
| 222 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, NewBlock, EndBlock}}); |
| 223 | |
| 224 | // StartBlock |
| 225 | // | \ |
| 226 | // | NewBlock |
| 227 | // | / |
| 228 | // EndBlock |
| 229 | Value *SIOp1 = SI->getTrueValue(); |
| 230 | Value *SIOp2 = SI->getFalseValue(); |
| 231 | |
| 232 | PHINode *NewPhi = PHINode::Create(Ty: SIUse->getType(), NumReservedValues: 1, |
| 233 | NameStr: Twine(SIOp2->getName(), ".si.unfold.phi" ), |
| 234 | InsertBefore: NewBlock->getFirstInsertionPt()); |
| 235 | NewPhi->addIncoming(V: SIOp2, BB: StartBlock); |
| 236 | |
| 237 | // Update any other PHI nodes in EndBlock. |
| 238 | for (PHINode &Phi : EndBlock->phis()) { |
| 239 | if (SIUse == &Phi) |
| 240 | continue; |
| 241 | Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: StartBlock), BB: NewBlock); |
| 242 | } |
| 243 | |
| 244 | // Update the phi node of SI, which is its only use. |
| 245 | if (EndBlock == SIUse->getParent()) { |
| 246 | SIUse->addIncoming(V: NewPhi, BB: NewBlock); |
| 247 | SIUse->replaceUsesOfWith(From: SI, To: SIOp1); |
| 248 | } else { |
| 249 | PHINode *EndPhi = PHINode::Create(Ty: SIUse->getType(), NumReservedValues: pred_size(BB: EndBlock), |
| 250 | NameStr: Twine(SI->getName(), ".si.unfold.phi" ), |
| 251 | InsertBefore: EndBlock->getFirstInsertionPt()); |
| 252 | for (BasicBlock *Pred : predecessors(BB: EndBlock)) { |
| 253 | if (Pred != StartBlock && Pred != NewBlock) |
| 254 | EndPhi->addIncoming(V: EndPhi, BB: Pred); |
| 255 | } |
| 256 | |
| 257 | EndPhi->addIncoming(V: SIOp1, BB: StartBlock); |
| 258 | EndPhi->addIncoming(V: NewPhi, BB: NewBlock); |
| 259 | SIUse->replaceUsesOfWith(From: SI, To: EndPhi); |
| 260 | SIUse = EndPhi; |
| 261 | } |
| 262 | |
| 263 | if (auto *OpSi = dyn_cast<SelectInst>(Val: SIOp1)) |
| 264 | NewSIsToUnfold->push_back(x: SelectInstToUnfold(OpSi, SIUse)); |
| 265 | if (auto *OpSi = dyn_cast<SelectInst>(Val: SIOp2)) |
| 266 | NewSIsToUnfold->push_back(x: SelectInstToUnfold(OpSi, NewPhi)); |
| 267 | |
| 268 | // Insert the real conditional branch based on the original condition. |
| 269 | StartBlockTerm->eraseFromParent(); |
| 270 | auto *BI = |
| 271 | BranchInst::Create(IfTrue: EndBlock, IfFalse: NewBlock, Cond: SI->getCondition(), InsertBefore: StartBlock); |
| 272 | if (!ProfcheckDisableMetadataFixes) |
| 273 | BI->setMetadata(KindID: LLVMContext::MD_prof, |
| 274 | Node: SI->getMetadata(KindID: LLVMContext::MD_prof)); |
| 275 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, StartBlock, NewBlock}}); |
| 276 | } else { |
| 277 | BasicBlock *EndBlock = SIUse->getParent(); |
| 278 | BasicBlock *NewBlockT = BasicBlock::Create( |
| 279 | Context&: SI->getContext(), Name: Twine(SI->getName(), ".si.unfold.true" ), |
| 280 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
| 281 | BasicBlock *NewBlockF = BasicBlock::Create( |
| 282 | Context&: SI->getContext(), Name: Twine(SI->getName(), ".si.unfold.false" ), |
| 283 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
| 284 | |
| 285 | NewBBs->push_back(x: NewBlockT); |
| 286 | NewBBs->push_back(x: NewBlockF); |
| 287 | |
| 288 | // Def only has one use in EndBlock. |
| 289 | // Before transformation: |
| 290 | // StartBlock(Def) |
| 291 | // | \ |
| 292 | // EndBlock OtherBlock |
| 293 | // (Use) |
| 294 | // |
| 295 | // After transformation: |
| 296 | // StartBlock(Def) |
| 297 | // | \ |
| 298 | // | OtherBlock |
| 299 | // NewBlockT |
| 300 | // | \ |
| 301 | // | NewBlockF |
| 302 | // | / |
| 303 | // | / |
| 304 | // EndBlock |
| 305 | // (Use) |
| 306 | BranchInst::Create(IfTrue: EndBlock, InsertBefore: NewBlockF); |
| 307 | // Insert the real conditional branch based on the original condition. |
| 308 | auto *BI = |
| 309 | BranchInst::Create(IfTrue: EndBlock, IfFalse: NewBlockF, Cond: SI->getCondition(), InsertBefore: NewBlockT); |
| 310 | if (!ProfcheckDisableMetadataFixes) |
| 311 | BI->setMetadata(KindID: LLVMContext::MD_prof, |
| 312 | Node: SI->getMetadata(KindID: LLVMContext::MD_prof)); |
| 313 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, NewBlockT, NewBlockF}, |
| 314 | {DominatorTree::Insert, NewBlockT, EndBlock}, |
| 315 | {DominatorTree::Insert, NewBlockF, EndBlock}}); |
| 316 | |
| 317 | Value *TrueVal = SI->getTrueValue(); |
| 318 | Value *FalseVal = SI->getFalseValue(); |
| 319 | |
| 320 | PHINode *NewPhiT = PHINode::Create( |
| 321 | Ty: SIUse->getType(), NumReservedValues: 1, NameStr: Twine(TrueVal->getName(), ".si.unfold.phi" ), |
| 322 | InsertBefore: NewBlockT->getFirstInsertionPt()); |
| 323 | PHINode *NewPhiF = PHINode::Create( |
| 324 | Ty: SIUse->getType(), NumReservedValues: 1, NameStr: Twine(FalseVal->getName(), ".si.unfold.phi" ), |
| 325 | InsertBefore: NewBlockF->getFirstInsertionPt()); |
| 326 | NewPhiT->addIncoming(V: TrueVal, BB: StartBlock); |
| 327 | NewPhiF->addIncoming(V: FalseVal, BB: NewBlockT); |
| 328 | |
| 329 | if (auto *TrueSI = dyn_cast<SelectInst>(Val: TrueVal)) |
| 330 | NewSIsToUnfold->push_back(x: SelectInstToUnfold(TrueSI, NewPhiT)); |
| 331 | if (auto *FalseSi = dyn_cast<SelectInst>(Val: FalseVal)) |
| 332 | NewSIsToUnfold->push_back(x: SelectInstToUnfold(FalseSi, NewPhiF)); |
| 333 | |
| 334 | SIUse->addIncoming(V: NewPhiT, BB: NewBlockT); |
| 335 | SIUse->addIncoming(V: NewPhiF, BB: NewBlockF); |
| 336 | SIUse->removeIncomingValue(BB: StartBlock); |
| 337 | |
| 338 | // Update any other PHI nodes in EndBlock. |
| 339 | for (PHINode &Phi : EndBlock->phis()) { |
| 340 | if (SIUse == &Phi) |
| 341 | continue; |
| 342 | Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: StartBlock), BB: NewBlockT); |
| 343 | Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: StartBlock), BB: NewBlockF); |
| 344 | Phi.removeIncomingValue(BB: StartBlock); |
| 345 | } |
| 346 | |
| 347 | // Update the appropriate successor of the start block to point to the new |
| 348 | // unfolded block. |
| 349 | unsigned SuccNum = StartBlockTerm->getSuccessor(i: 1) == EndBlock ? 1 : 0; |
| 350 | StartBlockTerm->setSuccessor(idx: SuccNum, NewSucc: NewBlockT); |
| 351 | DTU->applyUpdates(Updates: {{DominatorTree::Delete, StartBlock, EndBlock}, |
| 352 | {DominatorTree::Insert, StartBlock, NewBlockT}}); |
| 353 | } |
| 354 | |
| 355 | // Preserve loop info |
| 356 | if (Loop *L = LI->getLoopFor(BB: StartBlock)) { |
| 357 | for (BasicBlock *NewBB : *NewBBs) |
| 358 | L->addBasicBlockToLoop(NewBB, LI&: *LI); |
| 359 | } |
| 360 | |
| 361 | // The select is now dead. |
| 362 | assert(SI->use_empty() && "Select must be dead now" ); |
| 363 | SI->eraseFromParent(); |
| 364 | } |
| 365 | |
| 366 | namespace { |
| 367 | struct ClonedBlock { |
| 368 | BasicBlock *BB; |
| 369 | APInt State; ///< \p State corresponds to the next value of a switch stmnt. |
| 370 | }; |
| 371 | } // namespace |
| 372 | |
| 373 | typedef std::deque<BasicBlock *> PathType; |
| 374 | typedef std::vector<PathType> PathsType; |
| 375 | typedef SmallPtrSet<const BasicBlock *, 8> VisitedBlocks; |
| 376 | typedef std::vector<ClonedBlock> CloneList; |
| 377 | |
| 378 | // This data structure keeps track of all blocks that have been cloned. If two |
| 379 | // different ThreadingPaths clone the same block for a certain state it should |
| 380 | // be reused, and it can be looked up in this map. |
| 381 | typedef DenseMap<BasicBlock *, CloneList> DuplicateBlockMap; |
| 382 | |
| 383 | // This map keeps track of all the new definitions for an instruction. This |
| 384 | // information is needed when restoring SSA form after cloning blocks. |
| 385 | typedef MapVector<Instruction *, std::vector<Instruction *>> DefMap; |
| 386 | |
| 387 | inline raw_ostream &operator<<(raw_ostream &OS, const PathType &Path) { |
| 388 | auto BBNames = llvm::map_range( |
| 389 | C: Path, F: [](const BasicBlock *BB) { return BB->getNameOrAsOperand(); }); |
| 390 | OS << "< " << llvm::join(R&: BBNames, Separator: ", " ) << " >" ; |
| 391 | return OS; |
| 392 | } |
| 393 | |
| 394 | namespace { |
| 395 | /// ThreadingPath is a path in the control flow of a loop that can be threaded |
| 396 | /// by cloning necessary basic blocks and replacing conditional branches with |
| 397 | /// unconditional ones. A threading path includes a list of basic blocks, the |
| 398 | /// exit state, and the block that determines the next state. |
| 399 | struct ThreadingPath { |
| 400 | /// Exit value is DFA's exit state for the given path. |
| 401 | APInt getExitValue() const { return ExitVal; } |
| 402 | void setExitValue(const ConstantInt *V) { |
| 403 | ExitVal = V->getValue(); |
| 404 | IsExitValSet = true; |
| 405 | } |
| 406 | void setExitValue(const APInt &V) { |
| 407 | ExitVal = V; |
| 408 | IsExitValSet = true; |
| 409 | } |
| 410 | bool isExitValueSet() const { return IsExitValSet; } |
| 411 | |
| 412 | /// Determinator is the basic block that determines the next state of the DFA. |
| 413 | const BasicBlock *getDeterminatorBB() const { return DBB; } |
| 414 | void setDeterminator(const BasicBlock *BB) { DBB = BB; } |
| 415 | |
| 416 | /// Path is a list of basic blocks. |
| 417 | const PathType &getPath() const { return Path; } |
| 418 | void setPath(const PathType &NewPath) { Path = NewPath; } |
| 419 | void push_back(BasicBlock *BB) { Path.push_back(x: BB); } |
| 420 | void push_front(BasicBlock *BB) { Path.push_front(x: BB); } |
| 421 | void appendExcludingFirst(const PathType &OtherPath) { |
| 422 | llvm::append_range(C&: Path, R: llvm::drop_begin(RangeOrContainer: OtherPath)); |
| 423 | } |
| 424 | |
| 425 | void print(raw_ostream &OS) const { |
| 426 | OS << Path << " [ " << ExitVal << ", " << DBB->getNameOrAsOperand() << " ]" ; |
| 427 | } |
| 428 | |
| 429 | private: |
| 430 | PathType Path; |
| 431 | APInt ExitVal; |
| 432 | const BasicBlock *DBB = nullptr; |
| 433 | bool IsExitValSet = false; |
| 434 | }; |
| 435 | |
| 436 | #ifndef NDEBUG |
| 437 | inline raw_ostream &operator<<(raw_ostream &OS, const ThreadingPath &TPath) { |
| 438 | TPath.print(OS); |
| 439 | return OS; |
| 440 | } |
| 441 | #endif |
| 442 | |
| 443 | struct MainSwitch { |
| 444 | MainSwitch(SwitchInst *SI, LoopInfo *LI, OptimizationRemarkEmitter *ORE) |
| 445 | : LI(LI) { |
| 446 | if (isCandidate(SI)) { |
| 447 | Instr = SI; |
| 448 | } else { |
| 449 | ORE->emit(RemarkBuilder: [&]() { |
| 450 | return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable" , SI) |
| 451 | << "Switch instruction is not predictable." ; |
| 452 | }); |
| 453 | } |
| 454 | } |
| 455 | |
| 456 | virtual ~MainSwitch() = default; |
| 457 | |
| 458 | SwitchInst *getInstr() const { return Instr; } |
| 459 | const SmallVector<SelectInstToUnfold, 4> getSelectInsts() { |
| 460 | return SelectInsts; |
| 461 | } |
| 462 | |
| 463 | private: |
| 464 | /// Do a use-def chain traversal starting from the switch condition to see if |
| 465 | /// \p SI is a potential condidate. |
| 466 | /// |
| 467 | /// Also, collect select instructions to unfold. |
| 468 | bool isCandidate(const SwitchInst *SI) { |
| 469 | std::deque<std::pair<Value *, BasicBlock *>> Q; |
| 470 | SmallPtrSet<Value *, 16> SeenValues; |
| 471 | SelectInsts.clear(); |
| 472 | |
| 473 | Value *SICond = SI->getCondition(); |
| 474 | LLVM_DEBUG(dbgs() << "\tSICond: " << *SICond << "\n" ); |
| 475 | if (!isa<PHINode>(Val: SICond)) |
| 476 | return false; |
| 477 | |
| 478 | // The switch must be in a loop. |
| 479 | const Loop *L = LI->getLoopFor(BB: SI->getParent()); |
| 480 | if (!L) |
| 481 | return false; |
| 482 | |
| 483 | addToQueue(Val: SICond, BB: nullptr, Q, SeenValues); |
| 484 | |
| 485 | while (!Q.empty()) { |
| 486 | Value *Current = Q.front().first; |
| 487 | BasicBlock *CurrentIncomingBB = Q.front().second; |
| 488 | Q.pop_front(); |
| 489 | |
| 490 | if (auto *Phi = dyn_cast<PHINode>(Val: Current)) { |
| 491 | for (BasicBlock *IncomingBB : Phi->blocks()) { |
| 492 | Value *Incoming = Phi->getIncomingValueForBlock(BB: IncomingBB); |
| 493 | addToQueue(Val: Incoming, BB: IncomingBB, Q, SeenValues); |
| 494 | } |
| 495 | LLVM_DEBUG(dbgs() << "\tphi: " << *Phi << "\n" ); |
| 496 | } else if (SelectInst *SelI = dyn_cast<SelectInst>(Val: Current)) { |
| 497 | if (!isValidSelectInst(SI: SelI)) |
| 498 | return false; |
| 499 | addToQueue(Val: SelI->getTrueValue(), BB: CurrentIncomingBB, Q, SeenValues); |
| 500 | addToQueue(Val: SelI->getFalseValue(), BB: CurrentIncomingBB, Q, SeenValues); |
| 501 | LLVM_DEBUG(dbgs() << "\tselect: " << *SelI << "\n" ); |
| 502 | if (auto *SelIUse = dyn_cast<PHINode>(Val: SelI->user_back())) |
| 503 | SelectInsts.push_back(Elt: SelectInstToUnfold(SelI, SelIUse)); |
| 504 | } else if (isa<Constant>(Val: Current)) { |
| 505 | LLVM_DEBUG(dbgs() << "\tconst: " << *Current << "\n" ); |
| 506 | continue; |
| 507 | } else { |
| 508 | LLVM_DEBUG(dbgs() << "\tother: " << *Current << "\n" ); |
| 509 | // Allow unpredictable values. The hope is that those will be the |
| 510 | // initial switch values that can be ignored (they will hit the |
| 511 | // unthreaded switch) but this assumption will get checked later after |
| 512 | // paths have been enumerated (in function getStateDefMap). |
| 513 | |
| 514 | // If the unpredictable value comes from the same inner loop it is |
| 515 | // likely that it will also be on the enumerated paths, causing us to |
| 516 | // exit after we have enumerated all the paths. This heuristic save |
| 517 | // compile time because a search for all the paths can become expensive. |
| 518 | if (EarlyExitHeuristic && |
| 519 | L->contains(L: LI->getLoopFor(BB: CurrentIncomingBB))) { |
| 520 | LLVM_DEBUG(dbgs() |
| 521 | << "\tExiting early due to unpredictability heuristic.\n" ); |
| 522 | return false; |
| 523 | } |
| 524 | |
| 525 | continue; |
| 526 | } |
| 527 | } |
| 528 | |
| 529 | return true; |
| 530 | } |
| 531 | |
| 532 | void addToQueue(Value *Val, BasicBlock *BB, |
| 533 | std::deque<std::pair<Value *, BasicBlock *>> &Q, |
| 534 | SmallPtrSet<Value *, 16> &SeenValues) { |
| 535 | if (SeenValues.insert(Ptr: Val).second) |
| 536 | Q.push_back(x: {Val, BB}); |
| 537 | } |
| 538 | |
| 539 | bool isValidSelectInst(SelectInst *SI) { |
| 540 | if (!SI->hasOneUse()) |
| 541 | return false; |
| 542 | |
| 543 | Instruction *SIUse = dyn_cast<Instruction>(Val: SI->user_back()); |
| 544 | // The use of the select inst should be either a phi or another select. |
| 545 | if (!SIUse || !(isa<PHINode>(Val: SIUse) || isa<SelectInst>(Val: SIUse))) |
| 546 | return false; |
| 547 | |
| 548 | BasicBlock *SIBB = SI->getParent(); |
| 549 | |
| 550 | // Currently, we can only expand select instructions in basic blocks with |
| 551 | // one successor. |
| 552 | BranchInst *SITerm = dyn_cast<BranchInst>(Val: SIBB->getTerminator()); |
| 553 | if (!SITerm || !SITerm->isUnconditional()) |
| 554 | return false; |
| 555 | |
| 556 | // Only fold the select coming from directly where it is defined. |
| 557 | // TODO: We have dealt with the select coming indirectly now. This |
| 558 | // constraint can be relaxed. |
| 559 | PHINode *PHIUser = dyn_cast<PHINode>(Val: SIUse); |
| 560 | if (PHIUser && PHIUser->getIncomingBlock(U: *SI->use_begin()) != SIBB) |
| 561 | return false; |
| 562 | |
| 563 | // If select will not be sunk during unfolding, and it is in the same basic |
| 564 | // block as another state defining select, then cannot unfold both. |
| 565 | for (SelectInstToUnfold SIToUnfold : SelectInsts) { |
| 566 | SelectInst *PrevSI = SIToUnfold.getInst(); |
| 567 | if (PrevSI->getTrueValue() != SI && PrevSI->getFalseValue() != SI && |
| 568 | PrevSI->getParent() == SI->getParent()) |
| 569 | return false; |
| 570 | } |
| 571 | |
| 572 | return true; |
| 573 | } |
| 574 | |
| 575 | LoopInfo *LI; |
| 576 | SwitchInst *Instr = nullptr; |
| 577 | SmallVector<SelectInstToUnfold, 4> SelectInsts; |
| 578 | }; |
| 579 | |
| 580 | struct AllSwitchPaths { |
| 581 | AllSwitchPaths(const MainSwitch *MSwitch, OptimizationRemarkEmitter *ORE, |
| 582 | LoopInfo *LI, Loop *L) |
| 583 | : Switch(MSwitch->getInstr()), SwitchBlock(Switch->getParent()), ORE(ORE), |
| 584 | LI(LI), SwitchOuterLoop(L) {} |
| 585 | |
| 586 | std::vector<ThreadingPath> &getThreadingPaths() { return TPaths; } |
| 587 | unsigned getNumThreadingPaths() { return TPaths.size(); } |
| 588 | SwitchInst *getSwitchInst() { return Switch; } |
| 589 | BasicBlock *getSwitchBlock() { return SwitchBlock; } |
| 590 | |
| 591 | void run() { |
| 592 | findTPaths(); |
| 593 | unifyTPaths(); |
| 594 | } |
| 595 | |
| 596 | private: |
| 597 | // Value: an instruction that defines a switch state; |
| 598 | // Key: the parent basic block of that instruction. |
| 599 | typedef DenseMap<const BasicBlock *, const PHINode *> StateDefMap; |
| 600 | std::vector<ThreadingPath> getPathsFromStateDefMap(StateDefMap &StateDef, |
| 601 | PHINode *Phi, |
| 602 | VisitedBlocks &VB, |
| 603 | unsigned PathsLimit) { |
| 604 | std::vector<ThreadingPath> Res; |
| 605 | auto *PhiBB = Phi->getParent(); |
| 606 | VB.insert(Ptr: PhiBB); |
| 607 | |
| 608 | VisitedBlocks UniqueBlocks; |
| 609 | for (auto *IncomingBB : Phi->blocks()) { |
| 610 | if (Res.size() >= PathsLimit) |
| 611 | break; |
| 612 | if (!UniqueBlocks.insert(Ptr: IncomingBB).second) |
| 613 | continue; |
| 614 | if (!SwitchOuterLoop->contains(BB: IncomingBB)) |
| 615 | continue; |
| 616 | |
| 617 | Value *IncomingValue = Phi->getIncomingValueForBlock(BB: IncomingBB); |
| 618 | // We found the determinator. This is the start of our path. |
| 619 | if (auto *C = dyn_cast<ConstantInt>(Val: IncomingValue)) { |
| 620 | // SwitchBlock is the determinator, unsupported unless its also the def. |
| 621 | if (PhiBB == SwitchBlock && |
| 622 | SwitchBlock != cast<PHINode>(Val: Switch->getOperand(i_nocapture: 0))->getParent()) |
| 623 | continue; |
| 624 | ThreadingPath NewPath; |
| 625 | NewPath.setDeterminator(PhiBB); |
| 626 | NewPath.setExitValue(C); |
| 627 | // Don't add SwitchBlock at the start, this is handled later. |
| 628 | if (IncomingBB != SwitchBlock) { |
| 629 | // Don't add a cycle to the path. |
| 630 | if (VB.contains(Ptr: IncomingBB)) |
| 631 | continue; |
| 632 | NewPath.push_back(BB: IncomingBB); |
| 633 | } |
| 634 | NewPath.push_back(BB: PhiBB); |
| 635 | Res.push_back(x: NewPath); |
| 636 | continue; |
| 637 | } |
| 638 | // Don't get into a cycle. |
| 639 | if (VB.contains(Ptr: IncomingBB) || IncomingBB == SwitchBlock) |
| 640 | continue; |
| 641 | // Recurse up the PHI chain. |
| 642 | auto *IncomingPhi = dyn_cast<PHINode>(Val: IncomingValue); |
| 643 | if (!IncomingPhi) |
| 644 | continue; |
| 645 | auto *IncomingPhiDefBB = IncomingPhi->getParent(); |
| 646 | if (!StateDef.contains(Val: IncomingPhiDefBB)) |
| 647 | continue; |
| 648 | |
| 649 | // Direct predecessor, just add to the path. |
| 650 | if (IncomingPhiDefBB == IncomingBB) { |
| 651 | assert(PathsLimit > Res.size()); |
| 652 | std::vector<ThreadingPath> PredPaths = getPathsFromStateDefMap( |
| 653 | StateDef, Phi: IncomingPhi, VB, PathsLimit: PathsLimit - Res.size()); |
| 654 | for (ThreadingPath &Path : PredPaths) { |
| 655 | Path.push_back(BB: PhiBB); |
| 656 | Res.push_back(x: std::move(Path)); |
| 657 | } |
| 658 | continue; |
| 659 | } |
| 660 | // Not a direct predecessor, find intermediate paths to append to the |
| 661 | // existing path. |
| 662 | if (VB.contains(Ptr: IncomingPhiDefBB)) |
| 663 | continue; |
| 664 | |
| 665 | PathsType IntermediatePaths; |
| 666 | assert(PathsLimit > Res.size()); |
| 667 | auto InterPathLimit = PathsLimit - Res.size(); |
| 668 | IntermediatePaths = paths(BB: IncomingPhiDefBB, ToBB: IncomingBB, Visited&: VB, |
| 669 | /* PathDepth = */ 1, PathsLimit: InterPathLimit); |
| 670 | if (IntermediatePaths.empty()) |
| 671 | continue; |
| 672 | |
| 673 | assert(InterPathLimit >= IntermediatePaths.size()); |
| 674 | auto PredPathLimit = InterPathLimit / IntermediatePaths.size(); |
| 675 | std::vector<ThreadingPath> PredPaths = |
| 676 | getPathsFromStateDefMap(StateDef, Phi: IncomingPhi, VB, PathsLimit: PredPathLimit); |
| 677 | for (const ThreadingPath &Path : PredPaths) { |
| 678 | for (const PathType &IPath : IntermediatePaths) { |
| 679 | ThreadingPath NewPath(Path); |
| 680 | NewPath.appendExcludingFirst(OtherPath: IPath); |
| 681 | NewPath.push_back(BB: PhiBB); |
| 682 | Res.push_back(x: NewPath); |
| 683 | } |
| 684 | } |
| 685 | } |
| 686 | VB.erase(Ptr: PhiBB); |
| 687 | return Res; |
| 688 | } |
| 689 | |
| 690 | PathsType paths(BasicBlock *BB, BasicBlock *ToBB, VisitedBlocks &Visited, |
| 691 | unsigned PathDepth, unsigned PathsLimit) { |
| 692 | PathsType Res; |
| 693 | |
| 694 | // Stop exploring paths after visiting MaxPathLength blocks |
| 695 | if (PathDepth > MaxPathLength) { |
| 696 | ORE->emit(RemarkBuilder: [&]() { |
| 697 | return OptimizationRemarkAnalysis(DEBUG_TYPE, "MaxPathLengthReached" , |
| 698 | Switch) |
| 699 | << "Exploration stopped after visiting MaxPathLength=" |
| 700 | << ore::NV("MaxPathLength" , MaxPathLength) << " blocks." ; |
| 701 | }); |
| 702 | return Res; |
| 703 | } |
| 704 | |
| 705 | Visited.insert(Ptr: BB); |
| 706 | if (++NumVisited > MaxNumVisitiedPaths) |
| 707 | return Res; |
| 708 | |
| 709 | // Stop if we have reached the BB out of loop, since its successors have no |
| 710 | // impact on the DFA. |
| 711 | if (!SwitchOuterLoop->contains(BB)) |
| 712 | return Res; |
| 713 | |
| 714 | // Some blocks have multiple edges to the same successor, and this set |
| 715 | // is used to prevent a duplicate path from being generated |
| 716 | SmallPtrSet<BasicBlock *, 4> Successors; |
| 717 | for (BasicBlock *Succ : successors(BB)) { |
| 718 | if (Res.size() >= PathsLimit) |
| 719 | break; |
| 720 | if (!Successors.insert(Ptr: Succ).second) |
| 721 | continue; |
| 722 | |
| 723 | // Found a cycle through the final block. |
| 724 | if (Succ == ToBB) { |
| 725 | Res.push_back(x: {BB, ToBB}); |
| 726 | continue; |
| 727 | } |
| 728 | |
| 729 | // We have encountered a cycle, do not get caught in it |
| 730 | if (Visited.contains(Ptr: Succ)) |
| 731 | continue; |
| 732 | |
| 733 | auto *CurrLoop = LI->getLoopFor(BB); |
| 734 | // Unlikely to be beneficial. |
| 735 | if (Succ == CurrLoop->getHeader()) |
| 736 | continue; |
| 737 | // Skip for now, revisit this condition later to see the impact on |
| 738 | // coverage and compile time. |
| 739 | if (LI->getLoopFor(BB: Succ) != CurrLoop) |
| 740 | continue; |
| 741 | assert(PathsLimit > Res.size()); |
| 742 | PathsType SuccPaths = |
| 743 | paths(BB: Succ, ToBB, Visited, PathDepth: PathDepth + 1, PathsLimit: PathsLimit - Res.size()); |
| 744 | for (PathType &Path : SuccPaths) { |
| 745 | Path.push_front(x: BB); |
| 746 | Res.push_back(x: Path); |
| 747 | } |
| 748 | } |
| 749 | // This block could now be visited again from a different predecessor. Note |
| 750 | // that this will result in exponential runtime. Subpaths could possibly be |
| 751 | // cached but it takes a lot of memory to store them. |
| 752 | Visited.erase(Ptr: BB); |
| 753 | return Res; |
| 754 | } |
| 755 | |
| 756 | /// Walk the use-def chain and collect all the state-defining blocks and the |
| 757 | /// PHI nodes in those blocks that define the state. |
| 758 | StateDefMap getStateDefMap() const { |
| 759 | StateDefMap Res; |
| 760 | PHINode *FirstDef = dyn_cast<PHINode>(Val: Switch->getOperand(i_nocapture: 0)); |
| 761 | assert(FirstDef && "The first definition must be a phi." ); |
| 762 | |
| 763 | SmallVector<PHINode *, 8> Stack; |
| 764 | Stack.push_back(Elt: FirstDef); |
| 765 | SmallPtrSet<Value *, 16> SeenValues; |
| 766 | |
| 767 | while (!Stack.empty()) { |
| 768 | PHINode *CurPhi = Stack.pop_back_val(); |
| 769 | |
| 770 | Res[CurPhi->getParent()] = CurPhi; |
| 771 | SeenValues.insert(Ptr: CurPhi); |
| 772 | |
| 773 | for (BasicBlock *IncomingBB : CurPhi->blocks()) { |
| 774 | PHINode *IncomingPhi = |
| 775 | dyn_cast<PHINode>(Val: CurPhi->getIncomingValueForBlock(BB: IncomingBB)); |
| 776 | if (!IncomingPhi) |
| 777 | continue; |
| 778 | bool IsOutsideLoops = !SwitchOuterLoop->contains(BB: IncomingBB); |
| 779 | if (SeenValues.contains(Ptr: IncomingPhi) || IsOutsideLoops) |
| 780 | continue; |
| 781 | |
| 782 | Stack.push_back(Elt: IncomingPhi); |
| 783 | } |
| 784 | } |
| 785 | |
| 786 | return Res; |
| 787 | } |
| 788 | |
| 789 | // Find all threadable paths. |
| 790 | void findTPaths() { |
| 791 | StateDefMap StateDef = getStateDefMap(); |
| 792 | if (StateDef.empty()) { |
| 793 | ORE->emit(RemarkBuilder: [&]() { |
| 794 | return OptimizationRemarkMissed(DEBUG_TYPE, "SwitchNotPredictable" , |
| 795 | Switch) |
| 796 | << "Switch instruction is not predictable." ; |
| 797 | }); |
| 798 | return; |
| 799 | } |
| 800 | |
| 801 | auto *SwitchPhi = cast<PHINode>(Val: Switch->getOperand(i_nocapture: 0)); |
| 802 | auto *SwitchPhiDefBB = SwitchPhi->getParent(); |
| 803 | VisitedBlocks VB; |
| 804 | // Get paths from the determinator BBs to SwitchPhiDefBB |
| 805 | std::vector<ThreadingPath> PathsToPhiDef = |
| 806 | getPathsFromStateDefMap(StateDef, Phi: SwitchPhi, VB, PathsLimit: MaxNumPaths); |
| 807 | if (SwitchPhiDefBB == SwitchBlock || PathsToPhiDef.empty()) { |
| 808 | TPaths = std::move(PathsToPhiDef); |
| 809 | return; |
| 810 | } |
| 811 | |
| 812 | assert(MaxNumPaths >= PathsToPhiDef.size() && !PathsToPhiDef.empty()); |
| 813 | auto PathsLimit = MaxNumPaths / PathsToPhiDef.size(); |
| 814 | // Find and append paths from SwitchPhiDefBB to SwitchBlock. |
| 815 | PathsType PathsToSwitchBB = |
| 816 | paths(BB: SwitchPhiDefBB, ToBB: SwitchBlock, Visited&: VB, /* PathDepth = */ 1, PathsLimit); |
| 817 | if (PathsToSwitchBB.empty()) |
| 818 | return; |
| 819 | |
| 820 | std::vector<ThreadingPath> TempList; |
| 821 | for (const ThreadingPath &Path : PathsToPhiDef) { |
| 822 | SmallPtrSet<BasicBlock *, 32> PathSet(Path.getPath().begin(), |
| 823 | Path.getPath().end()); |
| 824 | for (const PathType &PathToSw : PathsToSwitchBB) { |
| 825 | if (any_of(Range: llvm::drop_begin(RangeOrContainer: PathToSw), |
| 826 | P: [&](const BasicBlock *BB) { return PathSet.contains(Ptr: BB); })) |
| 827 | continue; |
| 828 | ThreadingPath PathCopy(Path); |
| 829 | PathCopy.appendExcludingFirst(OtherPath: PathToSw); |
| 830 | TempList.push_back(x: PathCopy); |
| 831 | } |
| 832 | } |
| 833 | TPaths = std::move(TempList); |
| 834 | } |
| 835 | |
| 836 | /// Fast helper to get the successor corresponding to a particular case value |
| 837 | /// for a switch statement. |
| 838 | BasicBlock *getNextCaseSuccessor(const APInt &NextState) { |
| 839 | // Precompute the value => successor mapping |
| 840 | if (CaseValToDest.empty()) { |
| 841 | for (auto Case : Switch->cases()) { |
| 842 | APInt CaseVal = Case.getCaseValue()->getValue(); |
| 843 | CaseValToDest[CaseVal] = Case.getCaseSuccessor(); |
| 844 | } |
| 845 | } |
| 846 | |
| 847 | auto SuccIt = CaseValToDest.find(Val: NextState); |
| 848 | return SuccIt == CaseValToDest.end() ? Switch->getDefaultDest() |
| 849 | : SuccIt->second; |
| 850 | } |
| 851 | |
| 852 | // Two states are equivalent if they have the same switch destination. |
| 853 | // Unify the states in different threading path if the states are equivalent. |
| 854 | void unifyTPaths() { |
| 855 | SmallDenseMap<BasicBlock *, APInt> DestToState; |
| 856 | for (ThreadingPath &Path : TPaths) { |
| 857 | APInt NextState = Path.getExitValue(); |
| 858 | BasicBlock *Dest = getNextCaseSuccessor(NextState); |
| 859 | auto [StateIt, Inserted] = DestToState.try_emplace(Key: Dest, Args&: NextState); |
| 860 | if (Inserted) |
| 861 | continue; |
| 862 | if (NextState != StateIt->second) { |
| 863 | LLVM_DEBUG(dbgs() << "Next state in " << Path << " is equivalent to " |
| 864 | << StateIt->second << "\n" ); |
| 865 | Path.setExitValue(StateIt->second); |
| 866 | } |
| 867 | } |
| 868 | } |
| 869 | |
| 870 | unsigned NumVisited = 0; |
| 871 | SwitchInst *Switch; |
| 872 | BasicBlock *SwitchBlock; |
| 873 | OptimizationRemarkEmitter *ORE; |
| 874 | std::vector<ThreadingPath> TPaths; |
| 875 | DenseMap<APInt, BasicBlock *> CaseValToDest; |
| 876 | LoopInfo *LI; |
| 877 | Loop *SwitchOuterLoop; |
| 878 | }; |
| 879 | |
| 880 | struct TransformDFA { |
| 881 | TransformDFA(AllSwitchPaths *SwitchPaths, DomTreeUpdater *DTU, |
| 882 | AssumptionCache *AC, TargetTransformInfo *TTI, |
| 883 | OptimizationRemarkEmitter *ORE, |
| 884 | SmallPtrSet<const Value *, 32> EphValues) |
| 885 | : SwitchPaths(SwitchPaths), DTU(DTU), AC(AC), TTI(TTI), ORE(ORE), |
| 886 | EphValues(EphValues) {} |
| 887 | |
| 888 | bool run() { |
| 889 | if (isLegalAndProfitableToTransform()) { |
| 890 | createAllExitPaths(); |
| 891 | NumTransforms++; |
| 892 | return true; |
| 893 | } |
| 894 | return false; |
| 895 | } |
| 896 | |
| 897 | private: |
| 898 | /// This function performs both a legality check and profitability check at |
| 899 | /// the same time since it is convenient to do so. It iterates through all |
| 900 | /// blocks that will be cloned, and keeps track of the duplication cost. It |
| 901 | /// also returns false if it is illegal to clone some required block. |
| 902 | bool isLegalAndProfitableToTransform() { |
| 903 | CodeMetrics Metrics; |
| 904 | uint64_t NumClonedInst = 0; |
| 905 | SwitchInst *Switch = SwitchPaths->getSwitchInst(); |
| 906 | |
| 907 | // Don't thread switch without multiple successors. |
| 908 | if (Switch->getNumSuccessors() <= 1) |
| 909 | return false; |
| 910 | |
| 911 | // Note that DuplicateBlockMap is not being used as intended here. It is |
| 912 | // just being used to ensure (BB, State) pairs are only counted once. |
| 913 | DuplicateBlockMap DuplicateMap; |
| 914 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { |
| 915 | PathType PathBBs = TPath.getPath(); |
| 916 | APInt NextState = TPath.getExitValue(); |
| 917 | const BasicBlock *Determinator = TPath.getDeterminatorBB(); |
| 918 | |
| 919 | // Update Metrics for the Switch block, this is always cloned |
| 920 | BasicBlock *BB = SwitchPaths->getSwitchBlock(); |
| 921 | BasicBlock *VisitedBB = getClonedBB(BB, NextState, DuplicateMap); |
| 922 | if (!VisitedBB) { |
| 923 | Metrics.analyzeBasicBlock(BB, TTI: *TTI, EphValues); |
| 924 | NumClonedInst += BB->sizeWithoutDebug(); |
| 925 | DuplicateMap[BB].push_back(x: {.BB: BB, .State: NextState}); |
| 926 | } |
| 927 | |
| 928 | // If the Switch block is the Determinator, then we can continue since |
| 929 | // this is the only block that is cloned and we already counted for it. |
| 930 | if (PathBBs.front() == Determinator) |
| 931 | continue; |
| 932 | |
| 933 | // Otherwise update Metrics for all blocks that will be cloned. If any |
| 934 | // block is already cloned and would be reused, don't double count it. |
| 935 | auto DetIt = llvm::find(Range&: PathBBs, Val: Determinator); |
| 936 | for (auto BBIt = DetIt; BBIt != PathBBs.end(); BBIt++) { |
| 937 | BB = *BBIt; |
| 938 | VisitedBB = getClonedBB(BB, NextState, DuplicateMap); |
| 939 | if (VisitedBB) |
| 940 | continue; |
| 941 | Metrics.analyzeBasicBlock(BB, TTI: *TTI, EphValues); |
| 942 | NumClonedInst += BB->sizeWithoutDebug(); |
| 943 | DuplicateMap[BB].push_back(x: {.BB: BB, .State: NextState}); |
| 944 | } |
| 945 | |
| 946 | if (Metrics.notDuplicatable) { |
| 947 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " |
| 948 | << "non-duplicatable instructions.\n" ); |
| 949 | ORE->emit(RemarkBuilder: [&]() { |
| 950 | return OptimizationRemarkMissed(DEBUG_TYPE, "NonDuplicatableInst" , |
| 951 | Switch) |
| 952 | << "Contains non-duplicatable instructions." ; |
| 953 | }); |
| 954 | return false; |
| 955 | } |
| 956 | |
| 957 | // FIXME: Allow jump threading with controlled convergence. |
| 958 | if (Metrics.Convergence != ConvergenceKind::None) { |
| 959 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " |
| 960 | << "convergent instructions.\n" ); |
| 961 | ORE->emit(RemarkBuilder: [&]() { |
| 962 | return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst" , Switch) |
| 963 | << "Contains convergent instructions." ; |
| 964 | }); |
| 965 | return false; |
| 966 | } |
| 967 | |
| 968 | if (!Metrics.NumInsts.isValid()) { |
| 969 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " |
| 970 | << "instructions with invalid cost.\n" ); |
| 971 | ORE->emit(RemarkBuilder: [&]() { |
| 972 | return OptimizationRemarkMissed(DEBUG_TYPE, "ConvergentInst" , Switch) |
| 973 | << "Contains instructions with invalid cost." ; |
| 974 | }); |
| 975 | return false; |
| 976 | } |
| 977 | } |
| 978 | |
| 979 | // Too much cloned instructions slow down later optimizations, especially |
| 980 | // SLPVectorizer. |
| 981 | // TODO: Thread the switch partially before reaching the threshold. |
| 982 | uint64_t NumOrigInst = 0; |
| 983 | uint64_t NumOuterUseBlock = 0; |
| 984 | for (auto *BB : DuplicateMap.keys()) { |
| 985 | NumOrigInst += BB->sizeWithoutDebug(); |
| 986 | // Only unduplicated blocks with single predecessor require new phi |
| 987 | // nodes. |
| 988 | for (auto *Succ : successors(BB)) |
| 989 | if (!DuplicateMap.count(Val: Succ) && Succ->getSinglePredecessor()) |
| 990 | NumOuterUseBlock++; |
| 991 | } |
| 992 | |
| 993 | if (double(NumClonedInst) / double(NumOrigInst) > MaxClonedRate) { |
| 994 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too much " |
| 995 | "instructions wll be cloned\n" ); |
| 996 | ORE->emit(RemarkBuilder: [&]() { |
| 997 | return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable" , Switch) |
| 998 | << "Too much instructions will be cloned." ; |
| 999 | }); |
| 1000 | return false; |
| 1001 | } |
| 1002 | |
| 1003 | // Too much unduplicated blocks with outer uses may cause too much |
| 1004 | // insertions of phi nodes for duplicated definitions. TODO: Drop this |
| 1005 | // threshold if we come up with another way to reduce the number of inserted |
| 1006 | // phi nodes. |
| 1007 | if (NumOuterUseBlock > MaxOuterUseBlocks) { |
| 1008 | LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, too much " |
| 1009 | "blocks with outer uses\n" ); |
| 1010 | ORE->emit(RemarkBuilder: [&]() { |
| 1011 | return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable" , Switch) |
| 1012 | << "Too much blocks with outer uses." ; |
| 1013 | }); |
| 1014 | return false; |
| 1015 | } |
| 1016 | |
| 1017 | InstructionCost DuplicationCost = 0; |
| 1018 | |
| 1019 | unsigned JumpTableSize = 0; |
| 1020 | TTI->getEstimatedNumberOfCaseClusters(SI: *Switch, JTSize&: JumpTableSize, PSI: nullptr, |
| 1021 | BFI: nullptr); |
| 1022 | if (JumpTableSize == 0) { |
| 1023 | // Factor in the number of conditional branches reduced from jump |
| 1024 | // threading. Assume that lowering the switch block is implemented by |
| 1025 | // using binary search, hence the LogBase2(). |
| 1026 | unsigned CondBranches = |
| 1027 | APInt(32, Switch->getNumSuccessors()).ceilLogBase2(); |
| 1028 | assert(CondBranches > 0 && |
| 1029 | "The threaded switch must have multiple branches" ); |
| 1030 | DuplicationCost = Metrics.NumInsts / CondBranches; |
| 1031 | } else { |
| 1032 | // Compared with jump tables, the DFA optimizer removes an indirect branch |
| 1033 | // on each loop iteration, thus making branch prediction more precise. The |
| 1034 | // more branch targets there are, the more likely it is for the branch |
| 1035 | // predictor to make a mistake, and the more benefit there is in the DFA |
| 1036 | // optimizer. Thus, the more branch targets there are, the lower is the |
| 1037 | // cost of the DFA opt. |
| 1038 | DuplicationCost = Metrics.NumInsts / JumpTableSize; |
| 1039 | } |
| 1040 | |
| 1041 | LLVM_DEBUG(dbgs() << "\nDFA Jump Threading: Cost to jump thread block " |
| 1042 | << SwitchPaths->getSwitchBlock()->getName() |
| 1043 | << " is: " << DuplicationCost << "\n\n" ); |
| 1044 | |
| 1045 | if (DuplicationCost > CostThreshold) { |
| 1046 | LLVM_DEBUG(dbgs() << "Not jump threading, duplication cost exceeds the " |
| 1047 | << "cost threshold.\n" ); |
| 1048 | ORE->emit(RemarkBuilder: [&]() { |
| 1049 | return OptimizationRemarkMissed(DEBUG_TYPE, "NotProfitable" , Switch) |
| 1050 | << "Duplication cost exceeds the cost threshold (cost=" |
| 1051 | << ore::NV("Cost" , DuplicationCost) |
| 1052 | << ", threshold=" << ore::NV("Threshold" , CostThreshold) << ")." ; |
| 1053 | }); |
| 1054 | return false; |
| 1055 | } |
| 1056 | |
| 1057 | ORE->emit(RemarkBuilder: [&]() { |
| 1058 | return OptimizationRemark(DEBUG_TYPE, "JumpThreaded" , Switch) |
| 1059 | << "Switch statement jump-threaded." ; |
| 1060 | }); |
| 1061 | |
| 1062 | return true; |
| 1063 | } |
| 1064 | |
| 1065 | /// Transform each threading path to effectively jump thread the DFA. |
| 1066 | void createAllExitPaths() { |
| 1067 | // Move the switch block to the end of the path, since it will be duplicated |
| 1068 | BasicBlock *SwitchBlock = SwitchPaths->getSwitchBlock(); |
| 1069 | for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { |
| 1070 | LLVM_DEBUG(dbgs() << TPath << "\n" ); |
| 1071 | // TODO: Fix exit path creation logic so that we dont need this |
| 1072 | // placeholder. |
| 1073 | TPath.push_front(BB: SwitchBlock); |
| 1074 | } |
| 1075 | |
| 1076 | // Transform the ThreadingPaths and keep track of the cloned values |
| 1077 | DuplicateBlockMap DuplicateMap; |
| 1078 | DefMap NewDefs; |
| 1079 | |
| 1080 | SmallPtrSet<BasicBlock *, 16> BlocksToClean; |
| 1081 | BlocksToClean.insert_range(R: successors(BB: SwitchBlock)); |
| 1082 | |
| 1083 | for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { |
| 1084 | createExitPath(NewDefs, Path: TPath, DuplicateMap, BlocksToClean, DTU); |
| 1085 | NumPaths++; |
| 1086 | } |
| 1087 | |
| 1088 | // After all paths are cloned, now update the last successor of the cloned |
| 1089 | // path so it skips over the switch statement |
| 1090 | for (const ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) |
| 1091 | updateLastSuccessor(TPath, DuplicateMap, DTU); |
| 1092 | |
| 1093 | // For each instruction that was cloned and used outside, update its uses |
| 1094 | updateSSA(NewDefs); |
| 1095 | |
| 1096 | // Clean PHI Nodes for the newly created blocks |
| 1097 | for (BasicBlock *BB : BlocksToClean) |
| 1098 | cleanPhiNodes(BB); |
| 1099 | } |
| 1100 | |
| 1101 | /// For a specific ThreadingPath \p Path, create an exit path starting from |
| 1102 | /// the determinator block. |
| 1103 | /// |
| 1104 | /// To remember the correct destination, we have to duplicate blocks |
| 1105 | /// corresponding to each state. Also update the terminating instruction of |
| 1106 | /// the predecessors, and phis in the successor blocks. |
| 1107 | void createExitPath(DefMap &NewDefs, const ThreadingPath &Path, |
| 1108 | DuplicateBlockMap &DuplicateMap, |
| 1109 | SmallPtrSet<BasicBlock *, 16> &BlocksToClean, |
| 1110 | DomTreeUpdater *DTU) { |
| 1111 | APInt NextState = Path.getExitValue(); |
| 1112 | const BasicBlock *Determinator = Path.getDeterminatorBB(); |
| 1113 | PathType PathBBs = Path.getPath(); |
| 1114 | |
| 1115 | // Don't select the placeholder block in front |
| 1116 | if (PathBBs.front() == Determinator) |
| 1117 | PathBBs.pop_front(); |
| 1118 | |
| 1119 | auto DetIt = llvm::find(Range&: PathBBs, Val: Determinator); |
| 1120 | // When there is only one BB in PathBBs, the determinator takes itself as a |
| 1121 | // direct predecessor. |
| 1122 | BasicBlock *PrevBB = PathBBs.size() == 1 ? *DetIt : *std::prev(x: DetIt); |
| 1123 | for (auto BBIt = DetIt; BBIt != PathBBs.end(); BBIt++) { |
| 1124 | BasicBlock *BB = *BBIt; |
| 1125 | BlocksToClean.insert(Ptr: BB); |
| 1126 | |
| 1127 | // We already cloned BB for this NextState, now just update the branch |
| 1128 | // and continue. |
| 1129 | BasicBlock *NextBB = getClonedBB(BB, NextState, DuplicateMap); |
| 1130 | if (NextBB) { |
| 1131 | updatePredecessor(PrevBB, OldBB: BB, NewBB: NextBB, DTU); |
| 1132 | PrevBB = NextBB; |
| 1133 | continue; |
| 1134 | } |
| 1135 | |
| 1136 | // Clone the BB and update the successor of Prev to jump to the new block |
| 1137 | BasicBlock *NewBB = cloneBlockAndUpdatePredecessor( |
| 1138 | BB, PrevBB, NextState, DuplicateMap, NewDefs, DTU); |
| 1139 | DuplicateMap[BB].push_back(x: {.BB: NewBB, .State: NextState}); |
| 1140 | BlocksToClean.insert(Ptr: NewBB); |
| 1141 | PrevBB = NewBB; |
| 1142 | } |
| 1143 | } |
| 1144 | |
| 1145 | /// Restore SSA form after cloning blocks. |
| 1146 | /// |
| 1147 | /// Each cloned block creates new defs for a variable, and the uses need to be |
| 1148 | /// updated to reflect this. The uses may be replaced with a cloned value, or |
| 1149 | /// some derived phi instruction. Note that all uses of a value defined in the |
| 1150 | /// same block were already remapped when cloning the block. |
| 1151 | void updateSSA(DefMap &NewDefs) { |
| 1152 | SSAUpdaterBulk SSAUpdate; |
| 1153 | SmallVector<Use *, 16> UsesToRename; |
| 1154 | |
| 1155 | for (const auto &KV : NewDefs) { |
| 1156 | Instruction *I = KV.first; |
| 1157 | BasicBlock *BB = I->getParent(); |
| 1158 | std::vector<Instruction *> Cloned = KV.second; |
| 1159 | |
| 1160 | // Scan all uses of this instruction to see if it is used outside of its |
| 1161 | // block, and if so, record them in UsesToRename. |
| 1162 | for (Use &U : I->uses()) { |
| 1163 | Instruction *User = cast<Instruction>(Val: U.getUser()); |
| 1164 | if (PHINode *UserPN = dyn_cast<PHINode>(Val: User)) { |
| 1165 | if (UserPN->getIncomingBlock(U) == BB) |
| 1166 | continue; |
| 1167 | } else if (User->getParent() == BB) { |
| 1168 | continue; |
| 1169 | } |
| 1170 | |
| 1171 | UsesToRename.push_back(Elt: &U); |
| 1172 | } |
| 1173 | |
| 1174 | // If there are no uses outside the block, we're done with this |
| 1175 | // instruction. |
| 1176 | if (UsesToRename.empty()) |
| 1177 | continue; |
| 1178 | LLVM_DEBUG(dbgs() << "DFA-JT: Renaming non-local uses of: " << *I |
| 1179 | << "\n" ); |
| 1180 | |
| 1181 | // We found a use of I outside of BB. Rename all uses of I that are |
| 1182 | // outside its block to be uses of the appropriate PHI node etc. See |
| 1183 | // ValuesInBlocks with the values we know. |
| 1184 | unsigned VarNum = SSAUpdate.AddVariable(Name: I->getName(), Ty: I->getType()); |
| 1185 | SSAUpdate.AddAvailableValue(Var: VarNum, BB, V: I); |
| 1186 | for (Instruction *New : Cloned) |
| 1187 | SSAUpdate.AddAvailableValue(Var: VarNum, BB: New->getParent(), V: New); |
| 1188 | |
| 1189 | while (!UsesToRename.empty()) |
| 1190 | SSAUpdate.AddUse(Var: VarNum, U: UsesToRename.pop_back_val()); |
| 1191 | |
| 1192 | LLVM_DEBUG(dbgs() << "\n" ); |
| 1193 | } |
| 1194 | // SSAUpdater handles phi placement and renaming uses with the appropriate |
| 1195 | // value. |
| 1196 | SSAUpdate.RewriteAllUses(DT: &DTU->getDomTree()); |
| 1197 | } |
| 1198 | |
| 1199 | /// Helper to get the successor corresponding to a particular case value for |
| 1200 | /// a switch statement. |
| 1201 | /// TODO: Unify it with SwitchPaths->getNextCaseSuccessor(SwitchInst *Switch) |
| 1202 | /// by updating cached value => successor mapping during threading. |
| 1203 | static BasicBlock *getNextCaseSuccessor(SwitchInst *Switch, |
| 1204 | const APInt &NextState) { |
| 1205 | BasicBlock *NextCase = nullptr; |
| 1206 | for (auto Case : Switch->cases()) { |
| 1207 | if (Case.getCaseValue()->getValue() == NextState) { |
| 1208 | NextCase = Case.getCaseSuccessor(); |
| 1209 | break; |
| 1210 | } |
| 1211 | } |
| 1212 | if (!NextCase) |
| 1213 | NextCase = Switch->getDefaultDest(); |
| 1214 | return NextCase; |
| 1215 | } |
| 1216 | |
| 1217 | /// Clones a basic block, and adds it to the CFG. |
| 1218 | /// |
| 1219 | /// This function also includes updating phi nodes in the successors of the |
| 1220 | /// BB, and remapping uses that were defined locally in the cloned BB. |
| 1221 | BasicBlock *cloneBlockAndUpdatePredecessor(BasicBlock *BB, BasicBlock *PrevBB, |
| 1222 | const APInt &NextState, |
| 1223 | DuplicateBlockMap &DuplicateMap, |
| 1224 | DefMap &NewDefs, |
| 1225 | DomTreeUpdater *DTU) { |
| 1226 | ValueToValueMapTy VMap; |
| 1227 | BasicBlock *NewBB = CloneBasicBlock( |
| 1228 | BB, VMap, NameSuffix: ".jt" + std::to_string(val: NextState.getLimitedValue()), |
| 1229 | F: BB->getParent()); |
| 1230 | NewBB->moveAfter(MovePos: BB); |
| 1231 | NumCloned++; |
| 1232 | |
| 1233 | for (Instruction &I : *NewBB) { |
| 1234 | // Do not remap operands of PHINode in case a definition in BB is an |
| 1235 | // incoming value to a phi in the same block. This incoming value will |
| 1236 | // be renamed later while restoring SSA. |
| 1237 | if (isa<PHINode>(Val: &I)) |
| 1238 | continue; |
| 1239 | RemapInstruction(I: &I, VM&: VMap, |
| 1240 | Flags: RF_IgnoreMissingLocals | RF_NoModuleLevelChanges); |
| 1241 | if (AssumeInst *II = dyn_cast<AssumeInst>(Val: &I)) |
| 1242 | AC->registerAssumption(CI: II); |
| 1243 | } |
| 1244 | |
| 1245 | updateSuccessorPhis(BB, ClonedBB: NewBB, NextState, VMap, DuplicateMap); |
| 1246 | updatePredecessor(PrevBB, OldBB: BB, NewBB, DTU); |
| 1247 | updateDefMap(NewDefs, VMap); |
| 1248 | |
| 1249 | // Add all successors to the DominatorTree |
| 1250 | SmallPtrSet<BasicBlock *, 4> SuccSet; |
| 1251 | for (auto *SuccBB : successors(BB: NewBB)) { |
| 1252 | if (SuccSet.insert(Ptr: SuccBB).second) |
| 1253 | DTU->applyUpdates(Updates: {{DominatorTree::Insert, NewBB, SuccBB}}); |
| 1254 | } |
| 1255 | SuccSet.clear(); |
| 1256 | return NewBB; |
| 1257 | } |
| 1258 | |
| 1259 | /// Update the phi nodes in BB's successors. |
| 1260 | /// |
| 1261 | /// This means creating a new incoming value from NewBB with the new |
| 1262 | /// instruction wherever there is an incoming value from BB. |
| 1263 | void updateSuccessorPhis(BasicBlock *BB, BasicBlock *ClonedBB, |
| 1264 | const APInt &NextState, ValueToValueMapTy &VMap, |
| 1265 | DuplicateBlockMap &DuplicateMap) { |
| 1266 | std::vector<BasicBlock *> BlocksToUpdate; |
| 1267 | |
| 1268 | // If BB is the last block in the path, we can simply update the one case |
| 1269 | // successor that will be reached. |
| 1270 | if (BB == SwitchPaths->getSwitchBlock()) { |
| 1271 | SwitchInst *Switch = SwitchPaths->getSwitchInst(); |
| 1272 | BasicBlock *NextCase = getNextCaseSuccessor(Switch, NextState); |
| 1273 | BlocksToUpdate.push_back(x: NextCase); |
| 1274 | BasicBlock *ClonedSucc = getClonedBB(BB: NextCase, NextState, DuplicateMap); |
| 1275 | if (ClonedSucc) |
| 1276 | BlocksToUpdate.push_back(x: ClonedSucc); |
| 1277 | } |
| 1278 | // Otherwise update phis in all successors. |
| 1279 | else { |
| 1280 | for (BasicBlock *Succ : successors(BB)) { |
| 1281 | BlocksToUpdate.push_back(x: Succ); |
| 1282 | |
| 1283 | // Check if a successor has already been cloned for the particular exit |
| 1284 | // value. In this case if a successor was already cloned, the phi nodes |
| 1285 | // in the cloned block should be updated directly. |
| 1286 | BasicBlock *ClonedSucc = getClonedBB(BB: Succ, NextState, DuplicateMap); |
| 1287 | if (ClonedSucc) |
| 1288 | BlocksToUpdate.push_back(x: ClonedSucc); |
| 1289 | } |
| 1290 | } |
| 1291 | |
| 1292 | // If there is a phi with an incoming value from BB, create a new incoming |
| 1293 | // value for the new predecessor ClonedBB. The value will either be the same |
| 1294 | // value from BB or a cloned value. |
| 1295 | for (BasicBlock *Succ : BlocksToUpdate) { |
| 1296 | for (PHINode &Phi : Succ->phis()) { |
| 1297 | Value *Incoming = Phi.getIncomingValueForBlock(BB); |
| 1298 | if (Incoming) { |
| 1299 | if (isa<Constant>(Val: Incoming)) { |
| 1300 | Phi.addIncoming(V: Incoming, BB: ClonedBB); |
| 1301 | continue; |
| 1302 | } |
| 1303 | Value *ClonedVal = VMap[Incoming]; |
| 1304 | if (ClonedVal) |
| 1305 | Phi.addIncoming(V: ClonedVal, BB: ClonedBB); |
| 1306 | else |
| 1307 | Phi.addIncoming(V: Incoming, BB: ClonedBB); |
| 1308 | } |
| 1309 | } |
| 1310 | } |
| 1311 | } |
| 1312 | |
| 1313 | /// Sets the successor of PrevBB to be NewBB instead of OldBB. Note that all |
| 1314 | /// other successors are kept as well. |
| 1315 | void updatePredecessor(BasicBlock *PrevBB, BasicBlock *OldBB, |
| 1316 | BasicBlock *NewBB, DomTreeUpdater *DTU) { |
| 1317 | // When a path is reused, there is a chance that predecessors were already |
| 1318 | // updated before. Check if the predecessor needs to be updated first. |
| 1319 | if (!isPredecessor(BB: OldBB, IncomingBB: PrevBB)) |
| 1320 | return; |
| 1321 | |
| 1322 | Instruction *PrevTerm = PrevBB->getTerminator(); |
| 1323 | for (unsigned Idx = 0; Idx < PrevTerm->getNumSuccessors(); Idx++) { |
| 1324 | if (PrevTerm->getSuccessor(Idx) == OldBB) { |
| 1325 | OldBB->removePredecessor(Pred: PrevBB, /* KeepOneInputPHIs = */ true); |
| 1326 | PrevTerm->setSuccessor(Idx, BB: NewBB); |
| 1327 | } |
| 1328 | } |
| 1329 | DTU->applyUpdates(Updates: {{DominatorTree::Delete, PrevBB, OldBB}, |
| 1330 | {DominatorTree::Insert, PrevBB, NewBB}}); |
| 1331 | } |
| 1332 | |
| 1333 | /// Add new value mappings to the DefMap to keep track of all new definitions |
| 1334 | /// for a particular instruction. These will be used while updating SSA form. |
| 1335 | void updateDefMap(DefMap &NewDefs, ValueToValueMapTy &VMap) { |
| 1336 | SmallVector<std::pair<Instruction *, Instruction *>> NewDefsVector; |
| 1337 | NewDefsVector.reserve(N: VMap.size()); |
| 1338 | |
| 1339 | for (auto Entry : VMap) { |
| 1340 | Instruction *Inst = |
| 1341 | dyn_cast<Instruction>(Val: const_cast<Value *>(Entry.first)); |
| 1342 | if (!Inst || !Entry.second || isa<BranchInst>(Val: Inst) || |
| 1343 | isa<SwitchInst>(Val: Inst)) { |
| 1344 | continue; |
| 1345 | } |
| 1346 | |
| 1347 | Instruction *Cloned = dyn_cast<Instruction>(Val&: Entry.second); |
| 1348 | if (!Cloned) |
| 1349 | continue; |
| 1350 | |
| 1351 | NewDefsVector.push_back(Elt: {Inst, Cloned}); |
| 1352 | } |
| 1353 | |
| 1354 | // Sort the defs to get deterministic insertion order into NewDefs. |
| 1355 | sort(C&: NewDefsVector, Comp: [](const auto &LHS, const auto &RHS) { |
| 1356 | if (LHS.first == RHS.first) |
| 1357 | return LHS.second->comesBefore(RHS.second); |
| 1358 | return LHS.first->comesBefore(RHS.first); |
| 1359 | }); |
| 1360 | |
| 1361 | for (const auto &KV : NewDefsVector) |
| 1362 | NewDefs[KV.first].push_back(x: KV.second); |
| 1363 | } |
| 1364 | |
| 1365 | /// Update the last branch of a particular cloned path to point to the correct |
| 1366 | /// case successor. |
| 1367 | /// |
| 1368 | /// Note that this is an optional step and would have been done in later |
| 1369 | /// optimizations, but it makes the CFG significantly easier to work with. |
| 1370 | void updateLastSuccessor(const ThreadingPath &TPath, |
| 1371 | DuplicateBlockMap &DuplicateMap, |
| 1372 | DomTreeUpdater *DTU) { |
| 1373 | APInt NextState = TPath.getExitValue(); |
| 1374 | BasicBlock *BB = TPath.getPath().back(); |
| 1375 | BasicBlock *LastBlock = getClonedBB(BB, NextState, DuplicateMap); |
| 1376 | |
| 1377 | // Note multiple paths can end at the same block so check that it is not |
| 1378 | // updated yet |
| 1379 | if (!isa<SwitchInst>(Val: LastBlock->getTerminator())) |
| 1380 | return; |
| 1381 | SwitchInst *Switch = cast<SwitchInst>(Val: LastBlock->getTerminator()); |
| 1382 | BasicBlock *NextCase = getNextCaseSuccessor(Switch, NextState); |
| 1383 | |
| 1384 | std::vector<DominatorTree::UpdateType> DTUpdates; |
| 1385 | SmallPtrSet<BasicBlock *, 4> SuccSet; |
| 1386 | for (BasicBlock *Succ : successors(BB: LastBlock)) { |
| 1387 | if (Succ != NextCase && SuccSet.insert(Ptr: Succ).second) |
| 1388 | DTUpdates.push_back(x: {DominatorTree::Delete, LastBlock, Succ}); |
| 1389 | } |
| 1390 | |
| 1391 | Switch->eraseFromParent(); |
| 1392 | BranchInst::Create(IfTrue: NextCase, InsertBefore: LastBlock); |
| 1393 | |
| 1394 | DTU->applyUpdates(Updates: DTUpdates); |
| 1395 | } |
| 1396 | |
| 1397 | /// After cloning blocks, some of the phi nodes have extra incoming values |
| 1398 | /// that are no longer used. This function removes them. |
| 1399 | void cleanPhiNodes(BasicBlock *BB) { |
| 1400 | // If BB is no longer reachable, remove any remaining phi nodes |
| 1401 | if (pred_empty(BB)) { |
| 1402 | for (PHINode &PN : make_early_inc_range(Range: BB->phis())) { |
| 1403 | PN.replaceAllUsesWith(V: PoisonValue::get(T: PN.getType())); |
| 1404 | PN.eraseFromParent(); |
| 1405 | } |
| 1406 | return; |
| 1407 | } |
| 1408 | |
| 1409 | // Remove any incoming values that come from an invalid predecessor |
| 1410 | for (PHINode &Phi : BB->phis()) |
| 1411 | Phi.removeIncomingValueIf(Predicate: [&](unsigned Index) { |
| 1412 | BasicBlock *IncomingBB = Phi.getIncomingBlock(i: Index); |
| 1413 | return !isPredecessor(BB, IncomingBB); |
| 1414 | }); |
| 1415 | } |
| 1416 | |
| 1417 | /// Checks if BB was already cloned for a particular next state value. If it |
| 1418 | /// was then it returns this cloned block, and otherwise null. |
| 1419 | BasicBlock *getClonedBB(BasicBlock *BB, const APInt &NextState, |
| 1420 | DuplicateBlockMap &DuplicateMap) { |
| 1421 | CloneList ClonedBBs = DuplicateMap[BB]; |
| 1422 | |
| 1423 | // Find an entry in the CloneList with this NextState. If it exists then |
| 1424 | // return the corresponding BB |
| 1425 | auto It = llvm::find_if(Range&: ClonedBBs, P: [NextState](const ClonedBlock &C) { |
| 1426 | return C.State == NextState; |
| 1427 | }); |
| 1428 | return It != ClonedBBs.end() ? (*It).BB : nullptr; |
| 1429 | } |
| 1430 | |
| 1431 | /// Returns true if IncomingBB is a predecessor of BB. |
| 1432 | bool isPredecessor(BasicBlock *BB, BasicBlock *IncomingBB) { |
| 1433 | return llvm::is_contained(Range: predecessors(BB), Element: IncomingBB); |
| 1434 | } |
| 1435 | |
| 1436 | AllSwitchPaths *SwitchPaths; |
| 1437 | DomTreeUpdater *DTU; |
| 1438 | AssumptionCache *AC; |
| 1439 | TargetTransformInfo *TTI; |
| 1440 | OptimizationRemarkEmitter *ORE; |
| 1441 | SmallPtrSet<const Value *, 32> EphValues; |
| 1442 | std::vector<ThreadingPath> TPaths; |
| 1443 | }; |
| 1444 | } // namespace |
| 1445 | |
| 1446 | bool DFAJumpThreading::run(Function &F) { |
| 1447 | LLVM_DEBUG(dbgs() << "\nDFA Jump threading: " << F.getName() << "\n" ); |
| 1448 | |
| 1449 | if (F.hasOptSize()) { |
| 1450 | LLVM_DEBUG(dbgs() << "Skipping due to the 'minsize' attribute\n" ); |
| 1451 | return false; |
| 1452 | } |
| 1453 | |
| 1454 | if (ClViewCfgBefore) |
| 1455 | F.viewCFG(); |
| 1456 | |
| 1457 | SmallVector<AllSwitchPaths, 2> ThreadableLoops; |
| 1458 | bool MadeChanges = false; |
| 1459 | LoopInfoBroken = false; |
| 1460 | |
| 1461 | for (BasicBlock &BB : F) { |
| 1462 | auto *SI = dyn_cast<SwitchInst>(Val: BB.getTerminator()); |
| 1463 | if (!SI) |
| 1464 | continue; |
| 1465 | |
| 1466 | LLVM_DEBUG(dbgs() << "\nCheck if SwitchInst in BB " << BB.getName() |
| 1467 | << " is a candidate\n" ); |
| 1468 | MainSwitch Switch(SI, LI, ORE); |
| 1469 | |
| 1470 | if (!Switch.getInstr()) { |
| 1471 | LLVM_DEBUG(dbgs() << "\nSwitchInst in BB " << BB.getName() << " is not a " |
| 1472 | << "candidate for jump threading\n" ); |
| 1473 | continue; |
| 1474 | } |
| 1475 | |
| 1476 | LLVM_DEBUG(dbgs() << "\nSwitchInst in BB " << BB.getName() << " is a " |
| 1477 | << "candidate for jump threading\n" ); |
| 1478 | LLVM_DEBUG(SI->dump()); |
| 1479 | |
| 1480 | unfoldSelectInstrs(SelectInsts: Switch.getSelectInsts()); |
| 1481 | if (!Switch.getSelectInsts().empty()) |
| 1482 | MadeChanges = true; |
| 1483 | |
| 1484 | AllSwitchPaths SwitchPaths(&Switch, ORE, LI, |
| 1485 | LI->getLoopFor(BB: &BB)->getOutermostLoop()); |
| 1486 | SwitchPaths.run(); |
| 1487 | |
| 1488 | if (SwitchPaths.getNumThreadingPaths() > 0) { |
| 1489 | ThreadableLoops.push_back(Elt: SwitchPaths); |
| 1490 | |
| 1491 | // For the time being limit this optimization to occurring once in a |
| 1492 | // function since it can change the CFG significantly. This is not a |
| 1493 | // strict requirement but it can cause buggy behavior if there is an |
| 1494 | // overlap of blocks in different opportunities. There is a lot of room to |
| 1495 | // experiment with catching more opportunities here. |
| 1496 | // NOTE: To release this contraint, we must handle LoopInfo invalidation |
| 1497 | break; |
| 1498 | } |
| 1499 | } |
| 1500 | |
| 1501 | #ifdef NDEBUG |
| 1502 | LI->verify(DomTree: DTU->getDomTree()); |
| 1503 | #endif |
| 1504 | |
| 1505 | SmallPtrSet<const Value *, 32> EphValues; |
| 1506 | if (ThreadableLoops.size() > 0) |
| 1507 | CodeMetrics::collectEphemeralValues(L: &F, AC, EphValues); |
| 1508 | |
| 1509 | for (AllSwitchPaths SwitchPaths : ThreadableLoops) { |
| 1510 | TransformDFA Transform(&SwitchPaths, DTU, AC, TTI, ORE, EphValues); |
| 1511 | if (Transform.run()) |
| 1512 | MadeChanges = LoopInfoBroken = true; |
| 1513 | } |
| 1514 | |
| 1515 | DTU->flush(); |
| 1516 | |
| 1517 | #ifdef EXPENSIVE_CHECKS |
| 1518 | verifyFunction(F, &dbgs()); |
| 1519 | #endif |
| 1520 | |
| 1521 | if (MadeChanges && VerifyDomInfo) |
| 1522 | assert(DTU->getDomTree().verify(DominatorTree::VerificationLevel::Full) && |
| 1523 | "Failed to maintain validity of domtree!" ); |
| 1524 | |
| 1525 | return MadeChanges; |
| 1526 | } |
| 1527 | |
| 1528 | /// Integrate with the new Pass Manager |
| 1529 | PreservedAnalyses DFAJumpThreadingPass::run(Function &F, |
| 1530 | FunctionAnalysisManager &AM) { |
| 1531 | AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(IR&: F); |
| 1532 | DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
| 1533 | LoopInfo &LI = AM.getResult<LoopAnalysis>(IR&: F); |
| 1534 | TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
| 1535 | OptimizationRemarkEmitter ORE(&F); |
| 1536 | |
| 1537 | DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); |
| 1538 | DFAJumpThreading ThreadImpl(&AC, &DTU, &LI, &TTI, &ORE); |
| 1539 | if (!ThreadImpl.run(F)) |
| 1540 | return PreservedAnalyses::all(); |
| 1541 | |
| 1542 | PreservedAnalyses PA; |
| 1543 | PA.preserve<DominatorTreeAnalysis>(); |
| 1544 | if (!ThreadImpl.LoopInfoBroken) |
| 1545 | PA.preserve<LoopAnalysis>(); |
| 1546 | return PA; |
| 1547 | } |
| 1548 | |