| 1 | //===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | /// \file |
| 9 | /// Insert hardware loop intrinsics into loops which are deemed profitable by |
| 10 | /// the target, by querying TargetTransformInfo. A hardware loop comprises of |
| 11 | /// two intrinsics: one, outside the loop, to set the loop iteration count and |
| 12 | /// another, in the exit block, to decrement the counter. The decremented value |
| 13 | /// can either be carried through the loop via a phi or handled in some opaque |
| 14 | /// way by the target. |
| 15 | /// |
| 16 | //===----------------------------------------------------------------------===// |
| 17 | |
| 18 | #include "llvm/CodeGen/HardwareLoops.h" |
| 19 | #include "llvm/ADT/Statistic.h" |
| 20 | #include "llvm/Analysis/AssumptionCache.h" |
| 21 | #include "llvm/Analysis/BranchProbabilityInfo.h" |
| 22 | #include "llvm/Analysis/LoopInfo.h" |
| 23 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| 24 | #include "llvm/Analysis/ScalarEvolution.h" |
| 25 | #include "llvm/Analysis/TargetLibraryInfo.h" |
| 26 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 27 | #include "llvm/CodeGen/Passes.h" |
| 28 | #include "llvm/IR/BasicBlock.h" |
| 29 | #include "llvm/IR/Constants.h" |
| 30 | #include "llvm/IR/Dominators.h" |
| 31 | #include "llvm/IR/IRBuilder.h" |
| 32 | #include "llvm/IR/Instructions.h" |
| 33 | #include "llvm/IR/Value.h" |
| 34 | #include "llvm/InitializePasses.h" |
| 35 | #include "llvm/Pass.h" |
| 36 | #include "llvm/PassRegistry.h" |
| 37 | #include "llvm/Support/CommandLine.h" |
| 38 | #include "llvm/Support/Debug.h" |
| 39 | #include "llvm/Transforms/Utils.h" |
| 40 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| 41 | #include "llvm/Transforms/Utils/Local.h" |
| 42 | #include "llvm/Transforms/Utils/LoopUtils.h" |
| 43 | #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
| 44 | |
| 45 | #define DEBUG_TYPE "hardware-loops" |
| 46 | |
| 47 | #define HW_LOOPS_NAME "Hardware Loop Insertion" |
| 48 | |
| 49 | using namespace llvm; |
| 50 | |
| 51 | static cl::opt<bool> |
| 52 | ForceHardwareLoops("force-hardware-loops" , cl::Hidden, cl::init(Val: false), |
| 53 | cl::desc("Force hardware loops intrinsics to be inserted" )); |
| 54 | |
| 55 | static cl::opt<bool> |
| 56 | ForceHardwareLoopPHI( |
| 57 | "force-hardware-loop-phi" , cl::Hidden, cl::init(Val: false), |
| 58 | cl::desc("Force hardware loop counter to be updated through a phi" )); |
| 59 | |
| 60 | static cl::opt<bool> |
| 61 | ForceNestedLoop("force-nested-hardware-loop" , cl::Hidden, cl::init(Val: false), |
| 62 | cl::desc("Force allowance of nested hardware loops" )); |
| 63 | |
| 64 | static cl::opt<unsigned> |
| 65 | LoopDecrement("hardware-loop-decrement" , cl::Hidden, cl::init(Val: 1), |
| 66 | cl::desc("Set the loop decrement value" )); |
| 67 | |
| 68 | static cl::opt<unsigned> |
| 69 | CounterBitWidth("hardware-loop-counter-bitwidth" , cl::Hidden, cl::init(Val: 32), |
| 70 | cl::desc("Set the loop counter bitwidth" )); |
| 71 | |
| 72 | static cl::opt<bool> |
| 73 | ForceGuardLoopEntry( |
| 74 | "force-hardware-loop-guard" , cl::Hidden, cl::init(Val: false), |
| 75 | cl::desc("Force generation of loop guard intrinsic" )); |
| 76 | |
| 77 | STATISTIC(NumHWLoops, "Number of loops converted to hardware loops" ); |
| 78 | |
| 79 | #ifndef NDEBUG |
| 80 | static void debugHWLoopFailure(const StringRef DebugMsg, |
| 81 | Instruction *I) { |
| 82 | dbgs() << "HWLoops: " << DebugMsg; |
| 83 | if (I) |
| 84 | dbgs() << ' ' << *I; |
| 85 | else |
| 86 | dbgs() << '.'; |
| 87 | dbgs() << '\n'; |
| 88 | } |
| 89 | #endif |
| 90 | |
| 91 | static OptimizationRemarkAnalysis |
| 92 | createHWLoopAnalysis(StringRef , Loop *L, Instruction *I) { |
| 93 | BasicBlock *CodeRegion = L->getHeader(); |
| 94 | DebugLoc DL = L->getStartLoc(); |
| 95 | |
| 96 | if (I) { |
| 97 | CodeRegion = I->getParent(); |
| 98 | // If there is no debug location attached to the instruction, revert back to |
| 99 | // using the loop's. |
| 100 | if (I->getDebugLoc()) |
| 101 | DL = I->getDebugLoc(); |
| 102 | } |
| 103 | |
| 104 | OptimizationRemarkAnalysis R(DEBUG_TYPE, RemarkName, DL, CodeRegion); |
| 105 | R << "hardware-loop not created: " ; |
| 106 | return R; |
| 107 | } |
| 108 | |
| 109 | namespace { |
| 110 | |
| 111 | void (const StringRef Msg, const StringRef ORETag, |
| 112 | OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr) { |
| 113 | LLVM_DEBUG(debugHWLoopFailure(Msg, I)); |
| 114 | ORE->emit(OptDiag: createHWLoopAnalysis(RemarkName: ORETag, L: TheLoop, I) << Msg); |
| 115 | } |
| 116 | |
| 117 | using TTI = TargetTransformInfo; |
| 118 | |
| 119 | class HardwareLoopsLegacy : public FunctionPass { |
| 120 | public: |
| 121 | static char ID; |
| 122 | |
| 123 | HardwareLoopsLegacy() : FunctionPass(ID) { |
| 124 | initializeHardwareLoopsLegacyPass(*PassRegistry::getPassRegistry()); |
| 125 | } |
| 126 | |
| 127 | bool runOnFunction(Function &F) override; |
| 128 | |
| 129 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
| 130 | AU.addRequired<LoopInfoWrapperPass>(); |
| 131 | AU.addPreserved<LoopInfoWrapperPass>(); |
| 132 | AU.addRequired<DominatorTreeWrapperPass>(); |
| 133 | AU.addPreserved<DominatorTreeWrapperPass>(); |
| 134 | AU.addRequired<ScalarEvolutionWrapperPass>(); |
| 135 | AU.addPreserved<ScalarEvolutionWrapperPass>(); |
| 136 | AU.addRequired<AssumptionCacheTracker>(); |
| 137 | AU.addRequired<TargetTransformInfoWrapperPass>(); |
| 138 | AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
| 139 | AU.addPreserved<BranchProbabilityInfoWrapperPass>(); |
| 140 | } |
| 141 | }; |
| 142 | |
| 143 | class HardwareLoopsImpl { |
| 144 | public: |
| 145 | HardwareLoopsImpl(ScalarEvolution &SE, LoopInfo &LI, bool PreserveLCSSA, |
| 146 | DominatorTree &DT, const DataLayout &DL, |
| 147 | const TargetTransformInfo &TTI, TargetLibraryInfo *TLI, |
| 148 | AssumptionCache &AC, OptimizationRemarkEmitter *ORE, |
| 149 | HardwareLoopOptions &Opts) |
| 150 | : SE(SE), LI(LI), PreserveLCSSA(PreserveLCSSA), DT(DT), DL(DL), TTI(TTI), |
| 151 | TLI(TLI), AC(AC), ORE(ORE), Opts(Opts) { } |
| 152 | |
| 153 | bool run(Function &F); |
| 154 | |
| 155 | private: |
| 156 | // Try to convert the given Loop into a hardware loop. |
| 157 | bool TryConvertLoop(Loop *L, LLVMContext &Ctx); |
| 158 | |
| 159 | // Given that the target believes the loop to be profitable, try to |
| 160 | // convert it. |
| 161 | bool TryConvertLoop(HardwareLoopInfo &HWLoopInfo); |
| 162 | |
| 163 | ScalarEvolution &SE; |
| 164 | LoopInfo &LI; |
| 165 | bool PreserveLCSSA; |
| 166 | DominatorTree &DT; |
| 167 | const DataLayout &DL; |
| 168 | const TargetTransformInfo &TTI; |
| 169 | TargetLibraryInfo *TLI = nullptr; |
| 170 | AssumptionCache &AC; |
| 171 | OptimizationRemarkEmitter *ORE; |
| 172 | HardwareLoopOptions &Opts; |
| 173 | bool MadeChange = false; |
| 174 | }; |
| 175 | |
| 176 | class HardwareLoop { |
| 177 | // Expand the trip count scev into a value that we can use. |
| 178 | Value *InitLoopCount(); |
| 179 | |
| 180 | // Insert the set_loop_iteration intrinsic. |
| 181 | Value *InsertIterationSetup(Value *LoopCountInit); |
| 182 | |
| 183 | // Insert the loop_decrement intrinsic. |
| 184 | void InsertLoopDec(); |
| 185 | |
| 186 | // Insert the loop_decrement_reg intrinsic. |
| 187 | Instruction *InsertLoopRegDec(Value *EltsRem); |
| 188 | |
| 189 | // If the target requires the counter value to be updated in the loop, |
| 190 | // insert a phi to hold the value. The intended purpose is for use by |
| 191 | // loop_decrement_reg. |
| 192 | PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem); |
| 193 | |
| 194 | // Create a new cmp, that checks the returned value of loop_decrement*, |
| 195 | // and update the exit branch to use it. |
| 196 | void UpdateBranch(Value *EltsRem); |
| 197 | |
| 198 | public: |
| 199 | (HardwareLoopInfo &Info, ScalarEvolution &SE, |
| 200 | const DataLayout &DL, |
| 201 | OptimizationRemarkEmitter *ORE, |
| 202 | HardwareLoopOptions &Opts) : |
| 203 | SE(SE), DL(DL), ORE(ORE), Opts(Opts), L(Info.L), M(L->getHeader()->getModule()), |
| 204 | ExitCount(Info.ExitCount), |
| 205 | CountType(Info.CountType), |
| 206 | ExitBranch(Info.ExitBranch), |
| 207 | LoopDecrement(Info.LoopDecrement), |
| 208 | UsePHICounter(Info.CounterInReg), |
| 209 | UseLoopGuard(Info.PerformEntryTest) { } |
| 210 | |
| 211 | void Create(); |
| 212 | |
| 213 | private: |
| 214 | ScalarEvolution &SE; |
| 215 | const DataLayout &DL; |
| 216 | OptimizationRemarkEmitter *ORE = nullptr; |
| 217 | HardwareLoopOptions &Opts; |
| 218 | Loop *L = nullptr; |
| 219 | Module *M = nullptr; |
| 220 | const SCEV *ExitCount = nullptr; |
| 221 | Type *CountType = nullptr; |
| 222 | BranchInst *ExitBranch = nullptr; |
| 223 | Value *LoopDecrement = nullptr; |
| 224 | bool UsePHICounter = false; |
| 225 | bool UseLoopGuard = false; |
| 226 | BasicBlock *BeginBB = nullptr; |
| 227 | }; |
| 228 | } |
| 229 | |
| 230 | char HardwareLoopsLegacy::ID = 0; |
| 231 | |
| 232 | bool HardwareLoopsLegacy::runOnFunction(Function &F) { |
| 233 | if (skipFunction(F)) |
| 234 | return false; |
| 235 | |
| 236 | LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n" ); |
| 237 | |
| 238 | auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
| 239 | auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
| 240 | auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
| 241 | auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
| 242 | auto &DL = F.getDataLayout(); |
| 243 | auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
| 244 | auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); |
| 245 | auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; |
| 246 | auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); |
| 247 | bool PreserveLCSSA = mustPreserveAnalysisID(AID&: LCSSAID); |
| 248 | |
| 249 | HardwareLoopOptions Opts; |
| 250 | if (ForceHardwareLoops.getNumOccurrences()) |
| 251 | Opts.setForce(ForceHardwareLoops); |
| 252 | if (ForceHardwareLoopPHI.getNumOccurrences()) |
| 253 | Opts.setForcePhi(ForceHardwareLoopPHI); |
| 254 | if (ForceNestedLoop.getNumOccurrences()) |
| 255 | Opts.setForceNested(ForceNestedLoop); |
| 256 | if (ForceGuardLoopEntry.getNumOccurrences()) |
| 257 | Opts.setForceGuard(ForceGuardLoopEntry); |
| 258 | if (LoopDecrement.getNumOccurrences()) |
| 259 | Opts.setDecrement(LoopDecrement); |
| 260 | if (CounterBitWidth.getNumOccurrences()) |
| 261 | Opts.setCounterBitwidth(CounterBitWidth); |
| 262 | |
| 263 | HardwareLoopsImpl Impl(SE, LI, PreserveLCSSA, DT, DL, TTI, TLI, AC, ORE, |
| 264 | Opts); |
| 265 | return Impl.run(F); |
| 266 | } |
| 267 | |
| 268 | PreservedAnalyses HardwareLoopsPass::run(Function &F, |
| 269 | FunctionAnalysisManager &AM) { |
| 270 | auto &LI = AM.getResult<LoopAnalysis>(IR&: F); |
| 271 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
| 272 | auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
| 273 | auto &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
| 274 | auto *TLI = &AM.getResult<TargetLibraryAnalysis>(IR&: F); |
| 275 | auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F); |
| 276 | auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
| 277 | auto &DL = F.getDataLayout(); |
| 278 | |
| 279 | HardwareLoopsImpl Impl(SE, LI, true, DT, DL, TTI, TLI, AC, ORE, Opts); |
| 280 | bool Changed = Impl.run(F); |
| 281 | if (!Changed) |
| 282 | return PreservedAnalyses::all(); |
| 283 | |
| 284 | PreservedAnalyses PA; |
| 285 | PA.preserve<LoopAnalysis>(); |
| 286 | PA.preserve<ScalarEvolutionAnalysis>(); |
| 287 | PA.preserve<DominatorTreeAnalysis>(); |
| 288 | PA.preserve<BranchProbabilityAnalysis>(); |
| 289 | return PA; |
| 290 | } |
| 291 | |
| 292 | bool HardwareLoopsImpl::run(Function &F) { |
| 293 | LLVMContext &Ctx = F.getContext(); |
| 294 | for (Loop *L : LI) |
| 295 | if (L->isOutermost()) |
| 296 | TryConvertLoop(L, Ctx); |
| 297 | return MadeChange; |
| 298 | } |
| 299 | |
| 300 | // Return true if the search should stop, which will be when an inner loop is |
| 301 | // converted and the parent loop doesn't support containing a hardware loop. |
| 302 | bool HardwareLoopsImpl::TryConvertLoop(Loop *L, LLVMContext &Ctx) { |
| 303 | // Process nested loops first. |
| 304 | bool AnyChanged = false; |
| 305 | for (Loop *SL : *L) |
| 306 | AnyChanged |= TryConvertLoop(L: SL, Ctx); |
| 307 | if (AnyChanged) { |
| 308 | reportHWLoopFailure(Msg: "nested hardware-loops not supported" , ORETag: "HWLoopNested" , |
| 309 | ORE, TheLoop: L); |
| 310 | return true; // Stop search. |
| 311 | } |
| 312 | |
| 313 | LLVM_DEBUG(dbgs() << "HWLoops: Loop " << L->getHeader()->getName() << "\n" ); |
| 314 | |
| 315 | HardwareLoopInfo HWLoopInfo(L); |
| 316 | if (!HWLoopInfo.canAnalyze(LI)) { |
| 317 | reportHWLoopFailure(Msg: "cannot analyze loop, irreducible control flow" , |
| 318 | ORETag: "HWLoopCannotAnalyze" , ORE, TheLoop: L); |
| 319 | return false; |
| 320 | } |
| 321 | |
| 322 | if (!Opts.Force && |
| 323 | !TTI.isHardwareLoopProfitable(L, SE, AC, LibInfo: TLI, HWLoopInfo)) { |
| 324 | reportHWLoopFailure(Msg: "it's not profitable to create a hardware-loop" , |
| 325 | ORETag: "HWLoopNotProfitable" , ORE, TheLoop: L); |
| 326 | return false; |
| 327 | } |
| 328 | |
| 329 | // Allow overriding of the counter width and loop decrement value. |
| 330 | if (Opts.Bitwidth.has_value()) { |
| 331 | HWLoopInfo.CountType = IntegerType::get(C&: Ctx, NumBits: Opts.Bitwidth.value()); |
| 332 | } |
| 333 | |
| 334 | if (Opts.Decrement.has_value()) |
| 335 | HWLoopInfo.LoopDecrement = |
| 336 | ConstantInt::get(Ty: HWLoopInfo.CountType, V: Opts.Decrement.value()); |
| 337 | |
| 338 | MadeChange |= TryConvertLoop(HWLoopInfo); |
| 339 | return MadeChange && (!HWLoopInfo.IsNestingLegal && !Opts.ForceNested); |
| 340 | } |
| 341 | |
| 342 | bool HardwareLoopsImpl::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) { |
| 343 | |
| 344 | Loop *L = HWLoopInfo.L; |
| 345 | LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L); |
| 346 | |
| 347 | if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT, ForceNestedLoop: Opts.getForceNested(), |
| 348 | ForceHardwareLoopPHI: Opts.getForcePhi())) { |
| 349 | // TODO: there can be many reasons a loop is not considered a |
| 350 | // candidate, so we should let isHardwareLoopCandidate fill in the |
| 351 | // reason and then report a better message here. |
| 352 | reportHWLoopFailure(Msg: "loop is not a candidate" , ORETag: "HWLoopNoCandidate" , ORE, TheLoop: L); |
| 353 | return false; |
| 354 | } |
| 355 | |
| 356 | assert( |
| 357 | (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.ExitCount) && |
| 358 | "Hardware Loop must have set exit info." ); |
| 359 | |
| 360 | BasicBlock * = L->getLoopPreheader(); |
| 361 | |
| 362 | // If we don't have a preheader, then insert one. |
| 363 | if (!Preheader) |
| 364 | Preheader = InsertPreheaderForLoop(L, DT: &DT, LI: &LI, MSSAU: nullptr, PreserveLCSSA); |
| 365 | if (!Preheader) |
| 366 | return false; |
| 367 | |
| 368 | HardwareLoop HWLoop(HWLoopInfo, SE, DL, ORE, Opts); |
| 369 | HWLoop.Create(); |
| 370 | ++NumHWLoops; |
| 371 | return true; |
| 372 | } |
| 373 | |
| 374 | void HardwareLoop::Create() { |
| 375 | LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n" ); |
| 376 | |
| 377 | Value *LoopCountInit = InitLoopCount(); |
| 378 | if (!LoopCountInit) { |
| 379 | reportHWLoopFailure(Msg: "could not safely create a loop count expression" , |
| 380 | ORETag: "HWLoopNotSafe" , ORE, TheLoop: L); |
| 381 | return; |
| 382 | } |
| 383 | |
| 384 | Value *Setup = InsertIterationSetup(LoopCountInit); |
| 385 | |
| 386 | if (UsePHICounter || Opts.ForcePhi) { |
| 387 | Instruction *LoopDec = InsertLoopRegDec(EltsRem: LoopCountInit); |
| 388 | Value *EltsRem = InsertPHICounter(NumElts: Setup, EltsRem: LoopDec); |
| 389 | LoopDec->setOperand(i: 0, Val: EltsRem); |
| 390 | UpdateBranch(EltsRem: LoopDec); |
| 391 | } else |
| 392 | InsertLoopDec(); |
| 393 | |
| 394 | // Run through the basic blocks of the loop and see if any of them have dead |
| 395 | // PHIs that can be removed. |
| 396 | for (auto *I : L->blocks()) |
| 397 | DeleteDeadPHIs(BB: I); |
| 398 | } |
| 399 | |
| 400 | static bool CanGenerateTest(Loop *L, Value *Count) { |
| 401 | BasicBlock * = L->getLoopPreheader(); |
| 402 | if (!Preheader->getSinglePredecessor()) |
| 403 | return false; |
| 404 | |
| 405 | BasicBlock *Pred = Preheader->getSinglePredecessor(); |
| 406 | if (!isa<BranchInst>(Val: Pred->getTerminator())) |
| 407 | return false; |
| 408 | |
| 409 | auto *BI = cast<BranchInst>(Val: Pred->getTerminator()); |
| 410 | if (BI->isUnconditional() || !isa<ICmpInst>(Val: BI->getCondition())) |
| 411 | return false; |
| 412 | |
| 413 | // Check that the icmp is checking for equality of Count and zero and that |
| 414 | // a non-zero value results in entering the loop. |
| 415 | auto ICmp = cast<ICmpInst>(Val: BI->getCondition()); |
| 416 | LLVM_DEBUG(dbgs() << " - Found condition: " << *ICmp << "\n" ); |
| 417 | if (!ICmp->isEquality()) |
| 418 | return false; |
| 419 | |
| 420 | auto IsCompareZero = [](ICmpInst *ICmp, Value *Count, unsigned OpIdx) { |
| 421 | if (auto *Const = dyn_cast<ConstantInt>(Val: ICmp->getOperand(i_nocapture: OpIdx))) |
| 422 | return Const->isZero() && ICmp->getOperand(i_nocapture: OpIdx ^ 1) == Count; |
| 423 | return false; |
| 424 | }; |
| 425 | |
| 426 | // Check if Count is a zext. |
| 427 | Value *CountBefZext = |
| 428 | isa<ZExtInst>(Val: Count) ? cast<ZExtInst>(Val: Count)->getOperand(i_nocapture: 0) : nullptr; |
| 429 | |
| 430 | if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1) && |
| 431 | !IsCompareZero(ICmp, CountBefZext, 0) && |
| 432 | !IsCompareZero(ICmp, CountBefZext, 1)) |
| 433 | return false; |
| 434 | |
| 435 | unsigned SuccIdx = ICmp->getPredicate() == ICmpInst::ICMP_NE ? 0 : 1; |
| 436 | if (BI->getSuccessor(i: SuccIdx) != Preheader) |
| 437 | return false; |
| 438 | |
| 439 | return true; |
| 440 | } |
| 441 | |
| 442 | Value *HardwareLoop::InitLoopCount() { |
| 443 | LLVM_DEBUG(dbgs() << "HWLoops: Initialising loop counter value:\n" ); |
| 444 | // Can we replace a conditional branch with an intrinsic that sets the |
| 445 | // loop counter and tests that is not zero? |
| 446 | |
| 447 | SCEVExpander SCEVE(SE, DL, "loopcnt" ); |
| 448 | if (!ExitCount->getType()->isPointerTy() && |
| 449 | ExitCount->getType() != CountType) |
| 450 | ExitCount = SE.getZeroExtendExpr(Op: ExitCount, Ty: CountType); |
| 451 | |
| 452 | ExitCount = SE.getAddExpr(LHS: ExitCount, RHS: SE.getOne(Ty: CountType)); |
| 453 | |
| 454 | // If we're trying to use the 'test and set' form of the intrinsic, we need |
| 455 | // to replace a conditional branch that is controlling entry to the loop. It |
| 456 | // is likely (guaranteed?) that the preheader has an unconditional branch to |
| 457 | // the loop header, so also check if it has a single predecessor. |
| 458 | if (SE.isLoopEntryGuardedByCond(L, Pred: ICmpInst::ICMP_NE, LHS: ExitCount, |
| 459 | RHS: SE.getZero(Ty: ExitCount->getType()))) { |
| 460 | LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n" ); |
| 461 | if (Opts.ForceGuard) |
| 462 | UseLoopGuard = true; |
| 463 | } else |
| 464 | UseLoopGuard = false; |
| 465 | |
| 466 | BasicBlock *BB = L->getLoopPreheader(); |
| 467 | if (UseLoopGuard && BB->getSinglePredecessor() && |
| 468 | cast<BranchInst>(Val: BB->getTerminator())->isUnconditional()) { |
| 469 | BasicBlock *Predecessor = BB->getSinglePredecessor(); |
| 470 | // If it's not safe to create a while loop then don't force it and create a |
| 471 | // do-while loop instead |
| 472 | if (!SCEVE.isSafeToExpandAt(S: ExitCount, InsertionPoint: Predecessor->getTerminator())) |
| 473 | UseLoopGuard = false; |
| 474 | else |
| 475 | BB = Predecessor; |
| 476 | } |
| 477 | |
| 478 | if (!SCEVE.isSafeToExpandAt(S: ExitCount, InsertionPoint: BB->getTerminator())) { |
| 479 | LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount " |
| 480 | << *ExitCount << "\n" ); |
| 481 | return nullptr; |
| 482 | } |
| 483 | |
| 484 | Value *Count = SCEVE.expandCodeFor(SH: ExitCount, Ty: CountType, |
| 485 | I: BB->getTerminator()); |
| 486 | |
| 487 | // FIXME: We've expanded Count where we hope to insert the counter setting |
| 488 | // intrinsic. But, in the case of the 'test and set' form, we may fallback to |
| 489 | // the just 'set' form and in which case the insertion block is most likely |
| 490 | // different. It means there will be instruction(s) in a block that possibly |
| 491 | // aren't needed. The isLoopEntryGuardedByCond is trying to avoid this issue, |
| 492 | // but it's doesn't appear to work in all cases. |
| 493 | |
| 494 | UseLoopGuard = UseLoopGuard && CanGenerateTest(L, Count); |
| 495 | BeginBB = UseLoopGuard ? BB : L->getLoopPreheader(); |
| 496 | LLVM_DEBUG(dbgs() << " - Loop Count: " << *Count << "\n" |
| 497 | << " - Expanded Count in " << BB->getName() << "\n" |
| 498 | << " - Will insert set counter intrinsic into: " |
| 499 | << BeginBB->getName() << "\n" ); |
| 500 | return Count; |
| 501 | } |
| 502 | |
| 503 | Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) { |
| 504 | IRBuilder<> Builder(BeginBB->getTerminator()); |
| 505 | if (BeginBB->getParent()->getAttributes().hasFnAttr(Kind: Attribute::StrictFP)) |
| 506 | Builder.setIsFPConstrained(true); |
| 507 | Type *Ty = LoopCountInit->getType(); |
| 508 | bool UsePhi = UsePHICounter || Opts.ForcePhi; |
| 509 | Intrinsic::ID ID = UseLoopGuard |
| 510 | ? (UsePhi ? Intrinsic::test_start_loop_iterations |
| 511 | : Intrinsic::test_set_loop_iterations) |
| 512 | : (UsePhi ? Intrinsic::start_loop_iterations |
| 513 | : Intrinsic::set_loop_iterations); |
| 514 | Value *LoopSetup = Builder.CreateIntrinsic(ID, Types: Ty, Args: LoopCountInit); |
| 515 | |
| 516 | // Use the return value of the intrinsic to control the entry of the loop. |
| 517 | if (UseLoopGuard) { |
| 518 | assert((isa<BranchInst>(BeginBB->getTerminator()) && |
| 519 | cast<BranchInst>(BeginBB->getTerminator())->isConditional()) && |
| 520 | "Expected conditional branch" ); |
| 521 | |
| 522 | Value *SetCount = |
| 523 | UsePhi ? Builder.CreateExtractValue(Agg: LoopSetup, Idxs: 1) : LoopSetup; |
| 524 | auto *LoopGuard = cast<BranchInst>(Val: BeginBB->getTerminator()); |
| 525 | LoopGuard->setCondition(SetCount); |
| 526 | if (LoopGuard->getSuccessor(i: 0) != L->getLoopPreheader()) |
| 527 | LoopGuard->swapSuccessors(); |
| 528 | } |
| 529 | LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: " << *LoopSetup |
| 530 | << "\n" ); |
| 531 | if (UsePhi && UseLoopGuard) |
| 532 | LoopSetup = Builder.CreateExtractValue(Agg: LoopSetup, Idxs: 0); |
| 533 | return !UsePhi ? LoopCountInit : LoopSetup; |
| 534 | } |
| 535 | |
| 536 | void HardwareLoop::InsertLoopDec() { |
| 537 | IRBuilder<> CondBuilder(ExitBranch); |
| 538 | if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr( |
| 539 | Kind: Attribute::StrictFP)) |
| 540 | CondBuilder.setIsFPConstrained(true); |
| 541 | |
| 542 | Value *Ops[] = { LoopDecrement }; |
| 543 | Value *NewCond = CondBuilder.CreateIntrinsic(ID: Intrinsic::loop_decrement, |
| 544 | Types: LoopDecrement->getType(), Args: Ops); |
| 545 | Value *OldCond = ExitBranch->getCondition(); |
| 546 | ExitBranch->setCondition(NewCond); |
| 547 | |
| 548 | // The false branch must exit the loop. |
| 549 | if (!L->contains(BB: ExitBranch->getSuccessor(i: 0))) |
| 550 | ExitBranch->swapSuccessors(); |
| 551 | |
| 552 | // The old condition may be dead now, and may have even created a dead PHI |
| 553 | // (the original induction variable). |
| 554 | RecursivelyDeleteTriviallyDeadInstructions(V: OldCond); |
| 555 | |
| 556 | LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n" ); |
| 557 | } |
| 558 | |
| 559 | Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) { |
| 560 | IRBuilder<> CondBuilder(ExitBranch); |
| 561 | if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr( |
| 562 | Kind: Attribute::StrictFP)) |
| 563 | CondBuilder.setIsFPConstrained(true); |
| 564 | |
| 565 | Value *Ops[] = { EltsRem, LoopDecrement }; |
| 566 | Value *Call = CondBuilder.CreateIntrinsic(ID: Intrinsic::loop_decrement_reg, |
| 567 | Types: {EltsRem->getType()}, Args: Ops); |
| 568 | |
| 569 | LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n" ); |
| 570 | return cast<Instruction>(Val: Call); |
| 571 | } |
| 572 | |
| 573 | PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) { |
| 574 | BasicBlock * = L->getLoopPreheader(); |
| 575 | BasicBlock * = L->getHeader(); |
| 576 | BasicBlock *Latch = ExitBranch->getParent(); |
| 577 | IRBuilder<> Builder(Header, Header->getFirstNonPHIIt()); |
| 578 | PHINode *Index = Builder.CreatePHI(Ty: NumElts->getType(), NumReservedValues: 2); |
| 579 | Index->addIncoming(V: NumElts, BB: Preheader); |
| 580 | Index->addIncoming(V: EltsRem, BB: Latch); |
| 581 | LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Index << "\n" ); |
| 582 | return Index; |
| 583 | } |
| 584 | |
| 585 | void HardwareLoop::UpdateBranch(Value *EltsRem) { |
| 586 | IRBuilder<> CondBuilder(ExitBranch); |
| 587 | Value *NewCond = |
| 588 | CondBuilder.CreateICmpNE(LHS: EltsRem, RHS: ConstantInt::get(Ty: EltsRem->getType(), V: 0)); |
| 589 | Value *OldCond = ExitBranch->getCondition(); |
| 590 | ExitBranch->setCondition(NewCond); |
| 591 | |
| 592 | // The false branch must exit the loop. |
| 593 | if (!L->contains(BB: ExitBranch->getSuccessor(i: 0))) |
| 594 | ExitBranch->swapSuccessors(); |
| 595 | |
| 596 | // The old condition may be dead now, and may have even created a dead PHI |
| 597 | // (the original induction variable). |
| 598 | RecursivelyDeleteTriviallyDeadInstructions(V: OldCond); |
| 599 | } |
| 600 | |
| 601 | INITIALIZE_PASS_BEGIN(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false) |
| 602 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) |
| 603 | INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) |
| 604 | INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) |
| 605 | INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) |
| 606 | INITIALIZE_PASS_END(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false) |
| 607 | |
| 608 | FunctionPass *llvm::createHardwareLoopsLegacyPass() { return new HardwareLoopsLegacy(); } |
| 609 | |