1 | //===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// Insert hardware loop intrinsics into loops which are deemed profitable by |
10 | /// the target, by querying TargetTransformInfo. A hardware loop comprises of |
11 | /// two intrinsics: one, outside the loop, to set the loop iteration count and |
12 | /// another, in the exit block, to decrement the counter. The decremented value |
13 | /// can either be carried through the loop via a phi or handled in some opaque |
14 | /// way by the target. |
15 | /// |
16 | //===----------------------------------------------------------------------===// |
17 | |
18 | #include "llvm/CodeGen/HardwareLoops.h" |
19 | #include "llvm/ADT/Statistic.h" |
20 | #include "llvm/Analysis/AssumptionCache.h" |
21 | #include "llvm/Analysis/BranchProbabilityInfo.h" |
22 | #include "llvm/Analysis/LoopInfo.h" |
23 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
24 | #include "llvm/Analysis/ScalarEvolution.h" |
25 | #include "llvm/Analysis/TargetLibraryInfo.h" |
26 | #include "llvm/Analysis/TargetTransformInfo.h" |
27 | #include "llvm/CodeGen/Passes.h" |
28 | #include "llvm/IR/BasicBlock.h" |
29 | #include "llvm/IR/Constants.h" |
30 | #include "llvm/IR/Dominators.h" |
31 | #include "llvm/IR/IRBuilder.h" |
32 | #include "llvm/IR/Instructions.h" |
33 | #include "llvm/IR/IntrinsicInst.h" |
34 | #include "llvm/IR/Value.h" |
35 | #include "llvm/InitializePasses.h" |
36 | #include "llvm/Pass.h" |
37 | #include "llvm/PassRegistry.h" |
38 | #include "llvm/Support/CommandLine.h" |
39 | #include "llvm/Support/Debug.h" |
40 | #include "llvm/Transforms/Utils.h" |
41 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
42 | #include "llvm/Transforms/Utils/Local.h" |
43 | #include "llvm/Transforms/Utils/LoopUtils.h" |
44 | #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
45 | |
46 | #define DEBUG_TYPE "hardware-loops" |
47 | |
48 | #define HW_LOOPS_NAME "Hardware Loop Insertion" |
49 | |
50 | using namespace llvm; |
51 | |
52 | static cl::opt<bool> |
53 | ForceHardwareLoops("force-hardware-loops" , cl::Hidden, cl::init(Val: false), |
54 | cl::desc("Force hardware loops intrinsics to be inserted" )); |
55 | |
56 | static cl::opt<bool> |
57 | ForceHardwareLoopPHI( |
58 | "force-hardware-loop-phi" , cl::Hidden, cl::init(Val: false), |
59 | cl::desc("Force hardware loop counter to be updated through a phi" )); |
60 | |
61 | static cl::opt<bool> |
62 | ForceNestedLoop("force-nested-hardware-loop" , cl::Hidden, cl::init(Val: false), |
63 | cl::desc("Force allowance of nested hardware loops" )); |
64 | |
65 | static cl::opt<unsigned> |
66 | LoopDecrement("hardware-loop-decrement" , cl::Hidden, cl::init(Val: 1), |
67 | cl::desc("Set the loop decrement value" )); |
68 | |
69 | static cl::opt<unsigned> |
70 | CounterBitWidth("hardware-loop-counter-bitwidth" , cl::Hidden, cl::init(Val: 32), |
71 | cl::desc("Set the loop counter bitwidth" )); |
72 | |
73 | static cl::opt<bool> |
74 | ForceGuardLoopEntry( |
75 | "force-hardware-loop-guard" , cl::Hidden, cl::init(Val: false), |
76 | cl::desc("Force generation of loop guard intrinsic" )); |
77 | |
78 | STATISTIC(NumHWLoops, "Number of loops converted to hardware loops" ); |
79 | |
80 | #ifndef NDEBUG |
81 | static void debugHWLoopFailure(const StringRef DebugMsg, |
82 | Instruction *I) { |
83 | dbgs() << "HWLoops: " << DebugMsg; |
84 | if (I) |
85 | dbgs() << ' ' << *I; |
86 | else |
87 | dbgs() << '.'; |
88 | dbgs() << '\n'; |
89 | } |
90 | #endif |
91 | |
92 | static OptimizationRemarkAnalysis |
93 | createHWLoopAnalysis(StringRef , Loop *L, Instruction *I) { |
94 | Value *CodeRegion = L->getHeader(); |
95 | DebugLoc DL = L->getStartLoc(); |
96 | |
97 | if (I) { |
98 | CodeRegion = I->getParent(); |
99 | // If there is no debug location attached to the instruction, revert back to |
100 | // using the loop's. |
101 | if (I->getDebugLoc()) |
102 | DL = I->getDebugLoc(); |
103 | } |
104 | |
105 | OptimizationRemarkAnalysis R(DEBUG_TYPE, RemarkName, DL, CodeRegion); |
106 | R << "hardware-loop not created: " ; |
107 | return R; |
108 | } |
109 | |
110 | namespace { |
111 | |
112 | void (const StringRef Msg, const StringRef ORETag, |
113 | OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr) { |
114 | LLVM_DEBUG(debugHWLoopFailure(Msg, I)); |
115 | ORE->emit(OptDiag&: createHWLoopAnalysis(RemarkName: ORETag, L: TheLoop, I) << Msg); |
116 | } |
117 | |
118 | using TTI = TargetTransformInfo; |
119 | |
120 | class HardwareLoopsLegacy : public FunctionPass { |
121 | public: |
122 | static char ID; |
123 | |
124 | HardwareLoopsLegacy() : FunctionPass(ID) { |
125 | initializeHardwareLoopsLegacyPass(*PassRegistry::getPassRegistry()); |
126 | } |
127 | |
128 | bool runOnFunction(Function &F) override; |
129 | |
130 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
131 | AU.addRequired<LoopInfoWrapperPass>(); |
132 | AU.addPreserved<LoopInfoWrapperPass>(); |
133 | AU.addRequired<DominatorTreeWrapperPass>(); |
134 | AU.addPreserved<DominatorTreeWrapperPass>(); |
135 | AU.addRequired<ScalarEvolutionWrapperPass>(); |
136 | AU.addPreserved<ScalarEvolutionWrapperPass>(); |
137 | AU.addRequired<AssumptionCacheTracker>(); |
138 | AU.addRequired<TargetTransformInfoWrapperPass>(); |
139 | AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
140 | AU.addPreserved<BranchProbabilityInfoWrapperPass>(); |
141 | } |
142 | }; |
143 | |
144 | class HardwareLoopsImpl { |
145 | public: |
146 | HardwareLoopsImpl(ScalarEvolution &SE, LoopInfo &LI, bool PreserveLCSSA, |
147 | DominatorTree &DT, const DataLayout &DL, |
148 | const TargetTransformInfo &TTI, TargetLibraryInfo *TLI, |
149 | AssumptionCache &AC, OptimizationRemarkEmitter *ORE, |
150 | HardwareLoopOptions &Opts) |
151 | : SE(SE), LI(LI), PreserveLCSSA(PreserveLCSSA), DT(DT), DL(DL), TTI(TTI), |
152 | TLI(TLI), AC(AC), ORE(ORE), Opts(Opts) { } |
153 | |
154 | bool run(Function &F); |
155 | |
156 | private: |
157 | // Try to convert the given Loop into a hardware loop. |
158 | bool TryConvertLoop(Loop *L, LLVMContext &Ctx); |
159 | |
160 | // Given that the target believes the loop to be profitable, try to |
161 | // convert it. |
162 | bool TryConvertLoop(HardwareLoopInfo &HWLoopInfo); |
163 | |
164 | ScalarEvolution &SE; |
165 | LoopInfo &LI; |
166 | bool PreserveLCSSA; |
167 | DominatorTree &DT; |
168 | const DataLayout &DL; |
169 | const TargetTransformInfo &TTI; |
170 | TargetLibraryInfo *TLI = nullptr; |
171 | AssumptionCache &AC; |
172 | OptimizationRemarkEmitter *ORE; |
173 | HardwareLoopOptions &Opts; |
174 | bool MadeChange = false; |
175 | }; |
176 | |
177 | class HardwareLoop { |
178 | // Expand the trip count scev into a value that we can use. |
179 | Value *InitLoopCount(); |
180 | |
181 | // Insert the set_loop_iteration intrinsic. |
182 | Value *InsertIterationSetup(Value *LoopCountInit); |
183 | |
184 | // Insert the loop_decrement intrinsic. |
185 | void InsertLoopDec(); |
186 | |
187 | // Insert the loop_decrement_reg intrinsic. |
188 | Instruction *InsertLoopRegDec(Value *EltsRem); |
189 | |
190 | // If the target requires the counter value to be updated in the loop, |
191 | // insert a phi to hold the value. The intended purpose is for use by |
192 | // loop_decrement_reg. |
193 | PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem); |
194 | |
195 | // Create a new cmp, that checks the returned value of loop_decrement*, |
196 | // and update the exit branch to use it. |
197 | void UpdateBranch(Value *EltsRem); |
198 | |
199 | public: |
200 | (HardwareLoopInfo &Info, ScalarEvolution &SE, |
201 | const DataLayout &DL, |
202 | OptimizationRemarkEmitter *ORE, |
203 | HardwareLoopOptions &Opts) : |
204 | SE(SE), DL(DL), ORE(ORE), Opts(Opts), L(Info.L), M(L->getHeader()->getModule()), |
205 | ExitCount(Info.ExitCount), |
206 | CountType(Info.CountType), |
207 | ExitBranch(Info.ExitBranch), |
208 | LoopDecrement(Info.LoopDecrement), |
209 | UsePHICounter(Info.CounterInReg), |
210 | UseLoopGuard(Info.PerformEntryTest) { } |
211 | |
212 | void Create(); |
213 | |
214 | private: |
215 | ScalarEvolution &SE; |
216 | const DataLayout &DL; |
217 | OptimizationRemarkEmitter *ORE = nullptr; |
218 | HardwareLoopOptions &Opts; |
219 | Loop *L = nullptr; |
220 | Module *M = nullptr; |
221 | const SCEV *ExitCount = nullptr; |
222 | Type *CountType = nullptr; |
223 | BranchInst *ExitBranch = nullptr; |
224 | Value *LoopDecrement = nullptr; |
225 | bool UsePHICounter = false; |
226 | bool UseLoopGuard = false; |
227 | BasicBlock *BeginBB = nullptr; |
228 | }; |
229 | } |
230 | |
231 | char HardwareLoopsLegacy::ID = 0; |
232 | |
233 | bool HardwareLoopsLegacy::runOnFunction(Function &F) { |
234 | if (skipFunction(F)) |
235 | return false; |
236 | |
237 | LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n" ); |
238 | |
239 | auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
240 | auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
241 | auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
242 | auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
243 | auto &DL = F.getDataLayout(); |
244 | auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
245 | auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); |
246 | auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; |
247 | auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); |
248 | bool PreserveLCSSA = mustPreserveAnalysisID(AID&: LCSSAID); |
249 | |
250 | HardwareLoopOptions Opts; |
251 | if (ForceHardwareLoops.getNumOccurrences()) |
252 | Opts.setForce(ForceHardwareLoops); |
253 | if (ForceHardwareLoopPHI.getNumOccurrences()) |
254 | Opts.setForcePhi(ForceHardwareLoopPHI); |
255 | if (ForceNestedLoop.getNumOccurrences()) |
256 | Opts.setForceNested(ForceNestedLoop); |
257 | if (ForceGuardLoopEntry.getNumOccurrences()) |
258 | Opts.setForceGuard(ForceGuardLoopEntry); |
259 | if (LoopDecrement.getNumOccurrences()) |
260 | Opts.setDecrement(LoopDecrement); |
261 | if (CounterBitWidth.getNumOccurrences()) |
262 | Opts.setCounterBitwidth(CounterBitWidth); |
263 | |
264 | HardwareLoopsImpl Impl(SE, LI, PreserveLCSSA, DT, DL, TTI, TLI, AC, ORE, |
265 | Opts); |
266 | return Impl.run(F); |
267 | } |
268 | |
269 | PreservedAnalyses HardwareLoopsPass::run(Function &F, |
270 | FunctionAnalysisManager &AM) { |
271 | auto &LI = AM.getResult<LoopAnalysis>(IR&: F); |
272 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
273 | auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
274 | auto &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
275 | auto *TLI = &AM.getResult<TargetLibraryAnalysis>(IR&: F); |
276 | auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F); |
277 | auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
278 | auto &DL = F.getDataLayout(); |
279 | |
280 | HardwareLoopsImpl Impl(SE, LI, true, DT, DL, TTI, TLI, AC, ORE, Opts); |
281 | bool Changed = Impl.run(F); |
282 | if (!Changed) |
283 | return PreservedAnalyses::all(); |
284 | |
285 | PreservedAnalyses PA; |
286 | PA.preserve<LoopAnalysis>(); |
287 | PA.preserve<ScalarEvolutionAnalysis>(); |
288 | PA.preserve<DominatorTreeAnalysis>(); |
289 | PA.preserve<BranchProbabilityAnalysis>(); |
290 | return PA; |
291 | } |
292 | |
293 | bool HardwareLoopsImpl::run(Function &F) { |
294 | LLVMContext &Ctx = F.getContext(); |
295 | for (Loop *L : LI) |
296 | if (L->isOutermost()) |
297 | TryConvertLoop(L, Ctx); |
298 | return MadeChange; |
299 | } |
300 | |
301 | // Return true if the search should stop, which will be when an inner loop is |
302 | // converted and the parent loop doesn't support containing a hardware loop. |
303 | bool HardwareLoopsImpl::TryConvertLoop(Loop *L, LLVMContext &Ctx) { |
304 | // Process nested loops first. |
305 | bool AnyChanged = false; |
306 | for (Loop *SL : *L) |
307 | AnyChanged |= TryConvertLoop(L: SL, Ctx); |
308 | if (AnyChanged) { |
309 | reportHWLoopFailure(Msg: "nested hardware-loops not supported" , ORETag: "HWLoopNested" , |
310 | ORE, TheLoop: L); |
311 | return true; // Stop search. |
312 | } |
313 | |
314 | LLVM_DEBUG(dbgs() << "HWLoops: Loop " << L->getHeader()->getName() << "\n" ); |
315 | |
316 | HardwareLoopInfo HWLoopInfo(L); |
317 | if (!HWLoopInfo.canAnalyze(LI)) { |
318 | reportHWLoopFailure(Msg: "cannot analyze loop, irreducible control flow" , |
319 | ORETag: "HWLoopCannotAnalyze" , ORE, TheLoop: L); |
320 | return false; |
321 | } |
322 | |
323 | if (!Opts.Force && |
324 | !TTI.isHardwareLoopProfitable(L, SE, AC, LibInfo: TLI, HWLoopInfo)) { |
325 | reportHWLoopFailure(Msg: "it's not profitable to create a hardware-loop" , |
326 | ORETag: "HWLoopNotProfitable" , ORE, TheLoop: L); |
327 | return false; |
328 | } |
329 | |
330 | // Allow overriding of the counter width and loop decrement value. |
331 | if (Opts.Bitwidth.has_value()) { |
332 | HWLoopInfo.CountType = IntegerType::get(C&: Ctx, NumBits: Opts.Bitwidth.value()); |
333 | } |
334 | |
335 | if (Opts.Decrement.has_value()) |
336 | HWLoopInfo.LoopDecrement = |
337 | ConstantInt::get(Ty: HWLoopInfo.CountType, V: Opts.Decrement.value()); |
338 | |
339 | MadeChange |= TryConvertLoop(HWLoopInfo); |
340 | return MadeChange && (!HWLoopInfo.IsNestingLegal && !Opts.ForceNested); |
341 | } |
342 | |
343 | bool HardwareLoopsImpl::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) { |
344 | |
345 | Loop *L = HWLoopInfo.L; |
346 | LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L); |
347 | |
348 | if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT, ForceNestedLoop: Opts.getForceNested(), |
349 | ForceHardwareLoopPHI: Opts.getForcePhi())) { |
350 | // TODO: there can be many reasons a loop is not considered a |
351 | // candidate, so we should let isHardwareLoopCandidate fill in the |
352 | // reason and then report a better message here. |
353 | reportHWLoopFailure(Msg: "loop is not a candidate" , ORETag: "HWLoopNoCandidate" , ORE, TheLoop: L); |
354 | return false; |
355 | } |
356 | |
357 | assert( |
358 | (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.ExitCount) && |
359 | "Hardware Loop must have set exit info." ); |
360 | |
361 | BasicBlock * = L->getLoopPreheader(); |
362 | |
363 | // If we don't have a preheader, then insert one. |
364 | if (!Preheader) |
365 | Preheader = InsertPreheaderForLoop(L, DT: &DT, LI: &LI, MSSAU: nullptr, PreserveLCSSA); |
366 | if (!Preheader) |
367 | return false; |
368 | |
369 | HardwareLoop HWLoop(HWLoopInfo, SE, DL, ORE, Opts); |
370 | HWLoop.Create(); |
371 | ++NumHWLoops; |
372 | return true; |
373 | } |
374 | |
375 | void HardwareLoop::Create() { |
376 | LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n" ); |
377 | |
378 | Value *LoopCountInit = InitLoopCount(); |
379 | if (!LoopCountInit) { |
380 | reportHWLoopFailure(Msg: "could not safely create a loop count expression" , |
381 | ORETag: "HWLoopNotSafe" , ORE, TheLoop: L); |
382 | return; |
383 | } |
384 | |
385 | Value *Setup = InsertIterationSetup(LoopCountInit); |
386 | |
387 | if (UsePHICounter || Opts.ForcePhi) { |
388 | Instruction *LoopDec = InsertLoopRegDec(EltsRem: LoopCountInit); |
389 | Value *EltsRem = InsertPHICounter(NumElts: Setup, EltsRem: LoopDec); |
390 | LoopDec->setOperand(i: 0, Val: EltsRem); |
391 | UpdateBranch(EltsRem: LoopDec); |
392 | } else |
393 | InsertLoopDec(); |
394 | |
395 | // Run through the basic blocks of the loop and see if any of them have dead |
396 | // PHIs that can be removed. |
397 | for (auto *I : L->blocks()) |
398 | DeleteDeadPHIs(BB: I); |
399 | } |
400 | |
401 | static bool CanGenerateTest(Loop *L, Value *Count) { |
402 | BasicBlock * = L->getLoopPreheader(); |
403 | if (!Preheader->getSinglePredecessor()) |
404 | return false; |
405 | |
406 | BasicBlock *Pred = Preheader->getSinglePredecessor(); |
407 | if (!isa<BranchInst>(Val: Pred->getTerminator())) |
408 | return false; |
409 | |
410 | auto *BI = cast<BranchInst>(Val: Pred->getTerminator()); |
411 | if (BI->isUnconditional() || !isa<ICmpInst>(Val: BI->getCondition())) |
412 | return false; |
413 | |
414 | // Check that the icmp is checking for equality of Count and zero and that |
415 | // a non-zero value results in entering the loop. |
416 | auto ICmp = cast<ICmpInst>(Val: BI->getCondition()); |
417 | LLVM_DEBUG(dbgs() << " - Found condition: " << *ICmp << "\n" ); |
418 | if (!ICmp->isEquality()) |
419 | return false; |
420 | |
421 | auto IsCompareZero = [](ICmpInst *ICmp, Value *Count, unsigned OpIdx) { |
422 | if (auto *Const = dyn_cast<ConstantInt>(Val: ICmp->getOperand(i_nocapture: OpIdx))) |
423 | return Const->isZero() && ICmp->getOperand(i_nocapture: OpIdx ^ 1) == Count; |
424 | return false; |
425 | }; |
426 | |
427 | // Check if Count is a zext. |
428 | Value *CountBefZext = |
429 | isa<ZExtInst>(Val: Count) ? cast<ZExtInst>(Val: Count)->getOperand(i_nocapture: 0) : nullptr; |
430 | |
431 | if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1) && |
432 | !IsCompareZero(ICmp, CountBefZext, 0) && |
433 | !IsCompareZero(ICmp, CountBefZext, 1)) |
434 | return false; |
435 | |
436 | unsigned SuccIdx = ICmp->getPredicate() == ICmpInst::ICMP_NE ? 0 : 1; |
437 | if (BI->getSuccessor(i: SuccIdx) != Preheader) |
438 | return false; |
439 | |
440 | return true; |
441 | } |
442 | |
443 | Value *HardwareLoop::InitLoopCount() { |
444 | LLVM_DEBUG(dbgs() << "HWLoops: Initialising loop counter value:\n" ); |
445 | // Can we replace a conditional branch with an intrinsic that sets the |
446 | // loop counter and tests that is not zero? |
447 | |
448 | SCEVExpander SCEVE(SE, DL, "loopcnt" ); |
449 | if (!ExitCount->getType()->isPointerTy() && |
450 | ExitCount->getType() != CountType) |
451 | ExitCount = SE.getZeroExtendExpr(Op: ExitCount, Ty: CountType); |
452 | |
453 | ExitCount = SE.getAddExpr(LHS: ExitCount, RHS: SE.getOne(Ty: CountType)); |
454 | |
455 | // If we're trying to use the 'test and set' form of the intrinsic, we need |
456 | // to replace a conditional branch that is controlling entry to the loop. It |
457 | // is likely (guaranteed?) that the preheader has an unconditional branch to |
458 | // the loop header, so also check if it has a single predecessor. |
459 | if (SE.isLoopEntryGuardedByCond(L, Pred: ICmpInst::ICMP_NE, LHS: ExitCount, |
460 | RHS: SE.getZero(Ty: ExitCount->getType()))) { |
461 | LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n" ); |
462 | if (Opts.ForceGuard) |
463 | UseLoopGuard = true; |
464 | } else |
465 | UseLoopGuard = false; |
466 | |
467 | BasicBlock *BB = L->getLoopPreheader(); |
468 | if (UseLoopGuard && BB->getSinglePredecessor() && |
469 | cast<BranchInst>(Val: BB->getTerminator())->isUnconditional()) { |
470 | BasicBlock *Predecessor = BB->getSinglePredecessor(); |
471 | // If it's not safe to create a while loop then don't force it and create a |
472 | // do-while loop instead |
473 | if (!SCEVE.isSafeToExpandAt(S: ExitCount, InsertionPoint: Predecessor->getTerminator())) |
474 | UseLoopGuard = false; |
475 | else |
476 | BB = Predecessor; |
477 | } |
478 | |
479 | if (!SCEVE.isSafeToExpandAt(S: ExitCount, InsertionPoint: BB->getTerminator())) { |
480 | LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount " |
481 | << *ExitCount << "\n" ); |
482 | return nullptr; |
483 | } |
484 | |
485 | Value *Count = SCEVE.expandCodeFor(SH: ExitCount, Ty: CountType, |
486 | I: BB->getTerminator()); |
487 | |
488 | // FIXME: We've expanded Count where we hope to insert the counter setting |
489 | // intrinsic. But, in the case of the 'test and set' form, we may fallback to |
490 | // the just 'set' form and in which case the insertion block is most likely |
491 | // different. It means there will be instruction(s) in a block that possibly |
492 | // aren't needed. The isLoopEntryGuardedByCond is trying to avoid this issue, |
493 | // but it's doesn't appear to work in all cases. |
494 | |
495 | UseLoopGuard = UseLoopGuard && CanGenerateTest(L, Count); |
496 | BeginBB = UseLoopGuard ? BB : L->getLoopPreheader(); |
497 | LLVM_DEBUG(dbgs() << " - Loop Count: " << *Count << "\n" |
498 | << " - Expanded Count in " << BB->getName() << "\n" |
499 | << " - Will insert set counter intrinsic into: " |
500 | << BeginBB->getName() << "\n" ); |
501 | return Count; |
502 | } |
503 | |
504 | Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) { |
505 | IRBuilder<> Builder(BeginBB->getTerminator()); |
506 | if (BeginBB->getParent()->getAttributes().hasFnAttr(Kind: Attribute::StrictFP)) |
507 | Builder.setIsFPConstrained(true); |
508 | Type *Ty = LoopCountInit->getType(); |
509 | bool UsePhi = UsePHICounter || Opts.ForcePhi; |
510 | Intrinsic::ID ID = UseLoopGuard |
511 | ? (UsePhi ? Intrinsic::test_start_loop_iterations |
512 | : Intrinsic::test_set_loop_iterations) |
513 | : (UsePhi ? Intrinsic::start_loop_iterations |
514 | : Intrinsic::set_loop_iterations); |
515 | Function *LoopIter = Intrinsic::getDeclaration(M, id: ID, Tys: Ty); |
516 | Value *LoopSetup = Builder.CreateCall(Callee: LoopIter, Args: LoopCountInit); |
517 | |
518 | // Use the return value of the intrinsic to control the entry of the loop. |
519 | if (UseLoopGuard) { |
520 | assert((isa<BranchInst>(BeginBB->getTerminator()) && |
521 | cast<BranchInst>(BeginBB->getTerminator())->isConditional()) && |
522 | "Expected conditional branch" ); |
523 | |
524 | Value *SetCount = |
525 | UsePhi ? Builder.CreateExtractValue(Agg: LoopSetup, Idxs: 1) : LoopSetup; |
526 | auto *LoopGuard = cast<BranchInst>(Val: BeginBB->getTerminator()); |
527 | LoopGuard->setCondition(SetCount); |
528 | if (LoopGuard->getSuccessor(i: 0) != L->getLoopPreheader()) |
529 | LoopGuard->swapSuccessors(); |
530 | } |
531 | LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: " << *LoopSetup |
532 | << "\n" ); |
533 | if (UsePhi && UseLoopGuard) |
534 | LoopSetup = Builder.CreateExtractValue(Agg: LoopSetup, Idxs: 0); |
535 | return !UsePhi ? LoopCountInit : LoopSetup; |
536 | } |
537 | |
538 | void HardwareLoop::InsertLoopDec() { |
539 | IRBuilder<> CondBuilder(ExitBranch); |
540 | if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr( |
541 | Kind: Attribute::StrictFP)) |
542 | CondBuilder.setIsFPConstrained(true); |
543 | |
544 | Function *DecFunc = |
545 | Intrinsic::getDeclaration(M, id: Intrinsic::loop_decrement, |
546 | Tys: LoopDecrement->getType()); |
547 | Value *Ops[] = { LoopDecrement }; |
548 | Value *NewCond = CondBuilder.CreateCall(Callee: DecFunc, Args: Ops); |
549 | Value *OldCond = ExitBranch->getCondition(); |
550 | ExitBranch->setCondition(NewCond); |
551 | |
552 | // The false branch must exit the loop. |
553 | if (!L->contains(BB: ExitBranch->getSuccessor(i: 0))) |
554 | ExitBranch->swapSuccessors(); |
555 | |
556 | // The old condition may be dead now, and may have even created a dead PHI |
557 | // (the original induction variable). |
558 | RecursivelyDeleteTriviallyDeadInstructions(V: OldCond); |
559 | |
560 | LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n" ); |
561 | } |
562 | |
563 | Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) { |
564 | IRBuilder<> CondBuilder(ExitBranch); |
565 | if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr( |
566 | Kind: Attribute::StrictFP)) |
567 | CondBuilder.setIsFPConstrained(true); |
568 | |
569 | Function *DecFunc = |
570 | Intrinsic::getDeclaration(M, id: Intrinsic::loop_decrement_reg, |
571 | Tys: { EltsRem->getType() }); |
572 | Value *Ops[] = { EltsRem, LoopDecrement }; |
573 | Value *Call = CondBuilder.CreateCall(Callee: DecFunc, Args: Ops); |
574 | |
575 | LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n" ); |
576 | return cast<Instruction>(Val: Call); |
577 | } |
578 | |
579 | PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) { |
580 | BasicBlock * = L->getLoopPreheader(); |
581 | BasicBlock * = L->getHeader(); |
582 | BasicBlock *Latch = ExitBranch->getParent(); |
583 | IRBuilder<> Builder(Header, Header->getFirstNonPHIIt()); |
584 | PHINode *Index = Builder.CreatePHI(Ty: NumElts->getType(), NumReservedValues: 2); |
585 | Index->addIncoming(V: NumElts, BB: Preheader); |
586 | Index->addIncoming(V: EltsRem, BB: Latch); |
587 | LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Index << "\n" ); |
588 | return Index; |
589 | } |
590 | |
591 | void HardwareLoop::UpdateBranch(Value *EltsRem) { |
592 | IRBuilder<> CondBuilder(ExitBranch); |
593 | Value *NewCond = |
594 | CondBuilder.CreateICmpNE(LHS: EltsRem, RHS: ConstantInt::get(Ty: EltsRem->getType(), V: 0)); |
595 | Value *OldCond = ExitBranch->getCondition(); |
596 | ExitBranch->setCondition(NewCond); |
597 | |
598 | // The false branch must exit the loop. |
599 | if (!L->contains(BB: ExitBranch->getSuccessor(i: 0))) |
600 | ExitBranch->swapSuccessors(); |
601 | |
602 | // The old condition may be dead now, and may have even created a dead PHI |
603 | // (the original induction variable). |
604 | RecursivelyDeleteTriviallyDeadInstructions(V: OldCond); |
605 | } |
606 | |
607 | INITIALIZE_PASS_BEGIN(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false) |
608 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) |
609 | INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) |
610 | INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) |
611 | INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) |
612 | INITIALIZE_PASS_END(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false) |
613 | |
614 | FunctionPass *llvm::createHardwareLoopsLegacyPass() { return new HardwareLoopsLegacy(); } |
615 | |