1 | //===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | /// \file |
9 | /// Insert hardware loop intrinsics into loops which are deemed profitable by |
10 | /// the target, by querying TargetTransformInfo. A hardware loop comprises of |
11 | /// two intrinsics: one, outside the loop, to set the loop iteration count and |
12 | /// another, in the exit block, to decrement the counter. The decremented value |
13 | /// can either be carried through the loop via a phi or handled in some opaque |
14 | /// way by the target. |
15 | /// |
16 | //===----------------------------------------------------------------------===// |
17 | |
18 | #include "llvm/CodeGen/HardwareLoops.h" |
19 | #include "llvm/ADT/Statistic.h" |
20 | #include "llvm/Analysis/AssumptionCache.h" |
21 | #include "llvm/Analysis/BranchProbabilityInfo.h" |
22 | #include "llvm/Analysis/LoopInfo.h" |
23 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
24 | #include "llvm/Analysis/ScalarEvolution.h" |
25 | #include "llvm/Analysis/TargetLibraryInfo.h" |
26 | #include "llvm/Analysis/TargetTransformInfo.h" |
27 | #include "llvm/CodeGen/Passes.h" |
28 | #include "llvm/IR/BasicBlock.h" |
29 | #include "llvm/IR/Constants.h" |
30 | #include "llvm/IR/Dominators.h" |
31 | #include "llvm/IR/IRBuilder.h" |
32 | #include "llvm/IR/Instructions.h" |
33 | #include "llvm/IR/Value.h" |
34 | #include "llvm/InitializePasses.h" |
35 | #include "llvm/Pass.h" |
36 | #include "llvm/PassRegistry.h" |
37 | #include "llvm/Support/CommandLine.h" |
38 | #include "llvm/Support/Debug.h" |
39 | #include "llvm/Transforms/Utils.h" |
40 | #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
41 | #include "llvm/Transforms/Utils/Local.h" |
42 | #include "llvm/Transforms/Utils/LoopUtils.h" |
43 | #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
44 | |
45 | #define DEBUG_TYPE "hardware-loops" |
46 | |
47 | #define HW_LOOPS_NAME "Hardware Loop Insertion" |
48 | |
49 | using namespace llvm; |
50 | |
51 | static cl::opt<bool> |
52 | ForceHardwareLoops("force-hardware-loops" , cl::Hidden, cl::init(Val: false), |
53 | cl::desc("Force hardware loops intrinsics to be inserted" )); |
54 | |
55 | static cl::opt<bool> |
56 | ForceHardwareLoopPHI( |
57 | "force-hardware-loop-phi" , cl::Hidden, cl::init(Val: false), |
58 | cl::desc("Force hardware loop counter to be updated through a phi" )); |
59 | |
60 | static cl::opt<bool> |
61 | ForceNestedLoop("force-nested-hardware-loop" , cl::Hidden, cl::init(Val: false), |
62 | cl::desc("Force allowance of nested hardware loops" )); |
63 | |
64 | static cl::opt<unsigned> |
65 | LoopDecrement("hardware-loop-decrement" , cl::Hidden, cl::init(Val: 1), |
66 | cl::desc("Set the loop decrement value" )); |
67 | |
68 | static cl::opt<unsigned> |
69 | CounterBitWidth("hardware-loop-counter-bitwidth" , cl::Hidden, cl::init(Val: 32), |
70 | cl::desc("Set the loop counter bitwidth" )); |
71 | |
72 | static cl::opt<bool> |
73 | ForceGuardLoopEntry( |
74 | "force-hardware-loop-guard" , cl::Hidden, cl::init(Val: false), |
75 | cl::desc("Force generation of loop guard intrinsic" )); |
76 | |
77 | STATISTIC(NumHWLoops, "Number of loops converted to hardware loops" ); |
78 | |
79 | #ifndef NDEBUG |
80 | static void debugHWLoopFailure(const StringRef DebugMsg, |
81 | Instruction *I) { |
82 | dbgs() << "HWLoops: " << DebugMsg; |
83 | if (I) |
84 | dbgs() << ' ' << *I; |
85 | else |
86 | dbgs() << '.'; |
87 | dbgs() << '\n'; |
88 | } |
89 | #endif |
90 | |
91 | static OptimizationRemarkAnalysis |
92 | createHWLoopAnalysis(StringRef , Loop *L, Instruction *I) { |
93 | BasicBlock *CodeRegion = L->getHeader(); |
94 | DebugLoc DL = L->getStartLoc(); |
95 | |
96 | if (I) { |
97 | CodeRegion = I->getParent(); |
98 | // If there is no debug location attached to the instruction, revert back to |
99 | // using the loop's. |
100 | if (I->getDebugLoc()) |
101 | DL = I->getDebugLoc(); |
102 | } |
103 | |
104 | OptimizationRemarkAnalysis R(DEBUG_TYPE, RemarkName, DL, CodeRegion); |
105 | R << "hardware-loop not created: " ; |
106 | return R; |
107 | } |
108 | |
109 | namespace { |
110 | |
111 | void (const StringRef Msg, const StringRef ORETag, |
112 | OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I = nullptr) { |
113 | LLVM_DEBUG(debugHWLoopFailure(Msg, I)); |
114 | ORE->emit(OptDiag: createHWLoopAnalysis(RemarkName: ORETag, L: TheLoop, I) << Msg); |
115 | } |
116 | |
117 | using TTI = TargetTransformInfo; |
118 | |
119 | class HardwareLoopsLegacy : public FunctionPass { |
120 | public: |
121 | static char ID; |
122 | |
123 | HardwareLoopsLegacy() : FunctionPass(ID) { |
124 | initializeHardwareLoopsLegacyPass(*PassRegistry::getPassRegistry()); |
125 | } |
126 | |
127 | bool runOnFunction(Function &F) override; |
128 | |
129 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
130 | AU.addRequired<LoopInfoWrapperPass>(); |
131 | AU.addPreserved<LoopInfoWrapperPass>(); |
132 | AU.addRequired<DominatorTreeWrapperPass>(); |
133 | AU.addPreserved<DominatorTreeWrapperPass>(); |
134 | AU.addRequired<ScalarEvolutionWrapperPass>(); |
135 | AU.addPreserved<ScalarEvolutionWrapperPass>(); |
136 | AU.addRequired<AssumptionCacheTracker>(); |
137 | AU.addRequired<TargetTransformInfoWrapperPass>(); |
138 | AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
139 | AU.addPreserved<BranchProbabilityInfoWrapperPass>(); |
140 | } |
141 | }; |
142 | |
143 | class HardwareLoopsImpl { |
144 | public: |
145 | HardwareLoopsImpl(ScalarEvolution &SE, LoopInfo &LI, bool PreserveLCSSA, |
146 | DominatorTree &DT, const DataLayout &DL, |
147 | const TargetTransformInfo &TTI, TargetLibraryInfo *TLI, |
148 | AssumptionCache &AC, OptimizationRemarkEmitter *ORE, |
149 | HardwareLoopOptions &Opts) |
150 | : SE(SE), LI(LI), PreserveLCSSA(PreserveLCSSA), DT(DT), DL(DL), TTI(TTI), |
151 | TLI(TLI), AC(AC), ORE(ORE), Opts(Opts) { } |
152 | |
153 | bool run(Function &F); |
154 | |
155 | private: |
156 | // Try to convert the given Loop into a hardware loop. |
157 | bool TryConvertLoop(Loop *L, LLVMContext &Ctx); |
158 | |
159 | // Given that the target believes the loop to be profitable, try to |
160 | // convert it. |
161 | bool TryConvertLoop(HardwareLoopInfo &HWLoopInfo); |
162 | |
163 | ScalarEvolution &SE; |
164 | LoopInfo &LI; |
165 | bool PreserveLCSSA; |
166 | DominatorTree &DT; |
167 | const DataLayout &DL; |
168 | const TargetTransformInfo &TTI; |
169 | TargetLibraryInfo *TLI = nullptr; |
170 | AssumptionCache &AC; |
171 | OptimizationRemarkEmitter *ORE; |
172 | HardwareLoopOptions &Opts; |
173 | bool MadeChange = false; |
174 | }; |
175 | |
176 | class HardwareLoop { |
177 | // Expand the trip count scev into a value that we can use. |
178 | Value *InitLoopCount(); |
179 | |
180 | // Insert the set_loop_iteration intrinsic. |
181 | Value *InsertIterationSetup(Value *LoopCountInit); |
182 | |
183 | // Insert the loop_decrement intrinsic. |
184 | void InsertLoopDec(); |
185 | |
186 | // Insert the loop_decrement_reg intrinsic. |
187 | Instruction *InsertLoopRegDec(Value *EltsRem); |
188 | |
189 | // If the target requires the counter value to be updated in the loop, |
190 | // insert a phi to hold the value. The intended purpose is for use by |
191 | // loop_decrement_reg. |
192 | PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem); |
193 | |
194 | // Create a new cmp, that checks the returned value of loop_decrement*, |
195 | // and update the exit branch to use it. |
196 | void UpdateBranch(Value *EltsRem); |
197 | |
198 | public: |
199 | (HardwareLoopInfo &Info, ScalarEvolution &SE, |
200 | const DataLayout &DL, |
201 | OptimizationRemarkEmitter *ORE, |
202 | HardwareLoopOptions &Opts) : |
203 | SE(SE), DL(DL), ORE(ORE), Opts(Opts), L(Info.L), M(L->getHeader()->getModule()), |
204 | ExitCount(Info.ExitCount), |
205 | CountType(Info.CountType), |
206 | ExitBranch(Info.ExitBranch), |
207 | LoopDecrement(Info.LoopDecrement), |
208 | UsePHICounter(Info.CounterInReg), |
209 | UseLoopGuard(Info.PerformEntryTest) { } |
210 | |
211 | void Create(); |
212 | |
213 | private: |
214 | ScalarEvolution &SE; |
215 | const DataLayout &DL; |
216 | OptimizationRemarkEmitter *ORE = nullptr; |
217 | HardwareLoopOptions &Opts; |
218 | Loop *L = nullptr; |
219 | Module *M = nullptr; |
220 | const SCEV *ExitCount = nullptr; |
221 | Type *CountType = nullptr; |
222 | BranchInst *ExitBranch = nullptr; |
223 | Value *LoopDecrement = nullptr; |
224 | bool UsePHICounter = false; |
225 | bool UseLoopGuard = false; |
226 | BasicBlock *BeginBB = nullptr; |
227 | }; |
228 | } |
229 | |
230 | char HardwareLoopsLegacy::ID = 0; |
231 | |
232 | bool HardwareLoopsLegacy::runOnFunction(Function &F) { |
233 | if (skipFunction(F)) |
234 | return false; |
235 | |
236 | LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n" ); |
237 | |
238 | auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
239 | auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); |
240 | auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
241 | auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
242 | auto &DL = F.getDataLayout(); |
243 | auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
244 | auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>(); |
245 | auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr; |
246 | auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); |
247 | bool PreserveLCSSA = mustPreserveAnalysisID(AID&: LCSSAID); |
248 | |
249 | HardwareLoopOptions Opts; |
250 | if (ForceHardwareLoops.getNumOccurrences()) |
251 | Opts.setForce(ForceHardwareLoops); |
252 | if (ForceHardwareLoopPHI.getNumOccurrences()) |
253 | Opts.setForcePhi(ForceHardwareLoopPHI); |
254 | if (ForceNestedLoop.getNumOccurrences()) |
255 | Opts.setForceNested(ForceNestedLoop); |
256 | if (ForceGuardLoopEntry.getNumOccurrences()) |
257 | Opts.setForceGuard(ForceGuardLoopEntry); |
258 | if (LoopDecrement.getNumOccurrences()) |
259 | Opts.setDecrement(LoopDecrement); |
260 | if (CounterBitWidth.getNumOccurrences()) |
261 | Opts.setCounterBitwidth(CounterBitWidth); |
262 | |
263 | HardwareLoopsImpl Impl(SE, LI, PreserveLCSSA, DT, DL, TTI, TLI, AC, ORE, |
264 | Opts); |
265 | return Impl.run(F); |
266 | } |
267 | |
268 | PreservedAnalyses HardwareLoopsPass::run(Function &F, |
269 | FunctionAnalysisManager &AM) { |
270 | auto &LI = AM.getResult<LoopAnalysis>(IR&: F); |
271 | auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
272 | auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F); |
273 | auto &TTI = AM.getResult<TargetIRAnalysis>(IR&: F); |
274 | auto *TLI = &AM.getResult<TargetLibraryAnalysis>(IR&: F); |
275 | auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F); |
276 | auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
277 | auto &DL = F.getDataLayout(); |
278 | |
279 | HardwareLoopsImpl Impl(SE, LI, true, DT, DL, TTI, TLI, AC, ORE, Opts); |
280 | bool Changed = Impl.run(F); |
281 | if (!Changed) |
282 | return PreservedAnalyses::all(); |
283 | |
284 | PreservedAnalyses PA; |
285 | PA.preserve<LoopAnalysis>(); |
286 | PA.preserve<ScalarEvolutionAnalysis>(); |
287 | PA.preserve<DominatorTreeAnalysis>(); |
288 | PA.preserve<BranchProbabilityAnalysis>(); |
289 | return PA; |
290 | } |
291 | |
292 | bool HardwareLoopsImpl::run(Function &F) { |
293 | LLVMContext &Ctx = F.getContext(); |
294 | for (Loop *L : LI) |
295 | if (L->isOutermost()) |
296 | TryConvertLoop(L, Ctx); |
297 | return MadeChange; |
298 | } |
299 | |
300 | // Return true if the search should stop, which will be when an inner loop is |
301 | // converted and the parent loop doesn't support containing a hardware loop. |
302 | bool HardwareLoopsImpl::TryConvertLoop(Loop *L, LLVMContext &Ctx) { |
303 | // Process nested loops first. |
304 | bool AnyChanged = false; |
305 | for (Loop *SL : *L) |
306 | AnyChanged |= TryConvertLoop(L: SL, Ctx); |
307 | if (AnyChanged) { |
308 | reportHWLoopFailure(Msg: "nested hardware-loops not supported" , ORETag: "HWLoopNested" , |
309 | ORE, TheLoop: L); |
310 | return true; // Stop search. |
311 | } |
312 | |
313 | LLVM_DEBUG(dbgs() << "HWLoops: Loop " << L->getHeader()->getName() << "\n" ); |
314 | |
315 | HardwareLoopInfo HWLoopInfo(L); |
316 | if (!HWLoopInfo.canAnalyze(LI)) { |
317 | reportHWLoopFailure(Msg: "cannot analyze loop, irreducible control flow" , |
318 | ORETag: "HWLoopCannotAnalyze" , ORE, TheLoop: L); |
319 | return false; |
320 | } |
321 | |
322 | if (!Opts.Force && |
323 | !TTI.isHardwareLoopProfitable(L, SE, AC, LibInfo: TLI, HWLoopInfo)) { |
324 | reportHWLoopFailure(Msg: "it's not profitable to create a hardware-loop" , |
325 | ORETag: "HWLoopNotProfitable" , ORE, TheLoop: L); |
326 | return false; |
327 | } |
328 | |
329 | // Allow overriding of the counter width and loop decrement value. |
330 | if (Opts.Bitwidth.has_value()) { |
331 | HWLoopInfo.CountType = IntegerType::get(C&: Ctx, NumBits: Opts.Bitwidth.value()); |
332 | } |
333 | |
334 | if (Opts.Decrement.has_value()) |
335 | HWLoopInfo.LoopDecrement = |
336 | ConstantInt::get(Ty: HWLoopInfo.CountType, V: Opts.Decrement.value()); |
337 | |
338 | MadeChange |= TryConvertLoop(HWLoopInfo); |
339 | return MadeChange && (!HWLoopInfo.IsNestingLegal && !Opts.ForceNested); |
340 | } |
341 | |
342 | bool HardwareLoopsImpl::TryConvertLoop(HardwareLoopInfo &HWLoopInfo) { |
343 | |
344 | Loop *L = HWLoopInfo.L; |
345 | LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L); |
346 | |
347 | if (!HWLoopInfo.isHardwareLoopCandidate(SE, LI, DT, ForceNestedLoop: Opts.getForceNested(), |
348 | ForceHardwareLoopPHI: Opts.getForcePhi())) { |
349 | // TODO: there can be many reasons a loop is not considered a |
350 | // candidate, so we should let isHardwareLoopCandidate fill in the |
351 | // reason and then report a better message here. |
352 | reportHWLoopFailure(Msg: "loop is not a candidate" , ORETag: "HWLoopNoCandidate" , ORE, TheLoop: L); |
353 | return false; |
354 | } |
355 | |
356 | assert( |
357 | (HWLoopInfo.ExitBlock && HWLoopInfo.ExitBranch && HWLoopInfo.ExitCount) && |
358 | "Hardware Loop must have set exit info." ); |
359 | |
360 | BasicBlock * = L->getLoopPreheader(); |
361 | |
362 | // If we don't have a preheader, then insert one. |
363 | if (!Preheader) |
364 | Preheader = InsertPreheaderForLoop(L, DT: &DT, LI: &LI, MSSAU: nullptr, PreserveLCSSA); |
365 | if (!Preheader) |
366 | return false; |
367 | |
368 | HardwareLoop HWLoop(HWLoopInfo, SE, DL, ORE, Opts); |
369 | HWLoop.Create(); |
370 | ++NumHWLoops; |
371 | return true; |
372 | } |
373 | |
374 | void HardwareLoop::Create() { |
375 | LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n" ); |
376 | |
377 | Value *LoopCountInit = InitLoopCount(); |
378 | if (!LoopCountInit) { |
379 | reportHWLoopFailure(Msg: "could not safely create a loop count expression" , |
380 | ORETag: "HWLoopNotSafe" , ORE, TheLoop: L); |
381 | return; |
382 | } |
383 | |
384 | Value *Setup = InsertIterationSetup(LoopCountInit); |
385 | |
386 | if (UsePHICounter || Opts.ForcePhi) { |
387 | Instruction *LoopDec = InsertLoopRegDec(EltsRem: LoopCountInit); |
388 | Value *EltsRem = InsertPHICounter(NumElts: Setup, EltsRem: LoopDec); |
389 | LoopDec->setOperand(i: 0, Val: EltsRem); |
390 | UpdateBranch(EltsRem: LoopDec); |
391 | } else |
392 | InsertLoopDec(); |
393 | |
394 | // Run through the basic blocks of the loop and see if any of them have dead |
395 | // PHIs that can be removed. |
396 | for (auto *I : L->blocks()) |
397 | DeleteDeadPHIs(BB: I); |
398 | } |
399 | |
400 | static bool CanGenerateTest(Loop *L, Value *Count) { |
401 | BasicBlock * = L->getLoopPreheader(); |
402 | if (!Preheader->getSinglePredecessor()) |
403 | return false; |
404 | |
405 | BasicBlock *Pred = Preheader->getSinglePredecessor(); |
406 | if (!isa<BranchInst>(Val: Pred->getTerminator())) |
407 | return false; |
408 | |
409 | auto *BI = cast<BranchInst>(Val: Pred->getTerminator()); |
410 | if (BI->isUnconditional() || !isa<ICmpInst>(Val: BI->getCondition())) |
411 | return false; |
412 | |
413 | // Check that the icmp is checking for equality of Count and zero and that |
414 | // a non-zero value results in entering the loop. |
415 | auto ICmp = cast<ICmpInst>(Val: BI->getCondition()); |
416 | LLVM_DEBUG(dbgs() << " - Found condition: " << *ICmp << "\n" ); |
417 | if (!ICmp->isEquality()) |
418 | return false; |
419 | |
420 | auto IsCompareZero = [](ICmpInst *ICmp, Value *Count, unsigned OpIdx) { |
421 | if (auto *Const = dyn_cast<ConstantInt>(Val: ICmp->getOperand(i_nocapture: OpIdx))) |
422 | return Const->isZero() && ICmp->getOperand(i_nocapture: OpIdx ^ 1) == Count; |
423 | return false; |
424 | }; |
425 | |
426 | // Check if Count is a zext. |
427 | Value *CountBefZext = |
428 | isa<ZExtInst>(Val: Count) ? cast<ZExtInst>(Val: Count)->getOperand(i_nocapture: 0) : nullptr; |
429 | |
430 | if (!IsCompareZero(ICmp, Count, 0) && !IsCompareZero(ICmp, Count, 1) && |
431 | !IsCompareZero(ICmp, CountBefZext, 0) && |
432 | !IsCompareZero(ICmp, CountBefZext, 1)) |
433 | return false; |
434 | |
435 | unsigned SuccIdx = ICmp->getPredicate() == ICmpInst::ICMP_NE ? 0 : 1; |
436 | if (BI->getSuccessor(i: SuccIdx) != Preheader) |
437 | return false; |
438 | |
439 | return true; |
440 | } |
441 | |
442 | Value *HardwareLoop::InitLoopCount() { |
443 | LLVM_DEBUG(dbgs() << "HWLoops: Initialising loop counter value:\n" ); |
444 | // Can we replace a conditional branch with an intrinsic that sets the |
445 | // loop counter and tests that is not zero? |
446 | |
447 | SCEVExpander SCEVE(SE, DL, "loopcnt" ); |
448 | if (!ExitCount->getType()->isPointerTy() && |
449 | ExitCount->getType() != CountType) |
450 | ExitCount = SE.getZeroExtendExpr(Op: ExitCount, Ty: CountType); |
451 | |
452 | ExitCount = SE.getAddExpr(LHS: ExitCount, RHS: SE.getOne(Ty: CountType)); |
453 | |
454 | // If we're trying to use the 'test and set' form of the intrinsic, we need |
455 | // to replace a conditional branch that is controlling entry to the loop. It |
456 | // is likely (guaranteed?) that the preheader has an unconditional branch to |
457 | // the loop header, so also check if it has a single predecessor. |
458 | if (SE.isLoopEntryGuardedByCond(L, Pred: ICmpInst::ICMP_NE, LHS: ExitCount, |
459 | RHS: SE.getZero(Ty: ExitCount->getType()))) { |
460 | LLVM_DEBUG(dbgs() << " - Attempting to use test.set counter.\n" ); |
461 | if (Opts.ForceGuard) |
462 | UseLoopGuard = true; |
463 | } else |
464 | UseLoopGuard = false; |
465 | |
466 | BasicBlock *BB = L->getLoopPreheader(); |
467 | if (UseLoopGuard && BB->getSinglePredecessor() && |
468 | cast<BranchInst>(Val: BB->getTerminator())->isUnconditional()) { |
469 | BasicBlock *Predecessor = BB->getSinglePredecessor(); |
470 | // If it's not safe to create a while loop then don't force it and create a |
471 | // do-while loop instead |
472 | if (!SCEVE.isSafeToExpandAt(S: ExitCount, InsertionPoint: Predecessor->getTerminator())) |
473 | UseLoopGuard = false; |
474 | else |
475 | BB = Predecessor; |
476 | } |
477 | |
478 | if (!SCEVE.isSafeToExpandAt(S: ExitCount, InsertionPoint: BB->getTerminator())) { |
479 | LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount " |
480 | << *ExitCount << "\n" ); |
481 | return nullptr; |
482 | } |
483 | |
484 | Value *Count = SCEVE.expandCodeFor(SH: ExitCount, Ty: CountType, |
485 | I: BB->getTerminator()); |
486 | |
487 | // FIXME: We've expanded Count where we hope to insert the counter setting |
488 | // intrinsic. But, in the case of the 'test and set' form, we may fallback to |
489 | // the just 'set' form and in which case the insertion block is most likely |
490 | // different. It means there will be instruction(s) in a block that possibly |
491 | // aren't needed. The isLoopEntryGuardedByCond is trying to avoid this issue, |
492 | // but it's doesn't appear to work in all cases. |
493 | |
494 | UseLoopGuard = UseLoopGuard && CanGenerateTest(L, Count); |
495 | BeginBB = UseLoopGuard ? BB : L->getLoopPreheader(); |
496 | LLVM_DEBUG(dbgs() << " - Loop Count: " << *Count << "\n" |
497 | << " - Expanded Count in " << BB->getName() << "\n" |
498 | << " - Will insert set counter intrinsic into: " |
499 | << BeginBB->getName() << "\n" ); |
500 | return Count; |
501 | } |
502 | |
503 | Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) { |
504 | IRBuilder<> Builder(BeginBB->getTerminator()); |
505 | if (BeginBB->getParent()->getAttributes().hasFnAttr(Kind: Attribute::StrictFP)) |
506 | Builder.setIsFPConstrained(true); |
507 | Type *Ty = LoopCountInit->getType(); |
508 | bool UsePhi = UsePHICounter || Opts.ForcePhi; |
509 | Intrinsic::ID ID = UseLoopGuard |
510 | ? (UsePhi ? Intrinsic::test_start_loop_iterations |
511 | : Intrinsic::test_set_loop_iterations) |
512 | : (UsePhi ? Intrinsic::start_loop_iterations |
513 | : Intrinsic::set_loop_iterations); |
514 | Value *LoopSetup = Builder.CreateIntrinsic(ID, Types: Ty, Args: LoopCountInit); |
515 | |
516 | // Use the return value of the intrinsic to control the entry of the loop. |
517 | if (UseLoopGuard) { |
518 | assert((isa<BranchInst>(BeginBB->getTerminator()) && |
519 | cast<BranchInst>(BeginBB->getTerminator())->isConditional()) && |
520 | "Expected conditional branch" ); |
521 | |
522 | Value *SetCount = |
523 | UsePhi ? Builder.CreateExtractValue(Agg: LoopSetup, Idxs: 1) : LoopSetup; |
524 | auto *LoopGuard = cast<BranchInst>(Val: BeginBB->getTerminator()); |
525 | LoopGuard->setCondition(SetCount); |
526 | if (LoopGuard->getSuccessor(i: 0) != L->getLoopPreheader()) |
527 | LoopGuard->swapSuccessors(); |
528 | } |
529 | LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop counter: " << *LoopSetup |
530 | << "\n" ); |
531 | if (UsePhi && UseLoopGuard) |
532 | LoopSetup = Builder.CreateExtractValue(Agg: LoopSetup, Idxs: 0); |
533 | return !UsePhi ? LoopCountInit : LoopSetup; |
534 | } |
535 | |
536 | void HardwareLoop::InsertLoopDec() { |
537 | IRBuilder<> CondBuilder(ExitBranch); |
538 | if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr( |
539 | Kind: Attribute::StrictFP)) |
540 | CondBuilder.setIsFPConstrained(true); |
541 | |
542 | Value *Ops[] = { LoopDecrement }; |
543 | Value *NewCond = CondBuilder.CreateIntrinsic(ID: Intrinsic::loop_decrement, |
544 | Types: LoopDecrement->getType(), Args: Ops); |
545 | Value *OldCond = ExitBranch->getCondition(); |
546 | ExitBranch->setCondition(NewCond); |
547 | |
548 | // The false branch must exit the loop. |
549 | if (!L->contains(BB: ExitBranch->getSuccessor(i: 0))) |
550 | ExitBranch->swapSuccessors(); |
551 | |
552 | // The old condition may be dead now, and may have even created a dead PHI |
553 | // (the original induction variable). |
554 | RecursivelyDeleteTriviallyDeadInstructions(V: OldCond); |
555 | |
556 | LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n" ); |
557 | } |
558 | |
559 | Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) { |
560 | IRBuilder<> CondBuilder(ExitBranch); |
561 | if (ExitBranch->getParent()->getParent()->getAttributes().hasFnAttr( |
562 | Kind: Attribute::StrictFP)) |
563 | CondBuilder.setIsFPConstrained(true); |
564 | |
565 | Value *Ops[] = { EltsRem, LoopDecrement }; |
566 | Value *Call = CondBuilder.CreateIntrinsic(ID: Intrinsic::loop_decrement_reg, |
567 | Types: {EltsRem->getType()}, Args: Ops); |
568 | |
569 | LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n" ); |
570 | return cast<Instruction>(Val: Call); |
571 | } |
572 | |
573 | PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) { |
574 | BasicBlock * = L->getLoopPreheader(); |
575 | BasicBlock * = L->getHeader(); |
576 | BasicBlock *Latch = ExitBranch->getParent(); |
577 | IRBuilder<> Builder(Header, Header->getFirstNonPHIIt()); |
578 | PHINode *Index = Builder.CreatePHI(Ty: NumElts->getType(), NumReservedValues: 2); |
579 | Index->addIncoming(V: NumElts, BB: Preheader); |
580 | Index->addIncoming(V: EltsRem, BB: Latch); |
581 | LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Index << "\n" ); |
582 | return Index; |
583 | } |
584 | |
585 | void HardwareLoop::UpdateBranch(Value *EltsRem) { |
586 | IRBuilder<> CondBuilder(ExitBranch); |
587 | Value *NewCond = |
588 | CondBuilder.CreateICmpNE(LHS: EltsRem, RHS: ConstantInt::get(Ty: EltsRem->getType(), V: 0)); |
589 | Value *OldCond = ExitBranch->getCondition(); |
590 | ExitBranch->setCondition(NewCond); |
591 | |
592 | // The false branch must exit the loop. |
593 | if (!L->contains(BB: ExitBranch->getSuccessor(i: 0))) |
594 | ExitBranch->swapSuccessors(); |
595 | |
596 | // The old condition may be dead now, and may have even created a dead PHI |
597 | // (the original induction variable). |
598 | RecursivelyDeleteTriviallyDeadInstructions(V: OldCond); |
599 | } |
600 | |
601 | INITIALIZE_PASS_BEGIN(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false) |
602 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) |
603 | INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) |
604 | INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) |
605 | INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) |
606 | INITIALIZE_PASS_END(HardwareLoopsLegacy, DEBUG_TYPE, HW_LOOPS_NAME, false, false) |
607 | |
608 | FunctionPass *llvm::createHardwareLoopsLegacyPass() { return new HardwareLoopsLegacy(); } |
609 | |