1 | //===--- SelectOptimize.cpp - Convert select to branches if profitable ---===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This pass converts selects to conditional jumps when profitable. |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/CodeGen/SelectOptimize.h" |
14 | #include "llvm/ADT/SmallVector.h" |
15 | #include "llvm/ADT/Statistic.h" |
16 | #include "llvm/Analysis/BlockFrequencyInfo.h" |
17 | #include "llvm/Analysis/BranchProbabilityInfo.h" |
18 | #include "llvm/Analysis/LoopInfo.h" |
19 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
20 | #include "llvm/Analysis/ProfileSummaryInfo.h" |
21 | #include "llvm/Analysis/TargetTransformInfo.h" |
22 | #include "llvm/CodeGen/Passes.h" |
23 | #include "llvm/CodeGen/TargetLowering.h" |
24 | #include "llvm/CodeGen/TargetPassConfig.h" |
25 | #include "llvm/CodeGen/TargetSchedule.h" |
26 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
27 | #include "llvm/IR/BasicBlock.h" |
28 | #include "llvm/IR/Dominators.h" |
29 | #include "llvm/IR/Function.h" |
30 | #include "llvm/IR/IRBuilder.h" |
31 | #include "llvm/IR/Instruction.h" |
32 | #include "llvm/IR/PatternMatch.h" |
33 | #include "llvm/IR/ProfDataUtils.h" |
34 | #include "llvm/InitializePasses.h" |
35 | #include "llvm/Pass.h" |
36 | #include "llvm/Support/ScaledNumber.h" |
37 | #include "llvm/Target/TargetMachine.h" |
38 | #include "llvm/Transforms/Utils/SizeOpts.h" |
39 | #include <algorithm> |
40 | #include <memory> |
41 | #include <queue> |
42 | #include <stack> |
43 | |
44 | using namespace llvm; |
45 | using namespace llvm::PatternMatch; |
46 | |
47 | #define DEBUG_TYPE "select-optimize" |
48 | |
49 | STATISTIC(NumSelectOptAnalyzed, |
50 | "Number of select groups considered for conversion to branch" ); |
51 | STATISTIC(NumSelectConvertedExpColdOperand, |
52 | "Number of select groups converted due to expensive cold operand" ); |
53 | STATISTIC(NumSelectConvertedHighPred, |
54 | "Number of select groups converted due to high-predictability" ); |
55 | STATISTIC(NumSelectUnPred, |
56 | "Number of select groups not converted due to unpredictability" ); |
57 | STATISTIC(NumSelectColdBB, |
58 | "Number of select groups not converted due to cold basic block" ); |
59 | STATISTIC(NumSelectConvertedLoop, |
60 | "Number of select groups converted due to loop-level analysis" ); |
61 | STATISTIC(NumSelectsConverted, "Number of selects converted" ); |
62 | |
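// Note on the threshold below: with the default of 20, a select value operand
// whose path is taken on fewer than 20% of executions (per branch-weight
// metadata) is treated as cold (see hasExpensiveColdOperand).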
63 | static cl::opt<unsigned> ColdOperandThreshold( |
64 | "cold-operand-threshold" , |
65 | cl::desc("Maximum frequency of path for an operand to be considered cold." ), |
66 | cl::init(Val: 20), cl::Hidden); |
67 | |
68 | static cl::opt<unsigned> ColdOperandMaxCostMultiplier( |
69 | "cold-operand-max-cost-multiplier" , |
70 | cl::desc("Maximum cost multiplier of TCC_expensive for the dependence " |
71 | "slice of a cold operand to be considered inexpensive." ), |
72 | cl::init(Val: 1), cl::Hidden); |
73 | |
74 | static cl::opt<unsigned> |
75 | GainGradientThreshold("select-opti-loop-gradient-gain-threshold" , |
76 | cl::desc("Gradient gain threshold (%)." ), |
77 | cl::init(Val: 25), cl::Hidden); |
78 | |
79 | static cl::opt<unsigned> |
80 | GainCycleThreshold("select-opti-loop-cycle-gain-threshold" , |
81 | cl::desc("Minimum gain per loop (in cycles) threshold." ), |
82 | cl::init(Val: 4), cl::Hidden); |
83 | |
84 | static cl::opt<unsigned> GainRelativeThreshold( |
85 | "select-opti-loop-relative-gain-threshold" , |
86 | cl::desc( |
87 | "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%" ), |
88 | cl::init(Val: 8), cl::Hidden); |
89 | |
90 | static cl::opt<unsigned> MispredictDefaultRate( |
91 | "mispredict-default-rate" , cl::Hidden, cl::init(Val: 25), |
92 | cl::desc("Default mispredict rate (initialized to 25%)." )); |
93 | |
94 | static cl::opt<bool> |
95 | DisableLoopLevelHeuristics("disable-loop-level-heuristics" , cl::Hidden, |
96 | cl::init(Val: false), |
97 | cl::desc("Disable loop-level heuristics." )); |
98 | |
99 | namespace { |
100 | |
101 | class SelectOptimizeImpl { |
102 | const TargetMachine *TM = nullptr; |
103 | const TargetSubtargetInfo *TSI = nullptr; |
104 | const TargetLowering *TLI = nullptr; |
105 | const TargetTransformInfo *TTI = nullptr; |
106 | const LoopInfo *LI = nullptr; |
107 | BlockFrequencyInfo *BFI; |
108 | ProfileSummaryInfo *PSI = nullptr; |
109 | OptimizationRemarkEmitter *ORE = nullptr; |
110 | TargetSchedModel TSchedModel; |
111 | |
112 | public: |
113 | SelectOptimizeImpl() = default; |
114 | SelectOptimizeImpl(const TargetMachine *TM) : TM(TM){}; |
115 | PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); |
116 | bool runOnFunction(Function &F, Pass &P); |
117 | |
118 | using Scaled64 = ScaledNumber<uint64_t>; |
119 | |
120 | struct CostInfo { |
121 | /// Predicated cost (with selects as conditional moves). |
122 | Scaled64 PredCost; |
123 | /// Non-predicated cost (with selects converted to branches). |
124 | Scaled64 NonPredCost; |
125 | }; |
126 | |
127 | /// SelectLike is an abstraction over SelectInst and other operations that can |
128 | /// act like selects. For example Or(Zext(icmp), X) can be treated like |
129 | /// select(icmp, X|1, X). |
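/// For example (illustrative IR):
///   %c = icmp eq i32 %a, 0
///   %z = zext i1 %c to i32
///   %o = or i32 %z, %x
/// behaves like "%o = select i1 %c, i32 (%x|1), i32 %x" and can be handled by
/// the same branch-conversion machinery as a plain select.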
130 | class SelectLike { |
131 | SelectLike(Instruction *I) : I(I) {} |
132 | |
133 | /// The select (/or) instruction. |
134 | Instruction *I; |
135 | /// Whether this select is inverted, "not(cond), FalseVal, TrueVal", as |
136 | /// opposed to the original condition. |
137 | bool Inverted = false; |
138 | |
139 | public: |
140 | /// Match a select or select-like instruction, returning a SelectLike. |
141 | static SelectLike match(Instruction *I) { |
142 | // Select instructions are what we are usually looking for. |
143 | if (isa<SelectInst>(Val: I)) |
144 | return SelectLike(I); |
145 | |
146 | // An Or(zext(i1 X), Y) can also be treated like a select, with condition |
147 | // X and values Y|1 and Y. |
148 | Value *X; |
149 | if (PatternMatch::match( |
150 | V: I, P: m_c_Or(L: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: X))), R: m_Value())) && |
151 | X->getType()->isIntegerTy(Bitwidth: 1)) |
152 | return SelectLike(I); |
153 | |
154 | return SelectLike(nullptr); |
155 | } |
156 | |
157 | bool isValid() { return I; } |
158 | operator bool() { return isValid(); } |
159 | |
160 | /// Invert the select by inverting the condition and switching the operands. |
161 | void setInverted() { |
162 | assert(!Inverted && "Trying to invert an inverted SelectLike" ); |
163 | assert(isa<Instruction>(getCondition()) && |
164 | cast<Instruction>(getCondition())->getOpcode() == |
165 | Instruction::Xor); |
166 | Inverted = true; |
167 | } |
168 | bool isInverted() const { return Inverted; } |
169 | |
170 | Instruction *getI() { return I; } |
171 | const Instruction *getI() const { return I; } |
172 | |
173 | Type *getType() const { return I->getType(); } |
174 | |
175 | Value *getNonInvertedCondition() const { |
176 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
177 | return Sel->getCondition(); |
178 | // Or(zext) case |
179 | if (auto *BO = dyn_cast<BinaryOperator>(Val: I)) { |
180 | Value *X; |
181 | if (PatternMatch::match(V: BO->getOperand(i_nocapture: 0), |
182 | P: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: X))))) |
183 | return X; |
184 | if (PatternMatch::match(V: BO->getOperand(i_nocapture: 1), |
185 | P: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: X))))) |
186 | return X; |
187 | } |
188 | |
189 | llvm_unreachable("Unhandled case in getCondition" ); |
190 | } |
191 | |
192 | /// Return the condition for the SelectLike instruction. For example the |
193 | /// condition of a select or c in `or(zext(c), x)` |
194 | Value *getCondition() const { |
195 | Value *CC = getNonInvertedCondition(); |
196 | // For inverted conditions the CC is checked when created to be a not |
197 | // (xor) instruction. |
198 | if (Inverted) |
199 | return cast<Instruction>(Val: CC)->getOperand(i: 0); |
200 | return CC; |
201 | } |
202 | |
203 | /// Return the true value for the SelectLike instruction. Note this may not |
204 | /// exist for all SelectLike instructions. For example, for `or(zext(c), x)` |
205 | /// the true value would be `or(x,1)`. As this value does not exist, nullptr |
206 | /// is returned. |
207 | Value *getTrueValue(bool HonorInverts = true) const { |
208 | if (Inverted && HonorInverts) |
209 | return getFalseValue(/*HonorInverts=*/false); |
210 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
211 | return Sel->getTrueValue(); |
212 | // Or(zext) case - The true value would be or(x,1), which does not yet |
213 | // exist as an instruction, so return nullptr. |
214 | if (isa<BinaryOperator>(Val: I)) |
215 | return nullptr; |
216 | |
217 | llvm_unreachable("Unhandled case in getTrueValue" ); |
218 | } |
219 | |
220 | /// Return the false value for the SelectLike instruction. For example the |
221 | /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is |
222 | /// `select(c, x|1, x)`) |
223 | Value *getFalseValue(bool HonorInverts = true) const { |
224 | if (Inverted && HonorInverts) |
225 | return getTrueValue(/*HonorInverts=*/false); |
226 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
227 | return Sel->getFalseValue(); |
228 | // Or(zext) case - return the operand which is not the zext. |
229 | if (auto *BO = dyn_cast<BinaryOperator>(Val: I)) { |
230 | Value *X; |
231 | if (PatternMatch::match(V: BO->getOperand(i_nocapture: 0), |
232 | P: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: X))))) |
233 | return BO->getOperand(i_nocapture: 1); |
234 | if (PatternMatch::match(V: BO->getOperand(i_nocapture: 1), |
235 | P: m_OneUse(SubPattern: m_ZExt(Op: m_Value(V&: X))))) |
236 | return BO->getOperand(i_nocapture: 0); |
237 | } |
238 | |
239 | llvm_unreachable("Unhandled case in getFalseValue" ); |
240 | } |
241 | |
242 | /// Return the NonPredCost cost of the true op, given the costs in |
243 | /// InstCostMap. This may need to be generated for select-like instructions. |
244 | Scaled64 getTrueOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap, |
245 | const TargetTransformInfo *TTI) { |
246 | if (isa<SelectInst>(Val: I)) |
247 | if (auto *I = dyn_cast<Instruction>(Val: getTrueValue())) |
248 | return InstCostMap.contains(Val: I) ? InstCostMap[I].NonPredCost |
249 | : Scaled64::getZero(); |
250 | |
251 | // Or case - add the cost of an extra Or to the cost of the False case. |
252 | if (isa<BinaryOperator>(Val: I)) |
253 | if (auto I = dyn_cast<Instruction>(Val: getFalseValue())) |
254 | if (InstCostMap.contains(Val: I)) { |
255 | InstructionCost OrCost = TTI->getArithmeticInstrCost( |
256 | Opcode: Instruction::Or, Ty: I->getType(), CostKind: TargetTransformInfo::TCK_Latency, |
257 | Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, |
258 | .Properties: TargetTransformInfo::OP_None}, |
259 | Opd2Info: {.Kind: TTI::OK_UniformConstantValue, .Properties: TTI::OP_PowerOf2}); |
260 | return InstCostMap[I].NonPredCost + |
261 | Scaled64::get(N: *OrCost.getValue()); |
262 | } |
263 | |
264 | return Scaled64::getZero(); |
265 | } |
266 | |
267 | /// Return the NonPredCost cost of the false op, given the costs in |
268 | /// InstCostMap. This may need to be generated for select-like instructions. |
269 | Scaled64 |
270 | getFalseOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap, |
271 | const TargetTransformInfo *TTI) { |
272 | if (isa<SelectInst>(Val: I)) |
273 | if (auto *I = dyn_cast<Instruction>(Val: getFalseValue())) |
274 | return InstCostMap.contains(Val: I) ? InstCostMap[I].NonPredCost |
275 | : Scaled64::getZero(); |
276 | |
277 | // Or case - return the cost of the false case |
278 | if (isa<BinaryOperator>(Val: I)) |
279 | if (auto I = dyn_cast<Instruction>(Val: getFalseValue())) |
280 | if (InstCostMap.contains(Val: I)) |
281 | return InstCostMap[I].NonPredCost; |
282 | |
283 | return Scaled64::getZero(); |
284 | } |
285 | }; |
286 | |
287 | private: |
288 | // Select groups consist of consecutive select instructions with the same |
289 | // condition. |
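// For example (illustrative), the two selects below share a condition and
// therefore form a single group:
//   %s1 = select i1 %cond, i32 %a, i32 %b
//   %s2 = select i1 %cond, i32 %c, i32 %d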
290 | using SelectGroup = SmallVector<SelectLike, 2>; |
291 | using SelectGroups = SmallVector<SelectGroup, 2>; |
292 | |
293 | // Converts select instructions of a function to conditional jumps when deemed |
294 | // profitable. Returns true if at least one select was converted. |
295 | bool optimizeSelects(Function &F); |
296 | |
297 | // Heuristics for determining which select instructions can be profitably |
298 | // converted to branches. Separate heuristics for selects in inner-most loops |
299 | // and the rest of code regions (base heuristics for non-inner-most loop |
300 | // regions). |
301 | void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups); |
302 | void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups); |
303 | |
304 | // Converts to branches the select groups that were deemed |
305 | // profitable-to-convert. |
306 | void convertProfitableSIGroups(SelectGroups &ProfSIGroups); |
307 | |
308 | // Splits selects of a given basic block into select groups. |
309 | void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups); |
310 | |
311 | // Determines for which select groups it is profitable to convert to branches |
312 | // (base and inner-most-loop heuristics). |
313 | void findProfitableSIGroupsBase(SelectGroups &SIGroups, |
314 | SelectGroups &ProfSIGroups); |
315 | void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups, |
316 | SelectGroups &ProfSIGroups); |
317 | |
318 | // Determines if a select group should be converted to a branch (base |
319 | // heuristics). |
320 | bool isConvertToBranchProfitableBase(const SelectGroup &ASI); |
321 | |
322 | // Returns true if there are expensive instructions in the cold value |
323 | // operand's (if any) dependence slice of any of the selects of the given |
324 | // group. |
325 | bool hasExpensiveColdOperand(const SelectGroup &ASI); |
326 | |
327 | // For a given source instruction, collect its backwards dependence slice |
328 | // consisting of instructions exclusively computed for producing the operands |
329 | // of the source instruction. |
330 | void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice, |
331 | Instruction *SI, bool ForSinking = false); |
332 | |
333 | // Returns true if the condition of the select is highly predictable. |
334 | bool isSelectHighlyPredictable(const SelectLike SI); |
335 | |
336 | // Loop-level checks to determine if a non-predicated version (with branches) |
337 | // of the given loop is more profitable than its predicated version. |
338 | bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]); |
339 | |
340 | // Computes instruction and loop-critical-path costs for both the predicated |
341 | // and non-predicated version of the given loop. |
342 | bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups, |
343 | DenseMap<const Instruction *, CostInfo> &InstCostMap, |
344 | CostInfo *LoopCost); |
345 | |
346 | // Returns a set of all the select instructions in the given select groups. |
347 | SmallDenseMap<const Instruction *, SelectLike, 2> |
348 | getSImap(const SelectGroups &SIGroups); |
349 | |
350 | // Returns the latency cost of a given instruction. |
351 | std::optional<uint64_t> computeInstCost(const Instruction *I); |
352 | |
353 | // Returns the misprediction cost of a given select when converted to branch. |
354 | Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost); |
355 | |
356 | // Returns the cost of a branch when the prediction is correct. |
357 | Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, |
358 | const SelectLike SI); |
359 | |
360 | // Returns true if the target architecture supports lowering a given select. |
361 | bool isSelectKindSupported(const SelectLike SI); |
362 | }; |
363 | |
364 | class SelectOptimize : public FunctionPass { |
365 | SelectOptimizeImpl Impl; |
366 | |
367 | public: |
368 | static char ID; |
369 | |
370 | SelectOptimize() : FunctionPass(ID) { |
371 | initializeSelectOptimizePass(*PassRegistry::getPassRegistry()); |
372 | } |
373 | |
374 | bool runOnFunction(Function &F) override { |
375 | return Impl.runOnFunction(F, P&: *this); |
376 | } |
377 | |
378 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
379 | AU.addRequired<ProfileSummaryInfoWrapperPass>(); |
380 | AU.addRequired<TargetPassConfig>(); |
381 | AU.addRequired<TargetTransformInfoWrapperPass>(); |
382 | AU.addRequired<LoopInfoWrapperPass>(); |
383 | AU.addRequired<BlockFrequencyInfoWrapperPass>(); |
384 | AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); |
385 | } |
386 | }; |
387 | |
388 | } // namespace |
389 | |
390 | PreservedAnalyses SelectOptimizePass::run(Function &F, |
391 | FunctionAnalysisManager &FAM) { |
392 | SelectOptimizeImpl Impl(TM); |
393 | return Impl.run(F, FAM); |
394 | } |
395 | |
396 | char SelectOptimize::ID = 0; |
397 | |
398 | INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects" , false, |
399 | false) |
400 | INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) |
401 | INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) |
402 | INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) |
403 | INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) |
404 | INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) |
405 | INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) |
406 | INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects" , false, |
407 | false) |
408 | |
409 | FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); } |
410 | |
411 | PreservedAnalyses SelectOptimizeImpl::run(Function &F, |
412 | FunctionAnalysisManager &FAM) { |
413 | TSI = TM->getSubtargetImpl(F); |
414 | TLI = TSI->getTargetLowering(); |
415 | |
416 | // If none of the select types are supported then skip this pass. |
417 | // This is an optimization pass. Legality issues will be handled by |
418 | // instruction selection. |
419 | if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) && |
420 | !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) && |
421 | !TLI->isSelectSupported(TargetLowering::VectorMaskSelect)) |
422 | return PreservedAnalyses::all(); |
423 | |
424 | TTI = &FAM.getResult<TargetIRAnalysis>(IR&: F); |
425 | if (!TTI->enableSelectOptimize()) |
426 | return PreservedAnalyses::all(); |
427 | |
428 | PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F) |
429 | .getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent()); |
430 | assert(PSI && "This pass requires module analysis pass `profile-summary`!" ); |
431 | BFI = &FAM.getResult<BlockFrequencyAnalysis>(IR&: F); |
432 | |
433 | // When optimizing for size, selects are preferable to branches. |
434 | if (F.hasOptSize() || llvm::shouldOptimizeForSize(F: &F, PSI, BFI)) |
435 | return PreservedAnalyses::all(); |
436 | |
437 | LI = &FAM.getResult<LoopAnalysis>(IR&: F); |
438 | ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
439 | TSchedModel.init(TSInfo: TSI); |
440 | |
441 | bool Changed = optimizeSelects(F); |
442 | return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); |
443 | } |
444 | |
445 | bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) { |
446 | TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); |
447 | TSI = TM->getSubtargetImpl(F); |
448 | TLI = TSI->getTargetLowering(); |
449 | |
450 | // If none of the select types are supported then skip this pass. |
451 | // This is an optimization pass. Legality issues will be handled by |
452 | // instruction selection. |
453 | if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) && |
454 | !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) && |
455 | !TLI->isSelectSupported(TargetLowering::VectorMaskSelect)) |
456 | return false; |
457 | |
458 | TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); |
459 | |
460 | if (!TTI->enableSelectOptimize()) |
461 | return false; |
462 | |
463 | LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); |
464 | BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(); |
465 | PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); |
466 | ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); |
467 | TSchedModel.init(TSInfo: TSI); |
468 | |
469 | // When optimizing for size, selects are preferable to branches. |
470 | if (F.hasOptSize() || llvm::shouldOptimizeForSize(F: &F, PSI, BFI)) |
471 | return false; |
472 | |
473 | return optimizeSelects(F); |
474 | } |
475 | |
476 | bool SelectOptimizeImpl::optimizeSelects(Function &F) { |
477 | // Determine for which select groups it is profitable to convert to branches. |
478 | SelectGroups ProfSIGroups; |
479 | // Base heuristics apply only to non-loops and outer loops. |
480 | optimizeSelectsBase(F, ProfSIGroups); |
481 | // Separate heuristics for inner-most loops. |
482 | optimizeSelectsInnerLoops(F, ProfSIGroups); |
483 | |
484 | // Convert to branches the select groups that were deemed |
485 | // profitable-to-convert. |
486 | convertProfitableSIGroups(ProfSIGroups); |
487 | |
488 | // Code modified if at least one select group was converted. |
489 | return !ProfSIGroups.empty(); |
490 | } |
491 | |
492 | void SelectOptimizeImpl::optimizeSelectsBase(Function &F, |
493 | SelectGroups &ProfSIGroups) { |
494 | // Collect all the select groups. |
495 | SelectGroups SIGroups; |
496 | for (BasicBlock &BB : F) { |
497 | // Base heuristics apply only to non-loops and outer loops. |
498 | Loop *L = LI->getLoopFor(BB: &BB); |
499 | if (L && L->isInnermost()) |
500 | continue; |
501 | collectSelectGroups(BB, SIGroups); |
502 | } |
503 | |
504 | // Determine for which select groups it is profitable to convert to branches. |
505 | findProfitableSIGroupsBase(SIGroups, ProfSIGroups); |
506 | } |
507 | |
508 | void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F, |
509 | SelectGroups &ProfSIGroups) { |
510 | SmallVector<Loop *, 4> Loops(LI->begin(), LI->end()); |
511 | // Need to check size on each iteration as we accumulate child loops. |
512 | for (unsigned long i = 0; i < Loops.size(); ++i) |
513 | for (Loop *ChildL : Loops[i]->getSubLoops()) |
514 | Loops.push_back(Elt: ChildL); |
515 | |
516 | for (Loop *L : Loops) { |
517 | if (!L->isInnermost()) |
518 | continue; |
519 | |
520 | SelectGroups SIGroups; |
521 | for (BasicBlock *BB : L->getBlocks()) |
522 | collectSelectGroups(BB&: *BB, SIGroups); |
523 | |
524 | findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups); |
525 | } |
526 | } |
527 | |
528 | /// If \p isTrue is true, return the true value of \p SI, otherwise return |
529 | /// false value of \p SI. If the true/false value of \p SI is defined by any |
530 | /// select instructions in \p Selects, look through the defining select |
531 | /// instruction until the true/false value is not defined in \p Selects. |
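/// For example (illustrative), given the group
///   %s1 = select i1 %c, i32 %a, i32 %b
///   %s2 = select i1 %c, i32 %s1, i32 %d
/// the true value of %s2 looks through %s1 and resolves to %a.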
532 | static Value * |
533 | getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI, bool isTrue, |
534 | const SmallPtrSet<const Instruction *, 2> &Selects, |
535 | IRBuilder<> &IB) { |
536 | Value *V = nullptr; |
537 | for (SelectInst *DefSI = dyn_cast<SelectInst>(Val: SI.getI()); |
538 | DefSI != nullptr && Selects.count(Ptr: DefSI); |
539 | DefSI = dyn_cast<SelectInst>(Val: V)) { |
540 | if (DefSI->getCondition() == SI.getCondition()) |
541 | V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue()); |
542 | else // Handle inverted SI |
543 | V = (!isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue()); |
544 | } |
545 | |
546 | if (isa<BinaryOperator>(Val: SI.getI())) { |
547 | assert(SI.getI()->getOpcode() == Instruction::Or && |
548 | "Only currently handling Or instructions." ); |
549 | V = SI.getFalseValue(); |
550 | if (isTrue) |
551 | V = IB.CreateOr(LHS: V, RHS: ConstantInt::get(Ty: V->getType(), V: 1)); |
552 | } |
553 | |
554 | assert(V && "Failed to get select true/false value" ); |
555 | return V; |
556 | } |
557 | |
558 | void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { |
559 | for (SelectGroup &ASI : ProfSIGroups) { |
560 | // The code transformation here is a modified version of the sinking |
561 | // transformation in CodeGenPrepare::optimizeSelectInst with a more |
562 | // aggressive strategy of which instructions to sink. |
563 | // |
564 | // TODO: eliminate the redundancy of logic transforming selects to branches |
565 | // by removing CodeGenPrepare::optimizeSelectInst and optimizing here |
566 | // selects for all cases (with and without profile information). |
567 | |
568 | // Transform a sequence like this: |
569 | // start: |
570 | // %cmp = cmp uge i32 %a, %b |
571 | // %sel = select i1 %cmp, i32 %c, i32 %d |
572 | // |
573 | // Into: |
574 | // start: |
575 | // %cmp = cmp uge i32 %a, %b |
576 | // %cmp.frozen = freeze %cmp |
577 | // br i1 %cmp.frozen, label %select.true, label %select.false |
578 | // select.true: |
579 | // br label %select.end |
580 | // select.false: |
581 | // br label %select.end |
582 | // select.end: |
583 | // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ] |
584 | // |
585 | // %cmp should be frozen, otherwise it may introduce undefined behavior. |
586 | // In addition, we may sink instructions that produce %c or %d into the |
587 | // destination(s) of the new branch. |
588 | // If the true or false blocks do not contain a sunken instruction, that |
589 | // block and its branch may be optimized away. In that case, one side of the |
590 | // first branch will point directly to select.end, and the corresponding PHI |
591 | // predecessor block will be the start block. |
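// For example (illustrative), if %c above were defined as "%c = mul i32 %x, %y"
// with the select as its only use, the mul would be sunk into select.true:
//   select.true:
//     %c = mul i32 %x, %y
//     br label %select.end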
592 | |
593 | // Find all the instructions that can be soundly sunk to the true/false |
594 | // blocks. These are instructions that are computed solely for producing the |
595 | // operands of the select instructions in the group and can be sunk without |
596 | // breaking the semantics of the LLVM IR (e.g., cannot sink instructions |
597 | // with side effects). |
598 | SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices; |
599 | typedef std::stack<Instruction *>::size_type StackSizeType; |
600 | StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0; |
601 | for (SelectLike SI : ASI) { |
602 | // For each select, compute the sinkable dependence chains of the true and |
603 | // false operands. |
604 | if (auto *TI = dyn_cast_or_null<Instruction>(Val: SI.getTrueValue())) { |
605 | std::stack<Instruction *> TrueSlice; |
606 | getExclBackwardsSlice(I: TI, Slice&: TrueSlice, SI: SI.getI(), ForSinking: true); |
607 | maxTrueSliceLen = std::max(a: maxTrueSliceLen, b: TrueSlice.size()); |
608 | TrueSlices.push_back(Elt: TrueSlice); |
609 | } |
610 | if (auto *FI = dyn_cast_or_null<Instruction>(Val: SI.getFalseValue())) { |
611 | if (isa<SelectInst>(Val: SI.getI()) || !FI->hasOneUse()) { |
612 | std::stack<Instruction *> FalseSlice; |
613 | getExclBackwardsSlice(I: FI, Slice&: FalseSlice, SI: SI.getI(), ForSinking: true); |
614 | maxFalseSliceLen = std::max(a: maxFalseSliceLen, b: FalseSlice.size()); |
615 | FalseSlices.push_back(Elt: FalseSlice); |
616 | } |
617 | } |
618 | } |
619 | // In the case of multiple select instructions in the same group, the order |
620 | // of non-dependent instructions (instructions of different dependence |
621 | // slices) in the true/false blocks appears to affect performance. |
622 | // Interleaving the slices seems to experimentally be the optimal approach. |
623 | // This interleaving scheduling allows for more ILP (with a natural downside |
624 | // of slightly increasing register pressure) compared to a simple ordering of |
625 | // one whole chain after another. One would expect that this ordering would |
626 | // not matter since the scheduling in the backend of the compiler would |
627 | // take care of it, but apparently the scheduler fails to deliver optimal |
628 | // ILP with a naive ordering here. |
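// For example (illustrative), two slices {a1, a2} and {b1, b2} are emitted as
// a1, b1, a2, b2 rather than a1, a2, b1, b2.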
629 | SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved; |
630 | for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) { |
631 | for (auto &S : TrueSlices) { |
632 | if (!S.empty()) { |
633 | TrueSlicesInterleaved.push_back(Elt: S.top()); |
634 | S.pop(); |
635 | } |
636 | } |
637 | } |
638 | for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) { |
639 | for (auto &S : FalseSlices) { |
640 | if (!S.empty()) { |
641 | FalseSlicesInterleaved.push_back(Elt: S.top()); |
642 | S.pop(); |
643 | } |
644 | } |
645 | } |
646 | |
647 | // We split the block containing the select(s) into two blocks. |
648 | SelectLike SI = ASI.front(); |
649 | SelectLike LastSI = ASI.back(); |
650 | BasicBlock *StartBlock = SI.getI()->getParent(); |
651 | BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI())); |
652 | // With RemoveDIs turned off, SplitPt can be a dbg.* intrinsic. With |
653 | // RemoveDIs turned on, SplitPt would instead point to the next |
654 | // instruction. To match existing dbg.* intrinsic behaviour with RemoveDIs, |
655 | // tell splitBasicBlock that we want to include any DbgVariableRecords |
656 | // attached to SplitPt in the splice. |
657 | SplitPt.setHeadBit(true); |
658 | BasicBlock *EndBlock = StartBlock->splitBasicBlock(I: SplitPt, BBName: "select.end" ); |
659 | BFI->setBlockFreq(BB: EndBlock, Freq: BFI->getBlockFreq(BB: StartBlock)); |
660 | // Delete the unconditional branch that was just created by the split. |
661 | StartBlock->getTerminator()->eraseFromParent(); |
662 | |
663 | // Move any debug/pseudo instructions, and any 'not' of the group's condition, |
664 | | // that were in between the selects of the group to the newly-created end block. |
665 | SmallVector<Instruction *, 2> SinkInstrs; |
666 | auto DIt = SI.getI()->getIterator(); |
667 | while (&*DIt != LastSI.getI()) { |
668 | if (DIt->isDebugOrPseudoInst()) |
669 | SinkInstrs.push_back(Elt: &*DIt); |
670 | if (match(V: &*DIt, P: m_Not(V: m_Specific(V: SI.getCondition())))) |
671 | SinkInstrs.push_back(Elt: &*DIt); |
672 | DIt++; |
673 | } |
674 | for (auto *DI : SinkInstrs) |
675 | DI->moveBeforePreserving(MovePos: &*EndBlock->getFirstInsertionPt()); |
676 | |
677 | // Duplicate implementation for DbgRecords, the non-instruction debug-info |
678 | // format. Helper lambda for moving DbgRecords to the end block. |
679 | auto TransferDbgRecords = [&](Instruction &I) { |
680 | for (auto &DbgRecord : |
681 | llvm::make_early_inc_range(Range: I.getDbgRecordRange())) { |
682 | DbgRecord.removeFromParent(); |
683 | EndBlock->insertDbgRecordBefore(DR: &DbgRecord, |
684 | Here: EndBlock->getFirstInsertionPt()); |
685 | } |
686 | }; |
687 | |
688 | // Iterate over all instructions in between SI and LastSI, not including |
689 | // SI itself. These are all the variable assignments that happen "in the |
690 | // middle" of the select group. |
691 | auto R = make_range(x: std::next(x: SI.getI()->getIterator()), |
692 | y: std::next(x: LastSI.getI()->getIterator())); |
693 | llvm::for_each(Range&: R, F: TransferDbgRecords); |
694 | |
695 | // These are the new basic blocks for the conditional branch. |
696 | // At least one will become an actual new basic block. |
697 | BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr; |
698 | BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr; |
699 | if (!TrueSlicesInterleaved.empty()) { |
700 | TrueBlock = BasicBlock::Create(Context&: EndBlock->getContext(), Name: "select.true.sink" , |
701 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
702 | TrueBranch = BranchInst::Create(IfTrue: EndBlock, InsertBefore: TrueBlock); |
703 | TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc()); |
704 | for (Instruction *TrueInst : TrueSlicesInterleaved) |
705 | TrueInst->moveBefore(MovePos: TrueBranch); |
706 | } |
707 | if (!FalseSlicesInterleaved.empty()) { |
708 | FalseBlock = |
709 | BasicBlock::Create(Context&: EndBlock->getContext(), Name: "select.false.sink" , |
710 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
711 | FalseBranch = BranchInst::Create(IfTrue: EndBlock, InsertBefore: FalseBlock); |
712 | FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc()); |
713 | for (Instruction *FalseInst : FalseSlicesInterleaved) |
714 | FalseInst->moveBefore(MovePos: FalseBranch); |
715 | } |
716 | // If there was nothing to sink, then arbitrarily choose the 'false' side |
717 | // for a new input value to the PHI. |
718 | if (TrueBlock == FalseBlock) { |
719 | assert(TrueBlock == nullptr && |
720 | "Unexpected basic block transform while optimizing select" ); |
721 | |
722 | FalseBlock = BasicBlock::Create(Context&: StartBlock->getContext(), Name: "select.false" , |
723 | Parent: EndBlock->getParent(), InsertBefore: EndBlock); |
724 | auto *FalseBranch = BranchInst::Create(IfTrue: EndBlock, InsertBefore: FalseBlock); |
725 | FalseBranch->setDebugLoc(SI.getI()->getDebugLoc()); |
726 | } |
727 | |
728 | // Insert the real conditional branch based on the original condition. |
729 | // If we did not create a new block for one of the 'true' or 'false' paths |
730 | // of the condition, it means that side of the branch goes to the end block |
731 | // directly and the path originates from the start block from the point of |
732 | // view of the new PHI. |
733 | BasicBlock *TT, *FT; |
734 | if (TrueBlock == nullptr) { |
735 | TT = EndBlock; |
736 | FT = FalseBlock; |
737 | TrueBlock = StartBlock; |
738 | } else if (FalseBlock == nullptr) { |
739 | TT = TrueBlock; |
740 | FT = EndBlock; |
741 | FalseBlock = StartBlock; |
742 | } else { |
743 | TT = TrueBlock; |
744 | FT = FalseBlock; |
745 | } |
746 | IRBuilder<> IB(SI.getI()); |
747 | auto *CondFr = IB.CreateFreeze(V: SI.getCondition(), |
748 | Name: SI.getCondition()->getName() + ".frozen" ); |
749 | |
750 | SmallPtrSet<const Instruction *, 2> INS; |
751 | for (auto SI : ASI) |
752 | INS.insert(Ptr: SI.getI()); |
753 | |
754 | // Use reverse iterator because later select may use the value of the |
755 | // earlier select, and we need to propagate value through earlier select |
756 | // to get the PHI operand. |
757 | for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) { |
758 | SelectLike SI = *It; |
759 | // The select itself is replaced with a PHI Node. |
760 | PHINode *PN = PHINode::Create(Ty: SI.getType(), NumReservedValues: 2, NameStr: "" ); |
761 | PN->insertBefore(InsertPos: EndBlock->begin()); |
762 | PN->takeName(V: SI.getI()); |
763 | PN->addIncoming(V: getTrueOrFalseValue(SI, isTrue: true, Selects: INS, IB), BB: TrueBlock); |
764 | PN->addIncoming(V: getTrueOrFalseValue(SI, isTrue: false, Selects: INS, IB), BB: FalseBlock); |
765 | PN->setDebugLoc(SI.getI()->getDebugLoc()); |
766 | SI.getI()->replaceAllUsesWith(V: PN); |
767 | INS.erase(Ptr: SI.getI()); |
768 | ++NumSelectsConverted; |
769 | } |
770 | IB.CreateCondBr(Cond: CondFr, True: TT, False: FT, MDSrc: SI.getI()); |
771 | |
772 | // Remove the old select instructions, now that they are no longer used. |
773 | for (auto SI : ASI) |
774 | SI.getI()->eraseFromParent(); |
775 | } |
776 | } |
777 | |
778 | void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB, |
779 | SelectGroups &SIGroups) { |
780 | BasicBlock::iterator BBIt = BB.begin(); |
781 | while (BBIt != BB.end()) { |
782 | Instruction *I = &*BBIt++; |
783 | if (SelectLike SI = SelectLike::match(I)) { |
784 | if (!TTI->shouldTreatInstructionLikeSelect(I)) |
785 | continue; |
786 | |
787 | SelectGroup SIGroup; |
788 | SIGroup.push_back(Elt: SI); |
789 | while (BBIt != BB.end()) { |
790 | Instruction *NI = &*BBIt; |
791 | // Debug/pseudo instructions should be skipped and not prevent the |
792 | // formation of a select group. |
793 | if (NI->isDebugOrPseudoInst()) { |
794 | ++BBIt; |
795 | continue; |
796 | } |
797 | |
798 | // Skip a not(cond), as it may feed an inverted select of the same group |
799 | if (match(V: NI, P: m_Not(V: m_Specific(V: SI.getCondition())))) { |
800 | ++BBIt; |
801 | continue; |
802 | } |
803 | |
804 | // We only allow selects in the same group, not other select-like |
805 | // instructions. |
806 | if (!isa<SelectInst>(Val: NI)) |
807 | break; |
808 | |
809 | SelectLike NSI = SelectLike::match(I: NI); |
810 | if (NSI && SI.getCondition() == NSI.getCondition()) { |
811 | SIGroup.push_back(Elt: NSI); |
812 | } else if (NSI && match(V: NSI.getCondition(), |
813 | P: m_Not(V: m_Specific(V: SI.getCondition())))) { |
814 | NSI.setInverted(); |
815 | SIGroup.push_back(Elt: NSI); |
816 | } else |
817 | break; |
818 | ++BBIt; |
819 | } |
820 | |
821 | // If the select type is not supported, no point optimizing it. |
822 | // Instruction selection will take care of it. |
823 | if (!isSelectKindSupported(SI)) |
824 | continue; |
825 | |
826 | LLVM_DEBUG({ |
827 | dbgs() << "New Select group with\n" ; |
828 | for (auto SI : SIGroup) |
829 | dbgs() << " " << *SI.getI() << "\n" ; |
830 | }); |
831 | |
832 | SIGroups.push_back(Elt: SIGroup); |
833 | } |
834 | } |
835 | } |
836 | |
837 | void SelectOptimizeImpl::findProfitableSIGroupsBase( |
838 | SelectGroups &SIGroups, SelectGroups &ProfSIGroups) { |
839 | for (SelectGroup &ASI : SIGroups) { |
840 | ++NumSelectOptAnalyzed; |
841 | if (isConvertToBranchProfitableBase(ASI)) |
842 | ProfSIGroups.push_back(Elt: ASI); |
843 | } |
844 | } |
845 | |
846 | static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE, |
847 | DiagnosticInfoOptimizationBase &Rem) { |
848 | LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n" ); |
849 | ORE->emit(OptDiag&: Rem); |
850 | } |
851 | |
852 | void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops( |
853 | const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) { |
854 | NumSelectOptAnalyzed += SIGroups.size(); |
855 | // For each select group in an inner-most loop, |
856 | // a branch is more preferable than a select/conditional-move if: |
857 | // i) conversion to branches for all the select groups of the loop satisfies |
858 | // loop-level heuristics including reducing the loop's critical path by |
859 | // some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and |
860 | // ii) the total cost of the select group is cheaper with a branch compared |
861 | // to its predicated version. The cost is in terms of latency and the cost |
862 | // of a select group is the cost of its most expensive select instruction |
863 | // (assuming infinite resources and thus fully leveraging available ILP). |
864 | |
865 | DenseMap<const Instruction *, CostInfo> InstCostMap; |
866 | CostInfo LoopCost[2] = {{.PredCost: Scaled64::getZero(), .NonPredCost: Scaled64::getZero()}, |
867 | {.PredCost: Scaled64::getZero(), .NonPredCost: Scaled64::getZero()}}; |
868 | if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) || |
869 | !checkLoopHeuristics(L, LoopDepth: LoopCost)) { |
870 | return; |
871 | } |
872 | |
873 | for (SelectGroup &ASI : SIGroups) { |
874 | // Assuming infinite resources, the cost of a group of instructions is the |
875 | // cost of the most expensive instruction of the group. |
876 | Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero(); |
877 | for (SelectLike SI : ASI) { |
878 | SelectCost = std::max(a: SelectCost, b: InstCostMap[SI.getI()].PredCost); |
879 | BranchCost = std::max(a: BranchCost, b: InstCostMap[SI.getI()].NonPredCost); |
880 | } |
881 | if (BranchCost < SelectCost) { |
882 | OptimizationRemark OR(DEBUG_TYPE, "SelectOpti" , ASI.front().getI()); |
883 | OR << "Profitable to convert to branch (loop analysis). BranchCost=" |
884 | << BranchCost.toString() << ", SelectCost=" << SelectCost.toString() |
885 | << ". " ; |
886 | EmitAndPrintRemark(ORE, Rem&: OR); |
887 | ++NumSelectConvertedLoop; |
888 | ProfSIGroups.push_back(Elt: ASI); |
889 | } else { |
890 | OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti" , |
891 | ASI.front().getI()); |
892 | ORmiss << "Select is more profitable (loop analysis). BranchCost=" |
893 | << BranchCost.toString() |
894 | << ", SelectCost=" << SelectCost.toString() << ". " ; |
895 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
896 | } |
897 | } |
898 | } |
899 | |
900 | bool SelectOptimizeImpl::isConvertToBranchProfitableBase( |
901 | const SelectGroup &ASI) { |
902 | SelectLike SI = ASI.front(); |
903 | LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI.getI() |
904 | << "\n" ); |
905 | OptimizationRemark OR(DEBUG_TYPE, "SelectOpti" , SI.getI()); |
906 | OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti" , SI.getI()); |
907 | |
908 | // Skip cold basic blocks. Better to optimize for size for cold blocks. |
909 | if (PSI->isColdBlock(BB: SI.getI()->getParent(), BFI)) { |
910 | ++NumSelectColdBB; |
911 | ORmiss << "Not converted to branch because of cold basic block. " ; |
912 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
913 | return false; |
914 | } |
915 | |
916 | // If unpredictable, branch form is less profitable. |
917 | if (SI.getI()->getMetadata(KindID: LLVMContext::MD_unpredictable)) { |
918 | ++NumSelectUnPred; |
919 | ORmiss << "Not converted to branch because of unpredictable branch. " ; |
920 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
921 | return false; |
922 | } |
923 | |
924 | // If highly predictable, branch form is more profitable, unless a |
925 | // predictable select is inexpensive in the target architecture. |
926 | if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) { |
927 | ++NumSelectConvertedHighPred; |
928 | OR << "Converted to branch because of highly predictable branch. " ; |
929 | EmitAndPrintRemark(ORE, Rem&: OR); |
930 | return true; |
931 | } |
932 | |
933 | // Look for expensive instructions in the cold operand's (if any) dependence |
934 | // slice of any of the selects in the group. |
935 | if (hasExpensiveColdOperand(ASI)) { |
936 | ++NumSelectConvertedExpColdOperand; |
937 | OR << "Converted to branch because of expensive cold operand." ; |
938 | EmitAndPrintRemark(ORE, Rem&: OR); |
939 | return true; |
940 | } |
941 | |
942 | ORmiss << "Not profitable to convert to branch (base heuristic)." ; |
943 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
944 | return false; |
945 | } |
946 | |
947 | static InstructionCost divideNearest(InstructionCost Numerator, |
948 | uint64_t Denominator) { |
949 | return (Numerator + (Denominator / 2)) / Denominator; |
950 | } |
951 | |
952 | static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI, |
953 | uint64_t &TrueVal, uint64_t &FalseVal) { |
954 | if (isa<SelectInst>(Val: SI.getI())) |
955 | return extractBranchWeights(I: *SI.getI(), TrueVal, FalseVal); |
956 | return false; |
957 | } |
958 | |
959 | bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) { |
960 | bool ColdOperand = false; |
961 | uint64_t TrueWeight, FalseWeight, TotalWeight; |
962 | if (extractBranchWeights(SI: ASI.front(), TrueVal&: TrueWeight, FalseVal&: FalseWeight)) { |
963 | uint64_t MinWeight = std::min(a: TrueWeight, b: FalseWeight); |
964 | TotalWeight = TrueWeight + FalseWeight; |
965 | // Is there a path with frequency <ColdOperandThreshold% (default:20%) ? |
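// E.g., with weights {90, 10}: 100 * 20 > 100 * 10, so the path taken 10% of
// the time is considered cold.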
966 | ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight; |
967 | } else if (PSI->hasProfileSummary()) { |
968 | OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti" , |
969 | ASI.front().getI()); |
970 | ORmiss << "Profile data available but missing branch-weights metadata for " |
971 | "select instruction. " ; |
972 | EmitAndPrintRemark(ORE, Rem&: ORmiss); |
973 | } |
974 | if (!ColdOperand) |
975 | return false; |
976 | // Check if the cold path's dependence slice is expensive for any of the |
977 | // selects of the group. |
978 | for (SelectLike SI : ASI) { |
979 | Instruction *ColdI = nullptr; |
980 | uint64_t HotWeight; |
981 | if (TrueWeight < FalseWeight) { |
982 | ColdI = dyn_cast_or_null<Instruction>(Val: SI.getTrueValue()); |
983 | HotWeight = FalseWeight; |
984 | } else { |
985 | ColdI = dyn_cast_or_null<Instruction>(Val: SI.getFalseValue()); |
986 | HotWeight = TrueWeight; |
987 | } |
988 | if (ColdI) { |
989 | std::stack<Instruction *> ColdSlice; |
990 | getExclBackwardsSlice(I: ColdI, Slice&: ColdSlice, SI: SI.getI()); |
991 | InstructionCost SliceCost = 0; |
992 | while (!ColdSlice.empty()) { |
993 | SliceCost += TTI->getInstructionCost(U: ColdSlice.top(), |
994 | CostKind: TargetTransformInfo::TCK_Latency); |
995 | ColdSlice.pop(); |
996 | } |
997 | // The colder the cold value operand of the select is, the more wasteful the |
998 | // cmov becomes, since it must compute the cold value operand on every |
999 | // execution. Thus, the colder the cold operand is, the more its cost counts. |
1000 | // Get nearest integer cost adjusted for coldness. |
1001 | InstructionCost AdjSliceCost = |
1002 | divideNearest(Numerator: SliceCost * HotWeight, Denominator: TotalWeight); |
1003 | if (AdjSliceCost >= |
1004 | ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive) |
1005 | return true; |
1006 | } |
1007 | } |
1008 | return false; |
1009 | } |
1010 | |
1011 | // Check if it is safe to move LoadI next to the SI. |
1012 | // Conservatively assume it is safe only if there is no instruction |
1013 | // modifying memory in-between the load and the select instruction. |
1014 | static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) { |
1015 | // Assume loads from different basic blocks are unsafe to move. |
1016 | if (LoadI->getParent() != SI->getParent()) |
1017 | return false; |
1018 | auto It = LoadI->getIterator(); |
1019 | while (&*It != SI) { |
1020 | if (It->mayWriteToMemory()) |
1021 | return false; |
1022 | It++; |
1023 | } |
1024 | return true; |
1025 | } |
1026 | |
1027 | // For a given source instruction, collect its backwards dependence slice |
1028 | // consisting of instructions exclusively computed for the purpose of producing |
1029 | // the operands of the source instruction. As an approximation |
1030 | // (sufficiently-accurate in practice), we populate this set with the |
1031 | // instructions of the backwards dependence slice that have only one use and |
1032 | // form a one-use chain that leads to the source instruction. |
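// For example (illustrative), if the true operand is "%t = add i32 %m, 1" and
// "%m = mul i32 %a, %b", where %m and %t each have a single use feeding only
// the select, the exclusive slice of %t is {%t, %m}.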
1033 | void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I, |
1034 | std::stack<Instruction *> &Slice, |
1035 | Instruction *SI, |
1036 | bool ForSinking) { |
1037 | SmallPtrSet<Instruction *, 2> Visited; |
1038 | std::queue<Instruction *> Worklist; |
1039 | Worklist.push(x: I); |
1040 | while (!Worklist.empty()) { |
1041 | Instruction *II = Worklist.front(); |
1042 | Worklist.pop(); |
1043 | |
1044 | // Avoid cycles. |
1045 | if (!Visited.insert(Ptr: II).second) |
1046 | continue; |
1047 | |
1048 | if (!II->hasOneUse()) |
1049 | continue; |
1050 | |
1051 | // Cannot soundly sink instructions with side-effects. |
1052 | // Terminator or phi instructions cannot be sunk. |
1053 | // Avoid sinking other select instructions (should be handled separately). |
1054 | if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() || |
1055 | isa<SelectInst>(Val: II) || isa<PHINode>(Val: II))) |
1056 | continue; |
1057 | |
1058 | // Avoid sinking loads so as not to skip state-modifying instructions that |
1059 | // may alias with the loaded address. |
1060 | // Only allow sinking of loads within the same basic block that are |
1061 | // conservatively proven to be safe. |
1062 | if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(LoadI: II, SI)) |
1063 | continue; |
1064 | |
1065 | // Avoid considering instructions with less frequency than the source |
1066 | // instruction (i.e., avoid colder code regions of the dependence slice). |
1067 | if (BFI->getBlockFreq(BB: II->getParent()) < BFI->getBlockFreq(BB: I->getParent())) |
1068 | continue; |
1069 | |
1070 | // Eligible one-use instruction added to the dependence slice. |
1071 | Slice.push(x: II); |
1072 | |
1073 | // Explore all the operands of the current instruction to expand the slice. |
1074 | for (Value *Op : II->operand_values()) |
1075 | if (auto *OpI = dyn_cast<Instruction>(Val: Op)) |
1076 | Worklist.push(x: OpI); |
1077 | } |
1078 | } |
1079 | |
1080 | bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) { |
1081 | uint64_t TrueWeight, FalseWeight; |
1082 | if (extractBranchWeights(SI, TrueVal&: TrueWeight, FalseVal&: FalseWeight)) { |
1083 | uint64_t Max = std::max(a: TrueWeight, b: FalseWeight); |
1084 | uint64_t Sum = TrueWeight + FalseWeight; |
1085 | if (Sum != 0) { |
1086 | auto Probability = BranchProbability::getBranchProbability(Numerator: Max, Denominator: Sum); |
1087 | if (Probability > TTI->getPredictableBranchThreshold()) |
1088 | return true; |
1089 | } |
1090 | } |
1091 | return false; |
1092 | } |
1093 | |
1094 | bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L, |
1095 | const CostInfo LoopCost[2]) { |
1096 | // Loop-level checks to determine if a non-predicated version (with branches) |
1097 | // of the loop is more profitable than its predicated version. |
1098 | |
1099 | if (DisableLoopLevelHeuristics) |
1100 | return true; |
1101 | |
1102 | OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti" , |
1103 | L->getHeader()->getFirstNonPHI()); |
1104 | |
1105 | if (LoopCost[0].NonPredCost > LoopCost[0].PredCost || |
1106 | LoopCost[1].NonPredCost >= LoopCost[1].PredCost) { |
1107 | ORmissL << "No select conversion in the loop due to no reduction of loop's " |
1108 | "critical path. " ; |
1109 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1110 | return false; |
1111 | } |
1112 | |
1113 | Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost, |
1114 | LoopCost[1].PredCost - LoopCost[1].NonPredCost}; |
1115 | |
1116 | // Profitably converting to branches needs to reduce the loop's critical path |
1117 | // by at least some threshold (absolute gain of GainCycleThreshold cycles and |
1118 | // relative gain of 12.5%). |
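// E.g. (illustrative, default thresholds), with a predicated critical path of
// 40 cycles for the second analyzed iteration, the branch form must save at
// least max(4, 40/8) = 5 cycles to be considered.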
1119 | if (Gain[1] < Scaled64::get(N: GainCycleThreshold) || |
1120 | Gain[1] * Scaled64::get(N: GainRelativeThreshold) < LoopCost[1].PredCost) { |
1121 | Scaled64 RelativeGain = Scaled64::get(N: 100) * Gain[1] / LoopCost[1].PredCost; |
1122 | ORmissL << "No select conversion in the loop due to small reduction of " |
1123 | "loop's critical path. Gain=" |
1124 | << Gain[1].toString() |
1125 | << ", RelativeGain=" << RelativeGain.toString() << "%. " ; |
1126 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1127 | return false; |
1128 | } |
1129 | |
1130 | // If the loop's critical path involves loop-carried dependences, the gradient |
1131 | // of the gain needs to be at least GainGradientThreshold% (defaults to 25%). |
1132 | // This check ensures that the latency reduction for the loop's critical path |
1133 | // keeps decreasing with sufficient rate beyond the two analyzed loop |
1134 | // iterations. |
1135 | if (Gain[1] > Gain[0]) { |
1136 | Scaled64 GradientGain = Scaled64::get(N: 100) * (Gain[1] - Gain[0]) / |
1137 | (LoopCost[1].PredCost - LoopCost[0].PredCost); |
1138 | if (GradientGain < Scaled64::get(N: GainGradientThreshold)) { |
1139 | ORmissL << "No select conversion in the loop due to small gradient gain. " |
1140 | "GradientGain=" |
1141 | << GradientGain.toString() << "%. " ; |
1142 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1143 | return false; |
1144 | } |
1145 | } |
1146 | // If the gain decreases it is not profitable to convert. |
1147 | else if (Gain[1] < Gain[0]) { |
1148 | ORmissL |
1149 | << "No select conversion in the loop due to negative gradient gain. " ; |
1150 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1151 | return false; |
1152 | } |
1153 | |
1154 | // Non-predicated version of the loop is more profitable than its |
1155 | // predicated version. |
1156 | return true; |
1157 | } |
1158 | |
1159 | // Computes instruction and loop-critical-path costs for both the predicated |
1160 | // and non-predicated version of the given loop. |
1161 | // Returns false if unable to compute these costs due to invalid cost of loop |
1162 | // instruction(s). |
1163 | bool SelectOptimizeImpl::computeLoopCosts( |
1164 | const Loop *L, const SelectGroups &SIGroups, |
1165 | DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) { |
1166 | LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop " |
1167 | << L->getHeader()->getName() << "\n" ); |
1168 | const auto &SImap = getSImap(SIGroups); |
1169 | // Compute instruction and loop-critical-path costs across two iterations for |
1170 | // both predicated and non-predicated version. |
1171 | const unsigned Iterations = 2; |
1172 | for (unsigned Iter = 0; Iter < Iterations; ++Iter) { |
1173 | // Cost of the loop's critical path. |
1174 | CostInfo &MaxCost = LoopCost[Iter]; |
1175 | for (BasicBlock *BB : L->getBlocks()) { |
1176 | for (const Instruction &I : *BB) { |
1177 | if (I.isDebugOrPseudoInst()) |
1178 | continue; |
1179 | // Compute the predicated and non-predicated cost of the instruction. |
1180 | Scaled64 IPredCost = Scaled64::getZero(), |
1181 | INonPredCost = Scaled64::getZero(); |
1182 | |
1183 | // Assume infinite resources that allow us to fully exploit the available |
1184 | // instruction-level parallelism. |
1185 | // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost) |
1186 | for (const Use &U : I.operands()) { |
1187 | auto UI = dyn_cast<Instruction>(Val: U.get()); |
1188 | if (!UI) |
1189 | continue; |
1190 | if (InstCostMap.count(Val: UI)) { |
1191 | IPredCost = std::max(a: IPredCost, b: InstCostMap[UI].PredCost); |
1192 | INonPredCost = std::max(a: INonPredCost, b: InstCostMap[UI].NonPredCost); |
1193 | } |
1194 | } |
1195 | auto ILatency = computeInstCost(I: &I); |
1196 | if (!ILatency) { |
1197 | OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti" , &I); |
1198 | ORmissL << "Invalid instruction cost preventing analysis and " |
1199 | "optimization of the inner-most loop containing this " |
1200 | "instruction. " ; |
1201 | EmitAndPrintRemark(ORE, Rem&: ORmissL); |
1202 | return false; |
1203 | } |
1204 | IPredCost += Scaled64::get(N: *ILatency); |
1205 | INonPredCost += Scaled64::get(N: *ILatency); |
1206 | |
1207 | // For a select that can be converted to branch, |
1208 | // compute its cost as a branch (non-predicated cost). |
1209 | // |
1210 | // BranchCost = PredictedPathCost + MispredictCost |
1211 | // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb |
1212 | // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate |
1213 | if (SImap.contains(Val: &I)) { |
1214 | auto SI = SImap.at(Val: &I); |
1215 | Scaled64 TrueOpCost = SI.getTrueOpCost(InstCostMap, TTI); |
1216 | Scaled64 FalseOpCost = SI.getFalseOpCost(InstCostMap, TTI); |
1217 | Scaled64 PredictedPathCost = |
1218 | getPredictedPathCost(TrueCost: TrueOpCost, FalseCost: FalseOpCost, SI); |
1219 | |
1220 | Scaled64 CondCost = Scaled64::getZero(); |
1221 | if (auto *CI = dyn_cast<Instruction>(Val: SI.getCondition())) |
1222 | if (InstCostMap.count(Val: CI)) |
1223 | CondCost = InstCostMap[CI].NonPredCost; |
1224 | Scaled64 MispredictCost = getMispredictionCost(SI, CondCost); |
1225 | |
1226 | INonPredCost = PredictedPathCost + MispredictCost; |
1227 | } |
1228 | LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/" |
1229 | << INonPredCost << " for " << I << "\n" ); |
1230 | |
1231 | InstCostMap[&I] = {.PredCost: IPredCost, .NonPredCost: INonPredCost}; |
1232 | MaxCost.PredCost = std::max(a: MaxCost.PredCost, b: IPredCost); |
1233 | MaxCost.NonPredCost = std::max(a: MaxCost.NonPredCost, b: INonPredCost); |
1234 | } |
1235 | } |
1236 | LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1 |
1237 | << " MaxCost = " << MaxCost.PredCost << " " |
1238 | << MaxCost.NonPredCost << "\n" ); |
1239 | } |
1240 | return true; |
1241 | } |
1242 | |
1243 | SmallDenseMap<const Instruction *, SelectOptimizeImpl::SelectLike, 2> |
1244 | SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) { |
1245 | SmallDenseMap<const Instruction *, SelectLike, 2> SImap; |
1246 | for (const SelectGroup &ASI : SIGroups) |
1247 | for (SelectLike SI : ASI) |
1248 | SImap.try_emplace(Key: SI.getI(), Args&: SI); |
1249 | return SImap; |
1250 | } |
1251 | |
1252 | std::optional<uint64_t> |
1253 | SelectOptimizeImpl::computeInstCost(const Instruction *I) { |
1254 | InstructionCost ICost = |
1255 | TTI->getInstructionCost(U: I, CostKind: TargetTransformInfo::TCK_Latency); |
1256 | if (auto OC = ICost.getValue()) |
1257 | return std::optional<uint64_t>(*OC); |
1258 | return std::nullopt; |
1259 | } |
1260 | |
1261 | ScaledNumber<uint64_t> |
1262 | SelectOptimizeImpl::getMispredictionCost(const SelectLike SI, |
1263 | const Scaled64 CondCost) { |
1264 | uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty; |
1265 | |
1266 | // Account for the default misprediction rate when using a branch |
1267 | // (conservatively set to 25% by default). |
1268 | uint64_t MispredictRate = MispredictDefaultRate; |
1269 | // If the select condition is obviously predictable, then the misprediction |
1270 | // rate is zero. |
1271 | if (isSelectHighlyPredictable(SI)) |
1272 | MispredictRate = 0; |
1273 | |
1274 | // CondCost is included to account for cases where the computation of the |
1275 | // condition is part of a long dependence chain (potentially loop-carried) |
1276 | // that would delay detection of a misprediction and increase its cost. |
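// E.g. (illustrative numbers), with MispredictPenalty = 14, CondCost = 20 and
// the default 25% rate: max(14, 20) * 25 / 100 = 5 cycles.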
1277 | Scaled64 MispredictCost = |
1278 | std::max(a: Scaled64::get(N: MispredictPenalty), b: CondCost) * |
1279 | Scaled64::get(N: MispredictRate); |
1280 | MispredictCost /= Scaled64::get(N: 100); |
1281 | |
1282 | return MispredictCost; |
1283 | } |
1284 | |
1285 | // Returns the cost of a branch when the prediction is correct. |
1286 | // TrueCost * TrueProbability + FalseCost * FalseProbability. |
1287 | ScaledNumber<uint64_t> |
1288 | SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, |
1289 | const SelectLike SI) { |
1290 | Scaled64 PredPathCost; |
1291 | uint64_t TrueWeight, FalseWeight; |
1292 | if (extractBranchWeights(SI, TrueVal&: TrueWeight, FalseVal&: FalseWeight)) { |
1293 | uint64_t SumWeight = TrueWeight + FalseWeight; |
1294 | if (SumWeight != 0) { |
1295 | PredPathCost = TrueCost * Scaled64::get(N: TrueWeight) + |
1296 | FalseCost * Scaled64::get(N: FalseWeight); |
1297 | PredPathCost /= Scaled64::get(N: SumWeight); |
1298 | return PredPathCost; |
1299 | } |
1300 | } |
1301 | // Without branch weight metadata, we assume 75% for the one path and 25% for |
1302 | // the other, and pick the result with the biggest cost. |
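// E.g. (illustrative), TrueCost = 8 and FalseCost = 4 gives
// max(8*3 + 4, 4*3 + 8) / 4 = 28 / 4 = 7.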
1303 | PredPathCost = std::max(a: TrueCost * Scaled64::get(N: 3) + FalseCost, |
1304 | b: FalseCost * Scaled64::get(N: 3) + TrueCost); |
1305 | PredPathCost /= Scaled64::get(N: 4); |
1306 | return PredPathCost; |
1307 | } |
1308 | |
1309 | bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) { |
1310 | bool VectorCond = !SI.getCondition()->getType()->isIntegerTy(Bitwidth: 1); |
1311 | if (VectorCond) |
1312 | return false; |
1313 | TargetLowering::SelectSupportKind SelectKind; |
1314 | if (SI.getType()->isVectorTy()) |
1315 | SelectKind = TargetLowering::ScalarCondVectorVal; |
1316 | else |
1317 | SelectKind = TargetLowering::ScalarValSelect; |
1318 | return TLI->isSelectSupported(SelectKind); |
1319 | } |
1320 | |