//===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass converts selects to conditional jumps when profitable.
//
//===----------------------------------------------------------------------===//
12
13#include "llvm/CodeGen/SelectOptimize.h"
14#include "llvm/ADT/SetVector.h"
15#include "llvm/ADT/SmallVector.h"
16#include "llvm/ADT/Statistic.h"
17#include "llvm/Analysis/BlockFrequencyInfo.h"
18#include "llvm/Analysis/BranchProbabilityInfo.h"
19#include "llvm/Analysis/LoopInfo.h"
20#include "llvm/Analysis/OptimizationRemarkEmitter.h"
21#include "llvm/Analysis/ProfileSummaryInfo.h"
22#include "llvm/Analysis/TargetTransformInfo.h"
23#include "llvm/CodeGen/Passes.h"
24#include "llvm/CodeGen/TargetLowering.h"
25#include "llvm/CodeGen/TargetPassConfig.h"
26#include "llvm/CodeGen/TargetSchedule.h"
27#include "llvm/CodeGen/TargetSubtargetInfo.h"
28#include "llvm/IR/BasicBlock.h"
29#include "llvm/IR/Dominators.h"
30#include "llvm/IR/Function.h"
31#include "llvm/IR/IRBuilder.h"
32#include "llvm/IR/Instruction.h"
33#include "llvm/IR/PatternMatch.h"
34#include "llvm/IR/ProfDataUtils.h"
35#include "llvm/InitializePasses.h"
36#include "llvm/Pass.h"
37#include "llvm/Support/ScaledNumber.h"
38#include "llvm/Target/TargetMachine.h"
39#include "llvm/Transforms/Utils/SizeOpts.h"
40#include <algorithm>
41#include <queue>
42#include <stack>
43
44using namespace llvm;
45using namespace llvm::PatternMatch;
46
47#define DEBUG_TYPE "select-optimize"
48
// Pass statistics (printed with -stats). Each counter records the outcome of
// one heuristic decision on a select group or individual select.
STATISTIC(NumSelectOptAnalyzed,
          "Number of select groups considered for conversion to branch");
STATISTIC(NumSelectConvertedExpColdOperand,
          "Number of select groups converted due to expensive cold operand");
STATISTIC(NumSelectConvertedHighPred,
          "Number of select groups converted due to high-predictability");
STATISTIC(NumSelectUnPred,
          "Number of select groups not converted due to unpredictability");
STATISTIC(NumSelectColdBB,
          "Number of select groups not converted due to cold basic block");
STATISTIC(NumSelectConvertedLoop,
          "Number of select groups converted due to loop-level analysis");
STATISTIC(NumSelectsConverted, "Number of selects converted");
62
63static cl::opt<unsigned> ColdOperandThreshold(
64 "cold-operand-threshold",
65 cl::desc("Maximum frequency of path for an operand to be considered cold."),
66 cl::init(Val: 20), cl::Hidden);
67
68static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
69 "cold-operand-max-cost-multiplier",
70 cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
71 "slice of a cold operand to be considered inexpensive."),
72 cl::init(Val: 1), cl::Hidden);
73
74static cl::opt<unsigned>
75 GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
76 cl::desc("Gradient gain threshold (%)."),
77 cl::init(Val: 25), cl::Hidden);
78
79static cl::opt<unsigned>
80 GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
81 cl::desc("Minimum gain per loop (in cycles) threshold."),
82 cl::init(Val: 4), cl::Hidden);
83
84static cl::opt<unsigned> GainRelativeThreshold(
85 "select-opti-loop-relative-gain-threshold",
86 cl::desc(
87 "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
88 cl::init(Val: 8), cl::Hidden);
89
90static cl::opt<unsigned> MispredictDefaultRate(
91 "mispredict-default-rate", cl::Hidden, cl::init(Val: 25),
92 cl::desc("Default mispredict rate (initialized to 25%)."));
93
94static cl::opt<bool>
95 DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
96 cl::init(Val: false),
97 cl::desc("Disable loop-level heuristics."));
98
99namespace {
100
101class SelectOptimizeImpl {
102 const TargetMachine *TM = nullptr;
103 const TargetSubtargetInfo *TSI = nullptr;
104 const TargetLowering *TLI = nullptr;
105 const TargetTransformInfo *TTI = nullptr;
106 const LoopInfo *LI = nullptr;
107 BlockFrequencyInfo *BFI;
108 ProfileSummaryInfo *PSI = nullptr;
109 OptimizationRemarkEmitter *ORE = nullptr;
110 TargetSchedModel TSchedModel;
111
112public:
113 SelectOptimizeImpl() = default;
114 SelectOptimizeImpl(const TargetMachine *TM) : TM(TM){};
115 PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
116 bool runOnFunction(Function &F, Pass &P);
117
118 using Scaled64 = ScaledNumber<uint64_t>;
119
120 struct CostInfo {
121 /// Predicated cost (with selects as conditional moves).
122 Scaled64 PredCost;
123 /// Non-predicated cost (with selects converted to branches).
124 Scaled64 NonPredCost;
125 };
126
127 /// SelectLike is an abstraction over SelectInst and other operations that can
128 /// act like selects. For example Or(Zext(icmp), X) can be treated like
129 /// select(icmp, X|1, X).
130 class SelectLike {
131 /// The select (/or) instruction.
132 Instruction *I;
133 /// Whether this select is inverted, "not(cond), FalseVal, TrueVal", as
134 /// opposed to the original condition.
135 bool Inverted = false;
136
137 /// The index of the operand that depends on condition. Only for select-like
138 /// instruction such as Or/Add.
139 unsigned CondIdx;
140
141 public:
142 SelectLike(Instruction *I, bool Inverted = false, unsigned CondIdx = 0)
143 : I(I), Inverted(Inverted), CondIdx(CondIdx) {}
144
145 Instruction *getI() { return I; }
146 const Instruction *getI() const { return I; }
147
148 Type *getType() const { return I->getType(); }
149
150 unsigned getConditionOpIndex() { return CondIdx; };
151
152 /// Return the true value for the SelectLike instruction. Note this may not
153 /// exist for all SelectLike instructions. For example, for `or(zext(c), x)`
154 /// the true value would be `or(x,1)`. As this value does not exist, nullptr
155 /// is returned.
156 Value *getTrueValue(bool HonorInverts = true) const {
157 if (Inverted && HonorInverts)
158 return getFalseValue(/*HonorInverts=*/false);
159 if (auto *Sel = dyn_cast<SelectInst>(Val: I))
160 return Sel->getTrueValue();
161 // Or(zext) case - The true value is Or(X), so return nullptr as the value
162 // does not yet exist.
163 if (isa<BinaryOperator>(Val: I))
164 return nullptr;
165
166 llvm_unreachable("Unhandled case in getTrueValue");
167 }
168
169 /// Return the false value for the SelectLike instruction. For example the
170 /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is
171 /// `select(c, x|1, x)`)
172 Value *getFalseValue(bool HonorInverts = true) const {
173 if (Inverted && HonorInverts)
174 return getTrueValue(/*HonorInverts=*/false);
175 if (auto *Sel = dyn_cast<SelectInst>(Val: I))
176 return Sel->getFalseValue();
177 // We are on the branch where the condition is zero, which means BinOp
178 // does not perform any computation, and we can simply return the operand
179 // that is not related to the condition
180 if (auto *BO = dyn_cast<BinaryOperator>(Val: I))
181 return BO->getOperand(i_nocapture: 1 - CondIdx);
182
183 llvm_unreachable("Unhandled case in getFalseValue");
184 }
185
186 /// Return the NonPredCost cost of the op on \p isTrue branch, given the
187 /// costs in \p InstCostMap. This may need to be generated for select-like
188 /// instructions.
189 Scaled64 getOpCostOnBranch(
190 bool IsTrue, const DenseMap<const Instruction *, CostInfo> &InstCostMap,
191 const TargetTransformInfo *TTI) {
192 auto *V = IsTrue ? getTrueValue() : getFalseValue();
193 if (V) {
194 if (auto *IV = dyn_cast<Instruction>(Val: V)) {
195 auto It = InstCostMap.find(Val: IV);
196 return It != InstCostMap.end() ? It->second.NonPredCost
197 : Scaled64::getZero();
198 }
199 return Scaled64::getZero();
200 }
201 // If getTrue(False)Value() return nullptr, it means we are dealing with
202 // select-like instructions on the branch where the actual computation is
203 // happening. In that case the cost is equal to the cost of computation +
204 // cost of non-dependant on condition operand
205 InstructionCost Cost = TTI->getArithmeticInstrCost(
206 Opcode: getI()->getOpcode(), Ty: I->getType(), CostKind: TargetTransformInfo::TCK_Latency,
207 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
208 Opd2Info: {.Kind: TTI::OK_UniformConstantValue, .Properties: TTI::OP_PowerOf2});
209 auto TotalCost = Scaled64::get(N: Cost.getValue());
210 if (auto *OpI = dyn_cast<Instruction>(Val: I->getOperand(i: 1 - CondIdx))) {
211 auto It = InstCostMap.find(Val: OpI);
212 if (It != InstCostMap.end())
213 TotalCost += It->second.NonPredCost;
214 }
215 return TotalCost;
216 }
217 };
218
219private:
220 // Select groups consist of consecutive select-like instructions with the same
221 // condition. Between select-likes could be any number of auxiliary
222 // instructions related to the condition like not, zext, ashr/lshr
223 struct SelectGroup {
224 Value *Condition;
225 SmallVector<SelectLike, 2> Selects;
226 };
227 using SelectGroups = SmallVector<SelectGroup, 2>;
228
229 // Converts select instructions of a function to conditional jumps when deemed
230 // profitable. Returns true if at least one select was converted.
231 bool optimizeSelects(Function &F);
232
233 // Heuristics for determining which select instructions can be profitably
234 // conveted to branches. Separate heuristics for selects in inner-most loops
235 // and the rest of code regions (base heuristics for non-inner-most loop
236 // regions).
237 void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
238 void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
239
240 // Converts to branches the select groups that were deemed
241 // profitable-to-convert.
242 void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
243
244 // Splits selects of a given basic block into select groups.
245 void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
246
247 // Determines for which select groups it is profitable converting to branches
248 // (base and inner-most-loop heuristics).
249 void findProfitableSIGroupsBase(SelectGroups &SIGroups,
250 SelectGroups &ProfSIGroups);
251 void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
252 SelectGroups &ProfSIGroups);
253
254 // Determines if a select group should be converted to a branch (base
255 // heuristics).
256 bool isConvertToBranchProfitableBase(const SelectGroup &ASI);
257
258 // Returns true if there are expensive instructions in the cold value
259 // operand's (if any) dependence slice of any of the selects of the given
260 // group.
261 bool hasExpensiveColdOperand(const SelectGroup &ASI);
262
263 // For a given source instruction, collect its backwards dependence slice
264 // consisting of instructions exclusively computed for producing the operands
265 // of the source instruction.
266 void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
267 Instruction *SI, bool ForSinking = false);
268
269 // Returns true if the condition of the select is highly predictable.
270 bool isSelectHighlyPredictable(const SelectLike SI);
271
272 // Loop-level checks to determine if a non-predicated version (with branches)
273 // of the given loop is more profitable than its predicated version.
274 bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
275
276 // Computes instruction and loop-critical-path costs for both the predicated
277 // and non-predicated version of the given loop.
278 bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
279 DenseMap<const Instruction *, CostInfo> &InstCostMap,
280 CostInfo *LoopCost);
281
282 // Returns a set of all the select instructions in the given select groups.
283 SmallDenseMap<const Instruction *, SelectLike, 2>
284 getSImap(const SelectGroups &SIGroups);
285
286 // Returns a map from select-like instructions to the corresponding select
287 // group.
288 SmallDenseMap<const Instruction *, const SelectGroup *, 2>
289 getSGmap(const SelectGroups &SIGroups);
290
291 // Returns the latency cost of a given instruction.
292 std::optional<uint64_t> computeInstCost(const Instruction *I);
293
294 // Returns the misprediction cost of a given select when converted to branch.
295 Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost);
296
297 // Returns the cost of a branch when the prediction is correct.
298 Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
299 const SelectLike SI);
300
301 // Returns true if the target architecture supports lowering a given select.
302 bool isSelectKindSupported(const SelectLike SI);
303};
304
305class SelectOptimize : public FunctionPass {
306 SelectOptimizeImpl Impl;
307
308public:
309 static char ID;
310
311 SelectOptimize() : FunctionPass(ID) {}
312
313 bool runOnFunction(Function &F) override {
314 if (skipFunction(F))
315 return false;
316
317 return Impl.runOnFunction(F, P&: *this);
318 }
319
320 void getAnalysisUsage(AnalysisUsage &AU) const override {
321 AU.addRequired<ProfileSummaryInfoWrapperPass>();
322 AU.addRequired<TargetPassConfig>();
323 AU.addRequired<TargetTransformInfoWrapperPass>();
324 AU.addRequired<LoopInfoWrapperPass>();
325 AU.addRequired<BlockFrequencyInfoWrapperPass>();
326 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
327 }
328};
329
330} // namespace
331
332PreservedAnalyses SelectOptimizePass::run(Function &F,
333 FunctionAnalysisManager &FAM) {
334 SelectOptimizeImpl Impl(TM);
335 return Impl.run(F, FAM);
336}
337
char SelectOptimize::ID = 0;

// Register the legacy pass and its required analyses with the pass registry
// so the pass can be constructed by name and its dependencies scheduled.
INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
                    false)
350
351FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }
352
353PreservedAnalyses SelectOptimizeImpl::run(Function &F,
354 FunctionAnalysisManager &FAM) {
355 TSI = TM->getSubtargetImpl(F);
356 TLI = TSI->getTargetLowering();
357
358 // If none of the select types are supported then skip this pass.
359 // This is an optimization pass. Legality issues will be handled by
360 // instruction selection.
361 if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
362 !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
363 !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
364 return PreservedAnalyses::all();
365
366 TTI = &FAM.getResult<TargetIRAnalysis>(IR&: F);
367 if (!TTI->enableSelectOptimize())
368 return PreservedAnalyses::all();
369
370 PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F)
371 .getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
372 assert(PSI && "This pass requires module analysis pass `profile-summary`!");
373 BFI = &FAM.getResult<BlockFrequencyAnalysis>(IR&: F);
374
375 // When optimizing for size, selects are preferable over branches.
376 if (llvm::shouldOptimizeForSize(F: &F, PSI, BFI))
377 return PreservedAnalyses::all();
378
379 LI = &FAM.getResult<LoopAnalysis>(IR&: F);
380 ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
381 TSchedModel.init(TSInfo: TSI);
382
383 bool Changed = optimizeSelects(F);
384 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
385}
386
387bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) {
388 TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
389 TSI = TM->getSubtargetImpl(F);
390 TLI = TSI->getTargetLowering();
391
392 // If none of the select types are supported then skip this pass.
393 // This is an optimization pass. Legality issues will be handled by
394 // instruction selection.
395 if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
396 !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
397 !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
398 return false;
399
400 TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
401
402 if (!TTI->enableSelectOptimize())
403 return false;
404
405 LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
406 BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
407 PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
408 ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
409 TSchedModel.init(TSInfo: TSI);
410
411 // When optimizing for size, selects are preferable over branches.
412 if (llvm::shouldOptimizeForSize(F: &F, PSI, BFI))
413 return false;
414
415 return optimizeSelects(F);
416}
417
418bool SelectOptimizeImpl::optimizeSelects(Function &F) {
419 // Determine for which select groups it is profitable converting to branches.
420 SelectGroups ProfSIGroups;
421 // Base heuristics apply only to non-loops and outer loops.
422 optimizeSelectsBase(F, ProfSIGroups);
423 // Separate heuristics for inner-most loops.
424 optimizeSelectsInnerLoops(F, ProfSIGroups);
425
426 // Convert to branches the select groups that were deemed
427 // profitable-to-convert.
428 convertProfitableSIGroups(ProfSIGroups);
429
430 // Code modified if at least one select group was converted.
431 return !ProfSIGroups.empty();
432}
433
434void SelectOptimizeImpl::optimizeSelectsBase(Function &F,
435 SelectGroups &ProfSIGroups) {
436 // Collect all the select groups.
437 SelectGroups SIGroups;
438 for (BasicBlock &BB : F) {
439 // Base heuristics apply only to non-loops and outer loops.
440 Loop *L = LI->getLoopFor(BB: &BB);
441 if (L && L->isInnermost())
442 continue;
443 collectSelectGroups(BB, SIGroups);
444 }
445
446 // Determine for which select groups it is profitable converting to branches.
447 findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
448}
449
450void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F,
451 SelectGroups &ProfSIGroups) {
452 SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
453 // Need to check size on each iteration as we accumulate child loops.
454 for (unsigned long i = 0; i < Loops.size(); ++i)
455 llvm::append_range(C&: Loops, R: Loops[i]->getSubLoops());
456
457 for (Loop *L : Loops) {
458 if (!L->isInnermost())
459 continue;
460
461 SelectGroups SIGroups;
462 for (BasicBlock *BB : L->getBlocks())
463 collectSelectGroups(BB&: *BB, SIGroups);
464
465 findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
466 }
467}
468
469/// Returns optimised value on \p IsTrue branch. For SelectInst that would be
470/// either True or False value. For (BinaryOperator) instructions, where the
471/// condition may be skipped, the operation will use a non-conditional operand.
472/// For example, for `or(V,zext(cond))` this function would return V.
473/// However, if the conditional operand on \p IsTrue branch matters, we create a
474/// clone of instruction at the end of that branch \p B and replace the
475/// condition operand with a constant.
476///
477/// Also /p OptSelects contains previously optimised select-like instructions.
478/// If the current value uses one of the optimised values, we can optimise it
479/// further by replacing it with the corresponding value on the given branch
480static Value *getTrueOrFalseValue(
481 SelectOptimizeImpl::SelectLike &SI, bool isTrue,
482 SmallDenseMap<Instruction *, std::pair<Value *, Value *>, 2> &OptSelects,
483 BasicBlock *B) {
484 Value *V = isTrue ? SI.getTrueValue() : SI.getFalseValue();
485 if (V) {
486 if (auto *IV = dyn_cast<Instruction>(Val: V))
487 if (auto It = OptSelects.find(Val: IV); It != OptSelects.end())
488 return isTrue ? It->second.first : It->second.second;
489 return V;
490 }
491
492 auto *BO = cast<BinaryOperator>(Val: SI.getI());
493 assert((BO->getOpcode() == Instruction::Add ||
494 BO->getOpcode() == Instruction::Or ||
495 BO->getOpcode() == Instruction::Sub) &&
496 "Only currently handling Add, Or and Sub binary operators.");
497
498 auto *CBO = BO->clone();
499 auto CondIdx = SI.getConditionOpIndex();
500 auto *AuxI = cast<Instruction>(Val: CBO->getOperand(i: CondIdx));
501 if (isa<ZExtInst>(Val: AuxI) || isa<LShrOperator>(Val: AuxI)) {
502 CBO->setOperand(i: CondIdx, Val: ConstantInt::get(Ty: CBO->getType(), V: 1));
503 } else {
504 assert((isa<AShrOperator>(AuxI) || isa<SExtInst>(AuxI)) &&
505 "Unexpected opcode");
506 CBO->setOperand(i: CondIdx, Val: ConstantInt::getAllOnesValue(Ty: CBO->getType()));
507 }
508
509 unsigned OtherIdx = 1 - CondIdx;
510 if (auto *IV = dyn_cast<Instruction>(Val: CBO->getOperand(i: OtherIdx))) {
511 if (auto It = OptSelects.find(Val: IV); It != OptSelects.end())
512 CBO->setOperand(i: OtherIdx, Val: isTrue ? It->second.first : It->second.second);
513 }
514 CBO->insertBefore(InsertPos: B->getTerminator()->getIterator());
515 return CBO;
516}
517
518void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
519 for (SelectGroup &ASI : ProfSIGroups) {
520 // The code transformation here is a modified version of the sinking
521 // transformation in CodeGenPrepare::optimizeSelectInst with a more
522 // aggressive strategy of which instructions to sink.
523 //
524 // TODO: eliminate the redundancy of logic transforming selects to branches
525 // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
526 // selects for all cases (with and without profile information).
527
528 // Transform a sequence like this:
529 // start:
530 // %cmp = cmp uge i32 %a, %b
531 // %sel = select i1 %cmp, i32 %c, i32 %d
532 //
533 // Into:
534 // start:
535 // %cmp = cmp uge i32 %a, %b
536 // %cmp.frozen = freeze %cmp
537 // br i1 %cmp.frozen, label %select.true, label %select.false
538 // select.true:
539 // br label %select.end
540 // select.false:
541 // br label %select.end
542 // select.end:
543 // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
544 //
545 // %cmp should be frozen, otherwise it may introduce undefined behavior.
546 // In addition, we may sink instructions that produce %c or %d into the
547 // destination(s) of the new branch.
548 // If the true or false blocks do not contain a sunken instruction, that
549 // block and its branch may be optimized away. In that case, one side of the
550 // first branch will point directly to select.end, and the corresponding PHI
551 // predecessor block will be the start block.
552
553 // Find all the instructions that can be soundly sunk to the true/false
554 // blocks. These are instructions that are computed solely for producing the
555 // operands of the select instructions in the group and can be sunk without
556 // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
557 // with side effects).
558 SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
559 typedef std::stack<Instruction *>::size_type StackSizeType;
560 StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
561 for (SelectLike &SI : ASI.Selects) {
562 if (!isa<SelectInst>(Val: SI.getI()))
563 continue;
564 // For each select, compute the sinkable dependence chains of the true and
565 // false operands.
566 if (auto *TI = dyn_cast_or_null<Instruction>(Val: SI.getTrueValue())) {
567 std::stack<Instruction *> TrueSlice;
568 getExclBackwardsSlice(I: TI, Slice&: TrueSlice, SI: SI.getI(), ForSinking: true);
569 maxTrueSliceLen = std::max(a: maxTrueSliceLen, b: TrueSlice.size());
570 TrueSlices.push_back(Elt: TrueSlice);
571 }
572 if (auto *FI = dyn_cast_or_null<Instruction>(Val: SI.getFalseValue())) {
573 if (isa<SelectInst>(Val: SI.getI()) || !FI->hasOneUse()) {
574 std::stack<Instruction *> FalseSlice;
575 getExclBackwardsSlice(I: FI, Slice&: FalseSlice, SI: SI.getI(), ForSinking: true);
576 maxFalseSliceLen = std::max(a: maxFalseSliceLen, b: FalseSlice.size());
577 FalseSlices.push_back(Elt: FalseSlice);
578 }
579 }
580 }
581 // In the case of multiple select instructions in the same group, the order
582 // of non-dependent instructions (instructions of different dependence
583 // slices) in the true/false blocks appears to affect performance.
584 // Interleaving the slices seems to experimentally be the optimal approach.
585 // This interleaving scheduling allows for more ILP (with a natural downside
586 // of increasing a bit register pressure) compared to a simple ordering of
587 // one whole chain after another. One would expect that this ordering would
588 // not matter since the scheduling in the backend of the compiler would
589 // take care of it, but apparently the scheduler fails to deliver optimal
590 // ILP with a naive ordering here.
591 SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
592 for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
593 for (auto &S : TrueSlices) {
594 if (!S.empty()) {
595 TrueSlicesInterleaved.push_back(Elt: S.top());
596 S.pop();
597 }
598 }
599 }
600 for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
601 for (auto &S : FalseSlices) {
602 if (!S.empty()) {
603 FalseSlicesInterleaved.push_back(Elt: S.top());
604 S.pop();
605 }
606 }
607 }
608
609 // We split the block containing the select(s) into two blocks.
610 SelectLike &SI = ASI.Selects.front();
611 SelectLike &LastSI = ASI.Selects.back();
612 BasicBlock *StartBlock = SI.getI()->getParent();
613 BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI()));
614 // With RemoveDIs turned off, SplitPt can be a dbg.* intrinsic. With
615 // RemoveDIs turned on, SplitPt would instead point to the next
616 // instruction. To match existing dbg.* intrinsic behaviour with RemoveDIs,
617 // tell splitBasicBlock that we want to include any DbgVariableRecords
618 // attached to SplitPt in the splice.
619 SplitPt.setHeadBit(true);
620 BasicBlock *EndBlock = StartBlock->splitBasicBlock(I: SplitPt, BBName: "select.end");
621 BFI->setBlockFreq(BB: EndBlock, Freq: BFI->getBlockFreq(BB: StartBlock));
622 // Delete the unconditional branch that was just created by the split.
623 StartBlock->getTerminator()->eraseFromParent();
624
625 // Move any debug/pseudo and auxiliary instructions that were in-between the
626 // select group to the newly-created end block.
627 SmallVector<Instruction *, 2> SinkInstrs;
628 auto DIt = SI.getI()->getIterator();
629 auto NIt = ASI.Selects.begin();
630 while (&*DIt != LastSI.getI()) {
631 if (NIt != ASI.Selects.end() && &*DIt == NIt->getI())
632 ++NIt;
633 else
634 SinkInstrs.push_back(Elt: &*DIt);
635 DIt++;
636 }
637 auto InsertionPoint = EndBlock->getFirstInsertionPt();
638 for (auto *DI : SinkInstrs)
639 DI->moveBeforePreserving(MovePos: InsertionPoint);
640
641 // Duplicate implementation for DbgRecords, the non-instruction debug-info
642 // format. Helper lambda for moving DbgRecords to the end block.
643 auto TransferDbgRecords = [&](Instruction &I) {
644 for (auto &DbgRecord :
645 llvm::make_early_inc_range(Range: I.getDbgRecordRange())) {
646 DbgRecord.removeFromParent();
647 EndBlock->insertDbgRecordBefore(DR: &DbgRecord,
648 Here: EndBlock->getFirstInsertionPt());
649 }
650 };
651
652 // Iterate over all instructions in between SI and LastSI, not including
653 // SI itself. These are all the variable assignments that happen "in the
654 // middle" of the select group.
655 auto R = make_range(x: std::next(x: SI.getI()->getIterator()),
656 y: std::next(x: LastSI.getI()->getIterator()));
657 llvm::for_each(Range&: R, F: TransferDbgRecords);
658
659 // These are the new basic blocks for the conditional branch.
660 // At least one will become an actual new basic block.
661 BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
662 BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
663 // Checks if select-like instruction would materialise on the given branch
664 auto HasSelectLike = [](SelectGroup &SG, bool IsTrue) {
665 for (auto &SL : SG.Selects) {
666 if ((IsTrue ? SL.getTrueValue() : SL.getFalseValue()) == nullptr)
667 return true;
668 }
669 return false;
670 };
671 if (!TrueSlicesInterleaved.empty() || HasSelectLike(ASI, true)) {
672 TrueBlock = BasicBlock::Create(Context&: EndBlock->getContext(), Name: "select.true.sink",
673 Parent: EndBlock->getParent(), InsertBefore: EndBlock);
674 TrueBranch = BranchInst::Create(IfTrue: EndBlock, InsertBefore: TrueBlock);
675 TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
676 for (Instruction *TrueInst : TrueSlicesInterleaved)
677 TrueInst->moveBefore(InsertPos: TrueBranch->getIterator());
678 }
679 if (!FalseSlicesInterleaved.empty() || HasSelectLike(ASI, false)) {
680 FalseBlock =
681 BasicBlock::Create(Context&: EndBlock->getContext(), Name: "select.false.sink",
682 Parent: EndBlock->getParent(), InsertBefore: EndBlock);
683 FalseBranch = BranchInst::Create(IfTrue: EndBlock, InsertBefore: FalseBlock);
684 FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
685 for (Instruction *FalseInst : FalseSlicesInterleaved)
686 FalseInst->moveBefore(InsertPos: FalseBranch->getIterator());
687 }
688 // If there was nothing to sink, then arbitrarily choose the 'false' side
689 // for a new input value to the PHI.
690 if (TrueBlock == FalseBlock) {
691 assert(TrueBlock == nullptr &&
692 "Unexpected basic block transform while optimizing select");
693
694 FalseBlock = BasicBlock::Create(Context&: StartBlock->getContext(), Name: "select.false",
695 Parent: EndBlock->getParent(), InsertBefore: EndBlock);
696 auto *FalseBranch = BranchInst::Create(IfTrue: EndBlock, InsertBefore: FalseBlock);
697 FalseBranch->setDebugLoc(SI.getI()->getDebugLoc());
698 }
699
700 // Insert the real conditional branch based on the original condition.
701 // If we did not create a new block for one of the 'true' or 'false' paths
702 // of the condition, it means that side of the branch goes to the end block
703 // directly and the path originates from the start block from the point of
704 // view of the new PHI.
705 BasicBlock *TT, *FT;
706 if (TrueBlock == nullptr) {
707 TT = EndBlock;
708 FT = FalseBlock;
709 TrueBlock = StartBlock;
710 } else if (FalseBlock == nullptr) {
711 TT = TrueBlock;
712 FT = EndBlock;
713 FalseBlock = StartBlock;
714 } else {
715 TT = TrueBlock;
716 FT = FalseBlock;
717 }
718 IRBuilder<> IB(SI.getI());
719 auto *CondFr =
720 IB.CreateFreeze(V: ASI.Condition, Name: ASI.Condition->getName() + ".frozen");
721
722 SmallDenseMap<Instruction *, std::pair<Value *, Value *>, 2> INS;
723
724 // Use reverse iterator because later select may use the value of the
725 // earlier select, and we need to propagate value through earlier select
726 // to get the PHI operand.
727 InsertionPoint = EndBlock->begin();
728 for (SelectLike &SI : ASI.Selects) {
729 // The select itself is replaced with a PHI Node.
730 PHINode *PN = PHINode::Create(Ty: SI.getType(), NumReservedValues: 2, NameStr: "");
731 PN->insertBefore(InsertPos: InsertionPoint);
732 PN->takeName(V: SI.getI());
733 // Current instruction might be a condition of some other group, so we
734 // need to replace it there to avoid dangling pointer
735 if (PN->getType()->isIntegerTy(Bitwidth: 1)) {
736 for (auto &SG : ProfSIGroups) {
737 if (SG.Condition == SI.getI())
738 SG.Condition = PN;
739 }
740 }
741 SI.getI()->replaceAllUsesWith(V: PN);
742 auto *TV = getTrueOrFalseValue(SI, isTrue: true, OptSelects&: INS, B: TrueBlock);
743 auto *FV = getTrueOrFalseValue(SI, isTrue: false, OptSelects&: INS, B: FalseBlock);
744 INS[PN] = {TV, FV};
745 PN->addIncoming(V: TV, BB: TrueBlock);
746 PN->addIncoming(V: FV, BB: FalseBlock);
747 PN->setDebugLoc(SI.getI()->getDebugLoc());
748 ++NumSelectsConverted;
749 }
750 IB.CreateCondBr(Cond: CondFr, True: TT, False: FT, MDSrc: SI.getI());
751
752 // Remove the old select instructions, now that they are not longer used.
753 for (SelectLike &SI : ASI.Selects)
754 SI.getI()->eraseFromParent();
755 }
756}
757
// Scan basic block BB and gather runs of select-like instructions that share
// the same 1-bit condition into SelectGroups. Auxiliary instructions
// (sign/zero extensions, negations, and sign-extracting shifts of the
// condition) are tracked in SelectInfo so that dependent binary ops can be
// recognized as select-like, but auxiliary instructions never start or
// populate a group themselves.
void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
                                             SelectGroups &SIGroups) {
  // Represents something that can be considered as select instruction.
  // Auxiliary instruction are instructions that depends on a condition and have
  // zero or some constant value on True/False branch, such as:
  // * ZExt(1bit)
  // * SExt(1bit)
  // * Not(1bit)
  // * A(L)Shr(Val), ValBitSize - 1, where there is a condition like `Val <= 0`
  // earlier in the BB. For conditions that check the sign of the Val compiler
  // may generate shifts instead of ZExt/SExt.
  struct SelectLikeInfo {
    Value *Cond;
    bool IsAuxiliary;
    bool IsInverted;
    unsigned ConditionIdx;
  };

  DenseMap<Value *, SelectLikeInfo> SelectInfo;
  // Keeps visited comparisons to help identify AShr/LShr variants of auxiliary
  // instructions.
  SmallSetVector<CmpInst *, 4> SeenCmp;

  // Check if the instruction is SelectLike or might be part of SelectLike
  // expression, put information into SelectInfo and return the iterator to the
  // inserted position.
  auto ProcessSelectInfo = [&SelectInfo, &SeenCmp](Instruction *I) {
    // Comparisons are only remembered (for the shift-based auxiliary
    // pattern below); they are not select-like themselves.
    if (auto *Cmp = dyn_cast<CmpInst>(Val: I)) {
      SeenCmp.insert(X: Cmp);
      return SelectInfo.end();
    }

    Value *Cond;
    if (match(V: I, P: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_Value(V&: Cond)))) &&
        Cond->getType()->isIntegerTy(Bitwidth: 1)) {
      bool Inverted = match(V: Cond, P: m_Not(V: m_Value(V&: Cond)));
      return SelectInfo.insert(KV: {I, {.Cond: Cond, .IsAuxiliary: true, .IsInverted: Inverted, .ConditionIdx: 0}}).first;
    }

    if (match(V: I, P: m_Not(V: m_Value(V&: Cond)))) {
      return SelectInfo.insert(KV: {I, {.Cond: Cond, .IsAuxiliary: true, .IsInverted: true, .ConditionIdx: 0}}).first;
    }

    // Select instruction are what we are usually looking for.
    if (match(V: I, P: m_Select(C: m_Value(V&: Cond), L: m_Value(), R: m_Value()))) {
      bool Inverted = match(V: Cond, P: m_Not(V: m_Value(V&: Cond)));
      return SelectInfo.insert(KV: {I, {.Cond: Cond, .IsAuxiliary: false, .IsInverted: Inverted, .ConditionIdx: 0}}).first;
    }
    Value *Val;
    ConstantInt *Shift;
    // A shift by (bitwidth - 1) extracts/broadcasts the sign bit; pair it
    // with an earlier sign-check comparison of the same value to recover
    // the underlying 1-bit condition.
    if (match(V: I, P: m_Shr(L: m_Value(V&: Val), R: m_ConstantInt(CI&: Shift))) &&
        I->getType()->getIntegerBitWidth() == Shift->getZExtValue() + 1) {
      for (auto *CmpI : SeenCmp) {
        auto Pred = CmpI->getPredicate();
        if (Val != CmpI->getOperand(i_nocapture: 0))
          continue;
        if ((Pred == CmpInst::ICMP_SGT &&
             match(V: CmpI->getOperand(i_nocapture: 1), P: m_ConstantInt<-1>())) ||
            (Pred == CmpInst::ICMP_SGE &&
             match(V: CmpI->getOperand(i_nocapture: 1), P: m_Zero())) ||
            (Pred == CmpInst::ICMP_SLT &&
             match(V: CmpI->getOperand(i_nocapture: 1), P: m_Zero())) ||
            (Pred == CmpInst::ICMP_SLE &&
             match(V: CmpI->getOperand(i_nocapture: 1), P: m_ConstantInt<-1>()))) {
          bool Inverted =
              Pred == CmpInst::ICMP_SGT || Pred == CmpInst::ICMP_SGE;
          return SelectInfo.insert(KV: {I, {.Cond: CmpI, .IsAuxiliary: true, .IsInverted: Inverted, .ConditionIdx: 0}}).first;
        }
      }
      return SelectInfo.end();
    }

    // An BinOp(Aux(X), Y) can also be treated like a select, with condition X
    // and values Y|1 and Y.
    // `Aux` can be either `ZExt(1bit)`, `SExt(1bit)` or `XShr(Val), ValBitSize
    // - 1` `BinOp` can be Add, Sub, Or
    Value *X;
    auto MatchZExtOrSExtPattern =
        m_c_BinOp(L: m_Value(), R: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_Value(V&: X))));
    auto MatchShiftPattern =
        m_c_BinOp(L: m_Value(), R: m_OneUse(SubPattern: m_Shr(L: m_Value(V&: X), R: m_ConstantInt(CI&: Shift))));

    // This check is unnecessary, but it prevents costly access to the
    // SelectInfo map.
    if ((match(V: I, P: MatchZExtOrSExtPattern) && X->getType()->isIntegerTy(Bitwidth: 1)) ||
        (match(V: I, P: MatchShiftPattern) &&
         X->getType()->getIntegerBitWidth() == Shift->getZExtValue() + 1)) {
      if (I->getOpcode() != Instruction::Add &&
          I->getOpcode() != Instruction::Sub &&
          I->getOpcode() != Instruction::Or)
        return SelectInfo.end();

      if (I->getOpcode() == Instruction::Or && I->getType()->isIntegerTy(Bitwidth: 1))
        return SelectInfo.end();

      // Iterate through operands and find dependant on recognised sign
      // extending auxiliary select-like instructions. The operand index does
      // not matter for Add and Or. However, for Sub, we can only safely
      // transform when the operand is second.
      unsigned Idx = I->getOpcode() == Instruction::Sub ? 1 : 0;
      for (; Idx < 2; Idx++) {
        auto *Op = I->getOperand(i: Idx);
        auto It = SelectInfo.find(Val: Op);
        if (It != SelectInfo.end() && It->second.IsAuxiliary) {
          Cond = It->second.Cond;
          bool Inverted = It->second.IsInverted;
          return SelectInfo.insert(KV: {I, {.Cond: Cond, .IsAuxiliary: false, .IsInverted: Inverted, .ConditionIdx: Idx}}).first;
        }
      }
    }
    return SelectInfo.end();
  };

  // Main scan. `AlreadyProcessed` signals that the inner grouping loop below
  // has already classified the current instruction (its classification is
  // left in `It`), so the outer loop must not re-run ProcessSelectInfo on it.
  bool AlreadyProcessed = false;
  BasicBlock::iterator BBIt = BB.begin();
  DenseMap<Value *, SelectLikeInfo>::iterator It;
  while (BBIt != BB.end()) {
    Instruction *I = &*BBIt++;
    if (I->isDebugOrPseudoInst())
      continue;

    if (!AlreadyProcessed)
      It = ProcessSelectInfo(I);
    else
      AlreadyProcessed = false;

    // Only a non-auxiliary select-like instruction can start a new group.
    if (It == SelectInfo.end() || It->second.IsAuxiliary)
      continue;

    if (!TTI->shouldTreatInstructionLikeSelect(I))
      continue;

    Value *Cond = It->second.Cond;
    // Vector conditions are not supported.
    if (!Cond->getType()->isIntegerTy(Bitwidth: 1))
      continue;

    SelectGroup SIGroup = {.Condition: Cond, .Selects: {}};
    SIGroup.Selects.emplace_back(Args&: I, Args&: It->second.IsInverted,
                                 Args&: It->second.ConditionIdx);

    // If the select type is not supported, no point optimizing it.
    // Instruction selection will take care of it.
    if (!isSelectKindSupported(SI: SIGroup.Selects.front()))
      continue;

    // Try to extend the group with the following instructions while they
    // keep using the same condition.
    while (BBIt != BB.end()) {
      Instruction *NI = &*BBIt;
      // Debug/pseudo instructions should be skipped and not prevent the
      // formation of a select group.
      if (NI->isDebugOrPseudoInst()) {
        ++BBIt;
        continue;
      }

      It = ProcessSelectInfo(NI);
      if (It == SelectInfo.end()) {
        AlreadyProcessed = true;
        break;
      }

      // Auxiliary with same condition
      auto [CurrCond, IsAux, IsRev, CondIdx] = It->second;
      if (Cond != CurrCond) {
        AlreadyProcessed = true;
        break;
      }

      if (!IsAux)
        SIGroup.Selects.emplace_back(Args&: NI, Args&: IsRev, Args&: CondIdx);
      ++BBIt;
    }
    LLVM_DEBUG({
      dbgs() << "New Select group (" << SIGroup.Selects.size() << ") with\n";
      for (auto &SI : SIGroup.Selects)
        dbgs() << "  " << *SI.getI() << "\n";
    });

    SIGroups.push_back(Elt: SIGroup);
  }
}
939
940void SelectOptimizeImpl::findProfitableSIGroupsBase(
941 SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
942 for (SelectGroup &ASI : SIGroups) {
943 ++NumSelectOptAnalyzed;
944 if (isConvertToBranchProfitableBase(ASI))
945 ProfSIGroups.push_back(Elt: ASI);
946 }
947}
948
949static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE,
950 DiagnosticInfoOptimizationBase &Rem) {
951 LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n");
952 ORE->emit(OptDiag&: Rem);
953}
954
// Inner-loop profitability pass: for select groups inside an innermost loop,
// first vet the whole loop with loop-level heuristics, then keep only the
// groups whose branch-form cost beats their predicated (select) cost.
// Emits an optimization remark (hit or miss) for every group analyzed.
void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops(
    const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
  NumSelectOptAnalyzed += SIGroups.size();
  // For each select group in an inner-most loop,
  // a branch is more preferable than a select/conditional-move if:
  // i) conversion to branches for all the select groups of the loop satisfies
  //    loop-level heuristics including reducing the loop's critical path by
  //    some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and
  // ii) the total cost of the select group is cheaper with a branch compared
  //     to its predicated version. The cost is in terms of latency and the
  //     cost of a select group is the cost of its most expensive select
  //     instruction (assuming infinite resources and thus fully leveraging
  //     available ILP).

  DenseMap<const Instruction *, CostInfo> InstCostMap;
  // LoopCost[0]/LoopCost[1] hold the critical-path costs of the first and
  // second analyzed loop iteration respectively.
  CostInfo LoopCost[2] = {{.PredCost: Scaled64::getZero(), .NonPredCost: Scaled64::getZero()},
                          {.PredCost: Scaled64::getZero(), .NonPredCost: Scaled64::getZero()}};
  if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
      !checkLoopHeuristics(L, LoopDepth: LoopCost)) {
    return;
  }

  for (SelectGroup &ASI : SIGroups) {
    // Assuming infinite resources, the cost of a group of instructions is the
    // cost of the most expensive instruction of the group.
    Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
    for (SelectLike &SI : ASI.Selects) {
      const auto &ICM = InstCostMap[SI.getI()];
      SelectCost = std::max(a: SelectCost, b: ICM.PredCost);
      BranchCost = std::max(a: BranchCost, b: ICM.NonPredCost);
    }
    if (BranchCost < SelectCost) {
      OptimizationRemark OR(DEBUG_TYPE, "SelectOpti",
                            ASI.Selects.front().getI());
      OR << "Profitable to convert to branch (loop analysis). BranchCost="
         << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
         << ". ";
      EmitAndPrintRemark(ORE, Rem&: OR);
      ++NumSelectConvertedLoop;
      ProfSIGroups.push_back(Elt: ASI);
    } else {
      OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
                                      ASI.Selects.front().getI());
      ORmiss << "Select is more profitable (loop analysis). BranchCost="
             << BranchCost.toString()
             << ", SelectCost=" << SelectCost.toString() << ". ";
      EmitAndPrintRemark(ORE, Rem&: ORmiss);
    }
  }
}
1004
// Base heuristic chain for a select group (applied outside the inner-loop
// analysis). The checks are ordered: cold block and unpredictable metadata
// veto conversion; high predictability, an expensive cold operand, or a big
// group in a non-innermost loop latch approve it. Emits a remark either way.
bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
    const SelectGroup &ASI) {
  const SelectLike &SI = ASI.Selects.front();
  LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI.getI()
                    << "\n");
  OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI.getI());
  OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI.getI());

  // Skip cold basic blocks. Better to optimize for size for cold blocks.
  if (PSI->isColdBlock(BB: SI.getI()->getParent(), BFI)) {
    ++NumSelectColdBB;
    ORmiss << "Not converted to branch because of cold basic block. ";
    EmitAndPrintRemark(ORE, Rem&: ORmiss);
    return false;
  }

  // If unpredictable, branch form is less profitable.
  if (SI.getI()->getMetadata(KindID: LLVMContext::MD_unpredictable)) {
    ++NumSelectUnPred;
    ORmiss << "Not converted to branch because of unpredictable branch. ";
    EmitAndPrintRemark(ORE, Rem&: ORmiss);
    return false;
  }

  // If highly predictable, branch form is more profitable, unless a
  // predictable select is inexpensive in the target architecture.
  if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
    ++NumSelectConvertedHighPred;
    OR << "Converted to branch because of highly predictable branch. ";
    EmitAndPrintRemark(ORE, Rem&: OR);
    return true;
  }

  // Look for expensive instructions in the cold operand's (if any) dependence
  // slice of any of the selects in the group.
  if (hasExpensiveColdOperand(ASI)) {
    ++NumSelectConvertedExpColdOperand;
    OR << "Converted to branch because of expensive cold operand.";
    EmitAndPrintRemark(ORE, Rem&: OR);
    return true;
  }

  // If latch has a select group with several elements, it is usually profitable
  // to convert it to branches. We let `optimizeSelectsInnerLoops` decide if
  // conversion is profitable for innermost loops.
  auto *BB = SI.getI()->getParent();
  auto *L = LI->getLoopFor(BB);
  if (L && !L->isInnermost() && L->getLoopLatch() == BB &&
      ASI.Selects.size() >= 3) {
    OR << "Converted to branch because select group in the latch block is big.";
    EmitAndPrintRemark(ORE, Rem&: OR);
    return true;
  }

  // No heuristic fired: keep the select form.
  ORmiss << "Not profitable to convert to branch (base heuristic).";
  EmitAndPrintRemark(ORE, Rem&: ORmiss);
  return false;
}
1063
1064static InstructionCost divideNearest(InstructionCost Numerator,
1065 uint64_t Denominator) {
1066 return (Numerator + (Denominator / 2)) / Denominator;
1067}
1068
1069static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI,
1070 uint64_t &TrueVal, uint64_t &FalseVal) {
1071 if (isa<SelectInst>(Val: SI.getI()))
1072 return extractBranchWeights(I: *SI.getI(), TrueVal, FalseVal);
1073 return false;
1074}
1075
// Return true if some select in the group has a rarely-taken ("cold") value
// operand whose backwards dependence slice is expensive enough (scaled by how
// cold it is) that computing it unconditionally for a cmov would be wasteful.
bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) {
  bool ColdOperand = false;
  // NOTE: TrueWeight/FalseWeight/TotalWeight are only initialized when
  // branch weights were successfully extracted; ColdOperand can only become
  // true in that case, so the uses below are guarded.
  uint64_t TrueWeight, FalseWeight, TotalWeight;
  if (extractBranchWeights(SI: ASI.Selects.front(), TrueVal&: TrueWeight, FalseVal&: FalseWeight)) {
    uint64_t MinWeight = std::min(a: TrueWeight, b: FalseWeight);
    TotalWeight = TrueWeight + FalseWeight;
    // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
    ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
  } else if (PSI->hasProfileSummary()) {
    OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
                                    ASI.Selects.front().getI());
    ORmiss << "Profile data available but missing branch-weights metadata for "
              "select instruction. ";
    EmitAndPrintRemark(ORE, Rem&: ORmiss);
  }
  if (!ColdOperand)
    return false;
  // Check if the cold path's dependence slice is expensive for any of the
  // selects of the group.
  for (SelectLike SI : ASI.Selects) {
    // Pick the value operand on the colder side; HotWeight is the weight of
    // the other (frequently taken) side.
    Instruction *ColdI = nullptr;
    uint64_t HotWeight;
    if (TrueWeight < FalseWeight) {
      ColdI = dyn_cast_or_null<Instruction>(Val: SI.getTrueValue());
      HotWeight = FalseWeight;
    } else {
      ColdI = dyn_cast_or_null<Instruction>(Val: SI.getFalseValue());
      HotWeight = TrueWeight;
    }
    if (ColdI) {
      // Sum the latency of the one-use dependence slice feeding the cold
      // operand.
      std::stack<Instruction *> ColdSlice;
      getExclBackwardsSlice(I: ColdI, Slice&: ColdSlice, SI: SI.getI());
      InstructionCost SliceCost = 0;
      while (!ColdSlice.empty()) {
        SliceCost += TTI->getInstructionCost(U: ColdSlice.top(),
                                             CostKind: TargetTransformInfo::TCK_Latency);
        ColdSlice.pop();
      }
      // The colder the cold value operand of the select is the more expensive
      // the cmov becomes for computing the cold value operand every time. Thus,
      // the colder the cold operand is the more its cost counts.
      // Get nearest integer cost adjusted for coldness.
      InstructionCost AdjSliceCost =
          divideNearest(Numerator: SliceCost * HotWeight, Denominator: TotalWeight);
      if (AdjSliceCost >=
          ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
        return true;
    }
  }
  return false;
}
1127
1128// Check if it is safe to move LoadI next to the SI.
1129// Conservatively assume it is safe only if there is no instruction
1130// modifying memory in-between the load and the select instruction.
1131static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
1132 // Assume loads from different basic blocks are unsafe to move.
1133 if (LoadI->getParent() != SI->getParent())
1134 return false;
1135 auto It = LoadI->getIterator();
1136 while (&*It != SI) {
1137 if (It->mayWriteToMemory())
1138 return false;
1139 It++;
1140 }
1141 return true;
1142}
1143
1144// For a given source instruction, collect its backwards dependence slice
1145// consisting of instructions exclusively computed for the purpose of producing
1146// the operands of the source instruction. As an approximation
1147// (sufficiently-accurate in practice), we populate this set with the
1148// instructions of the backwards dependence slice that only have one-use and
1149// form an one-use chain that leads to the source instruction.
// For a given source instruction, collect its backwards dependence slice
// consisting of instructions exclusively computed for the purpose of producing
// the operands of the source instruction. As an approximation
// (sufficiently-accurate in practice), we populate this set with the
// instructions of the backwards dependence slice that only have one-use and
// form an one-use chain that leads to the source instruction.
//
// \p I      root of the slice (typically a select's cold value operand).
// \p Slice  out-parameter; filled in worklist (roughly breadth-first) order.
// \p SI     the select instruction the slice feeds; used for load-sinking
//           safety checks.
// \p ForSinking when true, apply the stricter filters needed if the slice
//           will actually be moved (no side effects, terminators, PHIs,
//           selects, or unsafe loads).
void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I,
                                               std::stack<Instruction *> &Slice,
                                               Instruction *SI,
                                               bool ForSinking) {
  SmallPtrSet<Instruction *, 2> Visited;
  std::queue<Instruction *> Worklist;
  Worklist.push(x: I);
  while (!Worklist.empty()) {
    Instruction *II = Worklist.front();
    Worklist.pop();

    // Avoid cycles.
    if (!Visited.insert(Ptr: II).second)
      continue;

    // More than one use means the value is not exclusively computed for the
    // source instruction; stop extending the chain here.
    if (!II->hasOneUse())
      continue;

    // Cannot soundly sink instructions with side-effects.
    // Terminator or phi instructions cannot be sunk.
    // Avoid sinking other select instructions (should be handled separetely).
    if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
                       isa<SelectInst>(Val: II) || isa<PHINode>(Val: II)))
      continue;

    // Avoid sinking loads in order not to skip state-modifying instructions,
    // that may alias with the loaded address.
    // Only allow sinking of loads within the same basic block that are
    // conservatively proven to be safe.
    if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(LoadI: II, SI))
      continue;

    // Avoid considering instructions with less frequency than the source
    // instruction (i.e., avoid colder code regions of the dependence slice).
    if (BFI->getBlockFreq(BB: II->getParent()) < BFI->getBlockFreq(BB: I->getParent()))
      continue;

    // Eligible one-use instruction added to the dependence slice.
    Slice.push(x: II);

    // Explore all the operands of the current instruction to expand the slice.
    for (Value *Op : II->operand_values())
      if (auto *OpI = dyn_cast<Instruction>(Val: Op))
        Worklist.push(x: OpI);
  }
}
1196
1197bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) {
1198 uint64_t TrueWeight, FalseWeight;
1199 if (extractBranchWeights(SI, TrueVal&: TrueWeight, FalseVal&: FalseWeight)) {
1200 uint64_t Max = std::max(a: TrueWeight, b: FalseWeight);
1201 uint64_t Sum = TrueWeight + FalseWeight;
1202 if (Sum != 0) {
1203 auto Probability = BranchProbability::getBranchProbability(Numerator: Max, Denominator: Sum);
1204 if (Probability > TTI->getPredictableBranchThreshold())
1205 return true;
1206 }
1207 }
1208 return false;
1209}
1210
// Decide, from the two-iteration critical-path costs computed by
// computeLoopCosts, whether converting this loop's selects to branches is
// profitable at the loop level. LoopCost[0]/LoopCost[1] are the costs of the
// first/second analyzed iteration.
bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L,
                                             const CostInfo LoopCost[2]) {
  // Loop-level checks to determine if a non-predicated version (with branches)
  // of the loop is more profitable than its predicated version.

  if (DisableLoopLevelHeuristics)
    return true;

  OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
                                   &*L->getHeader()->getFirstNonPHIIt());

  // The branch form must not lengthen the first iteration's critical path and
  // must strictly shorten the second iteration's.
  if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
      LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
    ORmissL << "No select conversion in the loop due to no reduction of loop's "
               "critical path. ";
    EmitAndPrintRemark(ORE, Rem&: ORmissL);
    return false;
  }

  Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
                      LoopCost[1].PredCost - LoopCost[1].NonPredCost};

  // Profitably converting to branches need to reduce the loop's critical path
  // by at least some threshold (absolute gain of GainCycleThreshold cycles and
  // relative gain of 12.5%).
  if (Gain[1] < Scaled64::get(N: GainCycleThreshold) ||
      Gain[1] * Scaled64::get(N: GainRelativeThreshold) < LoopCost[1].PredCost) {
    Scaled64 RelativeGain = Scaled64::get(N: 100) * Gain[1] / LoopCost[1].PredCost;
    ORmissL << "No select conversion in the loop due to small reduction of "
               "loop's critical path. Gain="
            << Gain[1].toString()
            << ", RelativeGain=" << RelativeGain.toString() << "%. ";
    EmitAndPrintRemark(ORE, Rem&: ORmissL);
    return false;
  }

  // If the loop's critical path involves loop-carried dependences, the gradient
  // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
  // This check ensures that the latency reduction for the loop's critical path
  // keeps decreasing with sufficient rate beyond the two analyzed loop
  // iterations.
  if (Gain[1] > Gain[0]) {
    Scaled64 GradientGain = Scaled64::get(N: 100) * (Gain[1] - Gain[0]) /
                            (LoopCost[1].PredCost - LoopCost[0].PredCost);
    if (GradientGain < Scaled64::get(N: GainGradientThreshold)) {
      ORmissL << "No select conversion in the loop due to small gradient gain. "
                 "GradientGain="
              << GradientGain.toString() << "%. ";
      EmitAndPrintRemark(ORE, Rem&: ORmissL);
      return false;
    }
  }
  // If the gain decreases it is not profitable to convert.
  else if (Gain[1] < Gain[0]) {
    ORmissL
        << "No select conversion in the loop due to negative gradient gain. ";
    EmitAndPrintRemark(ORE, Rem&: ORmissL);
    return false;
  }

  // Non-predicated version of the loop is more profitable than its
  // predicated version.
  return true;
}
1275
1276// Computes instruction and loop-critical-path costs for both the predicated
1277// and non-predicated version of the given loop.
1278// Returns false if unable to compute these costs due to invalid cost of loop
1279// instruction(s).
// Computes instruction and loop-critical-path costs for both the predicated
// and non-predicated version of the given loop.
// Returns false if unable to compute these costs due to invalid cost of loop
// instruction(s).
bool SelectOptimizeImpl::computeLoopCosts(
    const Loop *L, const SelectGroups &SIGroups,
    DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
  LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
                    << L->getHeader()->getName() << "\n");
  // Lookup tables: instruction -> its SelectLike wrapper / enclosing group.
  const auto SImap = getSImap(SIGroups);
  const auto SGmap = getSGmap(SIGroups);
  // Compute instruction and loop-critical-path costs across two iterations for
  // both predicated and non-predicated version.
  const unsigned Iterations = 2;
  for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
    // Cost of the loop's critical path.
    CostInfo &MaxCost = LoopCost[Iter];
    for (BasicBlock *BB : L->getBlocks()) {
      for (const Instruction &I : *BB) {
        if (I.isDebugOrPseudoInst())
          continue;
        // Compute the predicated and non-predicated cost of the instruction.
        Scaled64 IPredCost = Scaled64::getZero(),
                 INonPredCost = Scaled64::getZero();

        // Assume infinite resources that allow to fully exploit the available
        // instruction-level parallelism.
        // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
        // Costs from the previous iteration already in InstCostMap model
        // loop-carried dependences.
        for (const Use &U : I.operands()) {
          auto UI = dyn_cast<Instruction>(Val: U.get());
          if (!UI)
            continue;
          if (auto It = InstCostMap.find(Val: UI); It != InstCostMap.end()) {
            IPredCost = std::max(a: IPredCost, b: It->second.PredCost);
            INonPredCost = std::max(a: INonPredCost, b: It->second.NonPredCost);
          }
        }
        auto ILatency = computeInstCost(I: &I);
        if (!ILatency) {
          // Bail out of the whole analysis on the first invalid latency.
          OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
          ORmissL << "Invalid instruction cost preventing analysis and "
                     "optimization of the inner-most loop containing this "
                     "instruction. ";
          EmitAndPrintRemark(ORE, Rem&: ORmissL);
          return false;
        }
        IPredCost += Scaled64::get(N: *ILatency);
        INonPredCost += Scaled64::get(N: *ILatency);

        // For a select that can be converted to branch,
        // compute its cost as a branch (non-predicated cost).
        //
        // BranchCost = PredictedPathCost + MispredictCost
        // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
        // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
        if (auto It = SImap.find(Val: &I); It != SImap.end()) {
          auto SI = It->second;
          const auto *SG = SGmap.at(Val: &I);
          Scaled64 TrueOpCost = SI.getOpCostOnBranch(IsTrue: true, InstCostMap, TTI);
          Scaled64 FalseOpCost = SI.getOpCostOnBranch(IsTrue: false, InstCostMap, TTI);
          Scaled64 PredictedPathCost =
              getPredictedPathCost(TrueCost: TrueOpCost, FalseCost: FalseOpCost, SI);

          Scaled64 CondCost = Scaled64::getZero();
          if (auto *CI = dyn_cast<Instruction>(Val: SG->Condition))
            if (auto It = InstCostMap.find(Val: CI); It != InstCostMap.end())
              CondCost = It->second.NonPredCost;
          Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);

          INonPredCost = PredictedPathCost + MispredictCost;
        }
        LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/"
                          << INonPredCost << " for " << I << "\n");

        // Record per-instruction costs and fold them into the iteration's
        // critical-path maxima.
        InstCostMap[&I] = {.PredCost: IPredCost, .NonPredCost: INonPredCost};
        MaxCost.PredCost = std::max(a: MaxCost.PredCost, b: IPredCost);
        MaxCost.NonPredCost = std::max(a: MaxCost.NonPredCost, b: INonPredCost);
      }
    }
    LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1
                      << " MaxCost = " << MaxCost.PredCost << " "
                      << MaxCost.NonPredCost << "\n");
  }
  return true;
}
1361
1362SmallDenseMap<const Instruction *, SelectOptimizeImpl::SelectLike, 2>
1363SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) {
1364 SmallDenseMap<const Instruction *, SelectLike, 2> SImap;
1365 for (const SelectGroup &ASI : SIGroups)
1366 for (const SelectLike &SI : ASI.Selects)
1367 SImap.try_emplace(Key: SI.getI(), Args: SI);
1368 return SImap;
1369}
1370
1371SmallDenseMap<const Instruction *, const SelectOptimizeImpl::SelectGroup *, 2>
1372SelectOptimizeImpl::getSGmap(const SelectGroups &SIGroups) {
1373 SmallDenseMap<const Instruction *, const SelectGroup *, 2> SImap;
1374 for (const SelectGroup &ASI : SIGroups)
1375 for (const SelectLike &SI : ASI.Selects)
1376 SImap.try_emplace(Key: SI.getI(), Args: &ASI);
1377 return SImap;
1378}
1379
1380std::optional<uint64_t>
1381SelectOptimizeImpl::computeInstCost(const Instruction *I) {
1382 InstructionCost ICost =
1383 TTI->getInstructionCost(U: I, CostKind: TargetTransformInfo::TCK_Latency);
1384 if (ICost.isValid())
1385 return std::optional<uint64_t>(ICost.getValue());
1386 return std::nullopt;
1387}
1388
1389ScaledNumber<uint64_t>
1390SelectOptimizeImpl::getMispredictionCost(const SelectLike SI,
1391 const Scaled64 CondCost) {
1392 uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
1393
1394 // Account for the default misprediction rate when using a branch
1395 // (conservatively set to 25% by default).
1396 uint64_t MispredictRate = MispredictDefaultRate;
1397 // If the select condition is obviously predictable, then the misprediction
1398 // rate is zero.
1399 if (isSelectHighlyPredictable(SI))
1400 MispredictRate = 0;
1401
1402 // CondCost is included to account for cases where the computation of the
1403 // condition is part of a long dependence chain (potentially loop-carried)
1404 // that would delay detection of a misprediction and increase its cost.
1405 Scaled64 MispredictCost =
1406 std::max(a: Scaled64::get(N: MispredictPenalty), b: CondCost) *
1407 Scaled64::get(N: MispredictRate);
1408 MispredictCost /= Scaled64::get(N: 100);
1409
1410 return MispredictCost;
1411}
1412
1413// Returns the cost of a branch when the prediction is correct.
1414// TrueCost * TrueProbability + FalseCost * FalseProbability.
1415ScaledNumber<uint64_t>
1416SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
1417 const SelectLike SI) {
1418 Scaled64 PredPathCost;
1419 uint64_t TrueWeight, FalseWeight;
1420 if (extractBranchWeights(SI, TrueVal&: TrueWeight, FalseVal&: FalseWeight)) {
1421 uint64_t SumWeight = TrueWeight + FalseWeight;
1422 if (SumWeight != 0) {
1423 PredPathCost = TrueCost * Scaled64::get(N: TrueWeight) +
1424 FalseCost * Scaled64::get(N: FalseWeight);
1425 PredPathCost /= Scaled64::get(N: SumWeight);
1426 return PredPathCost;
1427 }
1428 }
1429 // Without branch weight metadata, we assume 75% for the one path and 25% for
1430 // the other, and pick the result with the biggest cost.
1431 PredPathCost = std::max(a: TrueCost * Scaled64::get(N: 3) + FalseCost,
1432 b: FalseCost * Scaled64::get(N: 3) + TrueCost);
1433 PredPathCost /= Scaled64::get(N: 4);
1434 return PredPathCost;
1435}
1436
1437bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) {
1438 TargetLowering::SelectSupportKind SelectKind;
1439 if (SI.getType()->isVectorTy())
1440 SelectKind = TargetLowering::ScalarCondVectorVal;
1441 else
1442 SelectKind = TargetLowering::ScalarValSelect;
1443 return TLI->isSelectSupported(SelectKind);
1444}
1445