//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements a pass that converts X86 cmov instructions into
/// branches when profitable. This pass is conservative. It transforms if and
/// only if it can guarantee a gain with high confidence.
///
/// Thus, the optimization applies under the following conditions:
///   1. Consider as candidates only CMOVs in innermost loops (assume that
///      most hotspots are represented by these loops).
///   2. Given a group of CMOV instructions that are using the same EFLAGS def
///      instruction:
///      a. Consider them as candidates only if all have the same condition
///         code or the opposite one, to prevent generating more than one
///         conditional jump per EFLAGS def instruction.
///      b. Consider them as candidates only if all are profitable to be
///         converted (assume that one bad conversion may cause a degradation).
///   3. Apply conversion only for loops that are found profitable and only for
///      CMOV candidates that were found profitable.
///      a. A loop is considered profitable only if conversion will reduce its
///         depth cost by some threshold.
///      b. A CMOV is considered profitable if the cost of its condition is
///         higher than the average cost of its true-value and false-value by
///         25% of the branch-misprediction-penalty. This ensures no
///         degradation even with 25% branch misprediction.
///
/// Note: This pass is assumed to run on SSA machine code.
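///
/// Illustrative example for condition 3.b (numbers are made up for the
/// example, not taken from any particular CPU model): with a
/// branch-misprediction penalty of 20 cycles, a CMOV is converted only when
/// the depth of its condition operand exceeds the estimated depth of its
/// value operands (see getDepthOfOptCmov) by at least 5 cycles, i.e. 25% of
/// the penalty.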
//
//===----------------------------------------------------------------------===//
//
//  External interfaces:
//      FunctionPass *llvm::createX86CmovConverterPass();
//      bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/CGPassBuilderOption.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "x86-cmov-conversion"

STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");

// This internal switch can be used to turn off the cmov/branch optimization.
static cl::opt<bool>
    EnableCmovConverter("x86-cmov-converter",
                        cl::desc("Enable the X86 cmov-to-branch optimization."),
                        cl::init(true), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("x86-cmov-converter-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<bool> ForceMemOperand(
    "x86-cmov-converter-force-mem-operand",
    cl::desc("Convert cmovs to branches whenever they have memory operands."),
    cl::init(true), cl::Hidden);

static cl::opt<bool> ForceAll(
    "x86-cmov-converter-force-all",
    cl::desc("Convert all cmovs to branches."),
    cl::init(false), cl::Hidden);

namespace {

/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass {
public:
  X86CmovConverterPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "X86 cmov Conversion"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;

private:
  MachineRegisterInfo *MRI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MachineLoopInfo *MLI = nullptr;
  TargetSchedModel TSchedModel;

  /// List of consecutive CMOV instructions.
  using CmovGroup = SmallVector<MachineInstr *, 2>;
  using CmovGroups = SmallVector<CmovGroup, 2>;

  /// Collect all CMOV-group-candidates in \p CurrLoop and update \p
  /// CmovInstGroups accordingly.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
  /// \returns true iff it found any CMOV-group-candidate.
  bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                             CmovGroups &CmovInstGroups,
                             bool IncludeLoads = false);

  /// Check whether it is profitable to transform each CMOV-group-candidate
  /// into a branch. Remove all groups that are not profitable from \p
  /// CmovInstGroups.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
  /// \returns true iff any CMOV-group-candidate remains.
  bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                                        CmovGroups &CmovInstGroups);

  /// Convert the given list of consecutive CMOV instructions into a branch.
  ///
  /// \param Group Consecutive CMOV instructions to be converted into branch.
  void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
};

} // end anonymous namespace

char X86CmovConverterPass::ID = 0;

void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
  MachineFunctionPass::getAnalysisUsage(AU);
  AU.addRequired<MachineLoopInfoWrapperPass>();
}

bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  if (!EnableCmovConverter)
    return false;

  // If the SelectOptimize pass is enabled, cmovs have already been optimized.
  if (!getCGPassBuilderOption().DisableSelectOptimize)
    return false;

  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << "**********\n");

  bool Changed = false;
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  MRI = &MF.getRegInfo();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  TSchedModel.init(&STI);

  // Before we handle the more subtle cases of register-register CMOVs inside
  // potentially hot loops, we want to quickly remove all CMOVs (ForceAll) or
  // the ones with a memory operand (ForceMemOperand option). The latter risks
  // a stall while waiting for the load to complete, something that speculative
  // execution behind a branch is better suited to handle on modern x86 chips.
  if (ForceMemOperand || ForceAll) {
    CmovGroups AllCmovGroups;
    SmallVector<MachineBasicBlock *, 4> Blocks;
    for (auto &MBB : MF)
      Blocks.push_back(&MBB);
    if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
      for (auto &Group : AllCmovGroups) {
        // Skip any group that doesn't contain at least one memory-operand cmov.
        if (ForceMemOperand && !ForceAll &&
            llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
          continue;

        // For CMOV groups which we can rewrite and which contain a memory
        // load, always rewrite them. On x86, a CMOV will dramatically amplify
        // any memory latency by blocking speculative execution.
        Changed = true;
        convertCmovInstsToBranches(Group);
      }
    }
    // Early return as ForceAll converts all CmovGroups.
    if (ForceAll)
      return Changed;
  }

  //===--------------------------------------------------------------------===//
  // Register-operand Conversion Algorithm
  // ---------
  //   For each innermost loop
  //     collectCmovCandidates() {
  //       Find all CMOV-group-candidates.
  //     }
  //
  //     checkForProfitableCmovCandidates() {
  //       * Calculate both loop-depth and optimized-loop-depth.
  //       * Use these depths to check for loop transformation profitability.
  //       * Check for CMOV-group-candidate transformation profitability.
  //     }
  //
  //     For each profitable CMOV-group-candidate
  //       convertCmovInstsToBranches() {
  //         * Create FalseBB, SinkBB, Conditional branch to SinkBB.
  //         * Replace each CMOV instruction with a PHI instruction in SinkBB.
  //       }
  //
  // Note: For more details, see each function description.
  //===--------------------------------------------------------------------===//

  // Build up the loops in pre-order.
  SmallVector<MachineLoop *, 4> Loops(MLI->begin(), MLI->end());
  // Note that we need to check size on each iteration as we accumulate child
  // loops.
  for (int i = 0; i < (int)Loops.size(); ++i)
    for (MachineLoop *Child : Loops[i]->getSubLoops())
      Loops.push_back(Child);

  for (MachineLoop *CurrLoop : Loops) {
    // Optimize only innermost loops.
    if (!CurrLoop->getSubLoops().empty())
      continue;

    // List of consecutive CMOV instructions to be processed.
    CmovGroups CmovInstGroups;

    if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups))
      continue;

    if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
                                          CmovInstGroups))
      continue;

    Changed = true;
    for (auto &Group : CmovInstGroups)
      convertCmovInstsToBranches(Group);
  }

  return Changed;
}

bool X86CmovConverterPass::collectCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
    bool IncludeLoads) {
  //===--------------------------------------------------------------------===//
  // Collect all CMOV-group-candidates and add them into CmovInstGroups.
  //
  // CMOV-group:
  //   CMOV instructions, in the same MBB, that use the same EFLAGS def
  //   instruction.
  //
  // CMOV-group-candidate:
  //   CMOV-group where all the CMOV instructions are
  //     1. consecutive.
  //     2. have the same condition code or the opposite one.
  //     3. have only register operands (X86::CMOVrr).
  //===--------------------------------------------------------------------===//
  // List of possible improvements (TODOs):
  // ---------------------------------------
  // TODO: Add support for X86::CMOVrm instructions.
  // TODO: Add support for X86::SETcc instructions.
  // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
  //===--------------------------------------------------------------------===//
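  //
  // Illustrative sketch of a CMOV-group-candidate (schematic pseudo-MIR, not
  // taken from a real function): two consecutive CMOVs that use the same
  // EFLAGS def and have opposite condition codes.
  //
  //   CMP32rr %a, %b, implicit-def $eflags
  //   %v1 = CMOV32rr %t1, %f1, <COND_GE>, implicit $eflags
  //   %v2 = CMOV32rr %t2, %f2, <COND_L>, implicit $eflags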

  // Current processed CMOV-Group.
  CmovGroup Group;
  for (auto *MBB : Blocks) {
    Group.clear();
    // Condition code of the first CMOV instruction in the currently processed
    // range, and its opposite condition code.
    X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID,
                  MemOpCC = X86::COND_INVALID;
    // Indicator of a non-CMOVrr instruction in the currently processed range.
    bool FoundNonCMOVInst = false;
    // Indicator for whether the currently processed CMOV-group should be
    // skipped.
    bool SkipGroup = false;

    for (auto &I : *MBB) {
      // Skip debug instructions.
      if (I.isDebugInstr())
        continue;

      X86::CondCode CC = X86::getCondFromCMov(I);
      // Check if we found an X86::CMOVrr instruction. If it is marked as
      // unpredictable, skip it and do not convert it to a branch.
      if (CC != X86::COND_INVALID &&
          !I.getFlag(MachineInstr::MIFlag::Unpredictable) &&
          (IncludeLoads || !I.mayLoad())) {
        if (Group.empty()) {
          // We found the first CMOV in the range, reset the flags.
          FirstCC = CC;
          FirstOppCC = X86::GetOppositeBranchCondition(CC);
          // Clear out the prior group's memory operand CC.
          MemOpCC = X86::COND_INVALID;
          FoundNonCMOVInst = false;
          SkipGroup = false;
        }
        Group.push_back(&I);
        // Check if it is a non-consecutive CMOV instruction or if it has a
        // different condition code than FirstCC or FirstOppCC.
        if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
          // Mark the SkipGroup indicator to skip the currently processed
          // CMOV-Group.
          SkipGroup = true;
        if (I.mayLoad()) {
          if (MemOpCC == X86::COND_INVALID)
            // The first memory operand CMOV.
            MemOpCC = CC;
          else if (CC != MemOpCC)
            // Can't handle mixed conditions with memory operands.
            SkipGroup = true;
        }
        // Check if we were relying on zero-extending behavior of the CMOV.
        if (!SkipGroup &&
            llvm::any_of(
                MRI->use_nodbg_instructions(I.defs().begin()->getReg()),
                [&](MachineInstr &UseI) {
                  return UseI.getOpcode() == X86::SUBREG_TO_REG;
                }))
          // FIXME: We should model the cost of using an explicit MOV to handle
          // the zero-extension rather than just refusing to handle this.
          SkipGroup = true;
        continue;
      }
      // If Group is empty, keep looking for the first CMOV in the range.
      if (Group.empty())
        continue;

      // We found a non-X86::CMOVrr instruction.
      FoundNonCMOVInst = true;
      // Check if this instruction defines EFLAGS, to determine the end of the
      // processed range, as there would be no more instructions using the
      // current EFLAGS def.
      if (I.definesRegister(X86::EFLAGS, /*TRI=*/nullptr)) {
        // Check if the currently processed CMOV-group should not be skipped
        // and add it as a CMOV-group-candidate.
        if (!SkipGroup)
          CmovInstGroups.push_back(Group);
        else
          ++NumOfSkippedCmovGroups;
        Group.clear();
      }
    }
    // The end of the basic block is considered the end of the range; check if
    // the currently processed CMOV-group should not be skipped and add it as a
    // CMOV-group-candidate.
    if (Group.empty())
      continue;
    if (!SkipGroup)
      CmovInstGroups.push_back(Group);
    else
      ++NumOfSkippedCmovGroups;
  }

  NumOfCmovGroupCandidate += CmovInstGroups.size();
  return !CmovInstGroups.empty();
}

/// \returns Depth of CMOV instruction as if it was converted into branch.
/// \param TrueOpDepth depth cost of CMOV true value operand.
/// \param FalseOpDepth depth cost of CMOV false value operand.
static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
  // The depth of the result after branch conversion is
  // TrueOpDepth * TrueOpProbability + FalseOpDepth * FalseOpProbability.
  // As we have no info about branch weight, we assume 75% for one and 25% for
  // the other, and pick the result with the largest resulting depth.
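  //
  // Worked example (illustrative numbers only): with TrueOpDepth = 4 and
  // FalseOpDepth = 8, the two weighted estimates are
  // ceil((4 * 3 + 8) / 4) = 5 and ceil((8 * 3 + 4) / 4) = 7, so the
  // conservative (larger) estimate of 7 is returned.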
  return std::max(divideCeil(TrueOpDepth * 3 + FalseOpDepth, 4),
                  divideCeil(FalseOpDepth * 3 + TrueOpDepth, 4));
}

bool X86CmovConverterPass::checkForProfitableCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
  struct DepthInfo {
    /// Depth of original loop.
    unsigned Depth;
    /// Depth of optimized loop.
    unsigned OptDepth;
  };
  /// Number of loop iterations for which depth is calculated.
  static const unsigned LoopIterations = 2;
  DenseMap<MachineInstr *, DepthInfo> DepthMap;
  DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
  enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
  /// For each register type maps the register to its last def instruction.
  DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
  /// Maps register operand to its def instruction, which can be nullptr if it
  /// is unknown (e.g., operand is defined outside the loop).
  DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;

  // Set depth of unknown instruction (i.e., nullptr) to zero.
  DepthMap[nullptr] = {0, 0};

  SmallPtrSet<MachineInstr *, 4> CmovInstructions;
  for (auto &Group : CmovInstGroups)
    CmovInstructions.insert(Group.begin(), Group.end());

  //===--------------------------------------------------------------------===//
  // Step 1: Calculate instruction depth and loop depth.
  // Optimized-Loop:
  //   loop with CMOV-group-candidates converted into branches.
  //
  // Instruction-Depth:
  //   instruction latency + max operand depth.
  //   * For a CMOV instruction in the optimized loop, the depth is calculated
  //     as: CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-Depth)
  // TODO: Find a better way to estimate the latency of the branch instruction
  //       rather than using the CMOV latency.
  //
  // Loop-Depth:
  //   max instruction depth of all instructions in the loop.
  //   Note: the instruction with max depth represents the critical-path in the
  //   loop.
  //
  // Loop-Depth[i]:
  //   Loop-Depth calculated for the first `i` iterations.
  //   Note: it is enough to calculate depth for up to two iterations.
  //
  // Depth-Diff[i]:
  //   Number of cycles saved in the first `i` iterations by optimizing the
  //   loop.
  //===--------------------------------------------------------------------===//
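  //
  // Worked example (illustrative latencies, not from a real schedule model):
  // if an ADD with latency 1 consumes a value whose defining instruction has
  // depth 5, the ADD's depth is 5 + 1 = 6; the loop depth is the maximum such
  // depth over all instructions in the loop.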
  for (DepthInfo &MaxDepth : LoopDepth) {
    for (auto *MBB : Blocks) {
      // Clear physical registers Def map.
      RegDefMaps[PhyRegType].clear();
      for (MachineInstr &MI : *MBB) {
        // Skip debug instructions.
        if (MI.isDebugInstr())
          continue;
        unsigned MIDepth = 0;
        unsigned MIDepthOpt = 0;
        bool IsCMOV = CmovInstructions.count(&MI);
        for (auto &MO : MI.uses()) {
          // Check for isUse() because uses() also returns implicit
          // definitions.
          if (!MO.isReg() || !MO.isUse())
            continue;
          Register Reg = MO.getReg();
          auto &RDM = RegDefMaps[Reg.isVirtual()];
          if (MachineInstr *DefMI = RDM.lookup(Reg)) {
            OperandToDefMap[&MO] = DefMI;
            DepthInfo Info = DepthMap.lookup(DefMI);
            MIDepth = std::max(MIDepth, Info.Depth);
            if (!IsCMOV)
              MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
          }
        }

        if (IsCMOV)
          MIDepthOpt = getDepthOfOptCmov(
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);

        // Iterate over all operands to handle implicit definitions as well.
        for (auto &MO : MI.operands()) {
          if (!MO.isReg() || !MO.isDef())
            continue;
          Register Reg = MO.getReg();
          RegDefMaps[Reg.isVirtual()][Reg] = &MI;
        }

        unsigned Latency = TSchedModel.computeInstrLatency(&MI);
        DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
        MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
        MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
      }
    }
  }

  unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
                                   LoopDepth[1].Depth - LoopDepth[1].OptDepth};

  //===--------------------------------------------------------------------===//
  // Step 2: Check if the loop is worth optimizing.
  // Worth-Optimize-Loop:
  //   case 1: Diff[1] == Diff[0]
  //           The critical-path is iteration independent - there is no
  //           dependency of critical-path instructions on critical-path
  //           instructions of the previous iteration.
  //           Thus, it is enough to check the gain percent of the 1st
  //           iteration - to be conservative, the optimized loop needs to have
  //           a depth at least 12.5% (one eighth) smaller than the original
  //           loop, per iteration.
  //
  //   case 2: Diff[1] > Diff[0]
  //           The critical-path is iteration dependent - there is a dependency
  //           of critical-path instructions on critical-path instructions of
  //           the previous iteration.
  //           Thus, check the gain percent of the 2nd iteration (similar to
  //           the previous case), but it is also required to check the
  //           gradient of the gain - the change in Depth-Diff compared to the
  //           change in Loop-Depth between the 1st and 2nd iterations.
  //           To be conservative, the gradient needs to be at least 50%.
  //
  //   In addition, in order not to optimize loops with a very small gain, the
  //   gain (in cycles) after the 2nd iteration should not be less than a given
  //   threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
  //
  // If the loop is not worth optimizing, remove all CMOV-group-candidates.
  //===--------------------------------------------------------------------===//
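  //
  // Worked example (illustrative cycle counts only): with the default
  // GainCycleThreshold of 4, a loop with LoopDepth[0].Depth = 40 and
  // Diff[0] == Diff[1] == 5 passes both checks, since 5 >= 4 and
  // 5 * 8 = 40 >= 40. With Diff[0] == Diff[1] == 4 it would fail the
  // 12.5%-gain check (4 * 8 = 32 < 40).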
  if (Diff[1] < GainCycleThreshold)
    return false;

  bool WorthOptLoop = false;
  if (Diff[1] == Diff[0])
    WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
  else if (Diff[1] > Diff[0])
    WorthOptLoop =
        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
        (Diff[1] * 8 >= LoopDepth[1].Depth);

  if (!WorthOptLoop)
    return false;

  ++NumOfLoopCandidate;

  //===--------------------------------------------------------------------===//
  // Step 3: Check for each CMOV-group-candidate whether it is worth optimizing.
  // Worth-Optimize-Group:
  //   Iff it is worth optimizing all CMOV instructions in the group.
  //
  // Worth-Optimize-CMOV:
  //   A predicted branch is faster than CMOV by the difference between the
  //   depth of the condition operand and the depth of the taken (predicted)
  //   value operand.
  //   To be conservative, the gain of such a CMOV transformation should cover
  //   at least 25% of the branch-misprediction-penalty.
  //===--------------------------------------------------------------------===//
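  //
  // Worked example (illustrative cycle counts only): with a misprediction
  // penalty of 20 cycles, a CMOV whose condition depth is 10 and whose value
  // depth (per getDepthOfOptCmov) is 4 gains 6 cycles; since 6 * 4 = 24 >= 20,
  // the CMOV is considered profitable. A gain of only 4 cycles
  // (4 * 4 = 16 < 20) would disqualify the whole group.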
  unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
  CmovGroups TempGroups;
  std::swap(TempGroups, CmovInstGroups);
  for (auto &Group : TempGroups) {
    bool WorthOpGroup = true;
    for (auto *MI : Group) {
      // Avoid CMOV instructions whose value is used as a pointer to load from.
      // This is another conservative check to avoid converting a CMOV
      // instruction used with a tree-search-like algorithm, where the branch
      // is unpredictable.
      auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
      if (!UIs.empty() && ++UIs.begin() == UIs.end()) {
        unsigned Op = UIs.begin()->getOpcode();
        if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
          WorthOpGroup = false;
          break;
        }
      }

      unsigned CondCost =
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth;
      unsigned ValCost = getDepthOfOptCmov(
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
      if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
        WorthOpGroup = false;
        break;
      }
    }

    if (WorthOpGroup)
      CmovInstGroups.push_back(Group);
  }

  return !CmovInstGroups.empty();
}

static bool checkEFLAGSLive(MachineInstr *MI) {
  if (MI->killsRegister(X86::EFLAGS, /*TRI=*/nullptr))
    return false;

  // The EFLAGS operand of MI might be missing a kill marker.
  // Figure out whether EFLAGS should be live after the MI instruction.
  MachineBasicBlock *BB = MI->getParent();
  MachineBasicBlock::iterator ItrMI = MI;

  // Scan forward through BB for a use/def of EFLAGS.
  for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
    if (I->readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
      return true;
    if (I->definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
      return false;
  }

  // We hit the end of the block; check whether EFLAGS is live into a
  // successor.
  for (MachineBasicBlock *Succ : BB->successors())
    if (Succ->isLiveIn(X86::EFLAGS))
      return true;

  return false;
}

/// Given \p First CMOV instruction and \p Last CMOV instruction representing a
/// group of CMOV instructions, which may contain debug instructions in between,
/// move all debug instructions to after the last CMOV instruction, making the
/// CMOV group consecutive.
static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
  assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID &&
         "Last instruction in a CMOV group must be a CMOV instruction");

  SmallVector<MachineInstr *, 2> DBGInstructions;
  for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
    if (I->isDebugInstr())
      DBGInstructions.push_back(&*I);
  }

  // Splice the debug instructions after the cmov group.
  MachineBasicBlock *MBB = First->getParent();
  for (auto *MI : DBGInstructions)
    MBB->insertAfter(Last, MI->removeFromParent());
}

void X86CmovConverterPass::convertCmovInstsToBranches(
    SmallVectorImpl<MachineInstr *> &Group) const {
  assert(!Group.empty() && "No CMOV instructions to convert");
  ++NumOfOptimizedCmovGroups;

  // If the CMOV group is not packed, e.g., there are debug instructions between
  // the first CMOV and the last CMOV, then pack the group and make the CMOV
  // instructions consecutive by moving the debug instructions to after the last
  // CMOV.
  packCmovGroup(Group.front(), Group.back());

  // To convert a CMOVcc instruction, we actually have to insert the diamond
  // control-flow pattern. The incoming instruction knows the destination vreg
  // to set, the condition code register to branch on, the true/false values to
  // select between, and a branch opcode to use.

  // Before
  // -----
  // MBB:
  //   cond = cmp ...
  //   v1 = CMOVge t1, f1, cond
  //   v2 = CMOVlt t2, f2, cond
  //   v3 = CMOVge v1, f3, cond
  //
  // After
  // -----
  // MBB:
  //   cond = cmp ...
  //   jge %SinkMBB
  //
  // FalseMBB:
  //   jmp %SinkMBB
  //
  // SinkMBB:
  //   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
  //   %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
  //                                          ; true-value with false-value
  //   %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
  //                                          ; previous Phi instruction result

  MachineInstr &MI = *Group.front();
  MachineInstr *LastCMOV = Group.back();
  DebugLoc DL = MI.getDebugLoc();

  X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI));
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  // Potentially swap the condition codes so that any memory operand to a CMOV
  // is in the *false* position instead of the *true* position. We can invert
  // any non-memory operand CMOV instructions to cope with this and we ensure
  // memory operand CMOVs are only included with a single condition code.
  if (llvm::any_of(Group, [&](MachineInstr *I) {
        return I->mayLoad() && X86::getCondFromCMov(*I) == CC;
      }))
    std::swap(CC, OppCC);

  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction::iterator It = ++MBB->getIterator();
  MachineFunction *F = MBB->getParent();
  const BasicBlock *BB = MBB->getBasicBlock();

  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
  F->insert(It, FalseMBB);
  F->insert(It, SinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  if (checkEFLAGSLive(LastCMOV)) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of MBB and its successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Add the false and sink blocks as its successors.
  MBB->addSuccessor(FalseMBB);
  MBB->addSuccessor(SinkMBB);

  // Create the conditional branch instruction.
  BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);

  // Add the sink block to the false block successors.
  FalseMBB->addSuccessor(SinkMBB);

  MachineInstrBuilder MIB;
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

  // First we need to insert an explicit load on the false path for any memory
  // operand. We also need to potentially do register rewriting here, but it is
  // simpler as the memory operands are always on the false path so we can
  // simply take that input, whatever it is.
  DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
    auto &MI = *MIIt++;
    // Skip any CMOVs in this group which don't load from memory.
    if (!MI.mayLoad()) {
      // Remember the false-side register input.
      Register FalseReg =
          MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
      // Walk back through any intermediate cmovs referenced.
      while (true) {
        auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
        if (FRIt == FalseBBRegRewriteTable.end())
          break;
        FalseReg = FRIt->second;
      }
      FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg;
      continue;
    }

    // The condition must be the *opposite* of the one we've decided to branch
    // on as the branch will go *around* the load and the load should happen
    // when the CMOV condition is false.
    assert(X86::getCondFromCMov(MI) == OppCC &&
           "Can only handle memory-operand cmov instructions with a condition "
           "opposite to the selected branch direction.");

    // The goal is to rewrite the cmov from:
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), (mem)
    //
    // to
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), %C
    //   FalseMBB:
    //     %C = MOV (mem)
    //
    // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
    //
    //   MBB:
    //     JMP!cc SinkMBB
    //   FalseMBB:
    //     %C = MOV (mem)
    //   SinkMBB:
    //     %A = PHI [ %C, FalseMBB ], [ %B, MBB]

    // Get a fresh register to use as the destination of the MOV.
    const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
    Register TmpReg = MRI->createVirtualRegister(RC);

    // Retain the debug instr number when unfolded.
    unsigned OldDebugInstrNum = MI.peekDebugInstrNum();
    SmallVector<MachineInstr *, 4> NewMIs;
    bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
                                             /*UnfoldLoad*/ true,
                                             /*UnfoldStore*/ false, NewMIs);
    (void)Unfolded;
    assert(Unfolded && "Should never fail to unfold a loading cmov!");

    // Move the new CMOV to just before the old one and reset any impacted
    // iterator.
    auto *NewCMOV = NewMIs.pop_back_val();
    assert(X86::getCondFromCMov(*NewCMOV) == OppCC &&
           "Last new instruction isn't the expected CMOV!");
    LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
    MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
    if (&*MIItBegin == &MI)
      MIItBegin = MachineBasicBlock::iterator(NewCMOV);

    if (OldDebugInstrNum)
      NewCMOV->setDebugInstrNum(OldDebugInstrNum);

    // Sink whatever instructions were needed to produce the unfolded operand
    // into the false block.
    for (auto *NewMI : NewMIs) {
      LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
      FalseMBB->insert(FalseInsertionPoint, NewMI);
      // Re-map any operands that come from other cmovs to the inputs for this
      // block.
      for (auto &MOp : NewMI->uses()) {
        if (!MOp.isReg())
          continue;
        auto It = FalseBBRegRewriteTable.find(MOp.getReg());
        if (It == FalseBBRegRewriteTable.end())
          continue;

        MOp.setReg(It->second);
        // This might have been a kill when it referenced the cmov result, but
        // it won't necessarily be once rewritten.
        // FIXME: We could potentially improve this by tracking whether the
        // operand to the cmov was also a kill, and then skipping the PHI node
        // construction below.
        MOp.setIsKill(false);
      }
    }
    MBB->erase(&MI);

    // Record the unfolded load's result register in the false-block rewrite
    // table so that later cmovs in the group can refer to it.
    FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
  }

  // As we are creating the PHIs, we have to be careful if there is more than
  // one. Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from earlier PHIs'
  // destination registers to the registers that went into the PHI.
  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    Register DestReg = MIIt->getOperand(0).getReg();
    Register Op1Reg = MIIt->getOperand(1).getReg();
    Register Op2Reg = MIIt->getOperand(2).getReg();

    // If the CMOV we are processing has the opposite condition to the jump we
    // generated, then we have to swap the operands for the PHI that is going
    // to be generated.
    if (X86::getCondFromCMov(*MIIt) == OppCC)
      std::swap(Op1Reg, Op2Reg);

    auto Op1Itr = RegRewriteTable.find(Op1Reg);
    if (Op1Itr != RegRewriteTable.end())
      Op1Reg = Op1Itr->second.first;

    auto Op2Itr = RegRewriteTable.find(Op2Reg);
    if (Op2Itr != RegRewriteTable.end())
      Op2Reg = Op2Itr->second.second;

    // SinkMBB:
    //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
    //   ...
    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg)
              .addMBB(FalseMBB)
              .addReg(Op2Reg)
              .addMBB(MBB);
    (void)MIB;
    LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
    LLVM_DEBUG(dbgs() << "\tTo: "; MIB->dump());

    // debug-info: we can just copy the instr-ref number from one instruction
    // to the other, seeing how it's a one-for-one substitution.
    if (unsigned InstrNum = MIIt->peekDebugInstrNum())
      MIB->setDebugInstrNum(InstrNum);

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // Reset the NoPHIs property if a PHI was inserted to prevent a conflict with
  // the MachineVerifier during testing.
  if (MIItBegin != MIItEnd)
    F->getProperties().reset(MachineFunctionProperties::Property::NoPHIs);

  // Now remove the CMOV(s).
  MBB->erase(MIItBegin, MIItEnd);

  // Add the new basic blocks to MachineLoopInfo.
  if (MachineLoop *L = MLI->getLoopFor(MBB)) {
    L->addBasicBlockToLoop(FalseMBB, *MLI);
    L->addBasicBlockToLoop(SinkMBB, *MLI);
  }
}

INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                    false, false)

FunctionPass *llvm::createX86CmovConverterPass() {
  return new X86CmovConverterPass();
}