//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file implements a pass that converts X86 cmov instructions into
/// branches when profitable. This pass is conservative. It transforms if and
/// only if it can guarantee a gain with high confidence.
///
/// Thus, the optimization applies under the following conditions:
/// 1. Consider as candidates only CMOVs in innermost loops (assume that
///    most hotspots are represented by these loops).
/// 2. Given a group of CMOV instructions that are using the same EFLAGS def
///    instruction:
///    a. Consider them as candidates only if all have the same condition code
///       or the opposite one, to prevent generating more than one conditional
///       jump per EFLAGS def instruction.
///    b. Consider them as candidates only if all are profitable to be
///       converted (assume that one bad conversion may cause a degradation).
/// 3. Apply conversion only for loops that are found profitable and only for
///    CMOV candidates that were found profitable.
///    a. A loop is considered profitable only if conversion will reduce its
///       depth cost by some threshold.
///    b. A CMOV is considered profitable if the cost of its condition is
///       higher than the average cost of its true-value and false-value by
///       25% of the branch-misprediction penalty. This assures no degradation
///       even with 25% branch misprediction.
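///       For example (hypothetical numbers): with a 20-cycle
///       branch-misprediction penalty, a CMOV whose condition depth exceeds
///       the weighted value depth by at least 5 cycles (25% of 20) passes
///       check 3.b.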
///
/// Note: This pass is assumed to run on SSA machine code.
//
//===----------------------------------------------------------------------===//
//
// External interfaces:
//   FunctionPass *llvm::createX86CmovConverterPass();
//   bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/CGPassBuilderOption.h"
#include <algorithm>
#include <cassert>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "x86-cmov-conversion"

STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");

// This internal switch can be used to turn off the cmov/branch optimization.
static cl::opt<bool>
    EnableCmovConverter("x86-cmov-converter",
                        cl::desc("Enable the X86 cmov-to-branch optimization."),
                        cl::init(true), cl::Hidden);

static cl::opt<unsigned>
    GainCycleThreshold("x86-cmov-converter-threshold",
                       cl::desc("Minimum gain per loop (in cycles) threshold."),
                       cl::init(4), cl::Hidden);

static cl::opt<bool> ForceMemOperand(
    "x86-cmov-converter-force-mem-operand",
    cl::desc("Convert cmovs to branches whenever they have memory operands."),
    cl::init(true), cl::Hidden);

static cl::opt<bool> ForceAll(
    "x86-cmov-converter-force-all",
    cl::desc("Convert all cmovs to branches."),
    cl::init(false), cl::Hidden);
namespace {

/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass {
public:
  X86CmovConverterPass() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "X86 cmov Conversion"; }
  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

  /// Pass identification, replacement for typeid.
  static char ID;

private:
  MachineRegisterInfo *MRI = nullptr;
  const TargetInstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;
  MachineLoopInfo *MLI = nullptr;
  TargetSchedModel TSchedModel;

  /// List of consecutive CMOV instructions.
  using CmovGroup = SmallVector<MachineInstr *, 2>;
  using CmovGroups = SmallVector<CmovGroup, 2>;

  /// Collect all CMOV-group-candidates in \p Blocks and update \p
  /// CmovInstGroups accordingly.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in the loop.
  /// \returns true iff it found any CMOV-group-candidate.
  bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                             CmovGroups &CmovInstGroups,
                             bool IncludeLoads = false);

  /// Check if it is profitable to transform each CMOV-group-candidate into a
  /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
  ///
  /// \param Blocks List of blocks to process.
  /// \param CmovInstGroups List of consecutive CMOV instructions in the loop.
  /// \returns true iff any CMOV-group-candidate remains.
  bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
                                        CmovGroups &CmovInstGroups);

  /// Convert the given list of consecutive CMOV instructions into a branch.
  ///
  /// \param Group Consecutive CMOV instructions to be converted into a branch.
  void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
};

} // end anonymous namespace

char X86CmovConverterPass::ID = 0;

void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
  MachineFunctionPass::getAnalysisUsage(AU);
  AU.addRequired<MachineLoopInfoWrapperPass>();
}

bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  if (!EnableCmovConverter)
    return false;

  // If the SelectOptimize pass is enabled, cmovs have already been optimized.
  if (!getCGPassBuilderOption().DisableSelectOptimize)
    return false;

  LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
                    << "**********\n");

  bool Changed = false;
  MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  MRI = &MF.getRegInfo();
  TII = STI.getInstrInfo();
  TRI = STI.getRegisterInfo();
  TSchedModel.init(&STI);

  // Before we handle the more subtle cases of register-register CMOVs inside
  // of potentially hot loops, we want to quickly remove all CMOVs (ForceAll)
  // or the ones with a memory operand (ForceMemOperand option). The latter
  // CMOV risks a stall waiting for the load to complete, which speculative
  // execution behind a branch is better suited to handle on modern x86 chips.
  if (ForceMemOperand || ForceAll) {
    CmovGroups AllCmovGroups;
    SmallVector<MachineBasicBlock *, 4> Blocks(llvm::make_pointer_range(MF));
    if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
      for (auto &Group : AllCmovGroups) {
        // Skip any group that doesn't do at least one memory operand cmov.
        if (ForceMemOperand && !ForceAll &&
            llvm::none_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
          continue;

        // For CMOV groups which we can rewrite and which contain a memory
        // load, always rewrite them. On x86, a CMOV will dramatically amplify
        // any memory latency by blocking speculative execution.
        Changed = true;
        convertCmovInstsToBranches(Group);
      }
    }
    // Early return as ForceAll converts all CmovGroups.
    if (ForceAll)
      return Changed;
  }

  //===--------------------------------------------------------------------===//
  // Register-operand Conversion Algorithm
  // ---------
  //   For each innermost loop
  //     collectCmovCandidates() {
  //       Find all CMOV-group-candidates.
  //     }
  //
  //     checkForProfitableCmovCandidates() {
  //       * Calculate both loop-depth and optimized-loop-depth.
  //       * Use these depths to check for loop transformation profitability.
  //       * Check for CMOV-group-candidate transformation profitability.
  //     }
  //
  //     For each profitable CMOV-group-candidate
  //       convertCmovInstsToBranches() {
  //         * Create FalseBB, SinkBB, Conditional branch to SinkBB.
  //         * Replace each CMOV instruction with a PHI instruction in SinkBB.
  //       }
  //
  // Note: For more details, see each function description.
  //===--------------------------------------------------------------------===//

  // Build up the loops in pre-order.
  SmallVector<MachineLoop *, 4> Loops(MLI->begin(), MLI->end());
  // Note that we need to check size on each iteration as we accumulate child
  // loops.
  for (int i = 0; i < (int)Loops.size(); ++i)
    llvm::append_range(Loops, Loops[i]->getSubLoops());

  for (MachineLoop *CurrLoop : Loops) {
    // Optimize only innermost loops.
    if (!CurrLoop->getSubLoops().empty())
      continue;

    // List of consecutive CMOV instructions to be processed.
    CmovGroups CmovInstGroups;

    if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups))
      continue;

    if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
                                          CmovInstGroups))
      continue;

    Changed = true;
    for (auto &Group : CmovInstGroups)
      convertCmovInstsToBranches(Group);
  }

  return Changed;
}

bool X86CmovConverterPass::collectCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
    bool IncludeLoads) {
  //===--------------------------------------------------------------------===//
  // Collect all CMOV-group-candidates and add them into CmovInstGroups.
  //
  // CMOV-group:
  //   CMOV instructions in the same MBB that use the same EFLAGS def
  //   instruction.
  //
  // CMOV-group-candidate:
  //   CMOV-group where all the CMOV instructions are
  //     1. consecutive.
  //     2. have the same condition code or the opposite one.
  //     3. have only register operands (X86::CMOVrr).
  //===--------------------------------------------------------------------===//
  // List of possible improvements (TODOs):
  // --------------------------------------
  // TODO: Add support for X86::CMOVrm instructions.
  // TODO: Add support for X86::SETcc instructions.
  // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
  //===--------------------------------------------------------------------===//

  // Currently processed CMOV-group.
  CmovGroup Group;
  for (auto *MBB : Blocks) {
    Group.clear();
    // Condition code of the first CMOV instruction in the currently processed
    // range, and its opposite condition code.
    X86::CondCode FirstCC = X86::COND_INVALID, FirstOppCC = X86::COND_INVALID,
                  MemOpCC = X86::COND_INVALID;
    // Indicator of a non-CMOVrr instruction in the currently processed range.
    bool FoundNonCMOVInst = false;
    // Indicator of whether the currently processed CMOV-group should be
    // skipped.
    bool SkipGroup = false;

    for (auto &I : *MBB) {
      // Skip debug instructions.
      if (I.isDebugInstr())
        continue;

      X86::CondCode CC = X86::getCondFromCMov(I);
      // Check if we found an X86::CMOVrr instruction. If it is marked as
      // unpredictable, skip it and do not convert it to a branch.
      if (CC != X86::COND_INVALID &&
          !I.getFlag(MachineInstr::MIFlag::Unpredictable) &&
          (IncludeLoads || !I.mayLoad())) {
        if (Group.empty()) {
          // We found the first CMOV in the range, reset flags.
          FirstCC = CC;
          FirstOppCC = X86::GetOppositeBranchCondition(CC);
          // Clear out the prior group's memory operand CC.
          MemOpCC = X86::COND_INVALID;
          FoundNonCMOVInst = false;
          SkipGroup = false;
        }
        Group.push_back(&I);
        // Check if it is a non-consecutive CMOV instruction or if it has a
        // different condition code than FirstCC or FirstOppCC.
        if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
          // Mark the SkipGroup indicator to skip the currently processed
          // CMOV-group.
          SkipGroup = true;
        if (I.mayLoad()) {
          if (MemOpCC == X86::COND_INVALID)
            // The first memory operand CMOV.
            MemOpCC = CC;
          else if (CC != MemOpCC)
            // Can't handle mixed conditions with memory operands.
            SkipGroup = true;
        }
        // Check if we were relying on zero-extending behavior of the CMOV.
        if (!SkipGroup &&
            llvm::any_of(
                MRI->use_nodbg_instructions(I.defs().begin()->getReg()),
                [&](MachineInstr &UseI) {
                  return UseI.getOpcode() == X86::SUBREG_TO_REG;
                }))
          // FIXME: We should model the cost of using an explicit MOV to handle
          // the zero-extension rather than just refusing to handle this.
          SkipGroup = true;
        continue;
      }
      // If Group is empty, keep looking for the first CMOV in the range.
      if (Group.empty())
        continue;

      // We found a non-X86::CMOVrr instruction.
      FoundNonCMOVInst = true;
      // Check if this instruction defines EFLAGS to determine the end of the
      // processed range, as there would be no more instructions using the
      // current EFLAGS def.
      if (I.definesRegister(X86::EFLAGS, /*TRI=*/nullptr)) {
        // Check if the currently processed CMOV-group should not be skipped
        // and add it as a CMOV-group-candidate.
        if (!SkipGroup)
          CmovInstGroups.push_back(Group);
        else
          ++NumOfSkippedCmovGroups;
        Group.clear();
      }
    }
    // The end of the basic block is considered the end of the range. Check if
    // the currently processed CMOV-group should not be skipped and add it as a
    // CMOV-group-candidate.
    if (Group.empty())
      continue;
    if (!SkipGroup)
      CmovInstGroups.push_back(Group);
    else
      ++NumOfSkippedCmovGroups;
  }

  NumOfCmovGroupCandidate += CmovInstGroups.size();
  return !CmovInstGroups.empty();
}

/// \returns Depth of CMOV instruction as if it was converted into a branch.
/// \param TrueOpDepth depth cost of CMOV true-value operand.
/// \param FalseOpDepth depth cost of CMOV false-value operand.
static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
  // The depth of the result after branch conversion is
  // TrueOpDepth * TrueOpProbability + FalseOpDepth * FalseOpProbability.
  // As we have no info about branch weight, we assume 75% for one and 25% for
  // the other, and pick the result with the largest resulting depth.
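  // For example (hypothetical depths), TrueOpDepth = 8 and FalseOpDepth = 2
  // give max(ceil((8*3 + 2) / 4), ceil((2*3 + 8) / 4)) = max(7, 4) = 7.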
  return std::max(divideCeil(TrueOpDepth * 3 + FalseOpDepth, 4),
                  divideCeil(FalseOpDepth * 3 + TrueOpDepth, 4));
}

bool X86CmovConverterPass::checkForProfitableCmovCandidates(
    ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
  struct DepthInfo {
    /// Depth of original loop.
    unsigned Depth;
    /// Depth of optimized loop.
    unsigned OptDepth;
  };
  /// Number of loop iterations over which instruction depth is calculated.
  static const unsigned LoopIterations = 2;
  DenseMap<MachineInstr *, DepthInfo> DepthMap;
  DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
  enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
  /// For each register type, maps the register to its last def instruction.
  DenseMap<Register, MachineInstr *> RegDefMaps[RegTypeNum];
  /// Maps a register operand to its def instruction, which can be nullptr if
  /// it is unknown (e.g., the operand is defined outside the loop).
  DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;

  // Set the depth of an unknown instruction (i.e., nullptr) to zero.
  DepthMap[nullptr] = {0, 0};

  SmallPtrSet<MachineInstr *, 4> CmovInstructions;
  for (auto &Group : CmovInstGroups)
    CmovInstructions.insert_range(Group);

  //===--------------------------------------------------------------------===//
  // Step 1: Calculate instruction depth and loop depth.
  // Optimized-Loop:
  //   loop with CMOV-group-candidates converted into branches.
  //
  // Instruction-Depth:
  //   instruction latency + max operand depth.
  //   * For a CMOV instruction in the optimized loop, the depth is calculated
  //     as: CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-Depth)
  // TODO: Find a better way to estimate the latency of the branch instruction
  //       rather than using the CMOV latency.
  //
  // Loop-Depth:
  //   max instruction depth of all instructions in the loop.
  // Note: the instruction with max depth represents the critical path in the
  //       loop.
  //
  // Loop-Depth[i]:
  //   Loop-Depth calculated for the first `i` iterations.
  // Note: it is enough to calculate depth for up to two iterations.
  //
  // Depth-Diff[i]:
  //   Number of cycles saved in the first `i` iterations by optimizing the
  //   loop.
  //===--------------------------------------------------------------------===//
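  // For example (hypothetical latencies): if operand %a has depth 3, operand
  // %b has depth 5, and an ADD with latency 1 computes %c from %a and %b, the
  // depth of %c is max(3, 5) + 1 = 6.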
  for (DepthInfo &MaxDepth : LoopDepth) {
    for (auto *MBB : Blocks) {
      // Clear physical registers Def map.
      RegDefMaps[PhyRegType].clear();
      for (MachineInstr &MI : *MBB) {
        // Skip debug instructions.
        if (MI.isDebugInstr())
          continue;
        unsigned MIDepth = 0;
        unsigned MIDepthOpt = 0;
        bool IsCMOV = CmovInstructions.count(&MI);
        for (auto &MO : MI.uses()) {
          // Check for "isUse()" as "uses()" also returns implicit definitions.
          if (!MO.isReg() || !MO.isUse())
            continue;
          Register Reg = MO.getReg();
          auto &RDM = RegDefMaps[Reg.isVirtual()];
          if (MachineInstr *DefMI = RDM.lookup(Reg)) {
            OperandToDefMap[&MO] = DefMI;
            DepthInfo Info = DepthMap.lookup(DefMI);
            MIDepth = std::max(MIDepth, Info.Depth);
            if (!IsCMOV)
              MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
          }
        }

        if (IsCMOV)
          MIDepthOpt = getDepthOfOptCmov(
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
              DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);

        // Iterate over all operands to handle implicit definitions as well.
        for (auto &MO : MI.operands()) {
          if (!MO.isReg() || !MO.isDef())
            continue;
          Register Reg = MO.getReg();
          RegDefMaps[Reg.isVirtual()][Reg] = &MI;
        }

        unsigned Latency = TSchedModel.computeInstrLatency(&MI);
        DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
        MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
        MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
      }
    }
  }

  unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
                                   LoopDepth[1].Depth - LoopDepth[1].OptDepth};

  //===--------------------------------------------------------------------===//
  // Step 2: Check if the loop is worth optimizing.
  // Worth-Optimize-Loop:
  //   case 1: Diff[1] == Diff[0]
  //           The critical path is iteration independent - there is no
  //           dependency of critical-path instructions on critical-path
  //           instructions of the previous iteration.
  //           Thus, it is enough to check the gain percent of the 1st
  //           iteration - to be conservative, the optimized loop needs to have
  //           a depth at least 12.5% smaller than the original loop, per
  //           iteration.
  //
  //   case 2: Diff[1] > Diff[0]
  //           The critical path is iteration dependent - there is a dependency
  //           of critical-path instructions on critical-path instructions of
  //           the previous iteration.
  //           Thus, check the gain percent of the 2nd iteration (similar to
  //           the previous case), but it is also required to check the
  //           gradient of the gain - the change in Depth-Diff compared to the
  //           change in Loop-Depth between the 1st and 2nd iterations.
  //           To be conservative, the gradient needs to be at least 50%.
  //
  //   In addition, in order not to optimize loops with a very small gain, the
  //   gain (in cycles) after the 2nd iteration should not be less than a given
  //   threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
  //
  // If the loop is not worth optimizing, remove all CMOV-group-candidates.
  //===--------------------------------------------------------------------===//
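  // For example (hypothetical depths): if the 1st iteration gives Depth = 24
  // and OptDepth = 20 (Diff[0] = 4) and the 2nd gives Depth = 44 and
  // OptDepth = 40 (Diff[1] = 4), then with the default GainCycleThreshold of 4
  // we have Diff[1] >= 4 and Diff[0] * 8 = 32 >= 24, so the loop is considered
  // worth optimizing.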
  if (Diff[1] < GainCycleThreshold)
    return false;

  bool WorthOptLoop = false;
  if (Diff[1] == Diff[0])
    WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
  else if (Diff[1] > Diff[0])
    WorthOptLoop =
        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
        (Diff[1] * 8 >= LoopDepth[1].Depth);

  if (!WorthOptLoop)
    return false;

  ++NumOfLoopCandidate;

  //===--------------------------------------------------------------------===//
  // Step 3: Check for each CMOV-group-candidate if it is worth optimizing.
  // Worth-Optimize-Group:
  //   Iff it is worth optimizing all CMOV instructions in the group.
  //
  // Worth-Optimize-CMOV:
  //   A predicted branch is faster than CMOV by the difference between the
  //   depth of the condition operand and the depth of the taken (predicted)
  //   value operand.
  //   To be conservative, the gain of such a CMOV transformation should cover
  //   at least 25% of the branch-misprediction penalty.
  //===--------------------------------------------------------------------===//
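  // For example (hypothetical numbers): with MispredictPenalty = 20, a CMOV
  // with CondCost = 12 and ValCost = 6 gives (12 - 6) * 4 = 24 >= 20, so it is
  // worth converting; with CondCost = 10 the gain (10 - 6) * 4 = 16 < 20 and
  // the group is kept as CMOV.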
  unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
  CmovGroups TempGroups;
  std::swap(TempGroups, CmovInstGroups);
  for (auto &Group : TempGroups) {
    bool WorthOpGroup = true;
    for (auto *MI : Group) {
      // Avoid CMOV instructions whose value is used as a pointer to load from.
      // This is another conservative check to avoid converting a CMOV
      // instruction used with a tree-search-like algorithm, where the branch
      // is unpredictable.
      auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
      if (hasSingleElement(UIs)) {
        unsigned Op = UIs.begin()->getOpcode();
        if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
          WorthOpGroup = false;
          break;
        }
      }

      unsigned CondCost =
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(4))].Depth;
      unsigned ValCost = getDepthOfOptCmov(
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
          DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
      if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
        WorthOpGroup = false;
        break;
      }
    }

    if (WorthOpGroup)
      CmovInstGroups.push_back(Group);
  }

  return !CmovInstGroups.empty();
}

static bool checkEFLAGSLive(MachineInstr *MI) {
  if (MI->killsRegister(X86::EFLAGS, /*TRI=*/nullptr))
    return false;

  // The EFLAGS operand of MI might be missing a kill marker.
  // Figure out whether EFLAGS is live after the MI instruction.
  MachineBasicBlock *BB = MI->getParent();
  MachineBasicBlock::iterator ItrMI = MI;

  // Scan forward through BB for a use/def of EFLAGS.
  for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
    if (I->readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
      return true;
    if (I->definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
      return false;
  }

  // We hit the end of the block, check whether EFLAGS is live into a
  // successor.
  for (MachineBasicBlock *Succ : BB->successors())
    if (Succ->isLiveIn(X86::EFLAGS))
      return true;

  return false;
}

/// Given \p First CMOV instruction and \p Last CMOV instruction representing a
/// group of CMOV instructions, which may contain debug instructions in
/// between, move all debug instructions to after the last CMOV instruction,
/// making the CMOV group consecutive.
static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
  assert(X86::getCondFromCMov(*Last) != X86::COND_INVALID &&
         "Last instruction in a CMOV group must be a CMOV instruction");

  SmallVector<MachineInstr *, 2> DBGInstructions;
  for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
    if (I->isDebugInstr())
      DBGInstructions.push_back(&*I);
  }

  // Splice the debug instructions after the cmov group.
  MachineBasicBlock *MBB = First->getParent();
  for (auto *MI : DBGInstructions)
    MBB->insertAfter(Last, MI->removeFromParent());
}

void X86CmovConverterPass::convertCmovInstsToBranches(
    SmallVectorImpl<MachineInstr *> &Group) const {
  assert(!Group.empty() && "No CMOV instructions to convert");
  ++NumOfOptimizedCmovGroups;

  // If the CMOV group is not packed, e.g., there are debug instructions
  // between the first CMOV and the last CMOV, then pack the group and make the
  // CMOV instructions consecutive by moving the debug instructions to after
  // the last CMOV.
  packCmovGroup(Group.front(), Group.back());

  // To convert a CMOVcc instruction, we actually have to insert the diamond
  // control-flow pattern. The incoming instruction knows the destination vreg
  // to set, the condition code register to branch on, the true/false values to
  // select between, and a branch opcode to use.

  // Before
  // -----
  // MBB:
  //   cond = cmp ...
  //   v1 = CMOVge t1, f1, cond
  //   v2 = CMOVlt t2, f2, cond
  //   v3 = CMOVge v1, f3, cond
  //
  // After
  // -----
  // MBB:
  //   cond = cmp ...
  //   jge %SinkMBB
  //
  // FalseMBB:
  //   jmp %SinkMBB
  //
  // SinkMBB:
  //   %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
  //   %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
  //                                          ; true-value with false-value
  //   %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
  //                                          ; previous Phi instruction result

  MachineInstr &MI = *Group.front();
  MachineInstr *LastCMOV = Group.back();
  DebugLoc DL = MI.getDebugLoc();

  X86::CondCode CC = X86::CondCode(X86::getCondFromCMov(MI));
  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
  // Potentially swap the condition codes so that any memory operand to a CMOV
  // is in the *false* position instead of the *true* position. We can invert
  // any non-memory operand CMOV instructions to cope with this and we ensure
  // memory operand CMOVs are only included with a single condition code.
  if (llvm::any_of(Group, [&](MachineInstr *I) {
        return I->mayLoad() && X86::getCondFromCMov(*I) == CC;
      }))
    std::swap(CC, OppCC);

  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction::iterator It = ++MBB->getIterator();
  MachineFunction *F = MBB->getParent();
  const BasicBlock *BB = MBB->getBasicBlock();

  MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
  MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
  F->insert(It, FalseMBB);
  F->insert(It, SinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  if (checkEFLAGSLive(LastCMOV)) {
    FalseMBB->addLiveIn(X86::EFLAGS);
    SinkMBB->addLiveIn(X86::EFLAGS);
  }

  // Transfer the remainder of BB and its successor edges to SinkMBB.
  SinkMBB->splice(SinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
  SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Add the false and sink blocks as its successors.
  MBB->addSuccessor(FalseMBB);
  MBB->addSuccessor(SinkMBB);

  // Create the conditional branch instruction.
  BuildMI(MBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);

  // Add the sink block to the false block successors.
  FalseMBB->addSuccessor(SinkMBB);

  MachineInstrBuilder MIB;
  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
  MachineBasicBlock::iterator MIItEnd =
      std::next(MachineBasicBlock::iterator(LastCMOV));
  MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
  MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();

  // First we need to insert an explicit load on the false path for any memory
  // operand. We also need to potentially do register rewriting here, but it is
  // simpler as the memory operands are always on the false path so we can
  // simply take that input, whatever it is.
  DenseMap<Register, Register> FalseBBRegRewriteTable;
  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
    auto &MI = *MIIt++;
    // Skip any CMOVs in this group which don't load from memory.
    if (!MI.mayLoad()) {
      // Remember the false-side register input.
      Register FalseReg =
          MI.getOperand(X86::getCondFromCMov(MI) == CC ? 1 : 2).getReg();
      // Walk back through any intermediate cmovs referenced.
      while (true) {
        auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
        if (FRIt == FalseBBRegRewriteTable.end())
          break;
        FalseReg = FRIt->second;
      }
      FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg;
      continue;
    }

    // The condition must be the *opposite* of the one we've decided to branch
    // on as the branch will go *around* the load and the load should happen
    // when the CMOV condition is false.
    assert(X86::getCondFromCMov(MI) == OppCC &&
           "Can only handle memory-operand cmov instructions with a condition "
           "opposite to the selected branch direction.");

    // The goal is to rewrite the cmov from:
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), (mem)
    //
    // to
    //
    //   MBB:
    //     %A = CMOVcc %B (tied), %C
    //   FalseMBB:
    //     %C = MOV (mem)
    //
    // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
    //
    //   MBB:
    //     JMP!cc SinkMBB
    //   FalseMBB:
    //     %C = MOV (mem)
    //   SinkMBB:
    //     %A = PHI [ %C, FalseMBB ], [ %B, MBB]

    // Get a fresh register to use as the destination of the MOV.
    const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
    Register TmpReg = MRI->createVirtualRegister(RC);

    // Retain the debug instr number when unfolded.
    unsigned OldDebugInstrNum = MI.peekDebugInstrNum();
    SmallVector<MachineInstr *, 4> NewMIs;
    bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
                                             /*UnfoldLoad*/ true,
                                             /*UnfoldStore*/ false, NewMIs);
    (void)Unfolded;
    assert(Unfolded && "Should never fail to unfold a loading cmov!");

    // Move the new CMOV to just before the old one and reset any impacted
    // iterator.
    auto *NewCMOV = NewMIs.pop_back_val();
    assert(X86::getCondFromCMov(*NewCMOV) == OppCC &&
           "Last new instruction isn't the expected CMOV!");
    LLVM_DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
    MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
    if (&*MIItBegin == &MI)
      MIItBegin = MachineBasicBlock::iterator(NewCMOV);

    if (OldDebugInstrNum)
      NewCMOV->setDebugInstrNum(OldDebugInstrNum);

    // Sink whatever instructions were needed to produce the unfolded operand
    // into the false block.
    for (auto *NewMI : NewMIs) {
      LLVM_DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
      FalseMBB->insert(FalseInsertionPoint, NewMI);
      // Re-map any operands that are from other cmovs to the inputs for this
      // block.
      for (auto &MOp : NewMI->uses()) {
        if (!MOp.isReg())
          continue;
        auto It = FalseBBRegRewriteTable.find(MOp.getReg());
        if (It == FalseBBRegRewriteTable.end())
          continue;

        MOp.setReg(It->second);
        // This might have been a kill when it referenced the cmov result, but
        // it won't necessarily be once rewritten.
        // FIXME: We could potentially improve this by tracking whether the
        // operand to the cmov was also a kill, and then skipping the PHI node
        // construction below.
        MOp.setIsKill(false);
      }
    }
    MBB->erase(&MI);

    // Add this PHI to the rewrite table.
    FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
  }

  // As we are creating the PHIs, we have to be careful if there is more than
  // one. Later CMOVs may reference the results of earlier CMOVs, but later
  // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from earlier PHIs'
  // destination registers to the registers that went into the PHI.
  DenseMap<Register, std::pair<Register, Register>> RegRewriteTable;

  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
    Register DestReg = MIIt->getOperand(0).getReg();
    Register Op1Reg = MIIt->getOperand(1).getReg();
    Register Op2Reg = MIIt->getOperand(2).getReg();

    // If the CMOV we are processing has the opposite condition from the jump
    // we generated, then we have to swap the operands for the PHI that is
    // going to be generated.
    if (X86::getCondFromCMov(*MIIt) == OppCC)
      std::swap(Op1Reg, Op2Reg);

    auto Op1Itr = RegRewriteTable.find(Op1Reg);
    if (Op1Itr != RegRewriteTable.end())
      Op1Reg = Op1Itr->second.first;

    auto Op2Itr = RegRewriteTable.find(Op2Reg);
    if (Op2Itr != RegRewriteTable.end())
      Op2Reg = Op2Itr->second.second;

    // SinkMBB:
    //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
    //   ...
    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg)
              .addMBB(FalseMBB)
              .addReg(Op2Reg)
              .addMBB(MBB);
    (void)MIB;
    LLVM_DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
    LLVM_DEBUG(dbgs() << "\tTo: "; MIB->dump());

    // debug-info: we can just copy the instr-ref number from one instruction
    // to the other, seeing how it's a one-for-one substitution.
    if (unsigned InstrNum = MIIt->peekDebugInstrNum())
      MIB->setDebugInstrNum(InstrNum);

    // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  // Reset the NoPHIs property if a PHI was inserted to prevent a conflict with
  // the MachineVerifier during testing.
  if (MIItBegin != MIItEnd)
    F->getProperties().resetNoPHIs();

  // Now remove the CMOV(s).
  MBB->erase(MIItBegin, MIItEnd);

  // Add the new basic blocks to MachineLoopInfo.
  if (MachineLoop *L = MLI->getLoopFor(MBB)) {
    L->addBasicBlockToLoop(FalseMBB, *MLI);
    L->addBasicBlockToLoop(SinkMBB, *MLI);
  }
}

INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
                    false, false)

FunctionPass *llvm::createX86CmovConverterPass() {
  return new X86CmovConverterPass();
}