MergeICmps.cpp source code [llvm_projects/llvm/lib/Transforms/Scalar/MergeICmps.cpp]

1	//===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This pass turns chains of integer comparisons into memcmp (the memcmp is
10	// later typically inlined as a chain of efficient hardware comparisons). This
11	// typically benefits c++ member or nonmember operator==().
12	//
13	// The basic idea is to replace a longer chain of integer comparisons loaded
14	// from contiguous memory locations with a shorter chain of larger integer
15	// comparisons. The benefits are twofold:
16	// - There are fewer jumps, and therefore fewer opportunities for
17	// mispredictions and I-cache misses.
18	// - Code size is smaller, both because jumps are removed and because the
19	// encoding of a 2n-byte compare is smaller than that of two n-byte
20	// compares.
21	//
22	// Example:
23	//
24	// struct S {
25	// int a;
26	// char b;
27	// char c;
28	// uint16_t d;
29	// bool operator==(const S& o) const {
30	// return a == o.a && b == o.b && c == o.c && d == o.d;
31	// }
32	// };
33	//
34	// Is optimized as :
35	//
36	// bool S::operator==(const S& o) const {
37	// return memcmp(this, &o, 8) == 0;
38	// }
39	//
40	// Which will later be expanded (ExpandMemCmp) as a single 8-bytes icmp.
41	//
42	//===----------------------------------------------------------------------===//
43
44	#include "llvm/Transforms/Scalar/MergeICmps.h"
45	#include "llvm/ADT/SmallString.h"
46	#include "llvm/Analysis/DomTreeUpdater.h"
47	#include "llvm/Analysis/GlobalsModRef.h"
48	#include "llvm/Analysis/Loads.h"
49	#include "llvm/Analysis/TargetLibraryInfo.h"
50	#include "llvm/Analysis/TargetTransformInfo.h"
51	#include "llvm/IR/Dominators.h"
52	#include "llvm/IR/Function.h"
53	#include "llvm/IR/IRBuilder.h"
54	#include "llvm/IR/Instruction.h"
55	#include "llvm/IR/ProfDataUtils.h"
56	#include "llvm/InitializePasses.h"
57	#include "llvm/Pass.h"
58	#include "llvm/Transforms/Scalar.h"
59	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
60	#include "llvm/Transforms/Utils/BuildLibCalls.h"
61	#include <algorithm>
62	#include <numeric>
63	#include <utility>
64	#include <vector>
65
66	using namespace llvm;
67
68	#define DEBUG_TYPE "mergeicmps"
69
70	namespace llvm {
71	extern cl::opt<bool> ProfcheckDisableMetadataFixes;
72	} // namespace llvm
73	namespace {
74
75	// A BCE atom "Binary Compare Expression Atom" represents an integer load
76	// that is a constant offset from a base value, e.g. `a` or `o.c` in the example
77	// at the top.
78	struct BCEAtom {
79	BCEAtom() = default;
80	BCEAtom(GetElementPtrInst GEP, LoadInst LoadI, int BaseId, APInt Offset)
81	: GEP(GEP), LoadI(LoadI), BaseId(BaseId), Offset (std::move(Offset)) {}
82
83	BCEAtom(const BCEAtom &) = delete;
84	BCEAtom &operator=(const BCEAtom &) = delete;
85
86	BCEAtom(BCEAtom &&that) = default;
87	BCEAtom &operator=(BCEAtom &&that) {
88	if (this == &that)
89	return *this;
90	GEP = that.GEP;
91	LoadI = that.LoadI;
92	BaseId = that.BaseId;
93	Offset = std::move(that.Offset);
94	return *this;
95	}
96
97	// We want to order BCEAtoms by (Base, Offset). However we cannot use
98	// the pointer values for Base because these are non-deterministic.
99	// To make sure that the sort order is stable, we first assign to each atom
100	// base value an index based on its order of appearance in the chain of
101	// comparisons. We call this index `BaseOrdering`. For example, for:
102	// b[3] == c[2] && a[1] == d[1] && b[4] == c[3]
103	// \| block 1 \| \| block 2 \| \| block 3 \|
104	// b gets assigned index 0 and a index 1, because b appears as LHS in block 1,
105	// which is before block 2.
106	// We then sort by (BaseOrdering[LHS.Base()], LHS.Offset), which is stable.
107	bool operator<(const BCEAtom &O) const {
108	return BaseId != O.BaseId ? BaseId < O.BaseId : Offset.slt(RHS: O.Offset);
109	}
110
111	GetElementPtrInst GEP = nullptr*;
112	LoadInst LoadI = nullptr*;
113	unsigned BaseId = `0`;
114	APInt Offset;
115	};
116
117	// A class that assigns increasing ids to values in the order in which they are
118	// seen. See comment in `BCEAtom::operator<()``.
119	class BaseIdentifier {
120	public:
121	// Returns the id for value `Base`, after assigning one if `Base` has not been
122	// seen before.
123	int getBaseId(const Value *Base) {
124	assert(Base && "invalid base");
125	const auto Insertion = BaseToIndex.try_emplace(Key: Base, Args&: Order);
126	if (Insertion.second)
127	++Order;
128	return Insertion.first ->second;
129	}
130
131	private:
132	unsigned Order = `1`;
133	DenseMap<const Value, int*> BaseToIndex;
134	};
135	} // namespace
136
137	// If this value is a load from a constant offset w.r.t. a base address, and
138	// there are no other users of the load or address, returns the base address and
139	// the offset.
140	static BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
141	auto *const LoadI = dyn_cast<LoadInst>(Val);
142	if (!LoadI)
143	return {};
144	LLVM_DEBUG(dbgs() << "load\n");
145	if (LoadI->isUsedOutsideOfBlock(BB: LoadI->getParent())) {
146	LLVM_DEBUG(dbgs() << "used outside of block\n");
147	return {};
148	}
149	// Do not optimize atomic loads to non-atomic memcmp
150	if (!LoadI->isSimple()) {
151	LLVM_DEBUG(dbgs() << "volatile or atomic\n");
152	return {};
153	}
154	Value *Addr = LoadI->getOperand(i_nocapture: `0`);
155	if (Addr->getType()->getPointerAddressSpace() != `0`) {
156	LLVM_DEBUG(dbgs() << "from non-zero AddressSpace\n");
157	return {};
158	}
159	const auto &DL = LoadI->getDataLayout();
160	if (!isDereferenceablePointer(V: Addr, Ty: LoadI->getType(), DL)) {
161	LLVM_DEBUG(dbgs() << "not dereferenceable\n");
162	// We need to make sure that we can do comparison in any order, so we
163	// require memory to be unconditionally dereferenceable.
164	return {};
165	}
166
167	APInt Offset = APInt (DL.getIndexTypeSizeInBits(Ty: Addr->getType()), `0`);
168	Value *Base = Addr;
169	auto *GEP = dyn_cast<GetElementPtrInst>(Val: Addr);
170	if (GEP) {
171	LLVM_DEBUG(dbgs() << "GEP\n");
172	if (GEP->isUsedOutsideOfBlock(BB: LoadI->getParent())) {
173	LLVM_DEBUG(dbgs() << "used outside of block\n");
174	return {};
175	}
176	if (!GEP->accumulateConstantOffset(DL, Offset))
177	return {};
178	Base = GEP->getPointerOperand();
179	}
180	return BCEAtom (GEP, LoadI, BaseId.getBaseId(Base), Offset);
181	}
182
183	namespace {
184	// A comparison between two BCE atoms, e.g. `a == o.a` in the example at the
185	// top.
186	// Note: the terminology is misleading: the comparison is symmetric, so there
187	// is no real {l/r}hs. What we want though is to have the same base on the
188	// left (resp. right), so that we can detect consecutive loads. To ensure this
189	// we put the smallest atom on the left.
190	struct BCECmp {
191	BCEAtom Lhs;
192	BCEAtom Rhs;
193	int SizeBits;
194	const ICmpInst *CmpI;
195
196	BCECmp(BCEAtom L, BCEAtom R, int SizeBits, const ICmpInst *CmpI)
197	: Lhs (std::move(L)), Rhs (std::move(R)), SizeBits(SizeBits), CmpI(CmpI) {
198	if (Rhs < Lhs) std::swap(a&: Rhs, b&: Lhs);
199	}
200	};
201
202	// A basic block with a comparison between two BCE atoms.
203	// The block might do extra work besides the atom comparison, in which case
204	// doesOtherWork() returns true. Under some conditions, the block can be
205	// split into the atom comparison part and the "other work" part
206	// (see canSplit()).
207	class BCECmpBlock {
208	public:
209	typedef SmallDenseSet<const Instruction *, `8`> InstructionSet;
210
211	BCECmpBlock(BCECmp Cmp, BasicBlock *BB, InstructionSet BlockInsts)
212	: BB(BB), BlockInsts (std::move(BlockInsts)), Cmp (std::move(Cmp)) {}
213
214	const BCEAtom &Lhs() const { return Cmp.Lhs; }
215	const BCEAtom &Rhs() const { return Cmp.Rhs; }
216	int SizeBits() const { return Cmp.SizeBits; }
217
218	// Returns true if the block does other works besides comparison.
219	bool doesOtherWork() const;
220
221	// Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
222	// instructions in the block.
223	bool canSplit(AliasAnalysis &AA) const;
224
225	// Return true if this all the relevant instructions in the BCE-cmp-block can
226	// be sunk below this instruction. By doing this, we know we can separate the
227	// BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
228	// block.
229	bool canSinkBCECmpInst(const Instruction , AliasAnalysis &AA) const*;
230
231	// We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
232	// instructions. Split the old block and move all non-BCE-cmp-insts into the
233	// new parent block.
234	void split(BasicBlock NewParent, AliasAnalysis &AA) const*;
235
236	// The basic block where this comparison happens.
237	BasicBlock *BB;
238	// Instructions relating to the BCECmp and branch.
239	InstructionSet BlockInsts;
240	// The block requires splitting.
241	bool RequireSplit = false;
242	// Original order of this block in the chain.
243	unsigned OrigOrder = `0`;
244
245	private:
246	BCECmp Cmp;
247	};
248	} // namespace
249
250	bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
251	AliasAnalysis &AA) const {
252	// If this instruction may clobber the loads and is in middle of the BCE cmp
253	// block instructions, then bail for now.
254	if (Inst->mayWriteToMemory()) {
255	auto MayClobber = [&](LoadInst *LI) {
256	// If a potentially clobbering instruction comes before the load,
257	// we can still safely sink the load.
258	return (Inst->getParent() != LI->getParent() \|\| !Inst->comesBefore(Other: LI)) &&
259	isModSet(MRI: AA.getModRefInfo(I: Inst, OptLoc: MemoryLocation::get(LI)));
260	};
261	if (MayClobber (Cmp.Lhs.LoadI) \|\| MayClobber (Cmp.Rhs.LoadI))
262	return false;
263	}
264	// Make sure this instruction does not use any of the BCE cmp block
265	// instructions as operand.
266	return llvm::none_of(Range: Inst->operands(), P: [&](const Value *Op) {
267	const Instruction *OpI = dyn_cast<Instruction>(Val: Op);
268	return OpI && BlockInsts.contains(V: OpI);
269	});
270	}
271
272	void BCECmpBlock::split(BasicBlock NewParent, AliasAnalysis &AA) const* {
273	llvm::SmallVector<Instruction *, `4`> OtherInsts;
274	for (Instruction &Inst : *BB) {
275	if (BlockInsts.count(V: &Inst))
276	continue;
277	assert(canSinkBCECmpInst(&Inst, AA) && "Split unsplittable block");
278	// This is a non-BCE-cmp-block instruction. And it can be separated
279	// from the BCE-cmp-block instruction.
280	OtherInsts.push_back(Elt: &Inst);
281	}
282
283	// Do the actual spliting.
284	for (Instruction *Inst : reverse(C&: OtherInsts))
285	Inst->moveBeforePreserving(BB&: *NewParent, I: NewParent->begin());
286	}
287
288	bool BCECmpBlock::canSplit(AliasAnalysis &AA) const {
289	for (Instruction &Inst : *BB) {
290	if (!BlockInsts.count(V: &Inst)) {
291	if (!canSinkBCECmpInst(Inst: &Inst, AA))
292	return false;
293	}
294	}
295	return true;
296	}
297
298	bool BCECmpBlock::doesOtherWork() const {
299	// TODO(courbet): Can we allow some other things ? This is very conservative.
300	// We might be able to get away with anything does not have any side
301	// effects outside of the basic block.
302	// Note: The GEPs and/or loads are not necessarily in the same block.
303	for (const Instruction &Inst : *BB) {
304	if (!BlockInsts.count(V: &Inst))
305	return true;
306	}
307	return false;
308	}
309
310	// Visit the given comparison. If this is a comparison between two valid
311	// BCE atoms, returns the comparison.
312	static std::optional<BCECmp>
313	visitICmp(const ICmpInst *const CmpI,
314	const ICmpInst::Predicate ExpectedPredicate, BaseIdentifier &BaseId) {
315	// The comparison can only be used once:
316	// - For intermediate blocks, as a branch condition.
317	// - For the final block, as an incoming value for the Phi.
318	// If there are any other uses of the comparison, we cannot merge it with
319	// other comparisons as we would create an orphan use of the value.
320	if (!CmpI->hasOneUse()) {
321	LLVM_DEBUG(dbgs() << "cmp has several uses\n");
322	return std::nullopt;
323	}
324	if (CmpI->getPredicate() != ExpectedPredicate)
325	return std::nullopt;
326	LLVM_DEBUG(dbgs() << "cmp "
327	<< (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
328	<< "\n");
329	auto Lhs = visitICmpLoadOperand(Val: CmpI->getOperand(i_nocapture: `0`), BaseId);
330	if (!Lhs.BaseId)
331	return std::nullopt;
332	auto Rhs = visitICmpLoadOperand(Val: CmpI->getOperand(i_nocapture: `1`), BaseId);
333	if (!Rhs.BaseId)
334	return std::nullopt;
335	const auto &DL = CmpI->getDataLayout();
336	return BCECmp (std::move(Lhs), std::move(Rhs),
337	DL.getTypeSizeInBits(Ty: CmpI->getOperand(i_nocapture: `0`)->getType()), CmpI);
338	}
339
340	// Visit the given comparison block. If this is a comparison between two valid
341	// BCE atoms, returns the comparison.
342	static std::optional<BCECmpBlock>
343	visitCmpBlock(Value *const Val, BasicBlock *const Block,
344	const BasicBlock *const PhiBlock, BaseIdentifier &BaseId) {
345	if (Block->empty())
346	return std::nullopt;
347	auto *Term = Block->getTerminator();
348	Value *Cond;
349	ICmpInst::Predicate ExpectedPredicate;
350	if (isa<UncondBrInst>(Val: Term)) {
351	// In this case, we expect an incoming value which is the result of the
352	// comparison. This is the last link in the chain of comparisons (note
353	// that this does not mean that this is the last incoming value, blocks
354	// can be reordered).
355	Cond = Val;
356	ExpectedPredicate = ICmpInst::ICMP_EQ;
357	} else if (auto *BranchI = dyn_cast<CondBrInst>(Val: Term)) {
358	// In this case, we expect a constant incoming value (the comparison is
359	// chained).
360	const auto *const Const = cast<ConstantInt>(Val);
361	LLVM_DEBUG(dbgs() << "const\n");
362	if (!Const->isZero())
363	return std::nullopt;
364	LLVM_DEBUG(dbgs() << "false\n");
365	assert(BranchI->getNumSuccessors() == `2` && "expecting a cond branch");
366	BasicBlock *const FalseBlock = BranchI->getSuccessor(i: `1`);
367	Cond = BranchI->getCondition();
368	ExpectedPredicate =
369	FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
370	} else
371	return std::nullopt;
372
373	auto *CmpI = dyn_cast<ICmpInst>(Val: Cond);
374	if (!CmpI)
375	return std::nullopt;
376	LLVM_DEBUG(dbgs() << "icmp\n");
377
378	std::optional<BCECmp> Result = visitICmp(CmpI, ExpectedPredicate, BaseId);
379	if (!Result)
380	return std::nullopt;
381
382	BCECmpBlock::InstructionSet BlockInsts(
383	{Result ->Lhs.LoadI, Result ->Rhs.LoadI, Result ->CmpI, Term});
384	if (Result ->Lhs.GEP)
385	BlockInsts.insert(V: Result ->Lhs.GEP);
386	if (Result ->Rhs.GEP)
387	BlockInsts.insert(V: Result ->Rhs.GEP);
388	return BCECmpBlock (std::move(*Result), Block, BlockInsts);
389	}
390
391	static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
392	BCECmpBlock &&Comparison) {
393	LLVM_DEBUG(dbgs() << "Block '" << Comparison.BB->getName()
394	<< "': Found cmp of " << Comparison.SizeBits()
395	<< " bits between " << Comparison.Lhs().BaseId << " + "
396	<< Comparison.Lhs().Offset << " and "
397	<< Comparison.Rhs().BaseId << " + "
398	<< Comparison.Rhs().Offset << "\n");
399	LLVM_DEBUG(dbgs() << "\n");
400	Comparison.OrigOrder = Comparisons.size();
401	Comparisons.push_back(x: std::move(Comparison));
402	}
403
404	namespace {
405	// A chain of comparisons.
406	class BCECmpChain {
407	public:
408	using ContiguousBlocks = std::vector<BCECmpBlock>;
409
410	BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
411	AliasAnalysis &AA);
412
413	bool simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
414	DomTreeUpdater &DTU);
415
416	bool atLeastOneMerged() const {
417	return any_of(Range: MergedBlocks_,
418	P: [](const auto &Blocks) { return Blocks.size() > `1`; });
419	}
420
421	private:
422	PHINode &Phi_;
423	// The list of all blocks in the chain, grouped by contiguity.
424	std::vector<ContiguousBlocks> MergedBlocks_;
425	// The original entry block (before sorting);
426	BasicBlock *EntryBlock_;
427	};
428	} // namespace
429
430	static bool areContiguous(const BCECmpBlock &First, const BCECmpBlock &Second) {
431	return First.Lhs().BaseId == Second.Lhs().BaseId &&
432	First.Rhs().BaseId == Second.Rhs().BaseId &&
433	First.Lhs().Offset + First.SizeBits() / `8` == Second.Lhs().Offset &&
434	First.Rhs().Offset + First.SizeBits() / `8` == Second.Rhs().Offset;
435	}
436
437	static unsigned getMinOrigOrder(const BCECmpChain::ContiguousBlocks &Blocks) {
438	unsigned MinOrigOrder = std::numeric_limits<unsigned>::max();
439	for (const BCECmpBlock &Block : Blocks)
440	MinOrigOrder = std::min(a: MinOrigOrder, b: Block.OrigOrder);
441	return MinOrigOrder;
442	}
443
444	/// Given a chain of comparison blocks, groups the blocks into contiguous
445	/// ranges that can be merged together into a single comparison.
446	static std::vector<BCECmpChain::ContiguousBlocks>
447	mergeBlocks(std::vector<BCECmpBlock> &&Blocks) {
448	std::vector<BCECmpChain::ContiguousBlocks> MergedBlocks;
449
450	// Sort to detect continuous offsets.
451	llvm::sort(C&: Blocks,
452	Comp: [](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) {
453	return std::tie(args: LhsBlock.Lhs(), args: LhsBlock.Rhs()) <
454	std::tie(args: RhsBlock.Lhs(), args: RhsBlock.Rhs());
455	});
456
457	BCECmpChain::ContiguousBlocks LastMergedBlock = nullptr*;
458	for (BCECmpBlock &Block : Blocks) {
459	if (!LastMergedBlock \|\| !areContiguous(First: LastMergedBlock->back(), Second: Block)) {
460	MergedBlocks.emplace_back();
461	LastMergedBlock = &MergedBlocks.back();
462	} else {
463	LLVM_DEBUG(dbgs() << "Merging block " << Block.BB->getName() << " into "
464	<< LastMergedBlock->back().BB->getName() << "\n");
465	}
466	LastMergedBlock->push_back(x: std::move(Block));
467	}
468
469	// While we allow reordering for merging, do not reorder unmerged comparisons.
470	// Doing so may introduce branch on poison.
471	llvm::sort(C&: MergedBlocks, Comp: [](const BCECmpChain::ContiguousBlocks &LhsBlocks,
472	const BCECmpChain::ContiguousBlocks &RhsBlocks) {
473	return getMinOrigOrder(Blocks: LhsBlocks) < getMinOrigOrder(Blocks: RhsBlocks);
474	});
475
476	return MergedBlocks;
477	}
478
479	BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
480	AliasAnalysis &AA)
481	: Phi_(Phi) {
482	assert(!Blocks.empty() && "a chain should have at least one block");
483	// Now look inside blocks to check for BCE comparisons.
484	std::vector<BCECmpBlock> Comparisons;
485	BaseIdentifier BaseId;
486	for (BasicBlock *const Block : Blocks) {
487	assert(Block && "invalid block");
488	if (Block->hasAddressTaken()) {
489	LLVM_DEBUG(dbgs() << "cannot merge blocks with blockaddress\n");
490	return;
491	}
492	std::optional<BCECmpBlock> Comparison = visitCmpBlock(
493	Val: Phi.getIncomingValueForBlock(BB: Block), Block, PhiBlock: Phi.getParent(), BaseId);
494	if (!Comparison) {
495	LLVM_DEBUG(dbgs() << "chain with invalid BCECmpBlock, no merge.\n");
496	return;
497	}
498	if (Comparison ->doesOtherWork()) {
499	LLVM_DEBUG(dbgs() << "block '" << Comparison->BB->getName()
500	<< "' does extra work besides compare\n");
501	if (Comparisons.empty()) {
502	// This is the initial block in the chain, in case this block does other
503	// work, we can try to split the block and move the irrelevant
504	// instructions to the predecessor.
505	//
506	// If this is not the initial block in the chain, splitting it wont
507	// work.
508	//
509	// As once split, there will still be instructions before the BCE cmp
510	// instructions that do other work in program order, i.e. within the
511	// chain before sorting. Unless we can abort the chain at this point
512	// and start anew.
513	//
514	// NOTE: we only handle blocks a with single predecessor for now.
515	if (Comparison ->canSplit(AA)) {
516	LLVM_DEBUG(dbgs()
517	<< "Split initial block '" << Comparison->BB->getName()
518	<< "' that does extra work besides compare\n");
519	Comparison ->RequireSplit = true;
520	enqueueBlock(Comparisons, Comparison: std::move(*Comparison));
521	} else {
522	LLVM_DEBUG(dbgs()
523	<< "ignoring initial block '" << Comparison->BB->getName()
524	<< "' that does extra work besides compare\n");
525	}
526	continue;
527	}
528	// TODO(courbet): Right now we abort the whole chain. We could be
529	// merging only the blocks that don't do other work and resume the
530	// chain from there. For example:
531	// if (a[0] == b[0]) { // bb1
532	// if (a[1] == b[1]) { // bb2
533	// some_value = 3; //bb3
534	// if (a[2] == b[2]) { //bb3
535	// do a ton of stuff //bb4
536	// }
537	// }
538	// }
539	//
540	// This is:
541	//
542	// bb1 --eq--> bb2 --eq--> bb3 -eq--> bb4 --+*
543	// \ \ \ \
544	// ne ne ne \
545	// \ \ \ v
546	// +------------+-----------+----------> bb_phi
547	//
548	// We can only merge the first two comparisons, because bb3 does*
549	// "other work" (setting some_value to 3).
550	// We could still merge bb1 and bb2 though.
551	return;
552	}
553	enqueueBlock(Comparisons, Comparison: std::move(*Comparison));
554	}
555
556	// It is possible we have no suitable comparison to merge.
557	if (Comparisons.empty()) {
558	LLVM_DEBUG(dbgs() << "chain with no BCE basic blocks, no merge\n");
559	return;
560	}
561	EntryBlock_ = Comparisons [`0`].BB;
562	MergedBlocks_ = mergeBlocks(Blocks: std::move(Comparisons));
563	}
564
565	namespace {
566
567	// A class to compute the name of a set of merged basic blocks.
568	// This is optimized for the common case of no block names.
569	class MergedBlockName {
570	// Storage for the uncommon case of several named blocks.
571	SmallString<`16`> Scratch;
572
573	public:
574	explicit MergedBlockName(ArrayRef<BCECmpBlock> Comparisons)
575	: Name(makeName(Comparisons)) {}
576	const StringRef Name;
577
578	private:
579	StringRef makeName(ArrayRef<BCECmpBlock> Comparisons) {
580	assert(!Comparisons.empty() && "no basic block");
581	// Fast path: only one block, or no names at all.
582	if (Comparisons.size() == `1`)
583	return Comparisons [`0`].BB->getName();
584	const int size = std::accumulate(first: Comparisons.begin(), last: Comparisons.end(), init: `0`,
585	binary_op: [](int i, const BCECmpBlock &Cmp) {
586	return i + Cmp.BB->getName().size();
587	});
588	if (size == `0`)
589	return StringRef ("", `0`);
590
591	// Slow path: at least two blocks, at least one block with a name.
592	Scratch.clear();
593	// We'll have `size` bytes for name and `Comparisons.size() - 1` bytes for
594	// separators.
595	Scratch.reserve(N: size + Comparisons.size() - `1`);
596	const auto append = [this](StringRef str) {
597	Scratch.append(in_start: str.begin(), in_end: str.end());
598	};
599	append(Comparisons [`0`].BB->getName());
600	for (int I = `1`, E = Comparisons.size(); I < E; ++I) {
601	const BasicBlock *const BB = Comparisons [I].BB;
602	if (!BB->getName().empty()) {
603	append("+");
604	append(BB->getName());
605	}
606	}
607	return Scratch.str();
608	}
609	};
610	} // namespace
611
612	/// Determine the branch weights for the resulting conditional branch, resulting
613	/// after merging \p Comparisons.
614	static std::optional<SmallVector<uint32_t, `2`>>
615	computeMergedBranchWeights(ArrayRef<BCECmpBlock> Comparisons) {
616	assert(!Comparisons.empty());
617	if (ProfcheckDisableMetadataFixes)
618	return std::nullopt;
619	if (Comparisons.size() == `1`) {
620	SmallVector<uint32_t, `2`> Weights;
621	if (!extractBranchWeights(I: *Comparisons [`0`].BB->getTerminator(), Weights))
622	return std::nullopt;
623	return Weights;
624	}
625	// The probability to go to the phi block is the disjunction of the
626	// probability to go to the phi block from the individual Comparisons. We'll
627	// swap the weights because `getDisjunctionWeights` computes the disjunction
628	// for the "true" branch, then swap back.
629	SmallVector<uint64_t, `2`> Weights{`0`, `1`};
630	// At this point, Weights encodes "0-probability" for the "true" side.
631	for (const auto &C : Comparisons) {
632	SmallVector<uint32_t, `2`> W;
633	if (!extractBranchWeights(I: *C.BB->getTerminator(), Weights&: W))
634	return std::nullopt;
635
636	std::swap(a&: W [`0`], b&: W [`1`]);
637	Weights = getDisjunctionWeights(B1: Weights, B2: W);
638	}
639	std::swap(a&: Weights [`0`], b&: Weights [`1`]);
640	return fitWeights(Weights);
641	}
642
643	// Merges the given contiguous comparison blocks into one memcmp block.
644	static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
645	BasicBlock *const InsertBefore,
646	BasicBlock *const NextCmpBlock,
647	PHINode &Phi, const TargetLibraryInfo &TLI,
648	AliasAnalysis &AA, DomTreeUpdater &DTU) {
649	assert(!Comparisons.empty() && "merging zero comparisons");
650	LLVMContext &Context = NextCmpBlock->getContext();
651	const BCECmpBlock &FirstCmp = Comparisons [`0`];
652
653	// Create a new cmp block before next cmp block.
654	BasicBlock *const BB =
655	BasicBlock::Create(Context, Name: MergedBlockName (Comparisons).Name,
656	Parent: NextCmpBlock->getParent(), InsertBefore);
657	IRBuilder<> Builder(BB);
658	// Add the GEPs from the first BCECmpBlock.
659	Value Lhs, Rhs;
660	if (FirstCmp.Lhs().GEP)
661	Lhs = Builder.Insert(I: FirstCmp.Lhs().GEP->clone());
662	else
663	Lhs = FirstCmp.Lhs().LoadI->getPointerOperand();
664	if (FirstCmp.Rhs().GEP)
665	Rhs = Builder.Insert(I: FirstCmp.Rhs().GEP->clone());
666	else
667	Rhs = FirstCmp.Rhs().LoadI->getPointerOperand();
668
669	Value IsEqual = nullptr*;
670	LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons -> "
671	<< BB->getName() << "\n");
672
673	// If there is one block that requires splitting, we do it now, i.e.
674	// just before we know we will collapse the chain. The instructions
675	// can be executed before any of the instructions in the chain.
676	const auto *ToSplit = llvm::find_if(
677	Range&: Comparisons, P: [](const BCECmpBlock &B) { return B.RequireSplit; });
678	if (ToSplit != Comparisons.end()) {
679	LLVM_DEBUG(dbgs() << "Splitting non_BCE work to header\n");
680	ToSplit->split(NewParent: BB, AA);
681	}
682
683	if (Comparisons.size() == `1`) {
684	LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n");
685	// Use clone to keep the metadata
686	Instruction *const LhsLoad = Builder.Insert(I: FirstCmp.Lhs().LoadI->clone());
687	Instruction *const RhsLoad = Builder.Insert(I: FirstCmp.Rhs().LoadI->clone());
688	LhsLoad->replaceUsesOfWith(From: LhsLoad->getOperand(i: `0`), To: Lhs);
689	RhsLoad->replaceUsesOfWith(From: RhsLoad->getOperand(i: `0`), To: Rhs);
690	// There are no blocks to merge, just do the comparison.
691	// If we condition on this IsEqual, we already have its probabilities.
692	IsEqual = Builder.CreateICmpEQ(LHS: LhsLoad, RHS: RhsLoad);
693	} else {
694	const unsigned TotalSizeBits = std::accumulate(
695	first: Comparisons.begin(), last: Comparisons.end(), init: `0u`,
696	binary_op: [](int Size, const BCECmpBlock &C) { return Size + C.SizeBits(); });
697
698	// memcmp expects a 'size_t' argument and returns 'int'.
699	unsigned SizeTBits = TLI.getSizeTSize(M: *Phi.getModule());
700	unsigned IntBits = TLI.getIntSize();
701
702	// Create memcmp() == 0.
703	const auto &DL = Phi.getDataLayout();
704	Value *const MemCmpCall = emitMemCmp(
705	Ptr1: Lhs, Ptr2: Rhs,
706	Len: ConstantInt::get(Ty: Builder.getIntNTy(N: SizeTBits), V: TotalSizeBits / `8`),
707	B&: Builder, DL, TLI: &TLI);
708	IsEqual = Builder.CreateICmpEQ(
709	LHS: MemCmpCall, RHS: ConstantInt::get(Ty: Builder.getIntNTy(N: IntBits), V: `0`));
710	}
711
712	BasicBlock *const PhiBB = Phi.getParent();
713	// Add a branch to the next basic block in the chain.
714	if (NextCmpBlock == PhiBB) {
715	// Continue to phi, passing it the comparison result.
716	Builder.CreateBr(Dest: PhiBB);
717	Phi.addIncoming(V: IsEqual, BB);
718	DTU.applyUpdates(Updates: {{DominatorTree::Insert, BB, PhiBB}});
719	} else {
720	// Continue to next block if equal, exit to phi else.
721	auto *BI = Builder.CreateCondBr(Cond: IsEqual, True: NextCmpBlock, False: PhiBB);
722	if (auto BranchWeights = computeMergedBranchWeights(Comparisons))
723	setBranchWeights(I&: BI, Weights: BranchWeights.value(), /IsExpected=/*false);
724	Phi.addIncoming(V: ConstantInt::getFalse(Context), BB);
725	DTU.applyUpdates(Updates: {{DominatorTree::Insert, BB, NextCmpBlock},
726	{DominatorTree::Insert, BB, PhiBB}});
727	}
728	return BB;
729	}
730
731	bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
732	DomTreeUpdater &DTU) {
733	assert(atLeastOneMerged() && "simplifying trivial BCECmpChain");
734	LLVM_DEBUG(dbgs() << "Simplifying comparison chain starting at block "
735	<< EntryBlock_->getName() << "\n");
736
737	// Effectively merge blocks. We go in the reverse direction from the phi block
738	// so that the next block is always available to branch to.
739	BasicBlock *InsertBefore = EntryBlock_;
740	BasicBlock *NextCmpBlock = Phi_.getParent();
741	for (const auto &Blocks : reverse(C&: MergedBlocks_)) {
742	InsertBefore = NextCmpBlock = mergeComparisons(
743	Comparisons: Blocks, InsertBefore, NextCmpBlock, Phi&: Phi_, TLI, AA, DTU);
744	}
745
746	// Replace the original cmp chain with the new cmp chain by pointing all
747	// predecessors of EntryBlock_ to NextCmpBlock instead. This makes all cmp
748	// blocks in the old chain unreachable.
749	while (!pred_empty(BB: EntryBlock_)) {
750	BasicBlock* const Pred = *pred_begin(BB: EntryBlock_);
751	LLVM_DEBUG(dbgs() << "Updating jump into old chain from " << Pred->getName()
752	<< "\n");
753	Pred->getTerminator()->replaceUsesOfWith(From: EntryBlock_, To: NextCmpBlock);
754	DTU.applyUpdates(Updates: {{DominatorTree::Delete, Pred, EntryBlock_},
755	{DominatorTree::Insert, Pred, NextCmpBlock}});
756	}
757
758	// If the old cmp chain was the function entry, we need to update the function
759	// entry.
760	const bool ChainEntryIsFnEntry = EntryBlock_->isEntryBlock();
761	if (ChainEntryIsFnEntry && DTU.hasDomTree()) {
762	LLVM_DEBUG(dbgs() << "Changing function entry from "
763	<< EntryBlock_->getName() << " to "
764	<< NextCmpBlock->getName() << "\n");
765	DTU.getDomTree().setNewRoot(NextCmpBlock);
766	DTU.applyUpdates(Updates: {{DominatorTree::Delete, NextCmpBlock, EntryBlock_}});
767	}
768	EntryBlock_ = nullptr;
769
770	// Delete merged blocks. This also removes incoming values in phi.
771	SmallVector<BasicBlock *, `16`> DeadBlocks;
772	for (const auto &Blocks : MergedBlocks_) {
773	for (const BCECmpBlock &Block : Blocks) {
774	LLVM_DEBUG(dbgs() << "Deleting merged block " << Block.BB->getName()
775	<< "\n");
776	DeadBlocks.push_back(Elt: Block.BB);
777	}
778	}
779	DeleteDeadBlocks(BBs: DeadBlocks, DTU: &DTU);
780
781	MergedBlocks_.clear();
782	return true;
783	}
784
785	static std::vector<BasicBlock *>
786	getOrderedBlocks(PHINode &Phi, BasicBlock *const LastBlock, int NumBlocks) {
787	// Walk up from the last block to find other blocks.
788	std::vector<BasicBlock *> Blocks(NumBlocks);
789	assert(LastBlock && "invalid last block");
790	BasicBlock *CurBlock = LastBlock;
791	for (int BlockIndex = NumBlocks - `1`; BlockIndex > `0`; --BlockIndex) {
792	if (CurBlock->hasAddressTaken()) {
793	// Somebody is jumping to the block through an address, all bets are
794	// off.
795	LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
796	<< " has its address taken\n");
797	return {};
798	}
799	Blocks [BlockIndex] = CurBlock;
800	auto *SinglePredecessor = CurBlock->getSinglePredecessor();
801	if (!SinglePredecessor) {
802	// The block has two or more predecessors.
803	LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
804	<< " has two or more predecessors\n");
805	return {};
806	}
807	if (Phi.getBasicBlockIndex(BB: SinglePredecessor) < `0`) {
808	// The block does not link back to the phi.
809	LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
810	<< " does not link back to the phi\n");
811	return {};
812	}
813	CurBlock = SinglePredecessor;
814	}
815	Blocks [`0`] = CurBlock;
816	return Blocks;
817	}
818
819	static bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI,
820	AliasAnalysis &AA, DomTreeUpdater &DTU) {
821	LLVM_DEBUG(dbgs() << "processPhi()\n");
822	if (Phi.getNumIncomingValues() <= `1`) {
823	LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
824	return false;
825	}
826	// We are looking for something that has the following structure:
827	// bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
828	// \ \ \ \
829	// ne ne ne \
830	// \ \ \ v
831	// +------------+-----------+----------> bb_phi
832	//
833	// - The last basic block (bb4 here) must branch unconditionally to bb_phi.
834	// It's the only block that contributes a non-constant value to the Phi.
835	// - All other blocks (b1, b2, b3) must have exactly two successors, one of
836	// them being the phi block.
837	// - All intermediate blocks (bb2, bb3) must have only one predecessor.
838	// - Blocks cannot do other work besides the comparison, see doesOtherWork()
839
840	// The blocks are not necessarily ordered in the phi, so we start from the
841	// last block and reconstruct the order.
842	BasicBlock LastBlock = nullptr*;
843	for (unsigned I = `0`; I < Phi.getNumIncomingValues(); ++I) {
844	if (isa<ConstantInt>(Val: Phi.getIncomingValue(i: I))) continue;
845	if (LastBlock) {
846	// There are several non-constant values.
847	LLVM_DEBUG(dbgs() << "skip: several non-constant values\n");
848	return false;
849	}
850	if (!isa<ICmpInst>(Val: Phi.getIncomingValue(i: I)) \|\|
851	cast<ICmpInst>(Val: Phi.getIncomingValue(i: I))->getParent() !=
852	Phi.getIncomingBlock(i: I)) {
853	// Non-constant incoming value is not from a cmp instruction or not
854	// produced by the last block. We could end up processing the value
855	// producing block more than once.
856	//
857	// This is an uncommon case, so we bail.
858	LLVM_DEBUG(
859	dbgs()
860	<< "skip: non-constant value not from cmp or not from last block.\n");
861	return false;
862	}
863	LastBlock = Phi.getIncomingBlock(i: I);
864	}
865	if (!LastBlock) {
866	// There is no non-constant block.
867	LLVM_DEBUG(dbgs() << "skip: no non-constant block\n");
868	return false;
869	}
870	if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
871	LLVM_DEBUG(dbgs() << "skip: last block non-phi successor\n");
872	return false;
873	}
874
875	const auto Blocks =
876	getOrderedBlocks(Phi, LastBlock, NumBlocks: Phi.getNumIncomingValues());
877	if (Blocks.empty()) return false;
878	BCECmpChain CmpChain(Blocks, Phi, AA);
879
880	if (!CmpChain.atLeastOneMerged()) {
881	LLVM_DEBUG(dbgs() << "skip: nothing merged\n");
882	return false;
883	}
884
885	return CmpChain.simplify(TLI, AA, DTU);
886	}
887
888	static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
889	const TargetTransformInfo &TTI, AliasAnalysis &AA,
890	DominatorTree *DT) {
891	LLVM_DEBUG(dbgs() << "MergeICmpsLegacyPass: " << F.getName() << "\n");
892
893	// We only try merging comparisons if the target wants to expand memcmp later.
894	// The rationale is to avoid turning small chains into memcmp calls.
895	if (!TTI.enableMemCmpExpansion(OptSize: F.hasOptSize(), IsZeroCmp: true))
896	return false;
897
898	// If we don't have memcmp avaiable we can't emit calls to it.
899	if (!TLI.has(F: LibFunc_memcmp))
900	return false;
901
902	DomTreeUpdater DTU(DT, /PostDominatorTree/ nullptr,
903	DomTreeUpdater::UpdateStrategy::Eager);
904
905	bool MadeChange = false;
906
907	for (BasicBlock &BB : llvm::drop_begin(RangeOrContainer&: F)) {
908	// A Phi operation is always first in a basic block.
909	if (auto *const Phi = dyn_cast<PHINode>(Val: &*BB.begin()))
910	MadeChange \|= processPhi(Phi&: *Phi, TLI, AA, DTU);
911	}
912
913	return MadeChange;
914	}
915
916	namespace {
917	class MergeICmpsLegacyPass : public FunctionPass {
918	public:
919	static char ID;
920
921	MergeICmpsLegacyPass() : FunctionPass (ID) {
922	initializeMergeICmpsLegacyPassPass(*PassRegistry::getPassRegistry());
923	}
924
925	bool runOnFunction(Function &F) override {
926	if (skipFunction(F)) return false;
927	const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
928	const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
929	// MergeICmps does not need the DominatorTree, but we update it if it's
930	// already available.
931	auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
932	auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
933	return runImpl(F, TLI, TTI, AA, DT: DTWP ? &DTWP->getDomTree() : nullptr);
934	}
935
936	private:
937	void getAnalysisUsage(AnalysisUsage &AU) const override {
938	AU.addRequired<TargetLibraryInfoWrapperPass>();
939	AU.addRequired<TargetTransformInfoWrapperPass>();
940	AU.addRequired<AAResultsWrapperPass>();
941	AU.addPreserved<GlobalsAAWrapperPass>();
942	AU.addPreserved<DominatorTreeWrapperPass>();
943	}
944	};
945
946	} // namespace
947
948	char MergeICmpsLegacyPass::ID = `0`;
949	INITIALIZE_PASS_BEGIN(MergeICmpsLegacyPass, "mergeicmps",
950	"Merge contiguous icmps into a memcmp", false, false)
951	INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
952	INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
953	INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
954	INITIALIZE_PASS_END(MergeICmpsLegacyPass, "mergeicmps",
955	"Merge contiguous icmps into a memcmp", false, false)
956
957	Pass llvm::createMergeICmpsLegacyPass() { return* new MergeICmpsLegacyPass (); }
958
959	PreservedAnalyses MergeICmpsPass::run(Function &F,
960	FunctionAnalysisManager &AM) {
961	auto &TLI = AM.getResult<TargetLibraryAnalysis>(IR&: F);
962	auto &TTI = AM.getResult<TargetIRAnalysis>(IR&: F);
963	auto &AA = AM.getResult<AAManager>(IR&: F);
964	auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(IR&: F);
965	const bool MadeChanges = runImpl(F, TLI, TTI, AA, DT);
966	if (!MadeChanges)
967	return PreservedAnalyses::all();
968	PreservedAnalyses PA;
969	PA.preserve<DominatorTreeAnalysis>();
970	return PA;
971	}
972

Browse the source code of llvm_projects/llvm/lib/Transforms/Scalar/MergeICmps.cpp