//===- AggressiveInstCombine.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the aggressive expression pattern combiner classes.
// Currently, it handles expression patterns for:
//  * Truncate instructions
//  * Guarded funnel shifts and rotates
//  * Any/all-bits-set, popcount, table-based cttz and high-multiply idioms
//  * Saturating fp-to-int conversions
//  * Merging of consecutive loads and stores
//  * Inlining of selected library calls (sqrt, strcmp/strncmp, memchr)
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
#include "AggressiveInstCombineInternal.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;
using namespace PatternMatch;

#define DEBUG_TYPE "aggressive-instcombine"

namespace llvm {
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
}

STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
STATISTIC(NumGuardedRotates,
          "Number of guarded rotates transformed into funnel shifts");
STATISTIC(NumGuardedFunnelShifts,
          "Number of guarded funnel shifts transformed into funnel shifts");
STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");

static cl::opt<unsigned> MaxInstrsToScan(
    "aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden,
    cl::desc("Max number of instructions to scan for aggressive instcombine."));

static cl::opt<unsigned> StrNCmpInlineThreshold(
    "strncmp-inline-threshold", cl::init(3), cl::Hidden,
    cl::desc("The maximum length of a constant string for a builtin string cmp "
             "call eligible for inlining. The default value is 3."));

static cl::opt<unsigned>
    MemChrInlineThreshold("memchr-inline-threshold", cl::init(3), cl::Hidden,
                          cl::desc("The maximum length of a constant string to "
                                   "inline a memchr call."));
70
/// Match a pattern for a bitwise funnel/rotate operation that partially guards
/// against undefined behavior by branching around the funnel-shift/rotation
/// when the shift amount is 0.
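///
/// A typical source pattern (illustrative), a rotate guarded against a
/// shift-by-zero and lowered with a branch and phi:
///   unsigned rot(unsigned x, unsigned n) {
///     return n == 0 ? x : (x << n) | (x >> (32 - n));
///   }
/// is folded into a single call to llvm.fshl.i32(x, x, n).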
74static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
75 if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
76 return false;
77
78 // As with the one-use checks below, this is not strictly necessary, but we
79 // are being cautious to avoid potential perf regressions on targets that
80 // do not actually have a funnel/rotate instruction (where the funnel shift
81 // would be expanded back into math/shift/logic ops).
82 if (!isPowerOf2_32(Value: I.getType()->getScalarSizeInBits()))
83 return false;
84
85 // Match V to funnel shift left/right and capture the source operands and
86 // shift amount.
87 auto matchFunnelShift = [](Value *V, Value *&ShVal0, Value *&ShVal1,
88 Value *&ShAmt) {
89 unsigned Width = V->getType()->getScalarSizeInBits();
90
    // fshl(ShVal0, ShVal1, ShAmt)
    //  == (ShVal0 << ShAmt) | (ShVal1 >> (Width - ShAmt))
93 if (match(V, P: m_OneUse(SubPattern: m_c_Or(
94 L: m_Shl(L: m_Value(V&: ShVal0), R: m_Value(V&: ShAmt)),
95 R: m_LShr(L: m_Value(V&: ShVal1), R: m_Sub(L: m_SpecificInt(V: Width),
96 R: m_Deferred(V: ShAmt))))))) {
97 return Intrinsic::fshl;
98 }
99
100 // fshr(ShVal0, ShVal1, ShAmt)
101 // == (ShVal0 >> ShAmt) | (ShVal1 << (Width - ShAmt))
102 if (match(V,
103 P: m_OneUse(SubPattern: m_c_Or(L: m_Shl(L: m_Value(V&: ShVal0), R: m_Sub(L: m_SpecificInt(V: Width),
104 R: m_Value(V&: ShAmt))),
105 R: m_LShr(L: m_Value(V&: ShVal1), R: m_Deferred(V: ShAmt)))))) {
106 return Intrinsic::fshr;
107 }
108
109 return Intrinsic::not_intrinsic;
110 };
111
112 // One phi operand must be a funnel/rotate operation, and the other phi
113 // operand must be the source value of that funnel/rotate operation:
114 // phi [ rotate(RotSrc, ShAmt), FunnelBB ], [ RotSrc, GuardBB ]
115 // phi [ fshl(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal0, GuardBB ]
116 // phi [ fshr(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal1, GuardBB ]
117 PHINode &Phi = cast<PHINode>(Val&: I);
118 unsigned FunnelOp = 0, GuardOp = 1;
119 Value *P0 = Phi.getOperand(i_nocapture: 0), *P1 = Phi.getOperand(i_nocapture: 1);
120 Value *ShVal0, *ShVal1, *ShAmt;
121 Intrinsic::ID IID = matchFunnelShift(P0, ShVal0, ShVal1, ShAmt);
122 if (IID == Intrinsic::not_intrinsic ||
123 (IID == Intrinsic::fshl && ShVal0 != P1) ||
124 (IID == Intrinsic::fshr && ShVal1 != P1)) {
125 IID = matchFunnelShift(P1, ShVal0, ShVal1, ShAmt);
126 if (IID == Intrinsic::not_intrinsic ||
127 (IID == Intrinsic::fshl && ShVal0 != P0) ||
128 (IID == Intrinsic::fshr && ShVal1 != P0))
129 return false;
130 assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
131 "Pattern must match funnel shift left or right");
132 std::swap(a&: FunnelOp, b&: GuardOp);
133 }
134
135 // The incoming block with our source operand must be the "guard" block.
136 // That must contain a cmp+branch to avoid the funnel/rotate when the shift
137 // amount is equal to 0. The other incoming block is the block with the
138 // funnel/rotate.
139 BasicBlock *GuardBB = Phi.getIncomingBlock(i: GuardOp);
140 BasicBlock *FunnelBB = Phi.getIncomingBlock(i: FunnelOp);
141 Instruction *TermI = GuardBB->getTerminator();
142
143 // Ensure that the shift values dominate each block.
144 if (!DT.dominates(Def: ShVal0, User: TermI) || !DT.dominates(Def: ShVal1, User: TermI))
145 return false;
146
147 BasicBlock *PhiBB = Phi.getParent();
148 if (!match(V: TermI, P: m_Br(C: m_SpecificICmp(MatchPred: CmpInst::ICMP_EQ, L: m_Specific(V: ShAmt),
149 R: m_ZeroInt()),
150 T: m_SpecificBB(BB: PhiBB), F: m_SpecificBB(BB: FunnelBB))))
151 return false;
152
153 IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());
154
155 if (ShVal0 == ShVal1)
156 ++NumGuardedRotates;
157 else
158 ++NumGuardedFunnelShifts;
159
  // If this is not a rotate then the guard was blocking poison from the
  // 'shift-by-zero' operand, but a funnel shift won't - so freeze it.
162 bool IsFshl = IID == Intrinsic::fshl;
163 if (ShVal0 != ShVal1) {
164 if (IsFshl && !llvm::isGuaranteedNotToBePoison(V: ShVal1))
165 ShVal1 = Builder.CreateFreeze(V: ShVal1);
166 else if (!IsFshl && !llvm::isGuaranteedNotToBePoison(V: ShVal0))
167 ShVal0 = Builder.CreateFreeze(V: ShVal0);
168 }
169
170 // We matched a variation of this IR pattern:
171 // GuardBB:
172 // %cmp = icmp eq i32 %ShAmt, 0
173 // br i1 %cmp, label %PhiBB, label %FunnelBB
174 // FunnelBB:
175 // %sub = sub i32 32, %ShAmt
176 // %shr = lshr i32 %ShVal1, %sub
177 // %shl = shl i32 %ShVal0, %ShAmt
178 // %fsh = or i32 %shr, %shl
179 // br label %PhiBB
180 // PhiBB:
181 // %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ]
182 // -->
183 // llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt)
184 Phi.replaceAllUsesWith(
185 V: Builder.CreateIntrinsic(ID: IID, Types: Phi.getType(), Args: {ShVal0, ShVal1, ShAmt}));
186 return true;
187}
188
189/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
190/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
191/// of 'and' ops, then we also need to capture the fact that we saw an
192/// "and X, 1", so that's an extra return value for that case.
193namespace {
194struct MaskOps {
195 Value *Root = nullptr;
196 APInt Mask;
197 bool MatchAndChain;
198 bool FoundAnd1 = false;
199
200 MaskOps(unsigned BitWidth, bool MatchAnds)
201 : Mask(APInt::getZero(numBits: BitWidth)), MatchAndChain(MatchAnds) {}
202};
203} // namespace
204
205/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
206/// chain of 'and' or 'or' instructions looking for shift ops of a common source
207/// value. Examples:
208/// or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
209/// returns { X, 0x129 }
210/// and (and (X >> 1), 1), (X >> 4)
211/// returns { X, 0x12 }
212static bool matchAndOrChain(Value *V, MaskOps &MOps) {
213 Value *Op0, *Op1;
214 if (MOps.MatchAndChain) {
215 // Recurse through a chain of 'and' operands. This requires an extra check
216 // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
217 // in the chain to know that all of the high bits are cleared.
218 if (match(V, P: m_And(L: m_Value(V&: Op0), R: m_One()))) {
219 MOps.FoundAnd1 = true;
220 return matchAndOrChain(V: Op0, MOps);
221 }
222 if (match(V, P: m_And(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))
223 return matchAndOrChain(V: Op0, MOps) && matchAndOrChain(V: Op1, MOps);
224 } else {
225 // Recurse through a chain of 'or' operands.
226 if (match(V, P: m_Or(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))
227 return matchAndOrChain(V: Op0, MOps) && matchAndOrChain(V: Op1, MOps);
228 }
229
230 // We need a shift-right or a bare value representing a compare of bit 0 of
231 // the original source operand.
232 Value *Candidate;
233 const APInt *BitIndex = nullptr;
234 if (!match(V, P: m_LShr(L: m_Value(V&: Candidate), R: m_APInt(Res&: BitIndex))))
235 Candidate = V;
236
237 // Initialize result source operand.
238 if (!MOps.Root)
239 MOps.Root = Candidate;
240
  // If the shift constant is out-of-range, this code hasn't been simplified.
242 if (BitIndex && BitIndex->uge(RHS: MOps.Mask.getBitWidth()))
243 return false;
244
245 // Fill in the mask bit derived from the shift constant.
246 MOps.Mask.setBit(BitIndex ? BitIndex->getZExtValue() : 0);
247 return MOps.Root == Candidate;
248}
249
250/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
251/// These will include a chain of 'or' or 'and'-shifted bits from a
252/// common source value:
253/// and (or (lshr X, C), ...), 1 --> (X & CMask) != 0
254/// and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
255/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
256/// that differ only with a final 'not' of the result. We expect that final
257/// 'not' to be folded with the compare that we create here (invert predicate).
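///
/// For example (illustrative, i32):
///   %s3 = lshr i32 %x, 3
///   %s5 = lshr i32 %x, 5
///   %or = or i32 %s3, %s5
///   %r  = and i32 %or, 1
/// becomes a test of (%x & 0x28) != 0, zero-extended back to i32.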
258static bool foldAnyOrAllBitsSet(Instruction &I) {
259 // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
260 // final "and X, 1" instruction must be the final op in the sequence.
261 bool MatchAllBitsSet;
262 if (match(V: &I, P: m_c_And(L: m_OneUse(SubPattern: m_And(L: m_Value(), R: m_Value())), R: m_Value())))
263 MatchAllBitsSet = true;
264 else if (match(V: &I, P: m_And(L: m_OneUse(SubPattern: m_Or(L: m_Value(), R: m_Value())), R: m_One())))
265 MatchAllBitsSet = false;
266 else
267 return false;
268
269 MaskOps MOps(I.getType()->getScalarSizeInBits(), MatchAllBitsSet);
270 if (MatchAllBitsSet) {
271 if (!matchAndOrChain(V: cast<BinaryOperator>(Val: &I), MOps) || !MOps.FoundAnd1)
272 return false;
273 } else {
274 if (!matchAndOrChain(V: cast<BinaryOperator>(Val: &I)->getOperand(i_nocapture: 0), MOps))
275 return false;
276 }
277
278 // The pattern was found. Create a masked compare that replaces all of the
279 // shift and logic ops.
280 IRBuilder<> Builder(&I);
281 Constant *Mask = ConstantInt::get(Ty: I.getType(), V: MOps.Mask);
282 Value *And = Builder.CreateAnd(LHS: MOps.Root, RHS: Mask);
283 Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(LHS: And, RHS: Mask)
284 : Builder.CreateIsNotNull(Arg: And);
285 Value *Zext = Builder.CreateZExt(V: Cmp, DestTy: I.getType());
286 I.replaceAllUsesWith(V: Zext);
287 ++NumAnyOrAllBitsSet;
288 return true;
289}
290
// Try to recognize the function below as a popcount intrinsic.
292// This is the "best" algorithm from
293// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
294// Also used in TargetLowering::expandCTPOP().
295//
296// int popcount(unsigned int i) {
297// i = i - ((i >> 1) & 0x55555555);
298// i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
299// i = ((i + (i >> 4)) & 0x0F0F0F0F);
300// return (i * 0x01010101) >> 24;
301// }
302static bool tryToRecognizePopCount(Instruction &I) {
303 if (I.getOpcode() != Instruction::LShr)
304 return false;
305
306 Type *Ty = I.getType();
307 if (!Ty->isIntOrIntVectorTy())
308 return false;
309
310 unsigned Len = Ty->getScalarSizeInBits();
311 // FIXME: fix Len == 8 and other irregular type lengths.
312 if (!(Len <= 128 && Len > 8 && Len % 8 == 0))
313 return false;
314
315 APInt Mask55 = APInt::getSplat(NewLen: Len, V: APInt(8, 0x55));
316 APInt Mask33 = APInt::getSplat(NewLen: Len, V: APInt(8, 0x33));
317 APInt Mask0F = APInt::getSplat(NewLen: Len, V: APInt(8, 0x0F));
318 APInt Mask01 = APInt::getSplat(NewLen: Len, V: APInt(8, 0x01));
319 APInt MaskShift = APInt(Len, Len - 8);
320
321 Value *Op0 = I.getOperand(i: 0);
322 Value *Op1 = I.getOperand(i: 1);
323 Value *MulOp0;
324 // Matching "(i * 0x01010101...) >> 24".
325 if ((match(V: Op0, P: m_Mul(L: m_Value(V&: MulOp0), R: m_SpecificInt(V: Mask01)))) &&
326 match(V: Op1, P: m_SpecificInt(V: MaskShift))) {
327 Value *ShiftOp0;
328 // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
329 if (match(V: MulOp0, P: m_And(L: m_c_Add(L: m_LShr(L: m_Value(V&: ShiftOp0), R: m_SpecificInt(V: 4)),
330 R: m_Deferred(V: ShiftOp0)),
331 R: m_SpecificInt(V: Mask0F)))) {
332 Value *AndOp0;
333 // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)".
334 if (match(V: ShiftOp0,
335 P: m_c_Add(L: m_And(L: m_Value(V&: AndOp0), R: m_SpecificInt(V: Mask33)),
336 R: m_And(L: m_LShr(L: m_Deferred(V: AndOp0), R: m_SpecificInt(V: 2)),
337 R: m_SpecificInt(V: Mask33))))) {
338 Value *Root, *SubOp1;
339 // Matching "i - ((i >> 1) & 0x55555555...)".
340 const APInt *AndMask;
341 if (match(V: AndOp0, P: m_Sub(L: m_Value(V&: Root), R: m_Value(V&: SubOp1))) &&
342 match(V: SubOp1, P: m_And(L: m_LShr(L: m_Specific(V: Root), R: m_SpecificInt(V: 1)),
343 R: m_APInt(Res&: AndMask)))) {
344 auto CheckAndMask = [&]() {
345 if (*AndMask == Mask55)
346 return true;
347
348 // Exact match failed, see if any bits are known to be 0 where we
349 // expect a 1 in the mask.
350 if (!AndMask->isSubsetOf(RHS: Mask55))
351 return false;
352
353 APInt NeededMask = Mask55 & ~*AndMask;
354 return MaskedValueIsZero(V: cast<Instruction>(Val: SubOp1)->getOperand(i: 0),
355 Mask: NeededMask,
356 SQ: SimplifyQuery(I.getDataLayout()));
357 };
358
359 if (CheckAndMask()) {
360 LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
361 IRBuilder<> Builder(&I);
362 I.replaceAllUsesWith(
363 V: Builder.CreateIntrinsic(ID: Intrinsic::ctpop, Types: I.getType(), Args: {Root}));
364 ++NumPopCountRecognized;
365 return true;
366 }
367 }
368 }
369 }
370 }
371
372 return false;
373}
374
/// Fold smin(smax(fptosi(x), C1), C2) to llvm.fptosi.sat(x), provided C1 and
/// C2 saturate the value of the fp conversion. The transform is not reversible
/// because fptosi.sat is more defined than the original sequence - every input
/// produces a valid value for fptosi.sat, whereas inputs that were out of
/// range of the integer conversion produce poison for the original. The
/// reversed pattern may use fmax and fmin instead. As we cannot directly
/// reverse the transform, and it is not always profitable, we only perform it
/// when TTI reports the cost of the saturating form as lower.
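///
/// For example (illustrative, only performed when TTI reports the saturating
/// form as cheaper):
///   %t = fptosi float %x to i32
///   %a = call i32 @llvm.smin.i32(i32 %t, i32 127)
///   %b = call i32 @llvm.smax.i32(i32 %a, i32 -128)
/// becomes
///   %s = call i8 @llvm.fptosi.sat.i8.f32(float %x)
///   %b = sext i8 %s to i32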
383static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) {
  // Look for a min/max clamp of an fptosi that can be converted to fptosi_sat.
385 Value *In;
386 const APInt *MinC, *MaxC;
387 if (!match(V: &I, P: m_SMax(L: m_OneUse(SubPattern: m_SMin(L: m_OneUse(SubPattern: m_FPToSI(Op: m_Value(V&: In))),
388 R: m_APInt(Res&: MinC))),
389 R: m_APInt(Res&: MaxC))) &&
390 !match(V: &I, P: m_SMin(L: m_OneUse(SubPattern: m_SMax(L: m_OneUse(SubPattern: m_FPToSI(Op: m_Value(V&: In))),
391 R: m_APInt(Res&: MaxC))),
392 R: m_APInt(Res&: MinC))))
393 return false;
394
395 // Check that the constants clamp a saturate.
396 if (!(*MinC + 1).isPowerOf2() || -*MaxC != *MinC + 1)
397 return false;
398
399 Type *IntTy = I.getType();
400 Type *FpTy = In->getType();
401 Type *SatTy =
402 IntegerType::get(C&: IntTy->getContext(), NumBits: (*MinC + 1).exactLogBase2() + 1);
403 if (auto *VecTy = dyn_cast<VectorType>(Val: IntTy))
404 SatTy = VectorType::get(ElementType: SatTy, EC: VecTy->getElementCount());
405
406 // Get the cost of the intrinsic, and check that against the cost of
407 // fptosi+smin+smax
408 InstructionCost SatCost = TTI.getIntrinsicInstrCost(
409 ICA: IntrinsicCostAttributes(Intrinsic::fptosi_sat, SatTy, {In}, {FpTy}),
410 CostKind: TTI::TCK_RecipThroughput);
411 SatCost += TTI.getCastInstrCost(Opcode: Instruction::SExt, Dst: IntTy, Src: SatTy,
412 CCH: TTI::CastContextHint::None,
413 CostKind: TTI::TCK_RecipThroughput);
414
415 InstructionCost MinMaxCost = TTI.getCastInstrCost(
416 Opcode: Instruction::FPToSI, Dst: IntTy, Src: FpTy, CCH: TTI::CastContextHint::None,
417 CostKind: TTI::TCK_RecipThroughput);
418 MinMaxCost += TTI.getIntrinsicInstrCost(
419 ICA: IntrinsicCostAttributes(Intrinsic::smin, IntTy, {IntTy}),
420 CostKind: TTI::TCK_RecipThroughput);
421 MinMaxCost += TTI.getIntrinsicInstrCost(
422 ICA: IntrinsicCostAttributes(Intrinsic::smax, IntTy, {IntTy}),
423 CostKind: TTI::TCK_RecipThroughput);
424
425 if (SatCost >= MinMaxCost)
426 return false;
427
428 IRBuilder<> Builder(&I);
429 Value *Sat =
430 Builder.CreateIntrinsic(ID: Intrinsic::fptosi_sat, Types: {SatTy, FpTy}, Args: In);
431 I.replaceAllUsesWith(V: Builder.CreateSExt(V: Sat, DestTy: IntTy));
432 return true;
433}
434
435/// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids
436/// pessimistic codegen that has to account for setting errno and can enable
437/// vectorization.
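///
/// For example (illustrative), if the call cannot produce a NaN and the target
/// has a fast sqrt:
///   %r = call nnan double @sqrt(double %x)
/// becomes
///   %r = call nnan double @llvm.sqrt.f64(double %x)
/// and the original libcall is erased.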
438static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
439 TargetLibraryInfo &TLI, AssumptionCache &AC,
440 DominatorTree &DT) {
  // If (1) this is a sqrt libcall, (2) we can assume that NaN is not created
  // (because of NNAN or because the operand is known not to be less than
  // -0.0), and (3) we would not end up lowering to a libcall anyway (which
  // could change the value of errno), then:
  // (1) errno won't be set, and
  // (2) it is safe to convert this to an intrinsic call.
447 Type *Ty = Call->getType();
448 Value *Arg = Call->getArgOperand(i: 0);
449 if (TTI.haveFastSqrt(Ty) &&
450 (Call->hasNoNaNs() ||
451 cannotBeOrderedLessThanZero(
452 V: Arg, SQ: SimplifyQuery(Call->getDataLayout(), &TLI, &DT, &AC, Call)))) {
453 IRBuilder<> Builder(Call);
454 Value *NewSqrt =
455 Builder.CreateIntrinsic(ID: Intrinsic::sqrt, Types: Ty, Args: Arg, FMFSource: Call, Name: "sqrt");
456 Call->replaceAllUsesWith(V: NewSqrt);
457
458 // Explicitly erase the old call because a call with side effects is not
459 // trivially dead.
460 Call->eraseFromParent();
461 return true;
462 }
463
464 return false;
465}
466
467// Check if this array of constants represents a cttz table.
468// Iterate over the elements from \p Table by trying to find/match all
469// the numbers from 0 to \p InputBits that should represent cttz results.
470static bool isCTTZTable(Constant *Table, const APInt &Mul, const APInt &Shift,
471 const APInt &AndMask, Type *AccessTy,
472 unsigned InputBits, const APInt &GEPIdxFactor,
473 const DataLayout &DL) {
474 for (unsigned Idx = 0; Idx < InputBits; Idx++) {
475 APInt Index = (APInt(InputBits, 1).shl(shiftAmt: Idx) * Mul).lshr(ShiftAmt: Shift) & AndMask;
476 ConstantInt *C = dyn_cast_or_null<ConstantInt>(
477 Val: ConstantFoldLoadFromConst(C: Table, Ty: AccessTy, Offset: Index * GEPIdxFactor, DL));
478 if (!C || C->getValue() != Idx)
479 return false;
480 }
481
482 return true;
483}
484
485// Try to recognize table-based ctz implementation.
486// E.g., an example in C (for more cases please see the llvm/tests):
487// int f(unsigned x) {
488// static const char table[32] =
489// {0, 1, 28, 2, 29, 14, 24, 3, 30,
490// 22, 20, 15, 25, 17, 4, 8, 31, 27,
491// 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
492// return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27];
493// }
494// this can be lowered to `cttz` instruction.
495// There is also a special case when the element is 0.
496//
// The (x & -x) isolates the lowest set bit of x. The multiplier is a de Bruijn
// constant, so each possible bit pattern appears in it exactly once. The shift
// extracts the top bits after the multiply, and that index into the table
// gives the number of trailing zeros in the original number.
//
// Here are some examples of LLVM IR for a 64-bit target:
503//
504// CASE 1:
505// %sub = sub i32 0, %x
506// %and = and i32 %sub, %x
507// %mul = mul i32 %and, 125613361
508// %shr = lshr i32 %mul, 27
509// %idxprom = zext i32 %shr to i64
510// %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @ctz1.table, i64 0,
511// i64 %idxprom
512// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
513//
514// CASE 2:
515// %sub = sub i32 0, %x
516// %and = and i32 %sub, %x
517// %mul = mul i32 %and, 72416175
518// %shr = lshr i32 %mul, 26
519// %idxprom = zext i32 %shr to i64
520// %arrayidx = getelementptr inbounds [64 x i16], [64 x i16]* @ctz2.table,
521// i64 0, i64 %idxprom
522// %0 = load i16, i16* %arrayidx, align 2, !tbaa !8
523//
524// CASE 3:
525// %sub = sub i32 0, %x
526// %and = and i32 %sub, %x
527// %mul = mul i32 %and, 81224991
528// %shr = lshr i32 %mul, 27
529// %idxprom = zext i32 %shr to i64
530// %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @ctz3.table,
531// i64 0, i64 %idxprom
532// %0 = load i32, i32* %arrayidx, align 4, !tbaa !8
533//
534// CASE 4:
535// %sub = sub i64 0, %x
536// %and = and i64 %sub, %x
537// %mul = mul i64 %and, 283881067100198605
538// %shr = lshr i64 %mul, 58
539// %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* @table, i64 0,
540// i64 %shr
541// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
542//
543// All these can be lowered to @llvm.cttz.i32/64 intrinsics.
544static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
545 LoadInst *LI = dyn_cast<LoadInst>(Val: &I);
546 if (!LI)
547 return false;
548
549 Type *AccessType = LI->getType();
550 if (!AccessType->isIntegerTy())
551 return false;
552
553 GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Val: LI->getPointerOperand());
554 if (!GEP || !GEP->hasNoUnsignedSignedWrap())
555 return false;
556
557 GlobalVariable *GVTable = dyn_cast<GlobalVariable>(Val: GEP->getPointerOperand());
558 if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
559 return false;
560
561 unsigned BW = DL.getIndexTypeSizeInBits(Ty: GEP->getType());
562 APInt ModOffset(BW, 0);
563 SmallMapVector<Value *, APInt, 4> VarOffsets;
564 if (!GEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ModOffset) ||
565 VarOffsets.size() != 1 || ModOffset != 0)
566 return false;
567 auto [GepIdx, GEPScale] = VarOffsets.front();
568
569 Value *X1;
570 const APInt *MulConst, *ShiftConst, *AndCst = nullptr;
571 // Check that the gep variable index is ((x & -x) * MulConst) >> ShiftConst.
572 // This might be extended to the pointer index type, and if the gep index type
573 // has been replaced with an i8 then a new And (and different ShiftConst) will
574 // be present.
575 auto MatchInner = m_LShr(
576 L: m_Mul(L: m_c_And(L: m_Neg(V: m_Value(V&: X1)), R: m_Deferred(V: X1)), R: m_APInt(Res&: MulConst)),
577 R: m_APInt(Res&: ShiftConst));
578 if (!match(V: GepIdx, P: m_CastOrSelf(Op: MatchInner)) &&
579 !match(V: GepIdx, P: m_CastOrSelf(Op: m_And(L: MatchInner, R: m_APInt(Res&: AndCst)))))
580 return false;
581
582 unsigned InputBits = X1->getType()->getScalarSizeInBits();
583 if (InputBits != 16 && InputBits != 32 && InputBits != 64 && InputBits != 128)
584 return false;
585
586 if (!GEPScale.isIntN(N: InputBits) ||
587 !isCTTZTable(Table: GVTable->getInitializer(), Mul: *MulConst, Shift: *ShiftConst,
588 AndMask: AndCst ? *AndCst : APInt::getAllOnes(numBits: InputBits), AccessTy: AccessType,
589 InputBits, GEPIdxFactor: GEPScale.zextOrTrunc(width: InputBits), DL))
590 return false;
591
592 ConstantInt *ZeroTableElem = cast<ConstantInt>(
593 Val: ConstantFoldLoadFromConst(C: GVTable->getInitializer(), Ty: AccessType, DL));
594 bool DefinedForZero = ZeroTableElem->getZExtValue() == InputBits;
595
596 IRBuilder<> B(LI);
597 ConstantInt *BoolConst = B.getInt1(V: !DefinedForZero);
598 Type *XType = X1->getType();
599 auto Cttz = B.CreateIntrinsic(ID: Intrinsic::cttz, Types: {XType}, Args: {X1, BoolConst});
600 Value *ZExtOrTrunc = nullptr;
601
602 if (DefinedForZero) {
603 ZExtOrTrunc = B.CreateZExtOrTrunc(V: Cttz, DestTy: AccessType);
604 } else {
605 // If the value in elem 0 isn't the same as InputBits, we still want to
606 // produce the value from the table.
607 auto Cmp = B.CreateICmpEQ(LHS: X1, RHS: ConstantInt::get(Ty: XType, V: 0));
608 auto Select = B.CreateSelect(C: Cmp, True: B.CreateZExt(V: ZeroTableElem, DestTy: XType), False: Cttz);
609
610 // The true branch of select handles the cttz(0) case, which is rare.
611 if (!ProfcheckDisableMetadataFixes) {
612 if (Instruction *SelectI = dyn_cast<Instruction>(Val: Select))
613 SelectI->setMetadata(
614 KindID: LLVMContext::MD_prof,
615 Node: MDBuilder(SelectI->getContext()).createUnlikelyBranchWeights());
616 }
617
618 // NOTE: If the table[0] is 0, but the cttz(0) is defined by the Target
619 // it should be handled as: `cttz(x) & (typeSize - 1)`.
620
621 ZExtOrTrunc = B.CreateZExtOrTrunc(V: Select, DestTy: AccessType);
622 }
623
624 LI->replaceAllUsesWith(V: ZExtOrTrunc);
625
626 return true;
627}
628
629/// This is used by foldLoadsRecursive() to capture a Root Load node which is
630/// of type or(load, load) and recursively build the wide load. Also capture the
631/// shift amount, zero extend type and loadSize.
632struct LoadOps {
633 LoadInst *Root = nullptr;
634 LoadInst *RootInsert = nullptr;
635 bool FoundRoot = false;
636 uint64_t LoadSize = 0;
637 uint64_t Shift = 0;
638 Type *ZextType;
639 AAMDNodes AATags;
640};
641
// Recursively identify and merge consecutive loads that form the patterns
//   (ZExt(L1) << shift1) | (ZExt(L2) << shift2) -> ZExt(L3) << shift1
//   (ZExt(L1) << shift1) | ZExt(L2)             -> ZExt(L3)
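//
// For example (illustrative, little-endian, when the wide type is legal and
// misaligned access is fast):
//   %l0 = load i8, ptr %p
//   %l1 = load i8, ptr %p1          ; %p1 == %p + 1
//   %e0 = zext i8 %l0 to i16
//   %e1 = zext i8 %l1 to i16
//   %s1 = shl i16 %e1, 8
//   %or = or i16 %e0, %s1
// is rewritten by the caller into a single "load i16, ptr %p".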
645static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
646 AliasAnalysis &AA, bool IsRoot = false) {
647 uint64_t ShAmt2;
648 Value *X;
649 Instruction *L1, *L2;
650
651 // For the root instruction, allow multiple uses since the final result
652 // may legitimately be used in multiple places. For intermediate values,
653 // require single use to avoid creating duplicate loads.
654 if (!IsRoot && !V->hasOneUse())
655 return false;
656
657 if (!match(V, P: m_c_Or(L: m_Value(V&: X),
658 R: m_OneUse(SubPattern: m_ShlOrSelf(L: m_OneUse(SubPattern: m_ZExt(Op: m_Instruction(I&: L2))),
659 R&: ShAmt2)))))
660 return false;
661
662 if (!foldLoadsRecursive(V: X, LOps, DL, AA, /*IsRoot=*/false) && LOps.FoundRoot)
663 // Avoid Partial chain merge.
664 return false;
665
666 // Check if the pattern has loads
667 LoadInst *LI1 = LOps.Root;
668 uint64_t ShAmt1 = LOps.Shift;
669 if (LOps.FoundRoot == false &&
670 match(V: X, P: m_OneUse(
671 SubPattern: m_ShlOrSelf(L: m_OneUse(SubPattern: m_ZExt(Op: m_Instruction(I&: L1))), R&: ShAmt1)))) {
672 LI1 = dyn_cast<LoadInst>(Val: L1);
673 }
674 LoadInst *LI2 = dyn_cast<LoadInst>(Val: L2);
675
  // Bail out if the two loads are the same instruction, are not loads at all,
  // are not simple (atomic/volatile), or live in different address spaces.
677 if (LI1 == LI2 || !LI1 || !LI2 || !LI1->isSimple() || !LI2->isSimple() ||
678 LI1->getPointerAddressSpace() != LI2->getPointerAddressSpace())
679 return false;
680
681 // Check if Loads come from same BB.
682 if (LI1->getParent() != LI2->getParent())
683 return false;
684
685 // Find the data layout
686 bool IsBigEndian = DL.isBigEndian();
687
688 // Check if loads are consecutive and same size.
689 Value *Load1Ptr = LI1->getPointerOperand();
690 APInt Offset1(DL.getIndexTypeSizeInBits(Ty: Load1Ptr->getType()), 0);
691 Load1Ptr =
692 Load1Ptr->stripAndAccumulateConstantOffsets(DL, Offset&: Offset1,
693 /* AllowNonInbounds */ true);
694
695 Value *Load2Ptr = LI2->getPointerOperand();
696 APInt Offset2(DL.getIndexTypeSizeInBits(Ty: Load2Ptr->getType()), 0);
697 Load2Ptr =
698 Load2Ptr->stripAndAccumulateConstantOffsets(DL, Offset&: Offset2,
699 /* AllowNonInbounds */ true);
700
701 // Verify if both loads have same base pointers
702 uint64_t LoadSize1 = LI1->getType()->getPrimitiveSizeInBits();
703 uint64_t LoadSize2 = LI2->getType()->getPrimitiveSizeInBits();
704 if (Load1Ptr != Load2Ptr)
705 return false;
706
707 // Make sure that there are no padding bits.
708 if (!DL.typeSizeEqualsStoreSize(Ty: LI1->getType()) ||
709 !DL.typeSizeEqualsStoreSize(Ty: LI2->getType()))
710 return false;
711
  // Use alias analysis to check for stores between the loads.
713 LoadInst *Start = LOps.FoundRoot ? LOps.RootInsert : LI1, *End = LI2;
714 MemoryLocation Loc;
715 if (!Start->comesBefore(Other: End)) {
716 std::swap(a&: Start, b&: End);
    // If LOps.RootInsert comes after LI2, LI2 is used as the new insert
    // point, so we must make sure that the memory region accessed by LOps is
    // not modified in between.
720 if (LOps.FoundRoot)
721 Loc = MemoryLocation(
722 LOps.Root->getPointerOperand(),
723 LocationSize::precise(Value: DL.getTypeStoreSize(
724 Ty: IntegerType::get(C&: LI1->getContext(), NumBits: LOps.LoadSize))),
725 LOps.AATags);
726 else
727 Loc = MemoryLocation::get(LI: End);
728 } else
729 Loc = MemoryLocation::get(LI: End);
730 unsigned NumScanned = 0;
731 for (Instruction &Inst :
732 make_range(x: Start->getIterator(), y: End->getIterator())) {
733 if (Inst.mayWriteToMemory() && isModSet(MRI: AA.getModRefInfo(I: &Inst, OptLoc: Loc)))
734 return false;
735
736 if (++NumScanned > MaxInstrsToScan)
737 return false;
738 }
739
740 // Make sure Load with lower Offset is at LI1
741 bool Reverse = false;
742 if (Offset2.slt(RHS: Offset1)) {
743 std::swap(a&: LI1, b&: LI2);
744 std::swap(a&: ShAmt1, b&: ShAmt2);
745 std::swap(a&: Offset1, b&: Offset2);
746 std::swap(a&: Load1Ptr, b&: Load2Ptr);
747 std::swap(a&: LoadSize1, b&: LoadSize2);
748 Reverse = true;
749 }
750
751 // Big endian swap the shifts
752 if (IsBigEndian)
753 std::swap(a&: ShAmt1, b&: ShAmt2);
754
755 // First load is always LI1. This is where we put the new load.
756 // Use the merged load size available from LI1 for forward loads.
757 if (LOps.FoundRoot) {
758 if (!Reverse)
759 LoadSize1 = LOps.LoadSize;
760 else
761 LoadSize2 = LOps.LoadSize;
762 }
763
  // Verify that the shift amounts and load offsets line up and that the loads
  // are consecutive.
766 uint64_t ShiftDiff = IsBigEndian ? LoadSize2 : LoadSize1;
767 uint64_t PrevSize =
768 DL.getTypeStoreSize(Ty: IntegerType::get(C&: LI1->getContext(), NumBits: LoadSize1));
769 if ((ShAmt2 - ShAmt1) != ShiftDiff || (Offset2 - Offset1) != PrevSize)
770 return false;
771
772 // Update LOps
773 AAMDNodes AATags1 = LOps.AATags;
774 AAMDNodes AATags2 = LI2->getAAMetadata();
775 if (LOps.FoundRoot == false) {
776 LOps.FoundRoot = true;
777 AATags1 = LI1->getAAMetadata();
778 }
779 LOps.LoadSize = LoadSize1 + LoadSize2;
780 LOps.RootInsert = Start;
781
782 // Concatenate the AATags of the Merged Loads.
783 LOps.AATags = AATags1.concat(Other: AATags2);
784
785 LOps.Root = LI1;
786 LOps.Shift = ShAmt1;
787 LOps.ZextType = X->getType();
788 return true;
789}
790
791// For a given BB instruction, evaluate all loads in the chain that form a
792// pattern which suggests that the loads can be combined. The one and only use
793// of the loads is to form a wider load.
794static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL,
795 TargetTransformInfo &TTI, AliasAnalysis &AA,
796 const DominatorTree &DT) {
797 // Only consider load chains of scalar values.
798 if (isa<VectorType>(Val: I.getType()))
799 return false;
800
801 LoadOps LOps;
802 if (!foldLoadsRecursive(V: &I, LOps, DL, AA, /*IsRoot=*/true) || !LOps.FoundRoot)
803 return false;
804
805 IRBuilder<> Builder(&I);
806 LoadInst *NewLoad = nullptr, *LI1 = LOps.Root;
807
808 IntegerType *WiderType = IntegerType::get(C&: I.getContext(), NumBits: LOps.LoadSize);
809 // TTI based checks if we want to proceed with wider load
810 bool Allowed = TTI.isTypeLegal(Ty: WiderType);
811 if (!Allowed)
812 return false;
813
814 unsigned AS = LI1->getPointerAddressSpace();
815 unsigned Fast = 0;
816 Allowed = TTI.allowsMisalignedMemoryAccesses(Context&: I.getContext(), BitWidth: LOps.LoadSize,
817 AddressSpace: AS, Alignment: LI1->getAlign(), Fast: &Fast);
818 if (!Allowed || !Fast)
819 return false;
820
821 // Get the Index and Ptr for the new GEP.
822 Value *Load1Ptr = LI1->getPointerOperand();
823 Builder.SetInsertPoint(LOps.RootInsert);
824 if (!DT.dominates(Def: Load1Ptr, User: LOps.RootInsert)) {
825 APInt Offset1(DL.getIndexTypeSizeInBits(Ty: Load1Ptr->getType()), 0);
826 Load1Ptr = Load1Ptr->stripAndAccumulateConstantOffsets(
827 DL, Offset&: Offset1, /* AllowNonInbounds */ true);
828 Load1Ptr = Builder.CreatePtrAdd(Ptr: Load1Ptr, Offset: Builder.getInt(AI: Offset1));
829 }
830 // Generate wider load.
831 NewLoad = Builder.CreateAlignedLoad(Ty: WiderType, Ptr: Load1Ptr, Align: LI1->getAlign(),
832 isVolatile: LI1->isVolatile(), Name: "");
833 NewLoad->takeName(V: LI1);
834 // Set the New Load AATags Metadata.
835 if (LOps.AATags)
836 NewLoad->setAAMetadata(LOps.AATags);
837
838 Value *NewOp = NewLoad;
839 // Check if zero extend needed.
840 if (LOps.ZextType)
841 NewOp = Builder.CreateZExt(V: NewOp, DestTy: LOps.ZextType);
842
843 // Check if shift needed. We need to shift with the amount of load1
844 // shift if not zero.
845 if (LOps.Shift)
846 NewOp = Builder.CreateShl(LHS: NewOp, RHS: LOps.Shift);
847 I.replaceAllUsesWith(V: NewOp);
848
849 return true;
850}
851
852/// ValWidth bits starting at ValOffset of Val stored at PtrBase+PtrOffset.
853struct PartStore {
854 Value *PtrBase;
855 APInt PtrOffset;
856 Value *Val;
857 uint64_t ValOffset;
858 uint64_t ValWidth;
859 StoreInst *Store;
860
861 bool isCompatibleWith(const PartStore &Other) const {
862 return PtrBase == Other.PtrBase && Val == Other.Val;
863 }
864
865 bool operator<(const PartStore &Other) const {
866 return PtrOffset.slt(RHS: Other.PtrOffset);
867 }
868};
869
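/// Match a store of (part of) a right-shifted value. For example
/// (illustrative):
///   %sh = lshr i32 %v, 8
///   %t  = trunc i32 %sh to i8
///   store i8 %t, ptr %p
/// is captured as { PtrBase = %p, PtrOffset = 0, Val = %v, ValOffset = 8,
/// ValWidth = 8 }.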
870static std::optional<PartStore> matchPartStore(Instruction &I,
871 const DataLayout &DL) {
872 auto *Store = dyn_cast<StoreInst>(Val: &I);
873 if (!Store || !Store->isSimple())
874 return std::nullopt;
875
876 Value *StoredVal = Store->getValueOperand();
877 Type *StoredTy = StoredVal->getType();
878 if (!StoredTy->isIntegerTy() || !DL.typeSizeEqualsStoreSize(Ty: StoredTy))
879 return std::nullopt;
880
881 uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits();
882 uint64_t ValOffset;
883 Value *Val;
884 if (!match(V: StoredVal, P: m_Trunc(Op: m_LShrOrSelf(L: m_Value(V&: Val), R&: ValOffset))))
885 return std::nullopt;
886
887 Value *Ptr = Store->getPointerOperand();
888 APInt PtrOffset(DL.getIndexTypeSizeInBits(Ty: Ptr->getType()), 0);
889 Value *PtrBase = Ptr->stripAndAccumulateConstantOffsets(
890 DL, Offset&: PtrOffset, /*AllowNonInbounds=*/true);
891 return {{.PtrBase: PtrBase, .PtrOffset: PtrOffset, .Val: Val, .ValOffset: ValOffset, .ValWidth: ValWidth, .Store: Store}};
892}
893
894static bool mergeConsecutivePartStores(ArrayRef<PartStore> Parts,
895 unsigned Width, const DataLayout &DL,
896 TargetTransformInfo &TTI) {
897 if (Parts.size() < 2)
898 return false;
899
900 // Check whether combining the stores is profitable.
901 // FIXME: We could generate smaller stores if we can't produce a large one.
902 const PartStore &First = Parts.front();
903 LLVMContext &Ctx = First.Store->getContext();
904 Type *NewTy = Type::getIntNTy(C&: Ctx, N: Width);
905 unsigned Fast = 0;
906 if (!TTI.isTypeLegal(Ty: NewTy) ||
907 !TTI.allowsMisalignedMemoryAccesses(Context&: Ctx, BitWidth: Width,
908 AddressSpace: First.Store->getPointerAddressSpace(),
909 Alignment: First.Store->getAlign(), Fast: &Fast) ||
910 !Fast)
911 return false;
912
913 // Generate the combined store.
914 IRBuilder<> Builder(First.Store);
915 Value *Val = First.Val;
916 if (First.ValOffset != 0)
917 Val = Builder.CreateLShr(LHS: Val, RHS: First.ValOffset);
918 Val = Builder.CreateTrunc(V: Val, DestTy: NewTy);
919 StoreInst *Store = Builder.CreateAlignedStore(
920 Val, Ptr: First.Store->getPointerOperand(), Align: First.Store->getAlign());
921
922 // Merge various metadata onto the new store.
923 AAMDNodes AATags = First.Store->getAAMetadata();
924 SmallVector<Instruction *> Stores = {First.Store};
925 Stores.reserve(N: Parts.size());
926 SmallVector<DebugLoc> DbgLocs = {First.Store->getDebugLoc()};
927 DbgLocs.reserve(N: Parts.size());
928 for (const PartStore &Part : drop_begin(RangeOrContainer&: Parts)) {
929 AATags = AATags.concat(Other: Part.Store->getAAMetadata());
930 Stores.push_back(Elt: Part.Store);
931 DbgLocs.push_back(Elt: Part.Store->getDebugLoc());
932 }
933 Store->setAAMetadata(AATags);
934 Store->mergeDIAssignID(SourceInstructions: Stores);
935 Store->setDebugLoc(DebugLoc::getMergedLocations(Locs: DbgLocs));
936
937 // Remove the old stores.
938 for (const PartStore &Part : Parts)
939 Part.Store->eraseFromParent();
940
941 return true;
942}
943
944static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
945 const DataLayout &DL, TargetTransformInfo &TTI) {
946 if (Parts.size() < 2)
947 return false;
948
949 // We now have multiple parts of the same value stored to the same pointer.
950 // Sort the parts by pointer offset, and make sure they are consistent with
951 // the value offsets. Also check that the value is fully covered without
952 // overlaps.
953 bool Changed = false;
954 llvm::sort(C&: Parts);
955 int64_t LastEndOffsetFromFirst = 0;
956 const PartStore *First = &Parts[0];
957 for (const PartStore &Part : Parts) {
958 APInt PtrOffsetFromFirst = Part.PtrOffset - First->PtrOffset;
959 int64_t ValOffsetFromFirst = Part.ValOffset - First->ValOffset;
960 if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
961 LastEndOffsetFromFirst != ValOffsetFromFirst) {
962 Changed |= mergeConsecutivePartStores(Parts: ArrayRef(First, &Part),
963 Width: LastEndOffsetFromFirst, DL, TTI);
964 First = &Part;
965 LastEndOffsetFromFirst = Part.ValWidth;
966 continue;
967 }
968
969 LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
970 }
971
972 Changed |= mergeConsecutivePartStores(Parts: ArrayRef(First, Parts.end()),
973 Width: LastEndOffsetFromFirst, DL, TTI);
974 return Changed;
975}
976
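// Merge consecutive partial stores of the same value. For example
// (illustrative, little-endian, when an i16 store is legal and misaligned
// access is fast):
//   %lo = trunc i16 %v to i8
//   store i8 %lo, ptr %p
//   %sh = lshr i16 %v, 8
//   %hi = trunc i16 %sh to i8
//   store i8 %hi, ptr %p1           ; %p1 == %p + 1
// becomes a single "store i16 %v, ptr %p".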
977static bool foldConsecutiveStores(BasicBlock &BB, const DataLayout &DL,
978 TargetTransformInfo &TTI, AliasAnalysis &AA) {
979 // FIXME: Add big endian support.
980 if (DL.isBigEndian())
981 return false;
982
983 BatchAAResults BatchAA(AA);
984 SmallVector<PartStore, 8> Parts;
985 bool MadeChange = false;
986 for (Instruction &I : make_early_inc_range(Range&: BB)) {
987 if (std::optional<PartStore> Part = matchPartStore(I, DL)) {
988 if (Parts.empty() || Part->isCompatibleWith(Other: Parts[0])) {
989 Parts.push_back(Elt: std::move(*Part));
990 continue;
991 }
992
993 MadeChange |= mergePartStores(Parts, DL, TTI);
994 Parts.clear();
995 Parts.push_back(Elt: std::move(*Part));
996 continue;
997 }
998
999 if (Parts.empty())
1000 continue;
1001
1002 if (I.mayThrow() ||
1003 (I.mayReadOrWriteMemory() &&
1004 isModOrRefSet(MRI: BatchAA.getModRefInfo(
1005 I: &I, OptLoc: MemoryLocation::getBeforeOrAfter(Ptr: Parts[0].PtrBase))))) {
1006 MadeChange |= mergePartStores(Parts, DL, TTI);
1007 Parts.clear();
1008 continue;
1009 }
1010 }
1011
1012 MadeChange |= mergePartStores(Parts, DL, TTI);
1013 return MadeChange;
1014}
1015
/// Combine away instructions provided the result remains equivalent when
/// compared against 0, i.e. when asking whether any bits are set.
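///
/// For example (illustrative), used from foldICmpOrChain() below:
///   %s = shl nuw i32 %a, 3
///   %o = or i32 %s, %b
///   %c = icmp eq i32 %o, 0
/// can instead compare or(%a, %b) against zero, since the nuw shift cannot
/// discard set bits.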
1018static Value *optimizeShiftInOrChain(Value *V, IRBuilder<> &Builder) {
1019 auto *I = dyn_cast<Instruction>(Val: V);
1020 if (!I || I->getOpcode() != Instruction::Or || !I->hasOneUse())
1021 return nullptr;
1022
1023 Value *A;
1024
1025 // Look deeper into the chain of or's, combining away shl (so long as they are
1026 // nuw or nsw).
1027 Value *Op0 = I->getOperand(i: 0);
1028 if (match(V: Op0, P: m_CombineOr(L: m_NSWShl(L: m_Value(V&: A), R: m_Value()),
1029 R: m_NUWShl(L: m_Value(V&: A), R: m_Value()))))
1030 Op0 = A;
1031 else if (auto *NOp = optimizeShiftInOrChain(V: Op0, Builder))
1032 Op0 = NOp;
1033
1034 Value *Op1 = I->getOperand(i: 1);
1035 if (match(V: Op1, P: m_CombineOr(L: m_NSWShl(L: m_Value(V&: A), R: m_Value()),
1036 R: m_NUWShl(L: m_Value(V&: A), R: m_Value()))))
1037 Op1 = A;
1038 else if (auto *NOp = optimizeShiftInOrChain(V: Op1, Builder))
1039 Op1 = NOp;
1040
1041 if (Op0 != I->getOperand(i: 0) || Op1 != I->getOperand(i: 1))
1042 return Builder.CreateOr(LHS: Op0, RHS: Op1);
1043 return nullptr;
1044}
1045
1046static bool foldICmpOrChain(Instruction &I, const DataLayout &DL,
1047 TargetTransformInfo &TTI, AliasAnalysis &AA,
1048 const DominatorTree &DT) {
1049 CmpPredicate Pred;
1050 Value *Op0;
1051 if (!match(V: &I, P: m_ICmp(Pred, L: m_Value(V&: Op0), R: m_Zero())) ||
1052 !ICmpInst::isEquality(P: Pred))
1053 return false;
1054
  // If the chain of or's matches a load, combine to that before attempting to
  // remove shifts.
1057 if (auto OpI = dyn_cast<Instruction>(Val: Op0))
1058 if (OpI->getOpcode() == Instruction::Or)
1059 if (foldConsecutiveLoads(I&: *OpI, DL, TTI, AA, DT))
1060 return true;
1061
1062 IRBuilder<> Builder(&I);
1063 // icmp eq/ne or(shl(a), b), 0 -> icmp eq/ne or(a, b), 0
1064 if (auto *Res = optimizeShiftInOrChain(V: Op0, Builder)) {
1065 I.replaceAllUsesWith(V: Builder.CreateICmp(P: Pred, LHS: Res, RHS: I.getOperand(i: 1)));
1066 return true;
1067 }
1068
1069 return false;
1070}
1071
// Calculate the GEP stride and the accumulated constant offset (ModOffset),
// and return both.
1074static std::pair<APInt, APInt>
1075getStrideAndModOffsetOfGEP(Value *PtrOp, const DataLayout &DL) {
1076 unsigned BW = DL.getIndexTypeSizeInBits(Ty: PtrOp->getType());
1077 std::optional<APInt> Stride;
1078 APInt ModOffset(BW, 0);
  // Return a minimum GEP stride: the greatest common divisor of consecutive
  // GEP index scales (c.f. Bézout's identity).
1081 while (auto *GEP = dyn_cast<GEPOperator>(Val: PtrOp)) {
1082 SmallMapVector<Value *, APInt, 4> VarOffsets;
1083 if (!GEP->collectOffset(DL, BitWidth: BW, VariableOffsets&: VarOffsets, ConstantOffset&: ModOffset))
1084 break;
1085
1086 for (auto [V, Scale] : VarOffsets) {
1087 // Only keep a power of two factor for non-inbounds
1088 if (!GEP->hasNoUnsignedSignedWrap())
1089 Scale = APInt::getOneBitSet(numBits: Scale.getBitWidth(), BitNo: Scale.countr_zero());
1090
1091 if (!Stride)
1092 Stride = Scale;
1093 else
1094 Stride = APIntOps::GreatestCommonDivisor(A: *Stride, B: Scale);
1095 }
1096
1097 PtrOp = GEP->getPointerOperand();
1098 }
1099
1100 // Check whether pointer arrives back at Global Variable via at least one GEP.
1101 // Even if it doesn't, we can check by alignment.
1102 if (!isa<GlobalVariable>(Val: PtrOp) || !Stride)
1103 return {APInt(BW, 1), APInt(BW, 0)};
1104
  // To account for signed GEP indices, reduce the offset to its non-negative
  // remainder modulo the minimum GEP stride.
1107 ModOffset = ModOffset.srem(RHS: *Stride);
1108 if (ModOffset.isNegative())
1109 ModOffset += *Stride;
1110
1111 return {*Stride, ModOffset};
1112}
1113
/// If C is a constant patterned array and all valid loaded results for the
/// given alignment are equal to the same constant, fold the load to that
/// constant.
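///
/// For example (illustrative):
///   @g = constant [4 x i16] [i16 42, i16 42, i16 42, i16 42], align 2
///   %p = getelementptr inbounds [4 x i16], ptr @g, i64 0, i64 %i
///   %v = load i16, ptr %p, align 2
/// every in-bounds load yields 42, so %v is folded to i16 42.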
1116static bool foldPatternedLoads(Instruction &I, const DataLayout &DL) {
1117 auto *LI = dyn_cast<LoadInst>(Val: &I);
1118 if (!LI || LI->isVolatile())
1119 return false;
1120
1121 // We can only fold the load if it is from a constant global with definitive
1122 // initializer. Skip expensive logic if this is not the case.
1123 auto *PtrOp = LI->getPointerOperand();
1124 auto *GV = dyn_cast<GlobalVariable>(Val: getUnderlyingObject(V: PtrOp));
1125 if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
1126 return false;
1127
1128 // Bail for large initializers in excess of 4K to avoid too many scans.
1129 Constant *C = GV->getInitializer();
1130 uint64_t GVSize = DL.getTypeAllocSize(Ty: C->getType());
1131 if (!GVSize || 4096 < GVSize)
1132 return false;
1133
1134 Type *LoadTy = LI->getType();
1135 unsigned BW = DL.getIndexTypeSizeInBits(Ty: PtrOp->getType());
1136 auto [Stride, ConstOffset] = getStrideAndModOffsetOfGEP(PtrOp, DL);
1137
  // Any possible offset is a multiple of the GEP stride, and any valid offset
  // is a multiple of the load alignment, so checking only multiples of the
  // larger of the two is sufficient to establish that the results are equal.
1141 if (auto LA = LI->getAlign();
1142 LA <= GV->getAlign().valueOrOne() && Stride.getZExtValue() < LA.value()) {
1143 ConstOffset = APInt(BW, 0);
1144 Stride = APInt(BW, LA.value());
1145 }
1146
1147 Constant *Ca = ConstantFoldLoadFromConst(C, Ty: LoadTy, Offset: ConstOffset, DL);
1148 if (!Ca)
1149 return false;
1150
1151 unsigned E = GVSize - DL.getTypeStoreSize(Ty: LoadTy);
1152 for (; ConstOffset.getZExtValue() <= E; ConstOffset += Stride)
1153 if (Ca != ConstantFoldLoadFromConst(C, Ty: LoadTy, Offset: ConstOffset, DL))
1154 return false;
1155
1156 I.replaceAllUsesWith(V: Ca);
1157
1158 return true;
1159}
1160
1161namespace {
1162class StrNCmpInliner {
1163public:
1164 StrNCmpInliner(CallInst *CI, LibFunc Func, DomTreeUpdater *DTU,
1165 const DataLayout &DL)
1166 : CI(CI), Func(Func), DTU(DTU), DL(DL) {}
1167
1168 bool optimizeStrNCmp();
1169
1170private:
1171 void inlineCompare(Value *LHS, StringRef RHS, uint64_t N, bool Swapped);
1172
1173 CallInst *CI;
1174 LibFunc Func;
1175 DomTreeUpdater *DTU;
1176 const DataLayout &DL;
1177};
1178
1179} // namespace
1180
1181/// First we normalize calls to strncmp/strcmp to the form of
1182/// compare(s1, s2, N), which means comparing first N bytes of s1 and s2
1183/// (without considering '\0').
1184///
1185/// Examples:
1186///
1187/// \code
1188/// strncmp(s, "a", 3) -> compare(s, "a", 2)
1189/// strncmp(s, "abc", 3) -> compare(s, "abc", 3)
1190/// strncmp(s, "a\0b", 3) -> compare(s, "a\0b", 2)
1191/// strcmp(s, "a") -> compare(s, "a", 2)
1192///
1193/// char s2[] = {'a'}
1194/// strncmp(s, s2, 3) -> compare(s, s2, 3)
1195///
1196/// char s2[] = {'a', 'b', 'c', 'd'}
1197/// strncmp(s, s2, 3) -> compare(s, s2, 3)
1198/// \endcode
1199///
1200/// We only handle cases where N and exactly one of s1 and s2 are constant.
1201/// Cases that s1 and s2 are both constant are already handled by the
1202/// instcombine pass.
1203///
1204/// We do not handle cases where N > StrNCmpInlineThreshold.
1205///
/// We also do not handle cases where N < 2, which are already
1207/// handled by the instcombine pass.
1208///
1209bool StrNCmpInliner::optimizeStrNCmp() {
1210 if (StrNCmpInlineThreshold < 2)
1211 return false;
1212
1213 if (!isOnlyUsedInZeroComparison(CxtI: CI))
1214 return false;
1215
1216 Value *Str1P = CI->getArgOperand(i: 0);
1217 Value *Str2P = CI->getArgOperand(i: 1);
1218 // Should be handled elsewhere.
1219 if (Str1P == Str2P)
1220 return false;
1221
1222 StringRef Str1, Str2;
1223 bool HasStr1 = getConstantStringInfo(V: Str1P, Str&: Str1, /*TrimAtNul=*/false);
1224 bool HasStr2 = getConstantStringInfo(V: Str2P, Str&: Str2, /*TrimAtNul=*/false);
1225 if (HasStr1 == HasStr2)
1226 return false;
1227
1228 // Note that '\0' and characters after it are not trimmed.
1229 StringRef Str = HasStr1 ? Str1 : Str2;
1230 Value *StrP = HasStr1 ? Str2P : Str1P;
1231
1232 size_t Idx = Str.find(C: '\0');
1233 uint64_t N = Idx == StringRef::npos ? UINT64_MAX : Idx + 1;
1234 if (Func == LibFunc_strncmp) {
1235 if (auto *ConstInt = dyn_cast<ConstantInt>(Val: CI->getArgOperand(i: 2)))
1236 N = std::min(a: N, b: ConstInt->getZExtValue());
1237 else
1238 return false;
1239 }
1240 // Now N means how many bytes we need to compare at most.
1241 if (N > Str.size() || N < 2 || N > StrNCmpInlineThreshold)
1242 return false;
1243
1244 // Cases where StrP has two or more dereferenceable bytes might be better
1245 // optimized elsewhere.
1246 bool CanBeNull = false, CanBeFreed = false;
1247 if (StrP->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed) > 1)
1248 return false;
1249 inlineCompare(LHS: StrP, RHS: Str, N, Swapped: HasStr1);
1250 return true;
1251}
1252
1253/// Convert
1254///
1255/// \code
1256/// ret = compare(s1, s2, N)
1257/// \endcode
1258///
1259/// into
1260///
1261/// \code
1262/// ret = (int)s1[0] - (int)s2[0]
1263/// if (ret != 0)
1264/// goto NE
1265/// ...
1266/// ret = (int)s1[N-2] - (int)s2[N-2]
1267/// if (ret != 0)
1268/// goto NE
1269/// ret = (int)s1[N-1] - (int)s2[N-1]
1270/// NE:
1271/// \endcode
1272///
1273/// CFG before and after the transformation:
1274///
1275/// (before)
1276/// BBCI
1277///
1278/// (after)
1279/// BBCI -> BBSubs[0] (sub,icmp) --NE-> BBNE -> BBTail
1280/// | ^
1281/// E |
1282/// | |
1283/// BBSubs[1] (sub,icmp) --NE-----+
1284/// ... |
1285/// BBSubs[N-1] (sub) ---------+
1286///
1287void StrNCmpInliner::inlineCompare(Value *LHS, StringRef RHS, uint64_t N,
1288 bool Swapped) {
1289 auto &Ctx = CI->getContext();
1290 IRBuilder<> B(Ctx);
1291 // We want these instructions to be recognized as inlined instructions for the
1292 // compare call, but we don't have a source location for the definition of
1293 // that function, since we're generating that code now. Because the generated
1294 // code is a viable point for a memory access error, we make the pragmatic
1295 // choice here to directly use CI's location so that we have useful
1296 // attribution for the generated code.
1297 B.SetCurrentDebugLocation(CI->getDebugLoc());
1298
1299 BasicBlock *BBCI = CI->getParent();
1300 BasicBlock *BBTail =
1301 SplitBlock(Old: BBCI, SplitPt: CI, DTU, LI: nullptr, MSSAU: nullptr, BBName: BBCI->getName() + ".tail");
1302
1303 SmallVector<BasicBlock *> BBSubs;
1304 for (uint64_t I = 0; I < N; ++I)
1305 BBSubs.push_back(
1306 Elt: BasicBlock::Create(Context&: Ctx, Name: "sub_" + Twine(I), Parent: BBCI->getParent(), InsertBefore: BBTail));
1307 BasicBlock *BBNE = BasicBlock::Create(Context&: Ctx, Name: "ne", Parent: BBCI->getParent(), InsertBefore: BBTail);
1308
1309 cast<BranchInst>(Val: BBCI->getTerminator())->setSuccessor(idx: 0, NewSucc: BBSubs[0]);
1310
1311 B.SetInsertPoint(BBNE);
1312 PHINode *Phi = B.CreatePHI(Ty: CI->getType(), NumReservedValues: N);
1313 B.CreateBr(Dest: BBTail);
1314
1315 Value *Base = LHS;
1316 for (uint64_t i = 0; i < N; ++i) {
1317 B.SetInsertPoint(BBSubs[i]);
1318 Value *VL =
1319 B.CreateZExt(V: B.CreateLoad(Ty: B.getInt8Ty(),
1320 Ptr: B.CreateInBoundsPtrAdd(Ptr: Base, Offset: B.getInt64(C: i))),
1321 DestTy: CI->getType());
1322 Value *VR =
1323 ConstantInt::get(Ty: CI->getType(), V: static_cast<unsigned char>(RHS[i]));
1324 Value *Sub = Swapped ? B.CreateSub(LHS: VR, RHS: VL) : B.CreateSub(LHS: VL, RHS: VR);
1325 if (i < N - 1) {
1326 BranchInst *CondBrInst = B.CreateCondBr(
1327 Cond: B.CreateICmpNE(LHS: Sub, RHS: ConstantInt::get(Ty: CI->getType(), V: 0)), True: BBNE,
1328 False: BBSubs[i + 1]);
1329
1330 Function *F = CI->getFunction();
1331 assert(F && "Instruction does not belong to a function!");
1332 std::optional<Function::ProfileCount> EC = F->getEntryCount();
1333 if (EC && EC->getCount() > 0)
1334 setExplicitlyUnknownBranchWeights(I&: *CondBrInst, DEBUG_TYPE);
1335 } else {
1336 B.CreateBr(Dest: BBNE);
1337 }
1338
1339 Phi->addIncoming(V: Sub, BB: BBSubs[i]);
1340 }
1341
1342 CI->replaceAllUsesWith(V: Phi);
1343 CI->eraseFromParent();
1344
1345 if (DTU) {
1346 SmallVector<DominatorTree::UpdateType, 8> Updates;
1347 Updates.push_back(Elt: {DominatorTree::Insert, BBCI, BBSubs[0]});
1348 for (uint64_t i = 0; i < N; ++i) {
1349 if (i < N - 1)
1350 Updates.push_back(Elt: {DominatorTree::Insert, BBSubs[i], BBSubs[i + 1]});
1351 Updates.push_back(Elt: {DominatorTree::Insert, BBSubs[i], BBNE});
1352 }
1353 Updates.push_back(Elt: {DominatorTree::Insert, BBNE, BBTail});
1354 Updates.push_back(Elt: {DominatorTree::Delete, BBCI, BBTail});
1355 DTU->applyUpdates(Updates);
1356 }
1357}
1358
1359/// Convert memchr with a small constant string into a switch
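///
/// For example (illustrative, with MemChrInlineThreshold >= 3), a call
///   %r = call ptr @memchr(ptr @str, i32 %c, i64 3)
/// with @str holding the constant bytes "abc" becomes a switch on
/// (trunc %c to i8) with one case per distinct byte; each case selects the
/// byte's offset into @str, and the default result is null.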
1360static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU,
1361 const DataLayout &DL) {
1362 if (isa<Constant>(Val: Call->getArgOperand(i: 1)))
1363 return false;
1364
1365 StringRef Str;
1366 Value *Base = Call->getArgOperand(i: 0);
1367 if (!getConstantStringInfo(V: Base, Str, /*TrimAtNul=*/false))
1368 return false;
1369
1370 uint64_t N = Str.size();
1371 if (auto *ConstInt = dyn_cast<ConstantInt>(Val: Call->getArgOperand(i: 2))) {
1372 uint64_t Val = ConstInt->getZExtValue();
1373 // Ignore the case that n is larger than the size of string.
1374 if (Val > N)
1375 return false;
1376 N = Val;
1377 } else
1378 return false;
1379
1380 if (N > MemChrInlineThreshold)
1381 return false;
1382
1383 BasicBlock *BB = Call->getParent();
1384 BasicBlock *BBNext = SplitBlock(Old: BB, SplitPt: Call, DTU);
1385 IRBuilder<> IRB(BB);
1386 IRB.SetCurrentDebugLocation(Call->getDebugLoc());
1387 IntegerType *ByteTy = IRB.getInt8Ty();
1388 BB->getTerminator()->eraseFromParent();
1389 SwitchInst *SI = IRB.CreateSwitch(
1390 V: IRB.CreateTrunc(V: Call->getArgOperand(i: 1), DestTy: ByteTy), Dest: BBNext, NumCases: N);
1391 // We can't know the precise weights here, as they would depend on the value
1392 // distribution of Call->getArgOperand(1). So we just mark it as "unknown".
1393 setExplicitlyUnknownBranchWeightsIfProfiled(I&: *SI, DEBUG_TYPE);
1394 Type *IndexTy = DL.getIndexType(PtrTy: Call->getType());
1395 SmallVector<DominatorTree::UpdateType, 8> Updates;
1396
1397 BasicBlock *BBSuccess = BasicBlock::Create(
1398 Context&: Call->getContext(), Name: "memchr.success", Parent: BB->getParent(), InsertBefore: BBNext);
1399 IRB.SetInsertPoint(BBSuccess);
1400 PHINode *IndexPHI = IRB.CreatePHI(Ty: IndexTy, NumReservedValues: N, Name: "memchr.idx");
1401 Value *FirstOccursLocation = IRB.CreateInBoundsPtrAdd(Ptr: Base, Offset: IndexPHI);
1402 IRB.CreateBr(Dest: BBNext);
1403 if (DTU)
1404 Updates.push_back(Elt: {DominatorTree::Insert, BBSuccess, BBNext});
1405
1406 SmallPtrSet<ConstantInt *, 4> Cases;
1407 for (uint64_t I = 0; I < N; ++I) {
1408 ConstantInt *CaseVal =
1409 ConstantInt::get(Ty: ByteTy, V: static_cast<unsigned char>(Str[I]));
1410 if (!Cases.insert(Ptr: CaseVal).second)
1411 continue;
1412
1413 BasicBlock *BBCase = BasicBlock::Create(Context&: Call->getContext(), Name: "memchr.case",
1414 Parent: BB->getParent(), InsertBefore: BBSuccess);
1415 SI->addCase(OnVal: CaseVal, Dest: BBCase);
1416 IRB.SetInsertPoint(BBCase);
1417 IndexPHI->addIncoming(V: ConstantInt::get(Ty: IndexTy, V: I), BB: BBCase);
1418 IRB.CreateBr(Dest: BBSuccess);
1419 if (DTU) {
1420 Updates.push_back(Elt: {DominatorTree::Insert, BB, BBCase});
1421 Updates.push_back(Elt: {DominatorTree::Insert, BBCase, BBSuccess});
1422 }
1423 }
1424
1425 PHINode *PHI =
1426 PHINode::Create(Ty: Call->getType(), NumReservedValues: 2, NameStr: Call->getName(), InsertBefore: BBNext->begin());
1427 PHI->addIncoming(V: Constant::getNullValue(Ty: Call->getType()), BB);
1428 PHI->addIncoming(V: FirstOccursLocation, BB: BBSuccess);
1429
1430 Call->replaceAllUsesWith(V: PHI);
1431 Call->eraseFromParent();
1432
1433 if (DTU)
1434 DTU->applyUpdates(Updates);
1435
1436 return true;
1437}
1438
1439static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI,
1440 TargetLibraryInfo &TLI, AssumptionCache &AC,
1441 DominatorTree &DT, const DataLayout &DL,
1442 bool &MadeCFGChange) {
1443
1444 auto *CI = dyn_cast<CallInst>(Val: &I);
1445 if (!CI || CI->isNoBuiltin())
1446 return false;
1447
1448 Function *CalledFunc = CI->getCalledFunction();
1449 if (!CalledFunc)
1450 return false;
1451
1452 LibFunc LF;
1453 if (!TLI.getLibFunc(FDecl: *CalledFunc, F&: LF) ||
1454 !isLibFuncEmittable(M: CI->getModule(), TLI: &TLI, TheLibFunc: LF))
1455 return false;
1456
1457 DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy);
1458
1459 switch (LF) {
1460 case LibFunc_sqrt:
1461 case LibFunc_sqrtf:
1462 case LibFunc_sqrtl:
1463 return foldSqrt(Call: CI, Func: LF, TTI, TLI, AC, DT);
1464 case LibFunc_strcmp:
1465 case LibFunc_strncmp:
1466 if (StrNCmpInliner(CI, LF, &DTU, DL).optimizeStrNCmp()) {
1467 MadeCFGChange = true;
1468 return true;
1469 }
1470 break;
1471 case LibFunc_memchr:
1472 if (foldMemChr(Call: CI, DTU: &DTU, DL)) {
1473 MadeCFGChange = true;
1474 return true;
1475 }
1476 break;
1477 default:;
1478 }
1479 return false;
1480}
1481
1482/// Match high part of long multiplication.
1483///
1484/// Considering a multiply made up of high and low parts, we can split the
1485/// multiply into:
1486/// x * y == (xh*T + xl) * (yh*T + yl)
1487/// where (for a 64-bit multiply) xh == x>>32, xl == x & 0xffffffff, T == 2^32.
1488/// This expands to
1489/// xh*yh*T*T + xh*yl*T + xl*yh*T + xl*yl
1490/// which can be drawn as
1491/// [ xh*yh ]
1492/// [ xh*yl ]
1493/// [ xl*yh ]
1494/// [ xl*yl ]
1495/// We are looking for the "high" half, which is xh*yh + xh*yl>>32 + xl*yh>>32 +
1496/// some carries. The carries make this difficult and there are multiple ways of
1497/// representing them. The ones we attempt to support here are:
1498///  Carry: xh*yh + carry + lowsum>>32
1499///    carry = lowsum < xh*yl ? 0x100000000 : 0
1500///    lowsum = xh*yl + xl*yh + (xl*yl>>32)
1501///  Ladder: xh*yh + c2>>32 + c3>>32
1502///    c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
1503///    or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xl*yh
1504///  Carry4: xh*yh + carry + crosssum>>32 + (xl*yl>>32 + crosssum&0xffffffff) >> 32
1505///    crosssum = xh*yl + xl*yh
1506///    carry = crosssum < xh*yl ? 0x100000000 : 0
1507/// Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32;
1508/// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
1509///
1510/// They all start by matching xh*yh plus two or three other operands. The
1511/// bottom of the tree is formed by xh*yh, xh*yl, xl*yh and xl*yl.
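/// For illustration, the Carry form above corresponds to C code along these
/// lines for a 64-bit multiply (a hand-written sketch, not taken from any
/// particular benchmark):
///   uint64_t xl = x & 0xffffffff, xh = x >> 32;
///   uint64_t yl = y & 0xffffffff, yh = y >> 32;
///   uint64_t lowsum = xh * yl + (xl * yh + ((xl * yl) >> 32));
///   uint64_t carry = lowsum < xh * yl ? 0x100000000 : 0;
///   uint64_t high = xh * yh + (carry + (lowsum >> 32));
/// which this fold rewrites to the top 64 bits of a 128-bit multiply.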
1512static bool foldMulHigh(Instruction &I) {
1513 Type *Ty = I.getType();
1514 if (!Ty->isIntOrIntVectorTy())
1515 return false;
1516
1517 unsigned BitWidth = Ty->getScalarSizeInBits();
1518 APInt LowMask = APInt::getLowBitsSet(numBits: BitWidth, loBitsSet: BitWidth / 2);
1519 if (BitWidth % 2 != 0)
1520 return false;
1521
1522 auto CreateMulHigh = [&](Value *X, Value *Y) {
1523 IRBuilder<> Builder(&I);
1524 Type *NTy = Ty->getWithNewBitWidth(NewBitWidth: BitWidth * 2);
1525 Value *XExt = Builder.CreateZExt(V: X, DestTy: NTy);
1526 Value *YExt = Builder.CreateZExt(V: Y, DestTy: NTy);
1527 Value *Mul = Builder.CreateMul(LHS: XExt, RHS: YExt, Name: "", /*HasNUW=*/true);
1528 Value *High = Builder.CreateLShr(LHS: Mul, RHS: BitWidth);
1529    Value *Res = Builder.CreateTrunc(V: High, DestTy: Ty, Name: "", /*IsNUW=*/true);
1530 Res->takeName(V: &I);
1531 I.replaceAllUsesWith(V: Res);
1532 LLVM_DEBUG(dbgs() << "Created long multiply from parts of " << *X << " and "
1533 << *Y << "\n");
1534 return true;
1535 };
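  // For a 64-bit input the replacement built by CreateMulHigh looks roughly
  // like the following IR (value names are illustrative):
  //   %xext = zext i64 %x to i128
  //   %yext = zext i64 %y to i128
  //   %mul  = mul nuw i128 %xext, %yext
  //   %hi   = lshr i128 %mul, 64
  //   %res  = trunc nuw i128 %hi to i64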
1536
1537 // Common check routines for X_lo*Y_lo and X_hi*Y_lo
1538 auto CheckLoLo = [&](Value *XlYl, Value *X, Value *Y) {
1539 return match(V: XlYl, P: m_c_Mul(L: m_And(L: m_Specific(V: X), R: m_SpecificInt(V: LowMask)),
1540 R: m_And(L: m_Specific(V: Y), R: m_SpecificInt(V: LowMask))));
1541 };
1542 auto CheckHiLo = [&](Value *XhYl, Value *X, Value *Y) {
1543 return match(V: XhYl,
1544 P: m_c_Mul(L: m_LShr(L: m_Specific(V: X), R: m_SpecificInt(V: BitWidth / 2)),
1545 R: m_And(L: m_Specific(V: Y), R: m_SpecificInt(V: LowMask))));
1546 };
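  // For example, in the 64-bit case CheckLoLo(V, X, Y) accepts
  //   V == (X & 0xffffffff) * (Y & 0xffffffff)
  // and CheckHiLo(V, X, Y) accepts
  //   V == (X >> 32) * (Y & 0xffffffff)
  // with the multiply operands in either order.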
1547
1548 auto FoldMulHighCarry = [&](Value *X, Value *Y, Instruction *Carry,
1549 Instruction *B) {
1550 // Looking for LowSum >> 32 and carry (select)
1551 if (Carry->getOpcode() != Instruction::Select)
1552 std::swap(a&: Carry, b&: B);
1553
1554 // Carry = LowSum < XhYl ? 0x100000000 : 0
1555 Value *LowSum, *XhYl;
1556 if (!match(V: Carry,
1557 P: m_OneUse(SubPattern: m_Select(
1558 C: m_OneUse(SubPattern: m_SpecificICmp(MatchPred: ICmpInst::ICMP_ULT, L: m_Value(V&: LowSum),
1559 R: m_Value(V&: XhYl))),
1560 L: m_SpecificInt(V: APInt::getOneBitSet(numBits: BitWidth, BitNo: BitWidth / 2)),
1561 R: m_Zero()))))
1562 return false;
1563
1564 // XhYl can be Xh*Yl or Xl*Yh
1565 if (!CheckHiLo(XhYl, X, Y)) {
1566 if (CheckHiLo(XhYl, Y, X))
1567 std::swap(a&: X, b&: Y);
1568 else
1569 return false;
1570 }
1571 if (XhYl->hasNUsesOrMore(N: 3))
1572 return false;
1573
1574 // B = LowSum >> 32
1575 if (!match(V: B, P: m_OneUse(SubPattern: m_LShr(L: m_Specific(V: LowSum),
1576 R: m_SpecificInt(V: BitWidth / 2)))) ||
1577 LowSum->hasNUsesOrMore(N: 3))
1578 return false;
1579
1580 // LowSum = XhYl + XlYh + XlYl>>32
1581 Value *XlYh, *XlYl;
1582 auto XlYlHi = m_LShr(L: m_Value(V&: XlYl), R: m_SpecificInt(V: BitWidth / 2));
1583 if (!match(V: LowSum,
1584 P: m_c_Add(L: m_Specific(V: XhYl),
1585 R: m_OneUse(SubPattern: m_c_Add(L: m_OneUse(SubPattern: m_Value(V&: XlYh)), R: XlYlHi)))) &&
1586 !match(V: LowSum, P: m_c_Add(L: m_OneUse(SubPattern: m_Value(V&: XlYh)),
1587 R: m_OneUse(SubPattern: m_c_Add(L: m_Specific(V: XhYl), R: XlYlHi)))) &&
1588 !match(V: LowSum,
1589 P: m_c_Add(L: XlYlHi, R: m_OneUse(SubPattern: m_c_Add(L: m_Specific(V: XhYl),
1590 R: m_OneUse(SubPattern: m_Value(V&: XlYh)))))))
1591 return false;
1592
1593 // Check XlYl and XlYh
1594 if (!CheckLoLo(XlYl, X, Y))
1595 return false;
1596 if (!CheckHiLo(XlYh, Y, X))
1597 return false;
1598
1599 return CreateMulHigh(X, Y);
1600 };
1601
1602 auto FoldMulHighLadder = [&](Value *X, Value *Y, Instruction *A,
1603 Instruction *B) {
1604 // xh*yh + c2>>32 + c3>>32
1605 // c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
1606    // or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xl*yh
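    // In 64-bit C terms the first variant reads roughly as follows (a sketch
    // only; the variable names are illustrative):
    //   uint64_t c2 = xh * yl + ((xl * yl) >> 32);
    //   uint64_t c3 = (c2 & 0xffffffff) + xl * yh;
    //   uint64_t high = xh * yh + (c2 >> 32) + (c3 >> 32);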
1607 Value *XlYh, *XhYl, *XlYl, *C2, *C3;
1608 // Strip off the two expected shifts.
1609 if (!match(V: A, P: m_LShr(L: m_Value(V&: C2), R: m_SpecificInt(V: BitWidth / 2))) ||
1610 !match(V: B, P: m_LShr(L: m_Value(V&: C3), R: m_SpecificInt(V: BitWidth / 2))))
1611 return false;
1612
1613 if (match(V: C3, P: m_c_Add(L: m_Add(L: m_Value(), R: m_Value()), R: m_Value())))
1614 std::swap(a&: C2, b&: C3);
1615 // Try to match c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32)
1616 if (match(V: C2,
1617 P: m_c_Add(L: m_c_Add(L: m_And(L: m_Specific(V: C3), R: m_SpecificInt(V: LowMask)),
1618 R: m_Value(V&: XlYh)),
1619 R: m_LShr(L: m_Value(V&: XlYl), R: m_SpecificInt(V: BitWidth / 2)))) ||
1620 match(V: C2, P: m_c_Add(L: m_c_Add(L: m_And(L: m_Specific(V: C3), R: m_SpecificInt(V: LowMask)),
1621 R: m_LShr(L: m_Value(V&: XlYl),
1622 R: m_SpecificInt(V: BitWidth / 2))),
1623 R: m_Value(V&: XlYh))) ||
1624 match(V: C2, P: m_c_Add(L: m_c_Add(L: m_LShr(L: m_Value(V&: XlYl),
1625 R: m_SpecificInt(V: BitWidth / 2)),
1626 R: m_Value(V&: XlYh)),
1627 R: m_And(L: m_Specific(V: C3), R: m_SpecificInt(V: LowMask))))) {
1628 XhYl = C3;
1629 } else {
1630 // Match c3 = c2&0xffffffff + xl*yh
1631 if (!match(V: C3, P: m_c_Add(L: m_And(L: m_Specific(V: C2), R: m_SpecificInt(V: LowMask)),
1632 R: m_Value(V&: XlYh))))
1633 std::swap(a&: C2, b&: C3);
1634 if (!match(V: C3, P: m_c_Add(L: m_OneUse(
1635 SubPattern: m_And(L: m_Specific(V: C2), R: m_SpecificInt(V: LowMask))),
1636 R: m_Value(V&: XlYh))) ||
1637 !C3->hasOneUse() || C2->hasNUsesOrMore(N: 3))
1638 return false;
1639
1640 // Match c2 = xh*yl + (xl*yl >> 32)
1641 if (!match(V: C2, P: m_c_Add(L: m_LShr(L: m_Value(V&: XlYl), R: m_SpecificInt(V: BitWidth / 2)),
1642 R: m_Value(V&: XhYl))))
1643 return false;
1644 }
1645
1646 // Match XhYl and XlYh - they can appear either way around.
1647 if (!CheckHiLo(XlYh, Y, X))
1648 std::swap(a&: XlYh, b&: XhYl);
1649 if (!CheckHiLo(XlYh, Y, X))
1650 return false;
1651 if (!CheckHiLo(XhYl, X, Y))
1652 return false;
1653 if (!CheckLoLo(XlYl, X, Y))
1654 return false;
1655
1656 return CreateMulHigh(X, Y);
1657 };
1658
1659 auto FoldMulHighLadder4 = [&](Value *X, Value *Y, Instruction *A,
1660 Instruction *B, Instruction *C) {
1661    /// Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32;
1662 /// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
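    /// For 64 bits this corresponds roughly to (illustrative sketch):
    ///   uint64_t low = ((xl * yl) >> 32) + ((xl * yh) & 0xffffffff) +
    ///                  ((xh * yl) & 0xffffffff);
    ///   uint64_t high = xh * yh + ((xl * yh) >> 32) + ((xh * yl) >> 32) +
    ///                  (low >> 32);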
1663
1664 // Find A = Low >> 32 and B/C = XhYl>>32, XlYh>>32.
1665 auto ShiftAdd =
1666 m_LShr(L: m_Add(L: m_Value(), R: m_Value()), R: m_SpecificInt(V: BitWidth / 2));
1667 if (!match(V: A, P: ShiftAdd))
1668 std::swap(a&: A, b&: B);
1669 if (!match(V: A, P: ShiftAdd))
1670 std::swap(a&: A, b&: C);
1671 Value *Low;
1672 if (!match(V: A, P: m_LShr(L: m_OneUse(SubPattern: m_Value(V&: Low)), R: m_SpecificInt(V: BitWidth / 2))))
1673 return false;
1674
1675 // Match B == XhYl>>32 and C == XlYh>>32
1676 Value *XhYl, *XlYh;
1677 if (!match(V: B, P: m_LShr(L: m_Value(V&: XhYl), R: m_SpecificInt(V: BitWidth / 2))) ||
1678 !match(V: C, P: m_LShr(L: m_Value(V&: XlYh), R: m_SpecificInt(V: BitWidth / 2))))
1679 return false;
1680 if (!CheckHiLo(XhYl, X, Y))
1681 std::swap(a&: XhYl, b&: XlYh);
1682 if (!CheckHiLo(XhYl, X, Y) || XhYl->hasNUsesOrMore(N: 3))
1683 return false;
1684 if (!CheckHiLo(XlYh, Y, X) || XlYh->hasNUsesOrMore(N: 3))
1685 return false;
1686
1687 // Match Low as XlYl>>32 + XhYl&0xffffffff + XlYh&0xffffffff
1688 Value *XlYl;
1689 if (!match(
1690 V: Low,
1691 P: m_c_Add(
1692 L: m_OneUse(SubPattern: m_c_Add(
1693 L: m_OneUse(SubPattern: m_And(L: m_Specific(V: XhYl), R: m_SpecificInt(V: LowMask))),
1694 R: m_OneUse(SubPattern: m_And(L: m_Specific(V: XlYh), R: m_SpecificInt(V: LowMask))))),
1695 R: m_OneUse(
1696 SubPattern: m_LShr(L: m_Value(V&: XlYl), R: m_SpecificInt(V: BitWidth / 2))))) &&
1697 !match(
1698 V: Low,
1699 P: m_c_Add(
1700 L: m_OneUse(SubPattern: m_c_Add(
1701 L: m_OneUse(SubPattern: m_And(L: m_Specific(V: XhYl), R: m_SpecificInt(V: LowMask))),
1702 R: m_OneUse(
1703 SubPattern: m_LShr(L: m_Value(V&: XlYl), R: m_SpecificInt(V: BitWidth / 2))))),
1704 R: m_OneUse(SubPattern: m_And(L: m_Specific(V: XlYh), R: m_SpecificInt(V: LowMask))))) &&
1705 !match(
1706 V: Low,
1707 P: m_c_Add(
1708 L: m_OneUse(SubPattern: m_c_Add(
1709 L: m_OneUse(SubPattern: m_And(L: m_Specific(V: XlYh), R: m_SpecificInt(V: LowMask))),
1710 R: m_OneUse(
1711 SubPattern: m_LShr(L: m_Value(V&: XlYl), R: m_SpecificInt(V: BitWidth / 2))))),
1712 R: m_OneUse(SubPattern: m_And(L: m_Specific(V: XhYl), R: m_SpecificInt(V: LowMask))))))
1713 return false;
1714 if (!CheckLoLo(XlYl, X, Y))
1715 return false;
1716
1717 return CreateMulHigh(X, Y);
1718 };
1719
1720 auto FoldMulHighCarry4 = [&](Value *X, Value *Y, Instruction *Carry,
1721 Instruction *B, Instruction *C) {
1722    // xh*yh + carry + crosssum>>32 + (xl*yl>>32 + crosssum&0xffffffff) >> 32
1723    // crosssum = xh*yl + xl*yh
1724    // carry = crosssum < xh*yl ? 0x100000000 : 0
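    // For 64 bits this is roughly (illustrative sketch, not from a benchmark):
    //   uint64_t crosssum = xh * yl + xl * yh;                // may wrap
    //   uint64_t carry = crosssum < xh * yl ? 0x100000000 : 0;
    //   uint64_t lowaccum = ((xl * yl) >> 32) + (crosssum & 0xffffffff);
    //   uint64_t high = xh * yh + carry + (crosssum >> 32) + (lowaccum >> 32);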
1725 if (Carry->getOpcode() != Instruction::Select)
1726 std::swap(a&: Carry, b&: B);
1727 if (Carry->getOpcode() != Instruction::Select)
1728 std::swap(a&: Carry, b&: C);
1729
1730 // Carry = CrossSum < XhYl ? 0x100000000 : 0
1731 Value *CrossSum, *XhYl;
1732 if (!match(V: Carry,
1733 P: m_OneUse(SubPattern: m_Select(
1734 C: m_OneUse(SubPattern: m_SpecificICmp(MatchPred: ICmpInst::ICMP_ULT,
1735 L: m_Value(V&: CrossSum), R: m_Value(V&: XhYl))),
1736 L: m_SpecificInt(V: APInt::getOneBitSet(numBits: BitWidth, BitNo: BitWidth / 2)),
1737 R: m_Zero()))))
1738 return false;
1739
1740 if (!match(V: B, P: m_LShr(L: m_Specific(V: CrossSum), R: m_SpecificInt(V: BitWidth / 2))))
1741 std::swap(a&: B, b&: C);
1742 if (!match(V: B, P: m_LShr(L: m_Specific(V: CrossSum), R: m_SpecificInt(V: BitWidth / 2))))
1743 return false;
1744
1745 Value *XlYl, *LowAccum;
1746 if (!match(V: C, P: m_LShr(L: m_Value(V&: LowAccum), R: m_SpecificInt(V: BitWidth / 2))) ||
1747 !match(V: LowAccum, P: m_c_Add(L: m_OneUse(SubPattern: m_LShr(L: m_Value(V&: XlYl),
1748 R: m_SpecificInt(V: BitWidth / 2))),
1749 R: m_OneUse(SubPattern: m_And(L: m_Specific(V: CrossSum),
1750 R: m_SpecificInt(V: LowMask))))) ||
1751 LowAccum->hasNUsesOrMore(N: 3))
1752 return false;
1753 if (!CheckLoLo(XlYl, X, Y))
1754 return false;
1755
1756 if (!CheckHiLo(XhYl, X, Y))
1757 std::swap(a&: X, b&: Y);
1758 if (!CheckHiLo(XhYl, X, Y))
1759 return false;
1760 Value *XlYh;
1761 if (!match(V: CrossSum, P: m_c_Add(L: m_Specific(V: XhYl), R: m_OneUse(SubPattern: m_Value(V&: XlYh)))) ||
1762 !CheckHiLo(XlYh, Y, X) || CrossSum->hasNUsesOrMore(N: 4) ||
1763 XhYl->hasNUsesOrMore(N: 3))
1764 return false;
1765
1766 return CreateMulHigh(X, Y);
1767 };
1768
1769  // X and Y are the two multiply inputs; A, B and C are the other parts of
1770  // the pattern (crosssum>>32, carry, etc.).
1771 Value *X, *Y;
1772 Instruction *A, *B, *C;
1773 auto HiHi = m_OneUse(SubPattern: m_Mul(L: m_LShr(L: m_Value(V&: X), R: m_SpecificInt(V: BitWidth / 2)),
1774 R: m_LShr(L: m_Value(V&: Y), R: m_SpecificInt(V: BitWidth / 2))));
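  // For a 64-bit multiply, HiHi matches the single-use product
  //   (x >> 32) * (y >> 32)
  // in either operand order.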
1775 if ((match(V: &I, P: m_c_Add(L: HiHi, R: m_OneUse(SubPattern: m_Add(L: m_Instruction(I&: A),
1776 R: m_Instruction(I&: B))))) ||
1777 match(V: &I, P: m_c_Add(L: m_Instruction(I&: A),
1778 R: m_OneUse(SubPattern: m_c_Add(L: HiHi, R: m_Instruction(I&: B)))))) &&
1779 A->hasOneUse() && B->hasOneUse())
1780 if (FoldMulHighCarry(X, Y, A, B) || FoldMulHighLadder(X, Y, A, B))
1781 return true;
1782
1783 if ((match(V: &I, P: m_c_Add(L: HiHi, R: m_OneUse(SubPattern: m_c_Add(
1784 L: m_Instruction(I&: A),
1785 R: m_OneUse(SubPattern: m_Add(L: m_Instruction(I&: B),
1786 R: m_Instruction(I&: C))))))) ||
1787 match(V: &I, P: m_c_Add(L: m_Instruction(I&: A),
1788 R: m_OneUse(SubPattern: m_c_Add(
1789 L: HiHi, R: m_OneUse(SubPattern: m_Add(L: m_Instruction(I&: B),
1790 R: m_Instruction(I&: C))))))) ||
1791 match(V: &I, P: m_c_Add(L: m_Instruction(I&: A),
1792 R: m_OneUse(SubPattern: m_c_Add(
1793 L: m_Instruction(I&: B),
1794 R: m_OneUse(SubPattern: m_c_Add(L: HiHi, R: m_Instruction(I&: C))))))) ||
1795 match(V: &I,
1796 P: m_c_Add(L: m_OneUse(SubPattern: m_c_Add(L: HiHi, R: m_Instruction(I&: A))),
1797 R: m_OneUse(SubPattern: m_Add(L: m_Instruction(I&: B), R: m_Instruction(I&: C)))))) &&
1798 A->hasOneUse() && B->hasOneUse() && C->hasOneUse())
1799 return FoldMulHighCarry4(X, Y, A, B, C) ||
1800 FoldMulHighLadder4(X, Y, A, B, C);
1801
1802 return false;
1803}
1804
1805/// This is the entry point for folds that could be implemented in regular
1806/// InstCombine, but are separated out because they are not expected to occur
1807/// frequently and/or require more than a constant-length pattern match.
1808static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
1809 TargetTransformInfo &TTI,
1810 TargetLibraryInfo &TLI, AliasAnalysis &AA,
1811 AssumptionCache &AC, bool &MadeCFGChange) {
1812 bool MadeChange = false;
1813 for (BasicBlock &BB : F) {
1814 // Ignore unreachable basic blocks.
1815 if (!DT.isReachableFromEntry(A: &BB))
1816 continue;
1817
1818 const DataLayout &DL = F.getDataLayout();
1819
1820 // Walk the block backwards for efficiency. We're matching a chain of
1821 // use->defs, so we're more likely to succeed by starting from the bottom.
1822 // Also, we want to avoid matching partial patterns.
1823 // TODO: It would be more efficient if we removed dead instructions
1824 // iteratively in this loop rather than waiting until the end.
1825 for (Instruction &I : make_early_inc_range(Range: llvm::reverse(C&: BB))) {
1826 MadeChange |= foldAnyOrAllBitsSet(I);
1827 MadeChange |= foldGuardedFunnelShift(I, DT);
1828 MadeChange |= tryToRecognizePopCount(I);
1829 MadeChange |= tryToFPToSat(I, TTI);
1830 MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
1831 MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
1832 MadeChange |= foldPatternedLoads(I, DL);
1833 MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
1834 MadeChange |= foldMulHigh(I);
1835      // NOTE: This call may erase the instruction `I`, so it must come last
1836      // in this sequence; otherwise a later fold could operate on an erased
1837      // instruction and introduce bugs.
1838 MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange);
1839 }
1840
1841 // Do this separately to avoid redundantly scanning stores multiple times.
1842 MadeChange |= foldConsecutiveStores(BB, DL, TTI, AA);
1843 }
1844
1845 // We're done with transforms, so remove dead instructions.
1846 if (MadeChange)
1847 for (BasicBlock &BB : F)
1848 SimplifyInstructionsInBlock(BB: &BB);
1849
1850 return MadeChange;
1851}
1852
1853/// This is the entry point for all transforms. Pass manager differences are
1854/// handled in the callers of this function.
1855static bool runImpl(Function &F, AssumptionCache &AC, TargetTransformInfo &TTI,
1856 TargetLibraryInfo &TLI, DominatorTree &DT,
1857 AliasAnalysis &AA, bool &MadeCFGChange) {
1858 bool MadeChange = false;
1859 const DataLayout &DL = F.getDataLayout();
1860 TruncInstCombine TIC(AC, TLI, DL, DT);
1861 MadeChange |= TIC.run(F);
1862 MadeChange |= foldUnusualPatterns(F, DT, TTI, TLI, AA, AC, MadeCFGChange);
1863 return MadeChange;
1864}
1865
1866PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
1867 FunctionAnalysisManager &AM) {
1868 auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
1869 auto &TLI = AM.getResult<TargetLibraryAnalysis>(IR&: F);
1870 auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
1871 auto &TTI = AM.getResult<TargetIRAnalysis>(IR&: F);
1872 auto &AA = AM.getResult<AAManager>(IR&: F);
1873 bool MadeCFGChange = false;
1874 if (!runImpl(F, AC, TTI, TLI, DT, AA, MadeCFGChange)) {
1875 // No changes, all analyses are preserved.
1876 return PreservedAnalyses::all();
1877 }
1878 // Mark all the analyses that instcombine updates as preserved.
1879 PreservedAnalyses PA;
1880 if (MadeCFGChange)
1881 PA.preserve<DominatorTreeAnalysis>();
1882 else
1883 PA.preserveSet<CFGAnalyses>();
1884 return PA;
1885}
1886