1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
19#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
22#include "llvm/ADT/PriorityQueue.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
25#include "llvm/ADT/SetOperations.h"
26#include "llvm/ADT/SetVector.h"
27#include "llvm/ADT/SmallBitVector.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallString.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
33#include "llvm/ADT/iterator_range.h"
34#include "llvm/Analysis/AliasAnalysis.h"
35#include "llvm/Analysis/AssumptionCache.h"
36#include "llvm/Analysis/CodeMetrics.h"
37#include "llvm/Analysis/ConstantFolding.h"
38#include "llvm/Analysis/DemandedBits.h"
39#include "llvm/Analysis/GlobalsModRef.h"
40#include "llvm/Analysis/IVDescriptors.h"
41#include "llvm/Analysis/LoopAccessAnalysis.h"
42#include "llvm/Analysis/LoopInfo.h"
43#include "llvm/Analysis/MemoryLocation.h"
44#include "llvm/Analysis/OptimizationRemarkEmitter.h"
45#include "llvm/Analysis/ScalarEvolution.h"
46#include "llvm/Analysis/ScalarEvolutionExpressions.h"
47#include "llvm/Analysis/TargetLibraryInfo.h"
48#include "llvm/Analysis/TargetTransformInfo.h"
49#include "llvm/Analysis/ValueTracking.h"
50#include "llvm/Analysis/VectorUtils.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
56#include "llvm/IR/DerivedTypes.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
62#include "llvm/IR/Instructions.h"
63#include "llvm/IR/IntrinsicInst.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
67#include "llvm/IR/PatternMatch.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
77#include "llvm/Support/Casting.h"
78#include "llvm/Support/CommandLine.h"
79#include "llvm/Support/Compiler.h"
80#include "llvm/Support/DOTGraphTraits.h"
81#include "llvm/Support/Debug.h"
82#include "llvm/Support/ErrorHandling.h"
83#include "llvm/Support/GraphWriter.h"
84#include "llvm/Support/InstructionCost.h"
85#include "llvm/Support/KnownBits.h"
86#include "llvm/Support/MathExtras.h"
87#include "llvm/Support/raw_ostream.h"
88#include "llvm/Transforms/Utils/InjectTLIMappings.h"
89#include "llvm/Transforms/Utils/Local.h"
90#include "llvm/Transforms/Utils/LoopUtils.h"
91#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(Val: true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<bool>
117 SLPReVec("slp-revec", cl::init(Val: false), cl::Hidden,
118 cl::desc("Enable vectorization for wider vector utilization"));
119
120static cl::opt<int>
121 SLPCostThreshold("slp-threshold", cl::init(Val: 0), cl::Hidden,
122 cl::desc("Only vectorize if you gain more than this "
123 "number "));
124
125static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
126 "slp-skip-early-profitability-check", cl::init(Val: false), cl::Hidden,
127 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
128 "heuristics and makes vectorization decision via cost modeling."));
129
130static cl::opt<bool>
131ShouldVectorizeHor("slp-vectorize-hor", cl::init(Val: true), cl::Hidden,
132 cl::desc("Attempt to vectorize horizontal reductions"));
133
134static cl::opt<bool> ShouldStartVectorizeHorAtStore(
135 "slp-vectorize-hor-store", cl::init(Val: false), cl::Hidden,
136 cl::desc(
137 "Attempt to vectorize horizontal reductions feeding into a store"));
138
139// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
140// even if we match a reduction but do not vectorize in the end.
141static cl::opt<bool> AllowHorRdxIdenityOptimization(
142 "slp-optimize-identity-hor-reduction-ops", cl::init(Val: true), cl::Hidden,
143 cl::desc("Allow optimization of original scalar identity operations on "
144 "matched horizontal reductions."));
145
146static cl::opt<int>
147MaxVectorRegSizeOption("slp-max-reg-size", cl::init(Val: 128), cl::Hidden,
148 cl::desc("Attempt to vectorize for this register size in bits"));
149
150static cl::opt<unsigned>
151MaxVFOption("slp-max-vf", cl::init(Val: 0), cl::Hidden,
152 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
153
/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
158static cl::opt<int>
159ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(Val: 100000), cl::Hidden,
160 cl::desc("Limit the size of the SLP scheduling region per block"));
161
162static cl::opt<int> MinVectorRegSizeOption(
163 "slp-min-reg-size", cl::init(Val: 128), cl::Hidden,
164 cl::desc("Attempt to vectorize for this register size in bits"));
165
166static cl::opt<unsigned> RecursionMaxDepth(
167 "slp-recursion-max-depth", cl::init(Val: 12), cl::Hidden,
168 cl::desc("Limit the recursion depth when building a vectorizable tree"));
169
170static cl::opt<unsigned> MinTreeSize(
171 "slp-min-tree-size", cl::init(Val: 3), cl::Hidden,
172 cl::desc("Only vectorize small trees if they are fully vectorizable"));
173
174// The maximum depth that the look-ahead score heuristic will explore.
175// The higher this value, the higher the compilation time overhead.
176static cl::opt<int> LookAheadMaxDepth(
177 "slp-max-look-ahead-depth", cl::init(Val: 2), cl::Hidden,
178 cl::desc("The maximum look-ahead depth for operand reordering scores"));
179
// The maximum depth that the look-ahead score heuristic will explore
// when it is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation-time overhead; but unlike
// the similar limit for operand reordering, this is used less frequently, so
// the impact of a higher value is less noticeable.
185static cl::opt<int> RootLookAheadMaxDepth(
186 "slp-max-root-look-ahead-depth", cl::init(Val: 2), cl::Hidden,
187 cl::desc("The maximum look-ahead depth for searching best rooting option"));
188
189static cl::opt<unsigned> MinProfitableStridedLoads(
190 "slp-min-strided-loads", cl::init(Val: 2), cl::Hidden,
    cl::desc("The minimum number of loads that should be considered strided "
             "if the stride is > 1 or is a runtime value"));
193
194static cl::opt<unsigned> MaxProfitableLoadStride(
195 "slp-max-stride", cl::init(Val: 8), cl::Hidden,
196 cl::desc("The maximum stride, considered to be profitable."));
197
198static cl::opt<bool>
199 ViewSLPTree("view-slp-tree", cl::Hidden,
200 cl::desc("Display the SLP trees with Graphviz"));
201
202static cl::opt<bool> VectorizeNonPowerOf2(
203 "slp-vectorize-non-power-of-2", cl::init(Val: false), cl::Hidden,
204 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
205
206// Limit the number of alias checks. The limit is chosen so that
207// it has no negative effect on the llvm benchmarks.
208static const unsigned AliasedCheckLimit = 10;
209
// Limit on the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.
212static constexpr int UsesLimit = 64;
213
214// Another limit for the alias checks: The maximum distance between load/store
215// instructions where alias checks are done.
216// This limit is useful for very large basic blocks.
217static const unsigned MaxMemDepDistance = 160;
218
219/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
220/// regions to be handled.
221static const int MinScheduleRegionSize = 16;
222
223/// Maximum allowed number of operands in the PHI nodes.
224static const unsigned MaxPHINumOperands = 128;
225
226/// Predicate for the element types that the SLP vectorizer supports.
227///
/// The most important things to filter here are types which are invalid in
/// LLVM vectors. We also filter target-specific types which have absolutely no
/// meaningful vectorization path, such as x86_fp80 and ppc_f128. This just
231/// avoids spending time checking the cost model and realizing that they will
232/// be inevitably scalarized.
233static bool isValidElementType(Type *Ty) {
234 // TODO: Support ScalableVectorType.
235 if (SLPReVec && isa<FixedVectorType>(Val: Ty))
236 Ty = Ty->getScalarType();
237 return VectorType::isValidElementType(ElemTy: Ty) && !Ty->isX86_FP80Ty() &&
238 !Ty->isPPC_FP128Ty();
239}
240
241/// \returns the number of elements for Ty.
242static unsigned getNumElements(Type *Ty) {
243 assert(!isa<ScalableVectorType>(Ty) &&
244 "ScalableVectorType is not supported.");
245 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Ty))
246 return VecTy->getNumElements();
247 return 1;
248}
249
250/// \returns the vector type of ScalarTy based on vectorization factor.
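/// For illustration (values are hypothetical): getWidenedType(i32, 4) yields
/// <4 x i32>; with SLPReVec, getWidenedType(<2 x float>, 4) yields
/// <8 x float>, because the "scalar" element is itself a 2-element vector and
/// the element counts multiply.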
251static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
252 return FixedVectorType::get(ElementType: ScalarTy->getScalarType(),
253 NumElts: VF * getNumElements(Ty: ScalarTy));
254}
255
256/// \returns True if the value is a constant (but not globals/constant
257/// expressions).
258static bool isConstant(Value *V) {
259 return isa<Constant>(Val: V) && !isa<ConstantExpr, GlobalValue>(Val: V);
260}
261
/// Checks if \p V is one of the vector-like instructions, i.e. undef, an
/// insertelement/extractelement with constant indices for a fixed vector type,
/// or an extractvalue instruction.
265static bool isVectorLikeInstWithConstOps(Value *V) {
266 if (!isa<InsertElementInst, ExtractElementInst>(Val: V) &&
267 !isa<ExtractValueInst, UndefValue>(Val: V))
268 return false;
269 auto *I = dyn_cast<Instruction>(Val: V);
270 if (!I || isa<ExtractValueInst>(Val: I))
271 return true;
272 if (!isa<FixedVectorType>(Val: I->getOperand(i: 0)->getType()))
273 return false;
274 if (isa<ExtractElementInst>(Val: I))
275 return isConstant(V: I->getOperand(i: 1));
276 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
277 return isConstant(V: I->getOperand(i: 2));
278}
279
280/// Returns power-of-2 number of elements in a single register (part), given the
281/// total number of elements \p Size and number of registers (parts) \p
282/// NumParts.
283static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
284 return PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: NumParts));
285}
286
/// Returns the correct remaining number of elements, given the total number of
/// elements \p Size, the (power-of-2) number of elements in a single register
/// \p PartNumElems, and the current register (part) \p Part.
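/// For example, with Size = 6 scalars split over NumParts = 2 registers,
/// getPartNumElems(6, 2) == 4 and this function returns 4 for Part 0 and 2 for
/// Part 1.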
290static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
291 unsigned Part) {
292 return std::min<unsigned>(a: PartNumElems, b: Size - Part * PartNumElems);
293}
294
295#if !defined(NDEBUG)
296/// Print a short descriptor of the instruction bundle suitable for debug output.
297static std::string shortBundleName(ArrayRef<Value *> VL) {
298 std::string Result;
299 raw_string_ostream OS(Result);
300 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
301 OS.flush();
302 return Result;
303}
304#endif
305
306/// \returns true if all of the instructions in \p VL are in the same block or
307/// false otherwise.
308static bool allSameBlock(ArrayRef<Value *> VL) {
309 Instruction *I0 = dyn_cast<Instruction>(Val: VL[0]);
310 if (!I0)
311 return false;
312 if (all_of(Range&: VL, P: isVectorLikeInstWithConstOps))
313 return true;
314
315 BasicBlock *BB = I0->getParent();
316 for (int I = 1, E = VL.size(); I < E; I++) {
317 auto *II = dyn_cast<Instruction>(Val: VL[I]);
318 if (!II)
319 return false;
320
321 if (BB != II->getParent())
322 return false;
323 }
324 return true;
325}
326
327/// \returns True if all of the values in \p VL are constants (but not
328/// globals/constant expressions).
329static bool allConstant(ArrayRef<Value *> VL) {
330 // Constant expressions and globals can't be vectorized like normal integer/FP
331 // constants.
332 return all_of(Range&: VL, P: isConstant);
333}
334
335/// \returns True if all of the values in \p VL are identical or some of them
336/// are UndefValue.
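/// For example, {%a, undef, %a, %a} is a splat, while {undef, undef} and
/// {%a, %b, %a, %a} are not (here %a and %b are arbitrary distinct values).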
337static bool isSplat(ArrayRef<Value *> VL) {
338 Value *FirstNonUndef = nullptr;
339 for (Value *V : VL) {
340 if (isa<UndefValue>(Val: V))
341 continue;
342 if (!FirstNonUndef) {
343 FirstNonUndef = V;
344 continue;
345 }
346 if (V != FirstNonUndef)
347 return false;
348 }
349 return FirstNonUndef != nullptr;
350}
351
352/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
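/// For illustration, a 'sub' whose only use is an equality comparison against
/// zero is treated as commutative here, e.g.:
///   %d = sub i32 %a, %b
///   %c = icmp eq i32 %d, 0
/// because swapping %a and %b does not change %c; the same applies to a 'sub'
/// that only feeds @llvm.abs under the wrap-flag conditions checked below.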
353static bool isCommutative(Instruction *I) {
354 if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
355 return Cmp->isCommutative();
356 if (auto *BO = dyn_cast<BinaryOperator>(Val: I))
357 return BO->isCommutative() ||
358 (BO->getOpcode() == Instruction::Sub &&
359 !BO->hasNUsesOrMore(N: UsesLimit) &&
360 all_of(
361 Range: BO->uses(),
362 P: [](const Use &U) {
363 // Commutative, if icmp eq/ne sub, 0
364 ICmpInst::Predicate Pred;
365 if (match(V: U.getUser(),
366 P: m_ICmp(Pred, L: m_Specific(V: U.get()), R: m_Zero())) &&
367 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
368 return true;
369 // Commutative, if abs(sub nsw, true) or abs(sub, false).
370 ConstantInt *Flag;
371 return match(V: U.getUser(),
372 P: m_Intrinsic<Intrinsic::abs>(
373 Op0: m_Specific(V: U.get()), Op1: m_ConstantInt(CI&: Flag))) &&
374 (!cast<Instruction>(Val: U.get())->hasNoSignedWrap() ||
375 Flag->isOne());
376 })) ||
377 (BO->getOpcode() == Instruction::FSub &&
378 !BO->hasNUsesOrMore(N: UsesLimit) &&
379 all_of(Range: BO->uses(), P: [](const Use &U) {
380 return match(V: U.getUser(),
381 P: m_Intrinsic<Intrinsic::fabs>(Op0: m_Specific(V: U.get())));
382 }));
383 return I->isCommutative();
384}
385
386template <typename T>
387static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
388 unsigned Offset) {
389 static_assert(std::is_same_v<T, InsertElementInst> ||
390 std::is_same_v<T, ExtractElementInst>,
391 "unsupported T");
392 int Index = Offset;
393 if (const auto *IE = dyn_cast<T>(Inst)) {
394 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
395 if (!VT)
396 return std::nullopt;
397 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
398 if (!CI)
399 return std::nullopt;
400 if (CI->getValue().uge(VT->getNumElements()))
401 return std::nullopt;
402 Index *= VT->getNumElements();
403 Index += CI->getZExtValue();
404 return Index;
405 }
406 return std::nullopt;
407}
408
/// \returns the inserting or extracting index of an InsertElement,
/// ExtractElement or InsertValue instruction, using \p Offset as the base
/// offset for the index, or std::nullopt if the index is not an immediate.
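/// For illustration, for
///   %r = insertvalue [2 x [3 x i32]] %agg, i32 %v, 1, 2
/// the returned flattened index is 1 * 3 + 2 == 5 (with the default Offset of
/// 0).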
412static std::optional<unsigned> getElementIndex(const Value *Inst,
413 unsigned Offset = 0) {
414 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
415 return Index;
416 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
417 return Index;
418
419 int Index = Offset;
420
421 const auto *IV = dyn_cast<InsertValueInst>(Val: Inst);
422 if (!IV)
423 return std::nullopt;
424
425 Type *CurrentType = IV->getType();
426 for (unsigned I : IV->indices()) {
427 if (const auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
428 Index *= ST->getNumElements();
429 CurrentType = ST->getElementType(N: I);
430 } else if (const auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
431 Index *= AT->getNumElements();
432 CurrentType = AT->getElementType();
433 } else {
434 return std::nullopt;
435 }
436 Index += I;
437 }
438 return Index;
439}
440
441namespace {
442/// Specifies the way the mask should be analyzed for undefs/poisonous elements
443/// in the shuffle mask.
444enum class UseMask {
445 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
446 ///< check for the mask elements for the first argument (mask
447 ///< indices are in range [0:VF)).
448 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
449 ///< for the mask elements for the second argument (mask indices
450 ///< are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as used, since they
               ///< will be filled in later. Non-undef elements are considered
               ///< unused, since they are already fixed in the mask.
455};
456} // namespace
457
458/// Prepares a use bitset for the given mask either for the first argument or
459/// for the second.
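/// For illustration, with VF = 4 and Mask = {0, 5, poison, 1}: FirstArg clears
/// bits 0 and 1 (those elements of the first vector are read by the mask),
/// SecondArg clears bit 1 (element 5 - VF of the second vector), and
/// UndefsAsMask clears only bit 2 (the poison mask position).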
460static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
461 UseMask MaskArg) {
462 SmallBitVector UseMask(VF, true);
463 for (auto [Idx, Value] : enumerate(First&: Mask)) {
464 if (Value == PoisonMaskElem) {
465 if (MaskArg == UseMask::UndefsAsMask)
466 UseMask.reset(Idx);
467 continue;
468 }
469 if (MaskArg == UseMask::FirstArg && Value < VF)
470 UseMask.reset(Idx: Value);
471 else if (MaskArg == UseMask::SecondArg && Value >= VF)
472 UseMask.reset(Idx: Value - VF);
473 }
474 return UseMask;
475}
476
477/// Checks if the given value is actually an undefined constant vector.
478/// Also, if the \p UseMask is not empty, tries to check if the non-masked
479/// elements actually mask the insertelement buildvector, if any.
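/// For illustration, isUndefVector(poison) and isUndefVector(undef) return
/// all-ones masks. For
///   %v = insertelement <4 x i32> poison, i32 %x, i32 1
/// with a UseMask whose bit 1 is cleared (lane 1 is actually read), the result
/// has only bit 1 cleared, i.e. every lane except lane 1 is known to be undef.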
480template <bool IsPoisonOnly = false>
481static SmallBitVector isUndefVector(const Value *V,
482 const SmallBitVector &UseMask = {}) {
483 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
484 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
485 if (isa<T>(V))
486 return Res;
487 auto *VecTy = dyn_cast<FixedVectorType>(Val: V->getType());
488 if (!VecTy)
489 return Res.reset();
490 auto *C = dyn_cast<Constant>(Val: V);
491 if (!C) {
492 if (!UseMask.empty()) {
493 const Value *Base = V;
494 while (auto *II = dyn_cast<InsertElementInst>(Val: Base)) {
495 Base = II->getOperand(i_nocapture: 0);
496 if (isa<T>(II->getOperand(i_nocapture: 1)))
497 continue;
498 std::optional<unsigned> Idx = getElementIndex(Inst: II);
499 if (!Idx) {
500 Res.reset();
501 return Res;
502 }
503 if (*Idx < UseMask.size() && !UseMask.test(Idx: *Idx))
504 Res.reset(Idx: *Idx);
505 }
506 // TODO: Add analysis for shuffles here too.
507 if (V == Base) {
508 Res.reset();
509 } else {
510 SmallBitVector SubMask(UseMask.size(), false);
511 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
512 }
513 } else {
514 Res.reset();
515 }
516 return Res;
517 }
518 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
519 if (Constant *Elem = C->getAggregateElement(Elt: I))
520 if (!isa<T>(Elem) &&
521 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(Idx: I))))
522 Res.reset(Idx: I);
523 }
524 return Res;
525}
526
527/// Checks if the vector of instructions can be represented as a shuffle, like:
528/// %x0 = extractelement <4 x i8> %x, i32 0
529/// %x3 = extractelement <4 x i8> %x, i32 3
530/// %y1 = extractelement <4 x i8> %y, i32 1
531/// %y2 = extractelement <4 x i8> %y, i32 2
532/// %x0x0 = mul i8 %x0, %x0
533/// %x3x3 = mul i8 %x3, %x3
534/// %y1y1 = mul i8 %y1, %y1
535/// %y2y2 = mul i8 %y2, %y2
536/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
537/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
538/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
539/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
540/// ret <4 x i8> %ins4
541/// can be transformed into:
542/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
543/// i32 6>
544/// %2 = mul <4 x i8> %1, %1
545/// ret <4 x i8> %2
/// Mask will contain the shuffle mask equivalent to the extracted elements.
547/// TODO: Can we split off and reuse the shuffle mask detection from
548/// ShuffleVectorInst/getShuffleCost?
549static std::optional<TargetTransformInfo::ShuffleKind>
550isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
551 const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst>);
552 if (It == VL.end())
553 return std::nullopt;
554 unsigned Size =
555 std::accumulate(first: VL.begin(), last: VL.end(), init: 0u, binary_op: [](unsigned S, Value *V) {
556 auto *EI = dyn_cast<ExtractElementInst>(Val: V);
557 if (!EI)
558 return S;
559 auto *VTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
560 if (!VTy)
561 return S;
562 return std::max(a: S, b: VTy->getNumElements());
563 });
564
565 Value *Vec1 = nullptr;
566 Value *Vec2 = nullptr;
567 bool HasNonUndefVec = any_of(Range&: VL, P: [](Value *V) {
568 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
569 if (!EE)
570 return false;
571 Value *Vec = EE->getVectorOperand();
572 if (isa<UndefValue>(Val: Vec))
573 return false;
574 return isGuaranteedNotToBePoison(V: Vec);
575 });
576 enum ShuffleMode { Unknown, Select, Permute };
577 ShuffleMode CommonShuffleMode = Unknown;
578 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
579 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
580 // Undef can be represented as an undef element in a vector.
581 if (isa<UndefValue>(Val: VL[I]))
582 continue;
583 auto *EI = cast<ExtractElementInst>(Val: VL[I]);
584 if (isa<ScalableVectorType>(Val: EI->getVectorOperandType()))
585 return std::nullopt;
586 auto *Vec = EI->getVectorOperand();
587 // We can extractelement from undef or poison vector.
588 if (isUndefVector</*isPoisonOnly=*/true>(V: Vec).all())
589 continue;
590 // All vector operands must have the same number of vector elements.
591 if (isa<UndefValue>(Val: Vec)) {
592 Mask[I] = I;
593 } else {
594 if (isa<UndefValue>(Val: EI->getIndexOperand()))
595 continue;
596 auto *Idx = dyn_cast<ConstantInt>(Val: EI->getIndexOperand());
597 if (!Idx)
598 return std::nullopt;
599 // Undefined behavior if Idx is negative or >= Size.
600 if (Idx->getValue().uge(RHS: Size))
601 continue;
602 unsigned IntIdx = Idx->getValue().getZExtValue();
603 Mask[I] = IntIdx;
604 }
605 if (isUndefVector(V: Vec).all() && HasNonUndefVec)
606 continue;
607 // For correct shuffling we have to have at most 2 different vector operands
608 // in all extractelement instructions.
609 if (!Vec1 || Vec1 == Vec) {
610 Vec1 = Vec;
611 } else if (!Vec2 || Vec2 == Vec) {
612 Vec2 = Vec;
613 Mask[I] += Size;
614 } else {
615 return std::nullopt;
616 }
617 if (CommonShuffleMode == Permute)
618 continue;
619 // If the extract index is not the same as the operation number, it is a
620 // permutation.
621 if (Mask[I] % Size != I) {
622 CommonShuffleMode = Permute;
623 continue;
624 }
625 CommonShuffleMode = Select;
626 }
627 // If we're not crossing lanes in different vectors, consider it as blending.
628 if (CommonShuffleMode == Select && Vec2)
629 return TargetTransformInfo::SK_Select;
630 // If Vec2 was never used, we have a permutation of a single vector, otherwise
631 // we have permutation of 2 vectors.
632 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
633 : TargetTransformInfo::SK_PermuteSingleSrc;
634}
635
636/// \returns True if Extract{Value,Element} instruction extracts element Idx.
637static std::optional<unsigned> getExtractIndex(Instruction *E) {
638 unsigned Opcode = E->getOpcode();
639 assert((Opcode == Instruction::ExtractElement ||
640 Opcode == Instruction::ExtractValue) &&
641 "Expected extractelement or extractvalue instruction.");
642 if (Opcode == Instruction::ExtractElement) {
643 auto *CI = dyn_cast<ConstantInt>(Val: E->getOperand(i: 1));
644 if (!CI)
645 return std::nullopt;
646 return CI->getZExtValue();
647 }
648 auto *EI = cast<ExtractValueInst>(Val: E);
649 if (EI->getNumIndices() != 1)
650 return std::nullopt;
651 return *EI->idx_begin();
652}
653
654namespace {
655
656/// Main data required for vectorization of instructions.
657struct InstructionsState {
658 /// The very first instruction in the list with the main opcode.
659 Value *OpValue = nullptr;
660
661 /// The main/alternate instruction.
662 Instruction *MainOp = nullptr;
663 Instruction *AltOp = nullptr;
664
665 /// The main/alternate opcodes for the list of instructions.
666 unsigned getOpcode() const {
667 return MainOp ? MainOp->getOpcode() : 0;
668 }
669
670 unsigned getAltOpcode() const {
671 return AltOp ? AltOp->getOpcode() : 0;
672 }
673
674 /// Some of the instructions in the list have alternate opcodes.
675 bool isAltShuffle() const { return AltOp != MainOp; }
676
677 bool isOpcodeOrAlt(Instruction *I) const {
678 unsigned CheckedOpcode = I->getOpcode();
679 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
680 }
681
682 InstructionsState() = delete;
683 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
684 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
685};
686
687} // end anonymous namespace
688
689/// Chooses the correct key for scheduling data. If \p Op has the same (or
690/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
691/// OpValue.
692static Value *isOneOf(const InstructionsState &S, Value *Op) {
693 auto *I = dyn_cast<Instruction>(Val: Op);
694 if (I && S.isOpcodeOrAlt(I))
695 return Op;
696 return S.OpValue;
697}
698
699/// \returns true if \p Opcode is allowed as part of the main/alternate
700/// instruction for SLP vectorization.
701///
702/// Example of unsupported opcode is SDIV that can potentially cause UB if the
703/// "shuffled out" lane would result in division by zero.
704static bool isValidForAlternation(unsigned Opcode) {
705 if (Instruction::isIntDivRem(Opcode))
706 return false;
707
708 return true;
709}
710
711static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
712 const TargetLibraryInfo &TLI,
713 unsigned BaseIndex = 0);
714
715/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
716/// compatible instructions or constants, or just some other regular values.
717static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
718 Value *Op1, const TargetLibraryInfo &TLI) {
719 return (isConstant(V: BaseOp0) && isConstant(V: Op0)) ||
720 (isConstant(V: BaseOp1) && isConstant(V: Op1)) ||
721 (!isa<Instruction>(Val: BaseOp0) && !isa<Instruction>(Val: Op0) &&
722 !isa<Instruction>(Val: BaseOp1) && !isa<Instruction>(Val: Op1)) ||
723 BaseOp0 == Op0 || BaseOp1 == Op1 ||
724 getSameOpcode(VL: {BaseOp0, Op0}, TLI).getOpcode() ||
725 getSameOpcode(VL: {BaseOp1, Op1}, TLI).getOpcode();
726}
727
728/// \returns true if a compare instruction \p CI has similar "look" and
729/// same predicate as \p BaseCI, "as is" or with its operands and predicate
730/// swapped, false otherwise.
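/// For example, 'icmp sgt i32 %a, %b' is treated as the same comparison as
/// 'icmp slt i32 %b, %a': the predicate and the operands are both swapped.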
731static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
732 const TargetLibraryInfo &TLI) {
733 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
734 "Assessing comparisons of different types?");
735 CmpInst::Predicate BasePred = BaseCI->getPredicate();
736 CmpInst::Predicate Pred = CI->getPredicate();
737 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(pred: Pred);
738
739 Value *BaseOp0 = BaseCI->getOperand(i_nocapture: 0);
740 Value *BaseOp1 = BaseCI->getOperand(i_nocapture: 1);
741 Value *Op0 = CI->getOperand(i_nocapture: 0);
742 Value *Op1 = CI->getOperand(i_nocapture: 1);
743
744 return (BasePred == Pred &&
745 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
746 (BasePred == SwappedPred &&
747 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0: Op1, Op1: Op0, TLI));
748}
749
/// \returns analysis of the instructions in \p VL described in
/// InstructionsState, i.e. the opcode with which we suppose the whole list
/// could be vectorized even if its structure is diverse.
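/// For illustration, for VL = {add, sub, add, sub} this returns an
/// InstructionsState whose MainOp is the first 'add' and whose AltOp is the
/// first 'sub' (an alternate-opcode bundle), whereas VL = {add, sub, mul}
/// introduces a third opcode and yields a state with null MainOp/AltOp.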
753static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
754 const TargetLibraryInfo &TLI,
755 unsigned BaseIndex) {
756 // Make sure these are all Instructions.
757 if (llvm::any_of(Range&: VL, P: [](Value *V) { return !isa<Instruction>(Val: V); }))
758 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
759
760 bool IsCastOp = isa<CastInst>(Val: VL[BaseIndex]);
761 bool IsBinOp = isa<BinaryOperator>(Val: VL[BaseIndex]);
762 bool IsCmpOp = isa<CmpInst>(Val: VL[BaseIndex]);
763 CmpInst::Predicate BasePred =
764 IsCmpOp ? cast<CmpInst>(Val: VL[BaseIndex])->getPredicate()
765 : CmpInst::BAD_ICMP_PREDICATE;
766 unsigned Opcode = cast<Instruction>(Val: VL[BaseIndex])->getOpcode();
767 unsigned AltOpcode = Opcode;
768 unsigned AltIndex = BaseIndex;
769
770 bool SwappedPredsCompatible = [&]() {
771 if (!IsCmpOp)
772 return false;
773 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
774 UniquePreds.insert(X: BasePred);
775 UniqueNonSwappedPreds.insert(X: BasePred);
776 for (Value *V : VL) {
777 auto *I = dyn_cast<CmpInst>(Val: V);
778 if (!I)
779 return false;
780 CmpInst::Predicate CurrentPred = I->getPredicate();
781 CmpInst::Predicate SwappedCurrentPred =
782 CmpInst::getSwappedPredicate(pred: CurrentPred);
783 UniqueNonSwappedPreds.insert(X: CurrentPred);
784 if (!UniquePreds.contains(key: CurrentPred) &&
785 !UniquePreds.contains(key: SwappedCurrentPred))
786 UniquePreds.insert(X: CurrentPred);
787 }
    // If the total number of distinct predicates is > 2, but only 2 remain
    // once swapped predicates are treated as equal, consider swappable
    // predicates as compatible opcodes rather than as alternates.
791 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
792 }();
793 // Check for one alternate opcode from another BinaryOperator.
794 // TODO - generalize to support all operators (types, calls etc.).
795 auto *IBase = cast<Instruction>(Val: VL[BaseIndex]);
796 Intrinsic::ID BaseID = 0;
797 SmallVector<VFInfo> BaseMappings;
798 if (auto *CallBase = dyn_cast<CallInst>(Val: IBase)) {
799 BaseID = getVectorIntrinsicIDForCall(CI: CallBase, TLI: &TLI);
800 BaseMappings = VFDatabase(*CallBase).getMappings(CI: *CallBase);
801 if (!isTriviallyVectorizable(ID: BaseID) && BaseMappings.empty())
802 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
803 }
804 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
805 auto *I = cast<Instruction>(Val: VL[Cnt]);
806 unsigned InstOpcode = I->getOpcode();
807 if (IsBinOp && isa<BinaryOperator>(Val: I)) {
808 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
809 continue;
810 if (Opcode == AltOpcode && isValidForAlternation(Opcode: InstOpcode) &&
811 isValidForAlternation(Opcode)) {
812 AltOpcode = InstOpcode;
813 AltIndex = Cnt;
814 continue;
815 }
816 } else if (IsCastOp && isa<CastInst>(Val: I)) {
817 Value *Op0 = IBase->getOperand(i: 0);
818 Type *Ty0 = Op0->getType();
819 Value *Op1 = I->getOperand(i: 0);
820 Type *Ty1 = Op1->getType();
821 if (Ty0 == Ty1) {
822 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
823 continue;
824 if (Opcode == AltOpcode) {
825 assert(isValidForAlternation(Opcode) &&
826 isValidForAlternation(InstOpcode) &&
827 "Cast isn't safe for alternation, logic needs to be updated!");
828 AltOpcode = InstOpcode;
829 AltIndex = Cnt;
830 continue;
831 }
832 }
833 } else if (auto *Inst = dyn_cast<CmpInst>(Val: VL[Cnt]); Inst && IsCmpOp) {
834 auto *BaseInst = cast<CmpInst>(Val: VL[BaseIndex]);
835 Type *Ty0 = BaseInst->getOperand(i_nocapture: 0)->getType();
836 Type *Ty1 = Inst->getOperand(i_nocapture: 0)->getType();
837 if (Ty0 == Ty1) {
838 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        // Check for compatible operands. If the corresponding operands are
        // not compatible, we need to perform alternate vectorization.
841 CmpInst::Predicate CurrentPred = Inst->getPredicate();
842 CmpInst::Predicate SwappedCurrentPred =
843 CmpInst::getSwappedPredicate(pred: CurrentPred);
844
845 if ((E == 2 || SwappedPredsCompatible) &&
846 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
847 continue;
848
849 if (isCmpSameOrSwapped(BaseCI: BaseInst, CI: Inst, TLI))
850 continue;
851 auto *AltInst = cast<CmpInst>(Val: VL[AltIndex]);
852 if (AltIndex != BaseIndex) {
853 if (isCmpSameOrSwapped(BaseCI: AltInst, CI: Inst, TLI))
854 continue;
855 } else if (BasePred != CurrentPred) {
856 assert(
857 isValidForAlternation(InstOpcode) &&
858 "CmpInst isn't safe for alternation, logic needs to be updated!");
859 AltIndex = Cnt;
860 continue;
861 }
862 CmpInst::Predicate AltPred = AltInst->getPredicate();
863 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
864 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
865 continue;
866 }
867 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
868 if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
869 if (Gep->getNumOperands() != 2 ||
870 Gep->getOperand(i_nocapture: 0)->getType() != IBase->getOperand(i: 0)->getType())
871 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
872 } else if (auto *EI = dyn_cast<ExtractElementInst>(Val: I)) {
873 if (!isVectorLikeInstWithConstOps(V: EI))
874 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
875 } else if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
876 auto *BaseLI = cast<LoadInst>(Val: IBase);
877 if (!LI->isSimple() || !BaseLI->isSimple())
878 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
879 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
880 auto *CallBase = cast<CallInst>(Val: IBase);
881 if (Call->getCalledFunction() != CallBase->getCalledFunction())
882 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
883 if (Call->hasOperandBundles() && (!CallBase->hasOperandBundles() ||
884 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
885 Call->op_begin() + Call->getBundleOperandsEndIndex(),
886 CallBase->op_begin() +
887 CallBase->getBundleOperandsStartIndex())))
888 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
889 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI: &TLI);
890 if (ID != BaseID)
891 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
892 if (!ID) {
893 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(CI: *Call);
894 if (Mappings.size() != BaseMappings.size() ||
895 Mappings.front().ISA != BaseMappings.front().ISA ||
896 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
897 Mappings.front().VectorName != BaseMappings.front().VectorName ||
898 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
899 Mappings.front().Shape.Parameters !=
900 BaseMappings.front().Shape.Parameters)
901 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
902 }
903 }
904 continue;
905 }
906 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
907 }
908
909 return InstructionsState(VL[BaseIndex], cast<Instruction>(Val: VL[BaseIndex]),
910 cast<Instruction>(Val: VL[AltIndex]));
911}
912
913/// \returns true if all of the values in \p VL have the same type or false
914/// otherwise.
915static bool allSameType(ArrayRef<Value *> VL) {
916 Type *Ty = VL.front()->getType();
917 return all_of(Range: VL.drop_front(), P: [&](Value *V) { return V->getType() == Ty; });
918}
919
/// \returns True if an in-tree use also needs an extract. This refers to a
/// possible scalar operand in a vectorized instruction.
922static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
923 TargetLibraryInfo *TLI) {
924 unsigned Opcode = UserInst->getOpcode();
925 switch (Opcode) {
926 case Instruction::Load: {
927 LoadInst *LI = cast<LoadInst>(Val: UserInst);
928 return (LI->getPointerOperand() == Scalar);
929 }
930 case Instruction::Store: {
931 StoreInst *SI = cast<StoreInst>(Val: UserInst);
932 return (SI->getPointerOperand() == Scalar);
933 }
934 case Instruction::Call: {
935 CallInst *CI = cast<CallInst>(Val: UserInst);
936 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
937 return any_of(Range: enumerate(First: CI->args()), P: [&](auto &&Arg) {
938 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
939 Arg.value().get() == Scalar;
940 });
941 }
942 default:
943 return false;
944 }
945}
946
/// \returns the AA location that is being accessed by the instruction.
948static MemoryLocation getLocation(Instruction *I) {
949 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
950 return MemoryLocation::get(SI);
951 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
952 return MemoryLocation::get(LI);
953 return MemoryLocation();
954}
955
956/// \returns True if the instruction is not a volatile or atomic load/store.
957static bool isSimple(Instruction *I) {
958 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
959 return LI->isSimple();
960 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
961 return SI->isSimple();
962 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: I))
963 return !MI->isVolatile();
964 return true;
965}
966
967/// Shuffles \p Mask in accordance with the given \p SubMask.
968/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
969/// one but two input vectors.
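/// For illustration, composing Mask = {1, 0, 3, 2} with SubMask = {2, 2, 0, 1}
/// yields {3, 3, 1, 0}: each SubMask element selects a position in the current
/// Mask.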
970static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
971 bool ExtendingManyInputs = false) {
972 if (SubMask.empty())
973 return;
974 assert(
975 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
976 // Check if input scalars were extended to match the size of other node.
977 (SubMask.size() == Mask.size() &&
978 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
979 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
980 "SubMask with many inputs support must be larger than the mask.");
981 if (Mask.empty()) {
982 Mask.append(in_start: SubMask.begin(), in_end: SubMask.end());
983 return;
984 }
985 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
986 int TermValue = std::min(a: Mask.size(), b: SubMask.size());
987 for (int I = 0, E = SubMask.size(); I < E; ++I) {
988 if (SubMask[I] == PoisonMaskElem ||
989 (!ExtendingManyInputs &&
990 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
991 continue;
992 NewMask[I] = Mask[SubMask[I]];
993 }
994 Mask.swap(RHS&: NewMask);
995}
996
/// Order may have elements assigned the special value (size) which is out of
/// bounds. Such indices only appear in places which correspond to undef values
/// (see canReuseExtract for details) and are used to avoid undef values having
/// an effect on operand ordering.
1001/// The first loop below simply finds all unused indices and then the next loop
1002/// nest assigns these indices for undef values positions.
1003/// As an example below Order has two undef positions and they have assigned
1004/// values 3 and 7 respectively:
1005/// before: 6 9 5 4 9 2 1 0
1006/// after: 6 3 5 4 7 2 1 0
1007static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1008 const unsigned Sz = Order.size();
1009 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1010 SmallBitVector MaskedIndices(Sz);
1011 for (unsigned I = 0; I < Sz; ++I) {
1012 if (Order[I] < Sz)
1013 UnusedIndices.reset(Idx: Order[I]);
1014 else
1015 MaskedIndices.set(I);
1016 }
1017 if (MaskedIndices.none())
1018 return;
1019 assert(UnusedIndices.count() == MaskedIndices.count() &&
1020 "Non-synced masked/available indices.");
1021 int Idx = UnusedIndices.find_first();
1022 int MIdx = MaskedIndices.find_first();
1023 while (MIdx >= 0) {
1024 assert(Idx >= 0 && "Indices must be synced.");
1025 Order[MIdx] = Idx;
1026 Idx = UnusedIndices.find_next(Prev: Idx);
1027 MIdx = MaskedIndices.find_next(Prev: MIdx);
1028 }
1029}
1030
1031/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1032/// Opcode1.
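/// For example, for VL = {add, sub, add, sub} and Opcode1 == Instruction::Sub
/// the returned bitset is {0, 1, 0, 1}, i.e. lanes 1 and 3 are set.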
1033SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
1034 unsigned Opcode1) {
1035 SmallBitVector OpcodeMask(VL.size(), false);
1036 for (unsigned Lane : seq<unsigned>(Size: VL.size()))
1037 if (cast<Instruction>(Val: VL[Lane])->getOpcode() == Opcode1)
1038 OpcodeMask.set(Lane);
1039 return OpcodeMask;
1040}
1041
1042namespace llvm {
1043
1044static void inversePermutation(ArrayRef<unsigned> Indices,
1045 SmallVectorImpl<int> &Mask) {
1046 Mask.clear();
1047 const unsigned E = Indices.size();
1048 Mask.resize(N: E, NV: PoisonMaskElem);
1049 for (unsigned I = 0; I < E; ++I)
1050 Mask[Indices[I]] = I;
1051}
1052
1053/// Reorders the list of scalars in accordance with the given \p Mask.
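/// For example, Scalars = {%a, %b, %c} with Mask = {2, 0, 1} becomes
/// {%b, %c, %a}: each original element Scalars[I] is moved to position Mask[I].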
1054static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1055 ArrayRef<int> Mask) {
1056 assert(!Mask.empty() && "Expected non-empty mask.");
1057 SmallVector<Value *> Prev(Scalars.size(),
1058 PoisonValue::get(T: Scalars.front()->getType()));
1059 Prev.swap(RHS&: Scalars);
1060 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1061 if (Mask[I] != PoisonMaskElem)
1062 Scalars[Mask[I]] = Prev[I];
1063}
1064
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction, or it is an instruction
/// that does not read/write memory and all of its operands are either not
/// instructions, or are phi nodes, or are instructions from different blocks.
1069static bool areAllOperandsNonInsts(Value *V) {
1070 auto *I = dyn_cast<Instruction>(Val: V);
1071 if (!I)
1072 return true;
1073 return !mayHaveNonDefUseDependency(I: *I) &&
1074 all_of(Range: I->operands(), P: [I](Value *V) {
1075 auto *IO = dyn_cast<Instruction>(Val: V);
1076 if (!IO)
1077 return true;
1078 return isa<PHINode>(Val: IO) || IO->getParent() != I->getParent();
1079 });
1080}
1081
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction, or it is an instruction
/// that does not read/write memory and all of its users are phi nodes or
/// instructions from different blocks.
1086static bool isUsedOutsideBlock(Value *V) {
1087 auto *I = dyn_cast<Instruction>(Val: V);
1088 if (!I)
1089 return true;
1090 // Limits the number of uses to save compile time.
1091 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(N: UsesLimit) &&
1092 all_of(Range: I->users(), P: [I](User *U) {
1093 auto *IU = dyn_cast<Instruction>(Val: U);
1094 if (!IU)
1095 return true;
1096 return IU->getParent() != I->getParent() || isa<PHINode>(Val: IU);
1097 });
1098}
1099
1100/// Checks if the specified value does not require scheduling. It does not
1101/// require scheduling if all operands and all users do not need to be scheduled
1102/// in the current basic block.
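/// For example, an 'add' whose operands are function arguments (or are defined
/// in other blocks) and whose only user is a PHI node does not need to be
/// scheduled, while any instruction that may read or write memory always does.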
1103static bool doesNotNeedToBeScheduled(Value *V) {
1104 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1105}
1106
/// Checks if the specified array of instructions does not require scheduling.
/// This is so if either all of the instructions have operands that do not
/// require scheduling, or all of their users do not require scheduling (since
/// they are phis or are in other basic blocks).
1111static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1112 return !VL.empty() &&
1113 (all_of(Range&: VL, P: isUsedOutsideBlock) || all_of(Range&: VL, P: areAllOperandsNonInsts));
1114}
1115
1116namespace slpvectorizer {
1117
1118/// Bottom Up SLP Vectorizer.
1119class BoUpSLP {
1120 struct TreeEntry;
1121 struct ScheduleData;
1122 class ShuffleCostEstimator;
1123 class ShuffleInstructionBuilder;
1124
1125public:
  /// Tracks the state in which we can represent the loads of the given
  /// sequence.
1127 enum class LoadsState {
1128 Gather,
1129 Vectorize,
1130 ScatterVectorize,
1131 StridedVectorize
1132 };
1133
1134 using ValueList = SmallVector<Value *, 8>;
1135 using InstrList = SmallVector<Instruction *, 16>;
1136 using ValueSet = SmallPtrSet<Value *, 16>;
1137 using StoreList = SmallVector<StoreInst *, 8>;
1138 using ExtraValueToDebugLocsMap =
1139 MapVector<Value *, SmallVector<Instruction *, 2>>;
1140 using OrdersType = SmallVector<unsigned, 4>;
1141
1142 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1143 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1144 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1145 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1146 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1147 AC(AC), DB(DB), DL(DL), ORE(ORE),
1148 Builder(Se->getContext(), TargetFolder(*DL)) {
1149 CodeMetrics::collectEphemeralValues(L: F, AC, EphValues);
1150 // Use the vector register size specified by the target unless overridden
1151 // by a command-line option.
1152 // TODO: It would be better to limit the vectorization factor based on
1153 // data type rather than just register size. For example, x86 AVX has
1154 // 256-bit registers, but it does not support integer operations
1155 // at that width (that requires AVX2).
1156 if (MaxVectorRegSizeOption.getNumOccurrences())
1157 MaxVecRegSize = MaxVectorRegSizeOption;
1158 else
1159 MaxVecRegSize =
1160 TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
1161 .getFixedValue();
1162
1163 if (MinVectorRegSizeOption.getNumOccurrences())
1164 MinVecRegSize = MinVectorRegSizeOption;
1165 else
1166 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1167 }
1168
1169 /// Vectorize the tree that starts with the elements in \p VL.
1170 /// Returns the vectorized root.
1171 Value *vectorizeTree();
1172
  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
  /// generated extractvalue instructions.
  /// \param ReplacedExternals contains the list of replaced external values
  /// {scalar, replace} after emitting extractelement for external uses.
1178 Value *
1179 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1180 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1181 Instruction *ReductionRoot = nullptr);
1182
1183 /// \returns the cost incurred by unwanted spills and fills, caused by
1184 /// holding live values over call sites.
1185 InstructionCost getSpillCost() const;
1186
1187 /// \returns the vectorization cost of the subtree that starts at \p VL.
1188 /// A negative number means that this is profitable.
1189 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1190
1191 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1192 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1193 void buildTree(ArrayRef<Value *> Roots,
1194 const SmallDenseSet<Value *> &UserIgnoreLst);
1195
1196 /// Construct a vectorizable tree that starts at \p Roots.
1197 void buildTree(ArrayRef<Value *> Roots);
1198
1199 /// Returns whether the root node has in-tree uses.
1200 bool doesRootHaveInTreeUses() const {
1201 return !VectorizableTree.empty() &&
1202 !VectorizableTree.front()->UserTreeIndices.empty();
1203 }
1204
1205 /// Return the scalars of the root node.
1206 ArrayRef<Value *> getRootNodeScalars() const {
1207 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1208 return VectorizableTree.front()->Scalars;
1209 }
1210
  /// Checks if the root graph node can be emitted with narrower bitwidth at
  /// codegen and returns its signedness, if so.
1213 bool isSignedMinBitwidthRootNode() const {
1214 return MinBWs.at(Val: VectorizableTree.front().get()).second;
1215 }
1216
  /// Builds external uses of the vectorized scalars, i.e. the list of
  /// vectorized scalars to be extracted, their lanes and their scalar users.
  /// \p ExternallyUsedValues contains an additional list of external uses to
  /// handle vectorization of reductions.
1221 void
1222 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1223
1224 /// Transforms graph nodes to target specific representations, if profitable.
1225 void transformNodes();
1226
1227 /// Clear the internal data structures that are created by 'buildTree'.
1228 void deleteTree() {
1229 VectorizableTree.clear();
1230 ScalarToTreeEntry.clear();
1231 MultiNodeScalars.clear();
1232 MustGather.clear();
1233 NonScheduledFirst.clear();
1234 EntryToLastInstruction.clear();
1235 ExternalUses.clear();
1236 ExternalUsesAsGEPs.clear();
1237 for (auto &Iter : BlocksSchedules) {
1238 BlockScheduling *BS = Iter.second.get();
1239 BS->clear();
1240 }
1241 MinBWs.clear();
1242 ReductionBitWidth = 0;
1243 CastMaxMinBWSizes.reset();
1244 ExtraBitWidthNodes.clear();
1245 InstrElementSize.clear();
1246 UserIgnoreList = nullptr;
1247 PostponedGathers.clear();
1248 ValueToGatherNodes.clear();
1249 }
1250
1251 unsigned getTreeSize() const { return VectorizableTree.size(); }
1252
1253 /// Perform LICM and CSE on the newly generated gather sequences.
1254 void optimizeGatherSequence();
1255
1256 /// Checks if the specified gather tree entry \p TE can be represented as a
1257 /// shuffled vector entry + (possibly) permutation with other gathers. It
1258 /// implements the checks only for possibly ordered scalars (Loads,
1259 /// ExtractElement, ExtractValue), which can be part of the graph.
1260 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1261
  /// Sort loads into increasing pointer offsets to allow greater clustering.
1263 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1264
1265 /// Gets reordering data for the given tree entry. If the entry is vectorized
1266 /// - just return ReorderIndices, otherwise check if the scalars can be
1267 /// reordered and return the most optimal order.
  /// \return std::nullopt if the ordering is not important, an empty order if
  /// the identity order is important, or the actual order otherwise.
1270 /// \param TopToBottom If true, include the order of vectorized stores and
1271 /// insertelement nodes, otherwise skip them.
1272 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1273 bool TopToBottom);
1274
  /// Reorders the current graph to the most profitable order starting from the
  /// root node to the leaf nodes. The best order is chosen only from the nodes
  /// of the same size (vectorization factor). Smaller nodes are considered
  /// parts of a subgraph with a smaller VF and they are reordered
  /// independently. We can do this because we still need to extend smaller
  /// nodes to the wider VF and we can merge reordering shuffles with the
  /// widening shuffles.
1281 void reorderTopToBottom();
1282
  /// Reorders the current graph to the most profitable order starting from the
  /// leaves to the root. It allows rotating small subgraphs and reduces the
  /// number of reshuffles if the leaf nodes use the same order. In this case we
  /// can merge the orders and just shuffle the user node instead of shuffling
  /// its operands. Plus, even if the leaf nodes have different orders, it
  /// allows sinking the reordering in the graph closer to the root node and
  /// merging it later during analysis.
1290 void reorderBottomToTop(bool IgnoreReorder = false);
1291
1292 /// \return The vector element size in bits to use when vectorizing the
1293 /// expression tree ending at \p V. If V is a store, the size is the width of
1294 /// the stored value. Otherwise, the size is the width of the largest loaded
1295 /// value reaching V. This method is used by the vectorizer to calculate
1296 /// vectorization factors.
1297 unsigned getVectorElementSize(Value *V);
1298
1299 /// Compute the minimum type sizes required to represent the entries in a
1300 /// vectorizable tree.
1301 void computeMinimumValueSizes();
1302
1303 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1304 unsigned getMaxVecRegSize() const {
1305 return MaxVecRegSize;
1306 }
1307
1308 // \returns minimum vector register size as set by cl::opt.
1309 unsigned getMinVecRegSize() const {
1310 return MinVecRegSize;
1311 }
1312
1313 unsigned getMinVF(unsigned Sz) const {
1314 return std::max(a: 2U, b: getMinVecRegSize() / Sz);
1315 }
1316
1317 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1318 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1319 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1320 return MaxVF ? MaxVF : UINT_MAX;
1321 }
1322
1323 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1324 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1325 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1326 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1327 ///
1328 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1329 unsigned canMapToVector(Type *T) const;
1330
1331 /// \returns True if the VectorizableTree is both tiny and not fully
1332 /// vectorizable. We do not vectorize such trees.
1333 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1334
1335 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1336 /// can be load combined in the backend. Load combining may not be allowed in
1337 /// the IR optimizer, so we do not want to alter the pattern. For example,
1338 /// partially transforming a scalar bswap() pattern into vector code is
1339 /// effectively impossible for the backend to undo.
1340 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1341 /// may not be necessary.
1342 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1343
1344 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1345 /// can be load combined in the backend. Load combining may not be allowed in
1346 /// the IR optimizer, so we do not want to alter the pattern. For example,
1347 /// partially transforming a scalar bswap() pattern into vector code is
1348 /// effectively impossible for the backend to undo.
1349 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1350 /// may not be necessary.
1351 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1352
1353 /// Checks if the given array of loads can be represented as a vectorized,
1354 /// scatter or just simple gather.
1355 /// \param VL list of loads.
1356 /// \param VL0 main load value.
1357 /// \param Order returned order of load instructions.
1358 /// \param PointerOps returned list of pointer operands.
  /// \param TryRecursiveCheck used to check if a long masked gather can be
  /// represented as a series of loads/insert-subvector operations, if
  /// profitable.
1361 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1362 SmallVectorImpl<unsigned> &Order,
1363 SmallVectorImpl<Value *> &PointerOps,
1364 bool TryRecursiveCheck = true) const;
1365
1366 OptimizationRemarkEmitter *getORE() { return ORE; }
1367
1368 /// This structure holds any data we need about the edges being traversed
1369 /// during buildTree_rec(). We keep track of:
1370 /// (i) the user TreeEntry index, and
1371 /// (ii) the index of the edge.
1372 struct EdgeInfo {
1373 EdgeInfo() = default;
1374 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1375 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1376 /// The user TreeEntry.
1377 TreeEntry *UserTE = nullptr;
1378 /// The operand index of the use.
1379 unsigned EdgeIdx = UINT_MAX;
1380#ifndef NDEBUG
1381 friend inline raw_ostream &operator<<(raw_ostream &OS,
1382 const BoUpSLP::EdgeInfo &EI) {
1383 EI.dump(OS);
1384 return OS;
1385 }
1386 /// Debug print.
1387 void dump(raw_ostream &OS) const {
1388 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1389 << " EdgeIdx:" << EdgeIdx << "}";
1390 }
1391 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1392#endif
1393 bool operator == (const EdgeInfo &Other) const {
1394 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1395 }
1396 };
1397
1398 /// A helper class used for scoring candidates for two consecutive lanes.
1399 class LookAheadHeuristics {
1400 const TargetLibraryInfo &TLI;
1401 const DataLayout &DL;
1402 ScalarEvolution &SE;
1403 const BoUpSLP &R;
1404 int NumLanes; // Total number of lanes (aka vectorization factor).
1405 int MaxLevel; // The maximum recursion depth for accumulating score.
1406
1407 public:
1408 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1409 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1410 int MaxLevel)
1411 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1412 MaxLevel(MaxLevel) {}
1413
1414 // The exact values of the hard-coded scores listed here are not very
1415 // important, though better matches should receive higher scores to improve
1416 // the resulting cost. When computing the score of matching one sub-tree
1417 // with another, we are basically counting the number of values that match.
1418 // So even if all scores were set to 1, we would still get a decent result.
1419 // However, sometimes we have to break ties. For example, we may have to
1420 // choose between matching loads and matching opcodes. This is what these
1421 // scores help us with: they provide the order of preference. They also
1422 // matter when the scalar is externally used, or is used in another tree
1423 // entry node in a different lane.
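// For illustration with the constants below: when pairing a candidate against
// the previous lane, a load from a consecutive address (score 4) wins over an
// instruction that merely shares the opcode (score 2), which in turn wins over
// an alternate-opcode match (score 1).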
1424
1425 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1426 static const int ScoreConsecutiveLoads = 4;
1427 /// The same load multiple times. This should have a better score than
1428 /// `ScoreSplat` because, on x86, a 2-lane splat load can be lowered to a
1429 /// single `movddup (%reg), xmm0` with a throughput of 0.5, versus 0.5 for a
1430 /// vector load plus 1.0 for a separate broadcast.
1431 static const int ScoreSplatLoads = 3;
1432 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1433 static const int ScoreReversedLoads = 3;
1434 /// A load candidate for masked gather.
1435 static const int ScoreMaskedGatherCandidate = 1;
1436 /// ExtractElementInst from same vector and consecutive indexes.
1437 static const int ScoreConsecutiveExtracts = 4;
1438 /// ExtractElementInst from same vector and reversed indices.
1439 static const int ScoreReversedExtracts = 3;
1440 /// Constants.
1441 static const int ScoreConstants = 2;
1442 /// Instructions with the same opcode.
1443 static const int ScoreSameOpcode = 2;
1444 /// Instructions with alt opcodes (e.g, add + sub).
1445 static const int ScoreAltOpcodes = 1;
1446 /// Identical instructions (a.k.a. splat or broadcast).
1447 static const int ScoreSplat = 1;
1448 /// Matching with an undef is preferable to failing.
1449 static const int ScoreUndef = 1;
1450 /// Score for failing to find a decent match.
1451 static const int ScoreFail = 0;
1452 /// Score if all users are vectorized.
1453 static const int ScoreAllUserVectorized = 1;
1454
1455 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1456 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1457 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1458 /// MainAltOps.
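/// For example, two simple loads from adjacent addresses in the same block
/// score ScoreConsecutiveLoads, loads whose known pointer distance exceeds
/// NumLanes / 2 score ScoreMaskedGatherCandidate, and two unrelated values
/// that do not share a tree entry score ScoreFail.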
1459 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1460 ArrayRef<Value *> MainAltOps) const {
1461 if (!isValidElementType(Ty: V1->getType()) ||
1462 !isValidElementType(Ty: V2->getType()))
1463 return LookAheadHeuristics::ScoreFail;
1464
1465 if (V1 == V2) {
1466 if (isa<LoadInst>(Val: V1)) {
1467 // Returns true if the users of V1 and V2 won't need to be extracted.
1468 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1469 // Bail out if we have too many uses to save compilation time.
1470 if (V1->hasNUsesOrMore(N: UsesLimit) || V2->hasNUsesOrMore(N: UsesLimit))
1471 return false;
1472
1473 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1474 return llvm::all_of(Range: V->users(), P: [U1, U2, this](Value *U) {
1475 return U == U1 || U == U2 || R.getTreeEntry(V: U) != nullptr;
1476 });
1477 };
1478 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1479 };
1480 // A broadcast of a load can be cheaper on some targets.
1481 if (R.TTI->isLegalBroadcastLoad(ElementTy: V1->getType(),
1482 NumElements: ElementCount::getFixed(MinVal: NumLanes)) &&
1483 ((int)V1->getNumUses() == NumLanes ||
1484 AllUsersAreInternal(V1, V2)))
1485 return LookAheadHeuristics::ScoreSplatLoads;
1486 }
1487 return LookAheadHeuristics::ScoreSplat;
1488 }
1489
1490 auto CheckSameEntryOrFail = [&]() {
1491 if (const TreeEntry *TE1 = R.getTreeEntry(V: V1);
1492 TE1 && TE1 == R.getTreeEntry(V: V2))
1493 return LookAheadHeuristics::ScoreSplatLoads;
1494 return LookAheadHeuristics::ScoreFail;
1495 };
1496
1497 auto *LI1 = dyn_cast<LoadInst>(Val: V1);
1498 auto *LI2 = dyn_cast<LoadInst>(Val: V2);
1499 if (LI1 && LI2) {
1500 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1501 !LI2->isSimple())
1502 return CheckSameEntryOrFail();
1503
1504 std::optional<int> Dist = getPointersDiff(
1505 ElemTyA: LI1->getType(), PtrA: LI1->getPointerOperand(), ElemTyB: LI2->getType(),
1506 PtrB: LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1507 if (!Dist || *Dist == 0) {
1508 if (getUnderlyingObject(V: LI1->getPointerOperand()) ==
1509 getUnderlyingObject(V: LI2->getPointerOperand()) &&
1510 R.TTI->isLegalMaskedGather(
1511 DataType: getWidenedType(ScalarTy: LI1->getType(), VF: NumLanes), Alignment: LI1->getAlign()))
1512 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1513 return CheckSameEntryOrFail();
1514 }
1515 // The distance is too large; it may still be profitable to use masked
1516 // loads/gathers.
1517 if (std::abs(x: *Dist) > NumLanes / 2)
1518 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1519 // This will still detect consecutive loads, but we might have "holes"
1520 // in some cases. That is fine for non-power-of-2 vectorization and may
1521 // produce better results. It should not affect current vectorization.
1522 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1523 : LookAheadHeuristics::ScoreReversedLoads;
1524 }
1525
1526 auto *C1 = dyn_cast<Constant>(Val: V1);
1527 auto *C2 = dyn_cast<Constant>(Val: V2);
1528 if (C1 && C2)
1529 return LookAheadHeuristics::ScoreConstants;
1530
1531 // Extracts from consecutive indices of the same vector get a better
1532 // score, as the extracts could be optimized away.
1533 Value *EV1;
1534 ConstantInt *Ex1Idx;
1535 if (match(V: V1, P: m_ExtractElt(Val: m_Value(V&: EV1), Idx: m_ConstantInt(CI&: Ex1Idx)))) {
1536 // Undefs are always profitable for extractelements.
1537 // Compiler can easily combine poison and extractelement <non-poison> or
1538 // undef and extractelement <poison>. But combining undef +
1539 // extractelement <non-poison-but-may-produce-poison> requires some
1540 // extra operations.
1541 if (isa<UndefValue>(Val: V2))
1542 return (isa<PoisonValue>(Val: V2) || isUndefVector(V: EV1).all())
1543 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1544 : LookAheadHeuristics::ScoreSameOpcode;
1545 Value *EV2 = nullptr;
1546 ConstantInt *Ex2Idx = nullptr;
1547 if (match(V: V2,
1548 P: m_ExtractElt(Val: m_Value(V&: EV2), Idx: m_CombineOr(L: m_ConstantInt(CI&: Ex2Idx),
1549 R: m_Undef())))) {
1550 // Undefs are always profitable for extractelements.
1551 if (!Ex2Idx)
1552 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1553 if (isUndefVector(V: EV2).all() && EV2->getType() == EV1->getType())
1554 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1555 if (EV2 == EV1) {
1556 int Idx1 = Ex1Idx->getZExtValue();
1557 int Idx2 = Ex2Idx->getZExtValue();
1558 int Dist = Idx2 - Idx1;
1559 // The distance is too large; it may still be profitable to use
1560 // shuffles.
1561 if (std::abs(x: Dist) == 0)
1562 return LookAheadHeuristics::ScoreSplat;
1563 if (std::abs(x: Dist) > NumLanes / 2)
1564 return LookAheadHeuristics::ScoreSameOpcode;
1565 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1566 : LookAheadHeuristics::ScoreReversedExtracts;
1567 }
1568 return LookAheadHeuristics::ScoreAltOpcodes;
1569 }
1570 return CheckSameEntryOrFail();
1571 }
1572
1573 auto *I1 = dyn_cast<Instruction>(Val: V1);
1574 auto *I2 = dyn_cast<Instruction>(Val: V2);
1575 if (I1 && I2) {
1576 if (I1->getParent() != I2->getParent())
1577 return CheckSameEntryOrFail();
1578 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1579 Ops.push_back(Elt: I1);
1580 Ops.push_back(Elt: I2);
1581 InstructionsState S = getSameOpcode(VL: Ops, TLI);
1582 // Note: Only consider instructions with <= 2 operands to avoid
1583 // complexity explosion.
1584 if (S.getOpcode() &&
1585 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1586 !S.isAltShuffle()) &&
1587 all_of(Range&: Ops, P: [&S](Value *V) {
1588 return cast<Instruction>(Val: V)->getNumOperands() ==
1589 S.MainOp->getNumOperands();
1590 }))
1591 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1592 : LookAheadHeuristics::ScoreSameOpcode;
1593 }
1594
1595 if (isa<UndefValue>(Val: V2))
1596 return LookAheadHeuristics::ScoreUndef;
1597
1598 return CheckSameEntryOrFail();
1599 }
1600
1601 /// Go through the operands of \p LHS and \p RHS recursively until
1602 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1603 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1604 /// of \p U1 and \p U2), except at the beginning of the recursion where
1605 /// these are set to nullptr.
1606 ///
1607 /// For example:
1608 /// \verbatim
1609 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1610 /// \ / \ / \ / \ /
1611 /// + + + +
1612 /// G1 G2 G3 G4
1613 /// \endverbatim
1614 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1615 /// each level recursively, accumulating the score. It starts from matching
1616 /// the additions at level 0, then moves on to the loads (level 1). The
1617 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1618 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1619 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1620 /// Please note that the order of the operands does not matter, as we
1621 /// evaluate the score of all profitable combinations of operands. In
1622 /// other words the score of G1 and G4 is the same as G1 and G2. This
1623 /// heuristic is based on ideas described in:
1624 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1625 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1626 /// Luís F. W. Góes
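/// For illustration, assuming MaxLevel allows recursing into the loads:
/// getScoreAtLevelRec(G1, G2) accumulates ScoreSameOpcode (2) for the two
/// additions plus ScoreConsecutiveLoads (4) for each of {A[0],A[1]} and
/// {B[0],B[1]}, i.e. 10 in total, whereas getScoreAtLevelRec(G1, G3) only
/// scores the matching additions (2).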
1627 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1628 Instruction *U2, int CurrLevel,
1629 ArrayRef<Value *> MainAltOps) const {
1630
1631 // Get the shallow score of V1 and V2.
1632 int ShallowScoreAtThisLevel =
1633 getShallowScore(V1: LHS, V2: RHS, U1, U2, MainAltOps);
1634
1635 // If reached MaxLevel,
1636 // or if V1 and V2 are not instructions,
1637 // or if they are SPLAT,
1638 // or if they are not consecutive,
1639 // or if profitable to vectorize loads or extractelements, early return
1640 // the current cost.
1641 auto *I1 = dyn_cast<Instruction>(Val: LHS);
1642 auto *I2 = dyn_cast<Instruction>(Val: RHS);
1643 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1644 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1645 (((isa<LoadInst>(Val: I1) && isa<LoadInst>(Val: I2)) ||
1646 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1647 (isa<ExtractElementInst>(Val: I1) && isa<ExtractElementInst>(Val: I2))) &&
1648 ShallowScoreAtThisLevel))
1649 return ShallowScoreAtThisLevel;
1650 assert(I1 && I2 && "Should have early exited.");
1651
1652 // Contains the I2 operand indexes that got matched with I1 operands.
1653 SmallSet<unsigned, 4> Op2Used;
1654
1655 // Recursion towards the operands of I1 and I2. We are trying all possible
1656 // operand pairs, and keeping track of the best score.
1657 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1658 OpIdx1 != NumOperands1; ++OpIdx1) {
1659 // Try to pair op1I with the best operand of I2.
1660 int MaxTmpScore = 0;
1661 unsigned MaxOpIdx2 = 0;
1662 bool FoundBest = false;
1663 // If I2 is commutative try all combinations.
1664 unsigned FromIdx = isCommutative(I: I2) ? 0 : OpIdx1;
1665 unsigned ToIdx = isCommutative(I: I2)
1666 ? I2->getNumOperands()
1667 : std::min(a: I2->getNumOperands(), b: OpIdx1 + 1);
1668 assert(FromIdx <= ToIdx && "Bad index");
1669 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1670 // Skip operands already paired with OpIdx1.
1671 if (Op2Used.count(V: OpIdx2))
1672 continue;
1673 // Recursively calculate the cost at each level
1674 int TmpScore =
1675 getScoreAtLevelRec(LHS: I1->getOperand(i: OpIdx1), RHS: I2->getOperand(i: OpIdx2),
1676 U1: I1, U2: I2, CurrLevel: CurrLevel + 1, MainAltOps: std::nullopt);
1677 // Look for the best score.
1678 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1679 TmpScore > MaxTmpScore) {
1680 MaxTmpScore = TmpScore;
1681 MaxOpIdx2 = OpIdx2;
1682 FoundBest = true;
1683 }
1684 }
1685 if (FoundBest) {
1686 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1687 Op2Used.insert(V: MaxOpIdx2);
1688 ShallowScoreAtThisLevel += MaxTmpScore;
1689 }
1690 }
1691 return ShallowScoreAtThisLevel;
1692 }
1693 };
1694 /// A helper data structure to hold the operands of a vector of instructions.
1695 /// This supports a fixed vector length for all operand vectors.
1696 class VLOperands {
1697 /// For each operand we need (i) the value, and (ii) the opcode that it
1698 /// would be attached to if the expression was in a left-linearized form.
1699 /// This is required to avoid illegal operand reordering.
1700 /// For example:
1701 /// \verbatim
1702 /// 0 Op1
1703 /// |/
1704 /// Op1 Op2 Linearized + Op2
1705 /// \ / ----------> |/
1706 /// - -
1707 ///
1708 /// Op1 - Op2 (0 + Op1) - Op2
1709 /// \endverbatim
1710 ///
1711 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1712 ///
1713 /// Another way to think of this is to track all the operations across the
1714 /// path from the operand all the way to the root of the tree and to
1715 /// calculate the operation that corresponds to this path. For example, the
1716 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1717 /// corresponding operation is a '-' (which matches the one in the
1718 /// linearized tree, as shown above).
1719 ///
1720 /// For lack of a better term, we refer to this operation as Accumulated
1721 /// Path Operation (APO).
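/// For example, for the two-lane bundle {X0 + Y0, X1 - Y1} the stored APO
/// bits are:
/// \verbatim
///            Operand 0    Operand 1
/// Lane 0:    X0 (false)   Y0 (false)   // '+' is not an inverse operation
/// Lane 1:    X1 (false)   Y1 (true)    // RHS of '-' is an inverse operation
/// \endverbatim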
1722 struct OperandData {
1723 OperandData() = default;
1724 OperandData(Value *V, bool APO, bool IsUsed)
1725 : V(V), APO(APO), IsUsed(IsUsed) {}
1726 /// The operand value.
1727 Value *V = nullptr;
1728 /// TreeEntries only allow a single opcode, or an alternate sequence of
1729 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
1730 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1731 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1732 /// (e.g., Add/Mul)
1733 bool APO = false;
1734 /// Helper data for the reordering function.
1735 bool IsUsed = false;
1736 };
1737
1738 /// During operand reordering, we are trying to select the operand at lane
1739 /// that matches best with the operand at the neighboring lane. Our
1740 /// selection is based on the type of value we are looking for. For example,
1741 /// if the neighboring lane has a load, we need to look for a load that is
1742 /// accessing a consecutive address. These strategies are summarized in the
1743 /// 'ReorderingMode' enumerator.
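/// For illustration: if the lane we start from has operands {load %p, 42},
/// the modes for operand indices 0 and 1 become Load and Constant; an
/// instruction operand would instead get Opcode, or Splat if it is unlikely
/// to match anything in the other lanes.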
1744 enum class ReorderingMode {
1745 Load, ///< Matching loads to consecutive memory addresses
1746 Opcode, ///< Matching instructions based on opcode (same or alternate)
1747 Constant, ///< Matching constants
1748 Splat, ///< Matching the same instruction multiple times (broadcast)
1749 Failed, ///< We failed to create a vectorizable group
1750 };
1751
1752 using OperandDataVec = SmallVector<OperandData, 2>;
1753
1754 /// A vector of operand vectors.
1755 SmallVector<OperandDataVec, 4> OpsVec;
1756
1757 const TargetLibraryInfo &TLI;
1758 const DataLayout &DL;
1759 ScalarEvolution &SE;
1760 const BoUpSLP &R;
1761 const Loop *L = nullptr;
1762
1763 /// \returns the operand data at \p OpIdx and \p Lane.
1764 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1765 return OpsVec[OpIdx][Lane];
1766 }
1767
1768 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1769 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1770 return OpsVec[OpIdx][Lane];
1771 }
1772
1773 /// Clears the used flag for all entries.
1774 void clearUsed() {
1775 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1776 OpIdx != NumOperands; ++OpIdx)
1777 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1778 ++Lane)
1779 OpsVec[OpIdx][Lane].IsUsed = false;
1780 }
1781
1782 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1783 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1784 std::swap(a&: OpsVec[OpIdx1][Lane], b&: OpsVec[OpIdx2][Lane]);
1785 }
1786
1787 /// \param Lane lane of the operands under analysis.
1788 /// \param OpIdx operand index in lane \p Lane for which we are looking
1789 /// for the best candidate.
1790 /// \param Idx operand index of the current candidate value.
1791 /// \returns The additional score due to possible broadcasting of the
1792 /// elements in the lane. It is more profitable to have a power-of-2 number
1793 /// of unique elements in the lane, as it will be vectorized with higher
1794 /// probability after removing duplicates. Currently the SLP vectorizer
1795 /// supports only vectorization of a power-of-2 number of unique scalars.
1796 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1797 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
1798 if (!isa<Instruction>(Val: IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1799 return 0;
1800 SmallPtrSet<Value *, 4> Uniques;
1801 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1802 if (Ln == Lane)
1803 continue;
1804 Value *OpIdxLnV = getData(OpIdx, Lane: Ln).V;
1805 if (!isa<Instruction>(Val: OpIdxLnV))
1806 return 0;
1807 Uniques.insert(Ptr: OpIdxLnV);
1808 }
1809 int UniquesCount = Uniques.size();
1810 int UniquesCntWithIdxLaneV =
1811 Uniques.contains(Ptr: IdxLaneV) ? UniquesCount : UniquesCount + 1;
1812 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1813 int UniquesCntWithOpIdxLaneV =
1814 Uniques.contains(Ptr: OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1815 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1816 return 0;
1817 return (PowerOf2Ceil(A: UniquesCntWithOpIdxLaneV) -
1818 UniquesCntWithOpIdxLaneV) -
1819 (PowerOf2Ceil(A: UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1820 }
1821
1822 /// \param Lane lane of the operands under analysis.
1823 /// \param OpIdx operand index in lane \p Lane for which we are looking
1824 /// for the best candidate.
1825 /// \param Idx operand index of the current candidate value.
1826 /// \returns The additional score for the scalar which users are all
1827 /// vectorized.
1828 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1829 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
1830 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1831 // Do not care about the number of uses of vector-like instructions
1832 // (extractelement/extractvalue with constant indices); they are extracts
1833 // themselves and already externally used. Vectorization of such
1834 // instructions does not add an extra extractelement instruction, it may
1835 // only remove one.
1836 if (isVectorLikeInstWithConstOps(V: IdxLaneV) &&
1837 isVectorLikeInstWithConstOps(V: OpIdxLaneV))
1838 return LookAheadHeuristics::ScoreAllUserVectorized;
1839 auto *IdxLaneI = dyn_cast<Instruction>(Val: IdxLaneV);
1840 if (!IdxLaneI || !isa<Instruction>(Val: OpIdxLaneV))
1841 return 0;
1842 return R.areAllUsersVectorized(I: IdxLaneI)
1843 ? LookAheadHeuristics::ScoreAllUserVectorized
1844 : 0;
1845 }
1846
1847 /// Score scaling factor for fully compatible instructions but with
1848 /// different number of external uses. Allows better selection of the
1849 /// instructions with less external uses.
1850 static const int ScoreScaleFactor = 10;
1851
1852 /// \Returns the look-ahead score, which tells us how much the sub-trees
1853 /// rooted at \p LHS and \p RHS match; the more they match, the higher the
1854 /// score. This helps break ties in an informed way when we cannot decide on
1855 /// the order of the operands by just considering the immediate
1856 /// predecessors.
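/// For illustration: a raw look-ahead score of 10 with a splat adjustment of
/// +1 becomes (10 + 1) * ScoreScaleFactor = 110, plus at most 1 extra point
/// from getExternalUseScore(), so the external-use bonus only breaks ties
/// between otherwise equally scored operands.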
1857 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1858 int Lane, unsigned OpIdx, unsigned Idx,
1859 bool &IsUsed) {
1860 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1861 LookAheadMaxDepth);
1862 // Keep track of the instruction stack as we recurse into the operands
1863 // during the look-ahead score exploration.
1864 int Score =
1865 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1866 /*CurrLevel=*/1, MainAltOps);
1867 if (Score) {
1868 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1869 if (Score <= -SplatScore) {
1870 // Set the minimum score for splat-like sequence to avoid setting
1871 // failed state.
1872 Score = 1;
1873 } else {
1874 Score += SplatScore;
1875 // Scale score to see the difference between different operands
1876 // and similar operands but all vectorized/not all vectorized
1877 // uses. It does not affect actual selection of the best
1878 // compatible operand in general, just allows to select the
1879 // operand with all vectorized uses.
1880 Score *= ScoreScaleFactor;
1881 Score += getExternalUseScore(Lane, OpIdx, Idx);
1882 IsUsed = true;
1883 }
1884 }
1885 return Score;
1886 }
1887
1888 /// Best defined scores per lanes between the passes. Used to choose the
1889 /// best operand (with the highest score) between the passes.
1890 /// The key - {Operand Index, Lane}.
1891 /// The value - the best score between the passes for the lane and the
1892 /// operand.
1893 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1894 BestScoresPerLanes;
1895
1896 // Search all operands in Ops[*][Lane] for the one that best matches
1897 // Ops[OpIdx][LastLane] and return its operand index.
1898 // If no good match can be found, return std::nullopt.
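// For illustration: if Ops[OpIdx][LastLane] is 'a', and at Lane the
// candidates are Ops[0][Lane] = 'c' and Ops[1][Lane] = 'b', then index 1 is
// returned when the look-ahead score of (a, b) beats that of (a, c), so that
// 'b' can later be swapped into position OpIdx.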
1899 std::optional<unsigned>
1900 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1901 ArrayRef<ReorderingMode> ReorderingModes,
1902 ArrayRef<Value *> MainAltOps) {
1903 unsigned NumOperands = getNumOperands();
1904
1905 // The operand of the previous lane at OpIdx.
1906 Value *OpLastLane = getData(OpIdx, Lane: LastLane).V;
1907
1908 // Our strategy mode for OpIdx.
1909 ReorderingMode RMode = ReorderingModes[OpIdx];
1910 if (RMode == ReorderingMode::Failed)
1911 return std::nullopt;
1912
1913 // The linearized opcode of the operand at OpIdx, Lane.
1914 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1915
1916 // The best operand index and its score.
1917 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1918 // are using the score to differentiate between the two.
1919 struct BestOpData {
1920 std::optional<unsigned> Idx;
1921 unsigned Score = 0;
1922 } BestOp;
1923 BestOp.Score =
1924 BestScoresPerLanes.try_emplace(Key: std::make_pair(x&: OpIdx, y&: Lane), Args: 0)
1925 .first->second;
1926
1927 // Track if the operand must be marked as used. If the operand is
1928 // explicitly set to score 1 (because of a non-power-of-2 number of unique
1929 // scalars), we may want to re-estimate the operands in later iterations.
1930 bool IsUsed = RMode == ReorderingMode::Splat ||
1931 RMode == ReorderingMode::Constant ||
1932 RMode == ReorderingMode::Load;
1933 // Iterate through all unused operands and look for the best.
1934 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1935 // Get the operand at Idx and Lane.
1936 OperandData &OpData = getData(OpIdx: Idx, Lane);
1937 Value *Op = OpData.V;
1938 bool OpAPO = OpData.APO;
1939
1940 // Skip already selected operands.
1941 if (OpData.IsUsed)
1942 continue;
1943
1944 // Skip if we are trying to move the operand to a position with a
1945 // different opcode in the linearized tree form. This would break the
1946 // semantics.
1947 if (OpAPO != OpIdxAPO)
1948 continue;
1949
1950 // Look for an operand that matches the current mode.
1951 switch (RMode) {
1952 case ReorderingMode::Load:
1953 case ReorderingMode::Opcode: {
1954 bool LeftToRight = Lane > LastLane;
1955 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1956 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1957 int Score = getLookAheadScore(LHS: OpLeft, RHS: OpRight, MainAltOps, Lane,
1958 OpIdx, Idx, IsUsed);
1959 if (Score > static_cast<int>(BestOp.Score) ||
1960 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
1961 Idx == OpIdx)) {
1962 BestOp.Idx = Idx;
1963 BestOp.Score = Score;
1964 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] = Score;
1965 }
1966 break;
1967 }
1968 case ReorderingMode::Constant:
1969 if (isa<Constant>(Val: Op) ||
1970 (!BestOp.Score && L && L->isLoopInvariant(V: Op))) {
1971 BestOp.Idx = Idx;
1972 if (isa<Constant>(Val: Op)) {
1973 BestOp.Score = LookAheadHeuristics::ScoreConstants;
1974 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
1975 LookAheadHeuristics::ScoreConstants;
1976 }
1977 if (isa<UndefValue>(Val: Op) || !isa<Constant>(Val: Op))
1978 IsUsed = false;
1979 }
1980 break;
1981 case ReorderingMode::Splat:
1982 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Val: Op))) {
1983 IsUsed = Op == OpLastLane;
1984 if (Op == OpLastLane) {
1985 BestOp.Score = LookAheadHeuristics::ScoreSplat;
1986 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
1987 LookAheadHeuristics::ScoreSplat;
1988 }
1989 BestOp.Idx = Idx;
1990 }
1991 break;
1992 case ReorderingMode::Failed:
1993 llvm_unreachable("Not expected Failed reordering mode.");
1994 }
1995 }
1996
1997 if (BestOp.Idx) {
1998 getData(OpIdx: *BestOp.Idx, Lane).IsUsed = IsUsed;
1999 return BestOp.Idx;
2000 }
2001 // If we could not find a good match return std::nullopt.
2002 return std::nullopt;
2003 }
2004
2005 /// Helper for reorderOperandVecs.
2006 /// \returns the lane that we should start reordering from. This is the one
2007 /// that has the fewest operands that can freely move about, or that is the
2008 /// least profitable to reorder because it already has the optimal set of operands.
2009 unsigned getBestLaneToStartReordering() const {
2010 unsigned Min = UINT_MAX;
2011 unsigned SameOpNumber = 0;
2012 // std::pair<unsigned, unsigned> is used to implement a simple voting
2013 // algorithm and choose either the lane with the fewest operands that can
2014 // freely move about, or the lane that is the least profitable to reorder
2015 // because it already has the optimal set of operands. The first unsigned
2016 // is a counter for voting, the second unsigned counts the lanes whose
2017 // instructions have same/alternate opcodes and the same parent basic block.
2018 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
2019 // Try to stay closer to the original results if we have multiple lanes
2020 // with the same cost. If two lanes have the same cost, use the one with
2021 // the lowest index.
2022 for (int I = getNumLanes(); I > 0; --I) {
2023 unsigned Lane = I - 1;
2024 OperandsOrderData NumFreeOpsHash =
2025 getMaxNumOperandsThatCanBeReordered(Lane);
2026 // Compare the number of operands that can move and choose the one with
2027 // the least number.
2028 if (NumFreeOpsHash.NumOfAPOs < Min) {
2029 Min = NumFreeOpsHash.NumOfAPOs;
2030 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2031 HashMap.clear();
2032 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
2033 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2034 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2035 // Select the most optimal lane in terms of number of operands that
2036 // should be moved around.
2037 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2038 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
2039 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2040 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2041 auto *It = HashMap.find(Key: NumFreeOpsHash.Hash);
2042 if (It == HashMap.end())
2043 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
2044 else
2045 ++It->second.first;
2046 }
2047 }
2048 // Select the lane with the minimum counter.
2049 unsigned BestLane = 0;
2050 unsigned CntMin = UINT_MAX;
2051 for (const auto &Data : reverse(C&: HashMap)) {
2052 if (Data.second.first < CntMin) {
2053 CntMin = Data.second.first;
2054 BestLane = Data.second.second;
2055 }
2056 }
2057 return BestLane;
2058 }
2059
2060 /// Data structure that helps to reorder operands.
2061 struct OperandsOrderData {
2062 /// The best number of operands with the same APOs, which can be
2063 /// reordered.
2064 unsigned NumOfAPOs = UINT_MAX;
2065 /// Number of operands with the same/alternate instruction opcode and
2066 /// parent.
2067 unsigned NumOpsWithSameOpcodeParent = 0;
2068 /// Hash for the actual operands ordering.
2069 /// Used to count operands, actually their position id and opcode
2070 /// value. It is used in the voting mechanism to find the lane with the
2071 /// fewest operands that can freely move about, or that is the least
2072 /// profitable to reorder because it already has the optimal set of operands.
2073 /// Could be replaced with a SmallVector<unsigned> instead, but a hash code
2074 /// is faster and requires less memory.
2075 unsigned Hash = 0;
2076 };
2077 /// \returns the maximum number of operands that are allowed to be reordered
2078 /// for \p Lane and the number of compatible instructions (with the same
2079 /// parent/opcode). This is used as a heuristic for selecting the first lane
2080 /// to start operand reordering.
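/// For example, a lane 'a + b' has both operands attached to a non-inverse
/// operation (two equal APOs), so NumOfAPOs = 2, while a lane 'a - b' has one
/// of each, so NumOfAPOs = max(1, 1) = 1 and it is preferred as the starting
/// lane, because its operands cannot be freely swapped.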
2081 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2082 unsigned CntTrue = 0;
2083 unsigned NumOperands = getNumOperands();
2084 // Operands with the same APO can be reordered. We therefore need to count
2085 // how many of them we have for each APO, like this: Cnt[APO] = x.
2086 // Since we only have two APOs, namely true and false, we can avoid using
2087 // a map. Instead we can simply count the number of operands that
2088 // correspond to one of them (in this case the 'true' APO), and calculate
2089 // the other by subtracting it from the total number of operands.
2090 // Operands with the same instruction opcode and parent are more
2091 // profitable since we don't need to move them in many cases, with a high
2092 // probability such lane already can be vectorized effectively.
2093 bool AllUndefs = true;
2094 unsigned NumOpsWithSameOpcodeParent = 0;
2095 Instruction *OpcodeI = nullptr;
2096 BasicBlock *Parent = nullptr;
2097 unsigned Hash = 0;
2098 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2099 const OperandData &OpData = getData(OpIdx, Lane);
2100 if (OpData.APO)
2101 ++CntTrue;
2102 // Use Boyer-Moore majority voting for finding the majority opcode and
2103 // the number of times it occurs.
2104 if (auto *I = dyn_cast<Instruction>(Val: OpData.V)) {
2105 if (!OpcodeI || !getSameOpcode(VL: {OpcodeI, I}, TLI).getOpcode() ||
2106 I->getParent() != Parent) {
2107 if (NumOpsWithSameOpcodeParent == 0) {
2108 NumOpsWithSameOpcodeParent = 1;
2109 OpcodeI = I;
2110 Parent = I->getParent();
2111 } else {
2112 --NumOpsWithSameOpcodeParent;
2113 }
2114 } else {
2115 ++NumOpsWithSameOpcodeParent;
2116 }
2117 }
2118 Hash = hash_combine(
2119 args: Hash, args: hash_value(value: (OpIdx + 1) * (OpData.V->getValueID() + 1)));
2120 AllUndefs = AllUndefs && isa<UndefValue>(Val: OpData.V);
2121 }
2122 if (AllUndefs)
2123 return {};
2124 OperandsOrderData Data;
2125 Data.NumOfAPOs = std::max(a: CntTrue, b: NumOperands - CntTrue);
2126 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2127 Data.Hash = Hash;
2128 return Data;
2129 }
2130
2131 /// Go through the instructions in VL and append their operands.
2132 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2133 assert(!VL.empty() && "Bad VL");
2134 assert((empty() || VL.size() == getNumLanes()) &&
2135 "Expected same number of lanes");
2136 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2137 unsigned NumOperands = cast<Instruction>(Val: VL[0])->getNumOperands();
2138 constexpr unsigned IntrinsicNumOperands = 2;
2139 if (isa<IntrinsicInst>(Val: VL[0]))
2140 NumOperands = IntrinsicNumOperands;
2141 OpsVec.resize(N: NumOperands);
2142 unsigned NumLanes = VL.size();
2143 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2144 OpsVec[OpIdx].resize(N: NumLanes);
2145 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2146 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2147 // Our tree has just 3 nodes: the root and two operands.
2148 // It is therefore trivial to get the APO. We only need to check the
2149 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2150 // RHS operand. The LHS operand of both add and sub is never attached
2151 // to an inverse operation in the linearized form, therefore its APO
2152 // is false. The RHS's APO is true only if VL[Lane] is an inverse operation.
2153
2154 // Since operand reordering is performed on groups of commutative
2155 // operations or alternating sequences (e.g., +, -), we can safely
2156 // tell the inverse operations by checking commutativity.
2157 bool IsInverseOperation = !isCommutative(I: cast<Instruction>(Val: VL[Lane]));
2158 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2159 OpsVec[OpIdx][Lane] = {cast<Instruction>(Val: VL[Lane])->getOperand(i: OpIdx),
2160 APO, false};
2161 }
2162 }
2163 }
2164
2165 /// \returns the number of operands.
2166 unsigned getNumOperands() const { return OpsVec.size(); }
2167
2168 /// \returns the number of lanes.
2169 unsigned getNumLanes() const { return OpsVec[0].size(); }
2170
2171 /// \returns the operand value at \p OpIdx and \p Lane.
2172 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2173 return getData(OpIdx, Lane).V;
2174 }
2175
2176 /// \returns true if the data structure is empty.
2177 bool empty() const { return OpsVec.empty(); }
2178
2179 /// Clears the data.
2180 void clear() { OpsVec.clear(); }
2181
2182 /// \Returns true if there are enough operands identical to \p Op to fill
2183 /// the whole vector, possibly mixed with constants or loop-invariant values.
2184 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
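/// For example, for Op == %x at lane 0 of the bundle
/// {%x + 1, 2 + %x, %x + 3, 4 + %x}, every other lane contains %x in one of
/// its operand slots, so the whole operand vector can be built as a broadcast
/// of %x blended with a vector of constants.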
2185 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2186 bool OpAPO = getData(OpIdx, Lane).APO;
2187 bool IsInvariant = L && L->isLoopInvariant(V: Op);
2188 unsigned Cnt = 0;
2189 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2190 if (Ln == Lane)
2191 continue;
2192 // This is set to true if we found a candidate for broadcast at Lane.
2193 bool FoundCandidate = false;
2194 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2195 OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
2196 if (Data.APO != OpAPO || Data.IsUsed)
2197 continue;
2198 Value *OpILane = getValue(OpIdx: OpI, Lane);
2199 bool IsConstantOp = isa<Constant>(Val: OpILane);
2200 // Consider the broadcast candidate if:
2201 // 1. Same value is found in one of the operands.
2202 if (Data.V == Op ||
2203 // 2. The operand in the given lane is not constant but there is a
2204 // constant operand in another lane (which can be moved to the
2205 // given lane). In this case we can represent it as a simple
2206 // permutation of constant and broadcast.
2207 (!IsConstantOp &&
2208 ((Lns > 2 && isa<Constant>(Val: Data.V)) ||
2209 // 2.1. If we have only 2 lanes, we need to check that the value in
2210 // the next lane does not build the same opcode sequence.
2211 (Lns == 2 &&
2212 !getSameOpcode(VL: {Op, getValue(OpIdx: (OpI + 1) % OpE, Lane: Ln)}, TLI)
2213 .getOpcode() &&
2214 isa<Constant>(Val: Data.V)))) ||
2215 // 3. The operand in the current lane is loop invariant (can be
2216 // hoisted out) and another operand is also a loop invariant
2217 // (though not a constant). In this case the whole vector can be
2218 // hoisted out.
2219 // FIXME: need to teach the cost model about this case for better
2220 // estimation.
2221 (IsInvariant && !isa<Constant>(Val: Data.V) &&
2222 !getSameOpcode(VL: {Op, Data.V}, TLI).getOpcode() &&
2223 L->isLoopInvariant(V: Data.V))) {
2224 FoundCandidate = true;
2225 Data.IsUsed = Data.V == Op;
2226 if (Data.V == Op)
2227 ++Cnt;
2228 break;
2229 }
2230 }
2231 if (!FoundCandidate)
2232 return false;
2233 }
2234 return getNumLanes() == 2 || Cnt > 1;
2235 }
2236
2237 /// Checks if there is at least single compatible operand in lanes other
2238 /// than \p Lane, compatible with the operand \p Op.
2239 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2240 bool OpAPO = getData(OpIdx, Lane).APO;
2241 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2242 if (Ln == Lane)
2243 continue;
2244 if (any_of(Range: seq<unsigned>(Size: getNumOperands()), P: [&](unsigned OpI) {
2245 const OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
2246 if (Data.APO != OpAPO || Data.IsUsed)
2247 return true;
2248 Value *OpILn = getValue(OpIdx: OpI, Lane: Ln);
2249 return (L && L->isLoopInvariant(V: OpILn)) ||
2250 (getSameOpcode(VL: {Op, OpILn}, TLI).getOpcode() &&
2251 Op->getParent() == cast<Instruction>(Val: OpILn)->getParent());
2252 }))
2253 return true;
2254 }
2255 return false;
2256 }
2257
2258 public:
2259 /// Initialize with all the operands of the instruction vector \p RootVL.
2260 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2261 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2262 L(R.LI->getLoopFor(
2263 BB: (cast<Instruction>(Val: RootVL.front())->getParent()))) {
2264 // Append all the operands of RootVL.
2265 appendOperandsOfVL(VL: RootVL);
2266 }
2267
2268 /// \Returns a value vector with the operands across all lanes for the
2269 /// operand at \p OpIdx.
2270 ValueList getVL(unsigned OpIdx) const {
2271 ValueList OpVL(OpsVec[OpIdx].size());
2272 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2273 "Expected same num of lanes across all operands");
2274 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2275 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2276 return OpVL;
2277 }
2278
2279 // Performs operand reordering for 2 or more operands.
2280 // The original operands are in OrigOps[OpIdx][Lane].
2281 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2282 void reorder() {
2283 unsigned NumOperands = getNumOperands();
2284 unsigned NumLanes = getNumLanes();
2285 // Each operand has its own mode. We are using this mode to help us select
2286 // the instructions for each lane, so that they match best with the ones
2287 // we have selected so far.
2288 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2289
2290 // This is a greedy single-pass algorithm. We are going over each lane
2291 // once and deciding on the best order right away with no back-tracking.
2292 // However, in order to increase its effectiveness, we start with the lane
2293 // that has operands that can move the least. For example, given the
2294 // following lanes:
2295 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2296 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2297 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2298 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2299 // we will start at Lane 1, since the operands of the subtraction cannot
2300 // be reordered. Then we will visit the rest of the lanes in a circular
2301 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2302
2303 // Find the first lane that we will start our search from.
2304 unsigned FirstLane = getBestLaneToStartReordering();
2305
2306 // Initialize the modes.
2307 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2308 Value *OpLane0 = getValue(OpIdx, Lane: FirstLane);
2309 // Keep track if we have instructions with all the same opcode on one
2310 // side.
2311 if (isa<LoadInst>(Val: OpLane0))
2312 ReorderingModes[OpIdx] = ReorderingMode::Load;
2313 else if (auto *OpILane0 = dyn_cast<Instruction>(Val: OpLane0)) {
2314 // Check if OpLane0 should be broadcast.
2315 if (shouldBroadcast(Op: OpLane0, OpIdx, Lane: FirstLane) ||
2316 !canBeVectorized(Op: OpILane0, OpIdx, Lane: FirstLane))
2317 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2318 else
2319 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2320 } else if (isa<Constant>(Val: OpLane0))
2321 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2322 else if (isa<Argument>(Val: OpLane0))
2323 // Our best hope is a Splat. It may save some cost in some cases.
2324 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2325 else
2326 // NOTE: This should be unreachable.
2327 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2328 }
2329
2330 // Check that we don't have the same operands. There is no need to reorder
2331 // if the operands are just a perfect or shuffled diamond match. The only
2332 // exceptions (for now) are possible broadcasts and a non-power-of-2 number
2333 // of scalars, for which we still reorder.
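// For illustration, a "diamond match": if all operand vectors draw from the
// same set of values, e.g. operand 0 = {a, b, c, d} and
// operand 1 = {b, a, d, c}, reordering cannot improve anything and would only
// add external-use cost for the shuffled uses.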
2334 auto &&SkipReordering = [this]() {
2335 SmallPtrSet<Value *, 4> UniqueValues;
2336 ArrayRef<OperandData> Op0 = OpsVec.front();
2337 for (const OperandData &Data : Op0)
2338 UniqueValues.insert(Ptr: Data.V);
2339 for (ArrayRef<OperandData> Op : drop_begin(RangeOrContainer&: OpsVec, N: 1)) {
2340 if (any_of(Range&: Op, P: [&UniqueValues](const OperandData &Data) {
2341 return !UniqueValues.contains(Ptr: Data.V);
2342 }))
2343 return false;
2344 }
2345 // TODO: Check if we can remove a check for non-power-2 number of
2346 // scalars after full support of non-power-2 vectorization.
2347 return UniqueValues.size() != 2 && isPowerOf2_32(Value: UniqueValues.size());
2348 };
2349
2350 // If the initial strategy fails for any of the operand indexes, then we
2351 // perform reordering again in a second pass. This helps avoid assigning
2352 // high priority to the failed strategy, and should improve reordering for
2353 // the non-failed operand indexes.
2354 for (int Pass = 0; Pass != 2; ++Pass) {
2355 // Check if there is no need to reorder the operands, since they form a
2356 // perfect or shuffled diamond match. We need to do this to avoid counting
2357 // extra external-use cost for shuffled matches, which may cause
2358 // regressions.
2359 if (SkipReordering())
2360 break;
2361 // Skip the second pass if the first pass did not fail.
2362 bool StrategyFailed = false;
2363 // Mark all operand data as free to use.
2364 clearUsed();
2365 // We keep the original operand order for the FirstLane, so reorder the
2366 // rest of the lanes. We are visiting the nodes in a circular fashion,
2367 // using FirstLane as the center point and increasing the radius
2368 // distance.
2369 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2370 for (unsigned I = 0; I < NumOperands; ++I)
2371 MainAltOps[I].push_back(Elt: getData(OpIdx: I, Lane: FirstLane).V);
2372
2373 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2374 // Visit the lane on the right and then the lane on the left.
2375 for (int Direction : {+1, -1}) {
2376 int Lane = FirstLane + Direction * Distance;
2377 if (Lane < 0 || Lane >= (int)NumLanes)
2378 continue;
2379 int LastLane = Lane - Direction;
2380 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2381 "Out of bounds");
2382 // Look for a good match for each operand.
2383 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2384 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2385 std::optional<unsigned> BestIdx = getBestOperand(
2386 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps: MainAltOps[OpIdx]);
2387 // By not selecting a value, we allow the operands that follow to
2388 // select a better matching value. We will get a non-null value in
2389 // the next run of getBestOperand().
2390 if (BestIdx) {
2391 // Swap the current operand with the one returned by
2392 // getBestOperand().
2393 swap(OpIdx1: OpIdx, OpIdx2: *BestIdx, Lane);
2394 } else {
2395 // Enable the second pass.
2396 StrategyFailed = true;
2397 }
2398 // Try to get the alternate opcode and follow it during analysis.
2399 if (MainAltOps[OpIdx].size() != 2) {
2400 OperandData &AltOp = getData(OpIdx, Lane);
2401 InstructionsState OpS =
2402 getSameOpcode(VL: {MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2403 if (OpS.getOpcode() && OpS.isAltShuffle())
2404 MainAltOps[OpIdx].push_back(Elt: AltOp.V);
2405 }
2406 }
2407 }
2408 }
2409 // Skip second pass if the strategy did not fail.
2410 if (!StrategyFailed)
2411 break;
2412 }
2413 }
2414
2415#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2416 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2417 switch (RMode) {
2418 case ReorderingMode::Load:
2419 return "Load";
2420 case ReorderingMode::Opcode:
2421 return "Opcode";
2422 case ReorderingMode::Constant:
2423 return "Constant";
2424 case ReorderingMode::Splat:
2425 return "Splat";
2426 case ReorderingMode::Failed:
2427 return "Failed";
2428 }
2429 llvm_unreachable("Unimplemented Reordering Type");
2430 }
2431
2432 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2433 raw_ostream &OS) {
2434 return OS << getModeStr(RMode);
2435 }
2436
2437 /// Debug print.
2438 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2439 printMode(RMode, dbgs());
2440 }
2441
2442 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2443 return printMode(RMode, OS);
2444 }
2445
2446 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2447 const unsigned Indent = 2;
2448 unsigned Cnt = 0;
2449 for (const OperandDataVec &OpDataVec : OpsVec) {
2450 OS << "Operand " << Cnt++ << "\n";
2451 for (const OperandData &OpData : OpDataVec) {
2452 OS.indent(Indent) << "{";
2453 if (Value *V = OpData.V)
2454 OS << *V;
2455 else
2456 OS << "null";
2457 OS << ", APO:" << OpData.APO << "}\n";
2458 }
2459 OS << "\n";
2460 }
2461 return OS;
2462 }
2463
2464 /// Debug print.
2465 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2466#endif
2467 };
2468
2469 /// Evaluate each pair in \p Candidates and return index into \p Candidates
2470 /// for the pair which has the highest score and is deemed to have the best
2471 /// chance to form the root of a profitable tree to vectorize. Return
2472 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2473 /// \param Limit The lower limit of the score that is considered good enough.
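/// For example, given Candidates = {(A[0], A[1]), (A[0], X)} where A[0] and
/// A[1] are consecutive loads and X is unrelated, the first pair scores
/// higher and index 0 is returned; if neither pair scores above \p Limit,
/// std::nullopt is returned.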
2474 std::optional<int>
2475 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2476 int Limit = LookAheadHeuristics::ScoreFail) const {
2477 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2478 RootLookAheadMaxDepth);
2479 int BestScore = Limit;
2480 std::optional<int> Index;
2481 for (int I : seq<int>(Begin: 0, End: Candidates.size())) {
2482 int Score = LookAhead.getScoreAtLevelRec(LHS: Candidates[I].first,
2483 RHS: Candidates[I].second,
2484 /*U1=*/nullptr, /*U2=*/nullptr,
2485 /*Level=*/CurrLevel: 1, MainAltOps: std::nullopt);
2486 if (Score > BestScore) {
2487 BestScore = Score;
2488 Index = I;
2489 }
2490 }
2491 return Index;
2492 }
2493
2494 /// Checks if the instruction is marked for deletion.
2495 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(V: I); }
2496
2497 /// Removes an instruction from its block and eventually deletes it.
2498 /// It's like Instruction::eraseFromParent() except that the actual deletion
2499 /// is delayed until BoUpSLP is destructed.
2500 void eraseInstruction(Instruction *I) {
2501 DeletedInstructions.insert(V: I);
2502 }
2503
2504 /// Remove instructions from the parent function and clear the operands of \p
2505 /// DeadVals instructions, marking trivially dead operands for deletion.
2506 template <typename T>
2507 void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2508 SmallVector<WeakTrackingVH> DeadInsts;
2509 for (T *V : DeadVals) {
2510 auto *I = cast<Instruction>(V);
2511 DeletedInstructions.insert(I);
2512 }
2513 DenseSet<Value *> Processed;
2514 for (T *V : DeadVals) {
2515 if (!V || !Processed.insert(V).second)
2516 continue;
2517 auto *I = cast<Instruction>(V);
2518 salvageDebugInfo(*I);
2519 SmallVector<const TreeEntry *> Entries;
2520 if (const TreeEntry *Entry = getTreeEntry(I)) {
2521 Entries.push_back(Elt: Entry);
2522 auto It = MultiNodeScalars.find(I);
2523 if (It != MultiNodeScalars.end())
2524 Entries.append(It->second.begin(), It->second.end());
2525 }
2526 for (Use &U : I->operands()) {
2527 if (auto *OpI = dyn_cast_if_present<Instruction>(Val: U.get());
2528 OpI && !DeletedInstructions.contains(V: OpI) && OpI->hasOneUser() &&
2529 wouldInstructionBeTriviallyDead(I: OpI, TLI) &&
2530 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2531 return Entry->VectorizedValue == OpI;
2532 })))
2533 DeadInsts.push_back(Elt: OpI);
2534 }
2535 I->dropAllReferences();
2536 }
2537 for (T *V : DeadVals) {
2538 auto *I = cast<Instruction>(V);
2539 if (!I->getParent())
2540 continue;
2541 assert((I->use_empty() || all_of(I->uses(),
2542 [&](Use &U) {
2543 return isDeleted(
2544 cast<Instruction>(U.getUser()));
2545 })) &&
2546 "trying to erase instruction with users.");
2547 I->removeFromParent();
2548 SE->forgetValue(V: I);
2549 }
2550 // Process the dead instruction list until empty.
2551 while (!DeadInsts.empty()) {
2552 Value *V = DeadInsts.pop_back_val();
2553 Instruction *VI = cast_or_null<Instruction>(Val: V);
2554 if (!VI || !VI->getParent())
2555 continue;
2556 assert(isInstructionTriviallyDead(VI, TLI) &&
2557 "Live instruction found in dead worklist!");
2558 assert(VI->use_empty() && "Instructions with uses are not dead.");
2559
2560 // Don't lose the debug info while deleting the instructions.
2561 salvageDebugInfo(I&: *VI);
2562
2563 // Null out all of the instruction's operands to see if any operand
2564 // becomes dead as we go.
2565 for (Use &OpU : VI->operands()) {
2566 Value *OpV = OpU.get();
2567 if (!OpV)
2568 continue;
2569 OpU.set(nullptr);
2570
2571 if (!OpV->use_empty())
2572 continue;
2573
2574 // If the operand is an instruction that became dead as we nulled out
2575 // the operand, and if it is 'trivially' dead, delete it in a future
2576 // loop iteration.
2577 if (auto *OpI = dyn_cast<Instruction>(Val: OpV))
2578 if (!DeletedInstructions.contains(V: OpI) &&
2579 isInstructionTriviallyDead(I: OpI, TLI))
2580 DeadInsts.push_back(Elt: OpI);
2581 }
2582
2583 VI->removeFromParent();
2584 DeletedInstructions.insert(V: VI);
2585 SE->forgetValue(V: VI);
2586 }
2587 }
2588
2589 /// Checks if the instruction was already analyzed for being a possible
2590 /// reduction root.
2591 bool isAnalyzedReductionRoot(Instruction *I) const {
2592 return AnalyzedReductionsRoots.count(Ptr: I);
2593 }
2594 /// Register given instruction as already analyzed for being a possible
2595 /// reduction root.
2596 void analyzedReductionRoot(Instruction *I) {
2597 AnalyzedReductionsRoots.insert(Ptr: I);
2598 }
2599 /// Checks if the provided list of reduced values was checked already for
2600 /// vectorization.
2601 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2602 return AnalyzedReductionVals.contains(V: hash_value(S: VL));
2603 }
2604 /// Adds the list of reduced values to list of already checked values for the
2605 /// vectorization.
2606 void analyzedReductionVals(ArrayRef<Value *> VL) {
2607 AnalyzedReductionVals.insert(V: hash_value(S: VL));
2608 }
2609 /// Clear the list of the analyzed reduction root instructions.
2610 void clearReductionData() {
2611 AnalyzedReductionsRoots.clear();
2612 AnalyzedReductionVals.clear();
2613 AnalyzedMinBWVals.clear();
2614 }
2615 /// Checks if the given value is gathered in one of the nodes.
2616 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2617 return any_of(Range: MustGather, P: [&](Value *V) { return Vals.contains(V); });
2618 }
2619 /// Checks if the given value is gathered in one of the nodes.
2620 bool isGathered(const Value *V) const {
2621 return MustGather.contains(Ptr: V);
2622 }
2623 /// Checks if the specified value was not scheduled.
2624 bool isNotScheduled(const Value *V) const {
2625 return NonScheduledFirst.contains(Ptr: V);
2626 }
2627
2628 /// Check if the value is vectorized in the tree.
2629 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2630
2631 ~BoUpSLP();
2632
2633private:
2634 /// Determine if a node \p E can be demoted to a smaller type with a
2635 /// truncation. We collect the entries that will be demoted in ToDemote.
2636 /// \param E Node for analysis
2637 /// \param ToDemote indices of the nodes to be demoted.
2638 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2639 unsigned &BitWidth,
2640 SmallVectorImpl<unsigned> &ToDemote,
2641 DenseSet<const TreeEntry *> &Visited,
2642 unsigned &MaxDepthLevel,
2643 bool &IsProfitableToDemote,
2644 bool IsTruncRoot) const;
2645
2646 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2647 /// reordering (i.e. the operands can be reordered because they have only one
2648 /// user and are reorderable).
2649 /// \param ReorderableGathers List of all gather nodes that require reordering
2650 /// (e.g., gather of extractelements or partially vectorizable loads).
2651 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2652 /// reordering, subset of \p NonVectorized.
2653 bool
2654 canReorderOperands(TreeEntry *UserTE,
2655 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2656 ArrayRef<TreeEntry *> ReorderableGathers,
2657 SmallVectorImpl<TreeEntry *> &GatherOps);
2658
2659 /// Checks if the given \p TE is a gather node with clustered reused scalars
2660 /// and reorders it per given \p Mask.
2661 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2662
2663 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2664 /// if any. If it is not vectorized (gather node), returns nullptr.
2665 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2666 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2667 TreeEntry *TE = nullptr;
2668 const auto *It = find_if(Range&: VL, P: [&](Value *V) {
2669 TE = getTreeEntry(V);
2670 if (TE && is_contained(Range&: TE->UserTreeIndices, Element: EdgeInfo(UserTE, OpIdx)))
2671 return true;
2672 auto It = MultiNodeScalars.find(Val: V);
2673 if (It != MultiNodeScalars.end()) {
2674 for (TreeEntry *E : It->second) {
2675 if (is_contained(Range&: E->UserTreeIndices, Element: EdgeInfo(UserTE, OpIdx))) {
2676 TE = E;
2677 return true;
2678 }
2679 }
2680 }
2681 return false;
2682 });
2683 if (It != VL.end()) {
2684 assert(TE->isSame(VL) && "Expected same scalars.");
2685 return TE;
2686 }
2687 return nullptr;
2688 }
2689
2690 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2691 /// if any. If it is not vectorized (gather node), returns nullptr.
2692 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2693 unsigned OpIdx) const {
2694 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2695 UserTE: const_cast<TreeEntry *>(UserTE), OpIdx);
2696 }
2697
2698 /// Checks if all users of \p I are the part of the vectorization tree.
2699 bool areAllUsersVectorized(
2700 Instruction *I,
2701 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2702
2703 /// Return information about the vector formed for the specified index
2704 /// of a vector of (the same) instruction.
2705 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2706
2707 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2708 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2709
2710 /// \returns Cast context for the given graph node.
2711 TargetTransformInfo::CastContextHint
2712 getCastContextHint(const TreeEntry &TE) const;
2713
2714 /// \returns the cost of the vectorizable entry.
2715 InstructionCost getEntryCost(const TreeEntry *E,
2716 ArrayRef<Value *> VectorizedVals,
2717 SmallPtrSetImpl<Value *> &CheckedExtracts);
2718
2719 /// This is the recursive part of buildTree.
2720 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2721 const EdgeInfo &EI);
2722
2723 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2724 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2725 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2726 /// returns false, setting \p CurrentOrder to either an empty vector or a
2727 /// non-identity permutation that allows reusing the extract instructions.
2728 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2729 /// extract order.
2730 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2731 SmallVectorImpl<unsigned> &CurrentOrder,
2732 bool ResizeAllowed = false) const;
2733
2734 /// Vectorize a single entry in the tree.
2735 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2736 /// avoid issues with def-use order.
2737 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2738
2739 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2740 /// \p E.
2741 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2742 /// avoid issues with def-use order.
2743 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2744
2745 /// Create a new vector from a list of scalar values. Produces a sequence
2746 /// which exploits values reused across lanes, and arranges the inserts
2747 /// for ease of later optimization.
2748 template <typename BVTy, typename ResTy, typename... Args>
2749 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2750
2751 /// Create a new vector from a list of scalar values. Produces a sequence
2752 /// which exploits values reused across lanes, and arranges the inserts
2753 /// for ease of later optimization.
2754 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2755
2756 /// Returns the instruction in the bundle, which can be used as a base point
2757 /// for scheduling. Usually it is the last instruction in the bundle, except
2758 /// for the case when all operands are external (in this case, it is the first
2759 /// instruction in the list).
2760 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2761
2762 /// Tries to find extractelement instructions with constant indices from fixed
2763 /// vector type and gather such instructions into a bunch, which highly likely
2764 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2765 /// was successful, the matched scalars are replaced by poison values in \p VL
2766 /// for future analysis.
2767 std::optional<TargetTransformInfo::ShuffleKind>
2768 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2769 SmallVectorImpl<int> &Mask) const;
2770
2771 /// Tries to find extractelement instructions with constant indices from fixed
2772 /// vector type and gather such instructions into a bunch, which highly likely
2773 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2774 /// was successful, the matched scalars are replaced by poison values in \p VL
2775 /// for future analysis.
2776 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2777 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2778 SmallVectorImpl<int> &Mask,
2779 unsigned NumParts) const;
2780
2781 /// Checks if the gathered \p VL can be represented as a single register
2782 /// shuffle(s) of previous tree entries.
2783 /// \param TE Tree entry checked for permutation.
2784 /// \param VL List of scalars (a subset of the TE scalars), checked for
2785 /// permutations. Must form single-register vector.
2786 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2787 /// commands to build the mask using the original vector value, without
2788 /// relying on the potential reordering.
2789 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2790 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2791 std::optional<TargetTransformInfo::ShuffleKind>
2792 isGatherShuffledSingleRegisterEntry(
2793 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2794 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2795 bool ForOrder);
2796
2797 /// Checks if the gathered \p VL can be represented as multi-register
2798 /// shuffle(s) of previous tree entries.
2799 /// \param TE Tree entry checked for permutation.
2800 /// \param VL List of scalars (a subset of the TE scalars), checked for
2801 /// permutations.
2802 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2803 /// commands to build the mask using the original vector value, without
2804 /// relying on the potential reordering.
2805 /// \returns per-register series of ShuffleKind, if gathered values can be
2806 /// represented as shuffles of previous tree entries. \p Mask is filled with
2807 /// the shuffle mask (also on per-register base).
2808 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2809 isGatherShuffledEntry(
2810 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2811 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2812 unsigned NumParts, bool ForOrder = false);
2813
2814 /// \returns the scalarization cost for this list of values. Assuming that
2815 /// this subtree gets vectorized, we may need to extract the values from the
2816 /// roots. This method calculates the cost of extracting the values.
2817 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2818 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2819 Type *ScalarTy) const;
2820
2821 /// Set the Builder insert point to one after the last instruction in
2822 /// the bundle
2823 void setInsertPointAfterBundle(const TreeEntry *E);
2824
2825 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
2826 /// specified, the starting vector value is poison.
2827 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2828
2829 /// \returns whether the VectorizableTree is fully vectorizable and will
2830 /// be beneficial even if the tree height is tiny.
2831 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2832
2833 /// Reorder commutative or alt operands to get better probability of
2834 /// generating vectorized code.
2835 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2836 SmallVectorImpl<Value *> &Left,
2837 SmallVectorImpl<Value *> &Right,
2838 const BoUpSLP &R);
2839
2840 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2841 /// users of \p TE and collects the stores. It returns the map from the store
2842 /// pointers to the collected stores.
2843 DenseMap<Value *, SmallVector<StoreInst *>>
2844 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2845
2846 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2847 /// stores in \p StoresVec can form a vector instruction. If so it returns
2848 /// true and populates \p ReorderIndices with the shuffle indices of the
2849 /// stores when compared to the sorted vector.
2850 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2851 OrdersType &ReorderIndices) const;
2852
2853 /// Iterates through the users of \p TE, looking for scalar stores that can be
2854 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2855 /// their order and builds an order index vector for each store bundle. It
2856 /// returns all these order vectors found.
2857 /// We run this after the tree has formed, otherwise we may come across user
2858 /// instructions that are not yet in the tree.
2859 SmallVector<OrdersType, 1>
2860 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2861
2862 struct TreeEntry {
2863 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2864 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2865
2866 /// \returns Common mask for reorder indices and reused scalars.
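/// For example (illustrative), with ReorderIndices = {1, 2, 0} the inverse
/// permutation is {2, 0, 1}; the reuse mask, if present, is then composed on
/// top of that result.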
2867 SmallVector<int> getCommonMask() const {
2868 SmallVector<int> Mask;
2869 inversePermutation(ReorderIndices, Mask);
2870 ::addMask(Mask, ReuseShuffleIndices);
2871 return Mask;
2872 }
2873
2874 /// \returns true if the scalars in VL are equal to this entry.
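/// For example (illustrative): with Scalars = {A, B} and ReuseShuffleIndices
/// = {0, 1, 0, 1}, the list VL = {A, B, A, B} is considered the same, since
/// every element of VL matches Scalars at its reuse index.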
2875 bool isSame(ArrayRef<Value *> VL) const {
2876 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2877 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2878 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2879 return VL.size() == Mask.size() &&
2880 std::equal(VL.begin(), VL.end(), Mask.begin(),
2881 [Scalars](Value *V, int Idx) {
2882 return (isa<UndefValue>(V) &&
2883 Idx == PoisonMaskElem) ||
2884 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2885 });
2886 };
2887 if (!ReorderIndices.empty()) {
2888 // TODO: implement matching if the nodes are just reordered, still can
2889 // treat the vector as the same if the list of scalars matches VL
2890 // directly, without reordering.
2891 SmallVector<int> Mask;
2892 inversePermutation(ReorderIndices, Mask);
2893 if (VL.size() == Scalars.size())
2894 return IsSame(Scalars, Mask);
2895 if (VL.size() == ReuseShuffleIndices.size()) {
2896 ::addMask(Mask, ReuseShuffleIndices);
2897 return IsSame(Scalars, Mask);
2898 }
2899 return false;
2900 }
2901 return IsSame(Scalars, ReuseShuffleIndices);
2902 }
2903
2904 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2905 return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2906 UserTreeIndices.front().UserTE == UserEI.UserTE;
2907 }
2908
2909 /// \returns true if current entry has same operands as \p TE.
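/// The comparison is order-insensitive: each operand list of \p TE must match
/// some distinct, not-yet-matched operand list of this entry.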
2910 bool hasEqualOperands(const TreeEntry &TE) const {
2911 if (TE.getNumOperands() != getNumOperands())
2912 return false;
2913 SmallBitVector Used(getNumOperands());
2914 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2915 unsigned PrevCount = Used.count();
2916 for (unsigned K = 0; K < E; ++K) {
2917 if (Used.test(K))
2918 continue;
2919 if (getOperand(K) == TE.getOperand(I)) {
2920 Used.set(K);
2921 break;
2922 }
2923 }
2924 // Check if we actually found the matching operand.
2925 if (PrevCount == Used.count())
2926 return false;
2927 }
2928 return true;
2929 }
2930
2931 /// \return Final vectorization factor for the node. Defined by the total
2932 /// number of vectorized scalars, including those used several times in the
2933 /// entry and counted in the \a ReuseShuffleIndices, if any.
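/// For example (illustrative), an entry with 2 scalars and
/// ReuseShuffleIndices = {0, 1, 0, 1} has a vector factor of 4.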
2934 unsigned getVectorFactor() const {
2935 if (!ReuseShuffleIndices.empty())
2936 return ReuseShuffleIndices.size();
2937 return Scalars.size();
2938 }
2939
2940 /// Checks if the current node is a gather node.
2941 bool isGather() const { return State == NeedToGather; }
2942
2943 /// A vector of scalars.
2944 ValueList Scalars;
2945
2946 /// The Scalars are vectorized into this value. It is initialized to Null.
2947 WeakTrackingVH VectorizedValue = nullptr;
2948
2949 /// New vector phi instructions emitted for the vectorized phi nodes.
2950 PHINode *PHI = nullptr;
2951
2952 /// Do we need to gather this sequence or vectorize it
2953 /// (either with vector instruction or with scatter/gather
2954 /// intrinsics for store/load)?
2955 enum EntryState {
2956 Vectorize,
2957 ScatterVectorize,
2958 StridedVectorize,
2959 NeedToGather
2960 };
2961 EntryState State;
2962
2963 /// Does this sequence require some shuffling?
2964 SmallVector<int, 4> ReuseShuffleIndices;
2965
2966 /// Does this entry require reordering?
2967 SmallVector<unsigned, 4> ReorderIndices;
2968
2969 /// Points back to the VectorizableTree.
2970 ///
2971 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2972 /// to be a pointer and needs to be able to initialize the child iterator.
2973 /// Thus we need a reference back to the container to translate the indices
2974 /// to entries.
2975 VecTreeTy &Container;
2976
2977 /// The TreeEntry index containing the user of this entry. We can actually
2978 /// have multiple users so the data structure is not truly a tree.
2979 SmallVector<EdgeInfo, 1> UserTreeIndices;
2980
2981 /// The index of this treeEntry in VectorizableTree.
2982 int Idx = -1;
2983
2984 private:
2985 /// The operands of each instruction in each lane Operands[op_index][lane].
2986 /// Note: This helps avoid the replication of the code that performs the
2987 /// reordering of operands during buildTree_rec() and vectorizeTree().
2988 SmallVector<ValueList, 2> Operands;
2989
2990 /// The main/alternate instruction.
2991 Instruction *MainOp = nullptr;
2992 Instruction *AltOp = nullptr;
2993
2994 public:
2995 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2996 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2997 if (Operands.size() < OpIdx + 1)
2998 Operands.resize(OpIdx + 1);
2999 assert(Operands[OpIdx].empty() && "Already resized?");
3000 assert(OpVL.size() <= Scalars.size() &&
3001 "Number of operands is greater than the number of scalars.");
3002 Operands[OpIdx].resize(OpVL.size());
3003 copy(OpVL, Operands[OpIdx].begin());
3004 }
3005
3006 /// Set the operands of this bundle in their original order.
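/// For example (illustrative), for the bundle {add A0, B0; add A1, B1} this
/// produces Operands[0] = {A0, A1} and Operands[1] = {B0, B1}.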
3007 void setOperandsInOrder() {
3008 assert(Operands.empty() && "Already initialized?");
3009 auto *I0 = cast<Instruction>(Scalars[0]);
3010 Operands.resize(I0->getNumOperands());
3011 unsigned NumLanes = Scalars.size();
3012 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
3013 OpIdx != NumOperands; ++OpIdx) {
3014 Operands[OpIdx].resize(NumLanes);
3015 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3016 auto *I = cast<Instruction>(Scalars[Lane]);
3017 assert(I->getNumOperands() == NumOperands &&
3018 "Expected same number of operands");
3019 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
3020 }
3021 }
3022 }
3023
3024 /// Reorders operands of the node to the given mask \p Mask.
3025 void reorderOperands(ArrayRef<int> Mask) {
3026 for (ValueList &Operand : Operands)
3027 reorderScalars(Operand, Mask);
3028 }
3029
3030 /// \returns the \p OpIdx operand of this TreeEntry.
3031 ValueList &getOperand(unsigned OpIdx) {
3032 assert(OpIdx < Operands.size() && "Off bounds");
3033 return Operands[OpIdx];
3034 }
3035
3036 /// \returns the \p OpIdx operand of this TreeEntry.
3037 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3038 assert(OpIdx < Operands.size() && "Off bounds");
3039 return Operands[OpIdx];
3040 }
3041
3042 /// \returns the number of operands.
3043 unsigned getNumOperands() const { return Operands.size(); }
3044
3045 /// \return the single \p OpIdx operand.
3046 Value *getSingleOperand(unsigned OpIdx) const {
3047 assert(OpIdx < Operands.size() && "Off bounds");
3048 assert(!Operands[OpIdx].empty() && "No operand available");
3049 return Operands[OpIdx][0];
3050 }
3051
3052 /// Some of the instructions in the list have alternate opcodes.
3053 bool isAltShuffle() const { return MainOp != AltOp; }
3054
3055 bool isOpcodeOrAlt(Instruction *I) const {
3056 unsigned CheckedOpcode = I->getOpcode();
3057 return (getOpcode() == CheckedOpcode ||
3058 getAltOpcode() == CheckedOpcode);
3059 }
3060
3061 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3062 /// alternate) opcode as the main operation of the entry, the key is \p Op.
3063 /// Otherwise the key is the main operation itself.
3064 Value *isOneOf(Value *Op) const {
3065 auto *I = dyn_cast<Instruction>(Op);
3066 if (I && isOpcodeOrAlt(I))
3067 return Op;
3068 return MainOp;
3069 }
3070
3071 void setOperations(const InstructionsState &S) {
3072 MainOp = S.MainOp;
3073 AltOp = S.AltOp;
3074 }
3075
3076 Instruction *getMainOp() const {
3077 return MainOp;
3078 }
3079
3080 Instruction *getAltOp() const {
3081 return AltOp;
3082 }
3083
3084 /// The main/alternate opcodes for the list of instructions.
3085 unsigned getOpcode() const {
3086 return MainOp ? MainOp->getOpcode() : 0;
3087 }
3088
3089 unsigned getAltOpcode() const {
3090 return AltOp ? AltOp->getOpcode() : 0;
3091 }
3092
3093 /// When ReuseShuffleIndices is empty it just returns the position of \p V
3094 /// within the vector of Scalars. Otherwise, it remaps via the reuse index.
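/// For example (illustrative), with ReorderIndices = {2, 3, 0, 1} a value
/// found at position 0 of Scalars is remapped to lane 2; if
/// ReuseShuffleIndices is non-empty, the first occurrence of that lane in it
/// gives the final result.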
3095 int findLaneForValue(Value *V) const {
3096 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
3097 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3098 if (!ReorderIndices.empty())
3099 FoundLane = ReorderIndices[FoundLane];
3100 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3101 if (!ReuseShuffleIndices.empty()) {
3102 FoundLane = std::distance(ReuseShuffleIndices.begin(),
3103 find(ReuseShuffleIndices, FoundLane));
3104 }
3105 return FoundLane;
3106 }
3107
3108 /// Build a shuffle mask for graph entry which represents a merge of main
3109 /// and alternate operations.
3110 void
3111 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3112 SmallVectorImpl<int> &Mask,
3113 SmallVectorImpl<Value *> *OpScalars = nullptr,
3114 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3115
3116 /// Return true if this is a non-power-of-2 node.
3117 bool isNonPowOf2Vec() const {
3118 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
3119 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3120 "Reshuffling not supported with non-power-of-2 vectors yet.");
3121 return IsNonPowerOf2;
3122 }
3123
3124#ifndef NDEBUG
3125 /// Debug printer.
3126 LLVM_DUMP_METHOD void dump() const {
3127 dbgs() << Idx << ".\n";
3128 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3129 dbgs() << "Operand " << OpI << ":\n";
3130 for (const Value *V : Operands[OpI])
3131 dbgs().indent(2) << *V << "\n";
3132 }
3133 dbgs() << "Scalars: \n";
3134 for (Value *V : Scalars)
3135 dbgs().indent(2) << *V << "\n";
3136 dbgs() << "State: ";
3137 switch (State) {
3138 case Vectorize:
3139 dbgs() << "Vectorize\n";
3140 break;
3141 case ScatterVectorize:
3142 dbgs() << "ScatterVectorize\n";
3143 break;
3144 case StridedVectorize:
3145 dbgs() << "StridedVectorize\n";
3146 break;
3147 case NeedToGather:
3148 dbgs() << "NeedToGather\n";
3149 break;
3150 }
3151 dbgs() << "MainOp: ";
3152 if (MainOp)
3153 dbgs() << *MainOp << "\n";
3154 else
3155 dbgs() << "NULL\n";
3156 dbgs() << "AltOp: ";
3157 if (AltOp)
3158 dbgs() << *AltOp << "\n";
3159 else
3160 dbgs() << "NULL\n";
3161 dbgs() << "VectorizedValue: ";
3162 if (VectorizedValue)
3163 dbgs() << *VectorizedValue << "\n";
3164 else
3165 dbgs() << "NULL\n";
3166 dbgs() << "ReuseShuffleIndices: ";
3167 if (ReuseShuffleIndices.empty())
3168 dbgs() << "Empty";
3169 else
3170 for (int ReuseIdx : ReuseShuffleIndices)
3171 dbgs() << ReuseIdx << ", ";
3172 dbgs() << "\n";
3173 dbgs() << "ReorderIndices: ";
3174 for (unsigned ReorderIdx : ReorderIndices)
3175 dbgs() << ReorderIdx << ", ";
3176 dbgs() << "\n";
3177 dbgs() << "UserTreeIndices: ";
3178 for (const auto &EInfo : UserTreeIndices)
3179 dbgs() << EInfo << ", ";
3180 dbgs() << "\n";
3181 }
3182#endif
3183 };
3184
3185#ifndef NDEBUG
3186 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3187 InstructionCost VecCost, InstructionCost ScalarCost,
3188 StringRef Banner) const {
3189 dbgs() << "SLP: " << Banner << ":\n";
3190 E->dump();
3191 dbgs() << "SLP: Costs:\n";
3192 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3193 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3194 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3195 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3196 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3197 }
3198#endif
3199
3200 /// Create a new VectorizableTree entry.
3201 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3202 std::optional<ScheduleData *> Bundle,
3203 const InstructionsState &S,
3204 const EdgeInfo &UserTreeIdx,
3205 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3206 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3207 TreeEntry::EntryState EntryState =
3208 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3209 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3210 ReuseShuffleIndices, ReorderIndices);
3211 }
3212
3213 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3214 TreeEntry::EntryState EntryState,
3215 std::optional<ScheduleData *> Bundle,
3216 const InstructionsState &S,
3217 const EdgeInfo &UserTreeIdx,
3218 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3219 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3220 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3221 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3222 "Need to vectorize gather entry?");
3223 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3224 TreeEntry *Last = VectorizableTree.back().get();
3225 Last->Idx = VectorizableTree.size() - 1;
3226 Last->State = EntryState;
3227 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3228 ReuseShuffleIndices.end());
3229 if (ReorderIndices.empty()) {
3230 Last->Scalars.assign(VL.begin(), VL.end());
3231 Last->setOperations(S);
3232 } else {
3233 // Reorder scalars and build final mask.
3234 Last->Scalars.assign(VL.size(), nullptr);
3235 transform(ReorderIndices, Last->Scalars.begin(),
3236 [VL](unsigned Idx) -> Value * {
3237 if (Idx >= VL.size())
3238 return UndefValue::get(VL.front()->getType());
3239 return VL[Idx];
3240 });
3241 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3242 Last->setOperations(S);
3243 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3244 }
3245 if (!Last->isGather()) {
3246 for (Value *V : VL) {
3247 const TreeEntry *TE = getTreeEntry(V);
3248 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3249 "Scalar already in tree!");
3250 if (TE) {
3251 if (TE != Last)
3252 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3253 continue;
3254 }
3255 ScalarToTreeEntry[V] = Last;
3256 }
3257 // Update the scheduler bundle to point to this TreeEntry.
3258 ScheduleData *BundleMember = *Bundle;
3259 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3260 isVectorLikeInstWithConstOps(S.MainOp) ||
3261 doesNotNeedToSchedule(VL)) &&
3262 "Bundle and VL out of sync");
3263 if (BundleMember) {
3264 for (Value *V : VL) {
3265 if (doesNotNeedToBeScheduled(V))
3266 continue;
3267 if (!BundleMember)
3268 continue;
3269 BundleMember->TE = Last;
3270 BundleMember = BundleMember->NextInBundle;
3271 }
3272 }
3273 assert(!BundleMember && "Bundle and VL out of sync");
3274 } else {
3275 // Build a map for gathered scalars to the nodes where they are used.
3276 bool AllConstsOrCasts = true;
3277 for (Value *V : VL)
3278 if (!isConstant(V)) {
3279 auto *I = dyn_cast<CastInst>(V);
3280 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3281 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3282 }
3283 if (AllConstsOrCasts)
3284 CastMaxMinBWSizes =
3285 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3286 MustGather.insert(VL.begin(), VL.end());
3287 }
3288
3289 if (UserTreeIdx.UserTE) {
3290 Last->UserTreeIndices.push_back(UserTreeIdx);
3291 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3292 "Reordering isn't implemented for non-power-of-2 nodes yet");
3293 }
3294 return Last;
3295 }
3296
3297 /// -- Vectorization State --
3298 /// Holds all of the tree entries.
3299 TreeEntry::VecTreeTy VectorizableTree;
3300
3301#ifndef NDEBUG
3302 /// Debug printer.
3303 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3304 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3305 VectorizableTree[Id]->dump();
3306 dbgs() << "\n";
3307 }
3308 }
3309#endif
3310
3311 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3312
3313 const TreeEntry *getTreeEntry(Value *V) const {
3314 return ScalarToTreeEntry.lookup(V);
3315 }
3316
3317 /// Check that the operand node of an alternate node does not generate a
3318 /// buildvector sequence. If it does, it is probably not worth building an
3319 /// alternate shuffle, if the number of buildvector operands plus the
3320 /// alternate instruction exceeds the number of buildvector instructions.
3321 /// \param S the instructions state of the analyzed values.
3322 /// \param VL list of the instructions with alternate opcodes.
3323 bool areAltOperandsProfitable(const InstructionsState &S,
3324 ArrayRef<Value *> VL) const;
3325
3326 /// Checks if the specified list of the instructions/values can be vectorized
3327 /// and fills required data before actual scheduling of the instructions.
3328 TreeEntry::EntryState getScalarsVectorizationState(
3329 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3330 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3331
3332 /// Maps a specific scalar to its tree entry.
3333 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3334
3335 /// List of scalars, used in several vectorize nodes, and the list of the
3336 /// nodes.
3337 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3338
3339 /// Maps a value to the proposed vectorizable size.
3340 SmallDenseMap<Value *, unsigned> InstrElementSize;
3341
3342 /// A list of scalars that we found that we need to keep as scalars.
3343 ValueSet MustGather;
3344
3345 /// A set of first non-schedulable values.
3346 ValueSet NonScheduledFirst;
3347
3348 /// A map between the vectorized entries and the last instructions in the
3349 /// bundles. The bundles are built in use order, not in the def order of the
3350 /// instructions. So, we cannot rely directly on the last instruction in the
3351 /// bundle being the last instruction in program order during the
3352 /// vectorization process, because the basic blocks are modified; the last
3353 /// instructions need to be pre-gathered beforehand.
3354 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3355
3356 /// List of gather nodes, depending on other gather/vector nodes, which should
3357 /// be emitted after the vector instruction emission process to correctly
3358 /// handle order of the vector instructions and shuffles.
3359 SetVector<const TreeEntry *> PostponedGathers;
3360
3361 using ValueToGatherNodesMap =
3362 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3363 ValueToGatherNodesMap ValueToGatherNodes;
3364
3365 /// This POD struct describes one external user in the vectorized tree.
3366 struct ExternalUser {
3367 ExternalUser(Value *S, llvm::User *U, int L)
3368 : Scalar(S), User(U), Lane(L) {}
3369
3370 // Which scalar in our function.
3371 Value *Scalar;
3372
3373 // Which user that uses the scalar.
3374 llvm::User *User;
3375
3376 // Which lane does the scalar belong to.
3377 int Lane;
3378 };
3379 using UserList = SmallVector<ExternalUser, 16>;
3380
3381 /// Checks if two instructions may access the same memory.
3382 ///
3383 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3384 /// is invariant in the calling loop.
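/// The result is cached symmetrically for (Inst1, Inst2) and (Inst2, Inst1),
/// so the potentially expensive alias query runs at most once per unordered
/// pair.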
3385 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3386 Instruction *Inst2) {
3387 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3388 return true;
3389 // First check if the result is already in the cache.
3390 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3391 auto It = AliasCache.find(Key);
3392 if (It != AliasCache.end())
3393 return It->second;
3394 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3395 // Store the result in the cache.
3396 AliasCache.try_emplace(Key, Aliased);
3397 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3398 return Aliased;
3399 }
3400
3401 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3402
3403 /// Cache for alias results.
3404 /// TODO: consider moving this to the AliasAnalysis itself.
3405 DenseMap<AliasCacheKey, bool> AliasCache;
3406
3407 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3408 // globally through SLP because we don't perform any action which
3409 // invalidates capture results.
3410 BatchAAResults BatchAA;
3411
3412 /// Temporary store for deleted instructions. Instructions will be deleted
3413 /// eventually when the BoUpSLP is destructed. The deferral is required to
3414 /// ensure that there are no incorrect collisions in the AliasCache, which
3415 /// can happen if a new instruction is allocated at the same address as a
3416 /// previously deleted instruction.
3417 DenseSet<Instruction *> DeletedInstructions;
3418
3419 /// Set of the instruction, being analyzed already for reductions.
3420 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3421
3422 /// Set of hashes for the list of reduction values already being analyzed.
3423 DenseSet<size_t> AnalyzedReductionVals;
3424
3425 /// Values that have already been analyzed for minimal bitwidth and found
3426 /// to be non-profitable.
3427 DenseSet<Value *> AnalyzedMinBWVals;
3428
3429 /// A list of values that need to extracted out of the tree.
3430 /// This list holds pairs of (Internal Scalar : External User). External User
3431 /// can be nullptr, it means that this Internal Scalar will be used later,
3432 /// after vectorization.
3433 UserList ExternalUses;
3434
3435 /// A list of GEPs which can be replaced by scalar GEPs instead of
3436 /// extractelement instructions.
3437 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3438
3439 /// Values used only by @llvm.assume calls.
3440 SmallPtrSet<const Value *, 32> EphValues;
3441
3442 /// Holds all of the instructions that we gathered, shuffle instructions and
3443 /// extractelements.
3444 SetVector<Instruction *> GatherShuffleExtractSeq;
3445
3446 /// A list of blocks that we are going to CSE.
3447 DenseSet<BasicBlock *> CSEBlocks;
3448
3449 /// Contains all scheduling relevant data for an instruction.
3450 /// A ScheduleData either represents a single instruction or a member of an
3451 /// instruction bundle (= a group of instructions which is combined into a
3452 /// vector instruction).
3453 struct ScheduleData {
3454 // The initial value for the dependency counters. It means that the
3455 // dependencies are not calculated yet.
3456 enum { InvalidDeps = -1 };
3457
3458 ScheduleData() = default;
3459
3460 void init(int BlockSchedulingRegionID, Value *OpVal) {
3461 FirstInBundle = this;
3462 NextInBundle = nullptr;
3463 NextLoadStore = nullptr;
3464 IsScheduled = false;
3465 SchedulingRegionID = BlockSchedulingRegionID;
3466 clearDependencies();
3467 OpValue = OpVal;
3468 TE = nullptr;
3469 }
3470
3471 /// Verify basic self consistency properties
3472 void verify() {
3473 if (hasValidDependencies()) {
3474 assert(UnscheduledDeps <= Dependencies && "invariant");
3475 } else {
3476 assert(UnscheduledDeps == Dependencies && "invariant");
3477 }
3478
3479 if (IsScheduled) {
3480 assert(isSchedulingEntity() &&
3481 "unexpected scheduled state");
3482 for (const ScheduleData *BundleMember = this; BundleMember;
3483 BundleMember = BundleMember->NextInBundle) {
3484 assert(BundleMember->hasValidDependencies() &&
3485 BundleMember->UnscheduledDeps == 0 &&
3486 "unexpected scheduled state");
3487 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3488 "only bundle is marked scheduled");
3489 }
3490 }
3491
3492 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3493 "all bundle members must be in same basic block");
3494 }
3495
3496 /// Returns true if the dependency information has been calculated.
3497 /// Note that dependency validity can vary between instructions within
3498 /// a single bundle.
3499 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3500
3501 /// Returns true for single instructions and for bundle representatives
3502 /// (= the head of a bundle).
3503 bool isSchedulingEntity() const { return FirstInBundle == this; }
3504
3505 /// Returns true if it represents an instruction bundle and not only a
3506 /// single instruction.
3507 bool isPartOfBundle() const {
3508 return NextInBundle != nullptr || FirstInBundle != this || TE;
3509 }
3510
3511 /// Returns true if it is ready for scheduling, i.e. it has no more
3512 /// unscheduled depending instructions/bundles.
3513 bool isReady() const {
3514 assert(isSchedulingEntity() &&
3515 "can't consider non-scheduling entity for ready list");
3516 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3517 }
3518
3519 /// Modifies the number of unscheduled dependencies for this instruction,
3520 /// and returns the number of remaining dependencies for the containing
3521 /// bundle.
3522 int incrementUnscheduledDeps(int Incr) {
3523 assert(hasValidDependencies() &&
3524 "increment of unscheduled deps would be meaningless");
3525 UnscheduledDeps += Incr;
3526 return FirstInBundle->unscheduledDepsInBundle();
3527 }
3528
3529 /// Sets the number of unscheduled dependencies to the number of
3530 /// dependencies.
3531 void resetUnscheduledDeps() {
3532 UnscheduledDeps = Dependencies;
3533 }
3534
3535 /// Clears all dependency information.
3536 void clearDependencies() {
3537 Dependencies = InvalidDeps;
3538 resetUnscheduledDeps();
3539 MemoryDependencies.clear();
3540 ControlDependencies.clear();
3541 }
3542
3543 int unscheduledDepsInBundle() const {
3544 assert(isSchedulingEntity() && "only meaningful on the bundle");
3545 int Sum = 0;
3546 for (const ScheduleData *BundleMember = this; BundleMember;
3547 BundleMember = BundleMember->NextInBundle) {
3548 if (BundleMember->UnscheduledDeps == InvalidDeps)
3549 return InvalidDeps;
3550 Sum += BundleMember->UnscheduledDeps;
3551 }
3552 return Sum;
3553 }
3554
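/// Debug printer: a plain instruction is printed as-is, a bundle head as
/// "[I0;I1;...]", and a non-head bundle member is prefixed with "/ ".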
3555 void dump(raw_ostream &os) const {
3556 if (!isSchedulingEntity()) {
3557 os << "/ " << *Inst;
3558 } else if (NextInBundle) {
3559 os << '[' << *Inst;
3560 ScheduleData *SD = NextInBundle;
3561 while (SD) {
3562 os << ';' << *SD->Inst;
3563 SD = SD->NextInBundle;
3564 }
3565 os << ']';
3566 } else {
3567 os << *Inst;
3568 }
3569 }
3570
3571 Instruction *Inst = nullptr;
3572
3573 /// Opcode of the current instruction in the schedule data.
3574 Value *OpValue = nullptr;
3575
3576 /// The TreeEntry that this instruction corresponds to.
3577 TreeEntry *TE = nullptr;
3578
3579 /// Points to the head in an instruction bundle (and always to this for
3580 /// single instructions).
3581 ScheduleData *FirstInBundle = nullptr;
3582
3583 /// Single linked list of all instructions in a bundle. Null if it is a
3584 /// single instruction.
3585 ScheduleData *NextInBundle = nullptr;
3586
3587 /// Single linked list of all memory instructions (e.g. load, store, call)
3588 /// in the block - until the end of the scheduling region.
3589 ScheduleData *NextLoadStore = nullptr;
3590
3591 /// The dependent memory instructions.
3592 /// This list is derived on demand in calculateDependencies().
3593 SmallVector<ScheduleData *, 4> MemoryDependencies;
3594
3595 /// List of instructions which this instruction could be control dependent
3596 /// on. Allowing such nodes to be scheduled below this one could introduce
3597 /// a runtime fault which didn't exist in the original program.
3598 /// ex: this is a load or udiv following a readonly call which inf loops
3599 SmallVector<ScheduleData *, 4> ControlDependencies;
3600
3601 /// This ScheduleData is in the current scheduling region if this matches
3602 /// the current SchedulingRegionID of BlockScheduling.
3603 int SchedulingRegionID = 0;
3604
3605 /// Used for getting a "good" final ordering of instructions.
3606 int SchedulingPriority = 0;
3607
3608 /// The number of dependencies. It consists of the number of users of the
3609 /// instruction plus the number of dependent memory instructions (if any).
3610 /// This value is calculated on demand.
3611 /// If InvalidDeps, the number of dependencies is not calculated yet.
3612 int Dependencies = InvalidDeps;
3613
3614 /// The number of dependencies minus the number of dependencies of scheduled
3615 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3616 /// for scheduling.
3617 /// Note that this is negative as long as Dependencies is not calculated.
3618 int UnscheduledDeps = InvalidDeps;
3619
3620 /// True if this instruction is scheduled (or considered as scheduled in the
3621 /// dry-run).
3622 bool IsScheduled = false;
3623 };
3624
3625#ifndef NDEBUG
3626 friend inline raw_ostream &operator<<(raw_ostream &os,
3627 const BoUpSLP::ScheduleData &SD) {
3628 SD.dump(os);
3629 return os;
3630 }
3631#endif
3632
3633 friend struct GraphTraits<BoUpSLP *>;
3634 friend struct DOTGraphTraits<BoUpSLP *>;
3635
3636 /// Contains all scheduling data for a basic block.
3637 /// It does not schedule instructions that are not memory read/write
3638 /// instructions and whose operands are constants, arguments, phis, or
3639 /// instructions from other blocks, or whose users are phis or belong to
3640 /// other blocks. The resulting vector instructions can be placed at the
3641 /// beginning of the basic block without scheduling (if their operands do
3642 /// not need to be scheduled) or at the end of the block (if their users are
3643 /// outside of the block). This saves some compile time and memory used by
3644 /// the compiler.
3645 /// ScheduleData is assigned to each instruction between the boundaries of
3646 /// the tree entry, even to those that are not part of the graph. This is
3647 /// required to correctly follow the dependencies between the instructions
3648 /// and to schedule them correctly. ScheduleData is not allocated for
3649 /// instructions that do not require scheduling, like phis, nodes containing
3650 /// only extractelements/insertelements, or nodes whose instructions have
3651 /// uses/operands outside of the block.
3652 struct BlockScheduling {
3653 BlockScheduling(BasicBlock *BB)
3654 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3655
3656 void clear() {
3657 ReadyInsts.clear();
3658 ScheduleStart = nullptr;
3659 ScheduleEnd = nullptr;
3660 FirstLoadStoreInRegion = nullptr;
3661 LastLoadStoreInRegion = nullptr;
3662 RegionHasStackSave = false;
3663
3664 // Reduce the maximum schedule region size by the size of the
3665 // previous scheduling run.
3666 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3667 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3668 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3669 ScheduleRegionSize = 0;
3670
3671 // Make a new scheduling region, i.e. all existing ScheduleData is not
3672 // in the new region yet.
3673 ++SchedulingRegionID;
3674 }
3675
3676 ScheduleData *getScheduleData(Instruction *I) {
3677 if (BB != I->getParent())
3678 // Avoid lookup if can't possibly be in map.
3679 return nullptr;
3680 ScheduleData *SD = ScheduleDataMap.lookup(I);
3681 if (SD && isInSchedulingRegion(SD))
3682 return SD;
3683 return nullptr;
3684 }
3685
3686 ScheduleData *getScheduleData(Value *V) {
3687 if (auto *I = dyn_cast<Instruction>(V))
3688 return getScheduleData(I);
3689 return nullptr;
3690 }
3691
3692 ScheduleData *getScheduleData(Value *V, Value *Key) {
3693 if (V == Key)
3694 return getScheduleData(V);
3695 auto I = ExtraScheduleDataMap.find(V);
3696 if (I != ExtraScheduleDataMap.end()) {
3697 ScheduleData *SD = I->second.lookup(Key);
3698 if (SD && isInSchedulingRegion(SD))
3699 return SD;
3700 }
3701 return nullptr;
3702 }
3703
3704 bool isInSchedulingRegion(ScheduleData *SD) const {
3705 return SD->SchedulingRegionID == SchedulingRegionID;
3706 }
3707
3708 /// Marks an instruction as scheduled and puts all dependent ready
3709 /// instructions into the ready-list.
3710 template <typename ReadyListType>
3711 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3712 SD->IsScheduled = true;
3713 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3714
3715 for (ScheduleData *BundleMember = SD; BundleMember;
3716 BundleMember = BundleMember->NextInBundle) {
3717 if (BundleMember->Inst != BundleMember->OpValue)
3718 continue;
3719
3720 // Handle the def-use chain dependencies.
3721
3722 // Decrement the unscheduled counter and insert to ready list if ready.
3723 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3724 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3725 if (OpDef && OpDef->hasValidDependencies() &&
3726 OpDef->incrementUnscheduledDeps(-1) == 0) {
3727 // There are no more unscheduled dependencies after
3728 // decrementing, so we can put the dependent instruction
3729 // into the ready list.
3730 ScheduleData *DepBundle = OpDef->FirstInBundle;
3731 assert(!DepBundle->IsScheduled &&
3732 "already scheduled bundle gets ready");
3733 ReadyList.insert(DepBundle);
3734 LLVM_DEBUG(dbgs()
3735 << "SLP: gets ready (def): " << *DepBundle << "\n");
3736 }
3737 });
3738 };
3739
3740 // If BundleMember is a vector bundle, its operands may have been
3741 // reordered during buildTree(). We therefore need to get its operands
3742 // through the TreeEntry.
3743 if (TreeEntry *TE = BundleMember->TE) {
3744 // Need to search for the lane since the tree entry can be reordered.
3745 int Lane = std::distance(TE->Scalars.begin(),
3746 find(TE->Scalars, BundleMember->Inst));
3747 assert(Lane >= 0 && "Lane not set");
3748
3749 // Since vectorization tree is being built recursively this assertion
3750 // ensures that the tree entry has all operands set before reaching
3751 // this code. Couple of exceptions known at the moment are extracts
3752 // where their second (immediate) operand is not added. Since
3753 // immediates do not affect scheduler behavior this is considered
3754 // okay.
3755 auto *In = BundleMember->Inst;
3756 assert(
3757 In &&
3758 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3759 In->getNumOperands() == TE->getNumOperands()) &&
3760 "Missed TreeEntry operands?");
3761 (void)In; // fake use to avoid build failure when assertions disabled
3762
3763 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3764 OpIdx != NumOperands; ++OpIdx)
3765 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3766 DecrUnsched(I);
3767 } else {
3768 // If BundleMember is a stand-alone instruction, no operand reordering
3769 // has taken place, so we directly access its operands.
3770 for (Use &U : BundleMember->Inst->operands())
3771 if (auto *I = dyn_cast<Instruction>(U.get()))
3772 DecrUnsched(I);
3773 }
3774 // Handle the memory dependencies.
3775 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3776 if (MemoryDepSD->hasValidDependencies() &&
3777 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3778 // There are no more unscheduled dependencies after decrementing,
3779 // so we can put the dependent instruction into the ready list.
3780 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3781 assert(!DepBundle->IsScheduled &&
3782 "already scheduled bundle gets ready");
3783 ReadyList.insert(DepBundle);
3784 LLVM_DEBUG(dbgs()
3785 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3786 }
3787 }
3788 // Handle the control dependencies.
3789 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3790 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3791 // There are no more unscheduled dependencies after decrementing,
3792 // so we can put the dependent instruction into the ready list.
3793 ScheduleData *DepBundle = DepSD->FirstInBundle;
3794 assert(!DepBundle->IsScheduled &&
3795 "already scheduled bundle gets ready");
3796 ReadyList.insert(DepBundle);
3797 LLVM_DEBUG(dbgs()
3798 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3799 }
3800 }
3801 }
3802 }
3803
3804 /// Verify basic self consistency properties of the data structure.
3805 void verify() {
3806 if (!ScheduleStart)
3807 return;
3808
3809 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3810 ScheduleStart->comesBefore(ScheduleEnd) &&
3811 "Not a valid scheduling region?");
3812
3813 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3814 auto *SD = getScheduleData(I);
3815 if (!SD)
3816 continue;
3817 assert(isInSchedulingRegion(SD) &&
3818 "primary schedule data not in window?");
3819 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3820 "entire bundle in window!");
3821 (void)SD;
3822 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3823 }
3824
3825 for (auto *SD : ReadyInsts) {
3826 assert(SD->isSchedulingEntity() && SD->isReady() &&
3827 "item in ready list not ready?");
3828 (void)SD;
3829 }
3830 }
3831
3832 void doForAllOpcodes(Value *V,
3833 function_ref<void(ScheduleData *SD)> Action) {
3834 if (ScheduleData *SD = getScheduleData(V))
3835 Action(SD);
3836 auto I = ExtraScheduleDataMap.find(V);
3837 if (I != ExtraScheduleDataMap.end())
3838 for (auto &P : I->second)
3839 if (isInSchedulingRegion(P.second))
3840 Action(P.second);
3841 }
3842
3843 /// Put all instructions into the ReadyList which are ready for scheduling.
3844 template <typename ReadyListType>
3845 void initialFillReadyList(ReadyListType &ReadyList) {
3846 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3847 doForAllOpcodes(I, [&](ScheduleData *SD) {
3848 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3849 SD->isReady()) {
3850 ReadyList.insert(SD);
3851 LLVM_DEBUG(dbgs()
3852 << "SLP: initially in ready list: " << *SD << "\n");
3853 }
3854 });
3855 }
3856 }
3857
3858 /// Build a bundle from the ScheduleData nodes corresponding to the
3859 /// scalar instruction for each lane.
3860 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3861
3862 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3863 /// cyclic dependencies. This is only a dry-run, no instructions are
3864 /// actually moved at this stage.
3865 /// \returns the scheduling bundle. The returned Optional value is not
3866 /// std::nullopt if \p VL is allowed to be scheduled.
3867 std::optional<ScheduleData *>
3868 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3869 const InstructionsState &S);
3870
3871 /// Un-bundles a group of instructions.
3872 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3873
3874 /// Allocates schedule data chunk.
3875 ScheduleData *allocateScheduleDataChunks();
3876
3877 /// Extends the scheduling region so that V is inside the region.
3878 /// \returns true if the region size is within the limit.
3879 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3880
3881 /// Initialize the ScheduleData structures for new instructions in the
3882 /// scheduling region.
3883 void initScheduleData(Instruction *FromI, Instruction *ToI,
3884 ScheduleData *PrevLoadStore,
3885 ScheduleData *NextLoadStore);
3886
3887 /// Updates the dependency information of a bundle and of all instructions/
3888 /// bundles which depend on the original bundle.
3889 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3890 BoUpSLP *SLP);
3891
3892 /// Sets all instructions in the scheduling region to un-scheduled.
3893 void resetSchedule();
3894
3895 BasicBlock *BB;
3896
3897 /// Simple memory allocation for ScheduleData.
3898 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3899
3900 /// The size of a ScheduleData array in ScheduleDataChunks.
3901 int ChunkSize;
3902
3903 /// The allocator position in the current chunk, which is the last entry
3904 /// of ScheduleDataChunks.
3905 int ChunkPos;
3906
3907 /// Attaches ScheduleData to Instruction.
3908 /// Note that the mapping survives during all vectorization iterations, i.e.
3909 /// ScheduleData structures are recycled.
3910 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3911
3912 /// Attaches ScheduleData to Instruction with the leading key.
3913 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3914 ExtraScheduleDataMap;
3915
3916 /// The ready-list for scheduling (only used for the dry-run).
3917 SetVector<ScheduleData *> ReadyInsts;
3918
3919 /// The first instruction of the scheduling region.
3920 Instruction *ScheduleStart = nullptr;
3921
3922 /// The first instruction _after_ the scheduling region.
3923 Instruction *ScheduleEnd = nullptr;
3924
3925 /// The first memory accessing instruction in the scheduling region
3926 /// (can be null).
3927 ScheduleData *FirstLoadStoreInRegion = nullptr;
3928
3929 /// The last memory accessing instruction in the scheduling region
3930 /// (can be null).
3931 ScheduleData *LastLoadStoreInRegion = nullptr;
3932
3933 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3934 /// region? Used to optimize the dependence calculation for the
3935 /// common case where there isn't.
3936 bool RegionHasStackSave = false;
3937
3938 /// The current size of the scheduling region.
3939 int ScheduleRegionSize = 0;
3940
3941 /// The maximum size allowed for the scheduling region.
3942 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3943
3944 /// The ID of the scheduling region. For a new vectorization iteration this
3945 /// is incremented which "removes" all ScheduleData from the region.
3946 /// Make sure that the initial SchedulingRegionID is greater than the
3947 /// initial SchedulingRegionID in ScheduleData (which is 0).
3948 int SchedulingRegionID = 1;
3949 };
3950
3951 /// Attaches the BlockScheduling structures to basic blocks.
3952 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3953
3954 /// Performs the "real" scheduling. Done before vectorization is actually
3955 /// performed in a basic block.
3956 void scheduleBlock(BlockScheduling *BS);
3957
3958 /// List of users to ignore during scheduling and that don't need extracting.
3959 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3960
3961 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3962 /// sorted SmallVectors of unsigned.
3963 struct OrdersTypeDenseMapInfo {
3964 static OrdersType getEmptyKey() {
3965 OrdersType V;
3966 V.push_back(~1U);
3967 return V;
3968 }
3969
3970 static OrdersType getTombstoneKey() {
3971 OrdersType V;
3972 V.push_back(~2U);
3973 return V;
3974 }
3975
3976 static unsigned getHashValue(const OrdersType &V) {
3977 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3978 }
3979
3980 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3981 return LHS == RHS;
3982 }
3983 };
3984
3985 // Analysis and block reference.
3986 Function *F;
3987 ScalarEvolution *SE;
3988 TargetTransformInfo *TTI;
3989 TargetLibraryInfo *TLI;
3990 LoopInfo *LI;
3991 DominatorTree *DT;
3992 AssumptionCache *AC;
3993 DemandedBits *DB;
3994 const DataLayout *DL;
3995 OptimizationRemarkEmitter *ORE;
3996
3997 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3998 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3999
4000 /// Instruction builder to construct the vectorized tree.
4001 IRBuilder<TargetFolder> Builder;
4002
4003 /// A map of scalar integer values to the smallest bit width with which they
4004 /// can legally be represented. The values map to (width, signed) pairs,
4005 /// where "width" indicates the minimum bit width and "signed" is True if the
4006 /// value must be signed-extended, rather than zero-extended, back to its
4007 /// original width.
4008 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
4009
4010 /// Final size of the reduced vector, if the current graph represents the
4011 /// input for the reduction and it was possible to narrow the size of the
4012 /// reduction.
4013 unsigned ReductionBitWidth = 0;
4014
4015 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4016 /// type sizes, used in the tree.
4017 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4018
4019 /// Indices of the vectorized nodes, which supposed to be the roots of the new
4020 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4021 DenseSet<unsigned> ExtraBitWidthNodes;
4022};
4023
4024} // end namespace slpvectorizer
4025
4026template <> struct GraphTraits<BoUpSLP *> {
4027 using TreeEntry = BoUpSLP::TreeEntry;
4028
4029 /// NodeRef has to be a pointer per the GraphWriter.
4030 using NodeRef = TreeEntry *;
4031
4032 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4033
4034 /// Add the VectorizableTree to the index iterator to be able to return
4035 /// TreeEntry pointers.
4036 struct ChildIteratorType
4037 : public iterator_adaptor_base<
4038 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4039 ContainerTy &VectorizableTree;
4040
4041 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4042 ContainerTy &VT)
4043 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4044
4045 NodeRef operator*() { return I->UserTE; }
4046 };
4047
4048 static NodeRef getEntryNode(BoUpSLP &R) {
4049 return R.VectorizableTree[0].get();
4050 }
4051
4052 static ChildIteratorType child_begin(NodeRef N) {
4053 return {N->UserTreeIndices.begin(), N->Container};
4054 }
4055
4056 static ChildIteratorType child_end(NodeRef N) {
4057 return {N->UserTreeIndices.end(), N->Container};
4058 }
4059
4060 /// For the node iterator we just need to turn the TreeEntry iterator into a
4061 /// TreeEntry* iterator so that it dereferences to NodeRef.
4062 class nodes_iterator {
4063 using ItTy = ContainerTy::iterator;
4064 ItTy It;
4065
4066 public:
4067 nodes_iterator(const ItTy &It2) : It(It2) {}
4068 NodeRef operator*() { return It->get(); }
4069 nodes_iterator operator++() {
4070 ++It;
4071 return *this;
4072 }
4073 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4074 };
4075
4076 static nodes_iterator nodes_begin(BoUpSLP *R) {
4077 return nodes_iterator(R->VectorizableTree.begin());
4078 }
4079
4080 static nodes_iterator nodes_end(BoUpSLP *R) {
4081 return nodes_iterator(R->VectorizableTree.end());
4082 }
4083
4084 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4085};
4086
4087template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4088 using TreeEntry = BoUpSLP::TreeEntry;
4089
4090 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4091
4092 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4093 std::string Str;
4094 raw_string_ostream OS(Str);
4095 OS << Entry->Idx << ".\n";
4096 if (isSplat(Entry->Scalars))
4097 OS << "<splat> ";
4098 for (auto *V : Entry->Scalars) {
4099 OS << *V;
4100 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4101 return EU.Scalar == V;
4102 }))
4103 OS << " <extract>";
4104 OS << "\n";
4105 }
4106 return Str;
4107 }
4108
4109 static std::string getNodeAttributes(const TreeEntry *Entry,
4110 const BoUpSLP *) {
4111 if (Entry->isGather())
4112 return "color=red";
4113 if (Entry->State == TreeEntry::ScatterVectorize ||
4114 Entry->State == TreeEntry::StridedVectorize)
4115 return "color=blue";
4116 return "";
4117 }
4118};
4119
4120} // end namespace llvm
4121
4122BoUpSLP::~BoUpSLP() {
4123 SmallVector<WeakTrackingVH> DeadInsts;
4124 for (auto *I : DeletedInstructions) {
4125 if (!I->getParent()) {
4126 // Temporarily insert instructions back so they can be erased from their
4127 // parent and from memory later.
4128 if (isa<PHINode>(I))
4129 // Phi nodes must be the very first instructions in the block.
4130 I->insertBefore(F->getEntryBlock(),
4131 F->getEntryBlock().getFirstNonPHIIt());
4132 else
4133 I->insertBefore(F->getEntryBlock().getTerminator());
4134 continue;
4135 }
4136 for (Use &U : I->operands()) {
4137 auto *Op = dyn_cast<Instruction>(U.get());
4138 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4139 wouldInstructionBeTriviallyDead(Op, TLI))
4140 DeadInsts.emplace_back(Op);
4141 }
4142 I->dropAllReferences();
4143 }
4144 for (auto *I : DeletedInstructions) {
4145 assert(I->use_empty() &&
4146 "trying to erase instruction with users.");
4147 I->eraseFromParent();
4148 }
4149
4150 // Cleanup any dead scalar code feeding the vectorized instructions
4151 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
4152
4153#ifdef EXPENSIVE_CHECKS
4154 // If we could guarantee that this call is not extremely slow, we could
4155 // remove the ifdef limitation (see PR47712).
4156 assert(!verifyFunction(*F, &dbgs()));
4157#endif
4158}
4159
4160/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4161/// contains the original mask for the scalars reused in the node. The
4162/// procedure transforms this mask in accordance with the given \p Mask.
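/// For example (illustrative): Reuses = {3, 2, 1, 0} with Mask = {1, 0, 3, 2}
/// yields Reuses = {2, 3, 0, 1}; each old entry Prev[I] moves to position
/// Mask[I].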
4163static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4164 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4165 "Expected non-empty mask.");
4166 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4167 Prev.swap(Reuses);
4168 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4169 if (Mask[I] != PoisonMaskElem)
4170 Reuses[Mask[I]] = Prev[I];
4171}
4172
4173/// Reorders the given \p Order according to the given \p Mask. \p Order is
4174/// the original order of the scalars. The procedure transforms the provided
4175/// order in accordance with the given \p Mask. If the resulting \p Order is an
4176/// identity order, \p Order is cleared.
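/// For example (illustrative), reordering Order = {1, 0} by Mask = {1, 0}
/// produces the identity order, so \p Order is cleared.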
4177static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4178 bool BottomOrder = false) {
4179 assert(!Mask.empty() && "Expected non-empty mask.");
4180 unsigned Sz = Mask.size();
4181 if (BottomOrder) {
4182 SmallVector<unsigned> PrevOrder;
4183 if (Order.empty()) {
4184 PrevOrder.resize(N: Sz);
4185 std::iota(first: PrevOrder.begin(), last: PrevOrder.end(), value: 0);
4186 } else {
4187 PrevOrder.swap(RHS&: Order);
4188 }
4189 Order.assign(NumElts: Sz, Elt: Sz);
4190 for (unsigned I = 0; I < Sz; ++I)
4191 if (Mask[I] != PoisonMaskElem)
4192 Order[I] = PrevOrder[Mask[I]];
4193 if (all_of(Range: enumerate(First&: Order), P: [&](const auto &Data) {
4194 return Data.value() == Sz || Data.index() == Data.value();
4195 })) {
4196 Order.clear();
4197 return;
4198 }
4199 fixupOrderingIndices(Order);
4200 return;
4201 }
4202 SmallVector<int> MaskOrder;
4203 if (Order.empty()) {
4204 MaskOrder.resize(N: Sz);
4205 std::iota(first: MaskOrder.begin(), last: MaskOrder.end(), value: 0);
4206 } else {
4207 inversePermutation(Indices: Order, Mask&: MaskOrder);
4208 }
4209 reorderReuses(Reuses&: MaskOrder, Mask);
4210 if (ShuffleVectorInst::isIdentityMask(Mask: MaskOrder, NumSrcElts: Sz)) {
4211 Order.clear();
4212 return;
4213 }
4214 Order.assign(NumElts: Sz, Elt: Sz);
4215 for (unsigned I = 0; I < Sz; ++I)
4216 if (MaskOrder[I] != PoisonMaskElem)
4217 Order[MaskOrder[I]] = I;
4218 fixupOrderingIndices(Order);
4219}
4220
4221std::optional<BoUpSLP::OrdersType>
4222BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4223 assert(TE.isGather() && "Expected gather node only.");
4224 // Try to find subvector extract/insert patterns and reorder only such
4225 // patterns.
4226 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4227 Type *ScalarTy = GatheredScalars.front()->getType();
4228 int NumScalars = GatheredScalars.size();
4229 if (!isValidElementType(Ty: ScalarTy))
4230 return std::nullopt;
4231 auto *VecTy = getWidenedType(ScalarTy, VF: NumScalars);
4232 int NumParts = TTI->getNumberOfParts(Tp: VecTy);
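// If the target reports no meaningful split (or more parts than scalars),
// process the whole gather as a single part.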
4233 if (NumParts == 0 || NumParts >= NumScalars)
4234 NumParts = 1;
4235 SmallVector<int> ExtractMask;
4236 SmallVector<int> Mask;
4237 SmallVector<SmallVector<const TreeEntry *>> Entries;
4238 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
4239 tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
4240 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
4241 isGatherShuffledEntry(TE: &TE, VL: GatheredScalars, Mask, Entries, NumParts,
4242 /*ForOrder=*/true);
4243 // No shuffled operands - ignore.
4244 if (GatherShuffles.empty() && ExtractShuffles.empty())
4245 return std::nullopt;
4246 OrdersType CurrentOrder(NumScalars, NumScalars);
4247 if (GatherShuffles.size() == 1 &&
4248 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4249 Entries.front().front()->isSame(VL: TE.Scalars)) {
4250 // Perfect match in the graph, will reuse the previously vectorized
4251 // node. Cost is 0.
4252 std::iota(first: CurrentOrder.begin(), last: CurrentOrder.end(), value: 0);
4253 return CurrentOrder;
4254 }
4255 auto IsSplatMask = [](ArrayRef<int> Mask) {
4256 int SingleElt = PoisonMaskElem;
4257 return all_of(Range&: Mask, P: [&](int I) {
4258 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4259 SingleElt = I;
4260 return I == PoisonMaskElem || I == SingleElt;
4261 });
4262 };
4263 // Exclusive broadcast mask - ignore.
4264 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4265 (Entries.size() != 1 ||
4266 Entries.front().front()->ReorderIndices.empty())) ||
4267 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4268 return std::nullopt;
4269 SmallBitVector ShuffledSubMasks(NumParts);
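// Converts the per-part shuffle mask into an order of the scalars. Parts that
// would require shuffling two or more vectors are marked in ShuffledSubMasks
// and excluded from the resulting order.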
4270 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4271 ArrayRef<int> Mask, int PartSz, int NumParts,
4272 function_ref<unsigned(unsigned)> GetVF) {
4273 for (int I : seq<int>(Begin: 0, End: NumParts)) {
4274 if (ShuffledSubMasks.test(Idx: I))
4275 continue;
4276 const int VF = GetVF(I);
4277 if (VF == 0)
4278 continue;
4279 unsigned Limit = getNumElems(Size: CurrentOrder.size(), PartNumElems: PartSz, Part: I);
4280 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(N: I * PartSz, M: Limit);
4281 // Shuffle of at least 2 vectors - ignore.
4282 if (any_of(Range&: Slice, P: [&](int I) { return I != NumScalars; })) {
4283 std::fill(Slice.begin(), Slice.end(), NumScalars);
4284 ShuffledSubMasks.set(I);
4285 continue;
4286 }
4287 // Try to include as many elements from the mask as possible.
4288 int FirstMin = INT_MAX;
4289 bool SecondVecFound = false;
4290 for (int K : seq<int>(Size: Limit)) {
4291 int Idx = Mask[I * PartSz + K];
4292 if (Idx == PoisonMaskElem) {
4293 Value *V = GatheredScalars[I * PartSz + K];
4294 if (isConstant(V) && !isa<PoisonValue>(Val: V)) {
4295 SecondVecFound = true;
4296 break;
4297 }
4298 continue;
4299 }
4300 if (Idx < VF) {
4301 if (FirstMin > Idx)
4302 FirstMin = Idx;
4303 } else {
4304 SecondVecFound = true;
4305 break;
4306 }
4307 }
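// Round the minimal used index down to the part boundary; the mask indices
// below are rebased relative to it and must stay within a single part.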
4308 FirstMin = (FirstMin / PartSz) * PartSz;
4309 // Shuffle of at least 2 vectors - ignore.
4310 if (SecondVecFound) {
4311 std::fill(Slice.begin(), Slice.end(), NumScalars);
4312 ShuffledSubMasks.set(I);
4313 continue;
4314 }
4315 for (int K : seq<int>(Size: Limit)) {
4316 int Idx = Mask[I * PartSz + K];
4317 if (Idx == PoisonMaskElem)
4318 continue;
4319 Idx -= FirstMin;
4320 if (Idx >= PartSz) {
4321 SecondVecFound = true;
4322 break;
4323 }
4324 if (CurrentOrder[I * PartSz + Idx] >
4325 static_cast<unsigned>(I * PartSz + K) &&
4326 CurrentOrder[I * PartSz + Idx] !=
4327 static_cast<unsigned>(I * PartSz + Idx))
4328 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4329 }
4330 // Shuffle of at least 2 vectors - ignore.
4331 if (SecondVecFound) {
4332 std::fill(Slice.begin(), Slice.end(), NumScalars);
4333 ShuffledSubMasks.set(I);
4334 continue;
4335 }
4336 }
4337 };
4338 int PartSz = getPartNumElems(Size: NumScalars, NumParts);
4339 if (!ExtractShuffles.empty())
4340 TransformMaskToOrder(
4341 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4342 if (!ExtractShuffles[I])
4343 return 0U;
4344 unsigned VF = 0;
4345 unsigned Sz = getNumElems(Size: TE.getVectorFactor(), PartNumElems: PartSz, Part: I);
4346 for (unsigned Idx : seq<unsigned>(Size: Sz)) {
4347 int K = I * PartSz + Idx;
4348 if (ExtractMask[K] == PoisonMaskElem)
4349 continue;
4350 if (!TE.ReuseShuffleIndices.empty())
4351 K = TE.ReuseShuffleIndices[K];
4352 if (!TE.ReorderIndices.empty())
4353 K = std::distance(first: TE.ReorderIndices.begin(),
4354 last: find(Range: TE.ReorderIndices, Val: K));
4355 auto *EI = dyn_cast<ExtractElementInst>(Val: TE.Scalars[K]);
4356 if (!EI)
4357 continue;
4358 VF = std::max(a: VF, b: cast<VectorType>(Val: EI->getVectorOperandType())
4359 ->getElementCount()
4360 .getKnownMinValue());
4361 }
4362 return VF;
4363 });
4364 // Check special corner case - single shuffle of the same entry.
4365 if (GatherShuffles.size() == 1 && NumParts != 1) {
4366 if (ShuffledSubMasks.any())
4367 return std::nullopt;
4368 PartSz = NumScalars;
4369 NumParts = 1;
4370 }
4371 if (!Entries.empty())
4372 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4373 if (!GatherShuffles[I])
4374 return 0U;
4375 return std::max(a: Entries[I].front()->getVectorFactor(),
4376 b: Entries[I].back()->getVectorFactor());
4377 });
4378 int NumUndefs =
4379 count_if(Range&: CurrentOrder, P: [&](int Idx) { return Idx == NumScalars; });
4380 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4381 return std::nullopt;
4382 return std::move(CurrentOrder);
4383}
4384
4385static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4386 const TargetLibraryInfo &TLI,
4387 bool CompareOpcodes = true) {
4388 if (getUnderlyingObject(V: Ptr1) != getUnderlyingObject(V: Ptr2))
4389 return false;
4390 auto *GEP1 = dyn_cast<GetElementPtrInst>(Val: Ptr1);
4391 if (!GEP1)
4392 return false;
4393 auto *GEP2 = dyn_cast<GetElementPtrInst>(Val: Ptr2);
4394 if (!GEP2)
4395 return false;
4396 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4397 ((isConstant(V: GEP1->getOperand(i_nocapture: 1)) &&
4398 isConstant(V: GEP2->getOperand(i_nocapture: 1))) ||
4399 !CompareOpcodes ||
4400 getSameOpcode(VL: {GEP1->getOperand(i_nocapture: 1), GEP2->getOperand(i_nocapture: 1)}, TLI)
4401 .getOpcode());
4402}
4403
4404 /// Calculates the minimal alignment of all values in \p VL as the common alignment.
4405template <typename T>
4406static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4407 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4408 for (Value *V : VL.drop_front())
4409 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4410 return CommonAlignment;
4411}
4412
4413/// Check if \p Order represents reverse order.
4414static bool isReverseOrder(ArrayRef<unsigned> Order) {
4415 unsigned Sz = Order.size();
4416 return !Order.empty() && all_of(Range: enumerate(First&: Order), P: [&](const auto &Pair) {
4417 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4418 });
4419}
4420
4421 /// Checks if the provided list of pointers \p PointerOps represents strided
4422 /// pointers for type \p ElemTy. If they do not, std::nullopt is returned.
4423 /// Otherwise, if \p Inst is not specified, an engaged (but null) optional value
4424 /// is returned to show that the pointers are strided. If \p Inst is specified,
4425 /// the runtime stride is materialized before the given \p Inst.
4426 /// \returns std::nullopt if the pointers do not have a runtime stride;
4427 /// otherwise nullptr or the actual stride value.
4428static std::optional<Value *>
4429calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4430 const DataLayout &DL, ScalarEvolution &SE,
4431 SmallVectorImpl<unsigned> &SortedIndices,
4432 Instruction *Inst = nullptr) {
4433 SmallVector<const SCEV *> SCEVs;
4434 const SCEV *PtrSCEVLowest = nullptr;
4435 const SCEV *PtrSCEVHighest = nullptr;
4436 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4437 // addresses).
4438 for (Value *Ptr : PointerOps) {
4439 const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
4440 if (!PtrSCEV)
4441 return std::nullopt;
4442 SCEVs.push_back(Elt: PtrSCEV);
4443 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4444 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4445 continue;
4446 }
4447 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
4448 if (isa<SCEVCouldNotCompute>(Val: Diff))
4449 return std::nullopt;
4450 if (Diff->isNonConstantNegative()) {
4451 PtrSCEVLowest = PtrSCEV;
4452 continue;
4453 }
4454 const SCEV *Diff1 = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEV);
4455 if (isa<SCEVCouldNotCompute>(Val: Diff1))
4456 return std::nullopt;
4457 if (Diff1->isNonConstantNegative()) {
4458 PtrSCEVHighest = PtrSCEV;
4459 continue;
4460 }
4461 }
4462 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4463 const SCEV *Dist = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEVLowest);
4464 if (isa<SCEVCouldNotCompute>(Val: Dist))
4465 return std::nullopt;
4466 int Size = DL.getTypeStoreSize(Ty: ElemTy);
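// Tries to express Dist as Multiplier * Stride and returns the Stride SCEV,
// falling back to an exact unsigned division when Dist is not a SCEVMulExpr.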
4467 auto TryGetStride = [&](const SCEV *Dist,
4468 const SCEV *Multiplier) -> const SCEV * {
4469 if (const auto *M = dyn_cast<SCEVMulExpr>(Val: Dist)) {
4470 if (M->getOperand(i: 0) == Multiplier)
4471 return M->getOperand(i: 1);
4472 if (M->getOperand(i: 1) == Multiplier)
4473 return M->getOperand(i: 0);
4474 return nullptr;
4475 }
4476 if (Multiplier == Dist)
4477 return SE.getConstant(Ty: Dist->getType(), V: 1);
4478 return SE.getUDivExactExpr(LHS: Dist, RHS: Multiplier);
4479 };
4480 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4481 const SCEV *Stride = nullptr;
4482 if (Size != 1 || SCEVs.size() > 2) {
4483 const SCEV *Sz = SE.getConstant(Ty: Dist->getType(), V: Size * (SCEVs.size() - 1));
4484 Stride = TryGetStride(Dist, Sz);
4485 if (!Stride)
4486 return std::nullopt;
4487 }
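// Only a runtime (non-constant) stride is interesting here; constant strides
// are recognized via constant pointer differences elsewhere.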
4488 if (!Stride || isa<SCEVConstant>(Val: Stride))
4489 return std::nullopt;
4490 // Iterate through all pointers and check if all distances are
4491 // unique multiples of Stride.
4492 using DistOrdPair = std::pair<int64_t, int>;
4493 auto Compare = llvm::less_first();
4494 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
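// Collect (distance from the lowest pointer, original index) pairs ordered by
// distance; the set also detects duplicated distances.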
4495 int Cnt = 0;
4496 bool IsConsecutive = true;
4497 for (const SCEV *PtrSCEV : SCEVs) {
4498 unsigned Dist = 0;
4499 if (PtrSCEV != PtrSCEVLowest) {
4500 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
4501 const SCEV *Coeff = TryGetStride(Diff, Stride);
4502 if (!Coeff)
4503 return std::nullopt;
4504 const auto *SC = dyn_cast<SCEVConstant>(Val: Coeff);
4505 if (!SC || isa<SCEVCouldNotCompute>(Val: SC))
4506 return std::nullopt;
4507 if (!SE.getMinusSCEV(LHS: PtrSCEV, RHS: SE.getAddExpr(LHS: PtrSCEVLowest,
4508 RHS: SE.getMulExpr(LHS: Stride, RHS: SC)))
4509 ->isZero())
4510 return std::nullopt;
4511 Dist = SC->getAPInt().getZExtValue();
4512 }
4513 // If the strides are not the same or repeated, we can't vectorize.
4514 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4515 return std::nullopt;
4516 auto Res = Offsets.emplace(args&: Dist, args&: Cnt);
4517 if (!Res.second)
4518 return std::nullopt;
4519 // Consecutive order if the inserted element is the last one.
4520 IsConsecutive = IsConsecutive && std::next(x: Res.first) == Offsets.end();
4521 ++Cnt;
4522 }
4523 if (Offsets.size() != SCEVs.size())
4524 return std::nullopt;
4525 SortedIndices.clear();
4526 if (!IsConsecutive) {
4527 // Fill SortedIndices array only if it is non-consecutive.
4528 SortedIndices.resize(N: PointerOps.size());
4529 Cnt = 0;
4530 for (const std::pair<int64_t, int> &Pair : Offsets) {
4531 SortedIndices[Cnt] = Pair.second;
4532 ++Cnt;
4533 }
4534 }
4535 if (!Inst)
4536 return nullptr;
4537 SCEVExpander Expander(SE, DL, "strided-load-vec");
4538 return Expander.expandCodeFor(SH: Stride, Ty: Stride->getType(), I: Inst);
4539}
4540
4541static std::pair<InstructionCost, InstructionCost>
4542getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4543 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4544 Type *ScalarTy, VectorType *VecTy);
4545
4546BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4547 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4548 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4549 // Check that a vectorized load would load the same memory as a scalar
4550 // load. For example, we don't want to vectorize loads that are smaller
4551 // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
4552 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4553 // from such a struct, we read/write packed bits disagreeing with the
4554 // unvectorized version.
4555 Type *ScalarTy = VL0->getType();
4556
4557 if (DL->getTypeSizeInBits(Ty: ScalarTy) != DL->getTypeAllocSizeInBits(Ty: ScalarTy))
4558 return LoadsState::Gather;
4559
4560 // Make sure all loads in the bundle are simple - we can't vectorize
4561 // atomic or volatile loads.
4562 PointerOps.clear();
4563 const unsigned Sz = VL.size();
4564 PointerOps.resize(N: Sz);
4565 auto *POIter = PointerOps.begin();
4566 for (Value *V : VL) {
4567 auto *L = cast<LoadInst>(Val: V);
4568 if (!L->isSimple())
4569 return LoadsState::Gather;
4570 *POIter = L->getPointerOperand();
4571 ++POIter;
4572 }
4573
4574 Order.clear();
4575 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
4576 // Check the order of pointer operands or that all pointers are the same.
4577 bool IsSorted = sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order);
4578 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4579 if (!Order.empty() && !isPowerOf2_32(Value: VL.size())) {
4580 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4581 "supported with VectorizeNonPowerOf2");
4582 return LoadsState::Gather;
4583 }
4584
4585 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
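// Not sorted by a constant distance, but a runtime stride can be proven: try
// a strided load if the target supports it for this vector type.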
4586 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(Ty: VecTy) &&
4587 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment) &&
4588 calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
4589 return LoadsState::StridedVectorize;
4590 if (IsSorted || all_of(Range&: PointerOps, P: [&](Value *P) {
4591 return arePointersCompatible(Ptr1: P, Ptr2: PointerOps.front(), TLI: *TLI);
4592 })) {
4593 if (IsSorted) {
4594 Value *Ptr0;
4595 Value *PtrN;
4596 if (Order.empty()) {
4597 Ptr0 = PointerOps.front();
4598 PtrN = PointerOps.back();
4599 } else {
4600 Ptr0 = PointerOps[Order.front()];
4601 PtrN = PointerOps[Order.back()];
4602 }
4603 std::optional<int> Diff =
4604 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
4605 // Check that the sorted loads are consecutive.
4606 if (static_cast<unsigned>(*Diff) == Sz - 1)
4607 return LoadsState::Vectorize;
4608 // Quick check for a possibly strided access: the total distance must be a multiple of the number of gaps (Sz - 1).
4609 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4610 // Try to generate strided load node if:
4611 // 1. Target with strided load support is detected.
4612 // 2. The number of loads is greater than MinProfitableStridedLoads,
4613 // or the potential stride <= MaxProfitableLoadStride and the
4614 // potential stride is power-of-2 (to avoid perf regressions for the very
4615 // small number of loads) and max distance > number of loads, or potential
4616 // stride is -1.
4617 // 3. The loads are ordered, or number of unordered loads <=
4618 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4619 // (this check is to avoid extra costs for very expensive shuffles).
4620 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4621 (static_cast<unsigned>(std::abs(x: *Diff)) <=
4622 MaxProfitableLoadStride * Sz &&
4623 isPowerOf2_32(Value: std::abs(x: *Diff)))) &&
4624 static_cast<unsigned>(std::abs(x: *Diff)) > Sz) ||
4625 *Diff == -(static_cast<int>(Sz) - 1))) {
4626 int Stride = *Diff / static_cast<int>(Sz - 1);
4627 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4628 Align Alignment =
4629 cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()])
4630 ->getAlign();
4631 if (TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment)) {
4632 // Iterate through all pointers and check if all distances are
4633 // unique multiples of Stride.
4634 SmallSet<int, 4> Dists;
4635 for (Value *Ptr : PointerOps) {
4636 int Dist = 0;
4637 if (Ptr == PtrN)
4638 Dist = *Diff;
4639 else if (Ptr != Ptr0)
4640 Dist =
4641 *getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL: *DL, SE&: *SE);
4642 // If the strides are not the same or repeated, we can't
4643 // vectorize.
4644 if (((Dist / Stride) * Stride) != Dist ||
4645 !Dists.insert(V: Dist).second)
4646 break;
4647 }
4648 if (Dists.size() == Sz)
4649 return LoadsState::StridedVectorize;
4650 }
4651 }
4652 }
4653 }
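// Estimates whether splitting the bundle into smaller power-of-2 slices of
// consecutive/strided loads plus insert-subvector shuffles is cheaper than a
// single masked gather of the whole bundle.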
4654 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4655 unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
4656 unsigned MinVF = getMinVF(Sz);
4657 unsigned MaxVF = std::max<unsigned>(a: bit_floor(Value: VL.size() / 2), b: MinVF);
4658 MaxVF = std::min(a: getMaximumVF(ElemWidth: Sz, Opcode: Instruction::Load), b: MaxVF);
4659 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4660 unsigned VectorizedCnt = 0;
4661 SmallVector<LoadsState> States;
4662 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4663 Cnt += VF, ++VectorizedCnt) {
4664 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
4665 SmallVector<unsigned> Order;
4666 SmallVector<Value *> PointerOps;
4667 LoadsState LS =
4668 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps,
4669 /*TryRecursiveCheck=*/false);
4670 // If the slice can only be gathered, give up on this VF.
4671 if (LS == LoadsState::Gather)
4672 break;
4673 // If reordering is needed, treat it as a high-cost masked gather for now.
4674 if ((LS == LoadsState::Vectorize ||
4675 LS == LoadsState::StridedVectorize) &&
4676 !Order.empty() && !isReverseOrder(Order))
4677 LS = LoadsState::ScatterVectorize;
4678 States.push_back(Elt: LS);
4679 }
4680 // Can be vectorized later as a series of loads/insertelements.
4681 if (VectorizedCnt == VL.size() / VF) {
4682 // Compare the masked gather cost and the loads + insertsubvector costs.
4683 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4684 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4685 TTI, Ptrs: PointerOps, BasePtr: PointerOps.front(), Opcode: Instruction::GetElementPtr,
4686 CostKind, ScalarTy, VecTy);
4687 InstructionCost MaskedGatherCost =
4688 TTI.getGatherScatterOpCost(
4689 Opcode: Instruction::Load, DataTy: VecTy,
4690 Ptr: cast<LoadInst>(Val: VL0)->getPointerOperand(),
4691 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind) +
4692 VectorGEPCost - ScalarGEPCost;
4693 InstructionCost VecLdCost = 0;
4694 auto *SubVecTy = getWidenedType(ScalarTy, VF);
4695 for (auto [I, LS] : enumerate(First&: States)) {
4696 auto *LI0 = cast<LoadInst>(Val: VL[I * VF]);
4697 switch (LS) {
4698 case LoadsState::Vectorize: {
4699 auto [ScalarGEPCost, VectorGEPCost] =
4700 getGEPCosts(TTI, Ptrs: ArrayRef(PointerOps).slice(N: I * VF, M: VF),
4701 BasePtr: LI0->getPointerOperand(), Opcode: Instruction::Load,
4702 CostKind, ScalarTy, VecTy: SubVecTy);
4703 VecLdCost += TTI.getMemoryOpCost(
4704 Opcode: Instruction::Load, Src: SubVecTy, Alignment: LI0->getAlign(),
4705 AddressSpace: LI0->getPointerAddressSpace(), CostKind,
4706 OpdInfo: TTI::OperandValueInfo()) +
4707 VectorGEPCost - ScalarGEPCost;
4708 break;
4709 }
4710 case LoadsState::StridedVectorize: {
4711 auto [ScalarGEPCost, VectorGEPCost] =
4712 getGEPCosts(TTI, Ptrs: ArrayRef(PointerOps).slice(N: I * VF, M: VF),
4713 BasePtr: LI0->getPointerOperand(), Opcode: Instruction::Load,
4714 CostKind, ScalarTy, VecTy: SubVecTy);
4715 VecLdCost +=
4716 TTI.getStridedMemoryOpCost(
4717 Opcode: Instruction::Load, DataTy: SubVecTy, Ptr: LI0->getPointerOperand(),
4718 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind) +
4719 VectorGEPCost - ScalarGEPCost;
4720 break;
4721 }
4722 case LoadsState::ScatterVectorize: {
4723 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4724 TTI, Ptrs: ArrayRef(PointerOps).slice(N: I * VF, M: VF),
4725 BasePtr: LI0->getPointerOperand(), Opcode: Instruction::GetElementPtr,
4726 CostKind, ScalarTy, VecTy: SubVecTy);
4727 VecLdCost +=
4728 TTI.getGatherScatterOpCost(
4729 Opcode: Instruction::Load, DataTy: SubVecTy, Ptr: LI0->getPointerOperand(),
4730 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind) +
4731 VectorGEPCost - ScalarGEPCost;
4732 break;
4733 }
4734 case LoadsState::Gather:
4735 llvm_unreachable(
4736 "Expected only consecutive, strided or masked gather loads.");
4737 }
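// Build a mask that inserts this VF-wide slice into its position within the
// full vector and account for the insert-subvector shuffle cost.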
4738 SmallVector<int> ShuffleMask(VL.size());
4739 for (int Idx : seq<int>(Begin: 0, End: VL.size()))
4740 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4741 VecLdCost +=
4742 TTI.getShuffleCost(Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: ShuffleMask,
4743 CostKind, Index: I * VF, SubTp: SubVecTy);
4744 }
4745 // If the masked gather cost is not lower, it is better to vectorize, so
4746 // consider this bundle as a gather node. It will be estimated more
4747 // precisely later.
4748 if (MaskedGatherCost >= VecLdCost)
4749 return true;
4750 }
4751 }
4752 return false;
4753 };
4754 // TODO: Improve the analysis of the pointers: if not all of them are GEPs,
4755 // or some have more than 2 operands, we end up with a gather node, which
4756 // just increases the cost.
4757 Loop *L = LI->getLoopFor(BB: cast<LoadInst>(Val: VL0)->getParent());
4758 bool ProfitableGatherPointers =
4759 L && Sz > 2 &&
4760 static_cast<unsigned>(count_if(Range&: PointerOps, P: [L](Value *V) {
4761 return L->isLoopInvariant(V);
4762 })) <= Sz / 2;
4763 if (ProfitableGatherPointers || all_of(Range&: PointerOps, P: [IsSorted](Value *P) {
4764 auto *GEP = dyn_cast<GetElementPtrInst>(Val: P);
4765 return (IsSorted && !GEP && doesNotNeedToBeScheduled(V: P)) ||
4766 (GEP && GEP->getNumOperands() == 2 &&
4767 isa<Constant, Instruction>(Val: GEP->getOperand(i_nocapture: 1)));
4768 })) {
4769 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4770 if (TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) &&
4771 !TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment)) {
4772 // Check if potential masked gather can be represented as series
4773 // of loads + insertsubvectors.
4774 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4775 // If the masked gather cost is not lower, it is better to vectorize, so
4776 // consider this bundle as a gather node. It will be estimated more
4777 // precisely later.
4778 return LoadsState::Gather;
4779 }
4780 return LoadsState::ScatterVectorize;
4781 }
4782 }
4783 }
4784
4785 return LoadsState::Gather;
4786}
4787
4788static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4789 const DataLayout &DL, ScalarEvolution &SE,
4790 SmallVectorImpl<unsigned> &SortedIndices) {
4791 assert(llvm::all_of(
4792 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4793 "Expected list of pointer operands.");
4794 // Map from bases to a vector of (Ptr, Offset, OrigIdx) tuples. Each Ptr is
4795 // inserted into the vector of its base; the vectors are then sorted by offset
4796 // and the sorted indices are returned with related values next to one another.
4797 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4798 Bases[VL[0]].push_back(Elt: std::make_tuple(args: VL[0], args: 0U, args: 0U));
4799
4800 unsigned Cnt = 1;
4801 for (Value *Ptr : VL.drop_front()) {
4802 bool Found = any_of(Range&: Bases, P: [&](auto &Base) {
4803 std::optional<int> Diff =
4804 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4805 /*StrictCheck=*/true);
4806 if (!Diff)
4807 return false;
4808
4809 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4810 return true;
4811 });
4812
4813 if (!Found) {
4814 // If we haven't found enough to usefully cluster, return early.
4815 if (Bases.size() > VL.size() / 2 - 1)
4816 return false;
4817
4818 // Not found already - add a new Base
4819 Bases[Ptr].emplace_back(Args&: Ptr, Args: 0, Args: Cnt++);
4820 }
4821 }
4822
4823 // For each of the bases sort the pointers by Offset and check if any of the
4824 // bases' pointers become consecutive.
4825 bool AnyConsecutive = false;
4826 for (auto &Base : Bases) {
4827 auto &Vec = Base.second;
4828 if (Vec.size() > 1) {
4829 llvm::stable_sort(Range&: Vec, C: [](const std::tuple<Value *, int, unsigned> &X,
4830 const std::tuple<Value *, int, unsigned> &Y) {
4831 return std::get<1>(t: X) < std::get<1>(t: Y);
4832 });
4833 int InitialOffset = std::get<1>(t&: Vec[0]);
4834 AnyConsecutive |= all_of(Range: enumerate(First&: Vec), P: [InitialOffset](const auto &P) {
4835 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4836 });
4837 }
4838 }
4839
4840 // Fill the SortedIndices array only if it looks worthwhile to sort the ptrs.
4841 SortedIndices.clear();
4842 if (!AnyConsecutive)
4843 return false;
4844
4845 for (auto &Base : Bases) {
4846 for (auto &T : Base.second)
4847 SortedIndices.push_back(Elt: std::get<2>(t&: T));
4848 }
4849
4850 assert(SortedIndices.size() == VL.size() &&
4851 "Expected SortedIndices to be the size of VL");
4852 return true;
4853}
4854
4855std::optional<BoUpSLP::OrdersType>
4856BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4857 assert(TE.isGather() && "Expected gather node only.");
4858 Type *ScalarTy = TE.Scalars[0]->getType();
4859
4860 SmallVector<Value *> Ptrs;
4861 Ptrs.reserve(N: TE.Scalars.size());
4862 for (Value *V : TE.Scalars) {
4863 auto *L = dyn_cast<LoadInst>(Val: V);
4864 if (!L || !L->isSimple())
4865 return std::nullopt;
4866 Ptrs.push_back(Elt: L->getPointerOperand());
4867 }
4868
4869 BoUpSLP::OrdersType Order;
4870 if (clusterSortPtrAccesses(VL: Ptrs, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
4871 return std::move(Order);
4872 return std::nullopt;
4873}
4874
4875/// Check if two insertelement instructions are from the same buildvector.
4876static bool areTwoInsertFromSameBuildVector(
4877 InsertElementInst *VU, InsertElementInst *V,
4878 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4879 // Instructions must be from the same basic block.
4880 if (VU->getParent() != V->getParent())
4881 return false;
4882 // Checks if 2 insertelements are from the same buildvector.
4883 if (VU->getType() != V->getType())
4884 return false;
4885 // Inserts with multiple uses are separate nodes.
4886 if (!VU->hasOneUse() && !V->hasOneUse())
4887 return false;
4888 auto *IE1 = VU;
4889 auto *IE2 = V;
4890 std::optional<unsigned> Idx1 = getElementIndex(Inst: IE1);
4891 std::optional<unsigned> Idx2 = getElementIndex(Inst: IE2);
4892 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4893 return false;
4894 // Go through the vector operand of insertelement instructions trying to find
4895 // either VU as the original vector for IE2 or V as the original vector for
4896 // IE1.
4897 SmallBitVector ReusedIdx(
4898 cast<VectorType>(Val: VU->getType())->getElementCount().getKnownMinValue());
4899 bool IsReusedIdx = false;
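// Walk both insertelement chains towards their base vectors, tracking which
// lanes have already been written; hitting a reused lane means the two chains
// do not form a single buildvector.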
4900 do {
4901 if (IE2 == VU && !IE1)
4902 return VU->hasOneUse();
4903 if (IE1 == V && !IE2)
4904 return V->hasOneUse();
4905 if (IE1 && IE1 != V) {
4906 unsigned Idx1 = getElementIndex(Inst: IE1).value_or(u&: *Idx2);
4907 IsReusedIdx |= ReusedIdx.test(Idx: Idx1);
4908 ReusedIdx.set(Idx1);
4909 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4910 IE1 = nullptr;
4911 else
4912 IE1 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE1));
4913 }
4914 if (IE2 && IE2 != VU) {
4915 unsigned Idx2 = getElementIndex(Inst: IE2).value_or(u&: *Idx1);
4916 IsReusedIdx |= ReusedIdx.test(Idx: Idx2);
4917 ReusedIdx.set(Idx2);
4918 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4919 IE2 = nullptr;
4920 else
4921 IE2 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE2));
4922 }
4923 } while (!IsReusedIdx && (IE1 || IE2));
4924 return false;
4925}
4926
4927std::optional<BoUpSLP::OrdersType>
4928BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4929 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4930 if (TE.isNonPowOf2Vec())
4931 return std::nullopt;
4932
4933 // No need to reorder if we need to shuffle reuses; the node still has to be
4934 // shuffled anyway.
4935 if (!TE.ReuseShuffleIndices.empty()) {
4936 if (isSplat(VL: TE.Scalars))
4937 return std::nullopt;
4938 // Check if reuse shuffle indices can be improved by reordering.
4939 // For this, check that the reuse mask is "clustered", i.e. each scalar value
4940 // is used once in each submask of size <number_of_scalars>.
4941 // Example: 4 scalar values.
4942 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4943 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4944 // element 3 is used twice in the second submask.
4945 unsigned Sz = TE.Scalars.size();
4946 if (TE.isGather()) {
4947 if (std::optional<OrdersType> CurrentOrder =
4948 findReusedOrderedScalars(TE)) {
4949 SmallVector<int> Mask;
4950 fixupOrderingIndices(Order: *CurrentOrder);
4951 inversePermutation(Indices: *CurrentOrder, Mask);
4952 ::addMask(Mask, SubMask: TE.ReuseShuffleIndices);
4953 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4954 unsigned Sz = TE.Scalars.size();
4955 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4956 for (auto [I, Idx] : enumerate(First: ArrayRef(Mask).slice(N: K * Sz, M: Sz)))
4957 if (Idx != PoisonMaskElem)
4958 Res[Idx + K * Sz] = I + K * Sz;
4959 }
4960 return std::move(Res);
4961 }
4962 }
4963 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4964 TTI->getNumberOfParts(Tp: getWidenedType(ScalarTy: TE.Scalars.front()->getType(),
4965 VF: 2 * TE.getVectorFactor())) == 1)
4966 return std::nullopt;
4967 if (!ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
4968 VF: Sz)) {
4969 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4970 if (TE.ReorderIndices.empty())
4971 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
4972 else
4973 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
4974 ::addMask(Mask&: ReorderMask, SubMask: TE.ReuseShuffleIndices);
4975 unsigned VF = ReorderMask.size();
4976 OrdersType ResOrder(VF, VF);
4977 unsigned NumParts = divideCeil(Numerator: VF, Denominator: Sz);
4978 SmallBitVector UsedVals(NumParts);
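// For each cluster of size Sz check that it reads from a single source value
// and record that value's position; clusters mixing different values, reusing
// a value or containing too many undefs make the reordering unprofitable.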
4979 for (unsigned I = 0; I < VF; I += Sz) {
4980 int Val = PoisonMaskElem;
4981 unsigned UndefCnt = 0;
4982 unsigned Limit = std::min(a: Sz, b: VF - I);
4983 if (any_of(Range: ArrayRef(ReorderMask).slice(N: I, M: Limit),
4984 P: [&](int Idx) {
4985 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4986 Val = Idx;
4987 if (Idx == PoisonMaskElem)
4988 ++UndefCnt;
4989 return Idx != PoisonMaskElem && Idx != Val;
4990 }) ||
4991 Val >= static_cast<int>(NumParts) || UsedVals.test(Idx: Val) ||
4992 UndefCnt > Sz / 2)
4993 return std::nullopt;
4994 UsedVals.set(Val);
4995 for (unsigned K = 0; K < NumParts; ++K)
4996 ResOrder[Val + Sz * K] = I + K;
4997 }
4998 return std::move(ResOrder);
4999 }
5000 unsigned VF = TE.getVectorFactor();
5001 // Try to build the correct order for extractelement instructions.
5002 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5003 TE.ReuseShuffleIndices.end());
5004 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
5005 all_of(Range: TE.Scalars, P: [Sz](Value *V) {
5006 std::optional<unsigned> Idx = getExtractIndex(E: cast<Instruction>(Val: V));
5007 return Idx && *Idx < Sz;
5008 })) {
5009 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5010 if (TE.ReorderIndices.empty())
5011 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
5012 else
5013 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
5014 for (unsigned I = 0; I < VF; ++I) {
5015 int &Idx = ReusedMask[I];
5016 if (Idx == PoisonMaskElem)
5017 continue;
5018 Value *V = TE.Scalars[ReorderMask[Idx]];
5019 std::optional<unsigned> EI = getExtractIndex(E: cast<Instruction>(Val: V));
5020 Idx = std::distance(first: ReorderMask.begin(), last: find(Range&: ReorderMask, Val: *EI));
5021 }
5022 }
5023 // Build the order of VF size; the reuses shuffles need to be reordered, as
5024 // they are always of VF size.
5025 OrdersType ResOrder(VF);
5026 std::iota(first: ResOrder.begin(), last: ResOrder.end(), value: 0);
5027 auto *It = ResOrder.begin();
5028 for (unsigned K = 0; K < VF; K += Sz) {
5029 OrdersType CurrentOrder(TE.ReorderIndices);
5030 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(N: K, M: Sz)};
5031 if (SubMask.front() == PoisonMaskElem)
5032 std::iota(first: SubMask.begin(), last: SubMask.end(), value: 0);
5033 reorderOrder(Order&: CurrentOrder, Mask: SubMask);
5034 transform(Range&: CurrentOrder, d_first: It, F: [K](unsigned Pos) { return Pos + K; });
5035 std::advance(i&: It, n: Sz);
5036 }
5037 if (TE.isGather() && all_of(Range: enumerate(First&: ResOrder), P: [](const auto &Data) {
5038 return Data.index() == Data.value();
5039 }))
5040 return std::nullopt; // No need to reorder.
5041 return std::move(ResOrder);
5042 }
5043 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5044 any_of(Range: TE.UserTreeIndices,
5045 P: [](const EdgeInfo &EI) {
5046 return !Instruction::isBinaryOp(Opcode: EI.UserTE->getOpcode());
5047 }) &&
5048 (TE.ReorderIndices.empty() || isReverseOrder(Order: TE.ReorderIndices)))
5049 return std::nullopt;
5050 if ((TE.State == TreeEntry::Vectorize ||
5051 TE.State == TreeEntry::StridedVectorize) &&
5052 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: TE.getMainOp()) ||
5053 (TopToBottom && isa<StoreInst, InsertElementInst>(Val: TE.getMainOp()))) &&
5054 !TE.isAltShuffle())
5055 return TE.ReorderIndices;
5056 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
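// Order the PHIs by the number of their uses and, for PHIs feeding the same
// buildvector or extracting from the same vector, by the lane they are
// inserted into/extracted from.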
5057 auto PHICompare = [&](unsigned I1, unsigned I2) {
5058 Value *V1 = TE.Scalars[I1];
5059 Value *V2 = TE.Scalars[I2];
5060 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5061 return false;
5062 if (V1->getNumUses() < V2->getNumUses())
5063 return true;
5064 if (V1->getNumUses() > V2->getNumUses())
5065 return false;
5066 auto *FirstUserOfPhi1 = cast<Instruction>(Val: *V1->user_begin());
5067 auto *FirstUserOfPhi2 = cast<Instruction>(Val: *V2->user_begin());
5068 if (auto *IE1 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi1))
5069 if (auto *IE2 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi2)) {
5070 if (!areTwoInsertFromSameBuildVector(
5071 VU: IE1, V: IE2,
5072 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); }))
5073 return I1 < I2;
5074 return getElementIndex(Inst: IE1) < getElementIndex(Inst: IE2);
5075 }
5076 if (auto *EE1 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi1))
5077 if (auto *EE2 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi2)) {
5078 if (EE1->getOperand(i_nocapture: 0) != EE2->getOperand(i_nocapture: 0))
5079 return I1 < I2;
5080 return getElementIndex(Inst: EE1) < getElementIndex(Inst: EE2);
5081 }
5082 return I1 < I2;
5083 };
5084 auto IsIdentityOrder = [](const OrdersType &Order) {
5085 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Order.size()))
5086 if (Idx != Order[Idx])
5087 return false;
5088 return true;
5089 };
5090 if (!TE.ReorderIndices.empty())
5091 return TE.ReorderIndices;
5092 DenseMap<unsigned, unsigned> PhiToId;
5093 SmallVector<unsigned> Phis(TE.Scalars.size());
5094 std::iota(first: Phis.begin(), last: Phis.end(), value: 0);
5095 OrdersType ResOrder(TE.Scalars.size());
5096 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
5097 PhiToId[Id] = Id;
5098 stable_sort(Range&: Phis, C: PHICompare);
5099 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
5100 ResOrder[Id] = PhiToId[Phis[Id]];
5101 if (IsIdentityOrder(ResOrder))
5102 return std::nullopt; // No need to reorder.
5103 return std::move(ResOrder);
5104 }
5105 if (TE.isGather() && !TE.isAltShuffle() && allSameType(VL: TE.Scalars)) {
5106 // TODO: add analysis of other gather nodes with extractelement
5107 // instructions and other values/instructions, not only undefs.
5108 if ((TE.getOpcode() == Instruction::ExtractElement ||
5109 (all_of(Range: TE.Scalars, P: IsaPred<UndefValue, ExtractElementInst>) &&
5110 any_of(Range: TE.Scalars, P: IsaPred<ExtractElementInst>))) &&
5111 all_of(Range: TE.Scalars, P: [](Value *V) {
5112 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
5113 return !EE || isa<FixedVectorType>(Val: EE->getVectorOperandType());
5114 })) {
5115 // Check that gather of extractelements can be represented as
5116 // just a shuffle of a single vector.
5117 OrdersType CurrentOrder;
5118 bool Reuse = canReuseExtract(VL: TE.Scalars, OpValue: TE.getMainOp(), CurrentOrder,
5119 /*ResizeAllowed=*/true);
5120 if (Reuse || !CurrentOrder.empty())
5121 return std::move(CurrentOrder);
5122 }
5123 // If the gather node is <undef, v, .., poison> and
5124 // insertelement poison, v, 0 [+ permute]
5125 // is cheaper than
5126 // insertelement poison, v, n - try to reorder.
5127 // If rotating the whole graph, exclude the permute cost, since the whole
5128 // graph might be transformed.
5129 int Sz = TE.Scalars.size();
5130 if (isSplat(VL: TE.Scalars) && !allConstant(VL: TE.Scalars) &&
5131 count_if(Range: TE.Scalars, P: IsaPred<UndefValue>) == Sz - 1) {
5132 const auto *It =
5133 find_if(Range: TE.Scalars, P: [](Value *V) { return !isConstant(V); });
5134 if (It == TE.Scalars.begin())
5135 return OrdersType();
5136 auto *Ty = getWidenedType(ScalarTy: TE.Scalars.front()->getType(), VF: Sz);
5137 if (It != TE.Scalars.end()) {
5138 OrdersType Order(Sz, Sz);
5139 unsigned Idx = std::distance(first: TE.Scalars.begin(), last: It);
5140 Order[Idx] = 0;
5141 fixupOrderingIndices(Order);
5142 SmallVector<int> Mask;
5143 inversePermutation(Indices: Order, Mask);
5144 InstructionCost PermuteCost =
5145 TopToBottom
5146 ? 0
5147 : TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: Ty, Mask);
5148 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5149 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: 0,
5150 Op0: PoisonValue::get(T: Ty), Op1: *It);
5151 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5152 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: Idx,
5153 Op0: PoisonValue::get(T: Ty), Op1: *It);
5154 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5155 OrdersType Order(Sz, Sz);
5156 Order[Idx] = 0;
5157 return std::move(Order);
5158 }
5159 }
5160 }
5161 if (isSplat(VL: TE.Scalars))
5162 return std::nullopt;
5163 if (TE.Scalars.size() >= 4)
5164 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5165 return Order;
5166 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5167 return CurrentOrder;
5168 }
5169 return std::nullopt;
5170}
5171
5172/// Checks if the given mask is a "clustered" mask with the same clusters of
5173/// size \p Sz, which are not identity submasks.
5174static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5175 unsigned Sz) {
5176 ArrayRef<int> FirstCluster = Mask.slice(N: 0, M: Sz);
5177 if (ShuffleVectorInst::isIdentityMask(Mask: FirstCluster, NumSrcElts: Sz))
5178 return false;
5179 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5180 ArrayRef<int> Cluster = Mask.slice(N: I, M: Sz);
5181 if (Cluster != FirstCluster)
5182 return false;
5183 }
5184 return true;
5185}
5186
5187void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5188 // Reorder reuses mask.
5189 reorderReuses(Reuses&: TE.ReuseShuffleIndices, Mask);
5190 const unsigned Sz = TE.Scalars.size();
5191 // For vectorized nodes and non-clustered reuses nothing else needs to be done.
5192 if (!TE.isGather() ||
5193 !ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
5194 VF: Sz) ||
5195 !isRepeatedNonIdentityClusteredMask(Mask: TE.ReuseShuffleIndices, Sz))
5196 return;
5197 SmallVector<int> NewMask;
5198 inversePermutation(Indices: TE.ReorderIndices, Mask&: NewMask);
5199 addMask(Mask&: NewMask, SubMask: TE.ReuseShuffleIndices);
5200 // Clear reorder since it is going to be applied to the new mask.
5201 TE.ReorderIndices.clear();
5202 // Try to improve gathered nodes with clustered reuses, if possible.
5203 ArrayRef<int> Slice = ArrayRef(NewMask).slice(N: 0, M: Sz);
5204 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
5205 inversePermutation(Indices: NewOrder, Mask&: NewMask);
5206 reorderScalars(Scalars&: TE.Scalars, Mask: NewMask);
5207 // Fill the reuses mask with the identity submasks.
5208 for (auto *It = TE.ReuseShuffleIndices.begin(),
5209 *End = TE.ReuseShuffleIndices.end();
5210 It != End; std::advance(i&: It, n: Sz))
5211 std::iota(first: It, last: std::next(x: It, n: Sz), value: 0);
5212}
5213
5214static void combineOrders(MutableArrayRef<unsigned> Order,
5215 ArrayRef<unsigned> SecondaryOrder) {
5216 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5217 "Expected same size of orders");
5218 unsigned Sz = Order.size();
5219 SmallBitVector UsedIndices(Sz);
5220 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz)) {
5221 if (Order[Idx] != Sz)
5222 UsedIndices.set(Order[Idx]);
5223 }
5224 if (SecondaryOrder.empty()) {
5225 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
5226 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5227 Order[Idx] = Idx;
5228 } else {
5229 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
5230 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5231 !UsedIndices.test(Idx: SecondaryOrder[Idx]))
5232 Order[Idx] = SecondaryOrder[Idx];
5233 }
5234}
5235
5236void BoUpSLP::reorderTopToBottom() {
5237 // Maps VF to the graph nodes.
5238 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5239 // ExtractElement gather nodes which can be vectorized and need to handle
5240 // their ordering.
5241 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5242
5243 // Phi nodes can have preferred ordering based on their result users
5244 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5245
5246 // AltShuffles can also have a preferred ordering that leads to fewer
5247 // instructions, e.g., the addsub instruction in x86.
5248 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5249
5250 // Maps a TreeEntry to the reorder indices of external users.
5251 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5252 ExternalUserReorderMap;
5253 // Find all reorderable nodes with the given VF.
5254 // Currently these are vectorized stores, loads, extracts + some gathering of
5255 // extracts.
5256 for_each(Range&: VectorizableTree, F: [&, &TTIRef = *TTI](
5257 const std::unique_ptr<TreeEntry> &TE) {
5258 // Look for external users that will probably be vectorized.
5259 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5260 findExternalStoreUsersReorderIndices(TE: TE.get());
5261 if (!ExternalUserReorderIndices.empty()) {
5262 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
5263 ExternalUserReorderMap.try_emplace(Key: TE.get(),
5264 Args: std::move(ExternalUserReorderIndices));
5265 }
5266
5267 // Patterns like [fadd,fsub] can be combined into a single instruction in
5268 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5269 // to take into account their order when looking for the most used order.
5270 if (TE->isAltShuffle()) {
5271 VectorType *VecTy =
5272 getWidenedType(ScalarTy: TE->Scalars[0]->getType(), VF: TE->Scalars.size());
5273 unsigned Opcode0 = TE->getOpcode();
5274 unsigned Opcode1 = TE->getAltOpcode();
5275 SmallBitVector OpcodeMask(getAltInstrMask(VL: TE->Scalars, Opcode0, Opcode1));
5276 // If this pattern is supported by the target then we consider the order.
5277 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5278 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
5279 AltShufflesToOrders.try_emplace(Key: TE.get(), Args: OrdersType());
5280 }
5281 // TODO: Check the reverse order too.
5282 }
5283
5284 if (std::optional<OrdersType> CurrentOrder =
5285 getReorderingData(TE: *TE, /*TopToBottom=*/true)) {
5286 // Do not include ordering for nodes used in the alt opcode vectorization;
5287 // it is better to reorder them during the bottom-to-top stage. If we follow
5288 // the order here, it causes reordering of the whole graph, though it is
5289 // actually profitable just to reorder the subgraph that starts from the
5290 // alternate opcode vectorization node. Such nodes already end up with a
5291 // shuffle instruction, and it is enough to change this shuffle rather than
5292 // rotate the scalars for the whole graph.
5293 unsigned Cnt = 0;
5294 const TreeEntry *UserTE = TE.get();
5295 while (UserTE && Cnt < RecursionMaxDepth) {
5296 if (UserTE->UserTreeIndices.size() != 1)
5297 break;
5298 if (all_of(Range: UserTE->UserTreeIndices, P: [](const EdgeInfo &EI) {
5299 return EI.UserTE->State == TreeEntry::Vectorize &&
5300 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5301 }))
5302 return;
5303 UserTE = UserTE->UserTreeIndices.back().UserTE;
5304 ++Cnt;
5305 }
5306 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
5307 if (!(TE->State == TreeEntry::Vectorize ||
5308 TE->State == TreeEntry::StridedVectorize) ||
5309 !TE->ReuseShuffleIndices.empty())
5310 GathersToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
5311 if (TE->State == TreeEntry::Vectorize &&
5312 TE->getOpcode() == Instruction::PHI)
5313 PhisToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
5314 }
5315 });
5316
5317 // Reorder the graph nodes according to their vectorization factor.
5318 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5319 VF /= 2) {
5320 auto It = VFToOrderedEntries.find(Val: VF);
5321 if (It == VFToOrderedEntries.end())
5322 continue;
5323 // Try to find the most profitable order. We are just looking for the most
5324 // used order and reorder the scalar elements in the nodes according to
5325 // this most used order.
5326 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5327 // All operands are reordered and used only in this node - propagate the
5328 // most used order to the user node.
5329 MapVector<OrdersType, unsigned,
5330 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5331 OrdersUses;
5332 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
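// Count how many entries prefer each particular order; the most frequently
// used order is later chosen for the whole group of nodes with this VF.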
5333 for (const TreeEntry *OpTE : OrderedEntries) {
5334 // No need to reorder these nodes; we still need to extend them and use a
5335 // shuffle, just merge the reordering shuffle and the reuse shuffle.
5336 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(Val: OpTE))
5337 continue;
5338 // Count number of orders uses.
5339 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5340 &PhisToOrders]() -> const OrdersType & {
5341 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5342 auto It = GathersToOrders.find(Val: OpTE);
5343 if (It != GathersToOrders.end())
5344 return It->second;
5345 }
5346 if (OpTE->isAltShuffle()) {
5347 auto It = AltShufflesToOrders.find(Val: OpTE);
5348 if (It != AltShufflesToOrders.end())
5349 return It->second;
5350 }
5351 if (OpTE->State == TreeEntry::Vectorize &&
5352 OpTE->getOpcode() == Instruction::PHI) {
5353 auto It = PhisToOrders.find(Val: OpTE);
5354 if (It != PhisToOrders.end())
5355 return It->second;
5356 }
5357 return OpTE->ReorderIndices;
5358 }();
5359 // First consider the order of the external scalar users.
5360 auto It = ExternalUserReorderMap.find(Val: OpTE);
5361 if (It != ExternalUserReorderMap.end()) {
5362 const auto &ExternalUserReorderIndices = It->second;
5363 // If the OpTE vector factor != number of scalars, use the natural order;
5364 // this is an attempt to reorder a node with reused scalars but with
5365 // external uses.
5366 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5367 OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0)).first->second +=
5368 ExternalUserReorderIndices.size();
5369 } else {
5370 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5371 ++OrdersUses.insert(KV: std::make_pair(x: ExtOrder, y: 0)).first->second;
5372 }
5373 // No other useful reorder data in this entry.
5374 if (Order.empty())
5375 continue;
5376 }
5377 // Stores actually store the mask, not the order; we need to invert it.
5378 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5379 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5380 SmallVector<int> Mask;
5381 inversePermutation(Indices: Order, Mask);
5382 unsigned E = Order.size();
5383 OrdersType CurrentOrder(E, E);
5384 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
5385 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5386 });
5387 fixupOrderingIndices(Order: CurrentOrder);
5388 ++OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second;
5389 } else {
5390 ++OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second;
5391 }
5392 }
5393 if (OrdersUses.empty())
5394 continue;
5395 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5396 const unsigned Sz = Order.size();
5397 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
5398 if (Idx != Order[Idx] && Order[Idx] != Sz)
5399 return false;
5400 return true;
5401 };
5402 // Choose the most used order.
5403 unsigned IdentityCnt = 0;
5404 unsigned FilledIdentityCnt = 0;
5405 OrdersType IdentityOrder(VF, VF);
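// Merge all identity-compatible orders into a single candidate and remember
// how many uses it represents; it competes with the explicit orders below.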
5406 for (auto &Pair : OrdersUses) {
5407 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5408 if (!Pair.first.empty())
5409 FilledIdentityCnt += Pair.second;
5410 IdentityCnt += Pair.second;
5411 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
5412 }
5413 }
5414 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5415 unsigned Cnt = IdentityCnt;
5416 for (auto &Pair : OrdersUses) {
5417 // Prefer the identity order. But if a filled identity (non-empty order) was
5418 // found with the same number of uses as the new candidate order, we can
5419 // choose this candidate order instead.
5420 if (Cnt < Pair.second ||
5421 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5422 Cnt == Pair.second && !BestOrder.empty() &&
5423 IsIdentityOrder(BestOrder))) {
5424 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
5425 BestOrder = Pair.first;
5426 Cnt = Pair.second;
5427 } else {
5428 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
5429 }
5430 }
5431 // Set order of the user node.
5432 if (IsIdentityOrder(BestOrder))
5433 continue;
5434 fixupOrderingIndices(Order: BestOrder);
5435 SmallVector<int> Mask;
5436 inversePermutation(Indices: BestOrder, Mask);
5437 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5438 unsigned E = BestOrder.size();
5439 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
5440 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5441 });
5442 // Do an actual reordering, if profitable.
5443 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5444 // Just do the reordering for the nodes with the given VF.
5445 if (TE->Scalars.size() != VF) {
5446 if (TE->ReuseShuffleIndices.size() == VF) {
5447 // Need to reorder the reuses masks of the operands with smaller VF to
5448 // be able to find the match between the graph nodes and scalar
5449 // operands of the given node during vectorization/cost estimation.
5450 assert(all_of(TE->UserTreeIndices,
5451 [VF, &TE](const EdgeInfo &EI) {
5452 return EI.UserTE->Scalars.size() == VF ||
5453 EI.UserTE->Scalars.size() ==
5454 TE->Scalars.size();
5455 }) &&
5456 "All users must be of VF size.");
5457 // Update ordering of the operands with the smaller VF than the given
5458 // one.
5459 reorderNodeWithReuses(TE&: *TE, Mask);
5460 }
5461 continue;
5462 }
5463 if ((TE->State == TreeEntry::Vectorize ||
5464 TE->State == TreeEntry::StridedVectorize) &&
5465 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
5466 InsertElementInst>(Val: TE->getMainOp()) &&
5467 !TE->isAltShuffle()) {
5468 // Build correct orders for extract{element,value}, loads and
5469 // stores.
5470 reorderOrder(Order&: TE->ReorderIndices, Mask);
5471 if (isa<InsertElementInst, StoreInst>(Val: TE->getMainOp()))
5472 TE->reorderOperands(Mask);
5473 } else {
5474 // Reorder the node and its operands.
5475 TE->reorderOperands(Mask);
5476 assert(TE->ReorderIndices.empty() &&
5477 "Expected empty reorder sequence.");
5478 reorderScalars(Scalars&: TE->Scalars, Mask);
5479 }
5480 if (!TE->ReuseShuffleIndices.empty()) {
5481 // Apply reversed order to keep the original ordering of the reused
5482 // elements to avoid extra reorder indices shuffling.
5483 OrdersType CurrentOrder;
5484 reorderOrder(Order&: CurrentOrder, Mask: MaskOrder);
5485 SmallVector<int> NewReuses;
5486 inversePermutation(Indices: CurrentOrder, Mask&: NewReuses);
5487 addMask(Mask&: NewReuses, SubMask: TE->ReuseShuffleIndices);
5488 TE->ReuseShuffleIndices.swap(RHS&: NewReuses);
5489 }
5490 }
5491 }
5492}
5493
5494bool BoUpSLP::canReorderOperands(
5495 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5496 ArrayRef<TreeEntry *> ReorderableGathers,
5497 SmallVectorImpl<TreeEntry *> &GatherOps) {
5498 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5499 if (UserTE->isNonPowOf2Vec())
5500 return false;
5501
5502 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5503 if (any_of(Range&: Edges, P: [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5504 return OpData.first == I &&
5505 (OpData.second->State == TreeEntry::Vectorize ||
5506 OpData.second->State == TreeEntry::StridedVectorize);
5507 }))
5508 continue;
5509 if (TreeEntry *TE = getVectorizedOperand(UserTE, OpIdx: I)) {
5510 // Do not reorder if the operand node is used by multiple user nodes.
5511 if (any_of(Range&: TE->UserTreeIndices,
5512 P: [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5513 return false;
5514 // Add the node to the list of the ordered nodes with the identity
5515 // order.
5516 Edges.emplace_back(Args&: I, Args&: TE);
5517 // Add ScatterVectorize nodes to the list of operands, where just
5518 // reordering of the scalars is required. Similar to the gathers, so
5519 // simply add to the list of gathered ops.
5520 // If there are reused scalars, process this node as a regular vectorize
5521 // node, just reorder reuses mask.
5522 if (TE->State != TreeEntry::Vectorize &&
5523 TE->State != TreeEntry::StridedVectorize &&
5524 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5525 GatherOps.push_back(Elt: TE);
5526 continue;
5527 }
5528 TreeEntry *Gather = nullptr;
5529 if (count_if(Range&: ReorderableGathers,
5530 P: [&Gather, UserTE, I](TreeEntry *TE) {
5531 assert(TE->State != TreeEntry::Vectorize &&
5532 TE->State != TreeEntry::StridedVectorize &&
5533 "Only non-vectorized nodes are expected.");
5534 if (any_of(Range&: TE->UserTreeIndices,
5535 P: [UserTE, I](const EdgeInfo &EI) {
5536 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5537 })) {
5538 assert(TE->isSame(UserTE->getOperand(I)) &&
5539 "Operand entry does not match operands.");
5540 Gather = TE;
5541 return true;
5542 }
5543 return false;
5544 }) > 1 &&
5545 !allConstant(VL: UserTE->getOperand(OpIdx: I)))
5546 return false;
5547 if (Gather)
5548 GatherOps.push_back(Elt: Gather);
5549 }
5550 return true;
5551}
5552
5553void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5554 SetVector<TreeEntry *> OrderedEntries;
5555 DenseSet<const TreeEntry *> GathersToOrders;
5556 // Find all reorderable leaf nodes with the given VF.
5557  // Currently these are vectorized loads and extracts without alternate
5558  // operands, plus some gathering of extracts.
5559 SmallVector<TreeEntry *> NonVectorized;
5560 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5561 if (TE->State != TreeEntry::Vectorize &&
5562 TE->State != TreeEntry::StridedVectorize)
5563 NonVectorized.push_back(Elt: TE.get());
5564 if (std::optional<OrdersType> CurrentOrder =
5565 getReorderingData(TE: *TE, /*TopToBottom=*/false)) {
5566 OrderedEntries.insert(X: TE.get());
5567 if (!(TE->State == TreeEntry::Vectorize ||
5568 TE->State == TreeEntry::StridedVectorize) ||
5569 !TE->ReuseShuffleIndices.empty())
5570 GathersToOrders.insert(V: TE.get());
5571 }
5572 }
5573
5574 // 1. Propagate order to the graph nodes, which use only reordered nodes.
5575 // I.e., if the node has operands, that are reordered, try to make at least
5576 // one operand order in the natural order and reorder others + reorder the
5577 // user node itself.
5578 SmallPtrSet<const TreeEntry *, 4> Visited;
5579 while (!OrderedEntries.empty()) {
5580 // 1. Filter out only reordered nodes.
5581 // 2. If the entry has multiple uses - skip it and jump to the next node.
5582 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
5583 SmallVector<TreeEntry *> Filtered;
5584 for (TreeEntry *TE : OrderedEntries) {
5585 if (!(TE->State == TreeEntry::Vectorize ||
5586 TE->State == TreeEntry::StridedVectorize ||
5587 (TE->isGather() && GathersToOrders.contains(V: TE))) ||
5588 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5589 !all_of(Range: drop_begin(RangeOrContainer&: TE->UserTreeIndices),
5590 P: [TE](const EdgeInfo &EI) {
5591 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5592 }) ||
5593 !Visited.insert(Ptr: TE).second) {
5594 Filtered.push_back(Elt: TE);
5595 continue;
5596 }
5597      // Build a map between user nodes and their operand order to speed up the
5598      // search. The graph currently does not provide this dependency directly.
5599 for (EdgeInfo &EI : TE->UserTreeIndices) {
5600 TreeEntry *UserTE = EI.UserTE;
5601 auto It = Users.find(Val: UserTE);
5602 if (It == Users.end())
5603 It = Users.insert(KV: {UserTE, {}}).first;
5604 It->second.emplace_back(Args&: EI.EdgeIdx, Args&: TE);
5605 }
5606 }
5607 // Erase filtered entries.
5608 for (TreeEntry *TE : Filtered)
5609 OrderedEntries.remove(X: TE);
5610 SmallVector<
5611 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5612 UsersVec(Users.begin(), Users.end());
5613 sort(C&: UsersVec, Comp: [](const auto &Data1, const auto &Data2) {
5614 return Data1.first->Idx > Data2.first->Idx;
5615 });
5616 for (auto &Data : UsersVec) {
5617 // Check that operands are used only in the User node.
5618 SmallVector<TreeEntry *> GatherOps;
5619 if (!canReorderOperands(UserTE: Data.first, Edges&: Data.second, ReorderableGathers: NonVectorized,
5620 GatherOps)) {
5621 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5622 OrderedEntries.remove(X: Op.second);
5623 continue;
5624 }
5625 // All operands are reordered and used only in this node - propagate the
5626 // most used order to the user node.
5627 MapVector<OrdersType, unsigned,
5628 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5629 OrdersUses;
5630      // Do the analysis for each tree entry only once, otherwise the order of
5631      // the same node may be considered several times, even though it might not
5632      // be profitable.
5633 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5634 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
5635 for (const auto &Op : Data.second) {
5636 TreeEntry *OpTE = Op.second;
5637 if (!VisitedOps.insert(Ptr: OpTE).second)
5638 continue;
5639 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(V: OpTE))
5640 continue;
5641 const auto Order = [&]() -> const OrdersType {
5642 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
5643 return getReorderingData(TE: *OpTE, /*TopToBottom=*/false)
5644 .value_or(u: OrdersType(1));
5645 return OpTE->ReorderIndices;
5646 }();
5647 // The order is partially ordered, skip it in favor of fully non-ordered
5648 // orders.
5649 if (Order.size() == 1)
5650 continue;
5651 unsigned NumOps = count_if(
5652 Range&: Data.second, P: [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5653 return P.second == OpTE;
5654 });
5655        // Stores actually record the mask, not the order; we need to invert it.
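        // E.g., if the stored Order is {1, 2, 0}, the inverted order accumulated
        // into OrdersUses is {2, 0, 1}.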
5656 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5657 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5658 SmallVector<int> Mask;
5659 inversePermutation(Indices: Order, Mask);
5660 unsigned E = Order.size();
5661 OrdersType CurrentOrder(E, E);
5662 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
5663 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5664 });
5665 fixupOrderingIndices(Order: CurrentOrder);
5666 OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second +=
5667 NumOps;
5668 } else {
5669 OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second += NumOps;
5670 }
5671 auto Res = OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0));
5672 const auto AllowsReordering = [&](const TreeEntry *TE) {
5673 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5674 if (TE->isNonPowOf2Vec())
5675 return false;
5676 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5677 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5678 (IgnoreReorder && TE->Idx == 0))
5679 return true;
5680 if (TE->isGather()) {
5681 if (GathersToOrders.contains(V: TE))
5682 return !getReorderingData(TE: *TE, /*TopToBottom=*/false)
5683 .value_or(u: OrdersType(1))
5684 .empty();
5685 return true;
5686 }
5687 return false;
5688 };
5689 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5690 TreeEntry *UserTE = EI.UserTE;
5691 if (!VisitedUsers.insert(Ptr: UserTE).second)
5692 continue;
5693 // May reorder user node if it requires reordering, has reused
5694 // scalars, is an alternate op vectorize node or its op nodes require
5695 // reordering.
5696 if (AllowsReordering(UserTE))
5697 continue;
5698 // Check if users allow reordering.
5699          // Currently we look up just 1 level of operands to avoid an increase
5700          // in compile time.
5701          // It is profitable to reorder if definitely more operands allow
5702          // reordering than those that keep the natural order.
5703 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
5704 if (static_cast<unsigned>(count_if(
5705 Range&: Ops, P: [UserTE, &AllowsReordering](
5706 const std::pair<unsigned, TreeEntry *> &Op) {
5707 return AllowsReordering(Op.second) &&
5708 all_of(Range&: Op.second->UserTreeIndices,
5709 P: [UserTE](const EdgeInfo &EI) {
5710 return EI.UserTE == UserTE;
5711 });
5712 })) <= Ops.size() / 2)
5713 ++Res.first->second;
5714 }
5715 }
5716 if (OrdersUses.empty()) {
5717 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5718 OrderedEntries.remove(X: Op.second);
5719 continue;
5720 }
5721 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5722 const unsigned Sz = Order.size();
5723 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
5724 if (Idx != Order[Idx] && Order[Idx] != Sz)
5725 return false;
5726 return true;
5727 };
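      // E.g., with Sz == 4, both {0, 1, 2, 3} and {0, 4, 2, 4} are treated as
      // identity, since an element equal to Sz acts as an "any position"
      // placeholder.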
5728 // Choose the most used order.
5729 unsigned IdentityCnt = 0;
5730 unsigned VF = Data.second.front().second->getVectorFactor();
5731 OrdersType IdentityOrder(VF, VF);
5732 for (auto &Pair : OrdersUses) {
5733 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5734 IdentityCnt += Pair.second;
5735 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
5736 }
5737 }
5738 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5739 unsigned Cnt = IdentityCnt;
5740 for (auto &Pair : OrdersUses) {
5741        // Prefer the identity order. But if a filled identity (non-empty order)
5742        // was found with the same number of uses as the new candidate order, we
5743        // can choose this candidate order.
5744 if (Cnt < Pair.second) {
5745 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
5746 BestOrder = Pair.first;
5747 Cnt = Pair.second;
5748 } else {
5749 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
5750 }
5751 }
5752 // Set order of the user node.
5753 if (IsIdentityOrder(BestOrder)) {
5754 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5755 OrderedEntries.remove(X: Op.second);
5756 continue;
5757 }
5758 fixupOrderingIndices(Order: BestOrder);
5759 // Erase operands from OrderedEntries list and adjust their orders.
5760 VisitedOps.clear();
5761 SmallVector<int> Mask;
5762 inversePermutation(Indices: BestOrder, Mask);
5763 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5764 unsigned E = BestOrder.size();
5765 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
5766 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5767 });
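      // E.g., BestOrder == {2, 0, 1, 3} gives Mask == {1, 2, 0, 3} (the inverse
      // permutation) and MaskOrder == {2, 0, 1, 3}.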
5768 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5769 TreeEntry *TE = Op.second;
5770 OrderedEntries.remove(X: TE);
5771 if (!VisitedOps.insert(Ptr: TE).second)
5772 continue;
5773 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5774 reorderNodeWithReuses(TE&: *TE, Mask);
5775 continue;
5776 }
5777 // Gathers are processed separately.
5778 if (TE->State != TreeEntry::Vectorize &&
5779 TE->State != TreeEntry::StridedVectorize &&
5780 (TE->State != TreeEntry::ScatterVectorize ||
5781 TE->ReorderIndices.empty()))
5782 continue;
5783 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5784 TE->ReorderIndices.empty()) &&
5785 "Non-matching sizes of user/operand entries.");
5786 reorderOrder(Order&: TE->ReorderIndices, Mask);
5787 if (IgnoreReorder && TE == VectorizableTree.front().get())
5788 IgnoreReorder = false;
5789 }
5790      // For gathers we just need to reorder their scalars.
5791 for (TreeEntry *Gather : GatherOps) {
5792 assert(Gather->ReorderIndices.empty() &&
5793 "Unexpected reordering of gathers.");
5794 if (!Gather->ReuseShuffleIndices.empty()) {
5795 // Just reorder reuses indices.
5796 reorderReuses(Reuses&: Gather->ReuseShuffleIndices, Mask);
5797 continue;
5798 }
5799 reorderScalars(Scalars&: Gather->Scalars, Mask);
5800 OrderedEntries.remove(X: Gather);
5801 }
5802 // Reorder operands of the user node and set the ordering for the user
5803 // node itself.
5804 if (Data.first->State != TreeEntry::Vectorize ||
5805 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5806 Val: Data.first->getMainOp()) ||
5807 Data.first->isAltShuffle())
5808 Data.first->reorderOperands(Mask);
5809 if (!isa<InsertElementInst, StoreInst>(Val: Data.first->getMainOp()) ||
5810 Data.first->isAltShuffle() ||
5811 Data.first->State == TreeEntry::StridedVectorize) {
5812 reorderScalars(Scalars&: Data.first->Scalars, Mask);
5813 reorderOrder(Order&: Data.first->ReorderIndices, Mask: MaskOrder,
5814 /*BottomOrder=*/true);
5815 if (Data.first->ReuseShuffleIndices.empty() &&
5816 !Data.first->ReorderIndices.empty() &&
5817 !Data.first->isAltShuffle()) {
5818 // Insert user node to the list to try to sink reordering deeper in
5819 // the graph.
5820 OrderedEntries.insert(X: Data.first);
5821 }
5822 } else {
5823 reorderOrder(Order&: Data.first->ReorderIndices, Mask);
5824 }
5825 }
5826 }
5827 // If the reordering is unnecessary, just remove the reorder.
5828 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5829 VectorizableTree.front()->ReuseShuffleIndices.empty())
5830 VectorizableTree.front()->ReorderIndices.clear();
5831}
5832
5833void BoUpSLP::buildExternalUses(
5834 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5835 DenseMap<Value *, unsigned> ScalarToExtUses;
5836 // Collect the values that we need to extract from the tree.
5837 for (auto &TEPtr : VectorizableTree) {
5838 TreeEntry *Entry = TEPtr.get();
5839
5840 // No need to handle users of gathered values.
5841 if (Entry->isGather())
5842 continue;
5843
5844 // For each lane:
5845 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5846 Value *Scalar = Entry->Scalars[Lane];
5847 if (!isa<Instruction>(Val: Scalar))
5848 continue;
5849 // All uses must be replaced already? No need to do it again.
5850 auto It = ScalarToExtUses.find(Val: Scalar);
5851 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5852 continue;
5853
5854 // Check if the scalar is externally used as an extra arg.
5855 const auto *ExtI = ExternallyUsedValues.find(Key: Scalar);
5856 if (ExtI != ExternallyUsedValues.end()) {
5857 int FoundLane = Entry->findLaneForValue(V: Scalar);
5858 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5859 << FoundLane << " from " << *Scalar << ".\n");
5860 ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size());
5861 ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: FoundLane);
5862 continue;
5863 }
5864 for (User *U : Scalar->users()) {
5865 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5866
5867 Instruction *UserInst = dyn_cast<Instruction>(Val: U);
5868 if (!UserInst || isDeleted(I: UserInst))
5869 continue;
5870
5871 // Ignore users in the user ignore list.
5872 if (UserIgnoreList && UserIgnoreList->contains(V: UserInst))
5873 continue;
5874
5875 // Skip in-tree scalars that become vectors
5876 if (TreeEntry *UseEntry = getTreeEntry(V: U)) {
5877 // Some in-tree scalars will remain as scalar in vectorized
5878 // instructions. If that is the case, the one in FoundLane will
5879 // be used.
5880 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5881 !doesInTreeUserNeedToExtract(
5882 Scalar, UserInst: cast<Instruction>(Val: UseEntry->Scalars.front()), TLI)) {
5883 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5884 << ".\n");
5885 assert(!UseEntry->isGather() && "Bad state");
5886 continue;
5887 }
5888 U = nullptr;
5889 if (It != ScalarToExtUses.end()) {
5890 ExternalUses[It->second].User = nullptr;
5891 break;
5892 }
5893 }
5894
5895 if (U && Scalar->hasNUsesOrMore(N: UsesLimit))
5896 U = nullptr;
5897 int FoundLane = Entry->findLaneForValue(V: Scalar);
5898 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5899 << " from lane " << FoundLane << " from " << *Scalar
5900 << ".\n");
5901 It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
5902 ExternalUses.emplace_back(Args&: Scalar, Args&: U, Args&: FoundLane);
5903 if (!U)
5904 break;
5905 }
5906 }
5907 }
5908}
5909
5910DenseMap<Value *, SmallVector<StoreInst *>>
5911BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5912 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5913 for (unsigned Lane : seq<unsigned>(Begin: 0, End: TE->Scalars.size())) {
5914 Value *V = TE->Scalars[Lane];
5915 // To save compilation time we don't visit if we have too many users.
5916 if (V->hasNUsesOrMore(N: UsesLimit))
5917 break;
5918
5919 // Collect stores per pointer object.
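    // E.g., stores into a[0] and a[1] share the underlying object `a` and end
    // up in the same PtrToStoresMap bucket.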
5920 for (User *U : V->users()) {
5921 auto *SI = dyn_cast<StoreInst>(Val: U);
5922 if (SI == nullptr || !SI->isSimple() ||
5923 !isValidElementType(Ty: SI->getValueOperand()->getType()))
5924 continue;
5925      // Skip the store if it is already part of the tree.
5926 if (getTreeEntry(V: U))
5927 continue;
5928
5929 Value *Ptr = getUnderlyingObject(V: SI->getPointerOperand());
5930 auto &StoresVec = PtrToStoresMap[Ptr];
5931 // For now just keep one store per pointer object per lane.
5932 // TODO: Extend this to support multiple stores per pointer per lane
5933 if (StoresVec.size() > Lane)
5934 continue;
5935 // Skip if in different BBs.
5936 if (!StoresVec.empty() &&
5937 SI->getParent() != StoresVec.back()->getParent())
5938 continue;
5939 // Make sure that the stores are of the same type.
5940 if (!StoresVec.empty() &&
5941 SI->getValueOperand()->getType() !=
5942 StoresVec.back()->getValueOperand()->getType())
5943 continue;
5944 StoresVec.push_back(Elt: SI);
5945 }
5946 }
5947 return PtrToStoresMap;
5948}
5949
5950bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5951 OrdersType &ReorderIndices) const {
5952  // We check whether the stores in StoresVec can form a vector by sorting them
5953  // and checking whether they are consecutive.
5954
5955 // To avoid calling getPointersDiff() while sorting we create a vector of
5956 // pairs {store, offset from first} and sort this instead.
5957 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5958 StoreInst *S0 = StoresVec[0];
5959 StoreOffsetVec[0] = {S0, 0};
5960 Type *S0Ty = S0->getValueOperand()->getType();
5961 Value *S0Ptr = S0->getPointerOperand();
5962 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoresVec.size())) {
5963 StoreInst *SI = StoresVec[Idx];
5964 std::optional<int> Diff =
5965 getPointersDiff(ElemTyA: S0Ty, PtrA: S0Ptr, ElemTyB: SI->getValueOperand()->getType(),
5966 PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
5967 /*StrictCheck=*/true);
5968 // We failed to compare the pointers so just abandon this StoresVec.
5969 if (!Diff)
5970 return false;
5971 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5972 }
5973
5974 // Sort the vector based on the pointers. We create a copy because we may
5975 // need the original later for calculating the reorder (shuffle) indices.
5976 stable_sort(Range&: StoreOffsetVec, C: [](const std::pair<StoreInst *, int> &Pair1,
5977 const std::pair<StoreInst *, int> &Pair2) {
5978 int Offset1 = Pair1.second;
5979 int Offset2 = Pair2.second;
5980 return Offset1 < Offset2;
5981 });
5982
5983 // Check if the stores are consecutive by checking if their difference is 1.
5984 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoreOffsetVec.size()))
5985 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5986 return false;
5987
5988 // Calculate the shuffle indices according to their offset against the sorted
5989 // StoreOffsetVec.
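  // E.g., for stores to p+2, p+0, p+1, p+3 (element offsets relative to the
  // first store: {0, -2, -1, 1}), the sorted offsets are {-2, -1, 0, 1} and
  // ReorderIndices becomes {2, 0, 1, 3}.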
5990 ReorderIndices.reserve(N: StoresVec.size());
5991 for (StoreInst *SI : StoresVec) {
5992 unsigned Idx = find_if(Range&: StoreOffsetVec,
5993 P: [SI](const std::pair<StoreInst *, int> &Pair) {
5994 return Pair.first == SI;
5995 }) -
5996 StoreOffsetVec.begin();
5997 ReorderIndices.push_back(Elt: Idx);
5998 }
5999 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6000 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6001 // same convention here.
6002 auto IsIdentityOrder = [](const OrdersType &Order) {
6003 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Order.size()))
6004 if (Idx != Order[Idx])
6005 return false;
6006 return true;
6007 };
6008 if (IsIdentityOrder(ReorderIndices))
6009 ReorderIndices.clear();
6010
6011 return true;
6012}
6013
6014#ifndef NDEBUG
6015LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
6016 for (unsigned Idx : Order)
6017 dbgs() << Idx << ", ";
6018 dbgs() << "\n";
6019}
6020#endif
6021
6022SmallVector<BoUpSLP::OrdersType, 1>
6023BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6024 unsigned NumLanes = TE->Scalars.size();
6025
6026 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
6027 collectUserStores(TE);
6028
6029 // Holds the reorder indices for each candidate store vector that is a user of
6030 // the current TreeEntry.
6031 SmallVector<OrdersType, 1> ExternalReorderIndices;
6032
6033 // Now inspect the stores collected per pointer and look for vectorization
6034 // candidates. For each candidate calculate the reorder index vector and push
6035  // it into `ExternalReorderIndices`.
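  // E.g., a TreeEntry with 4 scalars whose lanes feed 4 stores to consecutive
  // addresses (one store per lane, same underlying pointer object) contributes
  // a single reorder-index vector here.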
6036 for (const auto &Pair : PtrToStoresMap) {
6037 auto &StoresVec = Pair.second;
6038 // If we have fewer than NumLanes stores, then we can't form a vector.
6039 if (StoresVec.size() != NumLanes)
6040 continue;
6041
6042 // If the stores are not consecutive then abandon this StoresVec.
6043 OrdersType ReorderIndices;
6044 if (!canFormVector(StoresVec, ReorderIndices))
6045 continue;
6046
6047 // We now know that the scalars in StoresVec can form a vector instruction,
6048 // so set the reorder indices.
6049 ExternalReorderIndices.push_back(Elt: ReorderIndices);
6050 }
6051 return ExternalReorderIndices;
6052}
6053
6054void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6055 const SmallDenseSet<Value *> &UserIgnoreLst) {
6056 deleteTree();
6057 UserIgnoreList = &UserIgnoreLst;
6058 if (!allSameType(VL: Roots))
6059 return;
6060 buildTree_rec(Roots, Depth: 0, EI: EdgeInfo());
6061}
6062
6063void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6064 deleteTree();
6065 if (!allSameType(VL: Roots))
6066 return;
6067 buildTree_rec(Roots, Depth: 0, EI: EdgeInfo());
6068}
6069
6070/// \return true if the specified list of values has only one instruction that
6071/// requires scheduling, false otherwise.
6072#ifndef NDEBUG
6073static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
6074 Value *NeedsScheduling = nullptr;
6075 for (Value *V : VL) {
6076 if (doesNotNeedToBeScheduled(V))
6077 continue;
6078 if (!NeedsScheduling) {
6079 NeedsScheduling = V;
6080 continue;
6081 }
6082 return false;
6083 }
6084 return NeedsScheduling;
6085}
6086#endif
6087
6088/// Generates a key/subkey pair for the given value to provide effective sorting
6089/// of the values and better detection of vectorizable value sequences. The
6090/// keys/subkeys can be used for better sorting of the values themselves (keys)
6091/// and within value subgroups (subkeys).
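/// For example, simple loads get a key derived from the Load opcode and the
/// load type, while their subkey comes from \p LoadsSubkeyGenerator (which
/// typically groups loads by the distance between their pointers).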
6092static std::pair<size_t, size_t> generateKeySubkey(
6093 Value *V, const TargetLibraryInfo *TLI,
6094 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
6095 bool AllowAlternate) {
6096 hash_code Key = hash_value(value: V->getValueID() + 2);
6097 hash_code SubKey = hash_value(value: 0);
6098 // Sort the loads by the distance between the pointers.
6099 if (auto *LI = dyn_cast<LoadInst>(Val: V)) {
6100 Key = hash_combine(args: LI->getType(), args: hash_value(value: Instruction::Load), args: Key);
6101 if (LI->isSimple())
6102 SubKey = hash_value(code: LoadsSubkeyGenerator(Key, LI));
6103 else
6104 Key = SubKey = hash_value(ptr: LI);
6105 } else if (isVectorLikeInstWithConstOps(V)) {
6106 // Sort extracts by the vector operands.
6107 if (isa<ExtractElementInst, UndefValue>(Val: V))
6108 Key = hash_value(value: Value::UndefValueVal + 1);
6109 if (auto *EI = dyn_cast<ExtractElementInst>(Val: V)) {
6110 if (!isUndefVector(V: EI->getVectorOperand()).all() &&
6111 !isa<UndefValue>(Val: EI->getIndexOperand()))
6112 SubKey = hash_value(ptr: EI->getVectorOperand());
6113 }
6114 } else if (auto *I = dyn_cast<Instruction>(Val: V)) {
6115    // Sort other instructions just by the opcode, except for CmpInst.
6116    // For CmpInst also sort by the predicate kind.
6117 if ((isa<BinaryOperator, CastInst>(Val: I)) &&
6118 isValidForAlternation(Opcode: I->getOpcode())) {
6119 if (AllowAlternate)
6120 Key = hash_value(value: isa<BinaryOperator>(Val: I) ? 1 : 0);
6121 else
6122 Key = hash_combine(args: hash_value(value: I->getOpcode()), args: Key);
6123 SubKey = hash_combine(
6124 args: hash_value(value: I->getOpcode()), args: hash_value(ptr: I->getType()),
6125 args: hash_value(ptr: isa<BinaryOperator>(Val: I)
6126 ? I->getType()
6127 : cast<CastInst>(Val: I)->getOperand(i_nocapture: 0)->getType()));
6128 // For casts, look through the only operand to improve compile time.
6129 if (isa<CastInst>(Val: I)) {
6130 std::pair<size_t, size_t> OpVals =
6131 generateKeySubkey(V: I->getOperand(i: 0), TLI, LoadsSubkeyGenerator,
6132 /*AllowAlternate=*/true);
6133 Key = hash_combine(args: OpVals.first, args: Key);
6134 SubKey = hash_combine(args: OpVals.first, args: SubKey);
6135 }
6136 } else if (auto *CI = dyn_cast<CmpInst>(Val: I)) {
6137 CmpInst::Predicate Pred = CI->getPredicate();
6138 if (CI->isCommutative())
6139 Pred = std::min(a: Pred, b: CmpInst::getInversePredicate(pred: Pred));
6140 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(pred: Pred);
6141 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: Pred),
6142 args: hash_value(value: SwapPred),
6143 args: hash_value(ptr: CI->getOperand(i_nocapture: 0)->getType()));
6144 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
6145 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI);
6146 if (isTriviallyVectorizable(ID)) {
6147 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: ID));
6148 } else if (!VFDatabase(*Call).getMappings(CI: *Call).empty()) {
6149 SubKey = hash_combine(args: hash_value(value: I->getOpcode()),
6150 args: hash_value(ptr: Call->getCalledFunction()));
6151 } else {
6152 Key = hash_combine(args: hash_value(ptr: Call), args: Key);
6153 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(ptr: Call));
6154 }
6155 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
6156 SubKey = hash_combine(args: hash_value(value: Op.Begin), args: hash_value(value: Op.End),
6157 args: hash_value(ptr: Op.Tag), args: SubKey);
6158 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
6159 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Val: Gep->getOperand(i_nocapture: 1)))
6160 SubKey = hash_value(ptr: Gep->getPointerOperand());
6161 else
6162 SubKey = hash_value(ptr: Gep);
6163 } else if (BinaryOperator::isIntDivRem(Opcode: I->getOpcode()) &&
6164 !isa<ConstantInt>(Val: I->getOperand(i: 1))) {
6165 // Do not try to vectorize instructions with potentially high cost.
6166 SubKey = hash_value(ptr: I);
6167 } else {
6168 SubKey = hash_value(value: I->getOpcode());
6169 }
6170 Key = hash_combine(args: hash_value(ptr: I->getParent()), args: Key);
6171 }
6172 return std::make_pair(x&: Key, y&: SubKey);
6173}
6174
6175/// Checks if the specified instruction \p I is an alternate operation for
6176/// the given \p MainOp and \p AltOp instructions.
6177static bool isAlternateInstruction(const Instruction *I,
6178 const Instruction *MainOp,
6179 const Instruction *AltOp,
6180 const TargetLibraryInfo &TLI);
6181
6182bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
6183 ArrayRef<Value *> VL) const {
6184 unsigned Opcode0 = S.getOpcode();
6185 unsigned Opcode1 = S.getAltOpcode();
6186 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
6187 // If this pattern is supported by the target then consider it profitable.
6188 if (TTI->isLegalAltInstr(VecTy: getWidenedType(ScalarTy: S.MainOp->getType(), VF: VL.size()),
6189 Opcode0, Opcode1, OpcodeMask))
6190 return true;
6191 SmallVector<ValueList> Operands;
6192 for (unsigned I : seq<unsigned>(Begin: 0, End: S.MainOp->getNumOperands())) {
6193 Operands.emplace_back();
6194 // Prepare the operand vector.
6195 for (Value *V : VL)
6196 Operands.back().push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
6197 }
6198 if (Operands.size() == 2) {
6199    // Try to find the best operand candidates.
6200 for (unsigned I : seq<unsigned>(Begin: 0, End: VL.size() - 1)) {
6201 SmallVector<std::pair<Value *, Value *>> Candidates(3);
6202 Candidates[0] = std::make_pair(x&: Operands[0][I], y&: Operands[0][I + 1]);
6203 Candidates[1] = std::make_pair(x&: Operands[0][I], y&: Operands[1][I + 1]);
6204 Candidates[2] = std::make_pair(x&: Operands[1][I], y&: Operands[0][I + 1]);
6205 std::optional<int> Res = findBestRootPair(Candidates);
6206 switch (Res.value_or(u: 0)) {
6207 case 0:
6208 break;
6209 case 1:
6210 std::swap(a&: Operands[0][I + 1], b&: Operands[1][I + 1]);
6211 break;
6212 case 2:
6213 std::swap(a&: Operands[0][I], b&: Operands[1][I]);
6214 break;
6215 default:
6216 llvm_unreachable("Unexpected index.");
6217 }
6218 }
6219 }
6220 DenseSet<unsigned> UniqueOpcodes;
6221 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
6222 unsigned NonInstCnt = 0;
6223  // Estimate the number of instructions required for the vectorized node and
6224  // for the buildvector node.
6225 unsigned UndefCnt = 0;
6226 // Count the number of extra shuffles, required for vector nodes.
6227 unsigned ExtraShuffleInsts = 0;
6228  // Check that the operands do not contain the same values and create either a
6229  // perfect diamond match or a shuffled match.
6230 if (Operands.size() == 2) {
6231 // Do not count same operands twice.
6232 if (Operands.front() == Operands.back()) {
6233 Operands.erase(CI: Operands.begin());
6234 } else if (!allConstant(VL: Operands.front()) &&
6235 all_of(Range&: Operands.front(), P: [&](Value *V) {
6236 return is_contained(Range&: Operands.back(), Element: V);
6237 })) {
6238 Operands.erase(CI: Operands.begin());
6239 ++ExtraShuffleInsts;
6240 }
6241 }
6242 const Loop *L = LI->getLoopFor(BB: S.MainOp->getParent());
6243  // Vectorize the node if:
6244  // 1. At least a single operand is constant or splat.
6245  // 2. Operands have many loop invariants (the instructions themselves are not
6246  //    loop invariant).
6247  // 3. At least a single unique operand is supposed to be vectorized.
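  // For instance, if one operand vector is all-constant and the other consists
  // of distinct same-opcode instructions from a single block, both operands
  // generally pass the early check in the none_of below and the node is
  // treated as profitable.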
6248 return none_of(Range&: Operands,
6249 P: [&](ArrayRef<Value *> Op) {
6250 if (allConstant(VL: Op) ||
6251 (!isSplat(VL: Op) && allSameBlock(VL: Op) && allSameType(VL: Op) &&
6252 getSameOpcode(VL: Op, TLI: *TLI).MainOp))
6253 return false;
6254 DenseMap<Value *, unsigned> Uniques;
6255 for (Value *V : Op) {
6256 if (isa<Constant, ExtractElementInst>(Val: V) ||
6257 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
6258 if (isa<UndefValue>(Val: V))
6259 ++UndefCnt;
6260 continue;
6261 }
6262 auto Res = Uniques.try_emplace(Key: V, Args: 0);
6263 // Found first duplicate - need to add shuffle.
6264 if (!Res.second && Res.first->second == 1)
6265 ++ExtraShuffleInsts;
6266 ++Res.first->getSecond();
6267 if (auto *I = dyn_cast<Instruction>(Val: V))
6268 UniqueOpcodes.insert(V: I->getOpcode());
6269 else if (Res.second)
6270 ++NonInstCnt;
6271 }
6272 return none_of(Range&: Uniques, P: [&](const auto &P) {
6273 return P.first->hasNUsesOrMore(P.second + 1) &&
6274 none_of(P.first->users(), [&](User *U) {
6275 return getTreeEntry(V: U) || Uniques.contains(Val: U);
6276 });
6277 });
6278 }) ||
6279         // Do not vectorize the node if the estimated number of vector
6280         // instructions exceeds the estimated number of buildvector instructions.
6281         // The number of vector operands is the number of vector instructions +
6282         // the number of vector instructions for the operands (buildvectors).
6283         // Buildvector instructions are just number_of_operands * number_of_scalars.
6284 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6285 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
6286 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6287}
6288
6289BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6290 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
6291 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
6292 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
6293
6294 unsigned ShuffleOrOp =
6295 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
6296 auto *VL0 = cast<Instruction>(Val: S.OpValue);
6297 switch (ShuffleOrOp) {
6298 case Instruction::PHI: {
6299 // Too many operands - gather, most probably won't be vectorized.
6300 if (VL0->getNumOperands() > MaxPHINumOperands)
6301 return TreeEntry::NeedToGather;
6302 // Check for terminator values (e.g. invoke).
6303 for (Value *V : VL)
6304 for (Value *Incoming : cast<PHINode>(Val: V)->incoming_values()) {
6305 Instruction *Term = dyn_cast<Instruction>(Val: Incoming);
6306 if (Term && Term->isTerminator()) {
6307 LLVM_DEBUG(dbgs()
6308 << "SLP: Need to swizzle PHINodes (terminator use).\n");
6309 return TreeEntry::NeedToGather;
6310 }
6311 }
6312
6313 return TreeEntry::Vectorize;
6314 }
6315 case Instruction::ExtractValue:
6316 case Instruction::ExtractElement: {
6317 bool Reuse = canReuseExtract(VL, OpValue: VL0, CurrentOrder);
6318 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6319 if (!isPowerOf2_32(Value: VL.size()))
6320 return TreeEntry::NeedToGather;
6321 if (Reuse || !CurrentOrder.empty())
6322 return TreeEntry::Vectorize;
6323 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6324 return TreeEntry::NeedToGather;
6325 }
6326 case Instruction::InsertElement: {
6327 // Check that we have a buildvector and not a shuffle of 2 or more
6328 // different vectors.
6329 ValueSet SourceVectors;
6330 for (Value *V : VL) {
6331 SourceVectors.insert(Ptr: cast<Instruction>(Val: V)->getOperand(i: 0));
6332 assert(getElementIndex(V) != std::nullopt &&
6333 "Non-constant or undef index?");
6334 }
6335
6336 if (count_if(Range&: VL, P: [&SourceVectors](Value *V) {
6337 return !SourceVectors.contains(Ptr: V);
6338 }) >= 2) {
6339 // Found 2nd source vector - cancel.
6340 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6341 "different source vectors.\n");
6342 return TreeEntry::NeedToGather;
6343 }
6344
6345 return TreeEntry::Vectorize;
6346 }
6347 case Instruction::Load: {
6348 // Check that a vectorized load would load the same memory as a scalar
6349 // load. For example, we don't want to vectorize loads that are smaller
6350 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6351 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6352 // from such a struct, we read/write packed bits disagreeing with the
6353 // unvectorized version.
6354 switch (canVectorizeLoads(VL, VL0, Order&: CurrentOrder, PointerOps)) {
6355 case LoadsState::Vectorize:
6356 return TreeEntry::Vectorize;
6357 case LoadsState::ScatterVectorize:
6358 return TreeEntry::ScatterVectorize;
6359 case LoadsState::StridedVectorize:
6360 return TreeEntry::StridedVectorize;
6361 case LoadsState::Gather:
6362#ifndef NDEBUG
6363 Type *ScalarTy = VL0->getType();
6364 if (DL->getTypeSizeInBits(ScalarTy) !=
6365 DL->getTypeAllocSizeInBits(ScalarTy))
6366 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6367 else if (any_of(VL,
6368 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6369 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6370 else
6371 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6372#endif // NDEBUG
6373 return TreeEntry::NeedToGather;
6374 }
6375 llvm_unreachable("Unexpected state of loads");
6376 }
6377 case Instruction::ZExt:
6378 case Instruction::SExt:
6379 case Instruction::FPToUI:
6380 case Instruction::FPToSI:
6381 case Instruction::FPExt:
6382 case Instruction::PtrToInt:
6383 case Instruction::IntToPtr:
6384 case Instruction::SIToFP:
6385 case Instruction::UIToFP:
6386 case Instruction::Trunc:
6387 case Instruction::FPTrunc:
6388 case Instruction::BitCast: {
6389 Type *SrcTy = VL0->getOperand(i: 0)->getType();
6390 for (Value *V : VL) {
6391 Type *Ty = cast<Instruction>(Val: V)->getOperand(i: 0)->getType();
6392 if (Ty != SrcTy || !isValidElementType(Ty)) {
6393 LLVM_DEBUG(
6394 dbgs() << "SLP: Gathering casts with different src types.\n");
6395 return TreeEntry::NeedToGather;
6396 }
6397 }
6398 return TreeEntry::Vectorize;
6399 }
6400 case Instruction::ICmp:
6401 case Instruction::FCmp: {
6402 // Check that all of the compares have the same predicate.
6403 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
6404 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(pred: P0);
6405 Type *ComparedTy = VL0->getOperand(i: 0)->getType();
6406 for (Value *V : VL) {
6407 CmpInst *Cmp = cast<CmpInst>(Val: V);
6408 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6409 Cmp->getOperand(i_nocapture: 0)->getType() != ComparedTy) {
6410 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6411 return TreeEntry::NeedToGather;
6412 }
6413 }
6414 return TreeEntry::Vectorize;
6415 }
6416 case Instruction::Select:
6417 case Instruction::FNeg:
6418 case Instruction::Add:
6419 case Instruction::FAdd:
6420 case Instruction::Sub:
6421 case Instruction::FSub:
6422 case Instruction::Mul:
6423 case Instruction::FMul:
6424 case Instruction::UDiv:
6425 case Instruction::SDiv:
6426 case Instruction::FDiv:
6427 case Instruction::URem:
6428 case Instruction::SRem:
6429 case Instruction::FRem:
6430 case Instruction::Shl:
6431 case Instruction::LShr:
6432 case Instruction::AShr:
6433 case Instruction::And:
6434 case Instruction::Or:
6435 case Instruction::Xor:
6436 return TreeEntry::Vectorize;
6437 case Instruction::GetElementPtr: {
6438 // We don't combine GEPs with complicated (nested) indexing.
6439 for (Value *V : VL) {
6440 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6441 if (!I)
6442 continue;
6443 if (I->getNumOperands() != 2) {
6444 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6445 return TreeEntry::NeedToGather;
6446 }
6447 }
6448
6449 // We can't combine several GEPs into one vector if they operate on
6450 // different types.
6451 Type *Ty0 = cast<GEPOperator>(Val: VL0)->getSourceElementType();
6452 for (Value *V : VL) {
6453 auto *GEP = dyn_cast<GEPOperator>(Val: V);
6454 if (!GEP)
6455 continue;
6456 Type *CurTy = GEP->getSourceElementType();
6457 if (Ty0 != CurTy) {
6458 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6459 return TreeEntry::NeedToGather;
6460 }
6461 }
6462
6463 // We don't combine GEPs with non-constant indexes.
6464 Type *Ty1 = VL0->getOperand(i: 1)->getType();
6465 for (Value *V : VL) {
6466 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6467 if (!I)
6468 continue;
6469 auto *Op = I->getOperand(i_nocapture: 1);
6470 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
6471 (Op->getType() != Ty1 &&
6472 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
6473 Op->getType()->getScalarSizeInBits() >
6474 DL->getIndexSizeInBits(
6475 AS: V->getType()->getPointerAddressSpace())))) {
6476 LLVM_DEBUG(
6477 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6478 return TreeEntry::NeedToGather;
6479 }
6480 }
6481
6482 return TreeEntry::Vectorize;
6483 }
6484 case Instruction::Store: {
6485 // Check if the stores are consecutive or if we need to swizzle them.
6486 llvm::Type *ScalarTy = cast<StoreInst>(Val: VL0)->getValueOperand()->getType();
6487 // Avoid types that are padded when being allocated as scalars, while
6488 // being packed together in a vector (such as i1).
6489 if (DL->getTypeSizeInBits(Ty: ScalarTy) !=
6490 DL->getTypeAllocSizeInBits(Ty: ScalarTy)) {
6491 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6492 return TreeEntry::NeedToGather;
6493 }
6494 // Make sure all stores in the bundle are simple - we can't vectorize
6495 // atomic or volatile stores.
6496 for (Value *V : VL) {
6497 auto *SI = cast<StoreInst>(Val: V);
6498 if (!SI->isSimple()) {
6499 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6500 return TreeEntry::NeedToGather;
6501 }
6502 PointerOps.push_back(Elt: SI->getPointerOperand());
6503 }
6504
6505 // Check the order of pointer operands.
6506 if (llvm::sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: CurrentOrder)) {
6507 Value *Ptr0;
6508 Value *PtrN;
6509 if (CurrentOrder.empty()) {
6510 Ptr0 = PointerOps.front();
6511 PtrN = PointerOps.back();
6512 } else {
6513 Ptr0 = PointerOps[CurrentOrder.front()];
6514 PtrN = PointerOps[CurrentOrder.back()];
6515 }
6516 std::optional<int> Dist =
6517 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
6518 // Check that the sorted pointer operands are consecutive.
6519 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6520 return TreeEntry::Vectorize;
6521 }
6522
6523 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6524 return TreeEntry::NeedToGather;
6525 }
6526 case Instruction::Call: {
6527 // Check if the calls are all to the same vectorizable intrinsic or
6528 // library function.
6529 CallInst *CI = cast<CallInst>(Val: VL0);
6530 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6531
6532 VFShape Shape = VFShape::get(
6533 FTy: CI->getFunctionType(),
6534 EC: ElementCount::getFixed(MinVal: static_cast<unsigned int>(VL.size())),
6535 HasGlobalPred: false /*HasGlobalPred*/);
6536 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6537
6538 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6539 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6540 return TreeEntry::NeedToGather;
6541 }
6542 Function *F = CI->getCalledFunction();
6543 unsigned NumArgs = CI->arg_size();
6544 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6545 for (unsigned J = 0; J != NumArgs; ++J)
6546 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J))
6547 ScalarArgs[J] = CI->getArgOperand(i: J);
6548 for (Value *V : VL) {
6549 CallInst *CI2 = dyn_cast<CallInst>(Val: V);
6550 if (!CI2 || CI2->getCalledFunction() != F ||
6551 getVectorIntrinsicIDForCall(CI: CI2, TLI) != ID ||
6552 (VecFunc &&
6553 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6554 !CI->hasIdenticalOperandBundleSchema(Other: *CI2)) {
6555 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6556 << "\n");
6557 return TreeEntry::NeedToGather;
6558 }
6559      // Some intrinsics have scalar arguments, and those arguments must be the
6560      // same across all calls for them to be vectorized.
6561 for (unsigned J = 0; J != NumArgs; ++J) {
6562 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J)) {
6563 Value *A1J = CI2->getArgOperand(i: J);
6564 if (ScalarArgs[J] != A1J) {
6565 LLVM_DEBUG(dbgs()
6566 << "SLP: mismatched arguments in call:" << *CI
6567 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6568 return TreeEntry::NeedToGather;
6569 }
6570 }
6571 }
6572 // Verify that the bundle operands are identical between the two calls.
6573 if (CI->hasOperandBundles() &&
6574 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6575 CI->op_begin() + CI->getBundleOperandsEndIndex(),
6576 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6577 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6578 << "!=" << *V << '\n');
6579 return TreeEntry::NeedToGather;
6580 }
6581 }
6582
6583 return TreeEntry::Vectorize;
6584 }
6585 case Instruction::ShuffleVector: {
6586    // If this is not an alternate sequence of opcodes like add-sub
6587    // then do not vectorize this instruction.
6588 if (!S.isAltShuffle()) {
6589 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6590 return TreeEntry::NeedToGather;
6591 }
6592 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6593 LLVM_DEBUG(
6594 dbgs()
6595 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6596 "the whole alt sequence is not profitable.\n");
6597 return TreeEntry::NeedToGather;
6598 }
6599
6600 return TreeEntry::Vectorize;
6601 }
6602 default:
6603 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6604 return TreeEntry::NeedToGather;
6605 }
6606}
6607
6608namespace {
6609/// Allows correct handling of operands of the phi nodes based on the \p Main
6610/// PHINode's order of incoming basic blocks/values.
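/// E.g., if \p Main has incoming blocks (BB0, BB1) but another phi in \p Phis
/// lists them as (BB1, BB0), that phi's incoming values are remapped so that
/// getOperands(I) always corresponds to \p Main's I-th incoming block.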
6611class PHIHandler {
6612 DominatorTree &DT;
6613 PHINode *Main = nullptr;
6614 SmallVector<Value *> Phis;
6615 SmallVector<SmallVector<Value *>> Operands;
6616
6617public:
6618 PHIHandler() = delete;
6619 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
6620 : DT(DT), Main(Main), Phis(Phis),
6621 Operands(Main->getNumIncomingValues(),
6622 SmallVector<Value *>(Phis.size(), nullptr)) {}
6623 void buildOperands() {
6624 constexpr unsigned FastLimit = 4;
6625 if (Main->getNumIncomingValues() <= FastLimit) {
6626 for (unsigned I : seq<unsigned>(Begin: 0, End: Main->getNumIncomingValues())) {
6627 BasicBlock *InBB = Main->getIncomingBlock(i: I);
6628 if (!DT.isReachableFromEntry(A: InBB)) {
6629 Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
6630 continue;
6631 }
6632 // Prepare the operand vector.
6633 for (auto [Idx, V] : enumerate(First&: Phis)) {
6634 auto *P = cast<PHINode>(Val: V);
6635 if (P->getIncomingBlock(i: I) == InBB)
6636 Operands[I][Idx] = P->getIncomingValue(i: I);
6637 else
6638 Operands[I][Idx] = P->getIncomingValueForBlock(BB: InBB);
6639 }
6640 }
6641 return;
6642 }
6643 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
6644 for (unsigned I : seq<unsigned>(Begin: 0, End: Main->getNumIncomingValues())) {
6645 BasicBlock *InBB = Main->getIncomingBlock(i: I);
6646 if (!DT.isReachableFromEntry(A: InBB)) {
6647 Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
6648 continue;
6649 }
6650 Blocks.try_emplace(Key: InBB).first->second.push_back(Elt: I);
6651 }
6652 for (auto [Idx, V] : enumerate(First&: Phis)) {
6653 auto *P = cast<PHINode>(Val: V);
6654 for (unsigned I : seq<unsigned>(Begin: 0, End: P->getNumIncomingValues())) {
6655 BasicBlock *InBB = P->getIncomingBlock(i: I);
6656 if (InBB == Main->getIncomingBlock(i: I)) {
6657 if (isa_and_nonnull<PoisonValue>(Val: Operands[I][Idx]))
6658 continue;
6659 Operands[I][Idx] = P->getIncomingValue(i: I);
6660 continue;
6661 }
6662 auto It = Blocks.find(Val: InBB);
6663 if (It == Blocks.end())
6664 continue;
6665 Operands[It->second.front()][Idx] = P->getIncomingValue(i: I);
6666 }
6667 }
6668 for (const auto &P : Blocks) {
6669 if (P.getSecond().size() <= 1)
6670 continue;
6671 unsigned BasicI = P.getSecond().front();
6672 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
6673 assert(all_of(enumerate(Operands[I]),
6674 [&](const auto &Data) {
6675 return !Data.value() ||
6676 Data.value() == Operands[BasicI][Data.index()];
6677 }) &&
6678 "Expected empty operands list.");
6679 Operands[I] = Operands[BasicI];
6680 }
6681 }
6682 }
6683 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
6684};
6685} // namespace
6686
6687void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6688 const EdgeInfo &UserTreeIdx) {
6689 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6690
6691 SmallVector<int> ReuseShuffleIndices;
6692 SmallVector<Value *> UniqueValues;
6693 SmallVector<Value *> NonUniqueValueVL;
6694 auto TryToFindDuplicates = [&](const InstructionsState &S,
6695 bool DoNotFail = false) {
6696 // Check that every instruction appears once in this bundle.
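    // E.g., VL == {a, b, a, b} yields UniqueValues == {a, b} and
    // ReuseShuffleIndices == {0, 1, 0, 1}.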
6697 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6698 for (Value *V : VL) {
6699 if (isConstant(V)) {
6700 ReuseShuffleIndices.emplace_back(
6701 Args: isa<UndefValue>(Val: V) ? PoisonMaskElem : UniqueValues.size());
6702 UniqueValues.emplace_back(Args&: V);
6703 continue;
6704 }
6705 auto Res = UniquePositions.try_emplace(Key: V, Args: UniqueValues.size());
6706 ReuseShuffleIndices.emplace_back(Args&: Res.first->second);
6707 if (Res.second)
6708 UniqueValues.emplace_back(Args&: V);
6709 }
6710 size_t NumUniqueScalarValues = UniqueValues.size();
6711 if (NumUniqueScalarValues == VL.size()) {
6712 ReuseShuffleIndices.clear();
6713 } else {
6714      // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6715 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6716 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6717 "for nodes with padding.\n");
6718 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6719 return false;
6720 }
6721 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6722 if (NumUniqueScalarValues <= 1 ||
6723 (UniquePositions.size() == 1 && all_of(Range&: UniqueValues,
6724 P: [](Value *V) {
6725 return isa<UndefValue>(Val: V) ||
6726 !isConstant(V);
6727 })) ||
6728 !llvm::has_single_bit<uint32_t>(Value: NumUniqueScalarValues)) {
6729 if (DoNotFail && UniquePositions.size() > 1 &&
6730 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6731 all_of(Range&: UniqueValues, P: [=](Value *V) {
6732 return isa<ExtractElementInst>(Val: V) ||
6733 areAllUsersVectorized(I: cast<Instruction>(Val: V),
6734 VectorizedVals: UserIgnoreList);
6735 })) {
6736 unsigned PWSz = PowerOf2Ceil(A: UniqueValues.size());
6737 if (PWSz == VL.size()) {
6738 ReuseShuffleIndices.clear();
6739 } else {
6740 NonUniqueValueVL.assign(in_start: UniqueValues.begin(), in_end: UniqueValues.end());
6741 NonUniqueValueVL.append(NumInputs: PWSz - UniqueValues.size(),
6742 Elt: UniqueValues.back());
6743 VL = NonUniqueValueVL;
6744 }
6745 return true;
6746 }
6747 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6748 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6749 return false;
6750 }
6751 VL = UniqueValues;
6752 }
6753 return true;
6754 };
6755
6756 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
6757
6758 // Don't vectorize ephemeral values.
6759 if (!EphValues.empty()) {
6760 for (Value *V : VL) {
6761 if (EphValues.count(Ptr: V)) {
6762 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6763 << ") is ephemeral.\n");
6764 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6765 return;
6766 }
6767 }
6768 }
6769
6770 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6771 // a load), in which case peek through to include it in the tree, without
6772 // ballooning over-budget.
6773 if (Depth >= RecursionMaxDepth &&
6774 !(S.MainOp && isa<Instruction>(Val: S.MainOp) && S.MainOp == S.AltOp &&
6775 VL.size() >= 4 &&
6776 (match(V: S.MainOp, P: m_Load(Op: m_Value())) || all_of(Range&: VL, P: [&S](const Value *I) {
6777 return match(V: I,
6778 P: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_OneUse(SubPattern: m_Load(Op: m_Value()))))) &&
6779 cast<Instruction>(Val: I)->getOpcode() ==
6780 cast<Instruction>(Val: S.MainOp)->getOpcode();
6781 })))) {
6782 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6783 if (TryToFindDuplicates(S))
6784 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6785 ReuseShuffleIndices);
6786 return;
6787 }
6788
6789 // Don't handle scalable vectors
6790 if (S.getOpcode() == Instruction::ExtractElement &&
6791 isa<ScalableVectorType>(
6792 Val: cast<ExtractElementInst>(Val: S.OpValue)->getVectorOperandType())) {
6793 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6794 if (TryToFindDuplicates(S))
6795 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6796 ReuseShuffleIndices);
6797 return;
6798 }
6799
6800 // Don't handle vectors.
6801 if (!SLPReVec && S.OpValue->getType()->isVectorTy() &&
6802 !isa<InsertElementInst>(Val: S.OpValue)) {
6803 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6804 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6805 return;
6806 }
6807
6808 if (StoreInst *SI = dyn_cast<StoreInst>(Val: S.OpValue))
6809 if (!SLPReVec && SI->getValueOperand()->getType()->isVectorTy()) {
6810 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6811 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6812 return;
6813 }
6814
6815 // If all of the operands are identical or constant we have a simple solution.
6816 // If we deal with insert/extract instructions, they all must have constant
6817 // indices, otherwise we should gather them, not try to vectorize.
6818  // If this is an alternate-op node with 2 elements and gathered operands, do
6819  // not vectorize.
6820 auto &&NotProfitableForVectorization = [&S, this,
6821 Depth](ArrayRef<Value *> VL) {
6822 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6823 return false;
6824 if (VectorizableTree.size() < MinTreeSize)
6825 return false;
6826 if (Depth >= RecursionMaxDepth - 1)
6827 return true;
6828    // Check if all operands are extracts, are part of a vector node, or can
6829    // build a regular vectorize node.
6830 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6831 for (Value *V : VL) {
6832 auto *I = cast<Instruction>(Val: V);
6833 InstsCount.push_back(Elt: count_if(Range: I->operand_values(), P: [](Value *Op) {
6834 return isa<Instruction>(Val: Op) || isVectorLikeInstWithConstOps(V: Op);
6835 }));
6836 }
6837 bool IsCommutative = isCommutative(I: S.MainOp) || isCommutative(I: S.AltOp);
6838 if ((IsCommutative &&
6839 std::accumulate(first: InstsCount.begin(), last: InstsCount.end(), init: 0) < 2) ||
6840 (!IsCommutative &&
6841 all_of(Range&: InstsCount, P: [](unsigned ICnt) { return ICnt < 2; })))
6842 return true;
6843 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6844 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6845 auto *I1 = cast<Instruction>(Val: VL.front());
6846 auto *I2 = cast<Instruction>(Val: VL.back());
6847 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6848 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
6849 Args: I2->getOperand(i: Op));
6850 if (static_cast<unsigned>(count_if(
6851 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6852 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
6853 })) >= S.MainOp->getNumOperands() / 2)
6854 return false;
6855 if (S.MainOp->getNumOperands() > 2)
6856 return true;
6857 if (IsCommutative) {
6858 // Check permuted operands.
6859 Candidates.clear();
6860 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6861 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
6862 Args: I2->getOperand(i: (Op + 1) % E));
6863 if (any_of(
6864 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6865 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
6866 }))
6867 return false;
6868 }
6869 return true;
6870 };
6871 SmallVector<unsigned> SortedIndices;
6872 BasicBlock *BB = nullptr;
6873 bool IsScatterVectorizeUserTE =
6874 UserTreeIdx.UserTE &&
6875 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6876 bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
6877 bool AreScatterAllGEPSameBlock =
6878 (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() &&
6879 VL.size() > 2 &&
6880 all_of(Range&: VL,
6881 P: [&BB](Value *V) {
6882 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6883 if (!I)
6884 return doesNotNeedToBeScheduled(V);
6885 if (!BB)
6886 BB = I->getParent();
6887 return BB == I->getParent() && I->getNumOperands() == 2;
6888 }) &&
6889 BB &&
6890 sortPtrAccesses(VL, ElemTy: UserTreeIdx.UserTE->getMainOp()->getType(), DL: *DL, SE&: *SE,
6891 SortedIndices));
6892 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
6893 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6894 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6895 Val: S.OpValue) &&
6896 !all_of(Range&: VL, P: isVectorLikeInstWithConstOps)) ||
6897 NotProfitableForVectorization(VL)) {
6898 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6899 if (TryToFindDuplicates(S))
6900 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6901 ReuseShuffleIndices);
6902 return;
6903 }
6904
6905 // We now know that this is a vector of instructions of the same type from
6906 // the same block.
6907
6908 // Check if this is a duplicate of another entry.
6909 if (TreeEntry *E = getTreeEntry(V: S.OpValue)) {
6910 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6911 if (!E->isSame(VL)) {
6912 auto It = MultiNodeScalars.find(Val: S.OpValue);
6913 if (It != MultiNodeScalars.end()) {
6914 auto *TEIt = find_if(Range&: It->getSecond(),
6915 P: [&](TreeEntry *ME) { return ME->isSame(VL); });
6916 if (TEIt != It->getSecond().end())
6917 E = *TEIt;
6918 else
6919 E = nullptr;
6920 } else {
6921 E = nullptr;
6922 }
6923 }
6924 if (!E) {
6925 if (!doesNotNeedToBeScheduled(V: S.OpValue)) {
6926 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6927 if (TryToFindDuplicates(S))
6928 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6929 ReuseShuffleIndices);
6930 return;
6931 }
6932 } else {
6933      // Record the reuse of the tree node. FIXME: currently this is only used
6934      // to properly draw the graph rather than for the actual vectorization.
6935 E->UserTreeIndices.push_back(Elt: UserTreeIdx);
6936 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6937 << ".\n");
6938 return;
6939 }
6940 }
6941
6942 // Check that none of the instructions in the bundle are already in the tree.
6943 for (Value *V : VL) {
6944 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(Val: V)) ||
6945 doesNotNeedToBeScheduled(V))
6946 continue;
6947 if (getTreeEntry(V)) {
6948 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6949 << ") is already in tree.\n");
6950 if (TryToFindDuplicates(S))
6951 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6952 ReuseShuffleIndices);
6953 return;
6954 }
6955 }
6956
6957 // The reduction nodes (stored in UserIgnoreList) should also stay scalar.
6958 if (UserIgnoreList && !UserIgnoreList->empty()) {
6959 for (Value *V : VL) {
6960 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6961 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6962 if (TryToFindDuplicates(S))
6963 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6964 ReuseShuffleIndices);
6965 return;
6966 }
6967 }
6968 }
6969
6970 // Special processing for sorted pointers for ScatterVectorize node with
6971 // constant indices only.
6972 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
6973 assert(S.OpValue->getType()->isPointerTy() &&
6974 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6975 "Expected pointers only.");
6976 // Reset S to make it GetElementPtr kind of node.
6977 const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
6978 assert(It != VL.end() && "Expected at least one GEP.");
6979 S = getSameOpcode(VL: *It, TLI: *TLI);
6980 }
6981
6982 // Check that all of the users of the scalars that we want to vectorize are
6983 // schedulable.
6984 auto *VL0 = cast<Instruction>(Val: S.OpValue);
6985 BB = VL0->getParent();
6986
6987 if (!DT->isReachableFromEntry(A: BB)) {
6988 // Don't go into unreachable blocks. They may contain instructions with
6989 // dependency cycles which confuse the final scheduling.
6990 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6991 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6992 return;
6993 }
6994
6995 // Don't go into catchswitch blocks, which can happen with PHIs.
6996 // Such blocks can only have PHIs and the catchswitch. There is no
6997 // place to insert a shuffle if we need to, so just avoid that issue.
6998 if (isa<CatchSwitchInst>(Val: BB->getTerminator())) {
6999 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
7000 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
7001 return;
7002 }
7003
7004 // Check that every instruction appears once in this bundle.
7005 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
7006 return;
7007
7008 // Perform specific checks for each particular instruction kind.
7009 OrdersType CurrentOrder;
7010 SmallVector<Value *> PointerOps;
7011 TreeEntry::EntryState State = getScalarsVectorizationState(
7012 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
7013 if (State == TreeEntry::NeedToGather) {
7014 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
7015 ReuseShuffleIndices);
7016 return;
7017 }
7018
7019 auto &BSRef = BlocksSchedules[BB];
7020 if (!BSRef)
7021 BSRef = std::make_unique<BlockScheduling>(args&: BB);
7022
7023 BlockScheduling &BS = *BSRef;
7024
7025 std::optional<ScheduleData *> Bundle =
7026 BS.tryScheduleBundle(VL: UniqueValues, SLP: this, S);
7027#ifdef EXPENSIVE_CHECKS
7028 // Make sure we didn't break any internal invariants
7029 BS.verify();
7030#endif
7031 if (!Bundle) {
7032 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
7033 assert((!BS.getScheduleData(VL0) ||
7034 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
7035 "tryScheduleBundle should cancelScheduling on failure");
7036 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
7037 ReuseShuffleIndices);
7038 NonScheduledFirst.insert(Ptr: VL.front());
7039 return;
7040 }
7041 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
7042
7043 unsigned ShuffleOrOp = S.isAltShuffle() ?
7044 (unsigned) Instruction::ShuffleVector : S.getOpcode();
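  // Alternate-opcode bundles (e.g. an interleaved mix of add and sub) are
  // handled below as a ShuffleVector node: both opcodes are vectorized and
  // the results are later blended with the mask built by
  // buildAltOpShuffleMask().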
7045 switch (ShuffleOrOp) {
7046 case Instruction::PHI: {
7047 auto *PH = cast<PHINode>(Val: VL0);
7048
7049 TreeEntry *TE =
7050 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
7051 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
7052
7053 // Keeps the reordered operands to avoid code duplication.
7054 PHIHandler Handler(*DT, PH, VL);
7055 Handler.buildOperands();
7056 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumOperands()))
7057 TE->setOperand(OpIdx: I, OpVL: Handler.getOperands(I));
7058 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumOperands()))
7059 buildTree_rec(VL: Handler.getOperands(I), Depth: Depth + 1, UserTreeIdx: {TE, I});
7060 return;
7061 }
7062 case Instruction::ExtractValue:
7063 case Instruction::ExtractElement: {
7064 if (CurrentOrder.empty()) {
7065 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
7066 } else {
7067 LLVM_DEBUG({
7068 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
7069 "with order";
7070 for (unsigned Idx : CurrentOrder)
7071 dbgs() << " " << Idx;
7072 dbgs() << "\n";
7073 });
7074 fixupOrderingIndices(Order: CurrentOrder);
7075 }
7076 // Create the tree entry for the extracts, recording the (fixed-up)
7077 // reordering indices, if any.
7078 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7079 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
7080 // This is a special case, as it does not gather, but at the same time
7081 // we are not extending buildTree_rec() towards the operands.
7082 ValueList Op0;
7083 Op0.assign(NumElts: VL.size(), Elt: VL0->getOperand(i: 0));
7084 VectorizableTree.back()->setOperand(OpIdx: 0, OpVL: Op0);
7085 return;
7086 }
7087 case Instruction::InsertElement: {
7088 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
7089
7090 auto OrdCompare = [](const std::pair<int, int> &P1,
7091 const std::pair<int, int> &P2) {
7092 return P1.first > P2.first;
7093 };
7094 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
7095 decltype(OrdCompare)>
7096 Indices(OrdCompare);
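      // The two loops below compute, for each scalar in VL, the rank of its
      // insertelement lane. A sketch (not from an actual test case): if
      // VL[0..3] insert into lanes 2, 0, 1, 3, CurrentOrder becomes
      // {2, 0, 1, 3} and IsIdentity is false; inserts into lanes 0, 1, 2, 3
      // keep the identity order and CurrentOrder is cleared.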
7097 for (int I = 0, E = VL.size(); I < E; ++I) {
7098 unsigned Idx = *getElementIndex(Inst: VL[I]);
7099 Indices.emplace(args&: Idx, args&: I);
7100 }
7101 OrdersType CurrentOrder(VL.size(), VL.size());
7102 bool IsIdentity = true;
7103 for (int I = 0, E = VL.size(); I < E; ++I) {
7104 CurrentOrder[Indices.top().second] = I;
7105 IsIdentity &= Indices.top().second == I;
7106 Indices.pop();
7107 }
7108 if (IsIdentity)
7109 CurrentOrder.clear();
7110 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7111 ReuseShuffleIndices: std::nullopt, ReorderIndices: CurrentOrder);
7112 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
7113
7114 TE->setOperandsInOrder();
7115 buildTree_rec(VL: TE->getOperand(OpIdx: 1), Depth: Depth + 1, UserTreeIdx: {TE, 1});
7116 return;
7117 }
7118 case Instruction::Load: {
7119 // Check that a vectorized load would load the same memory as a scalar
7120 // load. For example, we don't want to vectorize loads of types smaller
7121 // than 8 bits. Even though a packed struct {<i2, i2, i2, i2>} is loaded
7122 // and stored by LLVM as an i8, vectorizing loads/stores from such a
7123 // struct would read/write packed bits and disagree with the unvectorized
7124 // version.
7125 TreeEntry *TE = nullptr;
7126 fixupOrderingIndices(Order: CurrentOrder);
7127 switch (State) {
7128 case TreeEntry::Vectorize:
7129 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7130 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
7131 if (CurrentOrder.empty())
7132 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
7133 else
7134 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
7135 TE->setOperandsInOrder();
7136 break;
7137 case TreeEntry::StridedVectorize:
7138 // Vectorizing loads with a constant stride as strided loads.
7139 TE = newTreeEntry(VL, EntryState: TreeEntry::StridedVectorize, Bundle, S,
7140 UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
7141 TE->setOperandsInOrder();
7142 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
7143 break;
7144 case TreeEntry::ScatterVectorize:
7145 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
7146 TE = newTreeEntry(VL, EntryState: TreeEntry::ScatterVectorize, Bundle, S,
7147 UserTreeIdx, ReuseShuffleIndices);
7148 TE->setOperandsInOrder();
7149 buildTree_rec(VL: PointerOps, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7150 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
7151 break;
7152 case TreeEntry::NeedToGather:
7153 llvm_unreachable("Unexpected loads state.");
7154 }
7155 return;
7156 }
7157 case Instruction::ZExt:
7158 case Instruction::SExt:
7159 case Instruction::FPToUI:
7160 case Instruction::FPToSI:
7161 case Instruction::FPExt:
7162 case Instruction::PtrToInt:
7163 case Instruction::IntToPtr:
7164 case Instruction::SIToFP:
7165 case Instruction::UIToFP:
7166 case Instruction::Trunc:
7167 case Instruction::FPTrunc:
7168 case Instruction::BitCast: {
7169 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
7170 u: std::make_pair(x: std::numeric_limits<unsigned>::min(),
7171 y: std::numeric_limits<unsigned>::max()));
7172 if (ShuffleOrOp == Instruction::ZExt ||
7173 ShuffleOrOp == Instruction::SExt) {
7174 CastMaxMinBWSizes = std::make_pair(
7175 x: std::max<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
7176 b: PrevMaxBW),
7177 y: std::min<unsigned>(
7178 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
7179 b: PrevMinBW));
7180 } else if (ShuffleOrOp == Instruction::Trunc) {
7181 CastMaxMinBWSizes = std::make_pair(
7182 x: std::max<unsigned>(
7183 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
7184 b: PrevMaxBW),
7185 y: std::min<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
7186 b: PrevMinBW));
7187 ExtraBitWidthNodes.insert(V: VectorizableTree.size() + 1);
7188 } else if (ShuffleOrOp == Instruction::SIToFP ||
7189 ShuffleOrOp == Instruction::UIToFP) {
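        // E.g. (illustrative): for `sitofp i32 %x to float` where %x is
        // known to have at least 17 sign bits (say %x = sext i16 %y to i32),
        // half of the source bits are redundant, so the node is recorded in
        // ExtraBitWidthNodes and the later bitwidth analysis may narrow the
        // conversion source.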
7190 unsigned NumSignBits =
7191 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
7192 if (auto *OpI = dyn_cast<Instruction>(Val: VL0->getOperand(i: 0))) {
7193 APInt Mask = DB->getDemandedBits(I: OpI);
7194 NumSignBits = std::max(a: NumSignBits, b: Mask.countl_zero());
7195 }
7196 if (NumSignBits * 2 >=
7197 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
7198 ExtraBitWidthNodes.insert(V: VectorizableTree.size() + 1);
7199 }
7200 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7201 ReuseShuffleIndices);
7202 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
7203
7204 TE->setOperandsInOrder();
7205 for (unsigned I : seq<unsigned>(Begin: 0, End: VL0->getNumOperands()))
7206 buildTree_rec(VL: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
7207 return;
7208 }
7209 case Instruction::ICmp:
7210 case Instruction::FCmp: {
7211 // Check that all of the compares have the same predicate.
7212 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
7213 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7214 ReuseShuffleIndices);
7215 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
7216
7217 ValueList Left, Right;
7218 if (cast<CmpInst>(Val: VL0)->isCommutative()) {
7219 // Commutative predicate - collect + sort operands of the instructions
7220 // so that each side is more likely to have the same opcode.
7221 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
7222 "Commutative Predicate mismatch");
7223 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
7224 } else {
7225 // Collect operands - commute if it uses the swapped predicate.
7226 for (Value *V : VL) {
7227 auto *Cmp = cast<CmpInst>(Val: V);
7228 Value *LHS = Cmp->getOperand(i_nocapture: 0);
7229 Value *RHS = Cmp->getOperand(i_nocapture: 1);
7230 if (Cmp->getPredicate() != P0)
7231 std::swap(a&: LHS, b&: RHS);
7232 Left.push_back(Elt: LHS);
7233 Right.push_back(Elt: RHS);
7234 }
7235 }
7236 TE->setOperand(OpIdx: 0, OpVL: Left);
7237 TE->setOperand(OpIdx: 1, OpVL: Right);
7238 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7239 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
7240 if (ShuffleOrOp == Instruction::ICmp) {
7241 unsigned NumSignBits0 =
7242 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
7243 if (NumSignBits0 * 2 >=
7244 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
7245 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
7246 unsigned NumSignBits1 =
7247 ComputeNumSignBits(Op: VL0->getOperand(i: 1), DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
7248 if (NumSignBits1 * 2 >=
7249 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 1)->getType()))
7250 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 1)->Idx);
7251 }
7252 return;
7253 }
7254 case Instruction::Select:
7255 case Instruction::FNeg:
7256 case Instruction::Add:
7257 case Instruction::FAdd:
7258 case Instruction::Sub:
7259 case Instruction::FSub:
7260 case Instruction::Mul:
7261 case Instruction::FMul:
7262 case Instruction::UDiv:
7263 case Instruction::SDiv:
7264 case Instruction::FDiv:
7265 case Instruction::URem:
7266 case Instruction::SRem:
7267 case Instruction::FRem:
7268 case Instruction::Shl:
7269 case Instruction::LShr:
7270 case Instruction::AShr:
7271 case Instruction::And:
7272 case Instruction::Or:
7273 case Instruction::Xor: {
7274 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7275 ReuseShuffleIndices);
7276 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
7277
7278 // Sort operands of the instructions so that each side is more likely to
7279 // have the same opcode.
7280 if (isa<BinaryOperator>(Val: VL0) && isCommutative(I: VL0)) {
7281 ValueList Left, Right;
7282 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
7283 TE->setOperand(OpIdx: 0, OpVL: Left);
7284 TE->setOperand(OpIdx: 1, OpVL: Right);
7285 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7286 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
7287 return;
7288 }
7289
7290 TE->setOperandsInOrder();
7291 for (unsigned I : seq<unsigned>(Begin: 0, End: VL0->getNumOperands()))
7292 buildTree_rec(VL: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
7293 return;
7294 }
7295 case Instruction::GetElementPtr: {
7296 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7297 ReuseShuffleIndices);
7298 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
7299 SmallVector<ValueList, 2> Operands(2);
7300 // Prepare the operand vector for pointer operands.
7301 for (Value *V : VL) {
7302 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
7303 if (!GEP) {
7304 Operands.front().push_back(Elt: V);
7305 continue;
7306 }
7307 Operands.front().push_back(Elt: GEP->getPointerOperand());
7308 }
7309 TE->setOperand(OpIdx: 0, OpVL: Operands.front());
7310 // Need to cast all indices to the same type before vectorization to
7311 // avoid a crash.
7312 // Required to be able to find correct matches between different gather
7313 // nodes and reuse the vectorized values rather than trying to gather them
7314 // again.
7315 int IndexIdx = 1;
7316 Type *VL0Ty = VL0->getOperand(i: IndexIdx)->getType();
7317 Type *Ty = all_of(Range&: VL,
7318 P: [VL0Ty, IndexIdx](Value *V) {
7319 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
7320 if (!GEP)
7321 return true;
7322 return VL0Ty == GEP->getOperand(i_nocapture: IndexIdx)->getType();
7323 })
7324 ? VL0Ty
7325 : DL->getIndexType(PtrTy: cast<GetElementPtrInst>(Val: VL0)
7326 ->getPointerOperandType()
7327 ->getScalarType());
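      // E.g. (illustrative): for a bundle { getelementptr i32, ptr %p, i64 1;
      // getelementptr i32, ptr %p, i32 2 } the index types differ, so Ty
      // falls back to the DataLayout index type and both indices are folded
      // to it below.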
7328 // Prepare the operand vector.
7329 for (Value *V : VL) {
7330 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
7331 if (!I) {
7332 Operands.back().push_back(
7333 Elt: ConstantInt::get(Ty, V: 0, /*isSigned=*/IsSigned: false));
7334 continue;
7335 }
7336 auto *Op = I->getOperand(i_nocapture: IndexIdx);
7337 auto *CI = dyn_cast<ConstantInt>(Val: Op);
7338 if (!CI)
7339 Operands.back().push_back(Elt: Op);
7340 else
7341 Operands.back().push_back(Elt: ConstantFoldIntegerCast(
7342 C: CI, DestTy: Ty, IsSigned: CI->getValue().isSignBitSet(), DL: *DL));
7343 }
7344 TE->setOperand(OpIdx: IndexIdx, OpVL: Operands.back());
7345
7346 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7347 buildTree_rec(VL: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
7348 return;
7349 }
7350 case Instruction::Store: {
7351 bool Consecutive = CurrentOrder.empty();
7352 if (!Consecutive)
7353 fixupOrderingIndices(Order: CurrentOrder);
7354 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7355 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
7356 TE->setOperandsInOrder();
7357 buildTree_rec(VL: TE->getOperand(OpIdx: 0), Depth: Depth + 1, UserTreeIdx: {TE, 0});
7358 if (Consecutive)
7359 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7360 else
7361 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7362 return;
7363 }
7364 case Instruction::Call: {
7365 // Check if the calls are all to the same vectorizable intrinsic or
7366 // library function.
7367 CallInst *CI = cast<CallInst>(Val: VL0);
7368 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7369
7370 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7371 ReuseShuffleIndices);
7372 // Sort operands of the instructions so that each side is more likely to
7373 // have the same opcode.
7374 if (isCommutative(I: VL0)) {
7375 ValueList Left, Right;
7376 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
7377 TE->setOperand(OpIdx: 0, OpVL: Left);
7378 TE->setOperand(OpIdx: 1, OpVL: Right);
7379 SmallVector<ValueList> Operands;
7380 for (unsigned I : seq<unsigned>(Begin: 2, End: CI->arg_size())) {
7381 Operands.emplace_back();
7382 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I))
7383 continue;
7384 for (Value *V : VL) {
7385 auto *CI2 = cast<CallInst>(Val: V);
7386 Operands.back().push_back(Elt: CI2->getArgOperand(i: I));
7387 }
7388 TE->setOperand(OpIdx: I, OpVL: Operands.back());
7389 }
7390 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7391 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
7392 for (unsigned I : seq<unsigned>(Begin: 2, End: CI->arg_size())) {
7393 if (Operands[I - 2].empty())
7394 continue;
7395 buildTree_rec(VL: Operands[I - 2], Depth: Depth + 1, UserTreeIdx: {TE, I});
7396 }
7397 return;
7398 }
7399 TE->setOperandsInOrder();
7400 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
7401 // For scalar operands there is no need to create an entry since there is
7402 // nothing to vectorize.
7403 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I))
7404 continue;
7405 ValueList Operands;
7406 // Prepare the operand vector.
7407 for (Value *V : VL) {
7408 auto *CI2 = cast<CallInst>(Val: V);
7409 Operands.push_back(Elt: CI2->getArgOperand(i: I));
7410 }
7411 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, I});
7412 }
7413 return;
7414 }
7415 case Instruction::ShuffleVector: {
7416 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7417 ReuseShuffleIndices);
7418 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7419
7420 // Reorder operands if reordering would enable vectorization.
7421 auto *CI = dyn_cast<CmpInst>(Val: VL0);
7422 if (isa<BinaryOperator>(Val: VL0) || CI) {
7423 ValueList Left, Right;
7424 if (!CI || all_of(Range&: VL, P: [](Value *V) {
7425 return cast<CmpInst>(Val: V)->isCommutative();
7426 })) {
7427 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
7428 } else {
7429 auto *MainCI = cast<CmpInst>(Val: S.MainOp);
7430 auto *AltCI = cast<CmpInst>(Val: S.AltOp);
7431 CmpInst::Predicate MainP = MainCI->getPredicate();
7432 CmpInst::Predicate AltP = AltCI->getPredicate();
7433 assert(MainP != AltP &&
7434 "Expected different main/alternate predicates.");
7435 // Collect operands - commute if it uses the swapped predicate or
7436 // alternate operation.
7437 for (Value *V : VL) {
7438 auto *Cmp = cast<CmpInst>(Val: V);
7439 Value *LHS = Cmp->getOperand(i_nocapture: 0);
7440 Value *RHS = Cmp->getOperand(i_nocapture: 1);
7441
7442 if (isAlternateInstruction(I: Cmp, MainOp: MainCI, AltOp: AltCI, TLI: *TLI)) {
7443 if (AltP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
7444 std::swap(a&: LHS, b&: RHS);
7445 } else {
7446 if (MainP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
7447 std::swap(a&: LHS, b&: RHS);
7448 }
7449 Left.push_back(Elt: LHS);
7450 Right.push_back(Elt: RHS);
7451 }
7452 }
7453 TE->setOperand(OpIdx: 0, OpVL: Left);
7454 TE->setOperand(OpIdx: 1, OpVL: Right);
7455 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7456 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
7457 return;
7458 }
7459
7460 TE->setOperandsInOrder();
7461 for (unsigned I : seq<unsigned>(Begin: 0, End: VL0->getNumOperands()))
7462 buildTree_rec(VL: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
7463 return;
7464 }
7465 default:
7466 break;
7467 }
7468 llvm_unreachable("Unexpected vectorization of the instructions.");
7469}
7470
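// Illustrative behaviour (sketch): a homogeneous aggregate such as
// {float, float, float, float} or [4 x i32] maps to 4 elements of its scalar
// type; a non-homogeneous struct like {i32, float}, or a type whose widened
// form falls outside the [MinVecRegSize, MaxVecRegSize] range or differs in
// store size from the original type, yields 0.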
7471unsigned BoUpSLP::canMapToVector(Type *T) const {
7472 unsigned N = 1;
7473 Type *EltTy = T;
7474
7475 while (isa<StructType, ArrayType, FixedVectorType>(Val: EltTy)) {
7476 if (auto *ST = dyn_cast<StructType>(Val: EltTy)) {
7477 // Check that struct is homogeneous.
7478 for (const auto *Ty : ST->elements())
7479 if (Ty != *ST->element_begin())
7480 return 0;
7481 N *= ST->getNumElements();
7482 EltTy = *ST->element_begin();
7483 } else if (auto *AT = dyn_cast<ArrayType>(Val: EltTy)) {
7484 N *= AT->getNumElements();
7485 EltTy = AT->getElementType();
7486 } else {
7487 auto *VT = cast<FixedVectorType>(Val: EltTy);
7488 N *= VT->getNumElements();
7489 EltTy = VT->getElementType();
7490 }
7491 }
7492
7493 if (!isValidElementType(Ty: EltTy))
7494 return 0;
7495 uint64_t VTSize = DL->getTypeStoreSizeInBits(Ty: getWidenedType(ScalarTy: EltTy, VF: N));
7496 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7497 VTSize != DL->getTypeStoreSizeInBits(Ty: T))
7498 return 0;
7499 return N;
7500}
7501
7502bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7503 SmallVectorImpl<unsigned> &CurrentOrder,
7504 bool ResizeAllowed) const {
7505 const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst, ExtractValueInst>);
7506 assert(It != VL.end() && "Expected at least one extract instruction.");
7507 auto *E0 = cast<Instruction>(Val: *It);
7508 assert(
7509 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7510 "Invalid opcode");
7511 // Check if all of the extracts come from the same vector and from the
7512 // correct offset.
7513 Value *Vec = E0->getOperand(i: 0);
7514
7515 CurrentOrder.clear();
7516
7517 // We have to extract from a vector/aggregate with the same number of elements.
7518 unsigned NElts;
7519 if (E0->getOpcode() == Instruction::ExtractValue) {
7520 NElts = canMapToVector(T: Vec->getType());
7521 if (!NElts)
7522 return false;
7523 // Check if load can be rewritten as load of vector.
7524 LoadInst *LI = dyn_cast<LoadInst>(Val: Vec);
7525 if (!LI || !LI->isSimple() || !LI->hasNUses(N: VL.size()))
7526 return false;
7527 } else {
7528 NElts = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
7529 }
7530
7531 unsigned E = VL.size();
7532 if (!ResizeAllowed && NElts != E)
7533 return false;
7534 SmallVector<int> Indices(E, PoisonMaskElem);
7535 unsigned MinIdx = NElts, MaxIdx = 0;
7536 for (auto [I, V] : enumerate(First&: VL)) {
7537 auto *Inst = dyn_cast<Instruction>(Val: V);
7538 if (!Inst)
7539 continue;
7540 if (Inst->getOperand(i: 0) != Vec)
7541 return false;
7542 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst))
7543 if (isa<UndefValue>(Val: EE->getIndexOperand()))
7544 continue;
7545 std::optional<unsigned> Idx = getExtractIndex(E: Inst);
7546 if (!Idx)
7547 return false;
7548 const unsigned ExtIdx = *Idx;
7549 if (ExtIdx >= NElts)
7550 continue;
7551 Indices[I] = ExtIdx;
7552 if (MinIdx > ExtIdx)
7553 MinIdx = ExtIdx;
7554 if (MaxIdx < ExtIdx)
7555 MaxIdx = ExtIdx;
7556 }
7557 if (MaxIdx - MinIdx + 1 > E)
7558 return false;
7559 if (MaxIdx + 1 <= E)
7560 MinIdx = 0;
7561
7562 // Check that all of the indices extract from the correct offset.
7563 bool ShouldKeepOrder = true;
7564 // Assign to all items the initial value E so we can check if the extract
7565 // instruction index was used already.
7566 // Also, later we can check that all the indices are used and we have a
7567 // consecutive access in the extract instructions, by checking that no
7568 // element of CurrentOrder still has value E.
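  // E.g. (illustrative): extracts with indices 1, 0, 2, 3 from one 4-element
  // vector produce CurrentOrder = {1, 0, 2, 3} and the function returns
  // false, while in-order indices 0, 1, 2, 3 clear CurrentOrder and return
  // true.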
7569 CurrentOrder.assign(NumElts: E, Elt: E);
7570 for (unsigned I = 0; I < E; ++I) {
7571 if (Indices[I] == PoisonMaskElem)
7572 continue;
7573 const unsigned ExtIdx = Indices[I] - MinIdx;
7574 if (CurrentOrder[ExtIdx] != E) {
7575 CurrentOrder.clear();
7576 return false;
7577 }
7578 ShouldKeepOrder &= ExtIdx == I;
7579 CurrentOrder[ExtIdx] = I;
7580 }
7581 if (ShouldKeepOrder)
7582 CurrentOrder.clear();
7583
7584 return ShouldKeepOrder;
7585}
7586
7587bool BoUpSLP::areAllUsersVectorized(
7588 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7589 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(V: I))) ||
7590 all_of(Range: I->users(), P: [this](User *U) {
7591 return ScalarToTreeEntry.contains(Val: U) ||
7592 isVectorLikeInstWithConstOps(V: U) ||
7593 (isa<ExtractElementInst>(Val: U) && MustGather.contains(Ptr: U));
7594 });
7595}
7596
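/// Returns the cost of vectorizing the call \p CI as a target intrinsic and
/// the cost of calling a matching vector library function. The library cost
/// is only computed when such a function exists and the call is not
/// 'nobuiltin'; otherwise it defaults to the intrinsic cost.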
7597static std::pair<InstructionCost, InstructionCost>
7598getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7599 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7600 ArrayRef<Type *> ArgTys) {
7601 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7602
7603 // Calculate the costs of the vector intrinsic and vector library calls.
7604 FastMathFlags FMF;
7605 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: CI))
7606 FMF = FPCI->getFastMathFlags();
7607 SmallVector<const Value *> Arguments(CI->args());
7608 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7609 dyn_cast<IntrinsicInst>(Val: CI));
7610 auto IntrinsicCost =
7611 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind: TTI::TCK_RecipThroughput);
7612
7613 auto Shape = VFShape::get(FTy: CI->getFunctionType(),
7614 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
7615 HasGlobalPred: false /*HasGlobalPred*/);
7616 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7617 auto LibCost = IntrinsicCost;
7618 if (!CI->isNoBuiltin() && VecFunc) {
7619 // Calculate the cost of the vector library call.
7620 // If the corresponding vector call is cheaper, return its cost.
7621 LibCost =
7622 TTI->getCallInstrCost(F: nullptr, RetTy: VecTy, Tys: ArgTys, CostKind: TTI::TCK_RecipThroughput);
7623 }
7624 return {IntrinsicCost, LibCost};
7625}
7626
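// E.g. (illustrative): for a 4-wide alternate-opcode node {add, sub, add,
// sub} with no reordering or reuse, the routine below produces the mask
// <0, 5, 2, 7>: main-opcode lanes select from the first vector, alternate
// lanes from the second.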
7627void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7628 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7629 SmallVectorImpl<Value *> *OpScalars,
7630 SmallVectorImpl<Value *> *AltScalars) const {
7631 unsigned Sz = Scalars.size();
7632 Mask.assign(NumElts: Sz, Elt: PoisonMaskElem);
7633 SmallVector<int> OrderMask;
7634 if (!ReorderIndices.empty())
7635 inversePermutation(Indices: ReorderIndices, Mask&: OrderMask);
7636 for (unsigned I = 0; I < Sz; ++I) {
7637 unsigned Idx = I;
7638 if (!ReorderIndices.empty())
7639 Idx = OrderMask[I];
7640 auto *OpInst = cast<Instruction>(Val: Scalars[Idx]);
7641 if (IsAltOp(OpInst)) {
7642 Mask[I] = Sz + Idx;
7643 if (AltScalars)
7644 AltScalars->push_back(Elt: OpInst);
7645 } else {
7646 Mask[I] = Idx;
7647 if (OpScalars)
7648 OpScalars->push_back(Elt: OpInst);
7649 }
7650 }
7651 if (!ReuseShuffleIndices.empty()) {
7652 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7653 transform(Range: ReuseShuffleIndices, d_first: NewMask.begin(), F: [&Mask](int Idx) {
7654 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7655 });
7656 Mask.swap(RHS&: NewMask);
7657 }
7658}
7659
7660static bool isAlternateInstruction(const Instruction *I,
7661 const Instruction *MainOp,
7662 const Instruction *AltOp,
7663 const TargetLibraryInfo &TLI) {
7664 if (auto *MainCI = dyn_cast<CmpInst>(Val: MainOp)) {
7665 auto *AltCI = cast<CmpInst>(Val: AltOp);
7666 CmpInst::Predicate MainP = MainCI->getPredicate();
7667 CmpInst::Predicate AltP = AltCI->getPredicate();
7668 assert(MainP != AltP && "Expected different main/alternate predicates.");
7669 auto *CI = cast<CmpInst>(Val: I);
7670 if (isCmpSameOrSwapped(BaseCI: MainCI, CI, TLI))
7671 return false;
7672 if (isCmpSameOrSwapped(BaseCI: AltCI, CI, TLI))
7673 return true;
7674 CmpInst::Predicate P = CI->getPredicate();
7675 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(pred: P);
7676
7677 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7678 "CmpInst expected to match either main or alternate predicate or "
7679 "their swap.");
7680 (void)AltP;
7681 return MainP != P && MainP != SwappedP;
7682 }
7683 return I->getOpcode() == AltOp->getOpcode();
7684}
7685
7686TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7687 assert(!Ops.empty());
7688 const auto *Op0 = Ops.front();
7689
7690 const bool IsConstant = all_of(Range&: Ops, P: [](Value *V) {
7691 // TODO: We should allow undef elements here
7692 return isConstant(V) && !isa<UndefValue>(Val: V);
7693 });
7694 const bool IsUniform = all_of(Range&: Ops, P: [=](Value *V) {
7695 // TODO: We should allow undef elements here
7696 return V == Op0;
7697 });
7698 const bool IsPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
7699 // TODO: We should allow undef elements here
7700 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
7701 return CI->getValue().isPowerOf2();
7702 return false;
7703 });
7704 const bool IsNegatedPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
7705 // TODO: We should allow undef elements here
7706 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
7707 return CI->getValue().isNegatedPowerOf2();
7708 return false;
7709 });
7710
7711 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7712 if (IsConstant && IsUniform)
7713 VK = TTI::OK_UniformConstantValue;
7714 else if (IsConstant)
7715 VK = TTI::OK_NonUniformConstantValue;
7716 else if (IsUniform)
7717 VK = TTI::OK_UniformValue;
7718
7719 TTI::OperandValueProperties VP = TTI::OP_None;
7720 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7721 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7722
7723 return {.Kind: VK, .Properties: VP};
7724}
7725
7726namespace {
7727/// The base class for shuffle instruction emission and shuffle cost estimation.
7728class BaseShuffleAnalysis {
7729protected:
7730 /// Checks if the mask is an identity mask.
7731 /// \param IsStrict if true, the function returns false if the mask size does
7732 /// not match the vector size.
7733 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7734 bool IsStrict) {
7735 int Limit = Mask.size();
7736 int VF = VecTy->getNumElements();
7737 int Index = -1;
7738 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Limit))
7739 return true;
7740 if (!IsStrict) {
7741 // Consider extract subvector starting from index 0.
7742 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
7743 Index == 0)
7744 return true;
7745 // All VF-size submasks are identity (e.g.
7746 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7747 if (Limit % VF == 0 && all_of(Range: seq<int>(Begin: 0, End: Limit / VF), P: [=](int Idx) {
7748 ArrayRef<int> Slice = Mask.slice(N: Idx * VF, M: VF);
7749 return all_of(Range&: Slice, P: [](int I) { return I == PoisonMaskElem; }) ||
7750 ShuffleVectorInst::isIdentityMask(Mask: Slice, NumSrcElts: VF);
7751 }))
7752 return true;
7753 }
7754 return false;
7755 }
7756
7757 /// Tries to combine 2 different masks into a single one.
7758 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7759 /// change the size of the vector, \p LocalVF is the original size of the
7760 /// shuffled vector.
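/// E.g. (illustrative): an inner swap mask <1, 0> over a 2-element source
/// (LocalVF == 2) combined with the outer mask <1, 0> yields <0, 1>, i.e.
/// two swaps fold into an identity.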
7761 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7762 ArrayRef<int> ExtMask) {
7763 unsigned VF = Mask.size();
7764 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7765 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7766 if (ExtMask[I] == PoisonMaskElem)
7767 continue;
7768 int MaskedIdx = Mask[ExtMask[I] % VF];
7769 NewMask[I] =
7770 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7771 }
7772 Mask.swap(RHS&: NewMask);
7773 }
7774
7775 /// Looks through shuffles trying to reduce the final number of shuffles in
7776 /// the code. The function looks through the previously emitted shuffle
7777 /// instructions and properly marks indices in the mask as undef.
7778 /// For example, given the code
7779 /// \code
7780 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7781 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7782 /// \endcode
7783 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>,
7784 /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
7785 /// <0, 1, 2, 3> for the shuffle.
7786 /// If the 2 operands are of different sizes, the smaller one will be resized
7787 /// and the mask recalculated properly.
7788 /// For example, given the code
7789 /// \code
7790 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7791 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7792 /// \endcode
7793 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>,
7794 /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
7795 /// <0, 1, 2, 3> for the shuffle.
7796 /// So, it tries to transform permutations to simple vector merge, if
7797 /// possible.
7798 /// \param V The input vector which must be shuffled using the given \p Mask.
7799 /// If the better candidate is found, \p V is set to this best candidate
7800 /// vector.
7801 /// \param Mask The input mask for the shuffle. If the best candidate is found
7802 /// during looking-through-shuffles attempt, it is updated accordingly.
7803 /// \param SinglePermute true if the shuffle operation is originally a
7804 /// single-value-permutation. In this case the look-through-shuffles procedure
7805 /// may look for resizing shuffles as the best candidates.
7806 /// \return true if the shuffle results in the non-resizing identity shuffle
7807 /// (and thus can be ignored), false otherwise.
7808 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7809 bool SinglePermute) {
7810 Value *Op = V;
7811 ShuffleVectorInst *IdentityOp = nullptr;
7812 SmallVector<int> IdentityMask;
7813 while (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Op)) {
7814 // Exit if this is not a fixed vector type or a size-changing shuffle.
7815 auto *SVTy = dyn_cast<FixedVectorType>(Val: SV->getType());
7816 if (!SVTy)
7817 break;
7818 // Remember the identity or broadcast mask, if it is not a resizing
7819 // shuffle. If no better candidates are found, this Op and Mask will be
7820 // used in the final shuffle.
7821 if (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/false)) {
7822 if (!IdentityOp || !SinglePermute ||
7823 (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/true) &&
7824 !ShuffleVectorInst::isZeroEltSplatMask(Mask: IdentityMask,
7825 NumSrcElts: IdentityMask.size()))) {
7826 IdentityOp = SV;
7827 // Store the current mask in IdentityMask so that we do not lose this
7828 // info later if IdentityOp is selected as the best candidate for the
7829 // permutation.
7830 IdentityMask.assign(RHS: Mask);
7831 }
7832 }
7833 // Remember the broadcast mask. If no better candidates are found, this Op
7834 // and Mask will be used in the final shuffle.
7835 // Zero splat can be used as identity too, since it might be used with
7836 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7837 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
7838 // is expensive, and the analysis finds out that the source vector is just
7839 // a broadcast, the original mask can be transformed into the identity mask
7840 // <0, 1, 2, 3>.
7841 // \code
7842 // %0 = shuffle %v, poison, zeroinitalizer
7843 // %res = shuffle %0, poison, <3, 1, 2, 0>
7844 // \endcode
7845 // may be transformed to
7846 // \code
7847 // %0 = shuffle %v, poison, zeroinitalizer
7848 // %res = shuffle %0, poison, <0, 1, 2, 3>
7849 // \endcode
7850 if (SV->isZeroEltSplat()) {
7851 IdentityOp = SV;
7852 IdentityMask.assign(RHS: Mask);
7853 }
7854 int LocalVF = Mask.size();
7855 if (auto *SVOpTy =
7856 dyn_cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType()))
7857 LocalVF = SVOpTy->getNumElements();
7858 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7859 for (auto [Idx, I] : enumerate(First&: Mask)) {
7860 if (I == PoisonMaskElem ||
7861 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7862 continue;
7863 ExtMask[Idx] = SV->getMaskValue(Elt: I);
7864 }
7865 bool IsOp1Undef =
7866 isUndefVector(V: SV->getOperand(i_nocapture: 0),
7867 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::FirstArg))
7868 .all();
7869 bool IsOp2Undef =
7870 isUndefVector(V: SV->getOperand(i_nocapture: 1),
7871 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::SecondArg))
7872 .all();
7873 if (!IsOp1Undef && !IsOp2Undef) {
7874 // Update mask and mark undef elems.
7875 for (int &I : Mask) {
7876 if (I == PoisonMaskElem)
7877 continue;
7878 if (SV->getMaskValue(Elt: I % SV->getShuffleMask().size()) ==
7879 PoisonMaskElem)
7880 I = PoisonMaskElem;
7881 }
7882 break;
7883 }
7884 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7885 SV->getShuffleMask().end());
7886 combineMasks(LocalVF, Mask&: ShuffleMask, ExtMask: Mask);
7887 Mask.swap(RHS&: ShuffleMask);
7888 if (IsOp2Undef)
7889 Op = SV->getOperand(i_nocapture: 0);
7890 else
7891 Op = SV->getOperand(i_nocapture: 1);
7892 }
7893 if (auto *OpTy = dyn_cast<FixedVectorType>(Val: Op->getType());
7894 !OpTy || !isIdentityMask(Mask, VecTy: OpTy, IsStrict: SinglePermute) ||
7895 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())) {
7896 if (IdentityOp) {
7897 V = IdentityOp;
7898 assert(Mask.size() == IdentityMask.size() &&
7899 "Expected masks of same sizes.");
7900 // Clear known poison elements.
7901 for (auto [I, Idx] : enumerate(First&: Mask))
7902 if (Idx == PoisonMaskElem)
7903 IdentityMask[I] = PoisonMaskElem;
7904 Mask.swap(RHS&: IdentityMask);
7905 auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: V);
7906 return SinglePermute &&
7907 (isIdentityMask(Mask, VecTy: cast<FixedVectorType>(Val: V->getType()),
7908 /*IsStrict=*/true) ||
7909 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7910 Shuffle->isZeroEltSplat() &&
7911 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())));
7912 }
7913 V = Op;
7914 return false;
7915 }
7916 V = Op;
7917 return true;
7918 }
7919
7920 /// Smart shuffle instruction emission, walks through shuffles trees and
7921 /// tries to find the best matching vector for the actual shuffle
7922 /// instruction.
7923 template <typename T, typename ShuffleBuilderTy>
7924 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7925 ShuffleBuilderTy &Builder) {
7926 assert(V1 && "Expected at least one vector value.");
7927 if (V2)
7928 Builder.resizeToMatch(V1, V2);
7929 int VF = Mask.size();
7930 if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
7931 VF = FTy->getNumElements();
7932 if (V2 &&
7933 !isUndefVector(V: V2, UseMask: buildUseMask(VF, Mask, MaskArg: UseMask::SecondArg)).all()) {
7934 // Peek through shuffles.
7935 Value *Op1 = V1;
7936 Value *Op2 = V2;
7937 int VF =
7938 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
7939 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7940 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7941 for (int I = 0, E = Mask.size(); I < E; ++I) {
7942 if (Mask[I] < VF)
7943 CombinedMask1[I] = Mask[I];
7944 else
7945 CombinedMask2[I] = Mask[I] - VF;
7946 }
7947 Value *PrevOp1;
7948 Value *PrevOp2;
7949 do {
7950 PrevOp1 = Op1;
7951 PrevOp2 = Op2;
7952 (void)peekThroughShuffles(V&: Op1, Mask&: CombinedMask1, /*SinglePermute=*/false);
7953 (void)peekThroughShuffles(V&: Op2, Mask&: CombinedMask2, /*SinglePermute=*/false);
7954 // Check if we have 2 resizing shuffles - need to peek through operands
7955 // again.
7956 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Val: Op1))
7957 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Val: Op2)) {
7958 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7959 for (auto [Idx, I] : enumerate(First&: CombinedMask1)) {
7960 if (I == PoisonMaskElem)
7961 continue;
7962 ExtMask1[Idx] = SV1->getMaskValue(Elt: I);
7963 }
7964 SmallBitVector UseMask1 = buildUseMask(
7965 VF: cast<FixedVectorType>(Val: SV1->getOperand(i_nocapture: 1)->getType())
7966 ->getNumElements(),
7967 Mask: ExtMask1, MaskArg: UseMask::SecondArg);
7968 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7969 for (auto [Idx, I] : enumerate(First&: CombinedMask2)) {
7970 if (I == PoisonMaskElem)
7971 continue;
7972 ExtMask2[Idx] = SV2->getMaskValue(Elt: I);
7973 }
7974 SmallBitVector UseMask2 = buildUseMask(
7975 VF: cast<FixedVectorType>(Val: SV2->getOperand(i_nocapture: 1)->getType())
7976 ->getNumElements(),
7977 Mask: ExtMask2, MaskArg: UseMask::SecondArg);
7978 if (SV1->getOperand(i_nocapture: 0)->getType() ==
7979 SV2->getOperand(i_nocapture: 0)->getType() &&
7980 SV1->getOperand(i_nocapture: 0)->getType() != SV1->getType() &&
7981 isUndefVector(V: SV1->getOperand(i_nocapture: 1), UseMask: UseMask1).all() &&
7982 isUndefVector(V: SV2->getOperand(i_nocapture: 1), UseMask: UseMask2).all()) {
7983 Op1 = SV1->getOperand(i_nocapture: 0);
7984 Op2 = SV2->getOperand(i_nocapture: 0);
7985 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7986 SV1->getShuffleMask().end());
7987 int LocalVF = ShuffleMask1.size();
7988 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op1->getType()))
7989 LocalVF = FTy->getNumElements();
7990 combineMasks(LocalVF, Mask&: ShuffleMask1, ExtMask: CombinedMask1);
7991 CombinedMask1.swap(RHS&: ShuffleMask1);
7992 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7993 SV2->getShuffleMask().end());
7994 LocalVF = ShuffleMask2.size();
7995 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op2->getType()))
7996 LocalVF = FTy->getNumElements();
7997 combineMasks(LocalVF, Mask&: ShuffleMask2, ExtMask: CombinedMask2);
7998 CombinedMask2.swap(RHS&: ShuffleMask2);
7999 }
8000 }
8001 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
8002 Builder.resizeToMatch(Op1, Op2);
8003 VF = std::max(a: cast<VectorType>(Val: Op1->getType())
8004 ->getElementCount()
8005 .getKnownMinValue(),
8006 b: cast<VectorType>(Val: Op2->getType())
8007 ->getElementCount()
8008 .getKnownMinValue());
8009 for (int I = 0, E = Mask.size(); I < E; ++I) {
8010 if (CombinedMask2[I] != PoisonMaskElem) {
8011 assert(CombinedMask1[I] == PoisonMaskElem &&
8012 "Expected undefined mask element");
8013 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
8014 }
8015 }
8016 if (Op1 == Op2 &&
8017 (ShuffleVectorInst::isIdentityMask(Mask: CombinedMask1, NumSrcElts: VF) ||
8018 (ShuffleVectorInst::isZeroEltSplatMask(Mask: CombinedMask1, NumSrcElts: VF) &&
8019 isa<ShuffleVectorInst>(Val: Op1) &&
8020 cast<ShuffleVectorInst>(Val: Op1)->getShuffleMask() ==
8021 ArrayRef(CombinedMask1))))
8022 return Builder.createIdentity(Op1);
8023 return Builder.createShuffleVector(
8024 Op1, Op1 == Op2 ? PoisonValue::get(T: Op1->getType()) : Op2,
8025 CombinedMask1);
8026 }
8027 if (isa<PoisonValue>(Val: V1))
8028 return Builder.createPoison(
8029 cast<VectorType>(Val: V1->getType())->getElementType(), Mask.size());
8030 SmallVector<int> NewMask(Mask.begin(), Mask.end());
8031 bool IsIdentity = peekThroughShuffles(V&: V1, Mask&: NewMask, /*SinglePermute=*/true);
8032 assert(V1 && "Expected non-null value after looking through shuffles.");
8033
8034 if (!IsIdentity)
8035 return Builder.createShuffleVector(V1, NewMask);
8036 return Builder.createIdentity(V1);
8037 }
8038};
8039} // namespace
8040
8041/// Returns the cost of the shuffle instructions with the given \p Kind, vector
8042/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
8043/// subvector pattern.
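/// E.g. (illustrative): a two-source permute mask like
/// <0, 1, 2, 3, 4, 5, poison, poison> over <4 x i32> sources appends the
/// first half of the second source to the first one; such masks may be
/// costed as an insert-subvector into a widened vector rather than as a
/// generic two-source permute.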
8044static InstructionCost
8045getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
8046 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
8047 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
8048 int Index = 0, VectorType *SubTp = nullptr,
8049 ArrayRef<const Value *> Args = std::nullopt) {
8050 if (Kind != TTI::SK_PermuteTwoSrc)
8051 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8052 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
8053 int NumSubElts;
8054 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
8055 Mask, NumSrcElts, NumSubElts, Index)) {
8056 if (Index + NumSubElts > NumSrcElts &&
8057 Index + NumSrcElts <= static_cast<int>(Mask.size()))
8058 return TTI.getShuffleCost(
8059 Kind: TTI::SK_InsertSubvector,
8060 Tp: getWidenedType(ScalarTy: Tp->getElementType(), VF: Mask.size()), Mask,
8061 CostKind: TTI::TCK_RecipThroughput, Index, SubTp: Tp);
8062 }
8063 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8064}
8065
8066/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
8067static std::pair<InstructionCost, InstructionCost>
8068getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
8069 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
8070 Type *ScalarTy, VectorType *VecTy) {
8071 InstructionCost ScalarCost = 0;
8072 InstructionCost VecCost = 0;
8073 // Here we differentiate two cases: (1) when Ptrs represent a regular
8074 // vectorization tree node (as they are pointer arguments of scattered
8075 // loads) or (2) when Ptrs are the arguments of loads or stores being
8076 // vectorized as a plain wide unit-stride load/store since all the
8077 // loads/stores are known to be from/to adjacent locations.
8078 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
8079 // Case 2: estimate the pointer-related costs when vectorizing to
8080 // a wide load/store.
8081 // Scalar cost is estimated as a set of pointers with known relationship
8082 // between them.
8083 // For vector code we will use BasePtr as argument for the wide load/store
8084 // but we also need to account for all the instructions which are going to
8085 // stay in vectorized code due to uses outside of these scalar
8086 // loads/stores.
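    // E.g. (illustrative): for four stores to p[0], p[1], p[2], p[3]
    // addressed through GEPs, the scalar cost covers the whole unit-stride
    // pointer chain, while the vector cost only covers the pointers that
    // survive vectorization (the base pointer plus any GEPs with uses
    // outside the scalar stores).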
8087 ScalarCost = TTI.getPointersChainCost(
8088 Ptrs, Base: BasePtr, Info: TTI::PointersChainInfo::getUnitStride(), AccessTy: ScalarTy,
8089 CostKind);
8090
8091 SmallVector<const Value *> PtrsRetainedInVecCode;
8092 for (Value *V : Ptrs) {
8093 if (V == BasePtr) {
8094 PtrsRetainedInVecCode.push_back(Elt: V);
8095 continue;
8096 }
8097 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
8098 // For simplicity assume Ptr stays in vectorized code if it's not a
8099 // GEP instruction. We don't care since its cost is considered free.
8100 // TODO: We should check for any uses outside of vectorizable tree
8101 // rather than just single use.
8102 if (!Ptr || !Ptr->hasOneUse())
8103 PtrsRetainedInVecCode.push_back(Elt: V);
8104 }
8105
8106 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
8107 // If all pointers stay in vectorized code then we don't have
8108 // any savings on that.
8109 return std::make_pair(x: TTI::TCC_Free, y: TTI::TCC_Free);
8110 }
8111 VecCost = TTI.getPointersChainCost(Ptrs: PtrsRetainedInVecCode, Base: BasePtr,
8112 Info: TTI::PointersChainInfo::getKnownStride(),
8113 AccessTy: VecTy, CostKind);
8114 } else {
8115 // Case 1: Ptrs are the arguments of loads that we are going to transform
8116 // into masked gather load intrinsic.
8117 // All the scalar GEPs will be removed as a result of vectorization.
8118 // For any external uses of some lanes, extractelement instructions will
8119 // be generated (whose cost is estimated separately).
8120 TTI::PointersChainInfo PtrsInfo =
8121 all_of(Range&: Ptrs,
8122 P: [](const Value *V) {
8123 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
8124 return Ptr && !Ptr->hasAllConstantIndices();
8125 })
8126 ? TTI::PointersChainInfo::getUnknownStride()
8127 : TTI::PointersChainInfo::getKnownStride();
8128
8129 ScalarCost =
8130 TTI.getPointersChainCost(Ptrs, Base: BasePtr, Info: PtrsInfo, AccessTy: ScalarTy, CostKind);
8131 auto *BaseGEP = dyn_cast<GEPOperator>(Val: BasePtr);
8132 if (!BaseGEP) {
8133 auto *It = find_if(Range&: Ptrs, P: IsaPred<GEPOperator>);
8134 if (It != Ptrs.end())
8135 BaseGEP = cast<GEPOperator>(Val: *It);
8136 }
8137 if (BaseGEP) {
8138 SmallVector<const Value *> Indices(BaseGEP->indices());
8139 VecCost = TTI.getGEPCost(PointeeType: BaseGEP->getSourceElementType(),
8140 Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: VecTy,
8141 CostKind);
8142 }
8143 }
8144
8145 return std::make_pair(x&: ScalarCost, y&: VecCost);
8146}
8147
8148void BoUpSLP::transformNodes() {
8149 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8150 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8151 TreeEntry &E = *TE;
8152 switch (E.getOpcode()) {
8153 case Instruction::Load: {
8154 // No need to reorder masked gather loads, just reorder the scalar
8155 // operands.
8156 if (E.State != TreeEntry::Vectorize)
8157 break;
8158 Type *ScalarTy = E.getMainOp()->getType();
8159 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
8160 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E.Scalars);
8161 // Check if profitable to represent consecutive load + reverse as strided
8162 // load with stride -1.
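      // E.g. (illustrative): loads of p[3], p[2], p[1], p[0] would otherwise
      // be emitted as a consecutive <4 x ty> load of p[0..3] plus a reverse
      // shuffle; a single strided load starting at &p[3] with a negative
      // stride may be cheaper on targets with legal strided accesses.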
8163 if (isReverseOrder(Order: E.ReorderIndices) &&
8164 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
8165 SmallVector<int> Mask;
8166 inversePermutation(Indices: E.ReorderIndices, Mask);
8167 auto *BaseLI = cast<LoadInst>(Val: E.Scalars.back());
8168 InstructionCost OriginalVecCost =
8169 TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: BaseLI->getAlign(),
8170 AddressSpace: BaseLI->getPointerAddressSpace(), CostKind,
8171 OpdInfo: TTI::OperandValueInfo()) +
8172 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
8173 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8174 Opcode: Instruction::Load, DataTy: VecTy, Ptr: BaseLI->getPointerOperand(),
8175 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: BaseLI);
8176 if (StridedCost < OriginalVecCost)
8177 // Strided load is more profitable than consecutive load + reverse -
8178 // transform the node to strided load.
8179 E.State = TreeEntry::StridedVectorize;
8180 }
8181 break;
8182 }
8183 case Instruction::Store: {
8184 Type *ScalarTy =
8185 cast<StoreInst>(Val: E.getMainOp())->getValueOperand()->getType();
8186 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
8187 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E.Scalars);
8188 // Check if profitable to represent consecutive store + reverse as strided
8189 // store with stride -1.
8190 if (isReverseOrder(Order: E.ReorderIndices) &&
8191 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
8192 SmallVector<int> Mask;
8193 inversePermutation(Indices: E.ReorderIndices, Mask);
8194 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.back());
8195 InstructionCost OriginalVecCost =
8196 TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
8197 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind,
8198 OpdInfo: TTI::OperandValueInfo()) +
8199 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
8200 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8201 Opcode: Instruction::Store, DataTy: VecTy, Ptr: BaseSI->getPointerOperand(),
8202 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: BaseSI);
8203 if (StridedCost < OriginalVecCost)
8204 // Strided store is more profitable than consecutive store + reverse -
8205 // transform the node to strided store.
8206 E.State = TreeEntry::StridedVectorize;
8207 }
8208 break;
8209 }
8210 default:
8211 break;
8212 }
8213 }
8214}
8215
8216/// Merges shuffle masks and emits the final shuffle instruction, if required.
8217/// It supports shuffling of 2 input vectors. It implements lazy shuffle
8218/// emission: the actual shuffle instruction is generated only if it is
8219/// actually required. Otherwise, the shuffle instruction emission is delayed
8220/// till the end of the process, to reduce the number of emitted instructions
8221/// and simplify further analysis/transformations.
8222class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
8223 bool IsFinalized = false;
8224 SmallVector<int> CommonMask;
8225 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
8226 Type *ScalarTy = nullptr;
8227 const TargetTransformInfo &TTI;
8228 InstructionCost Cost = 0;
8229 SmallDenseSet<Value *> VectorizedVals;
8230 BoUpSLP &R;
8231 SmallPtrSetImpl<Value *> &CheckedExtracts;
8232 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8233 /// While set, we are still trying to estimate the cost for the same nodes and
8234 /// can delay the actual cost estimation (virtual shuffle instruction emission).
8235 /// This may help to better estimate the cost if the same nodes must be
8236 /// permuted, and allows moving most of the long-shuffle cost estimation to TTI.
8237 bool SameNodesEstimated = true;
8238
8239 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
8240 if (Ty->getScalarType()->isPointerTy()) {
8241 Constant *Res = ConstantExpr::getIntToPtr(
8242 C: ConstantInt::getAllOnesValue(
8243 Ty: IntegerType::get(C&: Ty->getContext(),
8244 NumBits: DL.getTypeStoreSizeInBits(Ty: Ty->getScalarType()))),
8245 Ty: Ty->getScalarType());
8246 if (auto *VTy = dyn_cast<VectorType>(Val: Ty))
8247 Res = ConstantVector::getSplat(EC: VTy->getElementCount(), Elt: Res);
8248 return Res;
8249 }
8250 return Constant::getAllOnesValue(Ty);
8251 }
8252
8253 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
8254 if ((!Root && allConstant(VL)) || all_of(Range&: VL, P: IsaPred<UndefValue>))
8255 return TTI::TCC_Free;
8256 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
8257 InstructionCost GatherCost = 0;
8258 SmallVector<Value *> Gathers(VL.begin(), VL.end());
8259 // Improve gather cost for gather of loads, if we can group some of the
8260 // loads into vector loads.
8261 InstructionsState S = getSameOpcode(VL, TLI: *R.TLI);
8262 const unsigned Sz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
8263 unsigned MinVF = R.getMinVF(Sz: 2 * Sz);
8264 if (VL.size() > 2 &&
8265 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
8266 (InVectors.empty() &&
8267 any_of(Range: seq<unsigned>(Begin: 0, End: VL.size() / MinVF),
8268 P: [&](unsigned Idx) {
8269 ArrayRef<Value *> SubVL = VL.slice(N: Idx * MinVF, M: MinVF);
8270 InstructionsState S = getSameOpcode(VL: SubVL, TLI: *R.TLI);
8271 return S.getOpcode() == Instruction::Load &&
8272 !S.isAltShuffle();
8273 }))) &&
8274 !all_of(Range&: Gathers, P: [&](Value *V) { return R.getTreeEntry(V); }) &&
8275 !isSplat(VL: Gathers)) {
8276 InstructionCost BaseCost = R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root, ScalarTy);
8277 SetVector<Value *> VectorizedLoads;
8278 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
8279 SmallVector<unsigned> ScatterVectorized;
8280 unsigned StartIdx = 0;
8281 unsigned VF = VL.size() / 2;
8282 for (; VF >= MinVF; VF /= 2) {
8283 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
8284 Cnt += VF) {
8285 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
8286 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
8287 InstructionsState SliceS = getSameOpcode(VL: Slice, TLI: *R.TLI);
8288 if (SliceS.getOpcode() != Instruction::Load ||
8289 SliceS.isAltShuffle())
8290 continue;
8291 }
8292 if (!VectorizedLoads.count(key: Slice.front()) &&
8293 !VectorizedLoads.count(key: Slice.back()) && allSameBlock(VL: Slice)) {
8294 SmallVector<Value *> PointerOps;
8295 OrdersType CurrentOrder;
8296 LoadsState LS = R.canVectorizeLoads(VL: Slice, VL0: Slice.front(),
8297 Order&: CurrentOrder, PointerOps);
8298 switch (LS) {
8299 case LoadsState::Vectorize:
8300 case LoadsState::ScatterVectorize:
8301 case LoadsState::StridedVectorize:
8302 // Mark the vectorized loads so that we don't vectorize them
8303 // again.
8304 // TODO: better handling of loads with reorders.
8305 if (((LS == LoadsState::Vectorize ||
8306 LS == LoadsState::StridedVectorize) &&
8307 CurrentOrder.empty()) ||
8308 (LS == LoadsState::StridedVectorize &&
8309 isReverseOrder(Order: CurrentOrder)))
8310 VectorizedStarts.emplace_back(Args&: Cnt, Args&: LS);
8311 else
8312 ScatterVectorized.push_back(Elt: Cnt);
8313 VectorizedLoads.insert(Start: Slice.begin(), End: Slice.end());
8314 // If we vectorized initial block, no need to try to vectorize
8315 // it again.
8316 if (Cnt == StartIdx)
8317 StartIdx += VF;
8318 break;
8319 case LoadsState::Gather:
8320 break;
8321 }
8322 }
8323 }
8324 // Check if the whole array was vectorized already - exit.
8325 if (StartIdx >= VL.size())
8326 break;
8327 // Found vectorizable parts - exit.
8328 if (!VectorizedLoads.empty())
8329 break;
8330 }
8331 if (!VectorizedLoads.empty()) {
8332 unsigned NumParts = TTI.getNumberOfParts(Tp: VecTy);
8333 bool NeedInsertSubvectorAnalysis =
8334 !NumParts || (VL.size() / VF) > NumParts;
8335 // Get the cost for gathered loads.
8336 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
8337 if (VectorizedLoads.contains(key: VL[I]))
8338 continue;
8339 GatherCost +=
8340 getBuildVectorCost(VL: VL.slice(N: I, M: std::min(a: End - I, b: VF)), Root);
8341 }
8342 // Exclude potentially vectorized loads from list of gathered
8343 // scalars.
8344 Gathers.assign(NumElts: Gathers.size(), Elt: PoisonValue::get(T: VL.front()->getType()));
8345 // The cost for vectorized loads.
8346 InstructionCost ScalarsCost = 0;
8347 for (Value *V : VectorizedLoads) {
8348 auto *LI = cast<LoadInst>(Val: V);
8349 ScalarsCost +=
8350 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LI->getType(),
8351 Alignment: LI->getAlign(), AddressSpace: LI->getPointerAddressSpace(),
8352 CostKind, OpdInfo: TTI::OperandValueInfo(), I: LI);
8353 }
8354 auto *LoadTy = getWidenedType(ScalarTy: VL.front()->getType(), VF);
8355 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
8356 auto *LI = cast<LoadInst>(Val: VL[P.first]);
8357 Align Alignment = LI->getAlign();
8358 GatherCost +=
8359 P.second == LoadsState::Vectorize
8360 ? TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadTy, Alignment,
8361 AddressSpace: LI->getPointerAddressSpace(), CostKind,
8362 OpdInfo: TTI::OperandValueInfo(), I: LI)
8363 : TTI.getStridedMemoryOpCost(
8364 Opcode: Instruction::Load, DataTy: LoadTy, Ptr: LI->getPointerOperand(),
8365 /*VariableMask=*/false, Alignment, CostKind, I: LI);
8366 // Estimate GEP cost.
8367 SmallVector<Value *> PointerOps(VF);
8368 for (auto [I, V] : enumerate(First: VL.slice(N: P.first, M: VF)))
8369 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
8370 auto [ScalarGEPCost, VectorGEPCost] =
8371 getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: LI->getPointerOperand(),
8372 Opcode: Instruction::Load, CostKind, ScalarTy: LI->getType(), VecTy: LoadTy);
8373 GatherCost += VectorGEPCost - ScalarGEPCost;
8374 }
8375 for (unsigned P : ScatterVectorized) {
8376 auto *LI0 = cast<LoadInst>(Val: VL[P]);
8377 ArrayRef<Value *> Slice = VL.slice(N: P, M: VF);
8378 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: Slice);
8379 GatherCost += TTI.getGatherScatterOpCost(
8380 Opcode: Instruction::Load, DataTy: LoadTy, Ptr: LI0->getPointerOperand(),
8381 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: LI0);
8382 // Estimate GEP cost.
8383 SmallVector<Value *> PointerOps(VF);
8384 for (auto [I, V] : enumerate(First&: Slice))
8385 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
8386 OrdersType Order;
8387 if (sortPtrAccesses(VL: PointerOps, ElemTy: LI0->getType(), DL: *R.DL, SE&: *R.SE,
8388 SortedIndices&: Order)) {
8389 // TODO: improve checks if GEPs can be vectorized.
8390 Value *Ptr0 = PointerOps.front();
8391 Type *ScalarTy = Ptr0->getType();
8392 auto *VecTy = getWidenedType(ScalarTy, VF);
8393 auto [ScalarGEPCost, VectorGEPCost] =
8394 getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: Ptr0, Opcode: Instruction::GetElementPtr,
8395 CostKind, ScalarTy, VecTy);
8396 GatherCost += VectorGEPCost - ScalarGEPCost;
8397 if (!Order.empty()) {
8398 SmallVector<int> Mask;
8399 inversePermutation(Indices: Order, Mask);
8400 GatherCost += ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc,
8401 Tp: VecTy, Mask, CostKind);
8402 }
8403 } else {
8404 GatherCost += R.getGatherCost(VL: PointerOps, /*ForPoisonSrc=*/true,
8405 ScalarTy: PointerOps.front()->getType());
8406 }
8407 }
8408 if (NeedInsertSubvectorAnalysis) {
8409 // Add the cost for the subvectors insert.
8410 SmallVector<int> ShuffleMask(VL.size());
8411 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8412 for (unsigned Idx : seq<unsigned>(Begin: 0, End: E))
8413 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8414 GatherCost += TTI.getShuffleCost(Kind: TTI::SK_InsertSubvector, Tp: VecTy,
8415 Mask: ShuffleMask, CostKind, Index: I, SubTp: LoadTy);
8416 }
8417 }
8418 GatherCost -= ScalarsCost;
8419 }
8420 GatherCost = std::min(a: BaseCost, b: GatherCost);
8421 } else if (!Root && isSplat(VL)) {
// Found a broadcast of a single scalar; calculate the cost as a broadcast.
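// E.g., for VL = {x, x, x, x} the estimate is one insertelement plus one
// SK_Broadcast shuffle; for VL = {x, undef, undef, undef} a single
// insertelement is enough and no shuffle cost is added.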
8424 const auto *It = find_if_not(Range&: VL, P: IsaPred<UndefValue>);
8425 assert(It != VL.end() && "Expected at least one non-undef value.");
8426 // Add broadcast for non-identity shuffle only.
8427 bool NeedShuffle =
8428 count(Range&: VL, Element: *It) > 1 &&
8429 (VL.front() != *It || !all_of(Range: VL.drop_front(), P: IsaPred<UndefValue>));
8430 if (!NeedShuffle)
8431 return TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy,
8432 CostKind, Index: std::distance(first: VL.begin(), last: It),
8433 Op0: PoisonValue::get(T: VecTy), Op1: *It);
8434
8435 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8436 transform(Range&: VL, d_first: ShuffleMask.begin(), F: [](Value *V) {
8437 return isa<PoisonValue>(Val: V) ? PoisonMaskElem : 0;
8438 });
8439 InstructionCost InsertCost =
8440 TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: 0,
8441 Op0: PoisonValue::get(T: VecTy), Op1: *It);
8442 return InsertCost + TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast,
8443 Tp: VecTy, Mask: ShuffleMask, CostKind,
8444 /*Index=*/0, /*SubTp=*/nullptr,
8445 /*Args=*/*It);
8446 }
8447 return GatherCost +
8448 (all_of(Range&: Gathers, P: IsaPred<UndefValue>)
8449 ? TTI::TCC_Free
8450 : R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root && VL.equals(RHS: Gathers),
8451 ScalarTy));
}
8453
8454 /// Compute the cost of creating a vector containing the extracted values from
8455 /// \p VL.
8456 InstructionCost
8457 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8458 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8459 unsigned NumParts) {
8460 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8461 unsigned NumElts =
8462 std::accumulate(first: VL.begin(), last: VL.end(), init: 0, binary_op: [](unsigned Sz, Value *V) {
8463 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
8464 if (!EE)
8465 return Sz;
8466 auto *VecTy = dyn_cast<FixedVectorType>(Val: EE->getVectorOperandType());
8467 if (!VecTy)
8468 return Sz;
8469 return std::max(a: Sz, b: VecTy->getNumElements());
8470 });
8471 // FIXME: this must be moved to TTI for better estimation.
8472 unsigned EltsPerVector = getPartNumElems(Size: VL.size(), NumParts);
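// The lambda below checks whether a sub-mask reads from at most two physical
// registers and rewrites its indices to be register-local. E.g., with two
// 4-element registers (EltsPerVector == 4), the sub-mask <5, 7, poison, 4>
// touches only the second register, so it is rewritten to <1, 3, poison, 0>
// and costed as an extract of that register plus a cheap single-source
// permute.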
8473 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
8474 SmallVectorImpl<unsigned> &Indices)
8475 -> std::optional<TTI::ShuffleKind> {
8476 if (NumElts <= EltsPerVector)
8477 return std::nullopt;
8478 int OffsetReg0 =
8479 alignDown(Value: std::accumulate(first: Mask.begin(), last: Mask.end(), INT_MAX,
8480 binary_op: [](int S, int I) {
8481 if (I == PoisonMaskElem)
8482 return S;
8483 return std::min(a: S, b: I);
8484 }),
8485 Align: EltsPerVector);
8486 int OffsetReg1 = OffsetReg0;
8487 DenseSet<int> RegIndices;
// Check whether we are trying to permute the same single input vector or two
// input vectors.
8489 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8490 int FirstRegId = -1;
8491 Indices.assign(NumElts: 1, Elt: OffsetReg0);
8492 for (auto [Pos, I] : enumerate(First&: Mask)) {
8493 if (I == PoisonMaskElem)
8494 continue;
8495 int Idx = I - OffsetReg0;
8496 int RegId =
8497 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
8498 if (FirstRegId < 0)
8499 FirstRegId = RegId;
8500 RegIndices.insert(V: RegId);
8501 if (RegIndices.size() > 2)
8502 return std::nullopt;
8503 if (RegIndices.size() == 2) {
8504 ShuffleKind = TTI::SK_PermuteTwoSrc;
8505 if (Indices.size() == 1) {
8506 OffsetReg1 = alignDown(
8507 Value: std::accumulate(
8508 first: std::next(x: Mask.begin(), n: Pos), last: Mask.end(), INT_MAX,
8509 binary_op: [&](int S, int I) {
8510 if (I == PoisonMaskElem)
8511 return S;
8512 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
8513 ((I - OffsetReg0) % NumElts) / EltsPerVector;
8514 if (RegId == FirstRegId)
8515 return S;
8516 return std::min(a: S, b: I);
8517 }),
8518 Align: EltsPerVector);
8519 Indices.push_back(Elt: OffsetReg1 % NumElts);
8520 }
8521 Idx = I - OffsetReg1;
8522 }
8523 I = (Idx % NumElts) % EltsPerVector +
8524 (RegId == FirstRegId ? 0 : EltsPerVector);
8525 }
8526 return ShuffleKind;
8527 };
8528 InstructionCost Cost = 0;
8529
8530 // Process extracts in blocks of EltsPerVector to check if the source vector
8531 // operand can be re-used directly. If not, add the cost of creating a
8532 // shuffle to extract the values into a vector register.
8533 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
8534 if (!ShuffleKinds[Part])
8535 continue;
8536 ArrayRef<int> MaskSlice = Mask.slice(
8537 N: Part * EltsPerVector, M: getNumElems(Size: Mask.size(), PartNumElems: EltsPerVector, Part));
8538 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8539 copy(Range&: MaskSlice, Out: SubMask.begin());
8540 SmallVector<unsigned, 2> Indices;
8541 std::optional<TTI::ShuffleKind> RegShuffleKind =
8542 CheckPerRegistersShuffle(SubMask, Indices);
8543 if (!RegShuffleKind) {
8544 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
8545 !ShuffleVectorInst::isIdentityMask(
8546 Mask: MaskSlice, NumSrcElts: std::max<unsigned>(a: NumElts, b: MaskSlice.size())))
8547 Cost +=
8548 ::getShuffleCost(TTI, Kind: *ShuffleKinds[Part],
8549 Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: MaskSlice);
8550 continue;
8551 }
8552 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8553 !ShuffleVectorInst::isIdentityMask(Mask: SubMask, NumSrcElts: EltsPerVector)) {
8554 Cost +=
8555 ::getShuffleCost(TTI, Kind: *RegShuffleKind,
8556 Tp: getWidenedType(ScalarTy, VF: EltsPerVector), Mask: SubMask);
8557 }
8558 for (unsigned Idx : Indices) {
8559 assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
8560 "SK_ExtractSubvector index out of range");
8561 Cost += ::getShuffleCost(
8562 TTI, Kind: TTI::SK_ExtractSubvector,
8563 Tp: getWidenedType(ScalarTy, VF: alignTo(Value: NumElts, Align: EltsPerVector)),
8564 Mask: std::nullopt, CostKind, Index: Idx,
8565 SubTp: getWidenedType(ScalarTy, VF: EltsPerVector));
8566 }
// Second attempt: check whether a single permute has a lower estimated cost
// than the subvector extracts.
8569 SubMask.assign(NumElts, Elt: PoisonMaskElem);
8570 copy(Range&: MaskSlice, Out: SubMask.begin());
8571 InstructionCost OriginalCost = ::getShuffleCost(
8572 TTI, Kind: *ShuffleKinds[Part], Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: SubMask);
8573 if (OriginalCost < Cost)
8574 Cost = OriginalCost;
8575 }
8576 return Cost;
8577 }
/// Transforms the mask \p CommonMask according to the given \p Mask so that it
/// is properly set up after shuffle emission.
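/// E.g., once a shuffle with mask <3, 0, poison, 2> has been accounted for,
/// the common mask becomes <0, 1, poison, 3>: the freshly produced vector is
/// addressed by element position rather than by the original indices.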
8580 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8581 ArrayRef<int> Mask) {
8582 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8583 if (Mask[Idx] != PoisonMaskElem)
8584 CommonMask[Idx] = Idx;
8585 }
/// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the given
/// mask \p Mask for register number \p Part, which contains \p SliceSize
/// elements.
8589 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8590 ArrayRef<int> Mask, unsigned Part,
8591 unsigned SliceSize) {
8592 if (SameNodesEstimated) {
// Delay the cost estimation if the same nodes are being reshuffled.
// If we have already requested the cost of reshuffling E1 and E2 before, there
// is no need to estimate another cost with the sub-Mask; instead, include this
// sub-Mask into the CommonMask to estimate it later and avoid double cost
// estimation.
8598 if ((InVectors.size() == 2 &&
8599 InVectors.front().get<const TreeEntry *>() == &E1 &&
8600 InVectors.back().get<const TreeEntry *>() == E2) ||
8601 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8602 unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part);
8603 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
8604 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8605 "Expected all poisoned elements.");
8606 ArrayRef<int> SubMask = ArrayRef(Mask).slice(N: Part * SliceSize, M: Limit);
8607 copy(Range&: SubMask, Out: std::next(x: CommonMask.begin(), n: SliceSize * Part));
8608 return;
8609 }
// Found non-matching nodes - estimate the cost for the nodes matched so far
// and transform the mask.
8612 Cost += createShuffle(P1: InVectors.front(),
8613 P2: InVectors.size() == 1 ? nullptr : InVectors.back(),
8614 Mask: CommonMask);
8615 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8616 }
8617 SameNodesEstimated = false;
8618 if (!E2 && InVectors.size() == 1) {
8619 unsigned VF = E1.getVectorFactor();
8620 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8621 VF = std::max(a: VF,
8622 b: cast<FixedVectorType>(Val: V1->getType())->getNumElements());
8623 } else {
8624 const auto *E = InVectors.front().get<const TreeEntry *>();
8625 VF = std::max(a: VF, b: E->getVectorFactor());
8626 }
8627 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8628 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8629 CommonMask[Idx] = Mask[Idx] + VF;
8630 Cost += createShuffle(P1: InVectors.front(), P2: &E1, Mask: CommonMask);
8631 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8632 } else {
8633 Cost += createShuffle(P1: &E1, P2: E2, Mask);
8634 transformMaskAfterShuffle(CommonMask, Mask);
8635 }
8636 }
8637
8638 class ShuffleCostBuilder {
8639 const TargetTransformInfo &TTI;
8640
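// Shuffles that do not require a real instruction are treated as free: an
// empty mask, a full-width identity mask such as <0, 1, 2, 3> on a 4-element
// source, or an extract of the leading subvector (index 0).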
8641 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8642 int Index = -1;
8643 return Mask.empty() ||
8644 (VF == Mask.size() &&
8645 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) ||
8646 (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
8647 Index == 0);
8648 }
8649
8650 public:
8651 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8652 ~ShuffleCostBuilder() = default;
8653 InstructionCost createShuffleVector(Value *V1, Value *,
8654 ArrayRef<int> Mask) const {
8655 // Empty mask or identity mask are free.
8656 unsigned VF =
8657 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
8658 if (isEmptyOrIdentity(Mask, VF))
8659 return TTI::TCC_Free;
8660 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
8661 Tp: cast<VectorType>(Val: V1->getType()), Mask);
8662 }
8663 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8664 // Empty mask or identity mask are free.
8665 unsigned VF =
8666 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
8667 if (isEmptyOrIdentity(Mask, VF))
8668 return TTI::TCC_Free;
8669 return TTI.getShuffleCost(Kind: TTI::SK_PermuteSingleSrc,
8670 Tp: cast<VectorType>(Val: V1->getType()), Mask);
8671 }
8672 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8673 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8674 return TTI::TCC_Free;
8675 }
8676 void resizeToMatch(Value *&, Value *&) const {}
8677 };
8678
/// Smart shuffle instruction emission: walks through the shuffle trees and
/// tries to find the best matching vector for the actual shuffle instruction.
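/// Note that no real vectors are needed for cost estimation: tree entries are
/// materialized as placeholder constants (a null vector and an all-ones vector
/// of the common width) so that ShuffleCostBuilder can query TTI for the
/// shuffle cost without emitting any IR.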
8682 InstructionCost
8683 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8684 const PointerUnion<Value *, const TreeEntry *> &P2,
8685 ArrayRef<int> Mask) {
8686 ShuffleCostBuilder Builder(TTI);
8687 SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8688 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8689 unsigned CommonVF = Mask.size();
8690 InstructionCost ExtraCost = 0;
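// The two lambdas below account for minimum-bitwidth adjustments: if a node
// (or an external vector value) has a scalar type that differs from the common
// ScalarTy, the cost of casting the whole vector (trunc or sext/zext) is added
// on top of the shuffle cost.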
8691 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
8692 unsigned VF) -> InstructionCost {
8693 if (E.isGather() && allConstant(VL: E.Scalars))
8694 return TTI::TCC_Free;
8695 Type *EScalarTy = E.Scalars.front()->getType();
8696 bool IsSigned = true;
8697 if (auto It = R.MinBWs.find(Val: &E); It != R.MinBWs.end()) {
8698 EScalarTy = IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
8699 IsSigned = It->second.second;
8700 }
8701 if (EScalarTy != ScalarTy) {
8702 unsigned CastOpcode = Instruction::Trunc;
8703 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
8704 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
8705 if (DstSz > SrcSz)
8706 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8707 return TTI.getCastInstrCost(Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF),
8708 Src: getWidenedType(ScalarTy: EScalarTy, VF),
8709 CCH: TTI::CastContextHint::None, CostKind);
8710 }
8711 return TTI::TCC_Free;
8712 };
8713 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
8714 if (isa<Constant>(Val: V))
8715 return TTI::TCC_Free;
8716 auto *VecTy = cast<VectorType>(Val: V->getType());
8717 Type *EScalarTy = VecTy->getElementType();
8718 if (EScalarTy != ScalarTy) {
8719 bool IsSigned = !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL));
8720 unsigned CastOpcode = Instruction::Trunc;
8721 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
8722 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
8723 if (DstSz > SrcSz)
8724 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8725 return TTI.getCastInstrCost(
8726 Opcode: CastOpcode, Dst: VectorType::get(ElementType: ScalarTy, EC: VecTy->getElementCount()),
8727 Src: VecTy, CCH: TTI::CastContextHint::None, CostKind);
8728 }
8729 return TTI::TCC_Free;
8730 };
8731 if (!V1 && !V2 && !P2.isNull()) {
8732 // Shuffle 2 entry nodes.
8733 const TreeEntry *E = P1.get<const TreeEntry *>();
8734 unsigned VF = E->getVectorFactor();
8735 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8736 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
8737 assert(all_of(Mask,
8738 [=](int Idx) {
8739 return Idx < 2 * static_cast<int>(CommonVF);
8740 }) &&
8741 "All elements in mask must be less than 2 * CommonVF.");
8742 if (E->Scalars.size() == E2->Scalars.size()) {
8743 SmallVector<int> EMask = E->getCommonMask();
8744 SmallVector<int> E2Mask = E2->getCommonMask();
8745 if (!EMask.empty() || !E2Mask.empty()) {
8746 for (int &Idx : CommonMask) {
8747 if (Idx == PoisonMaskElem)
8748 continue;
8749 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8750 Idx = EMask[Idx];
8751 else if (Idx >= static_cast<int>(CommonVF))
8752 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8753 E->Scalars.size();
8754 }
8755 }
8756 CommonVF = E->Scalars.size();
8757 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8758 GetNodeMinBWAffectedCost(*E2, CommonVF);
8759 } else {
8760 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8761 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8762 }
8763 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
8764 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
8765 } else if (!V1 && P2.isNull()) {
8766 // Shuffle single entry node.
8767 const TreeEntry *E = P1.get<const TreeEntry *>();
8768 unsigned VF = E->getVectorFactor();
8769 CommonVF = VF;
8770 assert(
8771 all_of(Mask,
8772 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8773 "All elements in mask must be less than CommonVF.");
8774 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8775 SmallVector<int> EMask = E->getCommonMask();
8776 assert(!EMask.empty() && "Expected non-empty common mask.");
8777 for (int &Idx : CommonMask) {
8778 if (Idx != PoisonMaskElem)
8779 Idx = EMask[Idx];
8780 }
8781 CommonVF = E->Scalars.size();
8782 }
8783 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8784 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
8785 // Not identity/broadcast? Try to see if the original vector is better.
8786 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8787 CommonVF == CommonMask.size() &&
8788 any_of(Range: enumerate(First&: CommonMask),
8789 P: [](const auto &&P) {
8790 return P.value() != PoisonMaskElem &&
8791 static_cast<unsigned>(P.value()) != P.index();
8792 }) &&
8793 any_of(Range&: CommonMask,
8794 P: [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8795 SmallVector<int> ReorderMask;
8796 inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
8797 ::addMask(Mask&: CommonMask, SubMask: ReorderMask);
8798 }
8799 } else if (V1 && P2.isNull()) {
8800 // Shuffle single vector.
8801 ExtraCost += GetValueMinBWAffectedCost(V1);
8802 CommonVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8803 assert(
8804 all_of(Mask,
8805 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8806 "All elements in mask must be less than CommonVF.");
8807 } else if (V1 && !V2) {
8808 // Shuffle vector and tree node.
8809 unsigned VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8810 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8811 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
8812 assert(all_of(Mask,
8813 [=](int Idx) {
8814 return Idx < 2 * static_cast<int>(CommonVF);
8815 }) &&
8816 "All elements in mask must be less than 2 * CommonVF.");
8817 if (E2->Scalars.size() == VF && VF != CommonVF) {
8818 SmallVector<int> E2Mask = E2->getCommonMask();
8819 assert(!E2Mask.empty() && "Expected non-empty common mask.");
8820 for (int &Idx : CommonMask) {
8821 if (Idx == PoisonMaskElem)
8822 continue;
8823 if (Idx >= static_cast<int>(CommonVF))
8824 Idx = E2Mask[Idx - CommonVF] + VF;
8825 }
8826 CommonVF = VF;
8827 }
8828 ExtraCost += GetValueMinBWAffectedCost(V1);
8829 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
8830 ExtraCost += GetNodeMinBWAffectedCost(
8831 *E2, std::min(a: CommonVF, b: E2->getVectorFactor()));
8832 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
8833 } else if (!V1 && V2) {
8834 // Shuffle vector and tree node.
8835 unsigned VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
8836 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8837 CommonVF = std::max(a: VF, b: E1->getVectorFactor());
8838 assert(all_of(Mask,
8839 [=](int Idx) {
8840 return Idx < 2 * static_cast<int>(CommonVF);
8841 }) &&
8842 "All elements in mask must be less than 2 * CommonVF.");
8843 if (E1->Scalars.size() == VF && VF != CommonVF) {
8844 SmallVector<int> E1Mask = E1->getCommonMask();
8845 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8846 for (int &Idx : CommonMask) {
8847 if (Idx == PoisonMaskElem)
8848 continue;
8849 if (Idx >= static_cast<int>(CommonVF))
8850 Idx = E1Mask[Idx - CommonVF] + VF;
8851 else
8852 Idx = E1Mask[Idx];
8853 }
8854 CommonVF = VF;
8855 }
8856 ExtraCost += GetNodeMinBWAffectedCost(
8857 *E1, std::min(a: CommonVF, b: E1->getVectorFactor()));
8858 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
8859 ExtraCost += GetValueMinBWAffectedCost(V2);
8860 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
8861 } else {
8862 assert(V1 && V2 && "Expected both vectors.");
8863 unsigned VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8864 CommonVF =
8865 std::max(a: VF, b: cast<FixedVectorType>(Val: V2->getType())->getNumElements());
8866 assert(all_of(Mask,
8867 [=](int Idx) {
8868 return Idx < 2 * static_cast<int>(CommonVF);
8869 }) &&
8870 "All elements in mask must be less than 2 * CommonVF.");
8871 ExtraCost +=
8872 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8873 if (V1->getType() != V2->getType()) {
8874 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
8875 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
8876 } else {
8877 if (cast<VectorType>(Val: V1->getType())->getElementType() != ScalarTy)
8878 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
8879 if (cast<VectorType>(Val: V2->getType())->getElementType() != ScalarTy)
8880 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
8881 }
8882 }
8883 InVectors.front() =
8884 Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
8885 if (InVectors.size() == 2)
8886 InVectors.pop_back();
8887 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8888 V1, V2, Mask: CommonMask, Builder);
8889 }
8890
8891public:
8892 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
8893 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8894 SmallPtrSetImpl<Value *> &CheckedExtracts)
8895 : ScalarTy(ScalarTy), TTI(TTI),
8896 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8897 CheckedExtracts(CheckedExtracts) {}
8898 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8899 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8900 unsigned NumParts, bool &UseVecBaseAsInput) {
8901 UseVecBaseAsInput = false;
8902 if (Mask.empty())
8903 return nullptr;
8904 Value *VecBase = nullptr;
8905 ArrayRef<Value *> VL = E->Scalars;
8906 // If the resulting type is scalarized, do not adjust the cost.
8907 if (NumParts == VL.size())
8908 return nullptr;
// Check if the extracts can be considered reused, i.e. if the same
// extractelements were already handled in a previously built tree entry.
8911 bool PrevNodeFound = any_of(
8912 Range: ArrayRef(R.VectorizableTree).take_front(N: E->Idx),
8913 P: [&](const std::unique_ptr<TreeEntry> &TE) {
8914 return ((!TE->isAltShuffle() &&
8915 TE->getOpcode() == Instruction::ExtractElement) ||
8916 TE->isGather()) &&
8917 all_of(Range: enumerate(First&: TE->Scalars), P: [&](auto &&Data) {
8918 return VL.size() > Data.index() &&
8919 (Mask[Data.index()] == PoisonMaskElem ||
8920 isa<UndefValue>(VL[Data.index()]) ||
8921 Data.value() == VL[Data.index()]);
8922 });
8923 });
8924 SmallPtrSet<Value *, 4> UniqueBases;
8925 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
8926 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
8927 unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
8928 ArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
8929 for (auto [I, V] : enumerate(First: VL.slice(N: Part * SliceSize, M: Limit))) {
8930 // Ignore non-extractelement scalars.
8931 if (isa<UndefValue>(Val: V) ||
8932 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8933 continue;
8934 // If all users of instruction are going to be vectorized and this
8935 // instruction itself is not going to be vectorized, consider this
8936 // instruction as dead and remove its cost from the final cost of the
8937 // vectorized tree.
8938 // Also, avoid adjusting the cost for extractelements with multiple uses
8939 // in different graph entries.
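// E.g., if %e = extractelement <4 x i32> %v, i32 1 is used only by
// instructions that are part of this tree, %e becomes dead after
// vectorization, so its scalar extraction cost is subtracted below.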
8940 auto *EE = cast<ExtractElementInst>(Val: V);
8941 VecBase = EE->getVectorOperand();
8942 UniqueBases.insert(Ptr: VecBase);
8943 const TreeEntry *VE = R.getTreeEntry(V);
8944 if (!CheckedExtracts.insert(Ptr: V).second ||
8945 !R.areAllUsersVectorized(I: cast<Instruction>(Val: V), VectorizedVals: &VectorizedVals) ||
8946 any_of(Range: EE->users(),
8947 P: [&](User *U) {
8948 return isa<GetElementPtrInst>(Val: U) &&
8949 !R.areAllUsersVectorized(I: cast<Instruction>(Val: U),
8950 VectorizedVals: &VectorizedVals);
8951 }) ||
8952 (VE && VE != E))
8953 continue;
8954 std::optional<unsigned> EEIdx = getExtractIndex(E: EE);
8955 if (!EEIdx)
8956 continue;
8957 unsigned Idx = *EEIdx;
8958 // Take credit for instruction that will become dead.
8959 if (EE->hasOneUse() || !PrevNodeFound) {
8960 Instruction *Ext = EE->user_back();
8961 if (isa<SExtInst, ZExtInst>(Val: Ext) &&
8962 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
8963 // Use getExtractWithExtendCost() to calculate the cost of
8964 // extractelement/ext pair.
8965 Cost -=
8966 TTI.getExtractWithExtendCost(Opcode: Ext->getOpcode(), Dst: Ext->getType(),
8967 VecTy: EE->getVectorOperandType(), Index: Idx);
8968 // Add back the cost of s|zext which is subtracted separately.
8969 Cost += TTI.getCastInstrCost(
8970 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: EE->getType(),
8971 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
8972 continue;
8973 }
8974 }
8975 Cost -= TTI.getVectorInstrCost(I: *EE, Val: EE->getVectorOperandType(),
8976 CostKind, Index: Idx);
8977 }
8978 }
// Check that the gather of extractelements can be represented as just a
// shuffle of one or two vectors from which the scalars are extracted.
// We have found a bunch of extractelement instructions that must be gathered
// into a vector and that can be represented as a permutation of the elements
// of one or two input vectors.
// The extract cost is skipped for reused nodes, i.e. if the same
// extractelements were vectorized already.
8985 if (!PrevNodeFound)
8986 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8987 InVectors.assign(NumElts: 1, Elt: E);
8988 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
8989 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8990 SameNodesEstimated = false;
8991 if (NumParts != 1 && UniqueBases.size() != 1) {
8992 UseVecBaseAsInput = true;
8993 VecBase =
8994 Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
8995 }
8996 return VecBase;
8997 }
8998 /// Checks if the specified entry \p E needs to be delayed because of its
8999 /// dependency nodes.
9000 std::optional<InstructionCost>
9001 needToDelay(const TreeEntry *,
9002 ArrayRef<SmallVector<const TreeEntry *>>) const {
9003 // No need to delay the cost estimation during analysis.
9004 return std::nullopt;
9005 }
9006 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
9007 if (&E1 == &E2) {
9008 assert(all_of(Mask,
9009 [&](int Idx) {
9010 return Idx < static_cast<int>(E1.getVectorFactor());
9011 }) &&
9012 "Expected single vector shuffle mask.");
9013 add(E1, Mask);
9014 return;
9015 }
9016 if (InVectors.empty()) {
9017 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
9018 InVectors.assign(IL: {&E1, &E2});
9019 return;
9020 }
9021 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9022 auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
9023 unsigned NumParts = TTI.getNumberOfParts(Tp: MaskVecTy);
9024 if (NumParts == 0 || NumParts >= Mask.size())
9025 NumParts = 1;
9026 unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
9027 const auto *It =
9028 find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; });
9029 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
9030 estimateNodesPermuteCost(E1, E2: &E2, Mask, Part, SliceSize);
9031 }
9032 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
9033 if (InVectors.empty()) {
9034 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
9035 InVectors.assign(NumElts: 1, Elt: &E1);
9036 return;
9037 }
9038 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9039 auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
9040 unsigned NumParts = TTI.getNumberOfParts(Tp: MaskVecTy);
9041 if (NumParts == 0 || NumParts >= Mask.size())
9042 NumParts = 1;
9043 unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
9044 const auto *It =
9045 find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; });
9046 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
9047 estimateNodesPermuteCost(E1, E2: nullptr, Mask, Part, SliceSize);
9048 if (!SameNodesEstimated && InVectors.size() == 1)
9049 InVectors.emplace_back(Args: &E1);
9050 }
9051 /// Adds 2 input vectors and the mask for their shuffling.
9052 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
// This may only occur when shuffling two vectors of extractelements, which is
// already handled in adjustExtracts.
9055 assert(InVectors.size() == 1 &&
9056 all_of(enumerate(CommonMask),
9057 [&](auto P) {
9058 if (P.value() == PoisonMaskElem)
9059 return Mask[P.index()] == PoisonMaskElem;
9060 auto *EI =
9061 cast<ExtractElementInst>(InVectors.front()
9062 .get<const TreeEntry *>()
9063 ->Scalars[P.index()]);
9064 return EI->getVectorOperand() == V1 ||
9065 EI->getVectorOperand() == V2;
9066 }) &&
9067 "Expected extractelement vectors.");
9068 }
/// Adds one more input vector and the mask for the shuffling.
9070 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
9071 if (InVectors.empty()) {
9072 assert(CommonMask.empty() && !ForExtracts &&
9073 "Expected empty input mask/vectors.");
9074 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
9075 InVectors.assign(NumElts: 1, Elt: V1);
9076 return;
9077 }
9078 if (ForExtracts) {
9079 // No need to add vectors here, already handled them in adjustExtracts.
9080 assert(InVectors.size() == 1 &&
9081 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
9082 all_of(enumerate(CommonMask),
9083 [&](auto P) {
9084 Value *Scalar = InVectors.front()
9085 .get<const TreeEntry *>()
9086 ->Scalars[P.index()];
9087 if (P.value() == PoisonMaskElem)
9088 return P.value() == Mask[P.index()] ||
9089 isa<UndefValue>(Scalar);
9090 if (isa<Constant>(V1))
9091 return true;
9092 auto *EI = cast<ExtractElementInst>(Scalar);
9093 return EI->getVectorOperand() == V1;
9094 }) &&
9095 "Expected only tree entry for extractelement vectors.");
9096 return;
9097 }
9098 assert(!InVectors.empty() && !CommonMask.empty() &&
9099 "Expected only tree entries from extracts/reused buildvectors.");
9100 unsigned VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
9101 if (InVectors.size() == 2) {
9102 Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
9103 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
9104 VF = std::max<unsigned>(a: VF, b: CommonMask.size());
9105 } else if (const auto *InTE =
9106 InVectors.front().dyn_cast<const TreeEntry *>()) {
9107 VF = std::max(a: VF, b: InTE->getVectorFactor());
9108 } else {
9109 VF = std::max(
9110 a: VF, b: cast<FixedVectorType>(Val: InVectors.front().get<Value *>()->getType())
9111 ->getNumElements());
9112 }
9113 InVectors.push_back(Elt: V1);
9114 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9115 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
9116 CommonMask[Idx] = Mask[Idx] + VF;
9117 }
9118 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
9119 Value *Root = nullptr) {
9120 Cost += getBuildVectorCost(VL, Root);
9121 if (!Root) {
9122 // FIXME: Need to find a way to avoid use of getNullValue here.
9123 SmallVector<Constant *> Vals;
9124 unsigned VF = VL.size();
9125 if (MaskVF != 0)
9126 VF = std::min(a: VF, b: MaskVF);
9127 for (Value *V : VL.take_front(N: VF)) {
9128 if (isa<UndefValue>(Val: V)) {
9129 Vals.push_back(Elt: cast<Constant>(Val: V));
9130 continue;
9131 }
9132 Vals.push_back(Elt: Constant::getNullValue(Ty: V->getType()));
9133 }
9134 return ConstantVector::get(V: Vals);
9135 }
9136 return ConstantVector::getSplat(
9137 EC: ElementCount::getFixed(
9138 MinVal: cast<FixedVectorType>(Val: Root->getType())->getNumElements()),
9139 Elt: getAllOnesValue(DL: *R.DL, Ty: ScalarTy));
9140 }
9141 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
9142 /// Finalize emission of the shuffles.
9143 InstructionCost
9144 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
9145 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
9146 IsFinalized = true;
9147 if (Action) {
9148 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
9149 if (InVectors.size() == 2)
9150 Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
9151 else
9152 Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
9153 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9154 if (CommonMask[Idx] != PoisonMaskElem)
9155 CommonMask[Idx] = Idx;
9156 assert(VF > 0 &&
9157 "Expected vector length for the final value before action.");
9158 Value *V = Vec.get<Value *>();
9159 Action(V, CommonMask);
9160 InVectors.front() = V;
9161 }
9162 ::addMask(Mask&: CommonMask, SubMask: ExtMask, /*ExtendingManyInputs=*/true);
9163 if (CommonMask.empty()) {
9164 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
9165 return Cost;
9166 }
9167 return Cost +
9168 createShuffle(P1: InVectors.front(),
9169 P2: InVectors.size() == 2 ? InVectors.back() : nullptr,
9170 Mask: CommonMask);
9171 }
9172
9173 ~ShuffleCostEstimator() {
9174 assert((IsFinalized || CommonMask.empty()) &&
9175 "Shuffle construction must be finalized.");
9176 }
9177};
9178
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  Value *Op = E->getOperand(Idx).front();
  if (const TreeEntry *TE = getTreeEntry(Op)) {
    if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.EdgeIdx == Idx && EI.UserTE == E;
        }) != TE->UserTreeIndices.end())
      return TE;
    auto MIt = MultiNodeScalars.find(Op);
    if (MIt != MultiNodeScalars.end()) {
      for (const TreeEntry *TE : MIt->second) {
        if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
              return EI.EdgeIdx == Idx && EI.UserTE == E;
            }) != TE->UserTreeIndices.end())
          return TE;
      }
    }
  }
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}
9207
TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
      !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
    inversePermutation(TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
9223
/// Builds the vector of argument types for the given call instruction with the
/// given intrinsic \p ID and the specified vector factor \p VF.
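/// E.g., for a call to llvm.smax.i32 with VF == 4 and MinBW == 0, both
/// argument types become <4 x i32>; operands that must stay scalar for the
/// vector intrinsic (per isVectorIntrinsicWithScalarOpAtArg, e.g. the integer
/// exponent of llvm.powi) keep their original scalar type.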
static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
                                                  const Intrinsic::ID ID,
                                                  const unsigned VF,
                                                  unsigned MinBW) {
  SmallVector<Type *> ArgTys;
  for (auto [Idx, Arg] : enumerate(CI->args())) {
    if (ID != Intrinsic::not_intrinsic) {
      if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
        ArgTys.push_back(Arg->getType());
        continue;
      }
      if (MinBW > 0) {
        ArgTys.push_back(
            getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
        continue;
      }
    }
    ArgTys.push_back(getWidenedType(Arg->getType(), VF));
  }
  return ArgTys;
}
9247
9248InstructionCost
9249BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9250 SmallPtrSetImpl<Value *> &CheckedExtracts) {
9251 ArrayRef<Value *> VL = E->Scalars;
9252
9253 Type *ScalarTy = VL[0]->getType();
9254 if (!E->isGather()) {
9255 if (auto *SI = dyn_cast<StoreInst>(Val: VL[0]))
9256 ScalarTy = SI->getValueOperand()->getType();
9257 else if (auto *CI = dyn_cast<CmpInst>(Val: VL[0]))
9258 ScalarTy = CI->getOperand(i_nocapture: 0)->getType();
9259 else if (auto *IE = dyn_cast<InsertElementInst>(Val: VL[0]))
9260 ScalarTy = IE->getOperand(i_nocapture: 1)->getType();
9261 }
9262 if (!isValidElementType(Ty: ScalarTy))
9263 return InstructionCost::getInvalid();
9264 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9265
9266 // If we have computed a smaller type for the expression, update VecTy so
9267 // that the costs will be accurate.
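// E.g., if the expression was proven to need only 16 bits, a node of i32
// scalars is costed with type <VF x i16> instead of <VF x i32>.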
9268 auto It = MinBWs.find(Val: E);
9269 Type *OrigScalarTy = ScalarTy;
9270 if (It != MinBWs.end())
9271 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
9272 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
9273 unsigned EntryVF = E->getVectorFactor();
9274 auto *FinalVecTy = getWidenedType(ScalarTy, VF: EntryVF);
9275
9276 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
9277 if (E->isGather()) {
9278 if (allConstant(VL))
9279 return 0;
9280 if (isa<InsertElementInst>(Val: VL[0]))
9281 return InstructionCost::getInvalid();
9282 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
9283 E, ScalarTy, Params&: *TTI, Params&: VectorizedVals, Params&: *this, Params&: CheckedExtracts);
9284 }
9285 InstructionCost CommonCost = 0;
9286 SmallVector<int> Mask;
9287 bool IsReverseOrder = isReverseOrder(Order: E->ReorderIndices);
9288 if (!E->ReorderIndices.empty() &&
9289 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
9290 SmallVector<int> NewMask;
9291 if (E->getOpcode() == Instruction::Store) {
9292 // For stores the order is actually a mask.
9293 NewMask.resize(N: E->ReorderIndices.size());
9294 copy(Range: E->ReorderIndices, Out: NewMask.begin());
9295 } else {
9296 inversePermutation(Indices: E->ReorderIndices, Mask&: NewMask);
9297 }
9298 ::addMask(Mask, SubMask: NewMask);
9299 }
9300 if (NeedToShuffleReuses)
9301 ::addMask(Mask, SubMask: E->ReuseShuffleIndices);
9302 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
9303 CommonCost =
9304 TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
9305 assert((E->State == TreeEntry::Vectorize ||
9306 E->State == TreeEntry::ScatterVectorize ||
9307 E->State == TreeEntry::StridedVectorize) &&
9308 "Unhandled state");
9309 assert(E->getOpcode() &&
9310 ((allSameType(VL) && allSameBlock(VL)) ||
9311 (E->getOpcode() == Instruction::GetElementPtr &&
9312 E->getMainOp()->getType()->isPointerTy())) &&
9313 "Invalid VL");
9314 Instruction *VL0 = E->getMainOp();
9315 unsigned ShuffleOrOp =
9316 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
9317 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
9318 const unsigned Sz = UniqueValues.size();
9319 SmallBitVector UsedScalars(Sz, false);
9320 for (unsigned I = 0; I < Sz; ++I) {
9321 if (getTreeEntry(V: UniqueValues[I]) == E)
9322 continue;
9323 UsedScalars.set(I);
9324 }
9325 auto GetCastContextHint = [&](Value *V) {
9326 if (const TreeEntry *OpTE = getTreeEntry(V))
9327 return getCastContextHint(TE: *OpTE);
9328 InstructionsState SrcState = getSameOpcode(VL: E->getOperand(OpIdx: 0), TLI: *TLI);
9329 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
9330 return TTI::CastContextHint::GatherScatter;
9331 return TTI::CastContextHint::None;
9332 };
9333 auto GetCostDiff =
9334 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
9335 function_ref<InstructionCost(InstructionCost)> VectorCost) {
9336 // Calculate the cost of this instruction.
9337 InstructionCost ScalarCost = 0;
9338 if (isa<CastInst, CallInst>(Val: VL0)) {
// For some of the instructions there is no need to calculate the cost for each
// particular instruction; we can use the cost of a single instruction
// multiplied by the total number of scalar instructions.
9342 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
9343 } else {
9344 for (unsigned I = 0; I < Sz; ++I) {
9345 if (UsedScalars.test(Idx: I))
9346 continue;
9347 ScalarCost += ScalarEltCost(I);
9348 }
9349 }
9350
9351 InstructionCost VecCost = VectorCost(CommonCost);
9352 // Check if the current node must be resized, if the parent node is not
9353 // resized.
9354 if (!UnaryInstruction::isCast(Opcode: E->getOpcode()) && E->Idx != 0) {
9355 const EdgeInfo &EI = E->UserTreeIndices.front();
9356 if ((EI.UserTE->getOpcode() != Instruction::Select ||
9357 EI.EdgeIdx != 0) &&
9358 It != MinBWs.end()) {
9359 auto UserBWIt = MinBWs.find(Val: EI.UserTE);
9360 Type *UserScalarTy =
9361 EI.UserTE->getOperand(OpIdx: EI.EdgeIdx).front()->getType();
9362 if (UserBWIt != MinBWs.end())
9363 UserScalarTy = IntegerType::get(C&: ScalarTy->getContext(),
9364 NumBits: UserBWIt->second.first);
9365 if (ScalarTy != UserScalarTy) {
9366 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
9367 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: UserScalarTy);
9368 unsigned VecOpcode;
9369 auto *UserVecTy =
9370 getWidenedType(ScalarTy: UserScalarTy, VF: E->getVectorFactor());
9371 if (BWSz > SrcBWSz)
9372 VecOpcode = Instruction::Trunc;
9373 else
9374 VecOpcode =
9375 It->second.second ? Instruction::SExt : Instruction::ZExt;
9376 TTI::CastContextHint CCH = GetCastContextHint(VL0);
9377 VecCost += TTI->getCastInstrCost(Opcode: VecOpcode, Dst: UserVecTy, Src: VecTy, CCH,
9378 CostKind);
9379 }
9380 }
9381 }
9382 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
9383 ScalarCost, "Calculated costs for Tree"));
9384 return VecCost - ScalarCost;
9385 };
9386 // Calculate cost difference from vectorizing set of GEPs.
9387 // Negative value means vectorizing is profitable.
9388 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
9389 assert((E->State == TreeEntry::Vectorize ||
9390 E->State == TreeEntry::StridedVectorize) &&
9391 "Entry state expected to be Vectorize or StridedVectorize here.");
9392 InstructionCost ScalarCost = 0;
9393 InstructionCost VecCost = 0;
9394 std::tie(args&: ScalarCost, args&: VecCost) = getGEPCosts(
9395 TTI: *TTI, Ptrs, BasePtr, Opcode: E->getOpcode(), CostKind, ScalarTy: OrigScalarTy, VecTy);
9396 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9397 "Calculated GEPs cost for Tree"));
9398
9399 return VecCost - ScalarCost;
9400 };
9401
9402 switch (ShuffleOrOp) {
9403 case Instruction::PHI: {
9404 // Count reused scalars.
9405 InstructionCost ScalarCost = 0;
9406 SmallPtrSet<const TreeEntry *, 4> CountedOps;
9407 for (Value *V : UniqueValues) {
9408 auto *PHI = dyn_cast<PHINode>(Val: V);
9409 if (!PHI)
9410 continue;
9411
9412 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
9413 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
9414 Value *Op = PHI->getIncomingValue(i: I);
9415 Operands[I] = Op;
9416 }
9417 if (const TreeEntry *OpTE = getTreeEntry(V: Operands.front()))
9418 if (OpTE->isSame(VL: Operands) && CountedOps.insert(Ptr: OpTE).second)
9419 if (!OpTE->ReuseShuffleIndices.empty())
9420 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9421 OpTE->Scalars.size());
9422 }
9423
9424 return CommonCost - ScalarCost;
9425 }
9426 case Instruction::ExtractValue:
9427 case Instruction::ExtractElement: {
9428 auto GetScalarCost = [&](unsigned Idx) {
9429 auto *I = cast<Instruction>(Val: UniqueValues[Idx]);
9430 VectorType *SrcVecTy;
9431 if (ShuffleOrOp == Instruction::ExtractElement) {
9432 auto *EE = cast<ExtractElementInst>(Val: I);
9433 SrcVecTy = EE->getVectorOperandType();
9434 } else {
9435 auto *EV = cast<ExtractValueInst>(Val: I);
9436 Type *AggregateTy = EV->getAggregateOperand()->getType();
9437 unsigned NumElts;
9438 if (auto *ATy = dyn_cast<ArrayType>(Val: AggregateTy))
9439 NumElts = ATy->getNumElements();
9440 else
9441 NumElts = AggregateTy->getStructNumElements();
9442 SrcVecTy = getWidenedType(ScalarTy: OrigScalarTy, VF: NumElts);
9443 }
9444 if (I->hasOneUse()) {
9445 Instruction *Ext = I->user_back();
9446 if ((isa<SExtInst>(Val: Ext) || isa<ZExtInst>(Val: Ext)) &&
9447 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
9448 // Use getExtractWithExtendCost() to calculate the cost of
9449 // extractelement/ext pair.
9450 InstructionCost Cost = TTI->getExtractWithExtendCost(
9451 Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: SrcVecTy, Index: *getExtractIndex(E: I));
9452 // Subtract the cost of s|zext which is subtracted separately.
9453 Cost -= TTI->getCastInstrCost(
9454 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: I->getType(),
9455 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
9456 return Cost;
9457 }
9458 }
9459 return TTI->getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: SrcVecTy,
9460 CostKind, Index: *getExtractIndex(E: I));
9461 };
9462 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9463 return GetCostDiff(GetScalarCost, GetVectorCost);
9464 }
9465 case Instruction::InsertElement: {
9466 assert(E->ReuseShuffleIndices.empty() &&
9467 "Unique insertelements only are expected.");
9468 auto *SrcVecTy = cast<FixedVectorType>(Val: VL0->getType());
9469 unsigned const NumElts = SrcVecTy->getNumElements();
9470 unsigned const NumScalars = VL.size();
9471
9472 unsigned NumOfParts = TTI->getNumberOfParts(Tp: SrcVecTy);
9473
9474 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9475 unsigned OffsetBeg = *getElementIndex(Inst: VL.front());
9476 unsigned OffsetEnd = OffsetBeg;
9477 InsertMask[OffsetBeg] = 0;
9478 for (auto [I, V] : enumerate(First: VL.drop_front())) {
9479 unsigned Idx = *getElementIndex(Inst: V);
9480 if (OffsetBeg > Idx)
9481 OffsetBeg = Idx;
9482 else if (OffsetEnd < Idx)
9483 OffsetEnd = Idx;
9484 InsertMask[Idx] = I + 1;
9485 }
9486 unsigned VecScalarsSz = PowerOf2Ceil(A: NumElts);
9487 if (NumOfParts > 0)
9488 VecScalarsSz = PowerOf2Ceil(A: (NumElts + NumOfParts - 1) / NumOfParts);
9489 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9490 VecScalarsSz;
9491 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9492 unsigned InsertVecSz = std::min<unsigned>(
9493 a: PowerOf2Ceil(A: OffsetEnd - OffsetBeg + 1),
9494 b: ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9495 bool IsWholeSubvector =
9496 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
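// E.g., assuming the whole 8-element destination vector fits into a single
// register (NumOfParts == 1) and the scalars are inserted at positions 2..5:
// VecScalarsSz == 8, VecSz == 8, Offset == 0, InsertVecSz == 4 and
// IsWholeSubvector is false, so the code below may model the insertion of a
// 4-wide subvector at offset 2 into the 8-wide destination.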
9497 // Check if we can safely insert a subvector. If it is not possible, just
9498 // generate a whole-sized vector and shuffle the source vector and the new
9499 // subvector.
9500 if (OffsetBeg + InsertVecSz > VecSz) {
9501 // Align OffsetBeg to generate correct mask.
9502 OffsetBeg = alignDown(Value: OffsetBeg, Align: VecSz, Skew: Offset);
9503 InsertVecSz = VecSz;
9504 }
9505
9506 APInt DemandedElts = APInt::getZero(numBits: NumElts);
9507 // TODO: Add support for Instruction::InsertValue.
9508 SmallVector<int> Mask;
9509 if (!E->ReorderIndices.empty()) {
9510 inversePermutation(Indices: E->ReorderIndices, Mask);
9511 Mask.append(NumInputs: InsertVecSz - Mask.size(), Elt: PoisonMaskElem);
9512 } else {
9513 Mask.assign(NumElts: VecSz, Elt: PoisonMaskElem);
9514 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: InsertVecSz), value: 0);
9515 }
9516 bool IsIdentity = true;
9517 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9518 Mask.swap(RHS&: PrevMask);
9519 for (unsigned I = 0; I < NumScalars; ++I) {
9520 unsigned InsertIdx = *getElementIndex(Inst: VL[PrevMask[I]]);
9521 DemandedElts.setBit(InsertIdx);
9522 IsIdentity &= InsertIdx - OffsetBeg == I;
9523 Mask[InsertIdx - OffsetBeg] = I;
9524 }
9525 assert(Offset < NumElts && "Failed to find vector index offset");
9526
9527 InstructionCost Cost = 0;
9528 Cost -= TTI->getScalarizationOverhead(Ty: SrcVecTy, DemandedElts,
9529 /*Insert*/ true, /*Extract*/ false,
9530 CostKind);
9531
9532 // First cost - resize to actual vector size if not identity shuffle or
9533 // need to shift the vector.
9534 // Do not calculate the cost if the actual size is the register size and
9535 // we can merge this shuffle with the following SK_Select.
9536 auto *InsertVecTy = getWidenedType(ScalarTy, VF: InsertVecSz);
9537 if (!IsIdentity)
9538 Cost += TTI->getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc,
9539 Tp: InsertVecTy, Mask);
9540 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: [E](Value *V) {
9541 return !is_contained(Range: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
9542 }));
9543 // Second cost - permutation with subvector, if some elements are from the
9544 // initial vector or inserting a subvector.
9545 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9546 // subvector of ActualVecTy.
9547 SmallBitVector InMask =
9548 isUndefVector(V: FirstInsert->getOperand(i: 0),
9549 UseMask: buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask));
9550 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9551 if (InsertVecSz != VecSz) {
9552 auto *ActualVecTy = getWidenedType(ScalarTy, VF: VecSz);
9553 Cost += TTI->getShuffleCost(Kind: TTI::SK_InsertSubvector, Tp: ActualVecTy,
9554 Mask: std::nullopt, CostKind, Index: OffsetBeg - Offset,
9555 SubTp: InsertVecTy);
9556 } else {
9557 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9558 Mask[I] = InMask.test(Idx: I) ? PoisonMaskElem : I;
9559 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9560 I <= End; ++I)
9561 if (Mask[I] != PoisonMaskElem)
9562 Mask[I] = I + VecSz;
9563 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9564 Mask[I] =
9565 ((I >= InMask.size()) || InMask.test(Idx: I)) ? PoisonMaskElem : I;
9566 Cost +=
9567 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: InsertVecTy, Mask);
9568 }
9569 }
9570 return Cost;
9571 }
9572 case Instruction::ZExt:
9573 case Instruction::SExt:
9574 case Instruction::FPToUI:
9575 case Instruction::FPToSI:
9576 case Instruction::FPExt:
9577 case Instruction::PtrToInt:
9578 case Instruction::IntToPtr:
9579 case Instruction::SIToFP:
9580 case Instruction::UIToFP:
9581 case Instruction::Trunc:
9582 case Instruction::FPTrunc:
9583 case Instruction::BitCast: {
9584 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
9585 Type *SrcScalarTy = VL0->getOperand(i: 0)->getType();
9586 auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size());
9587 unsigned Opcode = ShuffleOrOp;
9588 unsigned VecOpcode = Opcode;
9589 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9590 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9591 // Check if the values are candidates to demote.
9592 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
9593 if (SrcIt != MinBWs.end()) {
9594 SrcBWSz = SrcIt->second.first;
9595 SrcScalarTy = IntegerType::get(C&: F->getContext(), NumBits: SrcBWSz);
9596 SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size());
9597 }
9598 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
9599 if (BWSz == SrcBWSz) {
9600 VecOpcode = Instruction::BitCast;
9601 } else if (BWSz < SrcBWSz) {
9602 VecOpcode = Instruction::Trunc;
9603 } else if (It != MinBWs.end()) {
9604 assert(BWSz > SrcBWSz && "Invalid cast!");
9605 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9606 } else if (SrcIt != MinBWs.end()) {
9607 assert(BWSz > SrcBWSz && "Invalid cast!");
9608 VecOpcode =
9609 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9610 }
9611 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9612 !SrcIt->second.second) {
9613 VecOpcode = Instruction::UIToFP;
9614 }
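// E.g., a scalar zext i8 -> i32 whose source and result were both demoted to
// i16 turns into a vector bitcast, which GetVectorCost below treats as free
// (only CommonCost remains).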
9615 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9616 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9617 return TTI->getCastInstrCost(Opcode, Dst: VL0->getType(),
9618 Src: VL0->getOperand(i: 0)->getType(),
9619 CCH: TTI::getCastContextHint(I: VI), CostKind, I: VI);
9620 };
9621 auto GetVectorCost = [=](InstructionCost CommonCost) {
9622 // Do not count cost here if minimum bitwidth is in effect and it is just
9623 // a bitcast (here it is just a noop).
9624 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9625 return CommonCost;
9626 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9627 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(i: 0));
9628 return CommonCost +
9629 TTI->getCastInstrCost(Opcode: VecOpcode, Dst: VecTy, Src: SrcVecTy, CCH, CostKind,
9630 I: VecOpcode == Opcode ? VI : nullptr);
9631 };
9632 return GetCostDiff(GetScalarCost, GetVectorCost);
9633 }
9634 case Instruction::FCmp:
9635 case Instruction::ICmp:
9636 case Instruction::Select: {
9637 CmpInst::Predicate VecPred, SwappedVecPred;
9638 auto MatchCmp = m_Cmp(Pred&: VecPred, L: m_Value(), R: m_Value());
9639 if (match(V: VL0, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) ||
9640 match(V: VL0, P: MatchCmp))
9641 SwappedVecPred = CmpInst::getSwappedPredicate(pred: VecPred);
9642 else
9643 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9644 ? CmpInst::BAD_FCMP_PREDICATE
9645 : CmpInst::BAD_ICMP_PREDICATE;
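 // Note: GetScalarCost below also resets VecPred/SwappedVecPred to the "bad"
 // predicate if any lane is not a compare/select or uses a predicate that
 // matches neither VecPred nor its swapped form, so mixed-predicate bundles
 // effectively fall back to the conservative predicate for the vector cost.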
9646 auto GetScalarCost = [&](unsigned Idx) {
9647 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9648 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9649 ? CmpInst::BAD_FCMP_PREDICATE
9650 : CmpInst::BAD_ICMP_PREDICATE;
9651 auto MatchCmp = m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value());
9652 if ((!match(V: VI, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) &&
9653 !match(V: VI, P: MatchCmp)) ||
9654 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9655 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9656 ? CmpInst::BAD_FCMP_PREDICATE
9657 : CmpInst::BAD_ICMP_PREDICATE;
9658
9659 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
9660 Opcode: E->getOpcode(), ValTy: OrigScalarTy, CondTy: Builder.getInt1Ty(), VecPred: CurrentPred,
9661 CostKind, I: VI);
9662 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: VI);
9663 if (MinMaxID != Intrinsic::not_intrinsic) {
9664 Type *CanonicalType = OrigScalarTy;
9665 if (CanonicalType->isPtrOrPtrVectorTy())
9666 CanonicalType = CanonicalType->getWithNewType(EltTy: IntegerType::get(
9667 C&: CanonicalType->getContext(),
9668 NumBits: DL->getTypeSizeInBits(Ty: CanonicalType->getScalarType())));
9669
9670 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9671 {CanonicalType, CanonicalType});
9672 InstructionCost IntrinsicCost =
9673 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
9674 // If the selects are the only uses of the compares, they will be
9675 // dead and we can adjust the cost by removing their cost.
9676 if (SelectOnly) {
9677 auto *CI = cast<CmpInst>(Val: VI->getOperand(i: 0));
9678 IntrinsicCost -= TTI->getCmpSelInstrCost(
9679 Opcode: CI->getOpcode(), ValTy: OrigScalarTy, CondTy: Builder.getInt1Ty(),
9680 VecPred: CI->getPredicate(), CostKind, I: CI);
9681 }
9682 ScalarCost = std::min(a: ScalarCost, b: IntrinsicCost);
9683 }
9684
9685 return ScalarCost;
9686 };
9687 auto GetVectorCost = [&](InstructionCost CommonCost) {
9688 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
9689
9690 InstructionCost VecCost = TTI->getCmpSelInstrCost(
9691 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred, CostKind, I: VL0);
9692 // Check if it is possible and profitable to use min/max for selects
9693 // in VL.
9694 //
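 // E.g. lanes of the form select (icmp slt %a, %b), %a, %b can be lowered as
 // @llvm.smin, so the vector cost is taken as the cheaper of the cmp+select
 // cost and the corresponding min/max intrinsic cost.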
9695 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL);
9696 if (MinMaxID != Intrinsic::not_intrinsic) {
9697 Type *CanonicalType = VecTy;
9698 if (CanonicalType->isPtrOrPtrVectorTy())
9699 CanonicalType = CanonicalType->getWithNewType(EltTy: IntegerType::get(
9700 C&: CanonicalType->getContext(),
9701 NumBits: DL->getTypeSizeInBits(Ty: CanonicalType->getScalarType())));
9702 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9703 {CanonicalType, CanonicalType});
9704 InstructionCost IntrinsicCost =
9705 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
9706 // If the selects are the only uses of the compares, they will be
9707 // dead and we can adjust the cost by removing their cost.
9708 if (SelectOnly) {
9709 auto *CI =
9710 cast<CmpInst>(Val: cast<Instruction>(Val: VL.front())->getOperand(i: 0));
9711 IntrinsicCost -= TTI->getCmpSelInstrCost(Opcode: CI->getOpcode(), ValTy: VecTy,
9712 CondTy: MaskTy, VecPred, CostKind);
9713 }
9714 VecCost = std::min(a: VecCost, b: IntrinsicCost);
9715 }
9716 return VecCost + CommonCost;
9717 };
9718 return GetCostDiff(GetScalarCost, GetVectorCost);
9719 }
9720 case Instruction::FNeg:
9721 case Instruction::Add:
9722 case Instruction::FAdd:
9723 case Instruction::Sub:
9724 case Instruction::FSub:
9725 case Instruction::Mul:
9726 case Instruction::FMul:
9727 case Instruction::UDiv:
9728 case Instruction::SDiv:
9729 case Instruction::FDiv:
9730 case Instruction::URem:
9731 case Instruction::SRem:
9732 case Instruction::FRem:
9733 case Instruction::Shl:
9734 case Instruction::LShr:
9735 case Instruction::AShr:
9736 case Instruction::And:
9737 case Instruction::Or:
9738 case Instruction::Xor: {
9739 auto GetScalarCost = [&](unsigned Idx) {
9740 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9741 unsigned OpIdx = isa<UnaryOperator>(Val: VI) ? 0 : 1;
9742 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(V: VI->getOperand(i: 0));
9743 TTI::OperandValueInfo Op2Info =
9744 TTI::getOperandInfo(V: VI->getOperand(i: OpIdx));
9745 SmallVector<const Value *> Operands(VI->operand_values());
9746 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: OrigScalarTy, CostKind,
9747 Opd1Info: Op1Info, Opd2Info: Op2Info, Args: Operands, CxtI: VI);
9748 };
9749 auto GetVectorCost = [=](InstructionCost CommonCost) {
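 // If this is an 'and' that is going to be demoted to It->second.first bits
 // and one of its operand bundles consists entirely of constants with at
 // least that many trailing one bits (e.g. 'and %x, 255' with a demotion to
 // i8), the 'and' is a no-op on the demoted type, so only the common cost is
 // charged.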
9750 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
9751 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
9752 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
9753 if (all_of(Range&: Ops, P: [&](Value *Op) {
9754 auto *CI = dyn_cast<ConstantInt>(Val: Op);
9755 return CI && CI->getValue().countr_one() >= It->second.first;
9756 }))
9757 return CommonCost;
9758 }
9759 }
9760 unsigned OpIdx = isa<UnaryOperator>(Val: VL0) ? 0 : 1;
9761 TTI::OperandValueInfo Op1Info = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
9762 TTI::OperandValueInfo Op2Info = getOperandInfo(Ops: E->getOperand(OpIdx));
9763 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: VecTy, CostKind, Opd1Info: Op1Info,
9764 Opd2Info: Op2Info, Args: std::nullopt, CxtI: nullptr, TLibInfo: TLI) +
9765 CommonCost;
9766 };
9767 return GetCostDiff(GetScalarCost, GetVectorCost);
9768 }
9769 case Instruction::GetElementPtr: {
9770 return CommonCost + GetGEPCostDiff(VL, VL0);
9771 }
9772 case Instruction::Load: {
9773 auto GetScalarCost = [&](unsigned Idx) {
9774 auto *VI = cast<LoadInst>(Val: UniqueValues[Idx]);
9775 return TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: OrigScalarTy,
9776 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
9777 CostKind, OpdInfo: TTI::OperandValueInfo(), I: VI);
9778 };
9779 auto *LI0 = cast<LoadInst>(Val: VL0);
9780 auto GetVectorCost = [&](InstructionCost CommonCost) {
9781 InstructionCost VecLdCost;
9782 if (E->State == TreeEntry::Vectorize) {
9783 VecLdCost = TTI->getMemoryOpCost(
9784 Opcode: Instruction::Load, Src: VecTy, Alignment: LI0->getAlign(),
9785 AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
9786 } else if (E->State == TreeEntry::StridedVectorize) {
9787 Align CommonAlignment =
9788 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
9789 VecLdCost = TTI->getStridedMemoryOpCost(
9790 Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(),
9791 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
9792 } else {
9793 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9794 Align CommonAlignment =
9795 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
9796 VecLdCost = TTI->getGatherScatterOpCost(
9797 Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(),
9798 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
9799 }
9800 return VecLdCost + CommonCost;
9801 };
9802
9803 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9804 // If this node generates a masked gather load then it is not a terminal
9805 // node, hence the address operand cost is estimated separately.
9806 if (E->State == TreeEntry::ScatterVectorize)
9807 return Cost;
9808
9809 // Estimate the cost of the GEPs since this tree node is a terminal node.
9810 SmallVector<Value *> PointerOps(VL.size());
9811 for (auto [I, V] : enumerate(First&: VL))
9812 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
9813 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9814 }
9815 case Instruction::Store: {
9816 bool IsReorder = !E->ReorderIndices.empty();
9817 auto GetScalarCost = [=](unsigned Idx) {
9818 auto *VI = cast<StoreInst>(Val: VL[Idx]);
9819 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: VI->getValueOperand());
9820 return TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: OrigScalarTy,
9821 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
9822 CostKind, OpdInfo: OpInfo, I: VI);
9823 };
9824 auto *BaseSI =
9825 cast<StoreInst>(Val: IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9826 auto GetVectorCost = [=](InstructionCost CommonCost) {
9827 // We know that we can merge the stores. Calculate the cost.
9828 InstructionCost VecStCost;
9829 if (E->State == TreeEntry::StridedVectorize) {
9830 Align CommonAlignment =
9831 computeCommonAlignment<StoreInst>(VL: UniqueValues.getArrayRef());
9832 VecStCost = TTI->getStridedMemoryOpCost(
9833 Opcode: Instruction::Store, DataTy: VecTy, Ptr: BaseSI->getPointerOperand(),
9834 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
9835 } else {
9836 assert(E->State == TreeEntry::Vectorize &&
9837 "Expected either strided or consecutive stores.");
9838 TTI::OperandValueInfo OpInfo = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
9839 VecStCost = TTI->getMemoryOpCost(
9840 Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
9841 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind, OpdInfo: OpInfo);
9842 }
9843 return VecStCost + CommonCost;
9844 };
9845 SmallVector<Value *> PointerOps(VL.size());
9846 for (auto [I, V] : enumerate(First&: VL)) {
9847 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9848 PointerOps[Idx] = cast<StoreInst>(Val: V)->getPointerOperand();
9849 }
9850
9851 return GetCostDiff(GetScalarCost, GetVectorCost) +
9852 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9853 }
9854 case Instruction::Call: {
9855 auto GetScalarCost = [&](unsigned Idx) {
9856 auto *CI = cast<CallInst>(Val: UniqueValues[Idx]);
9857 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9858 if (ID != Intrinsic::not_intrinsic) {
9859 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9860 return TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
9861 }
9862 return TTI->getCallInstrCost(F: CI->getCalledFunction(),
9863 RetTy: CI->getFunctionType()->getReturnType(),
9864 Tys: CI->getFunctionType()->params(), CostKind);
9865 };
9866 auto GetVectorCost = [=](InstructionCost CommonCost) {
9867 auto *CI = cast<CallInst>(Val: VL0);
9868 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9869 SmallVector<Type *> ArgTys =
9870 buildIntrinsicArgTypes(CI, ID, VF: VecTy->getNumElements(),
9871 MinBW: It != MinBWs.end() ? It->second.first : 0);
9872 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9873 return std::min(a: VecCallCosts.first, b: VecCallCosts.second) + CommonCost;
9874 };
9875 return GetCostDiff(GetScalarCost, GetVectorCost);
9876 }
9877 case Instruction::ShuffleVector: {
9878 assert(E->isAltShuffle() &&
9879 ((Instruction::isBinaryOp(E->getOpcode()) &&
9880 Instruction::isBinaryOp(E->getAltOpcode())) ||
9881 (Instruction::isCast(E->getOpcode()) &&
9882 Instruction::isCast(E->getAltOpcode())) ||
9883 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9884 "Invalid Shuffle Vector Operand");
9885 // Try to find the previous shuffle node with the same operands and same
9886 // main/alternate ops.
9887 auto TryFindNodeWithEqualOperands = [=]() {
9888 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9889 if (TE.get() == E)
9890 break;
9891 if (TE->isAltShuffle() &&
9892 ((TE->getOpcode() == E->getOpcode() &&
9893 TE->getAltOpcode() == E->getAltOpcode()) ||
9894 (TE->getOpcode() == E->getAltOpcode() &&
9895 TE->getAltOpcode() == E->getOpcode())) &&
9896 TE->hasEqualOperands(TE: *E))
9897 return true;
9898 }
9899 return false;
9900 };
9901 auto GetScalarCost = [&](unsigned Idx) {
9902 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9903 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9904 (void)E;
9905 return TTI->getInstructionCost(U: VI, CostKind);
9906 };
9907 // Need to clear CommonCost since the final shuffle cost is already
9908 // included in the vector cost.
9909 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9910 // VecCost is equal to sum of the cost of creating 2 vectors
9911 // and the cost of creating shuffle.
9912 InstructionCost VecCost = 0;
9913 if (TryFindNodeWithEqualOperands()) {
9914 LLVM_DEBUG({
9915 dbgs() << "SLP: diamond match for alternate node found.\n";
9916 E->dump();
9917 });
9918 // No need to add new vector costs here since we're going to reuse the
9919 // same main/alternate vector ops, just with different shuffling.
9920 } else if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
9921 VecCost =
9922 TTIRef.getArithmeticInstrCost(Opcode: E->getOpcode(), Ty: VecTy, CostKind);
9923 VecCost +=
9924 TTIRef.getArithmeticInstrCost(Opcode: E->getAltOpcode(), Ty: VecTy, CostKind);
9925 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
9926 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
9927 VecCost = TTIRef.getCmpSelInstrCost(Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy,
9928 VecPred: CI0->getPredicate(), CostKind, I: VL0);
9929 VecCost += TTIRef.getCmpSelInstrCost(
9930 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy,
9931 VecPred: cast<CmpInst>(Val: E->getAltOp())->getPredicate(), CostKind,
9932 I: E->getAltOp());
9933 } else {
9934 Type *SrcSclTy = E->getMainOp()->getOperand(i: 0)->getType();
9935 auto *SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
9936 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9937 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
9938 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
9939 unsigned SrcBWSz =
9940 DL->getTypeSizeInBits(Ty: E->getMainOp()->getOperand(i: 0)->getType());
9941 if (SrcIt != MinBWs.end()) {
9942 SrcBWSz = SrcIt->second.first;
9943 SrcSclTy = IntegerType::get(C&: SrcSclTy->getContext(), NumBits: SrcBWSz);
9944 SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
9945 }
9946 if (BWSz <= SrcBWSz) {
9947 if (BWSz < SrcBWSz)
9948 VecCost =
9949 TTIRef.getCastInstrCost(Opcode: Instruction::Trunc, Dst: VecTy, Src: SrcTy,
9950 CCH: TTI::CastContextHint::None, CostKind);
9951 LLVM_DEBUG({
9952 dbgs()
9953 << "SLP: alternate extension, which should be truncated.\n";
9954 E->dump();
9955 });
9956 return VecCost;
9957 }
9958 }
9959 VecCost = TTIRef.getCastInstrCost(Opcode: E->getOpcode(), Dst: VecTy, Src: SrcTy,
9960 CCH: TTI::CastContextHint::None, CostKind);
9961 VecCost +=
9962 TTIRef.getCastInstrCost(Opcode: E->getAltOpcode(), Dst: VecTy, Src: SrcTy,
9963 CCH: TTI::CastContextHint::None, CostKind);
9964 }
9965 SmallVector<int> Mask;
9966 E->buildAltOpShuffleMask(
9967 IsAltOp: [E](Instruction *I) {
9968 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9969 return I->getOpcode() == E->getAltOpcode();
9970 },
9971 Mask);
9972 VecCost += ::getShuffleCost(TTI: TTIRef, Kind: TargetTransformInfo::SK_PermuteTwoSrc,
9973 Tp: FinalVecTy, Mask);
9974 // Patterns like [fadd,fsub] can be combined into a single instruction
9975 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
9976 // need to take into account their order when looking for the most used
9977 // order.
9978 unsigned Opcode0 = E->getOpcode();
9979 unsigned Opcode1 = E->getAltOpcode();
9980 SmallBitVector OpcodeMask(getAltInstrMask(VL: E->Scalars, Opcode0, Opcode1));
9981 // If this pattern is supported by the target then we consider the
9982 // order.
9983 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9984 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9985 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9986 return AltVecCost < VecCost ? AltVecCost : VecCost;
9987 }
9988 // TODO: Check the reverse order too.
9989 return VecCost;
9990 };
9991 return GetCostDiff(GetScalarCost, GetVectorCost);
9992 }
9993 default:
9994 llvm_unreachable("Unknown instruction");
9995 }
9996}
9997
9998bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9999 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
10000 << VectorizableTree.size() << " is fully vectorizable.\n");
10001
10002 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
10003 SmallVector<int> Mask;
10004 return TE->isGather() &&
10005 !any_of(Range: TE->Scalars,
10006 P: [this](Value *V) { return EphValues.contains(Ptr: V); }) &&
10007 (allConstant(VL: TE->Scalars) || isSplat(VL: TE->Scalars) ||
10008 TE->Scalars.size() < Limit ||
10009 ((TE->getOpcode() == Instruction::ExtractElement ||
10010 all_of(Range: TE->Scalars, P: IsaPred<ExtractElementInst, UndefValue>)) &&
10011 isFixedVectorShuffle(VL: TE->Scalars, Mask)) ||
10012 (TE->isGather() && TE->getOpcode() == Instruction::Load &&
10013 !TE->isAltShuffle()));
10014 };
10015
10016 // We only handle trees of heights 1 and 2.
10017 if (VectorizableTree.size() == 1 &&
10018 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
10019 (ForReduction &&
10020 AreVectorizableGathers(VectorizableTree[0].get(),
10021 VectorizableTree[0]->Scalars.size()) &&
10022 VectorizableTree[0]->getVectorFactor() > 2)))
10023 return true;
10024
10025 if (VectorizableTree.size() != 2)
10026 return false;
10027
10028 // Handle splat and all-constants stores. Also try to vectorize tiny trees
10029 // whose second node is a gather with fewer scalar operands than the initial
10030 // tree element (it may be profitable to shuffle the second gather) or whose
10031 // scalars are extractelements that form a shuffle.
10032 SmallVector<int> Mask;
10033 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
10034 AreVectorizableGathers(VectorizableTree[1].get(),
10035 VectorizableTree[0]->Scalars.size()))
10036 return true;
10037
10038 // Gathering cost would be too much for tiny trees.
10039 if (VectorizableTree[0]->isGather() ||
10040 (VectorizableTree[1]->isGather() &&
10041 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
10042 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
10043 return false;
10044
10045 return true;
10046}
10047
10048static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
10049 TargetTransformInfo *TTI,
10050 bool MustMatchOrInst) {
10051 // Look past the root to find a source value. Arbitrarily follow the
10052 // path through operand 0 of any 'or'. Also, peek through optional
10053 // shift-left-by-multiple-of-8-bits.
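 // E.g. for a root like
 //   or (shl (zext i8 %hi to i32), 8), (zext i8 %lo to i32)
 // following operand 0 walks through the 'or' and the 'shl' and stops at the
 // zext, which is then required to be a zero-extended load below.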
10054 Value *ZextLoad = Root;
10055 const APInt *ShAmtC;
10056 bool FoundOr = false;
10057 while (!isa<ConstantExpr>(Val: ZextLoad) &&
10058 (match(V: ZextLoad, P: m_Or(L: m_Value(), R: m_Value())) ||
10059 (match(V: ZextLoad, P: m_Shl(L: m_Value(), R: m_APInt(Res&: ShAmtC))) &&
10060 ShAmtC->urem(RHS: 8) == 0))) {
10061 auto *BinOp = cast<BinaryOperator>(Val: ZextLoad);
10062 ZextLoad = BinOp->getOperand(i_nocapture: 0);
10063 if (BinOp->getOpcode() == Instruction::Or)
10064 FoundOr = true;
10065 }
10066 // Check that the walk ended at a zero-extended load (and saw an 'or' if required).
10067 Value *Load;
10068 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
10069 !match(V: ZextLoad, P: m_ZExt(Op: m_Value(V&: Load))) || !isa<LoadInst>(Val: Load))
10070 return false;
10071
10072 // Require that the total load bit width is a legal integer type.
10073 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
10074 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
10075 Type *SrcTy = Load->getType();
10076 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
10077 if (!TTI->isTypeLegal(Ty: IntegerType::get(C&: Root->getContext(), NumBits: LoadBitWidth)))
10078 return false;
10079
10080 // Everything matched - assume that we can fold the whole sequence using
10081 // load combining.
10082 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
10083 << *(cast<Instruction>(Root)) << "\n");
10084
10085 return true;
10086}
10087
10088bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
10089 if (RdxKind != RecurKind::Or)
10090 return false;
10091
10092 unsigned NumElts = VectorizableTree[0]->Scalars.size();
10093 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
10094 return isLoadCombineCandidateImpl(Root: FirstReduced, NumElts, TTI,
10095 /* MatchOr */ MustMatchOrInst: false);
10096}
10097
10098bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
10099 // Peek through a final sequence of stores and check if all operations are
10100 // likely to be load-combined.
10101 unsigned NumElts = Stores.size();
10102 for (Value *Scalar : Stores) {
10103 Value *X;
10104 if (!match(V: Scalar, P: m_Store(ValueOp: m_Value(V&: X), PointerOp: m_Value())) ||
10105 !isLoadCombineCandidateImpl(Root: X, NumElts, TTI, /* MatchOr */ MustMatchOrInst: true))
10106 return false;
10107 }
10108 return true;
10109}
10110
10111bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
10112 // No need to vectorize inserts of gathered values.
10113 if (VectorizableTree.size() == 2 &&
10114 isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
10115 VectorizableTree[1]->isGather() &&
10116 (VectorizableTree[1]->getVectorFactor() <= 2 ||
10117 !(isSplat(VL: VectorizableTree[1]->Scalars) ||
10118 allConstant(VL: VectorizableTree[1]->Scalars))))
10119 return true;
10120
10121 // If the graph includes only PHI nodes and gathers, it is definitely not
10122 // profitable to vectorize, so we can skip it as long as the cost threshold
10123 // is the default. The cost of vectorized PHI nodes is almost always 0 plus
10124 // the cost of gathers/buildvectors.
10125 constexpr int Limit = 4;
10126 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
10127 !VectorizableTree.empty() &&
10128 all_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
10129 return (TE->isGather() &&
10130 TE->getOpcode() != Instruction::ExtractElement &&
10131 count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <= Limit) ||
10132 TE->getOpcode() == Instruction::PHI;
10133 }))
10134 return true;
10135
10136 // We can vectorize the tree if its size is greater than or equal to the
10137 // minimum size specified by the MinTreeSize command line option.
10138 if (VectorizableTree.size() >= MinTreeSize)
10139 return false;
10140
10141 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
10142 // can vectorize it if we can prove it fully vectorizable.
10143 if (isFullyVectorizableTinyTree(ForReduction))
10144 return false;
10145
10146 // Check if any of the gather nodes forms an insertelement buildvector
10147 // somewhere.
10148 bool IsAllowedSingleBVNode =
10149 VectorizableTree.size() > 1 ||
10150 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
10151 !VectorizableTree.front()->isAltShuffle() &&
10152 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
10153 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
10154 allSameBlock(VL: VectorizableTree.front()->Scalars));
10155 if (any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
10156 return TE->isGather() && all_of(Range&: TE->Scalars, P: [&](Value *V) {
10157 return isa<ExtractElementInst, UndefValue>(Val: V) ||
10158 (IsAllowedSingleBVNode &&
10159 !V->hasNUsesOrMore(N: UsesLimit) &&
10160 any_of(Range: V->users(), P: IsaPred<InsertElementInst>));
10161 });
10162 }))
10163 return false;
10164
10165 assert((!VectorizableTree.empty() || ExternalUses.empty()) &&
10166 "We shouldn't have any external users");
10168
10169 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
10170 // vectorizable.
10171 return true;
10172}
10173
10174InstructionCost BoUpSLP::getSpillCost() const {
10175 // Walk from the bottom of the tree to the top, tracking which values are
10176 // live. When we see a call instruction that is not part of our tree,
10177 // query TTI to see if there is a cost to keeping values live over it
10178 // (for example, if spills and fills are required).
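 // For example, a vectorized value that is defined before a real call and
 // used after it may need to be spilled around that call; for each such call
 // the cost of keeping the currently live vectors alive is added via
 // TTI::getCostOfKeepingLiveOverCall.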
10179 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
10180 InstructionCost Cost = 0;
10181
10182 SmallPtrSet<Instruction *, 4> LiveValues;
10183 Instruction *PrevInst = nullptr;
10184
10185 // The entries in VectorizableTree are not necessarily ordered by their
10186 // position in basic blocks. Collect them and order them by dominance so later
10187 // instructions are guaranteed to be visited first. For instructions in
10188 // different basic blocks, we only scan to the beginning of the block, so
10189 // their order does not matter, as long as all instructions in a basic block
10190 // are grouped together. Using dominance ensures a deterministic order.
10191 SmallVector<Instruction *, 16> OrderedScalars;
10192 for (const auto &TEPtr : VectorizableTree) {
10193 if (TEPtr->State != TreeEntry::Vectorize)
10194 continue;
10195 Instruction *Inst = dyn_cast<Instruction>(Val: TEPtr->Scalars[0]);
10196 if (!Inst)
10197 continue;
10198 OrderedScalars.push_back(Elt: Inst);
10199 }
10200 llvm::sort(C&: OrderedScalars, Comp: [&](Instruction *A, Instruction *B) {
10201 auto *NodeA = DT->getNode(BB: A->getParent());
10202 auto *NodeB = DT->getNode(BB: B->getParent());
10203 assert(NodeA && "Should only process reachable instructions");
10204 assert(NodeB && "Should only process reachable instructions");
10205 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10206 "Different nodes should have different DFS numbers");
10207 if (NodeA != NodeB)
10208 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
10209 return B->comesBefore(Other: A);
10210 });
10211
10212 for (Instruction *Inst : OrderedScalars) {
10213 if (!PrevInst) {
10214 PrevInst = Inst;
10215 continue;
10216 }
10217
10218 // Update LiveValues.
10219 LiveValues.erase(Ptr: PrevInst);
10220 for (auto &J : PrevInst->operands()) {
10221 if (isa<Instruction>(Val: &*J) && getTreeEntry(V: &*J))
10222 LiveValues.insert(Ptr: cast<Instruction>(Val: &*J));
10223 }
10224
10225 LLVM_DEBUG({
10226 dbgs() << "SLP: #LV: " << LiveValues.size();
10227 for (auto *X : LiveValues)
10228 dbgs() << " " << X->getName();
10229 dbgs() << ", Looking at ";
10230 Inst->dump();
10231 });
10232
10233 // Now find the sequence of instructions between PrevInst and Inst.
10234 unsigned NumCalls = 0;
10235 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
10236 PrevInstIt =
10237 PrevInst->getIterator().getReverse();
10238 while (InstIt != PrevInstIt) {
10239 if (PrevInstIt == PrevInst->getParent()->rend()) {
10240 PrevInstIt = Inst->getParent()->rbegin();
10241 continue;
10242 }
10243
10244 auto NoCallIntrinsic = [this](Instruction *I) {
10245 if (auto *II = dyn_cast<IntrinsicInst>(Val: I)) {
10246 if (II->isAssumeLikeIntrinsic())
10247 return true;
10248 FastMathFlags FMF;
10249 SmallVector<Type *, 4> Tys;
10250 for (auto &ArgOp : II->args())
10251 Tys.push_back(Elt: ArgOp->getType());
10252 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: II))
10253 FMF = FPMO->getFastMathFlags();
10254 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
10255 FMF);
10256 InstructionCost IntrCost =
10257 TTI->getIntrinsicInstrCost(ICA, CostKind: TTI::TCK_RecipThroughput);
10258 InstructionCost CallCost = TTI->getCallInstrCost(
10259 F: nullptr, RetTy: II->getType(), Tys, CostKind: TTI::TCK_RecipThroughput);
10260 if (IntrCost < CallCost)
10261 return true;
10262 }
10263 return false;
10264 };
10265
10266 // Debug information does not impact spill cost.
10267 if (isa<CallBase>(Val: &*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10268 &*PrevInstIt != PrevInst)
10269 NumCalls++;
10270
10271 ++PrevInstIt;
10272 }
10273
10274 if (NumCalls) {
10275 SmallVector<Type *, 4> V;
10276 for (auto *II : LiveValues) {
10277 auto *ScalarTy = II->getType();
10278 if (auto *VectorTy = dyn_cast<FixedVectorType>(Val: ScalarTy))
10279 ScalarTy = VectorTy->getElementType();
10280 V.push_back(Elt: getWidenedType(ScalarTy, VF: BundleWidth));
10281 }
10282 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(Tys: V);
10283 }
10284
10285 PrevInst = Inst;
10286 }
10287
10288 return Cost;
10289}
10290
10291/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
10292/// the buildvector sequence.
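/// E.g. for the chain
///   %i0 = insertelement <4 x i32> poison, i32 %a, i32 0
///   %i1 = insertelement <4 x i32> %i0, i32 %b, i32 1
/// isFirstInsertElement(%i0, %i1) is true, since %i0 starts the buildvector
/// sequence that %i1 is built on top of.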
10293static bool isFirstInsertElement(const InsertElementInst *IE1,
10294 const InsertElementInst *IE2) {
10295 if (IE1 == IE2)
10296 return false;
10297 const auto *I1 = IE1;
10298 const auto *I2 = IE2;
10299 const InsertElementInst *PrevI1;
10300 const InsertElementInst *PrevI2;
10301 unsigned Idx1 = *getElementIndex(Inst: IE1);
10302 unsigned Idx2 = *getElementIndex(Inst: IE2);
10303 do {
10304 if (I2 == IE1)
10305 return true;
10306 if (I1 == IE2)
10307 return false;
10308 PrevI1 = I1;
10309 PrevI2 = I2;
10310 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10311 getElementIndex(Inst: I1).value_or(u&: Idx2) != Idx2)
10312 I1 = dyn_cast<InsertElementInst>(Val: I1->getOperand(i_nocapture: 0));
10313 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
10314 getElementIndex(Inst: I2).value_or(u&: Idx1) != Idx1)
10315 I2 = dyn_cast<InsertElementInst>(Val: I2->getOperand(i_nocapture: 0));
10316 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10317 llvm_unreachable("Two different buildvectors not expected.");
10318}
10319
10320namespace {
10321/// Returns the incoming Value * if the requested type is Value * too, or a
10322/// default-constructed value otherwise.
10323struct ValueSelect {
10324 template <typename U>
10325 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
10326 return V;
10327 }
10328 template <typename U>
10329 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
10330 return U();
10331 }
10332};
10333} // namespace
10334
10335/// Does the analysis of the provided shuffle masks and performs the requested
10336/// actions on the vectors with the given shuffle masks. It tries to do it in
10337/// several steps.
10338/// 1. If the Base vector is not an undef vector, resize the very first mask to
10339/// have a common VF and perform the action for 2 input vectors (including the
10340/// non-undef Base). Other shuffle masks are combined with the result of the
10341/// first stage and processed as a shuffle of 2 elements.
10342/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
10343/// the action only for 1 vector with the given mask, if it is not the identity
10344/// mask.
10345/// 3. If 2 or more masks are used, perform the remaining shuffle actions for 2
10346/// vectors, combining the masks properly between the steps.
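/// For example, with an undef Base and three shuffle masks, the first two
/// input vectors are shuffled into a single value and each remaining mask is
/// then folded into that intermediate result, one vector at a time.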
10347template <typename T>
10348static T *performExtractsShuffleAction(
10349 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
10350 function_ref<unsigned(T *)> GetVF,
10351 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
10352 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
10353 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
10354 SmallVector<int> Mask(ShuffleMask.begin()->second);
10355 auto VMIt = std::next(ShuffleMask.begin());
10356 T *Prev = nullptr;
10357 SmallBitVector UseMask =
10358 buildUseMask(VF: Mask.size(), Mask, MaskArg: UseMask::UndefsAsMask);
10359 SmallBitVector IsBaseUndef = isUndefVector(V: Base, UseMask);
10360 if (!IsBaseUndef.all()) {
10361 // Base is not undef, need to combine it with the next subvectors.
10362 std::pair<T *, bool> Res =
10363 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
10364 SmallBitVector IsBasePoison = isUndefVector<true>(V: Base, UseMask);
10365 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
10366 if (Mask[Idx] == PoisonMaskElem)
10367 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
10368 else
10369 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
10370 }
10371 auto *V = ValueSelect::get<T *>(Base);
10372 (void)V;
10373 assert((!V || GetVF(V) == Mask.size()) &&
10374 "Expected base vector of VF number of elements.");
10375 Prev = Action(Mask, {nullptr, Res.first});
10376 } else if (ShuffleMask.size() == 1) {
10377 // Base is undef and only 1 vector is shuffled - perform the action only for
10378 // single vector, if the mask is not the identity mask.
10379 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10380 /*ForSingleMask=*/true);
10381 if (Res.second)
10382 // Identity mask is found.
10383 Prev = Res.first;
10384 else
10385 Prev = Action(Mask, {ShuffleMask.begin()->first});
10386 } else {
10387 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
10388 // shuffles step by step, combining shuffle between the steps.
10389 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10390 unsigned Vec2VF = GetVF(VMIt->first);
10391 if (Vec1VF == Vec2VF) {
10392 // No need to resize the input vectors since they are of the same size; we
10393 // can shuffle them directly.
10394 ArrayRef<int> SecMask = VMIt->second;
10395 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10396 if (SecMask[I] != PoisonMaskElem) {
10397 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10398 Mask[I] = SecMask[I] + Vec1VF;
10399 }
10400 }
10401 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10402 } else {
10403 // Vectors of different sizes - resize and reshuffle.
10404 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10405 /*ForSingleMask=*/false);
10406 std::pair<T *, bool> Res2 =
10407 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10408 ArrayRef<int> SecMask = VMIt->second;
10409 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10410 if (Mask[I] != PoisonMaskElem) {
10411 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10412 if (Res1.second)
10413 Mask[I] = I;
10414 } else if (SecMask[I] != PoisonMaskElem) {
10415 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10416 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
10417 }
10418 }
10419 Prev = Action(Mask, {Res1.first, Res2.first});
10420 }
10421 VMIt = std::next(VMIt);
10422 }
10423 bool IsBaseNotUndef = !IsBaseUndef.all();
10424 (void)IsBaseNotUndef;
10425 // Perform requested actions for the remaining masks/vectors.
10426 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10427 // Shuffle other input vectors, if any.
10428 std::pair<T *, bool> Res =
10429 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10430 ArrayRef<int> SecMask = VMIt->second;
10431 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10432 if (SecMask[I] != PoisonMaskElem) {
10433 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
10434 "Multiple uses of scalars.");
10435 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
10436 } else if (Mask[I] != PoisonMaskElem) {
10437 Mask[I] = I;
10438 }
10439 }
10440 Prev = Action(Mask, {Prev, Res.first});
10441 }
10442 return Prev;
10443}
10444
10445InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
10446 InstructionCost Cost = 0;
10447 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
10448 << VectorizableTree.size() << ".\n");
10449
10450 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10451
10452 SmallPtrSet<Value *, 4> CheckedExtracts;
10453 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
10454 TreeEntry &TE = *VectorizableTree[I];
10455 if (TE.isGather()) {
10456 if (const TreeEntry *E = getTreeEntry(V: TE.getMainOp());
10457 E && E->getVectorFactor() == TE.getVectorFactor() &&
10458 E->isSame(VL: TE.Scalars)) {
10459 // Some gather nodes might be absolutely the same as some vectorizable
10460 // nodes after reordering; we need to handle that.
10461 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
10462 << shortBundleName(TE.Scalars) << ".\n"
10463 << "SLP: Current total cost = " << Cost << "\n");
10464 continue;
10465 }
10466 }
10467
10468 InstructionCost C = getEntryCost(E: &TE, VectorizedVals, CheckedExtracts);
10469 Cost += C;
10470 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
10471 << shortBundleName(TE.Scalars) << ".\n"
10472 << "SLP: Current total cost = " << Cost << "\n");
10473 }
10474
10475 SmallPtrSet<Value *, 16> ExtractCostCalculated;
10476 InstructionCost ExtractCost = 0;
10477 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
10478 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
10479 SmallVector<APInt> DemandedElts;
10480 SmallDenseSet<Value *, 4> UsedInserts;
10481 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
10482 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10483 for (ExternalUser &EU : ExternalUses) {
10484 // We only add extract cost once for the same scalar.
10485 if (!isa_and_nonnull<InsertElementInst>(Val: EU.User) &&
10486 !ExtractCostCalculated.insert(Ptr: EU.Scalar).second)
10487 continue;
10488
10489 // Uses by ephemeral values are free (because the ephemeral value will be
10490 // removed prior to code generation, and so the extraction will be
10491 // removed as well).
10492 if (EphValues.count(Ptr: EU.User))
10493 continue;
10494
10495 // No extract cost for vector "scalar"
10496 if (isa<FixedVectorType>(Val: EU.Scalar->getType()))
10497 continue;
10498
10499 // If the found user is an insertelement, do not calculate the extract cost
10500 // but try to detect it as a final shuffled/identity match.
10501 if (auto *VU = dyn_cast_or_null<InsertElementInst>(Val: EU.User);
10502 VU && VU->getOperand(i_nocapture: 1) == EU.Scalar) {
10503 if (auto *FTy = dyn_cast<FixedVectorType>(Val: VU->getType())) {
10504 if (!UsedInserts.insert(V: VU).second)
10505 continue;
10506 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
10507 if (InsertIdx) {
10508 const TreeEntry *ScalarTE = getTreeEntry(V: EU.Scalar);
10509 auto *It = find_if(
10510 Range&: FirstUsers,
10511 P: [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10512 return areTwoInsertFromSameBuildVector(
10513 VU, V: cast<InsertElementInst>(Val: Pair.first),
10514 GetBaseOperand: [this](InsertElementInst *II) -> Value * {
10515 Value *Op0 = II->getOperand(i_nocapture: 0);
10516 if (getTreeEntry(V: II) && !getTreeEntry(V: Op0))
10517 return nullptr;
10518 return Op0;
10519 });
10520 });
10521 int VecId = -1;
10522 if (It == FirstUsers.end()) {
10523 (void)ShuffleMasks.emplace_back();
10524 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10525 if (Mask.empty())
10526 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
10527 // Find the insertvector, vectorized in tree, if any.
10528 Value *Base = VU;
10529 while (auto *IEBase = dyn_cast<InsertElementInst>(Val: Base)) {
10530 if (IEBase != EU.User &&
10531 (!IEBase->hasOneUse() ||
10532 getElementIndex(Inst: IEBase).value_or(u&: *InsertIdx) == *InsertIdx))
10533 break;
10534 // Build the mask for the vectorized insertelement instructions.
10535 if (const TreeEntry *E = getTreeEntry(V: IEBase)) {
10536 VU = IEBase;
10537 do {
10538 IEBase = cast<InsertElementInst>(Val: Base);
10539 int Idx = *getElementIndex(Inst: IEBase);
10540 assert(Mask[Idx] == PoisonMaskElem &&
10541 "InsertElementInstruction used already.");
10542 Mask[Idx] = Idx;
10543 Base = IEBase->getOperand(i_nocapture: 0);
10544 } while (E == getTreeEntry(V: Base));
10545 break;
10546 }
10547 Base = cast<InsertElementInst>(Val: Base)->getOperand(i_nocapture: 0);
10548 }
10549 FirstUsers.emplace_back(Args&: VU, Args&: ScalarTE);
10550 DemandedElts.push_back(Elt: APInt::getZero(numBits: FTy->getNumElements()));
10551 VecId = FirstUsers.size() - 1;
10552 auto It = MinBWs.find(Val: ScalarTE);
10553 if (It != MinBWs.end() &&
10554 VectorCasts
10555 .insert(V: std::make_pair(x&: ScalarTE, y: FTy->getElementType()))
10556 .second) {
10557 unsigned BWSz = It->second.first;
10558 unsigned DstBWSz = DL->getTypeSizeInBits(Ty: FTy->getElementType());
10559 unsigned VecOpcode;
10560 if (DstBWSz < BWSz)
10561 VecOpcode = Instruction::Trunc;
10562 else
10563 VecOpcode =
10564 It->second.second ? Instruction::SExt : Instruction::ZExt;
10565 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10566 InstructionCost C = TTI->getCastInstrCost(
10567 Opcode: VecOpcode, Dst: FTy,
10568 Src: getWidenedType(ScalarTy: IntegerType::get(C&: FTy->getContext(), NumBits: BWSz),
10569 VF: FTy->getNumElements()),
10570 CCH: TTI::CastContextHint::None, CostKind);
10571 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10572 << " for extending externally used vector with "
10573 "non-equal minimum bitwidth.\n");
10574 Cost += C;
10575 }
10576 } else {
10577 if (isFirstInsertElement(IE1: VU, IE2: cast<InsertElementInst>(Val: It->first)))
10578 It->first = VU;
10579 VecId = std::distance(first: FirstUsers.begin(), last: It);
10580 }
10581 int InIdx = *InsertIdx;
10582 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10583 if (Mask.empty())
10584 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
10585 Mask[InIdx] = EU.Lane;
10586 DemandedElts[VecId].setBit(InIdx);
10587 continue;
10588 }
10589 }
10590 }
10591 // Leave the GEPs as is; they are free in most cases and it is better to
10592 // keep them as GEPs.
10593 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10594 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: EU.Scalar)) {
10595 if (!ValueToExtUses) {
10596 ValueToExtUses.emplace();
10597 for_each(Range: enumerate(First&: ExternalUses), F: [&](const auto &P) {
10598 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10599 });
10600 }
10601 // The original GEP can be used if every operand is either not vectorized
10602 // or is already marked as externally used.
10603 bool CanBeUsedAsGEP = all_of(Range: GEP->operands(), P: [&](Value *V) {
10604 if (!getTreeEntry(V))
10605 return true;
10606 auto It = ValueToExtUses->find(Val: V);
10607 if (It != ValueToExtUses->end()) {
10608 // Replace all uses to avoid compiler crash.
10609 ExternalUses[It->second].User = nullptr;
10610 return true;
10611 }
10612 return false;
10613 });
10614 if (CanBeUsedAsGEP) {
10615 ExtractCost += TTI->getInstructionCost(U: GEP, CostKind);
10616 ExternalUsesAsGEPs.insert(Ptr: EU.Scalar);
10617 continue;
10618 }
10619 }
10620
10621 // If we plan to rewrite the tree in a smaller type, we will need to sign
10622 // extend the extracted value back to the original type. Here, we account
10623 // for the extract and the added cost of the sign extend if needed.
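 // E.g. if the tree is rewritten in i8 but this user still expects the
 // original i32 scalar, the cost is taken from getExtractWithExtendCost
 // (extract plus sext/zext) rather than from a plain extractelement.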
10624 auto *VecTy = getWidenedType(ScalarTy: EU.Scalar->getType(), VF: BundleWidth);
10625 auto It = MinBWs.find(Val: getTreeEntry(V: EU.Scalar));
10626 if (It != MinBWs.end()) {
10627 auto *MinTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
10628 unsigned Extend =
10629 It->second.second ? Instruction::SExt : Instruction::ZExt;
10630 VecTy = getWidenedType(ScalarTy: MinTy, VF: BundleWidth);
10631 ExtractCost += TTI->getExtractWithExtendCost(Opcode: Extend, Dst: EU.Scalar->getType(),
10632 VecTy, Index: EU.Lane);
10633 } else {
10634 ExtractCost += TTI->getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
10635 CostKind, Index: EU.Lane);
10636 }
10637 }
10638 // Add reduced value cost, if resized.
10639 if (!VectorizedVals.empty()) {
10640 const TreeEntry &Root = *VectorizableTree.front();
10641 auto BWIt = MinBWs.find(Val: &Root);
10642 if (BWIt != MinBWs.end()) {
10643 Type *DstTy = Root.Scalars.front()->getType();
10644 unsigned OriginalSz = DL->getTypeSizeInBits(Ty: DstTy);
10645 unsigned SrcSz =
10646 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10647 if (OriginalSz != SrcSz) {
10648 unsigned Opcode = Instruction::Trunc;
10649 if (OriginalSz > SrcSz)
10650 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10651 Type *SrcTy = IntegerType::get(C&: DstTy->getContext(), NumBits: SrcSz);
10652 Cost += TTI->getCastInstrCost(Opcode, Dst: DstTy, Src: SrcTy,
10653 CCH: TTI::CastContextHint::None,
10654 CostKind: TTI::TCK_RecipThroughput);
10655 }
10656 }
10657 }
10658
10659 InstructionCost SpillCost = getSpillCost();
10660 Cost += SpillCost + ExtractCost;
10661 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10662 bool) {
10663 InstructionCost C = 0;
10664 unsigned VF = Mask.size();
10665 unsigned VecVF = TE->getVectorFactor();
10666 if (VF != VecVF &&
10667 (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10668 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))) {
10669 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10670 std::copy(Mask.begin(), std::next(x: Mask.begin(), n: std::min(a: VF, b: VecVF)),
10671 OrigMask.begin());
10672 C = TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc,
10673 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF),
10674 Mask: OrigMask);
10675 LLVM_DEBUG(
10676 dbgs() << "SLP: Adding cost " << C
10677 << " for final shuffle of insertelement external users.\n";
10678 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10679 Cost += C;
10680 return std::make_pair(x&: TE, y: true);
10681 }
10682 return std::make_pair(x&: TE, y: false);
10683 };
10684 // Calculate the cost of the reshuffled vectors, if any.
10685 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10686 Value *Base = cast<Instruction>(Val: FirstUsers[I].first)->getOperand(i: 0);
10687 auto Vector = ShuffleMasks[I].takeVector();
10688 unsigned VF = 0;
10689 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10690 ArrayRef<const TreeEntry *> TEs) {
10691 assert((TEs.size() == 1 || TEs.size() == 2) &&
10692 "Expected exactly 1 or 2 tree entries.");
10693 if (TEs.size() == 1) {
10694 if (VF == 0)
10695 VF = TEs.front()->getVectorFactor();
10696 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
10697 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF) &&
10698 !all_of(Range: enumerate(First&: Mask), P: [=](const auto &Data) {
10699 return Data.value() == PoisonMaskElem ||
10700 (Data.index() < VF &&
10701 static_cast<int>(Data.index()) == Data.value());
10702 })) {
10703 InstructionCost C =
10704 TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: FTy, Mask);
10705 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10706 << " for final shuffle of insertelement "
10707 "external users.\n";
10708 TEs.front()->dump();
10709 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10710 Cost += C;
10711 }
10712 } else {
10713 if (VF == 0) {
10714 if (TEs.front() &&
10715 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10716 VF = TEs.front()->getVectorFactor();
10717 else
10718 VF = Mask.size();
10719 }
10720 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
10721 InstructionCost C =
10722 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: FTy, Mask);
10723 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10724 << " for final shuffle of vector node and external "
10725 "insertelement users.\n";
10726 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10727 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10728 Cost += C;
10729 }
10730 VF = Mask.size();
10731 return TEs.back();
10732 };
10733 (void)performExtractsShuffleAction<const TreeEntry>(
10734 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()), Base,
10735 GetVF: [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeAction: ResizeToVF,
10736 Action: EstimateShufflesCost);
10737 InstructionCost InsertCost = TTI->getScalarizationOverhead(
10738 Ty: cast<FixedVectorType>(Val: FirstUsers[I].first->getType()), DemandedElts: DemandedElts[I],
10739 /*Insert*/ true, /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
10740 Cost -= InsertCost;
10741 }
10742
10743 // Add the cost for reduced value resize (if required).
10744 if (ReductionBitWidth != 0) {
10745 assert(UserIgnoreList && "Expected reduction tree.");
10746 const TreeEntry &E = *VectorizableTree.front();
10747 auto It = MinBWs.find(Val: &E);
10748 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10749 unsigned SrcSize = It->second.first;
10750 unsigned DstSize = ReductionBitWidth;
10751 unsigned Opcode = Instruction::Trunc;
10752 if (SrcSize < DstSize)
10753 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10754 auto *SrcVecTy =
10755 getWidenedType(ScalarTy: Builder.getIntNTy(N: SrcSize), VF: E.getVectorFactor());
10756 auto *DstVecTy =
10757 getWidenedType(ScalarTy: Builder.getIntNTy(N: DstSize), VF: E.getVectorFactor());
10758 TTI::CastContextHint CCH = getCastContextHint(TE: E);
10759 InstructionCost CastCost;
10760 switch (E.getOpcode()) {
10761 case Instruction::SExt:
10762 case Instruction::ZExt:
10763 case Instruction::Trunc: {
10764 const TreeEntry *OpTE = getOperandEntry(E: &E, Idx: 0);
10765 CCH = getCastContextHint(TE: *OpTE);
10766 break;
10767 }
10768 default:
10769 break;
10770 }
10771 CastCost += TTI->getCastInstrCost(Opcode, Dst: DstVecTy, Src: SrcVecTy, CCH,
10772 CostKind: TTI::TCK_RecipThroughput);
10773 Cost += CastCost;
10774 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10775 << " for final resize for reduction from " << SrcVecTy
10776 << " to " << DstVecTy << "\n";
10777 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10778 }
10779 }
10780
10781#ifndef NDEBUG
10782 SmallString<256> Str;
10783 {
10784 raw_svector_ostream OS(Str);
10785 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10786 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10787 << "SLP: Total Cost = " << Cost << ".\n";
10788 }
10789 LLVM_DEBUG(dbgs() << Str);
10790 if (ViewSLPTree)
10791 ViewGraph(this, "SLP" + F->getName(), false, Str);
10792#endif
10793
10794 return Cost;
10795}
10796
10797/// Tries to find extractelement instructions with constant indices from a
10798/// fixed vector type and gathers such instructions into a bunch, which can
10799/// very likely be matched as a shuffle of 1 or 2 input vectors. If this
10800/// attempt was successful, the matched scalars are replaced by poison values
10801/// in \p VL for future analysis.
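/// E.g. a gather of
///   extractelement <4 x i32> %v, i32 0 ... extractelement <4 x i32> %v, i32 3
/// (one extract per lane) can be represented as a single shuffle of %v, and
/// the matched extracts are then replaced by poison in \p VL.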
10802std::optional<TTI::ShuffleKind>
10803BoUpSLP::tryToGatherSingleRegisterExtractElements(
10804 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10805 // Scan list of gathered scalars for extractelements that can be represented
10806 // as shuffles.
10807 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10808 SmallVector<int> UndefVectorExtracts;
10809 for (int I = 0, E = VL.size(); I < E; ++I) {
10810 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
10811 if (!EI) {
10812 if (isa<UndefValue>(Val: VL[I]))
10813 UndefVectorExtracts.push_back(Elt: I);
10814 continue;
10815 }
10816 auto *VecTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
10817 if (!VecTy || !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()))
10818 continue;
10819 std::optional<unsigned> Idx = getExtractIndex(E: EI);
10820 // Undefined index.
10821 if (!Idx) {
10822 UndefVectorExtracts.push_back(Elt: I);
10823 continue;
10824 }
10825 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10826 ExtractMask.reset(Idx: *Idx);
10827 if (isUndefVector(V: EI->getVectorOperand(), UseMask: ExtractMask).all()) {
10828 UndefVectorExtracts.push_back(Elt: I);
10829 continue;
10830 }
10831 VectorOpToIdx[EI->getVectorOperand()].push_back(Elt: I);
10832 }
10833 // Sort the vector operands by the maximum number of uses in extractelements.
10834 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
10835 VectorOpToIdx.takeVector();
10836 stable_sort(Range&: Vectors, C: [](const auto &P1, const auto &P2) {
10837 return P1.second.size() > P2.second.size();
10838 });
10839 // Find the best pair of the vectors or a single vector.
10840 const int UndefSz = UndefVectorExtracts.size();
10841 unsigned SingleMax = 0;
10842 unsigned PairMax = 0;
10843 if (!Vectors.empty()) {
10844 SingleMax = Vectors.front().second.size() + UndefSz;
10845 if (Vectors.size() > 1) {
10846 auto *ItNext = std::next(x: Vectors.begin());
10847 PairMax = SingleMax + ItNext->second.size();
10848 }
10849 }
10850 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10851 return std::nullopt;
10852 // Check whether it is better to perform a shuffle of 2 vectors or just of
10853 // a single vector.
10854 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10855 SmallVector<Value *> GatheredExtracts(
10856 VL.size(), PoisonValue::get(T: VL.front()->getType()));
10857 if (SingleMax >= PairMax && SingleMax) {
10858 for (int Idx : Vectors.front().second)
10859 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
10860 } else if (!Vectors.empty()) {
10861 for (unsigned VecIdx : {0, 1})
10862 for (int Idx : Vectors[VecIdx].second)
10863 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
10864 }
10865 // Add extracts from undefs too.
10866 for (int Idx : UndefVectorExtracts)
10867 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
10868 // Check that the gather of extractelements can be represented as just a
10869 // shuffle of one or two vectors that the scalars are extracted from.
10870 std::optional<TTI::ShuffleKind> Res =
10871 isFixedVectorShuffle(VL: GatheredExtracts, Mask);
10872 if (!Res) {
10873 // TODO: try to check other subsets if possible.
10874 // Restore the original VL if attempt was not successful.
10875 copy(Range&: SavedVL, Out: VL.begin());
10876 return std::nullopt;
10877 }
10878 // Restore unused scalars from mask, if some of the extractelements were not
10879 // selected for shuffle.
10880 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10881 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(Val: GatheredExtracts[I]) &&
10882 isa<UndefValue>(Val: GatheredExtracts[I])) {
10883 std::swap(a&: VL[I], b&: GatheredExtracts[I]);
10884 continue;
10885 }
10886 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
10887 if (!EI || !isa<FixedVectorType>(Val: EI->getVectorOperandType()) ||
10888 !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()) ||
10889 is_contained(Range&: UndefVectorExtracts, Element: I))
10890 continue;
10891 }
10892 return Res;
10893}
10894
10895/// Same as tryToGatherSingleRegisterExtractElements, but splits \p VL into
10896/// \p NumParts slices and performs the analysis on each slice separately,
10897/// returning one shuffle kind (if any) per part.
10898/// The matched scalars are likewise replaced by poison values in \p VL for
10899/// future analysis.
10900SmallVector<std::optional<TTI::ShuffleKind>>
10901BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10902 SmallVectorImpl<int> &Mask,
10903 unsigned NumParts) const {
10904 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
10905 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10906 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
10907 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
10908 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
10909 // Scan list of gathered scalars for extractelements that can be represented
10910 // as shuffles.
10911 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
10912 N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part));
10913 SmallVector<int> SubMask;
10914 std::optional<TTI::ShuffleKind> Res =
10915 tryToGatherSingleRegisterExtractElements(VL: SubVL, Mask&: SubMask);
10916 ShufflesRes[Part] = Res;
10917 copy(Range&: SubMask, Out: std::next(x: Mask.begin(), n: Part * SliceSize));
10918 }
10919 if (none_of(Range&: ShufflesRes, P: [](const std::optional<TTI::ShuffleKind> &Res) {
10920 return Res.has_value();
10921 }))
10922 ShufflesRes.clear();
10923 return ShufflesRes;
10924}
10925
10926std::optional<TargetTransformInfo::ShuffleKind>
10927BoUpSLP::isGatherShuffledSingleRegisterEntry(
10928 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10929 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10930 Entries.clear();
10931 // TODO: currently checking only for Scalars in the tree entry, need to count
10932 // reused elements too for better cost estimation.
10933 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10934 const Instruction *TEInsertPt = &getLastInstructionInBundle(E: TEUseEI.UserTE);
10935 const BasicBlock *TEInsertBlock = nullptr;
10936 // Main node of PHI entries keeps the correct order of operands/incoming
10937 // blocks.
10938 if (auto *PHI = dyn_cast<PHINode>(Val: TEUseEI.UserTE->getMainOp())) {
10939 TEInsertBlock = PHI->getIncomingBlock(i: TEUseEI.EdgeIdx);
10940 TEInsertPt = TEInsertBlock->getTerminator();
10941 } else {
10942 TEInsertBlock = TEInsertPt->getParent();
10943 }
10944 if (!DT->isReachableFromEntry(A: TEInsertBlock))
10945 return std::nullopt;
10946 auto *NodeUI = DT->getNode(BB: TEInsertBlock);
10947 assert(NodeUI && "Should only process reachable instructions");
10948 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10949 auto CheckOrdering = [&](const Instruction *InsertPt) {
10950 // Argument InsertPt is an instruction where vector code for some other
10951 // tree entry (one that shares one or more scalars with TE) is going to be
10952 // generated. This lambda returns true if that insertion point dominates
10953 // the insertion point of the vector code for TE, i.e. the other entry's
10954 // vector value will be available for TE to reuse (otherwise the dependency
10955 // is the other way around). The other node is not limited to be of a gather
10956 // kind. Gather nodes are not scheduled and their vector code is inserted
10957 // before their first user. If the user is a PHI, that point is at the end
10958 // of the corresponding predecessor block. Otherwise it is the last
10959 // instruction among the scalars of the user node. So, instead of checking
10960 // the dependency between instructions themselves, we check the dependency
10961 // between their vector-code insertion points (each scalar ends up as a lane of a vector instruction).
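// For example (illustrative): if the other entry's vector code is emitted in
// a block that strictly dominates TEInsertBlock, its value is available at
// the TE insertion point and the lambda returns true.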
10962 const BasicBlock *InsertBlock = InsertPt->getParent();
10963 auto *NodeEUI = DT->getNode(BB: InsertBlock);
10964 if (!NodeEUI)
10965 return false;
10966 assert((NodeUI == NodeEUI) ==
10967 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10968 "Different nodes should have different DFS numbers");
10969 // Check the order of the gather nodes users.
10970 if (TEInsertPt->getParent() != InsertBlock &&
10971 (DT->dominates(A: NodeUI, B: NodeEUI) || !DT->dominates(A: NodeEUI, B: NodeUI)))
10972 return false;
10973 if (TEInsertPt->getParent() == InsertBlock &&
10974 TEInsertPt->comesBefore(Other: InsertPt))
10975 return false;
10976 return true;
10977 };
10978 // Find all tree entries used by the gathered values. If no common entries
10979 // are found - not a shuffle.
10980 // Here we build a set of tree nodes for each gathered value and try to
10981 // find the intersection between these sets. If we have at least one common
10982 // tree node for each gathered value - we have just a permutation of a
10983 // single vector. If we end up with 2 different sets, we have a permutation
10984 // of 2 input vectors.
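// For example (an illustrative case): if scalars %a and %b are both found in
// tree entry TE1 and scalar %c is found in both TE1 and TE2, the intersection
// keeps TE1 as the single common source. If instead %c is found only in TE2,
// we end up with two sets {TE1} and {TE2}, i.e. a permutation of 2 vectors.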
10985 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10986 DenseMap<Value *, int> UsedValuesEntry;
10987 for (Value *V : VL) {
10988 if (isConstant(V))
10989 continue;
10990 // Build a list of tree entries where V is used.
10991 SmallPtrSet<const TreeEntry *, 4> VToTEs;
10992 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(Val: V)->second) {
10993 if (TEPtr == TE)
10994 continue;
10995 assert(any_of(TEPtr->Scalars,
10996 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10997 "Must contain at least single gathered value.");
10998 assert(TEPtr->UserTreeIndices.size() == 1 &&
10999 "Expected only single user of a gather node.");
11000 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
11001
11002 PHINode *UserPHI = dyn_cast<PHINode>(Val: UseEI.UserTE->getMainOp());
11003 const Instruction *InsertPt =
11004 UserPHI ? UserPHI->getIncomingBlock(i: UseEI.EdgeIdx)->getTerminator()
11005 : &getLastInstructionInBundle(E: UseEI.UserTE);
11006 if (TEInsertPt == InsertPt) {
11007 // If 2 gathers are operands of the same entry (regardless of whether the
11008 // user is a PHI or something else), compare the operand indices and use
11009 // the earlier one as the base.
11010 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
11011 continue;
11012 // If the user instruction is used in different vectorized nodes for some
11013 // reason - make the choice depend on the node index.
11014 if (TEUseEI.UserTE != UseEI.UserTE &&
11015 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
11016 continue;
11017 }
11018
11019 // Check if the user node of the TE comes after the user node of TEPtr;
11020 // otherwise TEPtr depends on TE.
11021 if ((TEInsertBlock != InsertPt->getParent() ||
11022 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
11023 !CheckOrdering(InsertPt))
11024 continue;
11025 VToTEs.insert(Ptr: TEPtr);
11026 }
11027 if (const TreeEntry *VTE = getTreeEntry(V)) {
11028 if (ForOrder) {
11029 if (VTE->State != TreeEntry::Vectorize) {
11030 auto It = MultiNodeScalars.find(Val: V);
11031 if (It == MultiNodeScalars.end())
11032 continue;
11033 VTE = *It->getSecond().begin();
11034 // Iterate through all vectorized nodes.
11035 auto *MIt = find_if(Range&: It->getSecond(), P: [](const TreeEntry *MTE) {
11036 return MTE->State == TreeEntry::Vectorize;
11037 });
11038 if (MIt == It->getSecond().end())
11039 continue;
11040 VTE = *MIt;
11041 }
11042 }
11043 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
11044 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
11045 continue;
11046 VToTEs.insert(Ptr: VTE);
11047 }
11048 if (VToTEs.empty())
11049 continue;
11050 if (UsedTEs.empty()) {
11051 // On the first iteration, just insert the list of nodes into the vector.
11052 UsedTEs.push_back(Elt: VToTEs);
11053 UsedValuesEntry.try_emplace(Key: V, Args: 0);
11054 } else {
11055 // Need to check if there are any previously used tree nodes which use V.
11056 // If there are no such nodes, consider that we have one more input
11057 // vector.
11058 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
11059 unsigned Idx = 0;
11060 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
11061 // Do we have a non-empty intersection of previously listed tree entries
11062 // and tree entries using current V?
11063 set_intersect(S1&: VToTEs, S2: Set);
11064 if (!VToTEs.empty()) {
11065 // Yes, write the new subset and continue analysis for the next
11066 // scalar.
11067 Set.swap(RHS&: VToTEs);
11068 break;
11069 }
11070 VToTEs = SavedVToTEs;
11071 ++Idx;
11072 }
11073 // No non-empty intersection found - need to add a second set of possible
11074 // source vectors.
11075 if (Idx == UsedTEs.size()) {
11076 // If the number of input vectors is greater than 2 - not a permutation,
11077 // fall back to the regular gather.
11078 // TODO: support multiple reshuffled nodes.
11079 if (UsedTEs.size() == 2)
11080 continue;
11081 UsedTEs.push_back(Elt: SavedVToTEs);
11082 Idx = UsedTEs.size() - 1;
11083 }
11084 UsedValuesEntry.try_emplace(Key: V, Args&: Idx);
11085 }
11086 }
11087
11088 if (UsedTEs.empty()) {
11089 Entries.clear();
11090 return std::nullopt;
11091 }
11092
11093 unsigned VF = 0;
11094 if (UsedTEs.size() == 1) {
11095 // Keep the order to avoid non-determinism.
11096 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
11097 UsedTEs.front().end());
11098 sort(C&: FirstEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
11099 return TE1->Idx < TE2->Idx;
11100 });
11101 // First, try to find the perfect match in another gather node.
11102 auto *It = find_if(Range&: FirstEntries, P: [=](const TreeEntry *EntryPtr) {
11103 return EntryPtr->isSame(VL) || EntryPtr->isSame(VL: TE->Scalars);
11104 });
11105 if (It != FirstEntries.end() &&
11106 ((*It)->getVectorFactor() == VL.size() ||
11107 ((*It)->getVectorFactor() == TE->Scalars.size() &&
11108 TE->ReuseShuffleIndices.size() == VL.size() &&
11109 (*It)->isSame(VL: TE->Scalars)))) {
11110 Entries.push_back(Elt: *It);
11111 if ((*It)->getVectorFactor() == VL.size()) {
11112 std::iota(first: std::next(x: Mask.begin(), n: Part * VL.size()),
11113 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: 0);
11114 } else {
11115 SmallVector<int> CommonMask = TE->getCommonMask();
11116 copy(Range&: CommonMask, Out: Mask.begin());
11117 }
11118 // Clear undef scalars.
11119 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11120 if (isa<PoisonValue>(Val: VL[I]))
11121 Mask[I] = PoisonMaskElem;
11122 return TargetTransformInfo::SK_PermuteSingleSrc;
11123 }
11124 // No perfect match, just shuffle, so choose the first tree node from the
11125 // tree.
11126 Entries.push_back(Elt: FirstEntries.front());
11127 } else {
11128 // Try to find nodes with the same vector factor.
11129 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
11130 // Keep the order of tree nodes to avoid non-determinism.
11131 DenseMap<int, const TreeEntry *> VFToTE;
11132 for (const TreeEntry *TE : UsedTEs.front()) {
11133 unsigned VF = TE->getVectorFactor();
11134 auto It = VFToTE.find(Val: VF);
11135 if (It != VFToTE.end()) {
11136 if (It->second->Idx > TE->Idx)
11137 It->getSecond() = TE;
11138 continue;
11139 }
11140 VFToTE.try_emplace(Key: VF, Args&: TE);
11141 }
11142 // Same, keep the order to avoid non-determinism.
11143 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
11144 UsedTEs.back().end());
11145 sort(C&: SecondEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
11146 return TE1->Idx < TE2->Idx;
11147 });
11148 for (const TreeEntry *TE : SecondEntries) {
11149 auto It = VFToTE.find(Val: TE->getVectorFactor());
11150 if (It != VFToTE.end()) {
11151 VF = It->first;
11152 Entries.push_back(Elt: It->second);
11153 Entries.push_back(Elt: TE);
11154 break;
11155 }
11156 }
11157 // No 2 source vectors with the same vector factor - just choose 2 with max
11158 // index.
11159 if (Entries.empty()) {
11160 Entries.push_back(Elt: *llvm::max_element(
11161 Range&: UsedTEs.front(), C: [](const TreeEntry *TE1, const TreeEntry *TE2) {
11162 return TE1->Idx < TE2->Idx;
11163 }));
11164 Entries.push_back(Elt: SecondEntries.front());
11165 VF = std::max(a: Entries.front()->getVectorFactor(),
11166 b: Entries.back()->getVectorFactor());
11167 }
11168 }
11169
11170 bool IsSplatOrUndefs = isSplat(VL) || all_of(Range&: VL, P: IsaPred<UndefValue>);
11171 // Checks if the 2 PHIs are compatible, i.e. have a high chance of being
11172 // vectorized together.
11173 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
11174 auto *PHI = cast<PHINode>(Val: V);
11175 auto *PHI1 = cast<PHINode>(Val: V1);
11176 // Check that all incoming values are compatible/from the same parent (if
11177 // they are instructions).
11178 // The incoming values are compatible if they all are constants, or
11179 // instructions with the same/alternate opcodes from the same basic block.
11180 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
11181 Value *In = PHI->getIncomingValue(i: I);
11182 Value *In1 = PHI1->getIncomingValue(i: I);
11183 if (isConstant(V: In) && isConstant(V: In1))
11184 continue;
11185 if (!getSameOpcode(VL: {In, In1}, TLI: *TLI).getOpcode())
11186 return false;
11187 if (cast<Instruction>(Val: In)->getParent() !=
11188 cast<Instruction>(Val: In1)->getParent())
11189 return false;
11190 }
11191 return true;
11192 };
11193 // Check if the value can be ignored during analysis of shuffled gathers.
11194 // We suppose it is better to ignore instructions which do not form splats,
11195 // are not vectorized and not extractelements (those are handled by the
11196 // extractelements processing), or may form a vector node in the future.
11197 auto MightBeIgnored = [=](Value *V) {
11198 auto *I = dyn_cast<Instruction>(Val: V);
11199 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(Val: I) &&
11200 !isVectorLikeInstWithConstOps(V: I) &&
11201 !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList) && isSimple(I);
11202 };
11203 // Check that the neighbor instruction may form a full vector node with the
11204 // current instruction V. This is possible if they have the same/alternate
11205 // opcode and the same parent basic block.
11206 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
11207 Value *V1 = VL[Idx];
11208 bool UsedInSameVTE = false;
11209 auto It = UsedValuesEntry.find(Val: V1);
11210 if (It != UsedValuesEntry.end())
11211 UsedInSameVTE = It->second == UsedValuesEntry.find(Val: V)->second;
11212 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11213 getSameOpcode(VL: {V, V1}, TLI: *TLI).getOpcode() &&
11214 cast<Instruction>(Val: V)->getParent() ==
11215 cast<Instruction>(Val: V1)->getParent() &&
11216 (!isa<PHINode>(Val: V1) || AreCompatiblePHIs(V, V1));
11217 };
11218 // Build a shuffle mask for better cost estimation and vector emission.
11219 SmallBitVector UsedIdxs(Entries.size());
11220 SmallVector<std::pair<unsigned, int>> EntryLanes;
11221 for (int I = 0, E = VL.size(); I < E; ++I) {
11222 Value *V = VL[I];
11223 auto It = UsedValuesEntry.find(Val: V);
11224 if (It == UsedValuesEntry.end())
11225 continue;
11226 // Do not try to shuffle scalars if they are constants, or instructions
11227 // that may be vectorized later as part of the subsequent buildvector
11228 // vectorization.
11229 if (isConstant(V) || (MightBeIgnored(V) &&
11230 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
11231 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
11232 continue;
11233 unsigned Idx = It->second;
11234 EntryLanes.emplace_back(Args&: Idx, Args&: I);
11235 UsedIdxs.set(Idx);
11236 }
11237 // Iterate through all shuffled scalars and select entries which can be used
11238 // for the final shuffle.
11239 SmallVector<const TreeEntry *> TempEntries;
11240 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
11241 if (!UsedIdxs.test(Idx: I))
11242 continue;
11243 // Fix the entry number for the given scalar. If it is the first entry, set
11244 // Pair.first to 0, otherwise to 1 (currently at most 2 nodes are selected).
11245 // These indices are used as the vector offset when calculating the final
11246 // shuffle mask.
11247 for (std::pair<unsigned, int> &Pair : EntryLanes)
11248 if (Pair.first == I)
11249 Pair.first = TempEntries.size();
11250 TempEntries.push_back(Elt: Entries[I]);
11251 }
11252 Entries.swap(RHS&: TempEntries);
11253 if (EntryLanes.size() == Entries.size() &&
11254 !VL.equals(RHS: ArrayRef(TE->Scalars)
11255 .slice(N: Part * VL.size(),
11256 M: std::min<int>(a: VL.size(), b: TE->Scalars.size())))) {
11257 // We may have only 1 or 2 entries here. If the number of scalars is equal
11258 // to the number of entries, no need to do the analysis, it is not very
11259 // profitable. Since VL is not the same as TE->Scalars, it means we already
11260 // have some shuffles before. Cut off this unprofitable case.
11261 Entries.clear();
11262 return std::nullopt;
11263 }
11264 // Build the final mask, check for the identity shuffle, if possible.
11265 bool IsIdentity = Entries.size() == 1;
11266 // Pair.first is the index of the source vector (used as the vector offset),
11267 // while Pair.second is the index of the scalar in the list.
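// For example (illustrative): with VF == 4, a scalar at list position 3 found
// at lane 2 of the second selected entry (Pair.first == 1) gets the mask
// value 1 * 4 + 2 == 6.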
11268 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
11269 unsigned Idx = Part * VL.size() + Pair.second;
11270 Mask[Idx] =
11271 Pair.first * VF +
11272 (ForOrder ? std::distance(
11273 first: Entries[Pair.first]->Scalars.begin(),
11274 last: find(Range: Entries[Pair.first]->Scalars, Val: VL[Pair.second]))
11275 : Entries[Pair.first]->findLaneForValue(V: VL[Pair.second]));
11276 IsIdentity &= Mask[Idx] == Pair.second;
11277 }
11278 switch (Entries.size()) {
11279 case 1:
11280 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11281 return TargetTransformInfo::SK_PermuteSingleSrc;
11282 break;
11283 case 2:
11284 if (EntryLanes.size() > 2 || VL.size() <= 2)
11285 return TargetTransformInfo::SK_PermuteTwoSrc;
11286 break;
11287 default:
11288 break;
11289 }
11290 Entries.clear();
11291 // Clear the corresponding mask elements.
11292 std::fill(std::next(x: Mask.begin(), n: Part * VL.size()),
11293 std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), PoisonMaskElem);
11294 return std::nullopt;
11295}
11296
11297SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
11298BoUpSLP::isGatherShuffledEntry(
11299 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
11300 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
11301 bool ForOrder) {
11302 assert(NumParts > 0 && NumParts < VL.size() &&
11303 "Expected positive number of registers.");
11304 Entries.clear();
11305 // No need to check for the topmost gather node.
11306 if (TE == VectorizableTree.front().get())
11307 return {};
11308 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
11309 if (TE->isNonPowOf2Vec())
11310 return {};
11311 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
11312 assert(TE->UserTreeIndices.size() == 1 &&
11313 "Expected only single user of the gather node.");
11314 assert(VL.size() % NumParts == 0 &&
11315 "Number of scalars must be divisible by NumParts.");
11316 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
11317 SmallVector<std::optional<TTI::ShuffleKind>> Res;
11318 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
11319 ArrayRef<Value *> SubVL =
11320 VL.slice(N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part));
11321 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
11322 std::optional<TTI::ShuffleKind> SubRes =
11323 isGatherShuffledSingleRegisterEntry(TE, VL: SubVL, Mask, Entries&: SubEntries, Part,
11324 ForOrder);
11325 if (!SubRes)
11326 SubEntries.clear();
11327 Res.push_back(Elt: SubRes);
11328 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
11329 SubEntries.front()->getVectorFactor() == VL.size() &&
11330 (SubEntries.front()->isSame(VL: TE->Scalars) ||
11331 SubEntries.front()->isSame(VL))) {
11332 SmallVector<const TreeEntry *> LocalSubEntries;
11333 LocalSubEntries.swap(RHS&: SubEntries);
11334 Entries.clear();
11335 Res.clear();
11336 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
11337 // Clear undef scalars.
11338 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11339 if (isa<PoisonValue>(Val: VL[I]))
11340 Mask[I] = PoisonMaskElem;
11341 Entries.emplace_back(Args: 1, Args&: LocalSubEntries.front());
11342 Res.push_back(Elt: TargetTransformInfo::SK_PermuteSingleSrc);
11343 return Res;
11344 }
11345 }
11346 if (all_of(Range&: Res,
11347 P: [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
11348 Entries.clear();
11349 return {};
11350 }
11351 return Res;
11352}
11353
11354InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
11355 Type *ScalarTy) const {
11356 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
11357 bool DuplicateNonConst = false;
11358 // Find the cost of inserting/extracting values from the vector.
11359 // Check if the same elements are inserted several times and count them as
11360 // shuffle candidates.
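// For example (illustrative): for VL == {%a, %b, %a, 0} with ForPoisonSrc set,
// lanes 0 and 1 are costed as inserts, lane 3 is a constant, and lane 2
// repeats %a, so the repeat is modeled by a single-source shuffle with mask
// <0, 1, 0, 3> instead of an extra insert.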
11361 APInt ShuffledElements = APInt::getZero(numBits: VL.size());
11362 DenseMap<Value *, unsigned> UniqueElements;
11363 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11364 InstructionCost Cost;
11365 auto EstimateInsertCost = [&](unsigned I, Value *V) {
11366 if (V->getType() != ScalarTy) {
11367 Cost += TTI->getCastInstrCost(Opcode: Instruction::Trunc, Dst: ScalarTy, Src: V->getType(),
11368 CCH: TTI::CastContextHint::None, CostKind);
11369 V = nullptr;
11370 }
11371 if (!ForPoisonSrc)
11372 Cost +=
11373 TTI->getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind,
11374 Index: I, Op0: Constant::getNullValue(Ty: VecTy), Op1: V);
11375 };
11376 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
11377 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
11378 Value *V = VL[I];
11379 // No need to shuffle duplicates for constants.
11380 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(Val: V)) {
11381 ShuffledElements.setBit(I);
11382 ShuffleMask[I] = isa<PoisonValue>(Val: V) ? PoisonMaskElem : I;
11383 continue;
11384 }
11385
11386 auto Res = UniqueElements.try_emplace(Key: V, Args&: I);
11387 if (Res.second) {
11388 EstimateInsertCost(I, V);
11389 ShuffleMask[I] = I;
11390 continue;
11391 }
11392
11393 DuplicateNonConst = true;
11394 ShuffledElements.setBit(I);
11395 ShuffleMask[I] = Res.first->second;
11396 }
11397 if (ForPoisonSrc)
11398 Cost =
11399 TTI->getScalarizationOverhead(Ty: VecTy, DemandedElts: ~ShuffledElements, /*Insert*/ true,
11400 /*Extract*/ false, CostKind);
11401 if (DuplicateNonConst)
11402 Cost += TTI->getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc,
11403 Tp: VecTy, Mask: ShuffleMask);
11404 return Cost;
11405}
11406
11407// Perform operand reordering on the instructions in VL and return the reordered
11408// operands in Left and Right.
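// For example (illustrative): for VL == {add %a, %b; add %c, %d} the result
// may be Left == {%a, %c} and Right == {%b, %d}, with operands swapped per
// lane where that makes the operand vectors more isomorphic.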
11409void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
11410 SmallVectorImpl<Value *> &Left,
11411 SmallVectorImpl<Value *> &Right,
11412 const BoUpSLP &R) {
11413 if (VL.empty())
11414 return;
11415 VLOperands Ops(VL, R);
11416 // Reorder the operands in place.
11417 Ops.reorder();
11418 Left = Ops.getVL(OpIdx: 0);
11419 Right = Ops.getVL(OpIdx: 1);
11420}
11421
11422Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
11423 auto &Res = EntryToLastInstruction.FindAndConstruct(Key: E);
11424 if (Res.second)
11425 return *Res.second;
11426 // Get the basic block this bundle is in. All instructions in the bundle
11427 // should be in this block (except for extractelement-like instructions with
11428 // constant indices).
11429 auto *Front = E->getMainOp();
11430 auto *BB = Front->getParent();
11431 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
11432 if (E->getOpcode() == Instruction::GetElementPtr &&
11433 !isa<GetElementPtrInst>(V))
11434 return true;
11435 auto *I = cast<Instruction>(V);
11436 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11437 isVectorLikeInstWithConstOps(I);
11438 }));
11439
11440 auto FindLastInst = [&]() {
11441 Instruction *LastInst = Front;
11442 for (Value *V : E->Scalars) {
11443 auto *I = dyn_cast<Instruction>(Val: V);
11444 if (!I)
11445 continue;
11446 if (LastInst->getParent() == I->getParent()) {
11447 if (LastInst->comesBefore(Other: I))
11448 LastInst = I;
11449 continue;
11450 }
11451 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11452 !isa<GetElementPtrInst>(I)) ||
11453 (isVectorLikeInstWithConstOps(LastInst) &&
11454 isVectorLikeInstWithConstOps(I))) &&
11455 "Expected vector-like or non-GEP in GEP node insts only.");
11456 if (!DT->isReachableFromEntry(A: LastInst->getParent())) {
11457 LastInst = I;
11458 continue;
11459 }
11460 if (!DT->isReachableFromEntry(A: I->getParent()))
11461 continue;
11462 auto *NodeA = DT->getNode(BB: LastInst->getParent());
11463 auto *NodeB = DT->getNode(BB: I->getParent());
11464 assert(NodeA && "Should only process reachable instructions");
11465 assert(NodeB && "Should only process reachable instructions");
11466 assert((NodeA == NodeB) ==
11467 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11468 "Different nodes should have different DFS numbers");
11469 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11470 LastInst = I;
11471 }
11472 BB = LastInst->getParent();
11473 return LastInst;
11474 };
11475
11476 auto FindFirstInst = [&]() {
11477 Instruction *FirstInst = Front;
11478 for (Value *V : E->Scalars) {
11479 auto *I = dyn_cast<Instruction>(Val: V);
11480 if (!I)
11481 continue;
11482 if (FirstInst->getParent() == I->getParent()) {
11483 if (I->comesBefore(Other: FirstInst))
11484 FirstInst = I;
11485 continue;
11486 }
11487 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11488 !isa<GetElementPtrInst>(I)) ||
11489 (isVectorLikeInstWithConstOps(FirstInst) &&
11490 isVectorLikeInstWithConstOps(I))) &&
11491 "Expected vector-like or non-GEP in GEP node insts only.");
11492 if (!DT->isReachableFromEntry(A: FirstInst->getParent())) {
11493 FirstInst = I;
11494 continue;
11495 }
11496 if (!DT->isReachableFromEntry(A: I->getParent()))
11497 continue;
11498 auto *NodeA = DT->getNode(BB: FirstInst->getParent());
11499 auto *NodeB = DT->getNode(BB: I->getParent());
11500 assert(NodeA && "Should only process reachable instructions");
11501 assert(NodeB && "Should only process reachable instructions");
11502 assert((NodeA == NodeB) ==
11503 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11504 "Different nodes should have different DFS numbers");
11505 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11506 FirstInst = I;
11507 }
11508 return FirstInst;
11509 };
11510
11511 // Set the insert point to the beginning of the basic block if the entry
11512 // should not be scheduled.
11513 if (doesNotNeedToSchedule(VL: E->Scalars) ||
11514 (!E->isGather() && all_of(Range: E->Scalars, P: isVectorLikeInstWithConstOps))) {
11515 if ((E->getOpcode() == Instruction::GetElementPtr &&
11516 any_of(Range: E->Scalars,
11517 P: [](Value *V) {
11518 return !isa<GetElementPtrInst>(Val: V) && isa<Instruction>(Val: V);
11519 })) ||
11520 all_of(Range: E->Scalars,
11521 P: [](Value *V) {
11522 return !isVectorLikeInstWithConstOps(V) &&
11523 isUsedOutsideBlock(V);
11524 }) ||
11525 (E->isGather() && E->Idx == 0 && all_of(Range: E->Scalars, P: [](Value *V) {
11526 return isa<ExtractElementInst, UndefValue>(Val: V) ||
11527 areAllOperandsNonInsts(V);
11528 })))
11529 Res.second = FindLastInst();
11530 else
11531 Res.second = FindFirstInst();
11532 return *Res.second;
11533 }
11534
11535 // Find the last instruction. The common case should be that BB has been
11536 // scheduled, and the last instruction is VL.back(). So we start with
11537 // VL.back() and iterate over schedule data until we reach the end of the
11538 // bundle. The end of the bundle is marked by null ScheduleData.
11539 if (BlocksSchedules.count(Key: BB)) {
11540 Value *V = E->isOneOf(Op: E->Scalars.back());
11541 if (doesNotNeedToBeScheduled(V))
11542 V = *find_if_not(Range: E->Scalars, P: doesNotNeedToBeScheduled);
11543 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11544 if (Bundle && Bundle->isPartOfBundle())
11545 for (; Bundle; Bundle = Bundle->NextInBundle)
11546 if (Bundle->OpValue == Bundle->Inst)
11547 Res.second = Bundle->Inst;
11548 }
11549
11550 // LastInst can still be null at this point if there's either not an entry
11551 // for BB in BlocksSchedules or there's no ScheduleData available for
11552 // VL.back(). This can be the case if buildTree_rec aborts for various
11553 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11554 // size is reached, etc.). ScheduleData is initialized in the scheduling
11555 // "dry-run".
11556 //
11557 // If this happens, we can still find the last instruction by brute force. We
11558 // iterate forwards from Front (inclusive) until we either see all
11559 // instructions in the bundle or reach the end of the block. If Front is the
11560 // last instruction in program order, LastInst will be set to Front, and we
11561 // will visit all the remaining instructions in the block.
11562 //
11563 // One of the reasons we exit early from buildTree_rec is to place an upper
11564 // bound on compile-time. Thus, taking an additional compile-time hit here is
11565 // not ideal. However, this should be exceedingly rare since it requires that
11566 // we both exit early from buildTree_rec and that the bundle be out-of-order
11567 // (causing us to iterate all the way to the end of the block).
11568 if (!Res.second)
11569 Res.second = FindLastInst();
11570 assert(Res.second && "Failed to find last instruction in bundle");
11571 return *Res.second;
11572}
11573
11574void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11575 auto *Front = E->getMainOp();
11576 Instruction *LastInst = &getLastInstructionInBundle(E);
11577 assert(LastInst && "Failed to find last instruction in bundle");
11578 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11579 // If the instruction is a PHI, set the insert point after all the PHIs.
11580 bool IsPHI = isa<PHINode>(Val: LastInst);
11581 if (IsPHI)
11582 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11583 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(VL: E->Scalars))) {
11584 Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: LastInstIt);
11585 } else {
11586 // Set the insertion point after the last instruction in the bundle. Set the
11587 // debug location to Front.
11588 Builder.SetInsertPoint(
11589 TheBB: LastInst->getParent(),
11590 IP: LastInst->getNextNonDebugInstruction()->getIterator());
11591 }
11592 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11593}
11594
11595Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
11596 // List of instructions/lanes from the current block and/or the blocks which
11597 // are part of the current loop. These instructions will be inserted at the
11598 // end to make it possible to optimize loops and hoist invariant instructions
11599 // out of the loop body with better chances for success.
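// For example (illustrative): when gathering {%inv0, %loop_val, %inv1} at a
// point inside a loop, the insertelement for %loop_val is emitted last, so the
// chain building the loop-invariant part can still be hoisted out of the loop.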
11600 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
11601 SmallSet<int, 4> PostponedIndices;
11602 Loop *L = LI->getLoopFor(BB: Builder.GetInsertBlock());
11603 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11604 SmallPtrSet<BasicBlock *, 4> Visited;
11605 while (InsertBB && InsertBB != InstBB && Visited.insert(Ptr: InsertBB).second)
11606 InsertBB = InsertBB->getSinglePredecessor();
11607 return InsertBB && InsertBB == InstBB;
11608 };
11609 for (int I = 0, E = VL.size(); I < E; ++I) {
11610 if (auto *Inst = dyn_cast<Instruction>(Val: VL[I]))
11611 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11612 getTreeEntry(V: Inst) ||
11613 (L && (!Root || L->isLoopInvariant(V: Root)) && L->contains(Inst))) &&
11614 PostponedIndices.insert(V: I).second)
11615 PostponedInsts.emplace_back(Args&: Inst, Args&: I);
11616 }
11617
11618 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11619 Type *Ty) {
11620 Value *Scalar = V;
11621 if (Scalar->getType() != Ty) {
11622 assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11623 "Expected integer types only.");
11624 Value *V = Scalar;
11625 if (auto *CI = dyn_cast<CastInst>(Val: Scalar);
11626 isa_and_nonnull<SExtInst, ZExtInst>(Val: CI)) {
11627 Value *Op = CI->getOperand(i_nocapture: 0);
11628 if (auto *IOp = dyn_cast<Instruction>(Val: Op);
11629 !IOp || !(isDeleted(I: IOp) || getTreeEntry(V: IOp)))
11630 V = Op;
11631 }
11632 Scalar = Builder.CreateIntCast(
11633 V, DestTy: Ty, isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
11634 }
11635
11636 Vec = Builder.CreateInsertElement(Vec, NewElt: Scalar, Idx: Builder.getInt32(C: Pos));
11637 auto *InsElt = dyn_cast<InsertElementInst>(Val: Vec);
11638 if (!InsElt)
11639 return Vec;
11640 GatherShuffleExtractSeq.insert(X: InsElt);
11641 CSEBlocks.insert(V: InsElt->getParent());
11642 // Add to our 'need-to-extract' list.
11643 if (isa<Instruction>(Val: V)) {
11644 if (TreeEntry *Entry = getTreeEntry(V)) {
11645 // Find which lane we need to extract.
11646 User *UserOp = nullptr;
11647 if (Scalar != V) {
11648 if (auto *SI = dyn_cast<Instruction>(Val: Scalar))
11649 UserOp = SI;
11650 } else {
11651 UserOp = InsElt;
11652 }
11653 if (UserOp) {
11654 unsigned FoundLane = Entry->findLaneForValue(V);
11655 ExternalUses.emplace_back(Args&: V, Args&: UserOp, Args&: FoundLane);
11656 }
11657 }
11658 }
11659 return Vec;
11660 };
11661 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
11662 Value *Vec = Root ? Root : PoisonValue::get(T: VecTy);
11663 SmallVector<int> NonConsts;
11664 // Insert constant values first.
11665 for (int I = 0, E = VL.size(); I < E; ++I) {
11666 if (PostponedIndices.contains(V: I))
11667 continue;
11668 if (!isConstant(V: VL[I])) {
11669 NonConsts.push_back(Elt: I);
11670 continue;
11671 }
11672 if (Root) {
11673 if (!isa<UndefValue>(Val: VL[I])) {
11674 NonConsts.push_back(Elt: I);
11675 continue;
11676 }
11677 if (isa<PoisonValue>(Val: VL[I]))
11678 continue;
11679 if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Root)) {
11680 if (SV->getMaskValue(Elt: I) == PoisonMaskElem)
11681 continue;
11682 }
11683 }
11684 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11685 }
11686 // Insert non-constant values.
11687 for (int I : NonConsts)
11688 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11689 // Append the instructions which are or may be part of the loop at the end,
11690 // to make it possible to hoist non-loop-based instructions.
11691 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11692 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11693
11694 return Vec;
11695}
11696
11697/// Merges shuffle masks and emits the final shuffle instruction, if required.
11698/// It supports shuffling of 2 input vectors. It implements lazy shuffle
11699/// emission: the actual shuffle instruction is generated only if it is really
11700/// required. Otherwise, the shuffle instruction emission is delayed until the
11701/// end of the process, to reduce the number of emitted instructions and
11702/// simplify further analysis/transformations.
11703/// The class will also look through the previously emitted shuffle
11704/// instructions and properly mark indices in the mask as undef.
11705/// For example, given the code
11706/// \code
11707/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11708/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11709/// \endcode
11710/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
11711/// will look through %s1 and %s2 and emit
11712/// \code
11713/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11714/// \endcode
11715/// instead.
11716/// If the 2 operands are of different sizes, the smaller one will be resized
11717/// and the mask recalculated accordingly.
11718/// For example, given the code
11719/// \code
11720/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11721/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11722/// \endcode
11723/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
11724/// will look through %s1 and %s2 and emit
11725/// \code
11726/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11727/// \endcode
11728/// instead.
11729class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11730 bool IsFinalized = false;
11731 /// Combined mask for all applied operands and masks. It is built during
11732 /// analysis and actual emission of shuffle vector instructions.
11733 SmallVector<int> CommonMask;
11734 /// List of operands for the shuffle vector instruction. It holds at most 2
11735 /// operands. If the 3rd one is going to be added, the first 2 are combined
11736 /// into a shuffle with the \p CommonMask mask, the first operand is set to
11737 /// the resulting shuffle and the second operand is set to the newly added
11738 /// operand. The \p CommonMask is transformed in the proper way after that.
11739 SmallVector<Value *, 2> InVectors;
11740 Type *ScalarTy = nullptr;
11741 IRBuilderBase &Builder;
11742 BoUpSLP &R;
11743
11744 class ShuffleIRBuilder {
11745 IRBuilderBase &Builder;
11746 /// Holds all of the instructions that we gathered.
11747 SetVector<Instruction *> &GatherShuffleExtractSeq;
11748 /// A list of blocks that we are going to CSE.
11749 DenseSet<BasicBlock *> &CSEBlocks;
11750 /// Data layout.
11751 const DataLayout &DL;
11752
11753 public:
11754 ShuffleIRBuilder(IRBuilderBase &Builder,
11755 SetVector<Instruction *> &GatherShuffleExtractSeq,
11756 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11757 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11758 CSEBlocks(CSEBlocks), DL(DL) {}
11759 ~ShuffleIRBuilder() = default;
11760 /// Creates shufflevector for the 2 operands with the given mask.
11761 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11762 if (V1->getType() != V2->getType()) {
11763 assert(V1->getType()->isIntOrIntVectorTy() &&
11764 V1->getType()->isIntOrIntVectorTy() &&
11765 "Expected integer vector types only.");
11766 if (V1->getType() != V2->getType()) {
11767 if (cast<VectorType>(Val: V2->getType())
11768 ->getElementType()
11769 ->getIntegerBitWidth() < cast<VectorType>(Val: V1->getType())
11770 ->getElementType()
11771 ->getIntegerBitWidth())
11772 V2 = Builder.CreateIntCast(
11773 V: V2, DestTy: V1->getType(), isSigned: !isKnownNonNegative(V: V2, SQ: SimplifyQuery(DL)));
11774 else
11775 V1 = Builder.CreateIntCast(
11776 V: V1, DestTy: V2->getType(), isSigned: !isKnownNonNegative(V: V1, SQ: SimplifyQuery(DL)));
11777 }
11778 }
11779 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11780 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
11781 GatherShuffleExtractSeq.insert(X: I);
11782 CSEBlocks.insert(V: I->getParent());
11783 }
11784 return Vec;
11785 }
11786 /// Creates a permutation of the single vector operand with the given mask,
11787 /// if it is not an identity mask.
11788 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11789 if (Mask.empty())
11790 return V1;
11791 unsigned VF = Mask.size();
11792 unsigned LocalVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
11793 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))
11794 return V1;
11795 Value *Vec = Builder.CreateShuffleVector(V: V1, Mask);
11796 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
11797 GatherShuffleExtractSeq.insert(X: I);
11798 CSEBlocks.insert(V: I->getParent());
11799 }
11800 return Vec;
11801 }
11802 Value *createIdentity(Value *V) { return V; }
11803 Value *createPoison(Type *Ty, unsigned VF) {
11804 return PoisonValue::get(T: getWidenedType(ScalarTy: Ty, VF));
11805 }
11806 /// Resizes the 2 input vectors to match in size, if they are not equal
11807 /// yet. The smaller vector is resized to the size of the larger vector.
11808 void resizeToMatch(Value *&V1, Value *&V2) {
11809 if (V1->getType() == V2->getType())
11810 return;
11811 int V1VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
11812 int V2VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
11813 int VF = std::max(a: V1VF, b: V2VF);
11814 int MinVF = std::min(a: V1VF, b: V2VF);
11815 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11816 std::iota(first: IdentityMask.begin(), last: std::next(x: IdentityMask.begin(), n: MinVF),
11817 value: 0);
11818 Value *&Op = MinVF == V1VF ? V1 : V2;
11819 Op = Builder.CreateShuffleVector(V: Op, Mask: IdentityMask);
11820 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
11821 GatherShuffleExtractSeq.insert(X: I);
11822 CSEBlocks.insert(V: I->getParent());
11823 }
11824 if (MinVF == V1VF)
11825 V1 = Op;
11826 else
11827 V2 = Op;
11828 }
11829 };
11830
11831 /// Smart shuffle instruction emission; walks through the shuffle trees and
11832 /// tries to find the best matching vector for the actual shuffle
11833 /// instruction.
11834 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11835 assert(V1 && "Expected at least one vector value.");
11836 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11837 R.CSEBlocks, *R.DL);
11838 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11839 Builder&: ShuffleBuilder);
11840 }
11841
11842 /// Transforms \p CommonMask per the given \p Mask to keep it a proper mask
11843 /// after the shuffle has been emitted.
11844 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11845 ArrayRef<int> Mask) {
11846 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11847 if (Mask[Idx] != PoisonMaskElem)
11848 CommonMask[Idx] = Idx;
11849 }
11850
11851 /// Cast value \p V to the vector type with the same number of elements, but
11852 /// the base type \p ScalarTy.
11853 Value *castToScalarTyElem(Value *V,
11854 std::optional<bool> IsSigned = std::nullopt) {
11855 auto *VecTy = cast<VectorType>(Val: V->getType());
11856 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
11857 if (VecTy->getElementType() == ScalarTy->getScalarType())
11858 return V;
11859 return Builder.CreateIntCast(
11860 V, DestTy: VectorType::get(ElementType: ScalarTy->getScalarType(), EC: VecTy->getElementCount()),
11861 isSigned: IsSigned.value_or(u: !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL))));
11862 }
11863
11864public:
11865 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
11866 : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
11867
11868 /// Adjusts extractelements after reusing them.
11869 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11870 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11871 unsigned NumParts, bool &UseVecBaseAsInput) {
11872 UseVecBaseAsInput = false;
11873 SmallPtrSet<Value *, 4> UniqueBases;
11874 Value *VecBase = nullptr;
11875 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11876 int Idx = Mask[I];
11877 if (Idx == PoisonMaskElem)
11878 continue;
11879 auto *EI = cast<ExtractElementInst>(Val: E->Scalars[I]);
11880 VecBase = EI->getVectorOperand();
11881 if (const TreeEntry *TE = R.getTreeEntry(V: VecBase))
11882 VecBase = TE->VectorizedValue;
11883 assert(VecBase && "Expected vectorized value.");
11884 UniqueBases.insert(Ptr: VecBase);
11885 // If the only use is vectorized - the extractelement itself can be
11886 // deleted.
11887 if (!EI->hasOneUse() || (NumParts != 1 && count(Range: E->Scalars, Element: EI) > 1) ||
11888 any_of(Range: EI->users(), P: [&](User *U) {
11889 const TreeEntry *UTE = R.getTreeEntry(V: U);
11890 return !UTE || R.MultiNodeScalars.contains(Val: U) ||
11891 (isa<GetElementPtrInst>(Val: U) &&
11892 !R.areAllUsersVectorized(I: cast<Instruction>(Val: U))) ||
11893 count_if(Range&: R.VectorizableTree,
11894 P: [&](const std::unique_ptr<TreeEntry> &TE) {
11895 return any_of(Range&: TE->UserTreeIndices,
11896 P: [&](const EdgeInfo &Edge) {
11897 return Edge.UserTE == UTE;
11898 }) &&
11899 is_contained(Range&: TE->Scalars, Element: EI);
11900 }) != 1;
11901 }))
11902 continue;
11903 R.eraseInstruction(I: EI);
11904 }
11905 if (NumParts == 1 || UniqueBases.size() == 1) {
11906 assert(VecBase && "Expected vectorized value.");
11907 return castToScalarTyElem(V: VecBase);
11908 }
11909 UseVecBaseAsInput = true;
11910 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11911 for (auto [I, Idx] : enumerate(First&: Mask))
11912 if (Idx != PoisonMaskElem)
11913 Idx = I;
11914 };
11915 // Perform a multi-register vector shuffle, joining the parts into a single
11916 // virtual long vector.
11917 // Each part needs to be shuffled independently and then all the parts are
11918 // inserted into a long virtual vector register, forming the original vector.
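// For example (illustrative): with NumParts == 2 and 4 scalars per part, each
// part is handled on its own (a shuffle of at most 2 bases) and the parts are
// then combined by one more shuffle into the full-width vector.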
11919 Value *Vec = nullptr;
11920 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11921 unsigned SliceSize = getPartNumElems(Size: E->Scalars.size(), NumParts);
11922 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
11923 unsigned Limit = getNumElems(Size: E->Scalars.size(), PartNumElems: SliceSize, Part);
11924 ArrayRef<Value *> VL =
11925 ArrayRef(E->Scalars).slice(N: Part * SliceSize, M: Limit);
11926 MutableArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
11927 constexpr int MaxBases = 2;
11928 SmallVector<Value *, MaxBases> Bases(MaxBases);
11929 auto VLMask = zip(t&: VL, u&: SubMask);
11930 const unsigned VF = std::accumulate(
11931 first: VLMask.begin(), last: VLMask.end(), init: 0U, binary_op: [&](unsigned S, const auto &D) {
11932 if (std::get<1>(D) == PoisonMaskElem)
11933 return S;
11934 Value *VecOp =
11935 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
11936 if (const TreeEntry *TE = R.getTreeEntry(V: VecOp))
11937 VecOp = TE->VectorizedValue;
11938 assert(VecOp && "Expected vectorized value.");
11939 const unsigned Size =
11940 cast<FixedVectorType>(Val: VecOp->getType())->getNumElements();
11941 return std::max(a: S, b: Size);
11942 });
11943 for (const auto [V, I] : VLMask) {
11944 if (I == PoisonMaskElem)
11945 continue;
11946 Value *VecOp = cast<ExtractElementInst>(Val: V)->getVectorOperand();
11947 if (const TreeEntry *TE = R.getTreeEntry(V: VecOp))
11948 VecOp = TE->VectorizedValue;
11949 assert(VecOp && "Expected vectorized value.");
11950 VecOp = castToScalarTyElem(V: VecOp);
11951 Bases[I / VF] = VecOp;
11952 }
11953 if (!Bases.front())
11954 continue;
11955 Value *SubVec;
11956 if (Bases.back()) {
11957 SubVec = createShuffle(V1: Bases.front(), V2: Bases.back(), Mask: SubMask);
11958 TransformToIdentity(SubMask);
11959 } else {
11960 SubVec = Bases.front();
11961 }
11962 if (!Vec) {
11963 Vec = SubVec;
11964 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11965 [&](unsigned P) {
11966 ArrayRef<int> SubMask =
11967 Mask.slice(P * SliceSize,
11968 getNumElems(Mask.size(),
11969 SliceSize, P));
11970 return all_of(SubMask, [](int Idx) {
11971 return Idx == PoisonMaskElem;
11972 });
11973 })) &&
11974 "Expected first part or all previous parts masked.");
11975 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
11976 } else {
11977 unsigned NewVF =
11978 cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
11979 if (Vec->getType() != SubVec->getType()) {
11980 unsigned SubVecVF =
11981 cast<FixedVectorType>(Val: SubVec->getType())->getNumElements();
11982 NewVF = std::max(a: NewVF, b: SubVecVF);
11983 }
11984 // Adjust SubMask.
11985 for (int &Idx : SubMask)
11986 if (Idx != PoisonMaskElem)
11987 Idx += NewVF;
11988 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
11989 Vec = createShuffle(V1: Vec, V2: SubVec, Mask: VecMask);
11990 TransformToIdentity(VecMask);
11991 }
11992 }
11993 copy(Range&: VecMask, Out: Mask.begin());
11994 return Vec;
11995 }
11996 /// Checks if the specified entry \p E needs to be delayed because of its
11997 /// dependency nodes.
11998 std::optional<Value *>
11999 needToDelay(const TreeEntry *E,
12000 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
12001 // No need to delay emission if all deps are ready.
12002 if (all_of(Range&: Deps, P: [](ArrayRef<const TreeEntry *> TEs) {
12003 return all_of(
12004 Range&: TEs, P: [](const TreeEntry *TE) { return TE->VectorizedValue; });
12005 }))
12006 return std::nullopt;
12007 // Postpone the gather emission; it will be emitted after the end of the
12008 // process to keep the correct order.
12009 auto *ResVecTy = getWidenedType(ScalarTy, VF: E->getVectorFactor());
12010 return Builder.CreateAlignedLoad(
12011 Ty: ResVecTy,
12012 Ptr: PoisonValue::get(T: PointerType::getUnqual(C&: ScalarTy->getContext())),
12013 Align: MaybeAlign());
12014 }
12015 /// Adds 2 input vectors (in the form of tree entries) and the mask for their
12016 /// shuffling.
12017 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
12018 Value *V1 = E1.VectorizedValue;
12019 if (V1->getType()->isIntOrIntVectorTy())
12020 V1 = castToScalarTyElem(V: V1, IsSigned: any_of(Range: E1.Scalars, P: [&](Value *V) {
12021 return !isKnownNonNegative(
12022 V, SQ: SimplifyQuery(*R.DL));
12023 }));
12024 Value *V2 = E2.VectorizedValue;
12025 if (V2->getType()->isIntOrIntVectorTy())
12026 V2 = castToScalarTyElem(V: V2, IsSigned: any_of(Range: E2.Scalars, P: [&](Value *V) {
12027 return !isKnownNonNegative(
12028 V, SQ: SimplifyQuery(*R.DL));
12029 }));
12030 add(V1, V2, Mask);
12031 }
12032 /// Adds a single input vector (in the form of a tree entry) and the mask for
12033 /// its shuffling.
12034 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
12035 Value *V1 = E1.VectorizedValue;
12036 if (V1->getType()->isIntOrIntVectorTy())
12037 V1 = castToScalarTyElem(V: V1, IsSigned: any_of(Range: E1.Scalars, P: [&](Value *V) {
12038 return !isKnownNonNegative(
12039 V, SQ: SimplifyQuery(*R.DL));
12040 }));
12041 add(V1, Mask);
12042 }
12043 /// Adds 2 input vectors and the mask for their shuffling.
12044 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
12045 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
12046 V1 = castToScalarTyElem(V: V1);
12047 V2 = castToScalarTyElem(V: V2);
12048 if (InVectors.empty()) {
12049 InVectors.push_back(Elt: V1);
12050 InVectors.push_back(Elt: V2);
12051 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
12052 return;
12053 }
12054 Value *Vec = InVectors.front();
12055 if (InVectors.size() == 2) {
12056 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
12057 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12058 } else if (cast<FixedVectorType>(Val: Vec->getType())->getNumElements() !=
12059 Mask.size()) {
12060 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
12061 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12062 }
12063 V1 = createShuffle(V1, V2, Mask);
12064 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12065 if (Mask[Idx] != PoisonMaskElem)
12066 CommonMask[Idx] = Idx + Sz;
12067 InVectors.front() = Vec;
12068 if (InVectors.size() == 2)
12069 InVectors.back() = V1;
12070 else
12071 InVectors.push_back(Elt: V1);
12072 }
12073 /// Adds one more input vector and the mask for its shuffling.
12074 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
12075 V1 = castToScalarTyElem(V: V1);
12076 if (InVectors.empty()) {
12077 if (!isa<FixedVectorType>(Val: V1->getType())) {
12078 V1 = createShuffle(V1, V2: nullptr, Mask: CommonMask);
12079 CommonMask.assign(NumElts: Mask.size(), Elt: PoisonMaskElem);
12080 transformMaskAfterShuffle(CommonMask, Mask);
12081 }
12082 InVectors.push_back(Elt: V1);
12083 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
12084 return;
12085 }
12086 const auto *It = find(Range&: InVectors, Val: V1);
12087 if (It == InVectors.end()) {
12088 if (InVectors.size() == 2 ||
12089 InVectors.front()->getType() != V1->getType() ||
12090 !isa<FixedVectorType>(Val: V1->getType())) {
12091 Value *V = InVectors.front();
12092 if (InVectors.size() == 2) {
12093 V = createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
12094 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12095 } else if (cast<FixedVectorType>(Val: V->getType())->getNumElements() !=
12096 CommonMask.size()) {
12097 V = createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
12098 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12099 }
12100 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12101 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
12102 CommonMask[Idx] =
12103 V->getType() != V1->getType()
12104 ? Idx + Sz
12105 : Mask[Idx] + cast<FixedVectorType>(Val: V1->getType())
12106 ->getNumElements();
12107 if (V->getType() != V1->getType())
12108 V1 = createShuffle(V1, V2: nullptr, Mask);
12109 InVectors.front() = V;
12110 if (InVectors.size() == 2)
12111 InVectors.back() = V1;
12112 else
12113 InVectors.push_back(Elt: V1);
12114 return;
12115 }
12116 // Check if the second vector is really required: it is needed only if some
12117 // used element is not already provided by the first one.
12118 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12119 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
12120 InVectors.push_back(Elt: V1);
12121 break;
12122 }
12123 }
12124 int VF = CommonMask.size();
12125 if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
12126 VF = FTy->getNumElements();
12127 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12128 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
12129 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
12130 }
12131 /// Adds one more input vector and the mask for its shuffling.
12132 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
12133 SmallVector<int> NewMask;
12134 inversePermutation(Indices: Order, Mask&: NewMask);
12135 add(V1, Mask: NewMask);
12136 }
12137 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
12138 Value *Root = nullptr) {
12139 return R.gather(VL, Root, ScalarTy);
12140 }
12141 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
12142 /// Finalize emission of the shuffles.
12143 /// \param Action the action (if any) to be performed before the final
12144 /// application of the \p ExtMask mask.
12145 Value *
12146 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
12147 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
12148 IsFinalized = true;
12149 if (Action) {
12150 Value *Vec = InVectors.front();
12151 if (InVectors.size() == 2) {
12152 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
12153 InVectors.pop_back();
12154 } else {
12155 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
12156 }
12157 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12158 if (CommonMask[Idx] != PoisonMaskElem)
12159 CommonMask[Idx] = Idx;
12160 assert(VF > 0 &&
12161 "Expected vector length for the final value before action.");
12162 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
12163 if (VecVF < VF) {
12164 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
12165 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
12166 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: ResizeMask);
12167 }
12168 Action(Vec, CommonMask);
12169 InVectors.front() = Vec;
12170 }
12171 if (!ExtMask.empty()) {
12172 if (CommonMask.empty()) {
12173 CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
12174 } else {
12175 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12176 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12177 if (ExtMask[I] == PoisonMaskElem)
12178 continue;
12179 NewMask[I] = CommonMask[ExtMask[I]];
12180 }
12181 CommonMask.swap(RHS&: NewMask);
12182 }
12183 }
12184 if (CommonMask.empty()) {
12185 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
12186 return InVectors.front();
12187 }
12188 if (InVectors.size() == 2)
12189 return createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
12190 return createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
12191 }
12192
12193 ~ShuffleInstructionBuilder() {
12194 assert((IsFinalized || CommonMask.empty()) &&
12195 "Shuffle construction must be finalized.");
12196 }
12197};
12198
12199Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
12200 bool PostponedPHIs) {
12201 ValueList &VL = E->getOperand(OpIdx: NodeIdx);
12202 const unsigned VF = VL.size();
12203 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
12204 // Special processing for a GEP bundle, which may include non-GEP values.
12205 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
12206 const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
12207 if (It != VL.end())
12208 S = getSameOpcode(VL: *It, TLI: *TLI);
12209 }
12210 if (S.getOpcode()) {
12211 auto CheckSameVE = [&](const TreeEntry *VE) {
12212 return VE->isSame(VL) &&
12213 (any_of(Range: VE->UserTreeIndices,
12214 P: [E, NodeIdx](const EdgeInfo &EI) {
12215 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12216 }) ||
12217 any_of(Range&: VectorizableTree,
12218 P: [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12219 return TE->isOperandGatherNode(UserEI: {E, NodeIdx}) &&
12220 VE->isSame(VL: TE->Scalars);
12221 }));
12222 };
12223 TreeEntry *VE = getTreeEntry(V: S.OpValue);
12224 bool IsSameVE = VE && CheckSameVE(VE);
12225 if (!IsSameVE) {
12226 auto It = MultiNodeScalars.find(Val: S.OpValue);
12227 if (It != MultiNodeScalars.end()) {
12228 auto *I = find_if(Range&: It->getSecond(), P: [&](const TreeEntry *TE) {
12229 return TE != VE && CheckSameVE(TE);
12230 });
12231 if (I != It->getSecond().end()) {
12232 VE = *I;
12233 IsSameVE = true;
12234 }
12235 }
12236 }
12237 if (IsSameVE) {
12238 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12239 ShuffleInstructionBuilder ShuffleBuilder(
12240 cast<VectorType>(Val: V->getType())->getElementType(), Builder, *this);
12241 ShuffleBuilder.add(V1: V, Mask);
12242 return ShuffleBuilder.finalize(ExtMask: std::nullopt);
12243 };
12244 Value *V = vectorizeTree(E: VE, PostponedPHIs);
12245 if (VF * getNumElements(Ty: VL[0]->getType()) !=
12246 cast<FixedVectorType>(Val: V->getType())->getNumElements()) {
12247 if (!VE->ReuseShuffleIndices.empty()) {
12248 // Reshuffle to get only unique values.
12249 // If some of the scalars are duplicated in the vectorization
12250 // tree entry, we do not vectorize them but instead generate a
12251 // mask for the reuses. But if there are several users of the
12252 // same entry, they may have different vectorization factors.
12253 // This is especially important for PHI nodes. In this case, we
12254 // need to adapt the resulting instruction for the user
12255 // vectorization factor and have to reshuffle it again to take
12256 // only unique elements of the vector. Without this code the
12257 // function incorrectly returns a reduced vector instruction with
12258 // the same elements, not with the unique ones.
12259
12260 // block:
12261 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12262 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12263 // ... (use %2)
12264 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12265 // br %block
12266 SmallVector<int> Mask(VF, PoisonMaskElem);
12267 for (auto [I, V] : enumerate(First&: VL)) {
12268 if (isa<PoisonValue>(Val: V))
12269 continue;
12270 Mask[I] = VE->findLaneForValue(V);
12271 }
12272 V = FinalShuffle(V, Mask);
12273 } else {
12274 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12275 "Expected vectorization factor less "
12276 "than original vector size.");
12277 SmallVector<int> UniformMask(VF, 0);
12278 std::iota(first: UniformMask.begin(), last: UniformMask.end(), value: 0);
12279 V = FinalShuffle(V, UniformMask);
12280 }
12281 }
12282 // Update the operand gather node if the operand is not itself a
12283 // vectorized node but a buildvector/gather node that matches one of
12284 // the vectorized nodes.
12285 if (find_if(Range&: VE->UserTreeIndices, P: [&](const EdgeInfo &EI) {
12286 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12287 }) == VE->UserTreeIndices.end()) {
12288 auto *It = find_if(
12289 Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
12290 return TE->isGather() &&
12291 TE->UserTreeIndices.front().UserTE == E &&
12292 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12293 });
12294 assert(It != VectorizableTree.end() && "Expected gather node operand.");
12295 (*It)->VectorizedValue = V;
12296 }
12297 return V;
12298 }
12299 }
12300
12301 // Find the corresponding gather entry and vectorize it.
12302 // This keeps tree/graph transformations more precise and, in many cases,
12303 // acts as a correctness check for those transformations.
12304 auto *I = find_if(Range&: VectorizableTree,
12305 P: [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
12306 return TE->isOperandGatherNode(UserEI: {E, NodeIdx});
12307 });
12308 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
12309 assert(I->get()->UserTreeIndices.size() == 1 &&
12310 "Expected only single user for the gather node.");
12311 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
12312 return vectorizeTree(E: I->get(), PostponedPHIs);
12313}
12314
12315template <typename BVTy, typename ResTy, typename... Args>
12316ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
12317 Args &...Params) {
12318 assert(E->isGather() && "Expected gather node.");
12319 unsigned VF = E->getVectorFactor();
12320
12321 bool NeedFreeze = false;
12322 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
12323 E->ReuseShuffleIndices.end());
12324 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
12325 // Build a mask out of the reorder indices and reorder scalars per this
12326 // mask.
12327 SmallVector<int> ReorderMask;
12328 inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
12329 if (!ReorderMask.empty())
12330 reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
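  // Editorial example (assumes inversePermutation builds
  // Mask[Indices[I]] = I; illustrative values only): ReorderIndices = {2, 0, 1}
  // yields ReorderMask = {1, 2, 0}, and reorderScalars then places each
  // gathered scalar into the lane the mask assigns to it.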
12331 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
12332 unsigned I, unsigned SliceSize) {
12333 if (!isSplat(VL: E->Scalars) || none_of(E->Scalars, [](Value *V) {
12334 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
12335 }))
12336 return false;
12337 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12338 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12339 if (UserTE->getNumOperands() != 2)
12340 return false;
12341 auto *It =
12342 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
12343 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
12344 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12345 }) != TE->UserTreeIndices.end();
12346 });
12347 if (It == VectorizableTree.end())
12348 return false;
12349 int Idx;
12350 if ((Mask.size() < InputVF &&
12351 ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: InputVF, Index&: Idx) &&
12352 Idx == 0) ||
12353 (Mask.size() == InputVF &&
12354 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))) {
12355 std::iota(
12356 first: std::next(x: Mask.begin(), n: I * SliceSize),
12357 last: std::next(x: Mask.begin(),
12358 n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
12359 value: 0);
12360 } else {
12361 unsigned IVal =
12362 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
12363 std::fill(
12364 std::next(x: Mask.begin(), n: I * SliceSize),
12365 std::next(x: Mask.begin(),
12366 n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
12367 IVal);
12368 }
12369 return true;
12370 };
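  // Editorial sketch of FindReusedSplat (illustrative masks): for a splat
  // operand, if the current mask is an extract-subvector mask at offset 0
  // (e.g. {0, 1, poison, poison} with InputVF = 8) or a full identity mask,
  // the slice for part I is rewritten to the identity prefix {0, 1, 2, ...};
  // otherwise every lane of the slice is filled with the single reused lane,
  // e.g. {3, poison, 3, poison} becomes {3, 3, 3, 3}.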
12371 BVTy ShuffleBuilder(ScalarTy, Params...);
12372 ResTy Res = ResTy();
12373 SmallVector<int> Mask;
12374 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
12375 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
12376 Value *ExtractVecBase = nullptr;
12377 bool UseVecBaseAsInput = false;
12378 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
12379 SmallVector<SmallVector<const TreeEntry *>> Entries;
12380 Type *OrigScalarTy = GatheredScalars.front()->getType();
12381 auto *VecTy = getWidenedType(ScalarTy, VF: GatheredScalars.size());
12382 unsigned NumParts = TTI->getNumberOfParts(Tp: VecTy);
12383 if (NumParts == 0 || NumParts >= GatheredScalars.size())
12384 NumParts = 1;
12385 if (!all_of(Range&: GatheredScalars, P: IsaPred<UndefValue>)) {
12386 // Check for gathered extracts.
12387 bool Resized = false;
12388 ExtractShuffles =
12389 tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
12390 if (!ExtractShuffles.empty()) {
12391 SmallVector<const TreeEntry *> ExtractEntries;
12392 for (auto [Idx, I] : enumerate(First&: ExtractMask)) {
12393 if (I == PoisonMaskElem)
12394 continue;
12395 if (const auto *TE = getTreeEntry(
12396 V: cast<ExtractElementInst>(Val: E->Scalars[Idx])->getVectorOperand()))
12397 ExtractEntries.push_back(Elt: TE);
12398 }
12399 if (std::optional<ResTy> Delayed =
12400 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12401 // Delay emission of gathers which are not ready yet.
12402 PostponedGathers.insert(X: E);
12403 // Postpone gather emission; it will be emitted after the end of the
12404 // process to keep the correct order.
12405 return *Delayed;
12406 }
12407 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
12408 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12409 ExtractVecBase = VecBase;
12410 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(Val: VecBase->getType()))
12411 if (VF == VecBaseTy->getNumElements() &&
12412 GatheredScalars.size() != VF) {
12413 Resized = true;
12414 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
12415 Elt: PoisonValue::get(T: OrigScalarTy));
12416 }
12417 }
12418 }
12419 // Gather extracts only after we have checked for fully matched gathers.
12420 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
12421 E->isAltShuffle() ||
12422 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
12423 isSplat(VL: E->Scalars) ||
12424 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12425 GatherShuffles =
12426 isGatherShuffledEntry(TE: E, VL: GatheredScalars, Mask, Entries, NumParts);
12427 }
12428 if (!GatherShuffles.empty()) {
12429 if (std::optional<ResTy> Delayed =
12430 ShuffleBuilder.needToDelay(E, Entries)) {
12431 // Delay emission of gathers which are not ready yet.
12432 PostponedGathers.insert(X: E);
12433 // Postpone gather emission; it will be emitted after the end of the
12434 // process to keep the correct order.
12435 return *Delayed;
12436 }
12437 if (GatherShuffles.size() == 1 &&
12438 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
12439 Entries.front().front()->isSame(VL: E->Scalars)) {
12440 // Perfect match in the graph, will reuse the previously vectorized
12441 // node. Cost is 0.
12442 LLVM_DEBUG(
12443 dbgs()
12444 << "SLP: perfect diamond match for gather bundle "
12445 << shortBundleName(E->Scalars) << ".\n");
12446 // Restore the mask for previous partially matched values.
12447 Mask.resize(N: E->Scalars.size());
12448 const TreeEntry *FrontTE = Entries.front().front();
12449 if (FrontTE->ReorderIndices.empty() &&
12450 ((FrontTE->ReuseShuffleIndices.empty() &&
12451 E->Scalars.size() == FrontTE->Scalars.size()) ||
12452 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12453 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
12454 } else {
12455 for (auto [I, V] : enumerate(First: E->Scalars)) {
12456 if (isa<PoisonValue>(Val: V)) {
12457 Mask[I] = PoisonMaskElem;
12458 continue;
12459 }
12460 Mask[I] = FrontTE->findLaneForValue(V);
12461 }
12462 }
12463 ShuffleBuilder.add(*FrontTE, Mask);
12464 Res = ShuffleBuilder.finalize(E->getCommonMask());
12465 return Res;
12466 }
12467 if (!Resized) {
12468 if (GatheredScalars.size() != VF &&
12469 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
12470 return any_of(TEs, [&](const TreeEntry *TE) {
12471 return TE->getVectorFactor() == VF;
12472 });
12473 }))
12474 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
12475 Elt: PoisonValue::get(T: OrigScalarTy));
12476 }
12477 // Remove shuffled elements from list of gathers.
12478 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12479 if (Mask[I] != PoisonMaskElem)
12480 GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
12481 }
12482 }
12483 }
12484 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
12485 SmallVectorImpl<int> &ReuseMask,
12486 bool IsRootPoison) {
12487 // For splats we can emit broadcasts instead of gathers, so try to find
12488 // such sequences.
12489 bool IsSplat = IsRootPoison && isSplat(VL: Scalars) &&
12490 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
12491 Scalars.append(NumInputs: VF - Scalars.size(), Elt: PoisonValue::get(T: OrigScalarTy));
12492 SmallVector<int> UndefPos;
12493 DenseMap<Value *, unsigned> UniquePositions;
12494 // Gather unique non-const values and all constant values.
12495 // For repeated values, just shuffle them.
12496 int NumNonConsts = 0;
12497 int SinglePos = 0;
12498 for (auto [I, V] : enumerate(First&: Scalars)) {
12499 if (isa<UndefValue>(Val: V)) {
12500 if (!isa<PoisonValue>(Val: V)) {
12501 ReuseMask[I] = I;
12502 UndefPos.push_back(Elt: I);
12503 }
12504 continue;
12505 }
12506 if (isConstant(V)) {
12507 ReuseMask[I] = I;
12508 continue;
12509 }
12510 ++NumNonConsts;
12511 SinglePos = I;
12512 Value *OrigV = V;
12513 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
12514 if (IsSplat) {
12515 Scalars.front() = OrigV;
12516 ReuseMask[I] = 0;
12517 } else {
12518 const auto Res = UniquePositions.try_emplace(Key: OrigV, Args&: I);
12519 Scalars[Res.first->second] = OrigV;
12520 ReuseMask[I] = Res.first->second;
12521 }
12522 }
12523 if (NumNonConsts == 1) {
12524 // Restore single insert element.
12525 if (IsSplat) {
12526 ReuseMask.assign(NumElts: VF, Elt: PoisonMaskElem);
12527 std::swap(a&: Scalars.front(), b&: Scalars[SinglePos]);
12528 if (!UndefPos.empty() && UndefPos.front() == 0)
12529 Scalars.front() = UndefValue::get(T: OrigScalarTy);
12530 }
12531 ReuseMask[SinglePos] = SinglePos;
12532 } else if (!UndefPos.empty() && IsSplat) {
12533 // For undef values, try to replace them with a simple broadcast.
12534 // We can do this if the broadcast value is guaranteed to be
12535 // non-poisonous, or otherwise by freezing the incoming scalar value first.
12536 auto *It = find_if(Scalars, [this, E](Value *V) {
12537 return !isa<UndefValue>(Val: V) &&
12538 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
12539 (E->UserTreeIndices.size() == 1 &&
12540 any_of(V->uses(), [E](const Use &U) {
12541 // Check if the value is already used in the same operation in
12542 // one of the other nodes.
12543 return E->UserTreeIndices.front().EdgeIdx !=
12544 U.getOperandNo() &&
12545 is_contained(
12546 Range&: E->UserTreeIndices.front().UserTE->Scalars,
12547 Element: U.getUser());
12548 })));
12549 });
12550 if (It != Scalars.end()) {
12551 // Replace undefs by the non-poisoned scalars and emit broadcast.
12552 int Pos = std::distance(Scalars.begin(), It);
12553 for (int I : UndefPos) {
12554 // Set the undef position to the non-poisoned scalar.
12555 ReuseMask[I] = Pos;
12556 // Replace the undef by poison; in the mask it has already been
12557 // replaced by the non-poisoned scalar.
12558 if (I != Pos)
12559 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
12560 }
12561 } else {
12562 // Replace undefs by poison, emit the broadcast and then emit a
12563 // freeze.
12564 for (int I : UndefPos) {
12565 ReuseMask[I] = PoisonMaskElem;
12566 if (isa<UndefValue>(Val: Scalars[I]))
12567 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
12568 }
12569 NeedFreeze = true;
12570 }
12571 }
12572 };
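  // Editorial example of TryPackScalars (illustrative values): for
  // non-constant, non-splat scalars {%a, %b, %a, %c} with VF = 4 the packed
  // vector becomes {%a, %b, poison, %c} and ReuseMask = {0, 1, 0, 3}; the
  // duplicated %a is re-read from lane 0 via the mask instead of being
  // inserted twice.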
12573 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12574 bool IsNonPoisoned = true;
12575 bool IsUsedInExpr = true;
12576 Value *Vec1 = nullptr;
12577 if (!ExtractShuffles.empty()) {
12578 // A gather of extractelements can be represented as just a shuffle of
12579 // the one or two vectors the scalars are extracted from.
12580 // Find input vectors.
12581 Value *Vec2 = nullptr;
12582 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12583 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12584 ExtractMask[I] = PoisonMaskElem;
12585 }
12586 if (UseVecBaseAsInput) {
12587 Vec1 = ExtractVecBase;
12588 } else {
12589 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12590 if (ExtractMask[I] == PoisonMaskElem)
12591 continue;
12592 if (isa<UndefValue>(Val: E->Scalars[I]))
12593 continue;
12594 auto *EI = cast<ExtractElementInst>(Val: E->Scalars[I]);
12595 Value *VecOp = EI->getVectorOperand();
12596 if (const auto *TE = getTreeEntry(V: VecOp))
12597 if (TE->VectorizedValue)
12598 VecOp = TE->VectorizedValue;
12599 if (!Vec1) {
12600 Vec1 = VecOp;
12601 } else if (Vec1 != VecOp) {
12602 assert((!Vec2 || Vec2 == VecOp) &&
12603 "Expected only 1 or 2 vectors shuffle.");
12604 Vec2 = VecOp;
12605 }
12606 }
12607 }
12608 if (Vec2) {
12609 IsUsedInExpr = false;
12610 IsNonPoisoned &=
12611 isGuaranteedNotToBePoison(V: Vec1) && isGuaranteedNotToBePoison(V: Vec2);
12612 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12613 } else if (Vec1) {
12614 IsUsedInExpr &= FindReusedSplat(
12615 ExtractMask,
12616 cast<FixedVectorType>(Val: Vec1->getType())->getNumElements(), 0,
12617 ExtractMask.size());
12618 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12619 IsNonPoisoned &= isGuaranteedNotToBePoison(V: Vec1);
12620 } else {
12621 IsUsedInExpr = false;
12622 ShuffleBuilder.add(PoisonValue::get(T: VecTy), ExtractMask,
12623 /*ForExtracts=*/true);
12624 }
12625 }
12626 if (!GatherShuffles.empty()) {
12627 unsigned SliceSize = getPartNumElems(Size: E->Scalars.size(), NumParts);
12628 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12629 for (const auto [I, TEs] : enumerate(First&: Entries)) {
12630 if (TEs.empty()) {
12631 assert(!GatherShuffles[I] &&
12632 "No shuffles with empty entries list expected.");
12633 continue;
12634 }
12635 assert((TEs.size() == 1 || TEs.size() == 2) &&
12636 "Expected shuffle of 1 or 2 entries.");
12637 unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I);
12638 auto SubMask = ArrayRef(Mask).slice(N: I * SliceSize, M: Limit);
12639 VecMask.assign(NumElts: VecMask.size(), Elt: PoisonMaskElem);
12640 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: I * SliceSize));
12641 if (TEs.size() == 1) {
12642 IsUsedInExpr &= FindReusedSplat(
12643 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12644 ShuffleBuilder.add(*TEs.front(), VecMask);
12645 if (TEs.front()->VectorizedValue)
12646 IsNonPoisoned &=
12647 isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue);
12648 } else {
12649 IsUsedInExpr = false;
12650 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12651 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12652 IsNonPoisoned &=
12653 isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue) &&
12654 isGuaranteedNotToBePoison(V: TEs.back()->VectorizedValue);
12655 }
12656 }
12657 }
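    // Editorial example (illustrative sizes): with 8 gathered scalars split
    // into NumParts = 2 registers, SliceSize = 4; for part I = 1 only
    // Mask[4..7] is copied into VecMask (all other lanes stay poison), so
    // each matched tree entry is shuffled into its own 4-lane slice.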
12658 // Try to figure out the best way to combine the values: build a shuffle
12659 // plus insertelements, or just build several shuffles.
12660 // Insert non-constant scalars.
12661 SmallVector<Value *> NonConstants(GatheredScalars);
12662 int EMSz = ExtractMask.size();
12663 int MSz = Mask.size();
12664 // Try to build a constant vector and shuffle with it only if we currently
12665 // have a single permutation and more than one scalar constant.
12666 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12667 bool IsIdentityShuffle =
12668 ((UseVecBaseAsInput ||
12669 all_of(ExtractShuffles,
12670 [](const std::optional<TTI::ShuffleKind> &SK) {
12671 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
12672 TTI::SK_PermuteSingleSrc;
12673 })) &&
12674 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12675 ShuffleVectorInst::isIdentityMask(Mask: ExtractMask, NumSrcElts: EMSz)) ||
12676 (!GatherShuffles.empty() &&
12677 all_of(GatherShuffles,
12678 [](const std::optional<TTI::ShuffleKind> &SK) {
12679 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
12680 TTI::SK_PermuteSingleSrc;
12681 }) &&
12682 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12683 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: MSz));
12684 bool EnoughConstsForShuffle =
12685 IsSingleShuffle &&
12686 (none_of(GatheredScalars,
12687 [](Value *V) {
12688 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
12689 }) ||
12690 any_of(GatheredScalars,
12691 [](Value *V) {
12692 return isa<Constant>(Val: V) && !isa<UndefValue>(Val: V);
12693 })) &&
12694 (!IsIdentityShuffle ||
12695 (GatheredScalars.size() == 2 &&
12696 any_of(GatheredScalars,
12697 [](Value *V) { return !isa<UndefValue>(Val: V); })) ||
12698 count_if(GatheredScalars, [](Value *V) {
12699 return isa<Constant>(Val: V) && !isa<PoisonValue>(Val: V);
12700 }) > 1);
12701 // The NonConstants array contains only non-constant values; GatheredScalars
12702 // contains only the constants used to build the final vector, then shuffle.
12703 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12704 if (EnoughConstsForShuffle && isa<Constant>(Val: GatheredScalars[I]))
12705 NonConstants[I] = PoisonValue::get(T: OrigScalarTy);
12706 else
12707 GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
12708 }
12709 // Generate constants for final shuffle and build a mask for them.
12710 if (!all_of(Range&: GatheredScalars, P: IsaPred<PoisonValue>)) {
12711 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12712 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12713 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12714 ShuffleBuilder.add(BV, BVMask);
12715 }
12716 if (all_of(NonConstants, [=](Value *V) {
12717 return isa<PoisonValue>(Val: V) ||
12718 (IsSingleShuffle && ((IsIdentityShuffle &&
12719 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(Val: V));
12720 }))
12721 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12722 else
12723 Res = ShuffleBuilder.finalize(
12724 E->ReuseShuffleIndices, E->Scalars.size(),
12725 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12726 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12727 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12728 });
12729 } else if (!allConstant(VL: GatheredScalars)) {
12730 // Gather unique scalars and all constants.
12731 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12732 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12733 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12734 ShuffleBuilder.add(BV, ReuseMask);
12735 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12736 } else {
12737 // Gather all constants.
12738 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12739 for (auto [I, V] : enumerate(First: E->Scalars)) {
12740 if (!isa<PoisonValue>(Val: V))
12741 Mask[I] = I;
12742 }
12743 Value *BV = ShuffleBuilder.gather(E->Scalars);
12744 ShuffleBuilder.add(BV, Mask);
12745 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12746 }
12747
12748 if (NeedFreeze)
12749 Res = ShuffleBuilder.createFreeze(Res);
12750 return Res;
12751}
12752
12753Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
12754 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12755 Params&: Builder, Params&: *this);
12756}
12757
12758Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12759 IRBuilderBase::InsertPointGuard Guard(Builder);
12760
12761 if (E->VectorizedValue &&
12762 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12763 E->isAltShuffle())) {
12764 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12765 return E->VectorizedValue;
12766 }
12767
12768 Value *V = E->Scalars.front();
12769 Type *ScalarTy = V->getType();
12770 if (auto *Store = dyn_cast<StoreInst>(Val: V))
12771 ScalarTy = Store->getValueOperand()->getType();
12772 else if (auto *IE = dyn_cast<InsertElementInst>(Val: V))
12773 ScalarTy = IE->getOperand(i_nocapture: 1)->getType();
12774 auto It = MinBWs.find(Val: E);
12775 if (It != MinBWs.end())
12776 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
12777 auto *VecTy = getWidenedType(ScalarTy, VF: E->Scalars.size());
12778 if (E->isGather()) {
12779 // Set insert point for non-reduction initial nodes.
12780 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12781 setInsertPointAfterBundle(E);
12782 Value *Vec = createBuildVector(E, ScalarTy);
12783 E->VectorizedValue = Vec;
12784 return Vec;
12785 }
12786
12787 bool IsReverseOrder = isReverseOrder(Order: E->ReorderIndices);
12788 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12789 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
12790 if (E->getOpcode() == Instruction::Store &&
12791 E->State == TreeEntry::Vectorize) {
12792 ArrayRef<int> Mask =
12793 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12794 E->ReorderIndices.size());
12795 ShuffleBuilder.add(V1: V, Mask);
12796 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12797 ShuffleBuilder.addOrdered(V1: V, Order: std::nullopt);
12798 } else {
12799 ShuffleBuilder.addOrdered(V1: V, Order: E->ReorderIndices);
12800 }
12801 return ShuffleBuilder.finalize(ExtMask: E->ReuseShuffleIndices);
12802 };
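  // Editorial note on FinalShuffle (behavior restated for clarity): vectorized
  // store nodes apply E->ReorderIndices directly as a shuffle mask (e.g.
  // {3, 2, 1, 0} reverses the lanes), strided-vectorized nodes with a reversed
  // order skip the reorder because the negative stride already accounts for
  // it, and all other nodes are reordered via addOrdered().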
12803
12804 assert((E->State == TreeEntry::Vectorize ||
12805 E->State == TreeEntry::ScatterVectorize ||
12806 E->State == TreeEntry::StridedVectorize) &&
12807 "Unhandled state");
12808 unsigned ShuffleOrOp =
12809 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12810 Instruction *VL0 = E->getMainOp();
12811 auto GetOperandSignedness = [&](unsigned Idx) {
12812 const TreeEntry *OpE = getOperandEntry(E, Idx);
12813 bool IsSigned = false;
12814 auto It = MinBWs.find(Val: OpE);
12815 if (It != MinBWs.end())
12816 IsSigned = It->second.second;
12817 else
12818 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
12819 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
12820 });
12821 return IsSigned;
12822 };
12823 switch (ShuffleOrOp) {
12824 case Instruction::PHI: {
12825 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12826 E != VectorizableTree.front().get() ||
12827 !E->UserTreeIndices.empty()) &&
12828 "PHI reordering is free.");
12829 if (PostponedPHIs && E->VectorizedValue)
12830 return E->VectorizedValue;
12831 auto *PH = cast<PHINode>(Val: VL0);
12832 Builder.SetInsertPoint(TheBB: PH->getParent(),
12833 IP: PH->getParent()->getFirstNonPHIIt());
12834 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12835 if (PostponedPHIs || !E->VectorizedValue) {
12836 PHINode *NewPhi = Builder.CreatePHI(Ty: VecTy, NumReservedValues: PH->getNumIncomingValues());
12837 E->PHI = NewPhi;
12838 Value *V = NewPhi;
12839
12840 // Adjust the insertion point once all PHIs have been generated.
12841 Builder.SetInsertPoint(TheBB: PH->getParent(),
12842 IP: PH->getParent()->getFirstInsertionPt());
12843 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12844
12845 V = FinalShuffle(V, E, VecTy);
12846
12847 E->VectorizedValue = V;
12848 if (PostponedPHIs)
12849 return V;
12850 }
12851 PHINode *NewPhi = cast<PHINode>(Val: E->PHI);
12852 // If the phi node is fully emitted, exit.
12853 if (NewPhi->getNumIncomingValues() != 0)
12854 return NewPhi;
12855
12856 // PHINodes may have multiple entries from the same block. We want to
12857 // visit every block once.
12858 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12859
12860 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
12861 ValueList Operands;
12862 BasicBlock *IBB = PH->getIncomingBlock(i: I);
12863
12864 // Stop emission if all incoming values are generated.
12865 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12866 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12867 return NewPhi;
12868 }
12869
12870 if (!VisitedBBs.insert(Ptr: IBB).second) {
12871 NewPhi->addIncoming(V: NewPhi->getIncomingValueForBlock(BB: IBB), BB: IBB);
12872 continue;
12873 }
12874
12875 Builder.SetInsertPoint(IBB->getTerminator());
12876 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12877 Value *Vec = vectorizeOperand(E, NodeIdx: I, /*PostponedPHIs=*/true);
12878 if (VecTy != Vec->getType()) {
12879 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
12880 MinBWs.contains(getOperandEntry(E, I))) &&
12881 "Expected item in MinBWs.");
12882 Vec = Builder.CreateIntCast(V: Vec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
12883 }
12884 NewPhi->addIncoming(V: Vec, BB: IBB);
12885 }
12886
12887 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12888 "Invalid number of incoming values");
12889 return NewPhi;
12890 }
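  // Editorial sketch (illustrative IR): for a bundle of scalar PHIs over two
  // predecessors the emitted value is a single vector PHI, e.g.
  //   %vec.phi = phi <2 x i32> [ %vec.entry, %entry ], [ %vec.latch, %latch ]
  // The empty vector PHI is created in the PostponedPHIs pass first, and its
  // incoming vectors are filled in on the second pass, so operands that are
  // themselves defined through the PHI do not create a cyclic dependency.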
12891
12892 case Instruction::ExtractElement: {
12893 Value *V = E->getSingleOperand(OpIdx: 0);
12894 if (const TreeEntry *TE = getTreeEntry(V))
12895 V = TE->VectorizedValue;
12896 setInsertPointAfterBundle(E);
12897 V = FinalShuffle(V, E, VecTy);
12898 E->VectorizedValue = V;
12899 return V;
12900 }
12901 case Instruction::ExtractValue: {
12902 auto *LI = cast<LoadInst>(Val: E->getSingleOperand(OpIdx: 0));
12903 Builder.SetInsertPoint(LI);
12904 Value *Ptr = LI->getPointerOperand();
12905 LoadInst *V = Builder.CreateAlignedLoad(Ty: VecTy, Ptr, Align: LI->getAlign());
12906 Value *NewV = propagateMetadata(I: V, VL: E->Scalars);
12907 NewV = FinalShuffle(NewV, E, VecTy);
12908 E->VectorizedValue = NewV;
12909 return NewV;
12910 }
12911 case Instruction::InsertElement: {
12912 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12913 Builder.SetInsertPoint(cast<Instruction>(Val: E->Scalars.back()));
12914 Value *V = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
12915 ArrayRef<Value *> Op = E->getOperand(OpIdx: 1);
12916 Type *ScalarTy = Op.front()->getType();
12917 if (cast<VectorType>(Val: V->getType())->getElementType() != ScalarTy) {
12918 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12919 std::pair<unsigned, bool> Res = MinBWs.lookup(Val: getOperandEntry(E, Idx: 1));
12920 assert(Res.first > 0 && "Expected item in MinBWs.");
12921 V = Builder.CreateIntCast(
12922 V,
12923 DestTy: getWidenedType(
12924 ScalarTy,
12925 VF: cast<FixedVectorType>(Val: V->getType())->getNumElements()),
12926 isSigned: Res.second);
12927 }
12928
12929 // Create InsertVector shuffle if necessary
12930 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range&: E->Scalars, P: [E](Value *V) {
12931 return !is_contained(Range&: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
12932 }));
12933 const unsigned NumElts =
12934 cast<FixedVectorType>(Val: FirstInsert->getType())->getNumElements();
12935 const unsigned NumScalars = E->Scalars.size();
12936
12937 unsigned Offset = *getElementIndex(Inst: VL0);
12938 assert(Offset < NumElts && "Failed to find vector index offset");
12939
12940 // Create shuffle to resize vector
12941 SmallVector<int> Mask;
12942 if (!E->ReorderIndices.empty()) {
12943 inversePermutation(Indices: E->ReorderIndices, Mask);
12944 Mask.append(NumInputs: NumElts - NumScalars, Elt: PoisonMaskElem);
12945 } else {
12946 Mask.assign(NumElts, Elt: PoisonMaskElem);
12947 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: NumScalars), value: 0);
12948 }
12949 // Create InsertVector shuffle if necessary
12950 bool IsIdentity = true;
12951 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12952 Mask.swap(RHS&: PrevMask);
12953 for (unsigned I = 0; I < NumScalars; ++I) {
12954 Value *Scalar = E->Scalars[PrevMask[I]];
12955 unsigned InsertIdx = *getElementIndex(Inst: Scalar);
12956 IsIdentity &= InsertIdx - Offset == I;
12957 Mask[InsertIdx - Offset] = I;
12958 }
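    // Editorial example (illustrative values): four inserts into an
    // <8 x float> at positions 2, 3, 4 and 5 give Offset = 2 and
    // Mask = {0, 1, 2, 3} (lane InsertIdx - Offset takes scalar I). IsIdentity
    // stays true, but since NumElts (8) != NumScalars (4) a resizing shuffle
    // is still emitted below.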
12959 if (!IsIdentity || NumElts != NumScalars) {
12960 Value *V2 = nullptr;
12961 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12962 SmallVector<int> InsertMask(Mask);
12963 if (NumElts != NumScalars && Offset == 0) {
12964 // Follow all insert element instructions from the current buildvector
12965 // sequence.
12966 InsertElementInst *Ins = cast<InsertElementInst>(Val: VL0);
12967 do {
12968 std::optional<unsigned> InsertIdx = getElementIndex(Inst: Ins);
12969 if (!InsertIdx)
12970 break;
12971 if (InsertMask[*InsertIdx] == PoisonMaskElem)
12972 InsertMask[*InsertIdx] = *InsertIdx;
12973 if (!Ins->hasOneUse())
12974 break;
12975 Ins = dyn_cast_or_null<InsertElementInst>(
12976 Val: Ins->getUniqueUndroppableUser());
12977 } while (Ins);
12978 SmallBitVector UseMask =
12979 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
12980 SmallBitVector IsFirstPoison =
12981 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
12982 SmallBitVector IsFirstUndef =
12983 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
12984 if (!IsFirstPoison.all()) {
12985 unsigned Idx = 0;
12986 for (unsigned I = 0; I < NumElts; I++) {
12987 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I) &&
12988 IsFirstUndef.test(Idx: I)) {
12989 if (IsVNonPoisonous) {
12990 InsertMask[I] = I < NumScalars ? I : 0;
12991 continue;
12992 }
12993 if (!V2)
12994 V2 = UndefValue::get(T: V->getType());
12995 if (Idx >= NumScalars)
12996 Idx = NumScalars - 1;
12997 InsertMask[I] = NumScalars + Idx;
12998 ++Idx;
12999 } else if (InsertMask[I] != PoisonMaskElem &&
13000 Mask[I] == PoisonMaskElem) {
13001 InsertMask[I] = PoisonMaskElem;
13002 }
13003 }
13004 } else {
13005 InsertMask = Mask;
13006 }
13007 }
13008 if (!V2)
13009 V2 = PoisonValue::get(T: V->getType());
13010 V = Builder.CreateShuffleVector(V1: V, V2, Mask: InsertMask);
13011 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13012 GatherShuffleExtractSeq.insert(X: I);
13013 CSEBlocks.insert(V: I->getParent());
13014 }
13015 }
13016
13017 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
13018 for (unsigned I = 0; I < NumElts; I++) {
13019 if (Mask[I] != PoisonMaskElem)
13020 InsertMask[Offset + I] = I;
13021 }
13022 SmallBitVector UseMask =
13023 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
13024 SmallBitVector IsFirstUndef =
13025 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
13026 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
13027 NumElts != NumScalars) {
13028 if (IsFirstUndef.all()) {
13029 if (!ShuffleVectorInst::isIdentityMask(Mask: InsertMask, NumSrcElts: NumElts)) {
13030 SmallBitVector IsFirstPoison =
13031 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
13032 if (!IsFirstPoison.all()) {
13033 for (unsigned I = 0; I < NumElts; I++) {
13034 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I))
13035 InsertMask[I] = I + NumElts;
13036 }
13037 }
13038 V = Builder.CreateShuffleVector(
13039 V1: V,
13040 V2: IsFirstPoison.all() ? PoisonValue::get(T: V->getType())
13041 : FirstInsert->getOperand(i: 0),
13042 Mask: InsertMask, Name: cast<Instruction>(Val: E->Scalars.back())->getName());
13043 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13044 GatherShuffleExtractSeq.insert(X: I);
13045 CSEBlocks.insert(V: I->getParent());
13046 }
13047 }
13048 } else {
13049 SmallBitVector IsFirstPoison =
13050 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
13051 for (unsigned I = 0; I < NumElts; I++) {
13052 if (InsertMask[I] == PoisonMaskElem)
13053 InsertMask[I] = IsFirstPoison.test(Idx: I) ? PoisonMaskElem : I;
13054 else
13055 InsertMask[I] += NumElts;
13056 }
13057 V = Builder.CreateShuffleVector(
13058 V1: FirstInsert->getOperand(i: 0), V2: V, Mask: InsertMask,
13059 Name: cast<Instruction>(Val: E->Scalars.back())->getName());
13060 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13061 GatherShuffleExtractSeq.insert(X: I);
13062 CSEBlocks.insert(V: I->getParent());
13063 }
13064 }
13065 }
13066
13067 ++NumVectorInstructions;
13068 E->VectorizedValue = V;
13069 return V;
13070 }
13071 case Instruction::ZExt:
13072 case Instruction::SExt:
13073 case Instruction::FPToUI:
13074 case Instruction::FPToSI:
13075 case Instruction::FPExt:
13076 case Instruction::PtrToInt:
13077 case Instruction::IntToPtr:
13078 case Instruction::SIToFP:
13079 case Instruction::UIToFP:
13080 case Instruction::Trunc:
13081 case Instruction::FPTrunc:
13082 case Instruction::BitCast: {
13083 setInsertPointAfterBundle(E);
13084
13085 Value *InVec = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13086 if (E->VectorizedValue) {
13087 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13088 return E->VectorizedValue;
13089 }
13090
13091 auto *CI = cast<CastInst>(Val: VL0);
13092 Instruction::CastOps VecOpcode = CI->getOpcode();
13093 Type *SrcScalarTy = cast<VectorType>(Val: InVec->getType())->getElementType();
13094 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
13095 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
13096 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
13097 SrcScalarTy != CI->getOperand(i_nocapture: 0)->getType())) {
13098 // Check if the values are candidates to demote.
13099 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
13100 if (SrcIt != MinBWs.end())
13101 SrcBWSz = SrcIt->second.first;
13102 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
13103 if (BWSz == SrcBWSz) {
13104 VecOpcode = Instruction::BitCast;
13105 } else if (BWSz < SrcBWSz) {
13106 VecOpcode = Instruction::Trunc;
13107 } else if (It != MinBWs.end()) {
13108 assert(BWSz > SrcBWSz && "Invalid cast!");
13109 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
13110 } else if (SrcIt != MinBWs.end()) {
13111 assert(BWSz > SrcBWSz && "Invalid cast!");
13112 VecOpcode =
13113 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
13114 }
13115 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
13116 !SrcIt->second.second) {
13117 VecOpcode = Instruction::UIToFP;
13118 }
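    // Editorial example (illustrative bit widths): for a zext i8 -> i32 bundle
    // where MinBWs demoted both this entry and its operand to 16 bits,
    // BWSz == SrcBWSz == 16, so the cast degenerates to a bitcast and is
    // dropped below. If only the source were demoted to i16 while the result
    // stays i32, the vector opcode becomes SExt or ZExt depending on the
    // recorded signedness.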
13119 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
13120 ? InVec
13121 : Builder.CreateCast(Op: VecOpcode, V: InVec, DestTy: VecTy);
13122 V = FinalShuffle(V, E, VecTy);
13123
13124 E->VectorizedValue = V;
13125 ++NumVectorInstructions;
13126 return V;
13127 }
13128 case Instruction::FCmp:
13129 case Instruction::ICmp: {
13130 setInsertPointAfterBundle(E);
13131
13132 Value *L = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13133 if (E->VectorizedValue) {
13134 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13135 return E->VectorizedValue;
13136 }
13137 Value *R = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
13138 if (E->VectorizedValue) {
13139 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13140 return E->VectorizedValue;
13141 }
13142 if (L->getType() != R->getType()) {
13143 assert((getOperandEntry(E, 0)->isGather() ||
13144 getOperandEntry(E, 1)->isGather() ||
13145 MinBWs.contains(getOperandEntry(E, 0)) ||
13146 MinBWs.contains(getOperandEntry(E, 1))) &&
13147 "Expected item in MinBWs.");
13148 if (cast<VectorType>(Val: L->getType())
13149 ->getElementType()
13150 ->getIntegerBitWidth() < cast<VectorType>(Val: R->getType())
13151 ->getElementType()
13152 ->getIntegerBitWidth()) {
13153 Type *CastTy = R->getType();
13154 L = Builder.CreateIntCast(V: L, DestTy: CastTy, isSigned: GetOperandSignedness(0));
13155 } else {
13156 Type *CastTy = L->getType();
13157 R = Builder.CreateIntCast(V: R, DestTy: CastTy, isSigned: GetOperandSignedness(1));
13158 }
13159 }
13160
13161 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
13162 Value *V = Builder.CreateCmp(Pred: P0, LHS: L, RHS: R);
13163 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
13164 // Do not cast for cmps.
13165 VecTy = cast<FixedVectorType>(Val: V->getType());
13166 V = FinalShuffle(V, E, VecTy);
13167
13168 E->VectorizedValue = V;
13169 ++NumVectorInstructions;
13170 return V;
13171 }
13172 case Instruction::Select: {
13173 setInsertPointAfterBundle(E);
13174
13175 Value *Cond = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13176 if (E->VectorizedValue) {
13177 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13178 return E->VectorizedValue;
13179 }
13180 Value *True = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
13181 if (E->VectorizedValue) {
13182 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13183 return E->VectorizedValue;
13184 }
13185 Value *False = vectorizeOperand(E, NodeIdx: 2, PostponedPHIs);
13186 if (E->VectorizedValue) {
13187 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13188 return E->VectorizedValue;
13189 }
13190 if (True->getType() != VecTy || False->getType() != VecTy) {
13191 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
13192 getOperandEntry(E, 2)->isGather() ||
13193 MinBWs.contains(getOperandEntry(E, 1)) ||
13194 MinBWs.contains(getOperandEntry(E, 2))) &&
13195 "Expected item in MinBWs.");
13196 if (True->getType() != VecTy)
13197 True = Builder.CreateIntCast(V: True, DestTy: VecTy, isSigned: GetOperandSignedness(1));
13198 if (False->getType() != VecTy)
13199 False = Builder.CreateIntCast(V: False, DestTy: VecTy, isSigned: GetOperandSignedness(2));
13200 }
13201
13202 Value *V = Builder.CreateSelect(C: Cond, True, False);
13203 V = FinalShuffle(V, E, VecTy);
13204
13205 E->VectorizedValue = V;
13206 ++NumVectorInstructions;
13207 return V;
13208 }
13209 case Instruction::FNeg: {
13210 setInsertPointAfterBundle(E);
13211
13212 Value *Op = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13213
13214 if (E->VectorizedValue) {
13215 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13216 return E->VectorizedValue;
13217 }
13218
13219 Value *V = Builder.CreateUnOp(
13220 Opc: static_cast<Instruction::UnaryOps>(E->getOpcode()), V: Op);
13221 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
13222 if (auto *I = dyn_cast<Instruction>(Val: V))
13223 V = propagateMetadata(I, VL: E->Scalars);
13224
13225 V = FinalShuffle(V, E, VecTy);
13226
13227 E->VectorizedValue = V;
13228 ++NumVectorInstructions;
13229
13230 return V;
13231 }
13232 case Instruction::Add:
13233 case Instruction::FAdd:
13234 case Instruction::Sub:
13235 case Instruction::FSub:
13236 case Instruction::Mul:
13237 case Instruction::FMul:
13238 case Instruction::UDiv:
13239 case Instruction::SDiv:
13240 case Instruction::FDiv:
13241 case Instruction::URem:
13242 case Instruction::SRem:
13243 case Instruction::FRem:
13244 case Instruction::Shl:
13245 case Instruction::LShr:
13246 case Instruction::AShr:
13247 case Instruction::And:
13248 case Instruction::Or:
13249 case Instruction::Xor: {
13250 setInsertPointAfterBundle(E);
13251
13252 Value *LHS = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13253 if (E->VectorizedValue) {
13254 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13255 return E->VectorizedValue;
13256 }
13257 Value *RHS = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
13258 if (E->VectorizedValue) {
13259 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13260 return E->VectorizedValue;
13261 }
13262 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
13263 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
13264 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
13265 if (all_of(Range&: Ops, P: [&](Value *Op) {
13266 auto *CI = dyn_cast<ConstantInt>(Val: Op);
13267 return CI && CI->getValue().countr_one() >= It->second.first;
13268 })) {
13269 V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
13270 E->VectorizedValue = V;
13271 ++NumVectorInstructions;
13272 return V;
13273 }
13274 }
13275 }
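    // Editorial example (illustrative constants): if this entry was demoted to
    // i8 (It->second.first == 8) and one operand is the constant 255 in every
    // lane, countr_one() == 8 >= 8, so the 'and' is a no-op on the demoted
    // value and the other operand is returned directly (after FinalShuffle).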
13276 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
13277 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13278 getOperandEntry(E, 1)->isGather() ||
13279 MinBWs.contains(getOperandEntry(E, 0)) ||
13280 MinBWs.contains(getOperandEntry(E, 1))) &&
13281 "Expected item in MinBWs.");
13282 if (LHS->getType() != VecTy)
13283 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: GetOperandSignedness(0));
13284 if (RHS->getType() != VecTy)
13285 RHS = Builder.CreateIntCast(V: RHS, DestTy: VecTy, isSigned: GetOperandSignedness(1));
13286 }
13287
13288 Value *V = Builder.CreateBinOp(
13289 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
13290 RHS);
13291 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0, IncludeWrapFlags: It == MinBWs.end());
13292 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13293 V = propagateMetadata(I, VL: E->Scalars);
13294 // Drop nuw flags for abs(sub(commutative), true).
13295 if (!MinBWs.contains(Val: E) && ShuffleOrOp == Instruction::Sub &&
13296 any_of(Range&: E->Scalars, P: [](Value *V) {
13297 return isCommutative(I: cast<Instruction>(Val: V));
13298 }))
13299 I->setHasNoUnsignedWrap(/*b=*/false);
13300 }
13301
13302 V = FinalShuffle(V, E, VecTy);
13303
13304 E->VectorizedValue = V;
13305 ++NumVectorInstructions;
13306
13307 return V;
13308 }
13309 case Instruction::Load: {
13310 // Loads are inserted at the head of the tree because we don't want to
13311 // sink them all the way down past store instructions.
13312 setInsertPointAfterBundle(E);
13313
13314 LoadInst *LI = cast<LoadInst>(Val: VL0);
13315 Instruction *NewLI;
13316 Value *PO = LI->getPointerOperand();
13317 if (E->State == TreeEntry::Vectorize) {
13318 NewLI = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: PO, Align: LI->getAlign());
13319 } else if (E->State == TreeEntry::StridedVectorize) {
13320 Value *Ptr0 = cast<LoadInst>(Val: E->Scalars.front())->getPointerOperand();
13321 Value *PtrN = cast<LoadInst>(Val: E->Scalars.back())->getPointerOperand();
13322 PO = IsReverseOrder ? PtrN : Ptr0;
13323 std::optional<int> Diff = getPointersDiff(
13324 ElemTyA: VL0->getType(), PtrA: Ptr0, ElemTyB: VL0->getType(), PtrB: PtrN, DL: *DL, SE&: *SE);
13325 Type *StrideTy = DL->getIndexType(PtrTy: PO->getType());
13326 Value *StrideVal;
13327 if (Diff) {
13328 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
13329 StrideVal =
13330 ConstantInt::get(Ty: StrideTy, V: (IsReverseOrder ? -1 : 1) * Stride *
13331 DL->getTypeAllocSize(Ty: ScalarTy));
13332 } else {
13333 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
13334 transform(Range&: E->Scalars, d_first: PointerOps.begin(), F: [](Value *V) {
13335 return cast<LoadInst>(Val: V)->getPointerOperand();
13336 });
13337 OrdersType Order;
13338 std::optional<Value *> Stride =
13339 calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order,
13340 Inst: &*Builder.GetInsertPoint());
13341 Value *NewStride =
13342 Builder.CreateIntCast(V: *Stride, DestTy: StrideTy, /*isSigned=*/true);
13343 StrideVal = Builder.CreateMul(
13344 LHS: NewStride,
13345 RHS: ConstantInt::get(
13346 Ty: StrideTy,
13347 V: (IsReverseOrder ? -1 : 1) *
13348 static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))));
13349 }
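      // Editorial example (illustrative addresses): four i32 loads at byte
      // offsets 0, 8, 16 and 24 from the same base give Diff = 6 elements,
      // Stride = 6 / 3 = 2 elements, and StrideVal = 2 * 4 = 8 bytes (negated
      // for a reversed order); this byte stride feeds the
      // llvm.experimental.vp.strided.load emitted below.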
13350 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
13351 auto *Inst = Builder.CreateIntrinsic(
13352 ID: Intrinsic::experimental_vp_strided_load,
13353 Types: {VecTy, PO->getType(), StrideTy},
13354 Args: {PO, StrideVal, Builder.getAllOnesMask(NumElts: VecTy->getElementCount()),
13355 Builder.getInt32(C: E->Scalars.size())});
13356 Inst->addParamAttr(
13357 /*ArgNo=*/0,
13358 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
13359 NewLI = Inst;
13360 } else {
13361 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
13362 Value *VecPtr = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13363 if (E->VectorizedValue) {
13364 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13365 return E->VectorizedValue;
13366 }
13367 // Use the minimum alignment of the gathered loads.
13368 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
13369 NewLI = Builder.CreateMaskedGather(Ty: VecTy, Ptrs: VecPtr, Alignment: CommonAlignment);
13370 }
13371 Value *V = propagateMetadata(I: NewLI, VL: E->Scalars);
13372
13373 V = FinalShuffle(V, E, VecTy);
13374 E->VectorizedValue = V;
13375 ++NumVectorInstructions;
13376 return V;
13377 }
13378 case Instruction::Store: {
13379 auto *SI = cast<StoreInst>(Val: VL0);
13380
13381 setInsertPointAfterBundle(E);
13382
13383 Value *VecValue = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13384 if (VecValue->getType() != VecTy)
13385 VecValue =
13386 Builder.CreateIntCast(V: VecValue, DestTy: VecTy, isSigned: GetOperandSignedness(0));
13387 VecValue = FinalShuffle(VecValue, E, VecTy);
13388
13389 Value *Ptr = SI->getPointerOperand();
13390 Instruction *ST;
13391 if (E->State == TreeEntry::Vectorize) {
13392 ST = Builder.CreateAlignedStore(Val: VecValue, Ptr, Align: SI->getAlign());
13393 } else {
13394 assert(E->State == TreeEntry::StridedVectorize &&
13395 "Expected either strided or conseutive stores.");
13396 if (!E->ReorderIndices.empty()) {
13397 SI = cast<StoreInst>(Val: E->Scalars[E->ReorderIndices.front()]);
13398 Ptr = SI->getPointerOperand();
13399 }
13400 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E->Scalars);
13401 Type *StrideTy = DL->getIndexType(PtrTy: SI->getPointerOperandType());
13402 auto *Inst = Builder.CreateIntrinsic(
13403 ID: Intrinsic::experimental_vp_strided_store,
13404 Types: {VecTy, Ptr->getType(), StrideTy},
13405 Args: {VecValue, Ptr,
13406 ConstantInt::get(
13407 Ty: StrideTy, V: -static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))),
13408 Builder.getAllOnesMask(NumElts: VecTy->getElementCount()),
13409 Builder.getInt32(C: E->Scalars.size())});
13410 Inst->addParamAttr(
13411 /*ArgNo=*/1,
13412 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
13413 ST = Inst;
13414 }
13415
13416 Value *V = propagateMetadata(I: ST, VL: E->Scalars);
13417
13418 E->VectorizedValue = V;
13419 ++NumVectorInstructions;
13420 return V;
13421 }
13422 case Instruction::GetElementPtr: {
13423 auto *GEP0 = cast<GetElementPtrInst>(Val: VL0);
13424 setInsertPointAfterBundle(E);
13425
13426 Value *Op0 = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13427 if (E->VectorizedValue) {
13428 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13429 return E->VectorizedValue;
13430 }
13431
13432 SmallVector<Value *> OpVecs;
13433 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
13434 Value *OpVec = vectorizeOperand(E, NodeIdx: J, PostponedPHIs);
13435 if (E->VectorizedValue) {
13436 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13437 return E->VectorizedValue;
13438 }
13439 OpVecs.push_back(Elt: OpVec);
13440 }
13441
13442 Value *V = Builder.CreateGEP(Ty: GEP0->getSourceElementType(), Ptr: Op0, IdxList: OpVecs);
13443 if (Instruction *I = dyn_cast<GetElementPtrInst>(Val: V)) {
13444 SmallVector<Value *> GEPs;
13445 for (Value *V : E->Scalars) {
13446 if (isa<GetElementPtrInst>(Val: V))
13447 GEPs.push_back(Elt: V);
13448 }
13449 V = propagateMetadata(I, VL: GEPs);
13450 }
13451
13452 V = FinalShuffle(V, E, VecTy);
13453
13454 E->VectorizedValue = V;
13455 ++NumVectorInstructions;
13456
13457 return V;
13458 }
13459 case Instruction::Call: {
13460 CallInst *CI = cast<CallInst>(Val: VL0);
13461 setInsertPointAfterBundle(E);
13462
13463 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13464
13465 SmallVector<Type *> ArgTys =
13466 buildIntrinsicArgTypes(CI, ID, VF: VecTy->getNumElements(),
13467 MinBW: It != MinBWs.end() ? It->second.first : 0);
13468 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
13469 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
13470 VecCallCosts.first <= VecCallCosts.second;
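    // Editorial note (illustrative case): for a bundle of four calls to
    // llvm.fabs.f32, this compares the cost of llvm.fabs.v4f32 against a
    // vector library call; the intrinsic form is chosen only when ID is a real
    // vector intrinsic and its cost does not exceed the library-call cost.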
13471
13472 Value *ScalarArg = nullptr;
13473 SmallVector<Value *> OpVecs;
13474 SmallVector<Type *, 2> TysForDecl;
13475 // Add return type if intrinsic is overloaded on it.
13476 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: -1))
13477 TysForDecl.push_back(Elt: VecTy);
13478 auto *CEI = cast<CallInst>(Val: VL0);
13479 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
13480 ValueList OpVL;
13481 // Some intrinsics have scalar arguments. Such arguments should not be
13482 // vectorized.
13483 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I)) {
13484 ScalarArg = CEI->getArgOperand(i: I);
13485 // If we decided to reduce the bitwidth of the abs intrinsic, its second
13486 // argument must be set to false (do not return poison if the value is the signed minimum).
13487 if (ID == Intrinsic::abs && It != MinBWs.end() &&
13488 It->second.first < DL->getTypeSizeInBits(Ty: CEI->getType()))
13489 ScalarArg = Builder.getFalse();
13490 OpVecs.push_back(Elt: ScalarArg);
13491 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I))
13492 TysForDecl.push_back(Elt: ScalarArg->getType());
13493 continue;
13494 }
13495
13496 Value *OpVec = vectorizeOperand(E, NodeIdx: I, PostponedPHIs);
13497 if (E->VectorizedValue) {
13498 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13499 return E->VectorizedValue;
13500 }
13501 ScalarArg = CEI->getArgOperand(i: I);
13502 if (cast<VectorType>(Val: OpVec->getType())->getElementType() !=
13503 ScalarArg->getType()->getScalarType() &&
13504 It == MinBWs.end()) {
13505 auto *CastTy =
13506 getWidenedType(ScalarTy: ScalarArg->getType(), VF: VecTy->getNumElements());
13507 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: CastTy, isSigned: GetOperandSignedness(I));
13508 } else if (It != MinBWs.end()) {
13509 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
13510 }
13511 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
13512 OpVecs.push_back(Elt: OpVec);
13513 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I))
13514 TysForDecl.push_back(Elt: OpVec->getType());
13515 }
13516
13517 Function *CF;
13518 if (!UseIntrinsic) {
13519 VFShape Shape =
13520 VFShape::get(FTy: CI->getFunctionType(),
13521 EC: ElementCount::getFixed(
13522 MinVal: static_cast<unsigned>(VecTy->getNumElements())),
13523 HasGlobalPred: false /*HasGlobalPred*/);
13524 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
13525 } else {
13526 CF = Intrinsic::getDeclaration(M: F->getParent(), id: ID, Tys: TysForDecl);
13527 }
13528
13529 SmallVector<OperandBundleDef, 1> OpBundles;
13530 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
13531 Value *V = Builder.CreateCall(Callee: CF, Args: OpVecs, OpBundles);
13532
13533 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
13534 V = FinalShuffle(V, E, VecTy);
13535
13536 E->VectorizedValue = V;
13537 ++NumVectorInstructions;
13538 return V;
13539 }
13540 case Instruction::ShuffleVector: {
13541 assert(E->isAltShuffle() &&
13542 ((Instruction::isBinaryOp(E->getOpcode()) &&
13543 Instruction::isBinaryOp(E->getAltOpcode())) ||
13544 (Instruction::isCast(E->getOpcode()) &&
13545 Instruction::isCast(E->getAltOpcode())) ||
13546 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13547 "Invalid Shuffle Vector Operand");
13548
13549 Value *LHS = nullptr, *RHS = nullptr;
13550 if (Instruction::isBinaryOp(Opcode: E->getOpcode()) || isa<CmpInst>(Val: VL0)) {
13551 setInsertPointAfterBundle(E);
13552 LHS = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13553 if (E->VectorizedValue) {
13554 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13555 return E->VectorizedValue;
13556 }
13557 RHS = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
13558 } else {
13559 setInsertPointAfterBundle(E);
13560 LHS = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13561 }
13562 if (E->VectorizedValue) {
13563 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13564 return E->VectorizedValue;
13565 }
13566 if (LHS && RHS &&
13567 ((Instruction::isBinaryOp(Opcode: E->getOpcode()) &&
13568 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
13569 (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()))) {
13570 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13571 getOperandEntry(E, 1)->isGather() ||
13572 MinBWs.contains(getOperandEntry(E, 0)) ||
13573 MinBWs.contains(getOperandEntry(E, 1))) &&
13574 "Expected item in MinBWs.");
13575 Type *CastTy = VecTy;
13576 if (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()) {
13577 if (cast<VectorType>(Val: LHS->getType())
13578 ->getElementType()
13579 ->getIntegerBitWidth() < cast<VectorType>(Val: RHS->getType())
13580 ->getElementType()
13581 ->getIntegerBitWidth())
13582 CastTy = RHS->getType();
13583 else
13584 CastTy = LHS->getType();
13585 }
13586 if (LHS->getType() != CastTy)
13587 LHS = Builder.CreateIntCast(V: LHS, DestTy: CastTy, isSigned: GetOperandSignedness(0));
13588 if (RHS->getType() != CastTy)
13589 RHS = Builder.CreateIntCast(V: RHS, DestTy: CastTy, isSigned: GetOperandSignedness(1));
13590 }
13591
13592 Value *V0, *V1;
13593 if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
13594 V0 = Builder.CreateBinOp(
13595 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13596 V1 = Builder.CreateBinOp(
13597 Opc: static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13598 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
13599 V0 = Builder.CreateCmp(Pred: CI0->getPredicate(), LHS, RHS);
13600 auto *AltCI = cast<CmpInst>(Val: E->getAltOp());
13601 CmpInst::Predicate AltPred = AltCI->getPredicate();
13602 V1 = Builder.CreateCmp(Pred: AltPred, LHS, RHS);
13603 } else {
13604 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13605 unsigned SrcBWSz = DL->getTypeSizeInBits(
13606 Ty: cast<VectorType>(Val: LHS->getType())->getElementType());
13607 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
13608 if (BWSz <= SrcBWSz) {
13609 if (BWSz < SrcBWSz)
13610 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: It->second.first);
13611 assert(LHS->getType() == VecTy && "Expected same type as operand.");
13612 if (auto *I = dyn_cast<Instruction>(Val: LHS))
13613 LHS = propagateMetadata(I, VL: E->Scalars);
13614 E->VectorizedValue = LHS;
13615 ++NumVectorInstructions;
13616 return LHS;
13617 }
13618 }
13619 V0 = Builder.CreateCast(
13620 Op: static_cast<Instruction::CastOps>(E->getOpcode()), V: LHS, DestTy: VecTy);
13621 V1 = Builder.CreateCast(
13622 Op: static_cast<Instruction::CastOps>(E->getAltOpcode()), V: LHS, DestTy: VecTy);
13623 }
13624 // Add V0 and V1 to later analysis to try to find and remove matching
13625 // instructions, if any.
13626 for (Value *V : {V0, V1}) {
13627 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13628 GatherShuffleExtractSeq.insert(X: I);
13629 CSEBlocks.insert(V: I->getParent());
13630 }
13631 }
13632
13633 // Create shuffle to take alternate operations from the vector.
13634 // Also, gather up main and alt scalar ops to propagate IR flags to
13635 // each vector operation.
13636 ValueList OpScalars, AltScalars;
13637 SmallVector<int> Mask;
13638 E->buildAltOpShuffleMask(
13639 IsAltOp: [E, this](Instruction *I) {
13640 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13641 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
13642 TLI: *TLI);
13643 },
13644 Mask, OpScalars: &OpScalars, AltScalars: &AltScalars);
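    // Editorial example (illustrative scalars): for the bundle
    // {add, sub, add, sub}, V0 is the vector add, V1 is the vector sub, and
    // the mask selects {0, 4 + 1, 2, 4 + 3} = {0, 5, 2, 7}, i.e. even lanes
    // come from V0 and odd lanes from V1 in the final shuffle.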
13645
13646 propagateIRFlags(I: V0, VL: OpScalars, OpValue: E->getMainOp(), IncludeWrapFlags: It == MinBWs.end());
13647 propagateIRFlags(I: V1, VL: AltScalars, OpValue: E->getAltOp(), IncludeWrapFlags: It == MinBWs.end());
13648 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13649 // Drop nuw flags for abs(sub(commutative), true).
13650 if (auto *I = dyn_cast<Instruction>(Val: Vec);
13651 I && Opcode == Instruction::Sub && !MinBWs.contains(Val: E) &&
13652 any_of(Range&: E->Scalars, P: [](Value *V) {
13653 auto *IV = cast<Instruction>(Val: V);
13654 return IV->getOpcode() == Instruction::Sub &&
13655 isCommutative(I: cast<Instruction>(Val: IV));
13656 }))
13657 I->setHasNoUnsignedWrap(/*b=*/false);
13658 };
13659 DropNuwFlag(V0, E->getOpcode());
13660 DropNuwFlag(V1, E->getAltOpcode());
13661
13662 Value *V = Builder.CreateShuffleVector(V1: V0, V2: V1, Mask);
13663 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13664 V = propagateMetadata(I, VL: E->Scalars);
13665 GatherShuffleExtractSeq.insert(X: I);
13666 CSEBlocks.insert(V: I->getParent());
13667 }
13668
13669 E->VectorizedValue = V;
13670 ++NumVectorInstructions;
13671
13672 return V;
13673 }
13674 default:
13675 llvm_unreachable("unknown inst");
13676 }
13677 return nullptr;
13678}
13679
13680Value *BoUpSLP::vectorizeTree() {
13681 ExtraValueToDebugLocsMap ExternallyUsedValues;
13682 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13683 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13684}
13685
13686namespace {
/// Data type for handling buildvector sequences with scalars reused from
/// other tree entries.
13689struct ShuffledInsertData {
13690 /// List of insertelements to be replaced by shuffles.
13691 SmallVector<InsertElementInst *> InsertElements;
13692 /// The parent vectors and shuffle mask for the given list of inserts.
13693 MapVector<Value *, SmallVector<int>> ValueMasks;
13694};
13695} // namespace
13696
13697Value *BoUpSLP::vectorizeTree(
13698 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13699 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13700 Instruction *ReductionRoot) {
13701 // All blocks must be scheduled before any instructions are inserted.
13702 for (auto &BSIter : BlocksSchedules) {
13703 scheduleBlock(BS: BSIter.second.get());
13704 }
  // Clear the Entry-to-LastInstruction table: it can be invalidated by
  // scheduling, so it needs to be rebuilt.
13707 EntryToLastInstruction.clear();
13708
13709 if (ReductionRoot)
13710 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
13711 IP: ReductionRoot->getIterator());
13712 else
13713 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
13714
13715 // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
13716 (void)vectorizeTree(E: VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13717 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13718 if (TE->State == TreeEntry::Vectorize &&
13719 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13720 TE->VectorizedValue)
13721 (void)vectorizeTree(E: TE.get(), /*PostponedPHIs=*/false);
  // Run through the list of postponed gathers and emit them, replacing the
  // temporarily emitted allocas with the actual vector instructions.
13724 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13725 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13726 for (const TreeEntry *E : PostponedNodes) {
13727 auto *TE = const_cast<TreeEntry *>(E);
13728 if (auto *VecTE = getTreeEntry(V: TE->Scalars.front()))
13729 if (VecTE->isSame(VL: TE->UserTreeIndices.front().UserTE->getOperand(
13730 OpIdx: TE->UserTreeIndices.front().EdgeIdx)) &&
13731 VecTE->isSame(VL: TE->Scalars))
        // Found a gather node which is exactly the same as one of the
        // vectorized nodes. This may happen after reordering.
13734 continue;
13735 auto *PrevVec = cast<Instruction>(Val&: TE->VectorizedValue);
13736 TE->VectorizedValue = nullptr;
13737 auto *UserI =
13738 cast<Instruction>(Val&: TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were some
    // unresolved dependencies at the moment the stub instruction was emitted.
    // If any of these dependencies turn out to be an operand of another PHI
    // from the same block, the position of the stub instruction becomes
    // invalid: the source vector that is supposed to feed this gather node was
    // inserted at the end of the block, after the stub instruction. So we need
    // to move the insertion point back to the end of the block.
13747 if (isa<PHINode>(Val: UserI)) {
13748 // Insert before all users.
13749 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13750 for (User *U : PrevVec->users()) {
13751 if (U == UserI)
13752 continue;
13753 auto *UI = dyn_cast<Instruction>(Val: U);
13754 if (!UI || isa<PHINode>(Val: UI) || UI->getParent() != InsertPt->getParent())
13755 continue;
13756 if (UI->comesBefore(Other: InsertPt))
13757 InsertPt = UI;
13758 }
13759 Builder.SetInsertPoint(InsertPt);
13760 } else {
13761 Builder.SetInsertPoint(PrevVec);
13762 }
13763 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13764 Value *Vec = vectorizeTree(E: TE, /*PostponedPHIs=*/false);
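    // Minimal bitwidth analysis may have narrowed the re-emitted gather, so
    // its type can differ from the stub's type. Recover the signedness from
    // the matching tree entries and cast back to the stub's type before
    // replacing its uses.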
13765 if (Vec->getType() != PrevVec->getType()) {
13766 assert(Vec->getType()->isIntOrIntVectorTy() &&
13767 PrevVec->getType()->isIntOrIntVectorTy() &&
13768 "Expected integer vector types only.");
13769 std::optional<bool> IsSigned;
13770 for (Value *V : TE->Scalars) {
13771 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13772 auto It = MinBWs.find(Val: BaseTE);
13773 if (It != MinBWs.end()) {
13774 IsSigned = IsSigned.value_or(u: false) || It->second.second;
13775 if (*IsSigned)
13776 break;
13777 }
13778 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(Val: V)) {
13779 auto It = MinBWs.find(Val: MNTE);
13780 if (It != MinBWs.end()) {
13781 IsSigned = IsSigned.value_or(u: false) || It->second.second;
13782 if (*IsSigned)
13783 break;
13784 }
13785 }
13786 if (IsSigned.value_or(u: false))
13787 break;
13788 // Scan through gather nodes.
13789 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
13790 auto It = MinBWs.find(Val: BVE);
13791 if (It != MinBWs.end()) {
13792 IsSigned = IsSigned.value_or(u: false) || It->second.second;
13793 if (*IsSigned)
13794 break;
13795 }
13796 }
13797 if (IsSigned.value_or(u: false))
13798 break;
13799 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V)) {
13800 IsSigned =
13801 IsSigned.value_or(u: false) ||
13802 !isKnownNonNegative(V: EE->getVectorOperand(), SQ: SimplifyQuery(*DL));
13803 continue;
13804 }
13805 if (IsSigned.value_or(u: false))
13806 break;
13807 }
13808 }
13809 if (IsSigned.value_or(u: false)) {
13810 // Final attempt - check user node.
13811 auto It = MinBWs.find(Val: TE->UserTreeIndices.front().UserTE);
13812 if (It != MinBWs.end())
13813 IsSigned = It->second.second;
13814 }
13815 assert(IsSigned &&
13816 "Expected user node or perfect diamond match in MinBWs.");
13817 Vec = Builder.CreateIntCast(V: Vec, DestTy: PrevVec->getType(), isSigned: *IsSigned);
13818 }
13819 PrevVec->replaceAllUsesWith(V: Vec);
13820 PostponedValues.try_emplace(Key: Vec).first->second.push_back(Elt: TE);
    // Replace the stub vector node if it was already used for one of the
    // buildvector nodes.
13823 auto It = PostponedValues.find(Val: PrevVec);
13824 if (It != PostponedValues.end()) {
13825 for (TreeEntry *VTE : It->getSecond())
13826 VTE->VectorizedValue = Vec;
13827 }
13828 eraseInstruction(I: PrevVec);
13829 }
13830
13831 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
13832 << " values .\n");
13833
13834 SmallVector<ShuffledInsertData> ShuffledInserts;
  // Maps a vector instruction to the original insertelement instruction.
13836 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13837 // Maps extract Scalar to the corresponding extractelement instruction in the
13838 // basic block. Only one extractelement per block should be emitted.
13839 DenseMap<Value *,
13840 DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13841 ScalarToEEs;
13842 SmallDenseSet<Value *, 4> UsedInserts;
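  // Caches integer casts of vectorized values, keyed by the vector and the
  // element type it was cast to, so the same cast is reused across several
  // external insertelement users.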
13843 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13844 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13845 // Extract all of the elements with the external uses.
13846 for (const auto &ExternalUse : ExternalUses) {
13847 Value *Scalar = ExternalUse.Scalar;
13848 llvm::User *User = ExternalUse.User;
13849
    // Skip users that we have already RAUWed. This happens when one
    // instruction has multiple uses of the same value.
13852 if (User && !is_contained(Range: Scalar->users(), Element: User))
13853 continue;
13854 TreeEntry *E = getTreeEntry(V: Scalar);
13855 assert(E && "Invalid scalar");
13856 assert(!E->isGather() && "Extracting from a gather list");
13857 // Non-instruction pointers are not deleted, just skip them.
13858 if (E->getOpcode() == Instruction::GetElementPtr &&
13859 !isa<GetElementPtrInst>(Val: Scalar))
13860 continue;
13861
13862 Value *Vec = E->VectorizedValue;
13863 assert(Vec && "Can't find vectorizable value");
13864
13865 Value *Lane = Builder.getInt32(C: ExternalUse.Lane);
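    // Emits (or reuses) an extractelement for Scalar from Vec at the current
    // insert point; if the tree entry was demoted, the extracted value is
    // widened back to Scalar's original type.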
13866 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13867 if (Scalar->getType() != Vec->getType()) {
13868 Value *Ex = nullptr;
13869 Value *ExV = nullptr;
13870 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Scalar);
13871 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(Ptr: GEP);
13872 auto It = ScalarToEEs.find(Val: Scalar);
13873 if (It != ScalarToEEs.end()) {
13874 // No need to emit many extracts, just move the only one in the
13875 // current block.
13876 auto EEIt = It->second.find(Val: Builder.GetInsertBlock());
13877 if (EEIt != It->second.end()) {
13878 Instruction *I = EEIt->second.first;
13879 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13880 Builder.GetInsertPoint()->comesBefore(Other: I)) {
13881 I->moveBefore(BB&: *Builder.GetInsertPoint()->getParent(),
13882 I: Builder.GetInsertPoint());
13883 if (auto *CI = EEIt->second.second)
13884 CI->moveAfter(MovePos: I);
13885 }
13886 Ex = I;
13887 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13888 }
13889 }
13890 if (!Ex) {
13891 // "Reuse" the existing extract to improve final codegen.
13892 if (auto *ES = dyn_cast<ExtractElementInst>(Val: Scalar)) {
13893 Value *V = ES->getVectorOperand();
13894 if (const TreeEntry *ETE = getTreeEntry(V))
13895 V = ETE->VectorizedValue;
13896 Ex = Builder.CreateExtractElement(Vec: V, Idx: ES->getIndexOperand());
13897 } else if (ReplaceGEP) {
            // Leave the GEPs as-is; they are free in most cases and it is
            // better to keep them as GEPs.
13900 auto *CloneGEP = GEP->clone();
13901 if (isa<Instruction>(Val: Vec))
13902 CloneGEP->insertBefore(BB&: *Builder.GetInsertBlock(),
13903 InsertPos: Builder.GetInsertPoint());
13904 else
13905 CloneGEP->insertBefore(InsertPos: GEP);
13906 if (GEP->hasName())
13907 CloneGEP->takeName(V: GEP);
13908 Ex = CloneGEP;
13909 } else {
13910 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
13911 }
13912 // If necessary, sign-extend or zero-extend ScalarRoot
13913 // to the larger type.
13914 ExV = Ex;
13915 if (Scalar->getType() != Ex->getType())
13916 ExV = Builder.CreateIntCast(V: Ex, DestTy: Scalar->getType(),
13917 isSigned: MinBWs.find(Val: E)->second.second);
13918 if (auto *I = dyn_cast<Instruction>(Val: Ex))
13919 ScalarToEEs[Scalar].try_emplace(
13920 Key: Builder.GetInsertBlock(),
13921 Args: std::make_pair(x&: I, y: cast<Instruction>(Val: ExV)));
13922 }
        // The then-branch of the previous if may produce constants, since
        // operand 0 might be a constant.
13925 if (auto *ExI = dyn_cast<Instruction>(Val: Ex)) {
13926 GatherShuffleExtractSeq.insert(X: ExI);
13927 CSEBlocks.insert(V: ExI->getParent());
13928 }
13929 return ExV;
13930 }
13931 assert(isa<FixedVectorType>(Scalar->getType()) &&
13932 isa<InsertElementInst>(Scalar) &&
13933 "In-tree scalar of vector type is not insertelement?");
13934 auto *IE = cast<InsertElementInst>(Val: Scalar);
13935 VectorToInsertElement.try_emplace(Key: Vec, Args&: IE);
13936 return Vec;
13937 };
    // If User == nullptr, the Scalar remains as a scalar in the vectorized
    // instructions or is used as an extra argument. Generate an ExtractElement
    // instruction and update the record for this scalar in
    // ExternallyUsedValues.
13941 if (!User) {
13942 if (!ScalarsWithNullptrUser.insert(V: Scalar).second)
13943 continue;
13944 assert((ExternallyUsedValues.count(Scalar) ||
13945 Scalar->hasNUsesOrMore(UsesLimit) ||
13946 any_of(Scalar->users(),
13947 [&](llvm::User *U) {
13948 if (ExternalUsesAsGEPs.contains(U))
13949 return true;
13950 TreeEntry *UseEntry = getTreeEntry(U);
13951 return UseEntry &&
13952 (UseEntry->State == TreeEntry::Vectorize ||
13953 UseEntry->State ==
13954 TreeEntry::StridedVectorize) &&
13955 (E->State == TreeEntry::Vectorize ||
13956 E->State == TreeEntry::StridedVectorize) &&
13957 doesInTreeUserNeedToExtract(
13958 Scalar,
13959 cast<Instruction>(UseEntry->Scalars.front()),
13960 TLI);
13961 })) &&
13962 "Scalar with nullptr User must be registered in "
13963 "ExternallyUsedValues map or remain as scalar in vectorized "
13964 "instructions");
13965 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
13966 if (auto *PHI = dyn_cast<PHINode>(Val: VecI))
13967 Builder.SetInsertPoint(TheBB: PHI->getParent(),
13968 IP: PHI->getParent()->getFirstNonPHIIt());
13969 else
13970 Builder.SetInsertPoint(TheBB: VecI->getParent(),
13971 IP: std::next(x: VecI->getIterator()));
13972 } else {
13973 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
13974 }
13975 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13976 // Required to update internally referenced instructions.
13977 Scalar->replaceAllUsesWith(V: NewInst);
13978 ReplacedExternals.emplace_back(Args&: Scalar, Args&: NewInst);
13979 continue;
13980 }
13981
13982 if (auto *VU = dyn_cast<InsertElementInst>(Val: User);
13983 VU && VU->getOperand(i_nocapture: 1) == Scalar) {
13984 // Skip if the scalar is another vector op or Vec is not an instruction.
13985 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Val: Vec)) {
13986 if (auto *FTy = dyn_cast<FixedVectorType>(Val: User->getType())) {
13987 if (!UsedInserts.insert(V: VU).second)
13988 continue;
13989 // Need to use original vector, if the root is truncated.
13990 auto BWIt = MinBWs.find(Val: E);
13991 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13992 auto *ScalarTy = FTy->getElementType();
13993 auto Key = std::make_pair(x&: Vec, y&: ScalarTy);
13994 auto VecIt = VectorCasts.find(Val: Key);
13995 if (VecIt == VectorCasts.end()) {
13996 IRBuilderBase::InsertPointGuard Guard(Builder);
13997 if (auto *IVec = dyn_cast<PHINode>(Val: Vec))
13998 Builder.SetInsertPoint(
13999 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
14000 else if (auto *IVec = dyn_cast<Instruction>(Val: Vec))
14001 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
14002 Vec = Builder.CreateIntCast(
14003 V: Vec,
14004 DestTy: getWidenedType(
14005 ScalarTy,
14006 VF: cast<FixedVectorType>(Val: Vec->getType())->getNumElements()),
14007 isSigned: BWIt->second.second);
14008 VectorCasts.try_emplace(Key, Args&: Vec);
14009 } else {
14010 Vec = VecIt->second;
14011 }
14012 }
14013
14014 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
14015 if (InsertIdx) {
14016 auto *It =
14017 find_if(Range&: ShuffledInserts, P: [VU](const ShuffledInsertData &Data) {
14018 // Checks if 2 insertelements are from the same buildvector.
14019 InsertElementInst *VecInsert = Data.InsertElements.front();
14020 return areTwoInsertFromSameBuildVector(
14021 VU, V: VecInsert,
14022 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); });
14023 });
14024 unsigned Idx = *InsertIdx;
14025 if (It == ShuffledInserts.end()) {
14026 (void)ShuffledInserts.emplace_back();
14027 It = std::next(x: ShuffledInserts.begin(),
14028 n: ShuffledInserts.size() - 1);
14029 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14030 if (Mask.empty())
14031 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
14032 // Find the insertvector, vectorized in tree, if any.
14033 Value *Base = VU;
14034 while (auto *IEBase = dyn_cast<InsertElementInst>(Val: Base)) {
14035 if (IEBase != User &&
14036 (!IEBase->hasOneUse() ||
14037 getElementIndex(Inst: IEBase).value_or(u&: Idx) == Idx))
14038 break;
14039 // Build the mask for the vectorized insertelement instructions.
14040 if (const TreeEntry *E = getTreeEntry(V: IEBase)) {
14041 do {
14042 IEBase = cast<InsertElementInst>(Val: Base);
14043 int IEIdx = *getElementIndex(Inst: IEBase);
14044 assert(Mask[IEIdx] == PoisonMaskElem &&
14045 "InsertElementInstruction used already.");
14046 Mask[IEIdx] = IEIdx;
14047 Base = IEBase->getOperand(i_nocapture: 0);
14048 } while (E == getTreeEntry(V: Base));
14049 break;
14050 }
14051 Base = cast<InsertElementInst>(Val: Base)->getOperand(i_nocapture: 0);
              // After vectorization the def-use chain has changed, so we need
              // to look through the original insertelement instructions if
              // they were replaced by vector instructions.
14055 auto It = VectorToInsertElement.find(Val: Base);
14056 if (It != VectorToInsertElement.end())
14057 Base = It->second;
14058 }
14059 }
14060 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14061 if (Mask.empty())
14062 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
14063 Mask[Idx] = ExternalUse.Lane;
14064 It->InsertElements.push_back(Elt: cast<InsertElementInst>(Val: User));
14065 continue;
14066 }
14067 }
14068 }
14069 }
14070
14071 // Generate extracts for out-of-tree users.
14072 // Find the insertion point for the extractelement lane.
14073 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
14074 if (PHINode *PH = dyn_cast<PHINode>(Val: User)) {
14075 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
14076 if (PH->getIncomingValue(i: I) == Scalar) {
14077 Instruction *IncomingTerminator =
14078 PH->getIncomingBlock(i: I)->getTerminator();
14079 if (isa<CatchSwitchInst>(Val: IncomingTerminator)) {
14080 Builder.SetInsertPoint(TheBB: VecI->getParent(),
14081 IP: std::next(x: VecI->getIterator()));
14082 } else {
14083 Builder.SetInsertPoint(PH->getIncomingBlock(i: I)->getTerminator());
14084 }
14085 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14086 PH->setOperand(i_nocapture: I, Val_nocapture: NewInst);
14087 }
14088 }
14089 } else {
14090 Builder.SetInsertPoint(cast<Instruction>(Val: User));
14091 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14092 User->replaceUsesOfWith(From: Scalar, To: NewInst);
14093 }
14094 } else {
14095 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
14096 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14097 User->replaceUsesOfWith(From: Scalar, To: NewInst);
14098 }
14099
14100 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
14101 }
14102
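  // Builds a one- or two-source shuffle from the combined mask: indices below
  // V1's width select from V1, the rest are rebased onto V2.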
14103 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14104 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
14105 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
14106 int VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
14107 for (int I = 0, E = Mask.size(); I < E; ++I) {
14108 if (Mask[I] < VF)
14109 CombinedMask1[I] = Mask[I];
14110 else
14111 CombinedMask2[I] = Mask[I] - VF;
14112 }
14113 ShuffleInstructionBuilder ShuffleBuilder(
14114 cast<VectorType>(Val: V1->getType())->getElementType(), Builder, *this);
14115 ShuffleBuilder.add(V1, Mask: CombinedMask1);
14116 if (V2)
14117 ShuffleBuilder.add(V1: V2, Mask: CombinedMask2);
14118 return ShuffleBuilder.finalize(ExtMask: std::nullopt);
14119 };
14120
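  // Brings Vec to the vector factor implied by Mask: if the mask already
  // selects lanes at or beyond that factor it is applied directly, otherwise
  // an identity resize over the referenced lanes is emitted.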
14121 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
14122 bool ForSingleMask) {
14123 unsigned VF = Mask.size();
14124 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
14125 if (VF != VecVF) {
14126 if (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
14127 Vec = CreateShuffle(Vec, nullptr, Mask);
14128 return std::make_pair(x&: Vec, y: true);
14129 }
14130 if (!ForSingleMask) {
14131 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14132 for (unsigned I = 0; I < VF; ++I) {
14133 if (Mask[I] != PoisonMaskElem)
14134 ResizeMask[Mask[I]] = Mask[I];
14135 }
14136 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
14137 }
14138 }
14139
14140 return std::make_pair(x&: Vec, y: false);
14141 };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
14144 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
14145 // Find the first and the last instruction in the list of insertelements.
14146 sort(C&: ShuffledInserts[I].InsertElements, Comp: isFirstInsertElement);
14147 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
14148 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
14149 Builder.SetInsertPoint(LastInsert);
14150 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
14151 Value *NewInst = performExtractsShuffleAction<Value>(
14152 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()),
14153 Base: FirstInsert->getOperand(i_nocapture: 0),
14154 GetVF: [](Value *Vec) {
14155 return cast<VectorType>(Val: Vec->getType())
14156 ->getElementCount()
14157 .getKnownMinValue();
14158 },
14159 ResizeAction: ResizeToVF,
14160 Action: [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
14161 ArrayRef<Value *> Vals) {
14162 assert((Vals.size() == 1 || Vals.size() == 2) &&
14163 "Expected exactly 1 or 2 input values.");
14164 if (Vals.size() == 1) {
14165 // Do not create shuffle if the mask is a simple identity
14166 // non-resizing mask.
14167 if (Mask.size() != cast<FixedVectorType>(Val: Vals.front()->getType())
14168 ->getNumElements() ||
14169 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
14170 return CreateShuffle(Vals.front(), nullptr, Mask);
14171 return Vals.front();
14172 }
14173 return CreateShuffle(Vals.front() ? Vals.front()
14174 : FirstInsert->getOperand(i_nocapture: 0),
14175 Vals.back(), Mask);
14176 });
14177 auto It = ShuffledInserts[I].InsertElements.rbegin();
14178 // Rebuild buildvector chain.
14179 InsertElementInst *II = nullptr;
14180 if (It != ShuffledInserts[I].InsertElements.rend())
14181 II = *It;
14182 SmallVector<Instruction *> Inserts;
14183 while (It != ShuffledInserts[I].InsertElements.rend()) {
14184 assert(II && "Must be an insertelement instruction.");
14185 if (*It == II)
14186 ++It;
14187 else
14188 Inserts.push_back(Elt: cast<Instruction>(Val: II));
14189 II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
14190 }
14191 for (Instruction *II : reverse(C&: Inserts)) {
14192 II->replaceUsesOfWith(From: II->getOperand(i: 0), To: NewInst);
14193 if (auto *NewI = dyn_cast<Instruction>(Val: NewInst))
14194 if (II->getParent() == NewI->getParent() && II->comesBefore(Other: NewI))
14195 II->moveAfter(MovePos: NewI);
14196 NewInst = II;
14197 }
14198 LastInsert->replaceAllUsesWith(V: NewInst);
14199 for (InsertElementInst *IE : reverse(C&: ShuffledInserts[I].InsertElements)) {
14200 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 0),
14201 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 0)->getType()));
14202 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 1),
14203 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 1)->getType()));
14204 eraseInstruction(I: IE);
14205 }
14206 CSEBlocks.insert(V: LastInsert->getParent());
14207 }
14208
14209 SmallVector<Instruction *> RemovedInsts;
14210 // For each vectorized value:
14211 for (auto &TEPtr : VectorizableTree) {
14212 TreeEntry *Entry = TEPtr.get();
14213
14214 // No need to handle users of gathered values.
14215 if (Entry->isGather())
14216 continue;
14217
14218 assert(Entry->VectorizedValue && "Can't find vectorizable value");
14219
14220 // For each lane:
14221 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
14222 Value *Scalar = Entry->Scalars[Lane];
14223
14224 if (Entry->getOpcode() == Instruction::GetElementPtr &&
14225 !isa<GetElementPtrInst>(Val: Scalar))
14226 continue;
14227#ifndef NDEBUG
14228 Type *Ty = Scalar->getType();
14229 if (!Ty->isVoidTy()) {
14230 for (User *U : Scalar->users()) {
14231 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
14232
14233 // It is legal to delete users in the ignorelist.
14234 assert((getTreeEntry(U) ||
14235 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14236 (isa_and_nonnull<Instruction>(U) &&
14237 isDeleted(cast<Instruction>(U)))) &&
14238 "Deleting out-of-tree value");
14239 }
14240 }
14241#endif
14242 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
14243 auto *I = cast<Instruction>(Val: Scalar);
14244 RemovedInsts.push_back(Elt: I);
14245 }
14246 }
14247
14248 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
14249 // new vector instruction.
14250 if (auto *V = dyn_cast<Instruction>(Val&: VectorizableTree[0]->VectorizedValue))
14251 V->mergeDIAssignID(SourceInstructions: RemovedInsts);
14252
14253 // Clear up reduction references, if any.
14254 if (UserIgnoreList) {
14255 for (Instruction *I : RemovedInsts) {
14256 if (getTreeEntry(V: I)->Idx != 0)
14257 continue;
14258 SmallVector<SelectInst *> LogicalOpSelects;
14259 I->replaceUsesWithIf(New: PoisonValue::get(T: I->getType()), ShouldReplace: [&](Use &U) {
      // Do not replace the condition of logical ops in select form
      // (select <cond>, ...).
14261 bool IsPoisoningLogicalOp = isa<SelectInst>(Val: U.getUser()) &&
14262 (match(V: U.getUser(), P: m_LogicalAnd()) ||
14263 match(V: U.getUser(), P: m_LogicalOr())) &&
14264 U.getOperandNo() == 0;
14265 if (IsPoisoningLogicalOp) {
14266 LogicalOpSelects.push_back(Elt: cast<SelectInst>(Val: U.getUser()));
14267 return false;
14268 }
14269 return UserIgnoreList->contains(V: U.getUser());
14270 });
14271 // Replace conditions of the poisoning logical ops with the non-poison
14272 // constant value.
14273 for (SelectInst *SI : LogicalOpSelects)
14274 SI->setCondition(Constant::getNullValue(Ty: SI->getCondition()->getType()));
14275 }
14276 }
14277 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
14278 // cache correctness.
  // NOTE: removeInstructionsAndOperands only marks the instructions for
  // deletion - they are not actually deleted until later.
14281 removeInstructionsAndOperands(DeadVals: ArrayRef(RemovedInsts));
14282
14283 Builder.ClearInsertionPoint();
14284 InstrElementSize.clear();
14285
14286 const TreeEntry &RootTE = *VectorizableTree.front();
14287 Value *Vec = RootTE.VectorizedValue;
14288 if (auto It = MinBWs.find(Val: &RootTE); ReductionBitWidth != 0 &&
14289 It != MinBWs.end() &&
14290 ReductionBitWidth != It->second.first) {
14291 IRBuilder<>::InsertPointGuard Guard(Builder);
14292 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
14293 IP: ReductionRoot->getIterator());
14294 Vec = Builder.CreateIntCast(
14295 V: Vec,
14296 DestTy: VectorType::get(ElementType: Builder.getIntNTy(N: ReductionBitWidth),
14297 EC: cast<VectorType>(Val: Vec->getType())->getElementCount()),
14298 isSigned: It->second.second);
14299 }
14300 return Vec;
14301}
14302
14303void BoUpSLP::optimizeGatherSequence() {
14304 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequence instructions.\n");
14306 // LICM InsertElementInst sequences.
14307 for (Instruction *I : GatherShuffleExtractSeq) {
14308 if (isDeleted(I))
14309 continue;
14310
14311 // Check if this block is inside a loop.
14312 Loop *L = LI->getLoopFor(BB: I->getParent());
14313 if (!L)
14314 continue;
14315
14316 // Check if it has a preheader.
14317 BasicBlock *PreHeader = L->getLoopPreheader();
14318 if (!PreHeader)
14319 continue;
14320
    // If the vector or the element that we insert into it is an instruction
    // that is defined inside this loop, then we can't hoist this instruction
    // out of the loop.
14324 if (any_of(Range: I->operands(), P: [L](Value *V) {
14325 auto *OpI = dyn_cast<Instruction>(Val: V);
14326 return OpI && L->contains(Inst: OpI);
14327 }))
14328 continue;
14329
14330 // We can hoist this instruction. Move it to the pre-header.
14331 I->moveBefore(MovePos: PreHeader->getTerminator());
14332 CSEBlocks.insert(V: PreHeader);
14333 }
14334
14335 // Make a list of all reachable blocks in our CSE queue.
14336 SmallVector<const DomTreeNode *, 8> CSEWorkList;
14337 CSEWorkList.reserve(N: CSEBlocks.size());
14338 for (BasicBlock *BB : CSEBlocks)
14339 if (DomTreeNode *N = DT->getNode(BB)) {
14340 assert(DT->isReachableFromEntry(N));
14341 CSEWorkList.push_back(Elt: N);
14342 }
14343
14344 // Sort blocks by domination. This ensures we visit a block after all blocks
14345 // dominating it are visited.
14346 llvm::sort(C&: CSEWorkList, Comp: [](const DomTreeNode *A, const DomTreeNode *B) {
14347 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
14348 "Different nodes should have different DFS numbers");
14349 return A->getDFSNumIn() < B->getDFSNumIn();
14350 });
14351
  // Less defined shuffles can be replaced by their more defined copies.
  // Between two shuffles, one is less defined if it has the same vector
  // operands and each of its mask indices is either the same as in the other
  // one or undef. E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined
  // than shuffle %0, poison, <0, 0, 0, 0>.
14357 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
14358 SmallVectorImpl<int> &NewMask) {
14359 if (I1->getType() != I2->getType())
14360 return false;
14361 auto *SI1 = dyn_cast<ShuffleVectorInst>(Val: I1);
14362 auto *SI2 = dyn_cast<ShuffleVectorInst>(Val: I2);
14363 if (!SI1 || !SI2)
14364 return I1->isIdenticalTo(I: I2);
14365 if (SI1->isIdenticalTo(I: SI2))
14366 return true;
14367 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
14368 if (SI1->getOperand(i_nocapture: I) != SI2->getOperand(i_nocapture: I))
14369 return false;
14370 // Check if the second instruction is more defined than the first one.
14371 NewMask.assign(in_start: SI2->getShuffleMask().begin(), in_end: SI2->getShuffleMask().end());
14372 ArrayRef<int> SM1 = SI1->getShuffleMask();
14373 // Count trailing undefs in the mask to check the final number of used
14374 // registers.
14375 unsigned LastUndefsCnt = 0;
14376 for (int I = 0, E = NewMask.size(); I < E; ++I) {
14377 if (SM1[I] == PoisonMaskElem)
14378 ++LastUndefsCnt;
14379 else
14380 LastUndefsCnt = 0;
14381 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
14382 NewMask[I] != SM1[I])
14383 return false;
14384 if (NewMask[I] == PoisonMaskElem)
14385 NewMask[I] = SM1[I];
14386 }
14387 // Check if the last undefs actually change the final number of used vector
14388 // registers.
14389 return SM1.size() - LastUndefsCnt > 1 &&
14390 TTI->getNumberOfParts(Tp: SI1->getType()) ==
14391 TTI->getNumberOfParts(
14392 Tp: getWidenedType(ScalarTy: SI1->getType()->getElementType(),
14393 VF: SM1.size() - LastUndefsCnt));
14394 };
14395 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
14396 // instructions. TODO: We can further optimize this scan if we split the
14397 // instructions into different buckets based on the insert lane.
14398 SmallVector<Instruction *, 16> Visited;
14399 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
14400 assert(*I &&
14401 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
14402 "Worklist not sorted properly!");
14403 BasicBlock *BB = (*I)->getBlock();
14404 // For all instructions in blocks containing gather sequences:
14405 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
14406 if (isDeleted(I: &In))
14407 continue;
14408 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(Val: &In) &&
14409 !GatherShuffleExtractSeq.contains(key: &In))
14410 continue;
14411
14412 // Check if we can replace this instruction with any of the
14413 // visited instructions.
14414 bool Replaced = false;
14415 for (Instruction *&V : Visited) {
14416 SmallVector<int> NewMask;
14417 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
14418 DT->dominates(A: V->getParent(), B: In.getParent())) {
14419 In.replaceAllUsesWith(V);
14420 eraseInstruction(I: &In);
14421 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: V))
14422 if (!NewMask.empty())
14423 SI->setShuffleMask(NewMask);
14424 Replaced = true;
14425 break;
14426 }
14427 if (isa<ShuffleVectorInst>(Val: In) && isa<ShuffleVectorInst>(Val: V) &&
14428 GatherShuffleExtractSeq.contains(key: V) &&
14429 IsIdenticalOrLessDefined(V, &In, NewMask) &&
14430 DT->dominates(A: In.getParent(), B: V->getParent())) {
14431 In.moveAfter(MovePos: V);
14432 V->replaceAllUsesWith(V: &In);
14433 eraseInstruction(I: V);
14434 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: &In))
14435 if (!NewMask.empty())
14436 SI->setShuffleMask(NewMask);
14437 V = &In;
14438 Replaced = true;
14439 break;
14440 }
14441 }
14442 if (!Replaced) {
14443 assert(!is_contained(Visited, &In));
14444 Visited.push_back(Elt: &In);
14445 }
14446 }
14447 }
14448 CSEBlocks.clear();
14449 GatherShuffleExtractSeq.clear();
14450}
14451
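// Links the ScheduleData of all schedulable values in VL into a single bundle
// headed by the first member; the head later acts as the scheduling entity for
// the whole bundle.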
14452BoUpSLP::ScheduleData *
14453BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
14454 ScheduleData *Bundle = nullptr;
14455 ScheduleData *PrevInBundle = nullptr;
14456 for (Value *V : VL) {
14457 if (doesNotNeedToBeScheduled(V))
14458 continue;
14459 ScheduleData *BundleMember = getScheduleData(V);
14460 assert(BundleMember &&
14461 "no ScheduleData for bundle member "
14462 "(maybe not in same basic block)");
14463 assert(BundleMember->isSchedulingEntity() &&
14464 "bundle member already part of other bundle");
14465 if (PrevInBundle) {
14466 PrevInBundle->NextInBundle = BundleMember;
14467 } else {
14468 Bundle = BundleMember;
14469 }
14470
14471 // Group the instructions to a bundle.
14472 BundleMember->FirstInBundle = Bundle;
14473 PrevInBundle = BundleMember;
14474 }
14475 assert(Bundle && "Failed to find schedule bundle");
14476 return Bundle;
14477}
14478
// Groups the instructions into a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
14481std::optional<BoUpSLP::ScheduleData *>
14482BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
14483 const InstructionsState &S) {
14484 // No need to schedule PHIs, insertelement, extractelement and extractvalue
14485 // instructions.
14486 if (isa<PHINode>(Val: S.OpValue) || isVectorLikeInstWithConstOps(V: S.OpValue) ||
14487 doesNotNeedToSchedule(VL))
14488 return nullptr;
14489
14490 // Initialize the instruction bundle.
14491 Instruction *OldScheduleEnd = ScheduleEnd;
14492 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
14493
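  // Clears stale dependencies if the scheduling region grew, computes the
  // dependencies of the new bundle (if any), and keeps scheduling ready
  // bundles until the new bundle becomes ready (or, when there is no bundle,
  // until the ready list drains).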
14494 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
14495 ScheduleData *Bundle) {
14496 // The scheduling region got new instructions at the lower end (or it is a
14497 // new region for the first bundle). This makes it necessary to
14498 // recalculate all dependencies.
14499 // It is seldom that this needs to be done a second time after adding the
14500 // initial bundle to the region.
14501 if (ScheduleEnd != OldScheduleEnd) {
14502 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
14503 doForAllOpcodes(V: I, Action: [](ScheduleData *SD) { SD->clearDependencies(); });
14504 ReSchedule = true;
14505 }
14506 if (Bundle) {
14507 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
14508 << " in block " << BB->getName() << "\n");
14509 calculateDependencies(SD: Bundle, /*InsertInReadyList=*/true, SLP);
14510 }
14511
14512 if (ReSchedule) {
14513 resetSchedule();
14514 initialFillReadyList(ReadyList&: ReadyInsts);
14515 }
14516
14517 // Now try to schedule the new bundle or (if no bundle) just calculate
14518 // dependencies. As soon as the bundle is "ready" it means that there are no
    // cyclic dependencies and we can schedule it. Note that it's important
    // that we don't "schedule" the bundle yet (see cancelScheduling).
14521 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14522 !ReadyInsts.empty()) {
14523 ScheduleData *Picked = ReadyInsts.pop_back_val();
14524 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14525 "must be ready to schedule");
14526 schedule(SD: Picked, ReadyList&: ReadyInsts);
14527 }
14528 };
14529
14530 // Make sure that the scheduling region contains all
14531 // instructions of the bundle.
14532 for (Value *V : VL) {
14533 if (doesNotNeedToBeScheduled(V))
14534 continue;
14535 if (!extendSchedulingRegion(V, S)) {
      // The scheduling region got new instructions at the lower end (or it is
      // a new region for the first bundle). This makes it necessary to
      // recalculate all dependencies.
      // Otherwise the compiler may crash trying to calculate dependencies
      // incorrectly and emit instructions in the wrong order during the actual
      // scheduling.
14542 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
14543 return std::nullopt;
14544 }
14545 }
14546
14547 bool ReSchedule = false;
14548 for (Value *V : VL) {
14549 if (doesNotNeedToBeScheduled(V))
14550 continue;
14551 ScheduleData *BundleMember = getScheduleData(V);
14552 assert(BundleMember &&
14553 "no ScheduleData for bundle member (maybe not in same basic block)");
14554
    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
14557 ReadyInsts.remove(X: BundleMember);
14558
14559 if (!BundleMember->IsScheduled)
14560 continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
14564 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
14565 << " was already scheduled\n");
14566 ReSchedule = true;
14567 }
14568
14569 auto *Bundle = buildBundle(VL);
14570 TryScheduleBundleImpl(ReSchedule, Bundle);
14571 if (!Bundle->isReady()) {
14572 cancelScheduling(VL, OpValue: S.OpValue);
14573 return std::nullopt;
14574 }
14575 return Bundle;
14576}
14577
14578void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
14579 Value *OpValue) {
14580 if (isa<PHINode>(Val: OpValue) || isVectorLikeInstWithConstOps(V: OpValue) ||
14581 doesNotNeedToSchedule(VL))
14582 return;
14583
14584 if (doesNotNeedToBeScheduled(V: OpValue))
14585 OpValue = *find_if_not(Range&: VL, P: doesNotNeedToBeScheduled);
14586 ScheduleData *Bundle = getScheduleData(V: OpValue);
14587 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
14588 assert(!Bundle->IsScheduled &&
14589 "Can't cancel bundle which is already scheduled");
14590 assert(Bundle->isSchedulingEntity() &&
14591 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
14592 "tried to unbundle something which is not a bundle");
14593
14594 // Remove the bundle from the ready list.
14595 if (Bundle->isReady())
14596 ReadyInsts.remove(X: Bundle);
14597
14598 // Un-bundle: make single instructions out of the bundle.
14599 ScheduleData *BundleMember = Bundle;
14600 while (BundleMember) {
14601 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
14602 BundleMember->FirstInBundle = BundleMember;
14603 ScheduleData *Next = BundleMember->NextInBundle;
14604 BundleMember->NextInBundle = nullptr;
14605 BundleMember->TE = nullptr;
14606 if (BundleMember->unscheduledDepsInBundle() == 0) {
14607 ReadyInsts.insert(X: BundleMember);
14608 }
14609 BundleMember = Next;
14610 }
14611}
14612
14613BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14614 // Allocate a new ScheduleData for the instruction.
14615 if (ChunkPos >= ChunkSize) {
14616 ScheduleDataChunks.push_back(Elt: std::make_unique<ScheduleData[]>(num: ChunkSize));
14617 ChunkPos = 0;
14618 }
14619 return &(ScheduleDataChunks.back()[ChunkPos++]);
14620}
14621
14622bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14623 const InstructionsState &S) {
14624 if (getScheduleData(V, Key: isOneOf(S, Op: V)))
14625 return true;
14626 Instruction *I = dyn_cast<Instruction>(Val: V);
14627 assert(I && "bundle member must be an instruction");
14628 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14629 !doesNotNeedToBeScheduled(I) &&
14630 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14631 "be scheduled");
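  // If I already has ScheduleData in the current region, allocate an extra
  // entry keyed by the bundle's main value (S.OpValue) so the same instruction
  // can be tracked separately for this bundle.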
14632 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14633 ScheduleData *ISD = getScheduleData(I);
14634 if (!ISD)
14635 return false;
14636 assert(isInSchedulingRegion(ISD) &&
14637 "ScheduleData not in scheduling region");
14638 ScheduleData *SD = allocateScheduleDataChunks();
14639 SD->Inst = I;
14640 SD->init(BlockSchedulingRegionID: SchedulingRegionID, OpVal: S.OpValue);
14641 ExtraScheduleDataMap[I][S.OpValue] = SD;
14642 return true;
14643 };
14644 if (CheckScheduleForI(I))
14645 return true;
14646 if (!ScheduleStart) {
14647 // It's the first instruction in the new region.
14648 initScheduleData(FromI: I, ToI: I->getNextNode(), PrevLoadStore: nullptr, NextLoadStore: nullptr);
14649 ScheduleStart = I;
14650 ScheduleEnd = I->getNextNode();
14651 if (isOneOf(S, Op: I) != I)
14652 CheckScheduleForI(I);
14653 assert(ScheduleEnd && "tried to vectorize a terminator?");
14654 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
14655 return true;
14656 }
14657 // Search up and down at the same time, because we don't know if the new
14658 // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so they are not
  // counted against the budget. Otherwise debug info could affect codegen.
14661 BasicBlock::reverse_iterator UpIter =
14662 ++ScheduleStart->getIterator().getReverse();
14663 BasicBlock::reverse_iterator UpperEnd = BB->rend();
14664 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14665 BasicBlock::iterator LowerEnd = BB->end();
14666 auto IsAssumeLikeIntr = [](const Instruction &I) {
14667 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
14668 return II->isAssumeLikeIntrinsic();
14669 return false;
14670 };
14671 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
14672 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
14673 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14674 &*DownIter != I) {
14675 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14676 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
14677 return false;
14678 }
14679
14680 ++UpIter;
14681 ++DownIter;
14682
14683 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
14684 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
14685 }
14686 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14687 assert(I->getParent() == ScheduleStart->getParent() &&
14688 "Instruction is in wrong basic block.");
14689 initScheduleData(FromI: I, ToI: ScheduleStart, PrevLoadStore: nullptr, NextLoadStore: FirstLoadStoreInRegion);
14690 ScheduleStart = I;
14691 if (isOneOf(S, Op: I) != I)
14692 CheckScheduleForI(I);
14693 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
14694 << "\n");
14695 return true;
14696 }
14697 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14698 "Expected to reach top of the basic block or instruction down the "
14699 "lower end.");
14700 assert(I->getParent() == ScheduleEnd->getParent() &&
14701 "Instruction is in wrong basic block.");
14702 initScheduleData(FromI: ScheduleEnd, ToI: I->getNextNode(), PrevLoadStore: LastLoadStoreInRegion,
14703 NextLoadStore: nullptr);
14704 ScheduleEnd = I->getNextNode();
14705 if (isOneOf(S, Op: I) != I)
14706 CheckScheduleForI(I);
14707 assert(ScheduleEnd && "tried to vectorize a terminator?");
14708 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
14709 return true;
14710}
14711
14712void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14713 Instruction *ToI,
14714 ScheduleData *PrevLoadStore,
14715 ScheduleData *NextLoadStore) {
14716 ScheduleData *CurrentLoadStore = PrevLoadStore;
14717 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14718 // No need to allocate data for non-schedulable instructions.
14719 if (doesNotNeedToBeScheduled(V: I))
14720 continue;
14721 ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
14722 if (!SD) {
14723 SD = allocateScheduleDataChunks();
14724 ScheduleDataMap[I] = SD;
14725 SD->Inst = I;
14726 }
14727 assert(!isInSchedulingRegion(SD) &&
14728 "new ScheduleData already in scheduling region");
14729 SD->init(BlockSchedulingRegionID: SchedulingRegionID, OpVal: I);
14730
14731 if (I->mayReadOrWriteMemory() &&
14732 (!isa<IntrinsicInst>(Val: I) ||
14733 (cast<IntrinsicInst>(Val: I)->getIntrinsicID() != Intrinsic::sideeffect &&
14734 cast<IntrinsicInst>(Val: I)->getIntrinsicID() !=
14735 Intrinsic::pseudoprobe))) {
14736 // Update the linked list of memory accessing instructions.
14737 if (CurrentLoadStore) {
14738 CurrentLoadStore->NextLoadStore = SD;
14739 } else {
14740 FirstLoadStoreInRegion = SD;
14741 }
14742 CurrentLoadStore = SD;
14743 }
14744
14745 if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
14746 match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
14747 RegionHasStackSave = true;
14748 }
14749 if (NextLoadStore) {
14750 if (CurrentLoadStore)
14751 CurrentLoadStore->NextLoadStore = NextLoadStore;
14752 } else {
14753 LastLoadStoreInRegion = CurrentLoadStore;
14754 }
14755}
14756
14757void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14758 bool InsertInReadyList,
14759 BoUpSLP *SLP) {
14760 assert(SD->isSchedulingEntity());
14761
14762 SmallVector<ScheduleData *, 10> WorkList;
14763 WorkList.push_back(Elt: SD);
14764
14765 while (!WorkList.empty()) {
14766 ScheduleData *SD = WorkList.pop_back_val();
14767 for (ScheduleData *BundleMember = SD; BundleMember;
14768 BundleMember = BundleMember->NextInBundle) {
14769 assert(isInSchedulingRegion(BundleMember));
14770 if (BundleMember->hasValidDependencies())
14771 continue;
14772
14773 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14774 << "\n");
14775 BundleMember->Dependencies = 0;
14776 BundleMember->resetUnscheduledDeps();
14777
14778 // Handle def-use chain dependencies.
14779 if (BundleMember->OpValue != BundleMember->Inst) {
14780 if (ScheduleData *UseSD = getScheduleData(I: BundleMember->Inst)) {
14781 BundleMember->Dependencies++;
14782 ScheduleData *DestBundle = UseSD->FirstInBundle;
14783 if (!DestBundle->IsScheduled)
14784 BundleMember->incrementUnscheduledDeps(Incr: 1);
14785 if (!DestBundle->hasValidDependencies())
14786 WorkList.push_back(Elt: DestBundle);
14787 }
14788 } else {
14789 for (User *U : BundleMember->Inst->users()) {
14790 if (ScheduleData *UseSD = getScheduleData(I: cast<Instruction>(Val: U))) {
14791 BundleMember->Dependencies++;
14792 ScheduleData *DestBundle = UseSD->FirstInBundle;
14793 if (!DestBundle->IsScheduled)
14794 BundleMember->incrementUnscheduledDeps(Incr: 1);
14795 if (!DestBundle->hasValidDependencies())
14796 WorkList.push_back(Elt: DestBundle);
14797 }
14798 }
14799 }
14800
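      // Records that I is control-dependent on BundleMember->Inst: the bundle
      // member gains a dependency that is only resolved once I is scheduled,
      // which keeps I ordered after it in the final schedule.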
14801 auto MakeControlDependent = [&](Instruction *I) {
14802 auto *DepDest = getScheduleData(I);
14803 assert(DepDest && "must be in schedule window");
14804 DepDest->ControlDependencies.push_back(Elt: BundleMember);
14805 BundleMember->Dependencies++;
14806 ScheduleData *DestBundle = DepDest->FirstInBundle;
14807 if (!DestBundle->IsScheduled)
14808 BundleMember->incrementUnscheduledDeps(Incr: 1);
14809 if (!DestBundle->hasValidDependencies())
14810 WorkList.push_back(Elt: DestBundle);
14811 };
14812
      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
14816 if (!isGuaranteedToTransferExecutionToSuccessor(I: BundleMember->Inst)) {
14817 for (Instruction *I = BundleMember->Inst->getNextNode();
14818 I != ScheduleEnd; I = I->getNextNode()) {
14819 if (isSafeToSpeculativelyExecute(I, CtxI: &*BB->begin(), AC: SLP->AC))
14820 continue;
14821
14822 // Add the dependency
14823 MakeControlDependent(I);
14824
14825 if (!isGuaranteedToTransferExecutionToSuccessor(I))
14826 // Everything past here must be control dependent on I.
14827 break;
14828 }
14829 }
14830
14831 if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from reordering above a preceding stackrestore.
14835 if (match(V: BundleMember->Inst, P: m_Intrinsic<Intrinsic::stacksave>()) ||
14836 match(V: BundleMember->Inst, P: m_Intrinsic<Intrinsic::stackrestore>())) {
14837 for (Instruction *I = BundleMember->Inst->getNextNode();
14838 I != ScheduleEnd; I = I->getNextNode()) {
14839 if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
14840 match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
14841 // Any allocas past here must be control dependent on I, and I
            // must be memory dependent on BundleMember->Inst.
14843 break;
14844
14845 if (!isa<AllocaInst>(Val: I))
14846 continue;
14847
14848 // Add the dependency
14849 MakeControlDependent(I);
14850 }
14851 }
14852
        // In addition to the cases handled just above, we need to prevent
        // allocas and loads/stores from moving below a stacksave or a
        // stackrestore. Avoiding moving allocas below a stackrestore is
        // currently thought to be conservative. Moving loads/stores below a
        // stackrestore can lead to incorrect code.
14858 if (isa<AllocaInst>(Val: BundleMember->Inst) ||
14859 BundleMember->Inst->mayReadOrWriteMemory()) {
14860 for (Instruction *I = BundleMember->Inst->getNextNode();
14861 I != ScheduleEnd; I = I->getNextNode()) {
14862 if (!match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) &&
14863 !match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
14864 continue;
14865
14866 // Add the dependency
14867 MakeControlDependent(I);
14868 break;
14869 }
14870 }
14871 }
14872
14873 // Handle the memory dependencies (if any).
14874 ScheduleData *DepDest = BundleMember->NextLoadStore;
14875 if (!DepDest)
14876 continue;
14877 Instruction *SrcInst = BundleMember->Inst;
14878 assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non-memory-affecting bundle?");
14880 MemoryLocation SrcLoc = getLocation(I: SrcInst);
14881 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14882 unsigned NumAliased = 0;
14883 unsigned DistToSrc = 1;
14884
14885 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14886 assert(isInSchedulingRegion(DepDest));
14887
14888 // We have two limits to reduce the complexity:
14889 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14890 // SLP->isAliased (which is the expensive part in this loop).
14891 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14892 // the whole loop (even if the loop is fast, it's quadratic).
14893 // It's important for the loop break condition (see below) to
14894 // check this limit even between two read-only instructions.
14895 if (DistToSrc >= MaxMemDepDistance ||
14896 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14897 (NumAliased >= AliasedCheckLimit ||
14898 SLP->isAliased(Loc1: SrcLoc, Inst1: SrcInst, Inst2: DepDest->Inst)))) {
14899
14900 // We increment the counter only if the locations are aliased
14901 // (instead of counting all alias checks). This gives a better
14902 // balance between reduced runtime and accurate dependencies.
14903 NumAliased++;
14904
14905 DepDest->MemoryDependencies.push_back(Elt: BundleMember);
14906 BundleMember->Dependencies++;
14907 ScheduleData *DestBundle = DepDest->FirstInBundle;
14908 if (!DestBundle->IsScheduled) {
14909 BundleMember->incrementUnscheduledDeps(Incr: 1);
14910 }
14911 if (!DestBundle->hasValidDependencies()) {
14912 WorkList.push_back(Elt: DestBundle);
14913 }
14914 }
14915
14916 // Example, explaining the loop break condition: Let's assume our
14917 // starting instruction is i0 and MaxMemDepDistance = 3.
14918 //
14919 // +--------v--v--v
14920 // i0,i1,i2,i3,i4,i5,i6,i7,i8
14921 // +--------^--^--^
14922 //
        // MaxMemDepDistance lets us stop alias-checking at i3 and we add
14924 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14925 // Previously we already added dependencies from i3 to i6,i7,i8
14926 // (because of MaxMemDepDistance). As we added a dependency from
14927 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14928 // and we can abort this loop at i6.
14929 if (DistToSrc >= 2 * MaxMemDepDistance)
14930 break;
14931 DistToSrc++;
14932 }
14933 }
14934 if (InsertInReadyList && SD->isReady()) {
14935 ReadyInsts.insert(X: SD);
14936 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
14937 << "\n");
14938 }
14939 }
14940}
14941
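// Resets the scheduling state (IsScheduled flags and unscheduled-dependency
// counters) of every ScheduleData in the region and clears the ready list so
// the region can be scheduled again from scratch.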
14942void BoUpSLP::BlockScheduling::resetSchedule() {
14943 assert(ScheduleStart &&
14944 "tried to reset schedule on block which has not been scheduled");
14945 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14946 doForAllOpcodes(V: I, Action: [&](ScheduleData *SD) {
14947 assert(isInSchedulingRegion(SD) &&
14948 "ScheduleData not in scheduling region");
14949 SD->IsScheduled = false;
14950 SD->resetUnscheduledDeps();
14951 });
14952 }
14953 ReadyInsts.clear();
14954}
14955
14956void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14957 if (!BS->ScheduleStart)
14958 return;
14959
14960 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14961
14962 // A key point - if we got here, pre-scheduling was able to find a valid
14963 // scheduling of the sub-graph of the scheduling window which consists
14964 // of all vector bundles and their transitive users. As such, we do not
14965 // need to reschedule anything *outside of* that subgraph.
14966
14967 BS->resetSchedule();
14968
14969 // For the real scheduling we use a more sophisticated ready-list: it is
14970 // sorted by the original instruction location. This lets the final schedule
14971 // be as close as possible to the original instruction order.
14972 // WARNING: If changing this order causes a correctness issue, that means
14973 // there is some missing dependence edge in the schedule data graph.
14974 struct ScheduleDataCompare {
14975 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14976 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14977 }
14978 };
14979 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14980
14981 // Ensure that all dependency data is updated (for nodes in the sub-graph)
14982 // and fill the ready-list with initial instructions.
14983 int Idx = 0;
14984 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14985 I = I->getNextNode()) {
14986 BS->doForAllOpcodes(V: I, Action: [this, &Idx, BS](ScheduleData *SD) {
14987 TreeEntry *SDTE = getTreeEntry(V: SD->Inst);
14988 (void)SDTE;
14989 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14990 SD->isPartOfBundle() ==
14991 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14992 "scheduler and vectorizer bundle mismatch");
14993 SD->FirstInBundle->SchedulingPriority = Idx++;
14994
14995 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14996 BS->calculateDependencies(SD, InsertInReadyList: false, SLP: this);
14997 });
14998 }
14999 BS->initialFillReadyList(ReadyList&: ReadyInsts);
15000
15001 Instruction *LastScheduledInst = BS->ScheduleEnd;
15002
15003 // Do the "real" scheduling.
15004 while (!ReadyInsts.empty()) {
15005 ScheduleData *Picked = *ReadyInsts.begin();
15006 ReadyInsts.erase(position: ReadyInsts.begin());
15007
15008 // Move the scheduled instruction(s) to their dedicated places, if not
15009 // there yet.
15010 for (ScheduleData *BundleMember = Picked; BundleMember;
15011 BundleMember = BundleMember->NextInBundle) {
15012 Instruction *PickedInst = BundleMember->Inst;
15013 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
15014 PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
15015 LastScheduledInst = PickedInst;
15016 }
15017
15018 BS->schedule(SD: Picked, ReadyList&: ReadyInsts);
15019 }
15020
15021 // Check that we didn't break any of our invariants.
15022#ifdef EXPENSIVE_CHECKS
15023 BS->verify();
15024#endif
15025
15026#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
15027 // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
15029 BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
15030 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
15031 assert(SD->IsScheduled && "must be scheduled at this point");
15032 }
15033 });
15034 }
15035#endif
15036
15037 // Avoid duplicate scheduling of the block.
15038 BS->ScheduleStart = nullptr;
15039}
15040
15041unsigned BoUpSLP::getVectorElementSize(Value *V) {
15042 // If V is a store, just return the width of the stored value (or value
15043 // truncated just before storing) without traversing the expression tree.
15044 // This is the common case.
15045 if (auto *Store = dyn_cast<StoreInst>(Val: V))
15046 return DL->getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
15047
15048 if (auto *IEI = dyn_cast<InsertElementInst>(Val: V))
15049 return getVectorElementSize(V: IEI->getOperand(i_nocapture: 1));
15050
15051 auto E = InstrElementSize.find(Val: V);
15052 if (E != InstrElementSize.end())
15053 return E->second;
15054
15055 // If V is not a store, we can traverse the expression tree to find loads
15056 // that feed it. The type of the loaded value may indicate a more suitable
15057 // width than V's type. We want to base the vector element size on the width
15058 // of memory operations where possible.
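  // Illustrative example (not taken from a test case): for
  //   %l = load i16, ptr %p
  //   %e = sext i16 %l to i32
  //   %a = add i32 %e, 1
  // querying the element size of %a walks back through the add and the sext
  // to the load and yields 16 rather than 32, which allows a wider VF.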
15059 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
15060 SmallPtrSet<Instruction *, 16> Visited;
15061 if (auto *I = dyn_cast<Instruction>(Val: V)) {
15062 Worklist.emplace_back(Args&: I, Args: I->getParent(), Args: 0);
15063 Visited.insert(Ptr: I);
15064 }
15065
15066 // Traverse the expression tree in bottom-up order looking for loads. If we
15067 // encounter an instruction we don't yet handle, we give up.
15068 auto Width = 0u;
15069 Value *FirstNonBool = nullptr;
15070 while (!Worklist.empty()) {
15071 auto [I, Parent, Level] = Worklist.pop_back_val();
15072
15073 // We should only be looking at scalar instructions here. If the current
15074 // instruction has a vector type, skip.
15075 auto *Ty = I->getType();
15076 if (isa<VectorType>(Val: Ty))
15077 continue;
15078 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
15079 FirstNonBool = I;
15080 if (Level > RecursionMaxDepth)
15081 continue;
15082
    // If the current instruction is a load, extractelement or extractvalue,
    // update Width to reflect the width of the accessed value.
15085 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: I))
15086 Width = std::max<unsigned>(a: Width, b: DL->getTypeSizeInBits(Ty));
15087
15088 // Otherwise, we need to visit the operands of the instruction. We only
15089 // handle the interesting cases from buildTree here. If an operand is an
15090 // instruction we haven't yet visited and from the same basic block as the
15091 // user or the use is a PHI node, we add it to the worklist.
15092 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
15093 BinaryOperator, UnaryOperator>(Val: I)) {
15094 for (Use &U : I->operands()) {
15095 if (auto *J = dyn_cast<Instruction>(Val: U.get()))
15096 if (Visited.insert(Ptr: J).second &&
15097 (isa<PHINode>(Val: I) || J->getParent() == Parent)) {
15098 Worklist.emplace_back(Args&: J, Args: J->getParent(), Args: Level + 1);
15099 continue;
15100 }
15101 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
15102 FirstNonBool = U.get();
15103 }
15104 } else {
15105 break;
15106 }
15107 }
15108
15109 // If we didn't encounter a memory access in the expression tree, or if we
15110 // gave up for some reason, just return the width of V. Otherwise, return the
15111 // maximum width we found.
15112 if (!Width) {
15113 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
15114 V = FirstNonBool;
15115 Width = DL->getTypeSizeInBits(Ty: V->getType());
15116 }
15117
15118 for (Instruction *I : Visited)
15119 InstrElementSize[I] = Width;
15120
15121 return Width;
15122}
15123
15124bool BoUpSLP::collectValuesToDemote(
15125 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
15126 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
15127 unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
15128 bool IsTruncRoot) const {
15129 // We can always demote constants.
15130 if (all_of(Range: E.Scalars, P: IsaPred<Constant>))
15131 return true;
15132
15133 unsigned OrigBitWidth = DL->getTypeSizeInBits(Ty: E.Scalars.front()->getType());
15134 if (OrigBitWidth == BitWidth) {
15135 MaxDepthLevel = 1;
15136 return true;
15137 }
15138
  // If the value is not a vectorized instruction in the expression, is not
  // used by an insertelement instruction, and is not used in multiple vector
  // nodes, it cannot be demoted.
15142 bool IsSignedNode = any_of(Range: E.Scalars, P: [&](Value *R) {
15143 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
15144 });
15145 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
15146 if (MultiNodeScalars.contains(Val: V))
15147 return false;
    // For a last shuffle of sext/zext with many uses we need to check the
    // extra bit for unsigned values; otherwise we may end up with incorrect
    // casts for the reused scalars.
15151 bool IsSignedVal = !isKnownNonNegative(V, SQ: SimplifyQuery(*DL));
15152 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
15153 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
15154 if (MaskedValueIsZero(V, Mask, DL: SimplifyQuery(*DL)))
15155 return true;
15156 }
15157 unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
15158 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
15159 if (IsSignedNode)
15160 ++BitWidth1;
15161 if (auto *I = dyn_cast<Instruction>(Val: V)) {
15162 APInt Mask = DB->getDemandedBits(I);
15163 unsigned BitWidth2 =
15164 std::max<unsigned>(a: 1, b: Mask.getBitWidth() - Mask.countl_zero());
15165 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
15166 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth2 - 1);
15167 if (MaskedValueIsZero(V, Mask, DL: SimplifyQuery(*DL)))
15168 break;
15169 BitWidth2 *= 2;
15170 }
15171 BitWidth1 = std::min(a: BitWidth1, b: BitWidth2);
15172 }
15173 BitWidth = std::max(a: BitWidth, b: BitWidth1);
15174 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
15175 };
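  // Worked example (illustrative): with OrigBitWidth == 32, a scalar known to
  // have 25 sign bits gives BitWidth1 = 32 - 25 = 7 (8 for a signed node). If
  // DemandedBits does not narrow it further, BitWidth is raised to at least 8,
  // and the value counts as potentially truncatable as long as the resulting
  // BitWidth stays at or below 16 (OrigBitWidth >= 2 * BitWidth).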
15176 using namespace std::placeholders;
15177 auto FinalAnalysis = [&]() {
15178 if (!IsProfitableToDemote)
15179 return false;
15180 bool Res = all_of(
15181 Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1, args: std::ref(t&: BitWidth)));
15182 // Demote gathers.
15183 if (Res && E.isGather()) {
15184 // Check possible extractelement instructions bases and final vector
15185 // length.
15186 SmallPtrSet<Value *, 4> UniqueBases;
15187 for (Value *V : E.Scalars) {
15188 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
15189 if (!EE)
15190 continue;
15191 UniqueBases.insert(Ptr: EE->getVectorOperand());
15192 }
15193 const unsigned VF = E.Scalars.size();
15194 Type *OrigScalarTy = E.Scalars.front()->getType();
15195 if (UniqueBases.size() <= 2 ||
15196 TTI->getNumberOfParts(Tp: getWidenedType(ScalarTy: OrigScalarTy, VF)) ==
15197 TTI->getNumberOfParts(Tp: getWidenedType(
15198 ScalarTy: IntegerType::get(C&: OrigScalarTy->getContext(), NumBits: BitWidth), VF)))
15199 ToDemote.push_back(Elt: E.Idx);
15200 }
15201 return Res;
15202 };
15203 if (E.isGather() || !Visited.insert(V: &E).second ||
15204 any_of(Range: E.Scalars, P: [&](Value *V) {
15205 return all_of(Range: V->users(), P: [&](User *U) {
15206 return isa<InsertElementInst>(Val: U) && !getTreeEntry(V: U);
15207 });
15208 }))
15209 return FinalAnalysis();
15210
15211 if (any_of(Range: E.Scalars, P: [&](Value *V) {
15212 return !all_of(Range: V->users(), P: [=](User *U) {
15213 return getTreeEntry(V: U) ||
15214 (E.Idx == 0 && UserIgnoreList &&
15215 UserIgnoreList->contains(V: U)) ||
15216 (!isa<CmpInst>(Val: U) && U->getType()->isSized() &&
15217 !U->getType()->isScalableTy() &&
15218 DL->getTypeSizeInBits(Ty: U->getType()) <= BitWidth);
15219 }) && !IsPotentiallyTruncated(V, BitWidth);
15220 }))
15221 return false;
15222
15223 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
15224 bool &NeedToExit) {
15225 NeedToExit = false;
15226 unsigned InitLevel = MaxDepthLevel;
15227 for (const TreeEntry *Op : Operands) {
15228 unsigned Level = InitLevel;
15229 if (!collectValuesToDemote(E: *Op, IsProfitableToDemoteRoot, BitWidth,
15230 ToDemote, Visited, MaxDepthLevel&: Level, IsProfitableToDemote,
15231 IsTruncRoot)) {
15232 if (!IsProfitableToDemote)
15233 return false;
15234 NeedToExit = true;
15235 if (!FinalAnalysis())
15236 return false;
15237 continue;
15238 }
15239 MaxDepthLevel = std::max(a: MaxDepthLevel, b: Level);
15240 }
15241 return true;
15242 };
15243 auto AttemptCheckBitwidth =
15244 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
15245 // Try all bitwidth < OrigBitWidth.
15246 NeedToExit = false;
15247 unsigned BestFailBitwidth = 0;
15248 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
15249 if (Checker(BitWidth, OrigBitWidth))
15250 return true;
15251 if (BestFailBitwidth == 0 && FinalAnalysis())
15252 BestFailBitwidth = BitWidth;
15253 }
15254 if (BitWidth >= OrigBitWidth) {
15255 if (BestFailBitwidth == 0) {
15256 BitWidth = OrigBitWidth;
15257 return false;
15258 }
15259 MaxDepthLevel = 1;
15260 BitWidth = BestFailBitwidth;
15261 NeedToExit = true;
15262 return true;
15263 }
15264 return false;
15265 };
15266 auto TryProcessInstruction =
15267 [&](unsigned &BitWidth,
15268 ArrayRef<const TreeEntry *> Operands = std::nullopt,
15269 function_ref<bool(unsigned, unsigned)> Checker = {}) {
15270 if (Operands.empty()) {
15271 if (!IsTruncRoot)
15272 MaxDepthLevel = 1;
15273 (void)for_each(Range: E.Scalars, F: std::bind(f&: IsPotentiallyTruncated, args: _1,
15274 args: std::ref(t&: BitWidth)));
15275 } else {
          // Several vectorized uses? Check if we can truncate it; otherwise
          // exit.
15278 if (E.UserTreeIndices.size() > 1 &&
15279 !all_of(Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1,
15280 args: std::ref(t&: BitWidth))))
15281 return false;
15282 bool NeedToExit = false;
15283 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
15284 return false;
15285 if (NeedToExit)
15286 return true;
15287 if (!ProcessOperands(Operands, NeedToExit))
15288 return false;
15289 if (NeedToExit)
15290 return true;
15291 }
15292
15293 ++MaxDepthLevel;
15294 // Record the entry that we can demote.
15295 ToDemote.push_back(Elt: E.Idx);
15296 return IsProfitableToDemote;
15297 };
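  // The switch below dispatches on the node opcode: truncations and extensions
  // are always demotable; binary operators, shifts, udiv/urem, selects, phis
  // and the abs/smin/smax/umin/umax intrinsics can be demoted when their
  // operand entries (and, where needed, known bits) allow it; everything else
  // falls back to the conservative FinalAnalysis.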
15298 switch (E.getOpcode()) {
15299
15300 // We can always demote truncations and extensions. Since truncations can
15301 // seed additional demotion, we save the truncated value.
15302 case Instruction::Trunc:
15303 if (IsProfitableToDemoteRoot)
15304 IsProfitableToDemote = true;
15305 return TryProcessInstruction(BitWidth);
15306 case Instruction::ZExt:
15307 case Instruction::SExt:
15308 IsProfitableToDemote = true;
15309 return TryProcessInstruction(BitWidth);
15310
15311 // We can demote certain binary operations if we can demote both of their
15312 // operands.
15313 case Instruction::Add:
15314 case Instruction::Sub:
15315 case Instruction::Mul:
15316 case Instruction::And:
15317 case Instruction::Or:
15318 case Instruction::Xor: {
15319 return TryProcessInstruction(
15320 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)});
15321 }
15322 case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it is a shift of an
    // in-range amount, we can always perform the SHL in a smaller type.
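    // For example (illustrative, not from a test case): a trunc to i16 of
    // (shl i32 %x, 4) can be rewritten as an i16 shl of the truncated %x by 4,
    // as long as the shift amount is known to be smaller than 16.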
15325 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
15326 return all_of(Range: E.Scalars, P: [&](Value *V) {
15327 auto *I = cast<Instruction>(Val: V);
15328 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
15329 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth);
15330 });
15331 };
15332 return TryProcessInstruction(
15333 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, ShlChecker);
15334 }
15335 case Instruction::LShr: {
15336 // If this is a truncate of a logical shr, we can truncate it to a smaller
15337 // lshr iff we know that the bits we would otherwise be shifting in are
15338 // already zeros.
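    // For example (illustrative, not from a test case): a trunc to i16 of
    // (lshr i32 %x, 8) can be performed as an i16 lshr when bits 16..31 of %x
    // are known to be zero and the shift amount is known to be smaller than
    // 16, which is what the checker below verifies.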
15339 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15340 return all_of(Range: E.Scalars, P: [&](Value *V) {
15341 auto *I = cast<Instruction>(Val: V);
15342 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
15343 APInt ShiftedBits = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
15344 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
15345 MaskedValueIsZero(V: I->getOperand(i: 0), Mask: ShiftedBits,
15346 DL: SimplifyQuery(*DL));
15347 });
15348 };
15349 return TryProcessInstruction(
15350 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
15351 LShrChecker);
15352 }
15353 case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits between the sign bit of the
    // truncated type and the sign bit of the original type are the same.
15357 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15358 return all_of(Range: E.Scalars, P: [&](Value *V) {
15359 auto *I = cast<Instruction>(Val: V);
15360 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
15361 unsigned ShiftedBits = OrigBitWidth - BitWidth;
15362 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
15363 ShiftedBits < ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, Depth: 0, AC,
15364 CxtI: nullptr, DT);
15365 });
15366 };
15367 return TryProcessInstruction(
15368 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
15369 AShrChecker);
15370 }
15371 case Instruction::UDiv:
15372 case Instruction::URem: {
15373 // UDiv and URem can be truncated if all the truncated bits are zero.
15374 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15375 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15376 return all_of(Range: E.Scalars, P: [&](Value *V) {
15377 auto *I = cast<Instruction>(Val: V);
15378 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
15379 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask, DL: SimplifyQuery(*DL)) &&
15380 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, DL: SimplifyQuery(*DL));
15381 });
15382 };
15383 return TryProcessInstruction(
15384 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, Checker);
15385 }
15386
15387 // We can demote selects if we can demote their true and false values.
15388 case Instruction::Select: {
15389 return TryProcessInstruction(
15390 BitWidth, {getOperandEntry(E: &E, Idx: 1), getOperandEntry(E: &E, Idx: 2)});
15391 }
15392
15393 // We can demote phis if we can demote all their incoming operands. Note that
15394 // we don't need to worry about cycles since we ensure single use above.
15395 case Instruction::PHI: {
15396 const unsigned NumOps = E.getNumOperands();
15397 SmallVector<const TreeEntry *> Ops(NumOps);
15398 transform(Range: seq<unsigned>(Begin: 0, End: NumOps), d_first: Ops.begin(),
15399 F: std::bind(f: &BoUpSLP::getOperandEntry, args: this, args: &E, args: _1));
15400
15401 return TryProcessInstruction(BitWidth, Ops);
15402 }
15403
15404 case Instruction::Call: {
15405 auto *IC = dyn_cast<IntrinsicInst>(Val: E.getMainOp());
15406 if (!IC)
15407 break;
15408 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: IC, TLI);
15409 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
15410 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
15411 break;
15412 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(E: &E, Idx: 0));
15413 function_ref<bool(unsigned, unsigned)> CallChecker;
15414 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15415 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15416 return all_of(Range: E.Scalars, P: [&](Value *V) {
15417 auto *I = cast<Instruction>(Val: V);
15418 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
15419 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
15420 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
15421 DL: SimplifyQuery(*DL)) &&
15422 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, DL: SimplifyQuery(*DL));
15423 }
15424 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
15425 "Expected min/max intrinsics only.");
15426 unsigned SignBits = OrigBitWidth - BitWidth;
15427 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
15428 unsigned Op0SignBits = ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, Depth: 0, AC,
15429 CxtI: nullptr, DT);
15430 unsigned Op1SignBits = ComputeNumSignBits(Op: I->getOperand(i: 1), DL: *DL, Depth: 0, AC,
15431 CxtI: nullptr, DT);
15432 return SignBits <= Op0SignBits &&
15433 ((SignBits != Op0SignBits &&
15434 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
15435 MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
15436 DL: SimplifyQuery(*DL))) &&
15437 SignBits <= Op1SignBits &&
15438 ((SignBits != Op1SignBits &&
15439 !isKnownNonNegative(V: I->getOperand(i: 1), SQ: SimplifyQuery(*DL))) ||
15440 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, DL: SimplifyQuery(*DL)));
15441 });
15442 };
15443 if (ID != Intrinsic::abs) {
15444 Operands.push_back(Elt: getOperandEntry(E: &E, Idx: 1));
15445 CallChecker = CompChecker;
15446 }
15447 InstructionCost BestCost =
15448 std::numeric_limits<InstructionCost::CostType>::max();
15449 unsigned BestBitWidth = BitWidth;
15450 unsigned VF = E.Scalars.size();
15451 // Choose the best bitwidth based on cost estimations.
15452 auto Checker = [&](unsigned BitWidth, unsigned) {
15453 unsigned MinBW = PowerOf2Ceil(A: BitWidth);
15454 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(CI: IC, ID, VF, MinBW);
15455 auto VecCallCosts = getVectorCallCosts(
15456 CI: IC, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: IC->getContext(), NumBits: MinBW), VF),
15457 TTI, TLI, ArgTys);
15458 InstructionCost Cost = std::min(a: VecCallCosts.first, b: VecCallCosts.second);
15459 if (Cost < BestCost) {
15460 BestCost = Cost;
15461 BestBitWidth = BitWidth;
15462 }
15463 return false;
15464 };
15465 [[maybe_unused]] bool NeedToExit;
15466 (void)AttemptCheckBitwidth(Checker, NeedToExit);
15467 BitWidth = BestBitWidth;
15468 return TryProcessInstruction(BitWidth, Operands, CallChecker);
15469 }
15470
15471 // Otherwise, conservatively give up.
15472 default:
15473 break;
15474 }
15475 MaxDepthLevel = 1;
15476 return FinalAnalysis();
15477}
15478
15479static RecurKind getRdxKind(Value *V);
15480
15481void BoUpSLP::computeMinimumValueSizes() {
15482 // We only attempt to truncate integer expressions.
15483 bool IsStoreOrInsertElt =
15484 VectorizableTree.front()->getOpcode() == Instruction::Store ||
15485 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
15486 if ((IsStoreOrInsertElt || UserIgnoreList) &&
15487 ExtraBitWidthNodes.size() <= 1 &&
15488 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
15489 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
15490 return;
15491
15492 unsigned NodeIdx = 0;
15493 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
15494 NodeIdx = 1;
15495
15496 // Ensure the roots of the vectorizable tree don't form a cycle.
15497 if (VectorizableTree[NodeIdx]->isGather() ||
15498 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
15499 (NodeIdx != 0 && any_of(Range&: VectorizableTree[NodeIdx]->UserTreeIndices,
15500 P: [NodeIdx](const EdgeInfo &EI) {
15501 return EI.UserTE->Idx >
15502 static_cast<int>(NodeIdx);
15503 })))
15504 return;
15505
  // If the first value node for a store/insertelement is a sext/zext/trunc,
  // skip it and resize to the final type.
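  // Illustrative shape of such a graph: a store of i16 fed by a trunc of an
  // i32 value. The store is node 0, the trunc is node 1 and is recorded in
  // RootDemotes below, and the minimum-bit-width analysis continues from the
  // next node (typically the trunc's operand).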
15508 bool IsTruncRoot = false;
15509 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
15510 SmallVector<unsigned> RootDemotes;
15511 if (NodeIdx != 0 &&
15512 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15513 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15514 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
15515 IsTruncRoot = true;
15516 RootDemotes.push_back(Elt: NodeIdx);
15517 IsProfitableToDemoteRoot = true;
15518 ++NodeIdx;
15519 }
15520
  // If the reduction was already analyzed and found not profitable, exit.
15522 if (AnalyzedMinBWVals.contains(V: VectorizableTree[NodeIdx]->Scalars.front()))
15523 return;
15524
15525 SmallVector<unsigned> ToDemote;
15526 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
15527 bool IsProfitableToDemoteRoot, unsigned Opcode,
15528 unsigned Limit, bool IsTruncRoot,
15529 bool IsSignedCmp) -> unsigned {
15530 ToDemote.clear();
    // If the root is a trunc and the next node is a gather/buildvector, keep
    // the trunc in scalars; it is free in most cases.
15533 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
15534 E.Idx > (IsStoreOrInsertElt ? 2 : 1) &&
15535 all_of(Range: E.Scalars, P: [&](Value *V) {
15536 return V->hasOneUse() || isa<Constant>(Val: V) ||
15537 (!V->hasNUsesOrMore(N: UsesLimit) &&
15538 none_of(Range: V->users(), P: [&](User *U) {
15539 const TreeEntry *TE = getTreeEntry(V: U);
15540 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15541 if (TE == UserTE || !TE)
15542 return false;
15543 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
15544 SelectInst>(Val: U) ||
15545 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
15546 SelectInst>(Val: UserTE->getMainOp()))
15547 return true;
15548 unsigned UserTESz = DL->getTypeSizeInBits(
15549 Ty: UserTE->Scalars.front()->getType());
15550 auto It = MinBWs.find(Val: TE);
15551 if (It != MinBWs.end() && It->second.first > UserTESz)
15552 return true;
15553 return DL->getTypeSizeInBits(Ty: U->getType()) > UserTESz;
15554 }));
15555 })) {
15556 ToDemote.push_back(Elt: E.Idx);
15557 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15558 auto It = MinBWs.find(Val: UserTE);
15559 if (It != MinBWs.end())
15560 return It->second.first;
15561 unsigned MaxBitWidth =
15562 DL->getTypeSizeInBits(Ty: UserTE->Scalars.front()->getType());
15563 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
15564 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15565 MaxBitWidth = 8;
15566 return MaxBitWidth;
15567 }
15568
15569 unsigned VF = E.getVectorFactor();
15570 auto *TreeRootIT = dyn_cast<IntegerType>(Val: E.Scalars.front()->getType());
15571 if (!TreeRootIT || !Opcode)
15572 return 0u;
15573
15574 if (any_of(Range: E.Scalars,
15575 P: [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
15576 return 0u;
15577
15578 unsigned NumParts = TTI->getNumberOfParts(Tp: getWidenedType(ScalarTy: TreeRootIT, VF));
15579
15580 // The maximum bit width required to represent all the values that can be
15581 // demoted without loss of precision. It would be safe to truncate the roots
15582 // of the expression to this width.
15583 unsigned MaxBitWidth = 1u;
15584
    // True if the roots can be zero-extended back to their original type
    // rather than sign-extended: if the leading bits are not demanded, we can
    // safely zero-extend. IsKnownPositive is set only when the comparison is
    // not signed and the sign bit of every root is known to be zero.
15591 bool IsKnownPositive = !IsSignedCmp && all_of(Range: E.Scalars, P: [&](Value *R) {
15592 KnownBits Known = computeKnownBits(V: R, DL: *DL);
15593 return Known.isNonNegative();
15594 });
15595
15596 // We first check if all the bits of the roots are demanded. If they're not,
15597 // we can truncate the roots to this narrower type.
15598 for (Value *Root : E.Scalars) {
15599 unsigned NumSignBits = ComputeNumSignBits(Op: Root, DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
15600 TypeSize NumTypeBits = DL->getTypeSizeInBits(Ty: Root->getType());
15601 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15602 // If we can't prove that the sign bit is zero, we must add one to the
15603 // maximum bit width to account for the unknown sign bit. This preserves
15604 // the existing sign bit so we can safely sign-extend the root back to the
15605 // original type. Otherwise, if we know the sign bit is zero, we will
15606 // zero-extend the root instead.
15607 //
15608 // FIXME: This is somewhat suboptimal, as there will be cases where adding
15609 // one to the maximum bit width will yield a larger-than-necessary
15610 // type. In general, we need to add an extra bit only if we can't
15611 // prove that the upper bit of the original type is equal to the
15612 // upper bit of the proposed smaller type. If these two bits are
15613 // the same (either zero or one) we know that sign-extending from
15614 // the smaller type will result in the same value. Here, since we
15615 // can't yet prove this, we are just making the proposed smaller
15616 // type larger to ensure correctness.
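      // For example (illustrative): an i32 root with 24 known sign bits gives
      // BitWidth1 = 8; if IsKnownPositive is false, the width is bumped to 9
      // so that the sign bit is preserved when sign-extending back.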
15617 if (!IsKnownPositive)
15618 ++BitWidth1;
15619
15620 APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: Root));
15621 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15622 MaxBitWidth =
15623 std::max<unsigned>(a: std::min(a: BitWidth1, b: BitWidth2), b: MaxBitWidth);
15624 }
15625
15626 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15627 MaxBitWidth = 8;
15628
    // If the original type is large but the reduced type does not improve
    // register usage, ignore it.
15631 if (NumParts > 1 &&
15632 NumParts ==
15633 TTI->getNumberOfParts(Tp: getWidenedType(
15634 ScalarTy: IntegerType::get(C&: F->getContext(), NumBits: bit_ceil(Value: MaxBitWidth)), VF)))
15635 return 0u;
15636
15637 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
15638 Opcode == Instruction::SExt ||
15639 Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the entries that can be demoted in ToDemote.
15643 DenseSet<const TreeEntry *> Visited;
15644 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
15645 bool NeedToDemote = IsProfitableToDemote;
15646
15647 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, BitWidth&: MaxBitWidth,
15648 ToDemote, Visited, MaxDepthLevel, IsProfitableToDemote&: NeedToDemote,
15649 IsTruncRoot) ||
15650 (MaxDepthLevel <= Limit &&
15651 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
15652 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
15653 DL->getTypeSizeInBits(Ty: TreeRootIT) /
15654 DL->getTypeSizeInBits(Ty: cast<Instruction>(Val: E.Scalars.front())
15655 ->getOperand(i: 0)
15656 ->getType()) >
15657 2)))))
15658 return 0u;
15659 // Round MaxBitWidth up to the next power-of-two.
15660 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
15661
15662 return MaxBitWidth;
15663 };
15664
15665 // If we can truncate the root, we must collect additional values that might
15666 // be demoted as a result. That is, those seeded by truncations we will
15667 // modify.
15668 // Add reduction ops sizes, if any.
15669 if (UserIgnoreList &&
15670 isa<IntegerType>(Val: VectorizableTree.front()->Scalars.front()->getType())) {
15671 for (Value *V : *UserIgnoreList) {
15672 auto NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
15673 auto NumTypeBits = DL->getTypeSizeInBits(Ty: V->getType());
15674 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15675 if (!isKnownNonNegative(V, SQ: SimplifyQuery(*DL)))
15676 ++BitWidth1;
15677 unsigned BitWidth2 = BitWidth1;
15678 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: ::getRdxKind(V))) {
15679 auto Mask = DB->getDemandedBits(I: cast<Instruction>(Val: V));
15680 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15681 }
15682 ReductionBitWidth =
15683 std::max(a: std::min(a: BitWidth1, b: BitWidth2), b: ReductionBitWidth);
15684 }
15685 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15686 ReductionBitWidth = 8;
15687
15688 ReductionBitWidth = bit_ceil(Value: ReductionBitWidth);
15689 }
15690 bool IsTopRoot = NodeIdx == 0;
15691 while (NodeIdx < VectorizableTree.size() &&
15692 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15693 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15694 RootDemotes.push_back(Elt: NodeIdx);
15695 ++NodeIdx;
15696 IsTruncRoot = true;
15697 }
15698 bool IsSignedCmp = false;
15699 while (NodeIdx < VectorizableTree.size()) {
15700 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15701 unsigned Limit = 2;
15702 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15703 if (IsTopRoot &&
15704 ReductionBitWidth ==
15705 DL->getTypeSizeInBits(
15706 Ty: VectorizableTree.front()->Scalars.front()->getType()))
15707 Limit = 3;
15708 unsigned MaxBitWidth = ComputeMaxBitWidth(
15709 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
15710 Limit, IsTruncRoot, IsSignedCmp);
15711 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15712 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15713 ReductionBitWidth = bit_ceil(Value: MaxBitWidth);
15714 else if (MaxBitWidth == 0)
15715 ReductionBitWidth = 0;
15716 }
15717
15718 for (unsigned Idx : RootDemotes) {
15719 if (all_of(Range&: VectorizableTree[Idx]->Scalars, P: [&](Value *V) {
15720 uint32_t OrigBitWidth = DL->getTypeSizeInBits(Ty: V->getType());
15721 if (OrigBitWidth > MaxBitWidth) {
15722 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: MaxBitWidth);
15723 return MaskedValueIsZero(V, Mask, DL: SimplifyQuery(*DL));
15724 }
15725 return false;
15726 }))
15727 ToDemote.push_back(Elt: Idx);
15728 }
15729 RootDemotes.clear();
15730 IsTopRoot = false;
15731 IsProfitableToDemoteRoot = true;
15732
15733 if (ExtraBitWidthNodes.empty()) {
15734 NodeIdx = VectorizableTree.size();
15735 } else {
15736 unsigned NewIdx = 0;
15737 do {
15738 NewIdx = *ExtraBitWidthNodes.begin();
15739 ExtraBitWidthNodes.erase(I: ExtraBitWidthNodes.begin());
15740 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15741 NodeIdx = NewIdx;
15742 IsTruncRoot =
15743 NodeIdx < VectorizableTree.size() &&
15744 any_of(Range&: VectorizableTree[NodeIdx]->UserTreeIndices,
15745 P: [](const EdgeInfo &EI) {
15746 return EI.EdgeIdx == 0 &&
15747 EI.UserTE->getOpcode() == Instruction::Trunc &&
15748 !EI.UserTE->isAltShuffle();
15749 });
15750 IsSignedCmp =
15751 NodeIdx < VectorizableTree.size() &&
15752 any_of(Range&: VectorizableTree[NodeIdx]->UserTreeIndices,
15753 P: [&](const EdgeInfo &EI) {
15754 return EI.UserTE->getOpcode() == Instruction::ICmp &&
15755 any_of(Range&: EI.UserTE->Scalars, P: [&](Value *V) {
15756 auto *IC = dyn_cast<ICmpInst>(Val: V);
15757 return IC &&
15758 (IC->isSigned() ||
15759 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 0),
15760 SQ: SimplifyQuery(*DL)) ||
15761 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 1),
15762 SQ: SimplifyQuery(*DL)));
15763 });
15764 });
15765 }
15766
    // If the maximum bit width we computed is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
15769 if (MaxBitWidth == 0 ||
15770 MaxBitWidth >=
15771 cast<IntegerType>(Val: TreeRoot.front()->getType())->getBitWidth()) {
15772 if (UserIgnoreList)
15773 AnalyzedMinBWVals.insert(I: TreeRoot.begin(), E: TreeRoot.end());
15774 continue;
15775 }
15776
    // Finally, map the values we can demote to the maximum bit width we
    // computed.
15779 for (unsigned Idx : ToDemote) {
15780 TreeEntry *TE = VectorizableTree[Idx].get();
15781 if (MinBWs.contains(Val: TE))
15782 continue;
15783 bool IsSigned = any_of(Range&: TE->Scalars, P: [&](Value *R) {
15784 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
15785 });
15786 MinBWs.try_emplace(Key: TE, Args&: MaxBitWidth, Args&: IsSigned);
15787 }
15788 }
15789}
15790
PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
15792 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
15793 auto *TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
15794 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(IR&: F);
15795 auto *AA = &AM.getResult<AAManager>(IR&: F);
15796 auto *LI = &AM.getResult<LoopAnalysis>(IR&: F);
15797 auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
15798 auto *AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
15799 auto *DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
15800 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
15801
15802 bool Changed = runImpl(F, SE_: SE, TTI_: TTI, TLI_: TLI, AA_: AA, LI_: LI, DT_: DT, AC_: AC, DB_: DB, ORE_: ORE);
15803 if (!Changed)
15804 return PreservedAnalyses::all();
15805
15806 PreservedAnalyses PA;
15807 PA.preserveSet<CFGAnalyses>();
15808 return PA;
15809}
15810
15811bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
15812 TargetTransformInfo *TTI_,
15813 TargetLibraryInfo *TLI_, AAResults *AA_,
15814 LoopInfo *LI_, DominatorTree *DT_,
15815 AssumptionCache *AC_, DemandedBits *DB_,
15816 OptimizationRemarkEmitter *ORE_) {
15817 if (!RunSLPVectorization)
15818 return false;
15819 SE = SE_;
15820 TTI = TTI_;
15821 TLI = TLI_;
15822 AA = AA_;
15823 LI = LI_;
15824 DT = DT_;
15825 AC = AC_;
15826 DB = DB_;
15827 DL = &F.getDataLayout();
15828
15829 Stores.clear();
15830 GEPs.clear();
15831 bool Changed = false;
15832
15833 // If the target claims to have no vector registers don't attempt
15834 // vectorization.
15835 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true))) {
15836 LLVM_DEBUG(
15837 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15838 return false;
15839 }
15840
15841 // Don't vectorize when the attribute NoImplicitFloat is used.
15842 if (F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
15843 return false;
15844
15845 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15846
15847 // Use the bottom up slp vectorizer to construct chains that start with
15848 // store instructions.
15849 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15850
15851 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15852 // delete instructions.
15853
15854 // Update DFS numbers now so that we can use them for ordering.
15855 DT->updateDFSNumbers();
15856
15857 // Scan the blocks in the function in post order.
15858 for (auto *BB : post_order(G: &F.getEntryBlock())) {
15859 // Start new block - clear the list of reduction roots.
15860 R.clearReductionData();
15861 collectSeedInstructions(BB);
15862
15863 // Vectorize trees that end at stores.
15864 if (!Stores.empty()) {
15865 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15866 << " underlying objects.\n");
15867 Changed |= vectorizeStoreChains(R);
15868 }
15869
15870 // Vectorize trees that end at reductions.
15871 Changed |= vectorizeChainsInBlock(BB, R);
15872
15873 // Vectorize the index computations of getelementptr instructions. This
15874 // is primarily intended to catch gather-like idioms ending at
15875 // non-consecutive loads.
15876 if (!GEPs.empty()) {
15877 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15878 << " underlying objects.\n");
15879 Changed |= vectorizeGEPIndices(BB, R);
15880 }
15881 }
15882
15883 if (Changed) {
15884 R.optimizeGatherSequence();
15885 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15886 }
15887 return Changed;
15888}
15889
15890std::optional<bool>
15891SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15892 unsigned Idx, unsigned MinVF,
15893 unsigned &Size) {
15894 Size = 0;
15895 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15896 << "\n");
15897 const unsigned Sz = R.getVectorElementSize(V: Chain[0]);
15898 unsigned VF = Chain.size();
15899
15900 if (!isPowerOf2_32(Value: Sz) || !isPowerOf2_32(Value: VF) || VF < 2 || VF < MinVF) {
15901 // Check if vectorizing with a non-power-of-2 VF should be considered. At
15902 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15903 // all vector lanes are used.
15904 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15905 return false;
15906 }
15907
15908 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15909 << "\n");
15910
15911 SetVector<Value *> ValOps;
15912 for (Value *V : Chain)
15913 ValOps.insert(X: cast<StoreInst>(Val: V)->getValueOperand());
  // If the operands are not the same/alternate opcodes, or the number of
  // unique values is not a power of two, exit.
15915 InstructionsState S = getSameOpcode(VL: ValOps.getArrayRef(), TLI: *TLI);
15916 if (all_of(Range&: ValOps, P: IsaPred<Instruction>) && ValOps.size() > 1) {
15917 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
15918 bool IsPowerOf2 =
15919 isPowerOf2_32(Value: ValOps.size()) ||
15920 (VectorizeNonPowerOf2 && isPowerOf2_32(Value: ValOps.size() + 1));
15921 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
15922 (!S.MainOp->isSafeToRemove() ||
15923 any_of(Range: ValOps.getArrayRef(),
15924 P: [&](Value *V) {
15925 return !isa<ExtractElementInst>(Val: V) &&
15926 (V->getNumUses() > Chain.size() ||
15927 any_of(Range: V->users(), P: [&](User *U) {
15928 return !Stores.contains(V: U);
15929 }));
15930 }))) ||
15931 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
15932 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
15933 return false;
15934 }
15935 }
15936 if (R.isLoadCombineCandidate(Stores: Chain))
15937 return true;
15938 R.buildTree(Roots: Chain);
  // Check if the tree is tiny and the store itself or its stored value is not
  // vectorized.
15940 if (R.isTreeTinyAndNotFullyVectorizable()) {
15941 if (R.isGathered(V: Chain.front()) ||
15942 R.isNotScheduled(V: cast<StoreInst>(Val: Chain.front())->getValueOperand()))
15943 return std::nullopt;
15944 Size = R.getTreeSize();
15945 return false;
15946 }
15947 R.reorderTopToBottom();
15948 R.reorderBottomToTop();
15949 R.buildExternalUses();
15950
15951 R.computeMinimumValueSizes();
15952 R.transformNodes();
15953
15954 Size = R.getTreeSize();
15955 if (S.getOpcode() == Instruction::Load)
15956 Size = 2; // cut off masked gather small trees
15957 InstructionCost Cost = R.getTreeCost();
15958
15959 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15960 if (Cost < -SLPCostThreshold) {
15961 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15962
15963 using namespace ore;
15964
15965 R.getORE()->emit(OptDiag&: OptimizationRemark(SV_NAME, "StoresVectorized",
15966 cast<StoreInst>(Val: Chain[0]))
15967 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15968 << " and with tree size "
15969 << NV("TreeSize", R.getTreeSize()));
15970
15971 R.vectorizeTree();
15972 return true;
15973 }
15974
15975 return false;
15976}
15977
15978/// Checks if the quadratic mean deviation is less than 90% of the mean size.
15979static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
15980 bool First) {
15981 unsigned Num = 0;
15982 uint64_t Sum = std::accumulate(
15983 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
15984 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15985 unsigned Size = First ? Val.first : Val.second;
15986 if (Size == 1)
15987 return V;
15988 ++Num;
15989 return V + Size;
15990 });
15991 if (Num == 0)
15992 return true;
15993 uint64_t Mean = Sum / Num;
15994 if (Mean == 0)
15995 return true;
15996 uint64_t Dev = std::accumulate(
15997 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
15998 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15999 unsigned P = First ? Val.first : Val.second;
16000 if (P == 1)
16001 return V;
16002 return V + (P - Mean) * (P - Mean);
16003 }) /
16004 Num;
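  // With integer division, the check below is equivalent to
  // Dev * 81 < Mean * Mean, i.e. the mean squared deviation must be smaller
  // than (Mean / 9)^2.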
16005 return Dev * 81 / (Mean * Mean) == 0;
16006}
16007
16008bool SLPVectorizerPass::vectorizeStores(
16009 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
16010 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
16011 &Visited) {
16012 // We may run into multiple chains that merge into a single chain. We mark the
16013 // stores that we vectorized so that we don't visit the same store twice.
16014 BoUpSLP::ValueSet VectorizedStores;
16015 bool Changed = false;
16016
16017 struct StoreDistCompare {
16018 bool operator()(const std::pair<unsigned, int> &Op1,
16019 const std::pair<unsigned, int> &Op2) const {
16020 return Op1.second < Op2.second;
16021 }
16022 };
16023 // A set of pairs (index of store in Stores array ref, Distance of the store
16024 // address relative to base store address in units).
16025 using StoreIndexToDistSet =
16026 std::set<std::pair<unsigned, int>, StoreDistCompare>;
16027 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
16028 int PrevDist = -1;
16029 BoUpSLP::ValueList Operands;
16030 // Collect the chain into a list.
16031 for (auto [Idx, Data] : enumerate(First: Set)) {
16032 if (Operands.empty() || Data.second - PrevDist == 1) {
16033 Operands.push_back(Elt: Stores[Data.first]);
16034 PrevDist = Data.second;
16035 if (Idx != Set.size() - 1)
16036 continue;
16037 }
16038 auto E = make_scope_exit(F: [&, &DataVar = Data]() {
16039 Operands.clear();
16040 Operands.push_back(Elt: Stores[DataVar.first]);
16041 PrevDist = DataVar.second;
16042 });
16043
16044 if (Operands.size() <= 1 ||
16045 !Visited
16046 .insert(V: {Operands.front(),
16047 cast<StoreInst>(Val: Operands.front())->getValueOperand(),
16048 Operands.back(),
16049 cast<StoreInst>(Val: Operands.back())->getValueOperand(),
16050 Operands.size()})
16051 .second)
16052 continue;
16053
16054 unsigned MaxVecRegSize = R.getMaxVecRegSize();
16055 unsigned EltSize = R.getVectorElementSize(V: Operands[0]);
16056 unsigned MaxElts = llvm::bit_floor(Value: MaxVecRegSize / EltSize);
16057
16058 unsigned MaxVF =
16059 std::min(a: R.getMaximumVF(ElemWidth: EltSize, Opcode: Instruction::Store), b: MaxElts);
16060 unsigned MaxRegVF = MaxVF;
16061 auto *Store = cast<StoreInst>(Val: Operands[0]);
16062 Type *StoreTy = Store->getValueOperand()->getType();
16063 Type *ValueTy = StoreTy;
16064 if (auto *Trunc = dyn_cast<TruncInst>(Val: Store->getValueOperand()))
16065 ValueTy = Trunc->getSrcTy();
16066 if (ValueTy == StoreTy &&
16067 R.getVectorElementSize(V: Store->getValueOperand()) <= EltSize)
16068 MaxVF = std::min<unsigned>(a: MaxVF, b: bit_floor(Value: Operands.size()));
16069 unsigned MinVF = std::max<unsigned>(
16070 a: 2, b: PowerOf2Ceil(A: TTI->getStoreMinimumVF(
16071 VF: R.getMinVF(Sz: DL->getTypeStoreSizeInBits(Ty: StoreTy)), ScalarMemTy: StoreTy,
16072 ScalarValTy: ValueTy)));
16073
16074 if (MaxVF < MinVF) {
16075 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
16076 << ") < "
16077 << "MinVF (" << MinVF << ")\n");
16078 continue;
16079 }
16080
16081 unsigned NonPowerOf2VF = 0;
16082 if (VectorizeNonPowerOf2) {
16083 // First try vectorizing with a non-power-of-2 VF. At the moment, only
16084 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
16085 // lanes are used.
16086 unsigned CandVF = Operands.size();
16087 if (isPowerOf2_32(Value: CandVF + 1) && CandVF <= MaxRegVF)
16088 NonPowerOf2VF = CandVF;
16089 }
16090
16091 unsigned Sz = 1 + Log2_32(Value: MaxVF) - Log2_32(Value: MinVF);
16092 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
16093 unsigned Size = MinVF;
16094 for_each(Range: reverse(C&: CandidateVFs), F: [&](unsigned &VF) {
16095 VF = Size > MaxVF ? NonPowerOf2VF : Size;
16096 Size *= 2;
16097 });
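      // For example (illustrative): with MinVF == 2, MaxVF == 16 and no
      // non-power-of-2 candidate this produces {16, 8, 4, 2}; a
      // non-power-of-2 candidate, if any, occupies the first slot and is
      // tried first.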
16098 unsigned End = Operands.size();
16099 unsigned Repeat = 0;
16100 constexpr unsigned MaxAttempts = 4;
16101 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
16102 for_each(Range&: RangeSizes, F: [](std::pair<unsigned, unsigned> &P) {
16103 P.first = P.second = 1;
16104 });
16105 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
16106 auto IsNotVectorized = [](bool First,
16107 const std::pair<unsigned, unsigned> &P) {
16108 return First ? P.first > 0 : P.second > 0;
16109 };
16110 auto IsVectorized = [](bool First,
16111 const std::pair<unsigned, unsigned> &P) {
16112 return First ? P.first == 0 : P.second == 0;
16113 };
16114 auto VFIsProfitable = [](bool First, unsigned Size,
16115 const std::pair<unsigned, unsigned> &P) {
16116 return First ? Size >= P.first : Size >= P.second;
16117 };
16118 auto FirstSizeSame = [](unsigned Size,
16119 const std::pair<unsigned, unsigned> &P) {
16120 return Size == P.first;
16121 };
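      // The loop below repeatedly sweeps the candidate VFs (largest first)
      // over the ranges of stores that have not been vectorized yet.
      // RangeSizes records, per store, the best tree size seen so far; the
      // predicates above use it to skip ranges where another attempt with the
      // current VF is unlikely to pay off. At most MaxAttempts sweeps are
      // made, with a possible final sweep that uses the maximum number of
      // elements.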
16122 while (true) {
16123 ++Repeat;
16124 bool RepeatChanged = false;
16125 bool AnyProfitableGraph = false;
16126 for (unsigned Size : CandidateVFs) {
16127 AnyProfitableGraph = false;
16128 unsigned StartIdx = std::distance(
16129 first: RangeSizes.begin(),
16130 last: find_if(Range&: RangeSizes, P: std::bind(f&: IsNotVectorized, args: Size >= MaxRegVF,
16131 args: std::placeholders::_1)));
16132 while (StartIdx < End) {
16133 unsigned EndIdx =
16134 std::distance(first: RangeSizes.begin(),
16135 last: find_if(Range: RangeSizes.drop_front(N: StartIdx),
16136 P: std::bind(f&: IsVectorized, args: Size >= MaxRegVF,
16137 args: std::placeholders::_1)));
16138 unsigned Sz = EndIdx >= End ? End : EndIdx;
16139 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
16140 if (!checkTreeSizes(Sizes: RangeSizes.slice(N: Cnt, M: Size),
16141 First: Size >= MaxRegVF)) {
16142 ++Cnt;
16143 continue;
16144 }
16145 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(N: Cnt, M: Size);
16146 assert(all_of(Slice,
16147 [&](Value *V) {
16148 return cast<StoreInst>(V)
16149 ->getValueOperand()
16150 ->getType() ==
16151 cast<StoreInst>(Slice.front())
16152 ->getValueOperand()
16153 ->getType();
16154 }) &&
16155 "Expected all operands of same type.");
16156 if (!NonSchedulable.empty()) {
16157 auto [NonSchedSizeMax, NonSchedSizeMin] =
16158 NonSchedulable.lookup(Val: Slice.front());
16159 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
16160 Cnt += NonSchedSizeMax;
16161 continue;
16162 }
16163 }
16164 unsigned TreeSize;
16165 std::optional<bool> Res =
16166 vectorizeStoreChain(Chain: Slice, R, Idx: Cnt, MinVF, Size&: TreeSize);
16167 if (!Res) {
16168 NonSchedulable
16169 .try_emplace(Key: Slice.front(), Args: std::make_pair(x&: Size, y&: Size))
16170 .first->getSecond()
16171 .second = Size;
16172 } else if (*Res) {
16173 // Mark the vectorized stores so that we don't vectorize them
16174 // again.
16175 VectorizedStores.insert(I: Slice.begin(), E: Slice.end());
16178 AnyProfitableGraph = RepeatChanged = Changed = true;
16179 // If we vectorized initial block, no need to try to vectorize
16180 // it again.
16181 for_each(Range: RangeSizes.slice(N: Cnt, M: Size),
16182 F: [](std::pair<unsigned, unsigned> &P) {
16183 P.first = P.second = 0;
16184 });
16185 if (Cnt < StartIdx + MinVF) {
16186 for_each(Range: RangeSizes.slice(N: StartIdx, M: Cnt - StartIdx),
16187 F: [](std::pair<unsigned, unsigned> &P) {
16188 P.first = P.second = 0;
16189 });
16190 StartIdx = Cnt + Size;
16191 }
16192 if (Cnt > Sz - Size - MinVF) {
16193 for_each(Range: RangeSizes.slice(N: Cnt + Size, M: Sz - (Cnt + Size)),
16194 F: [](std::pair<unsigned, unsigned> &P) {
16195 P.first = P.second = 0;
16196 });
16197 if (Sz == End)
16198 End = Cnt;
16199 Sz = Cnt;
16200 }
16201 Cnt += Size;
16202 continue;
16203 }
16204 if (Size > 2 && Res &&
16205 !all_of(Range: RangeSizes.slice(N: Cnt, M: Size),
16206 P: std::bind(f&: VFIsProfitable, args: Size >= MaxRegVF, args&: TreeSize,
16207 args: std::placeholders::_1))) {
16208 Cnt += Size;
16209 continue;
16210 }
              // For very large VFs, check that we are not rebuilding the same
              // trees, just with a larger number of elements.
16213 if (Size > MaxRegVF && TreeSize > 1 &&
16214 all_of(Range: RangeSizes.slice(N: Cnt, M: Size),
16215 P: std::bind(f&: FirstSizeSame, args&: TreeSize,
16216 args: std::placeholders::_1))) {
16217 Cnt += Size;
16218 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
16219 ++Cnt;
16220 continue;
16221 }
16222 if (TreeSize > 1)
16223 for_each(Range: RangeSizes.slice(N: Cnt, M: Size),
16224 F: [&](std::pair<unsigned, unsigned> &P) {
16225 if (Size >= MaxRegVF)
16226 P.second = std::max(a: P.second, b: TreeSize);
16227 else
16228 P.first = std::max(a: P.first, b: TreeSize);
16229 });
16230 ++Cnt;
16231 AnyProfitableGraph = true;
16232 }
16233 if (StartIdx >= End)
16234 break;
16235 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
16236 AnyProfitableGraph = true;
16237 StartIdx = std::distance(
16238 first: RangeSizes.begin(),
16239 last: find_if(Range: RangeSizes.drop_front(N: Sz),
16240 P: std::bind(f&: IsNotVectorized, args: Size >= MaxRegVF,
16241 args: std::placeholders::_1)));
16242 }
16243 if (!AnyProfitableGraph && Size >= MaxRegVF)
16244 break;
16245 }
16246 // All values vectorized - exit.
16247 if (all_of(Range&: RangeSizes, P: [](const std::pair<unsigned, unsigned> &P) {
16248 return P.first == 0 && P.second == 0;
16249 }))
16250 break;
        // Check if we have tried all attempts or if there is no need for any
        // further attempts at all.
16252 if (Repeat >= MaxAttempts ||
16253 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
16254 break;
16255 constexpr unsigned StoresLimit = 64;
16256 const unsigned MaxTotalNum = bit_floor(Value: std::min<unsigned>(
16257 a: Operands.size(),
16258 b: static_cast<unsigned>(
16259 End -
16260 std::distance(
16261 first: RangeSizes.begin(),
16262 last: find_if(Range&: RangeSizes, P: std::bind(f&: IsNotVectorized, args: true,
16263 args: std::placeholders::_1))) +
16264 1)));
16265 unsigned VF = PowerOf2Ceil(A: CandidateVFs.front()) * 2;
16266 if (VF > MaxTotalNum || VF >= StoresLimit)
16267 break;
16268 for_each(Range&: RangeSizes, F: [&](std::pair<unsigned, unsigned> &P) {
16269 if (P.first != 0)
16270 P.first = std::max(a: P.second, b: P.first);
16271 });
        // Last attempt: vectorize the maximum number of elements, if all
        // previous attempts were unsuccessful because of cost issues.
16274 CandidateVFs.clear();
16275 CandidateVFs.push_back(Elt: VF);
16276 }
16277 }
16278 };
16279
  // Stores a pair (first: index of the store in the Stores array ref whose
  // address is taken as the base, second: sorted set of pairs {index, dist},
  // which are the indices of stores in the set and their store location
  // distances relative to the base address).

  // The index of the very first store is kept separately because the set may
  // be reordered after an insertion and the first store may be moved. This
  // container reduces the number of calls to getPointersDiff().
16288 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
  // Inserts the store SI with the given index Idx into the set of stores. If
  // a store with the same distance has already been found, stop the insertion
  // and try to vectorize the stores found so far. If some stores from this
  // sequence were not vectorized, try to vectorize them together with the new
  // store later. This logic is applied only to the stores that come before
  // the previous store with the same distance.
16295 // Example:
16296 // 1. store x, %p
16297 // 2. store y, %p+1
16298 // 3. store z, %p+2
16299 // 4. store a, %p
16300 // 5. store b, %p+3
16301 // - Scan this from the last to first store. The very first bunch of stores is
16302 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
16303 // vector).
16304 // - The next store in the list - #1 - has the same distance from store #5 as
16305 // the store #4.
16306 // - Try to vectorize sequence of stores 4,2,3,5.
16307 // - If all these stores are vectorized - just drop them.
16308 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
16309 // - Start new stores sequence.
16310 // The new bunch of stores is {1, {1, 0}}.
  // - Add the stores from the previous sequence that were not vectorized.
  // Here we consider the stores in reverse order, rather than the order in
  // which they are used in the IR (Stores are reversed already, see
  // vectorizeStoreChains()).
  // Store #3 can be added - it comes after store #4 with the same distance as
  // store #1.
  // Store #5 cannot be added - it comes before store #4.
  // This logic improves compile time: we assume that the stores after a
  // previous store with the same distance most likely have memory
  // dependencies, so there is no need to waste compile time trying to
  // vectorize them.
16320 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
16321 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
16322 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16323 std::optional<int> Diff = getPointersDiff(
16324 ElemTyA: Stores[Set.first]->getValueOperand()->getType(),
16325 PtrA: Stores[Set.first]->getPointerOperand(),
16326 ElemTyB: SI->getValueOperand()->getType(), PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
16327 /*StrictCheck=*/true);
16328 if (!Diff)
16329 continue;
16330 auto It = Set.second.find(x: std::make_pair(x&: Idx, y&: *Diff));
16331 if (It == Set.second.end()) {
16332 Set.second.emplace(args&: Idx, args&: *Diff);
16333 return;
16334 }
16335 // Try to vectorize the first found set to avoid duplicate analysis.
16336 TryToVectorize(Set.second);
16337 StoreIndexToDistSet PrevSet;
16338 PrevSet.swap(x&: Set.second);
16339 Set.first = Idx;
16340 Set.second.emplace(args&: Idx, args: 0);
16341 // Insert stores that followed previous match to try to vectorize them
16342 // with this store.
16343 unsigned StartIdx = It->first + 1;
16344 SmallBitVector UsedStores(Idx - StartIdx);
16345 // Distances to previously found dup store (or this store, since they
16346 // store to the same addresses).
16347 SmallVector<int> Dists(Idx - StartIdx, 0);
16348 for (const std::pair<unsigned, int> &Pair : reverse(C&: PrevSet)) {
        // Do not try to vectorize sequences we have already tried.
16350 if (Pair.first <= It->first ||
16351 VectorizedStores.contains(Ptr: Stores[Pair.first]))
16352 break;
16353 unsigned BI = Pair.first - StartIdx;
16354 UsedStores.set(BI);
16355 Dists[BI] = Pair.second - It->second;
16356 }
16357 for (unsigned I = StartIdx; I < Idx; ++I) {
16358 unsigned BI = I - StartIdx;
16359 if (UsedStores.test(Idx: BI))
16360 Set.second.emplace(args&: I, args&: Dists[BI]);
16361 }
16362 return;
16363 }
16364 auto &Res = SortedStores.emplace_back();
16365 Res.first = Idx;
16366 Res.second.emplace(args&: Idx, args: 0);
16367 };
16368 Type *PrevValTy = nullptr;
16369 for (auto [I, SI] : enumerate(First&: Stores)) {
16370 if (R.isDeleted(I: SI))
16371 continue;
16372 if (!PrevValTy)
16373 PrevValTy = SI->getValueOperand()->getType();
16374 // Check that we do not try to vectorize stores of different types.
16375 if (PrevValTy != SI->getValueOperand()->getType()) {
16376 for (auto &Set : SortedStores)
16377 TryToVectorize(Set.second);
16378 SortedStores.clear();
16379 PrevValTy = SI->getValueOperand()->getType();
16380 }
16381 FillStoresSet(I, SI);
16382 }
16383
16384 // Final vectorization attempt.
16385 for (auto &Set : SortedStores)
16386 TryToVectorize(Set.second);
16387
16388 return Changed;
16389}
16390
16391void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
16392 // Initialize the collections. We will make a single pass over the block.
16393 Stores.clear();
16394 GEPs.clear();
16395
16396 // Visit the store and getelementptr instructions in BB and organize them in
16397 // Stores and GEPs according to the underlying objects of their pointer
16398 // operands.
16399 for (Instruction &I : *BB) {
16400 // Ignore store instructions that are volatile or have a pointer operand
16401 // that doesn't point to a scalar type.
16402 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
16403 if (!SI->isSimple())
16404 continue;
16405 if (!isValidElementType(Ty: SI->getValueOperand()->getType()))
16406 continue;
16407 Stores[getUnderlyingObject(V: SI->getPointerOperand())].push_back(Elt: SI);
16408 }
16409
16410 // Ignore getelementptr instructions that have more than one index, a
16411 // constant index, or a pointer operand that doesn't point to a scalar
16412 // type.
16413 else if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) {
16414 if (GEP->getNumIndices() != 1)
16415 continue;
16416 Value *Idx = GEP->idx_begin()->get();
16417 if (isa<Constant>(Val: Idx))
16418 continue;
16419 if (!isValidElementType(Ty: Idx->getType()))
16420 continue;
16421 if (GEP->getType()->isVectorTy())
16422 continue;
16423 GEPs[GEP->getPointerOperand()].push_back(Elt: GEP);
16424 }
16425 }
16426}
16427
16428bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
16429 bool MaxVFOnly) {
16430 if (VL.size() < 2)
16431 return false;
16432
16433 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
16434 << VL.size() << ".\n");
16435
  // Check that all of the parts are instructions of the same type;
  // an alternate opcode is permitted via InstructionsState.
16438 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
16439 if (!S.getOpcode())
16440 return false;
16441
16442 Instruction *I0 = cast<Instruction>(Val: S.OpValue);
16443 // Make sure invalid types (including vector type) are rejected before
16444 // determining vectorization factor for scalar instructions.
16445 for (Value *V : VL) {
16446 Type *Ty = V->getType();
16447 if (!isa<InsertElementInst>(Val: V) && !isValidElementType(Ty)) {
      // NOTE: the following will print the internal LLVM type name, which may
      // not be meaningful to the user.
16450 R.getORE()->emit(RemarkBuilder: [&]() {
16451 std::string TypeStr;
16452 llvm::raw_string_ostream rso(TypeStr);
16453 Ty->print(O&: rso);
16454 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
16455 << "Cannot SLP vectorize list: type "
16456 << TypeStr + " is unsupported by vectorizer";
16457 });
16458 return false;
16459 }
16460 }
16461
16462 unsigned Sz = R.getVectorElementSize(V: I0);
16463 unsigned MinVF = R.getMinVF(Sz);
16464 unsigned MaxVF = std::max<unsigned>(a: llvm::bit_floor(Value: VL.size()), b: MinVF);
16465 MaxVF = std::min(a: R.getMaximumVF(ElemWidth: Sz, Opcode: S.getOpcode()), b: MaxVF);
16466 if (MaxVF < 2) {
16467 R.getORE()->emit(RemarkBuilder: [&]() {
16468 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
16469 << "Cannot SLP vectorize list: vectorization factor "
16470 << "less than 2 is not supported";
16471 });
16472 return false;
16473 }
16474
16475 bool Changed = false;
16476 bool CandidateFound = false;
16477 InstructionCost MinCost = SLPCostThreshold.getValue();
16478 Type *ScalarTy = VL[0]->getType();
16479 if (auto *IE = dyn_cast<InsertElementInst>(Val: VL[0]))
16480 ScalarTy = IE->getOperand(i_nocapture: 1)->getType();
16481
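  // Walk the list with decreasing power-of-two vectorization factors: for
  // each VF, try to vectorize a VF-wide bundle starting at each position.
  // On success, skip past the vectorized bundle; values that were not
  // vectorized are retried with smaller VFs.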
16482 unsigned NextInst = 0, MaxInst = VL.size();
16483 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    // No actual vectorization should happen if the number of parts is the
    // same as the provided vectorization factor (i.e. the scalar type would
    // be used for the vector code during codegen).
16487 auto *VecTy = getWidenedType(ScalarTy, VF);
16488 if (TTI->getNumberOfParts(Tp: VecTy) == VF)
16489 continue;
16490 for (unsigned I = NextInst; I < MaxInst; ++I) {
16491 unsigned ActualVF = std::min(a: MaxInst - I, b: VF);
16492
16493 if (!isPowerOf2_32(Value: ActualVF))
16494 continue;
16495
16496 if (MaxVFOnly && ActualVF < MaxVF)
16497 break;
16498 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
16499 break;
16500
16501 ArrayRef<Value *> Ops = VL.slice(N: I, M: ActualVF);
16502 // Check that a previous iteration of this loop did not delete the Value.
16503 if (llvm::any_of(Range&: Ops, P: [&R](Value *V) {
16504 auto *I = dyn_cast<Instruction>(Val: V);
16505 return I && R.isDeleted(I);
16506 }))
16507 continue;
16508
16509 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
16510 << "\n");
16511
16512 R.buildTree(Roots: Ops);
16513 if (R.isTreeTinyAndNotFullyVectorizable())
16514 continue;
16515 R.reorderTopToBottom();
16516 R.reorderBottomToTop(
16517 /*IgnoreReorder=*/!isa<InsertElementInst>(Val: Ops.front()) &&
16518 !R.doesRootHaveInTreeUses());
16519 R.buildExternalUses();
16520
16521 R.computeMinimumValueSizes();
16522 R.transformNodes();
16523 InstructionCost Cost = R.getTreeCost();
16524 CandidateFound = true;
16525 MinCost = std::min(a: MinCost, b: Cost);
16526
16527 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16528 << " for VF=" << ActualVF << "\n");
16529 if (Cost < -SLPCostThreshold) {
16530 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
16531 R.getORE()->emit(OptDiag&: OptimizationRemark(SV_NAME, "VectorizedList",
16532 cast<Instruction>(Val: Ops[0]))
16533 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
16534 << " and with tree size "
16535 << ore::NV("TreeSize", R.getTreeSize()));
16536
16537 R.vectorizeTree();
16538 // Move to the next bundle.
16539 I += VF - 1;
16540 NextInst = I + 1;
16541 Changed = true;
16542 }
16543 }
16544 }
16545
16546 if (!Changed && CandidateFound) {
16547 R.getORE()->emit(RemarkBuilder: [&]() {
16548 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
16549 << "List vectorization was possible but not beneficial with cost "
16550 << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Threshold", -SLPCostThreshold);
16552 });
16553 } else if (!Changed) {
16554 R.getORE()->emit(RemarkBuilder: [&]() {
16555 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
16556 << "Cannot SLP vectorize list: vectorization was impossible"
16557 << " with available vectorization factors";
16558 });
16559 }
16560 return Changed;
16561}
16562
16563bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
16564 if (!I)
16565 return false;
16566
16567 if (!isa<BinaryOperator, CmpInst>(Val: I) || isa<VectorType>(Val: I->getType()))
16568 return false;
16569
16570 Value *P = I->getParent();
16571
16572 // Vectorize in current basic block only.
16573 auto *Op0 = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
16574 auto *Op1 = dyn_cast<Instruction>(Val: I->getOperand(i: 1));
16575 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
16576 return false;
16577
16578 // First collect all possible candidates
16579 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
16580 Candidates.emplace_back(Args&: Op0, Args&: Op1);
16581
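  // If an operand is itself a single-use binary operator in the same block,
  // its operands are also plausible roots: e.g. for I = (A0 + A1) + (B0 * B1)
  // the pairs (A, B0), (A, B1), (A0, B) and (A1, B) are considered as well,
  // and the most promising pair is selected below.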
16582 auto *A = dyn_cast<BinaryOperator>(Val: Op0);
16583 auto *B = dyn_cast<BinaryOperator>(Val: Op1);
16584 // Try to skip B.
16585 if (A && B && B->hasOneUse()) {
16586 auto *B0 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 0));
16587 auto *B1 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 1));
16588 if (B0 && B0->getParent() == P)
16589 Candidates.emplace_back(Args&: A, Args&: B0);
16590 if (B1 && B1->getParent() == P)
16591 Candidates.emplace_back(Args&: A, Args&: B1);
16592 }
16593 // Try to skip A.
16594 if (B && A && A->hasOneUse()) {
16595 auto *A0 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 0));
16596 auto *A1 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 1));
16597 if (A0 && A0->getParent() == P)
16598 Candidates.emplace_back(Args&: A0, Args&: B);
16599 if (A1 && A1->getParent() == P)
16600 Candidates.emplace_back(Args&: A1, Args&: B);
16601 }
16602
16603 if (Candidates.size() == 1)
16604 return tryToVectorizeList(VL: {Op0, Op1}, R);
16605
16606 // We have multiple options. Try to pick the single best.
16607 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
16608 if (!BestCandidate)
16609 return false;
16610 return tryToVectorizeList(
16611 VL: {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
16612}
16613
16614namespace {
16615
16616/// Model horizontal reductions.
16617///
16618/// A horizontal reduction is a tree of reduction instructions that has values
16619/// that can be put into a vector as its leaves. For example:
16620///
16621/// mul mul mul mul
16622/// \ / \ /
16623/// + +
16624/// \ /
16625/// +
16626/// This tree has "mul" as its leaf values and "+" as its reduction
16627/// instructions. A reduction can feed into a store or a binary operation
16628/// feeding a phi.
16629/// ...
16630/// \ /
16631/// +
16632/// |
16633/// phi +=
16634///
16635/// Or:
16636/// ...
16637/// \ /
16638/// +
16639/// |
16640/// *p =
16641///
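///
/// For instance (an illustrative sketch with made-up value names, not taken
/// from an actual test case), a linear chain of integer adds over multiplied
/// leaves feeding a store:
///   %m0 = mul i32 %a0, %b0
///   %m1 = mul i32 %a1, %b1
///   %m2 = mul i32 %a2, %b2
///   %m3 = mul i32 %a3, %b3
///   %s0 = add i32 %m0, %m1
///   %s1 = add i32 %s0, %m2
///   %s2 = add i32 %s1, %m3
///   store i32 %s2, ptr %p
/// Here the "mul" instructions are the leaves (reduced values) and the "add"
/// chain forms the reduction operations.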
16642class HorizontalReduction {
16643 using ReductionOpsType = SmallVector<Value *, 16>;
16644 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
16645 ReductionOpsListType ReductionOps;
16646 /// List of possibly reduced values.
16647 SmallVector<SmallVector<Value *>> ReducedVals;
16648 /// Maps reduced value to the corresponding reduction operation.
16649 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
16650 // Use map vector to make stable output.
16651 MapVector<Instruction *, Value *> ExtraArgs;
16652 WeakTrackingVH ReductionRoot;
16653 /// The type of reduction operation.
16654 RecurKind RdxKind;
16655 /// Checks if the optimization of original scalar identity operations on
16656 /// matched horizontal reductions is enabled and allowed.
16657 bool IsSupportedHorRdxIdentityOp = false;
16658
16659 static bool isCmpSelMinMax(Instruction *I) {
16660 return match(V: I, P: m_Select(C: m_Cmp(), L: m_Value(), R: m_Value())) &&
16661 RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: getRdxKind(V: I));
16662 }
16663
16664 // And/or are potentially poison-safe logical patterns like:
16665 // select x, y, false
16666 // select x, true, y
16667 static bool isBoolLogicOp(Instruction *I) {
16668 return isa<SelectInst>(Val: I) &&
16669 (match(V: I, P: m_LogicalAnd()) || match(V: I, P: m_LogicalOr()));
16670 }
16671
16672 /// Checks if instruction is associative and can be vectorized.
16673 static bool isVectorizable(RecurKind Kind, Instruction *I) {
16674 if (Kind == RecurKind::None)
16675 return false;
16676
16677 // Integer ops that map to select instructions or intrinsics are fine.
16678 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
16679 isBoolLogicOp(I))
16680 return true;
16681
16682 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16683 // FP min/max are associative except for NaN and -0.0. We do not
16684 // have to rule out -0.0 here because the intrinsic semantics do not
16685 // specify a fixed result for it.
16686 return I->getFastMathFlags().noNaNs();
16687 }
16688
16689 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16690 return true;
16691
16692 return I->isAssociative();
16693 }
16694
16695 static Value *getRdxOperand(Instruction *I, unsigned Index) {
16696 // Poison-safe 'or' takes the form: select X, true, Y
16697 // To make that work with the normal operand processing, we skip the
16698 // true value operand.
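    // For example, given the poison-safe 'or'
    //   %r = select i1 %a, i1 true, i1 %b
    // Index 0 yields %a and Index 1 yields %b, mirroring the operands of a
    // plain 'or %a, %b'.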
16699 // TODO: Change the code and data structures to handle this without a hack.
16700 if (getRdxKind(V: I) == RecurKind::Or && isa<SelectInst>(Val: I) && Index == 1)
16701 return I->getOperand(i: 2);
16702 return I->getOperand(i: Index);
16703 }
16704
16705 /// Creates reduction operation with the current opcode.
16706 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
16707 Value *RHS, const Twine &Name, bool UseSelect) {
16708 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
16709 switch (Kind) {
16710 case RecurKind::Or:
16711 if (UseSelect &&
16712 LHS->getType() == CmpInst::makeCmpResultType(opnd_type: LHS->getType()))
16713 return Builder.CreateSelect(C: LHS, True: Builder.getTrue(), False: RHS, Name);
16714 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16715 Name);
16716 case RecurKind::And:
16717 if (UseSelect &&
16718 LHS->getType() == CmpInst::makeCmpResultType(opnd_type: LHS->getType()))
16719 return Builder.CreateSelect(C: LHS, True: RHS, False: Builder.getFalse(), Name);
16720 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16721 Name);
16722 case RecurKind::Add:
16723 case RecurKind::Mul:
16724 case RecurKind::Xor:
16725 case RecurKind::FAdd:
16726 case RecurKind::FMul:
16727 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16728 Name);
16729 case RecurKind::FMax:
16730 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::maxnum, LHS, RHS);
16731 case RecurKind::FMin:
16732 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::minnum, LHS, RHS);
16733 case RecurKind::FMaximum:
16734 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::maximum, LHS, RHS);
16735 case RecurKind::FMinimum:
16736 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::minimum, LHS, RHS);
16737 case RecurKind::SMax:
16738 if (UseSelect) {
16739 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
16740 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
16741 }
16742 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::smax, LHS, RHS);
16743 case RecurKind::SMin:
16744 if (UseSelect) {
16745 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
16746 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
16747 }
16748 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::smin, LHS, RHS);
16749 case RecurKind::UMax:
16750 if (UseSelect) {
16751 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
16752 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
16753 }
16754 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::umax, LHS, RHS);
16755 case RecurKind::UMin:
16756 if (UseSelect) {
16757 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
16758 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
16759 }
16760 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::umin, LHS, RHS);
16761 default:
16762 llvm_unreachable("Unknown reduction operation.");
16763 }
16764 }
16765
16766 /// Creates reduction operation with the current opcode with the IR flags
16767 /// from \p ReductionOps, dropping nuw/nsw flags.
16768 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
16769 Value *RHS, const Twine &Name,
16770 const ReductionOpsListType &ReductionOps) {
16771 bool UseSelect = ReductionOps.size() == 2 ||
16772 // Logical or/and.
16773 (ReductionOps.size() == 1 &&
16774 any_of(Range: ReductionOps.front(), P: IsaPred<SelectInst>));
16775 assert((!UseSelect || ReductionOps.size() != 2 ||
16776 isa<SelectInst>(ReductionOps[1][0])) &&
16777 "Expected cmp + select pairs for reduction");
16778 Value *Op = createOp(Builder, Kind: RdxKind, LHS, RHS, Name, UseSelect);
16779 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: RdxKind)) {
16780 if (auto *Sel = dyn_cast<SelectInst>(Val: Op)) {
16781 propagateIRFlags(I: Sel->getCondition(), VL: ReductionOps[0], OpValue: nullptr,
16782 /*IncludeWrapFlags=*/false);
16783 propagateIRFlags(I: Op, VL: ReductionOps[1], OpValue: nullptr,
16784 /*IncludeWrapFlags=*/false);
16785 return Op;
16786 }
16787 }
16788 propagateIRFlags(I: Op, VL: ReductionOps[0], OpValue: nullptr, /*IncludeWrapFlags=*/false);
16789 return Op;
16790 }
16791
16792public:
16793 static RecurKind getRdxKind(Value *V) {
16794 auto *I = dyn_cast<Instruction>(Val: V);
16795 if (!I)
16796 return RecurKind::None;
16797 if (match(V: I, P: m_Add(L: m_Value(), R: m_Value())))
16798 return RecurKind::Add;
16799 if (match(V: I, P: m_Mul(L: m_Value(), R: m_Value())))
16800 return RecurKind::Mul;
16801 if (match(V: I, P: m_And(L: m_Value(), R: m_Value())) ||
16802 match(V: I, P: m_LogicalAnd(L: m_Value(), R: m_Value())))
16803 return RecurKind::And;
16804 if (match(V: I, P: m_Or(L: m_Value(), R: m_Value())) ||
16805 match(V: I, P: m_LogicalOr(L: m_Value(), R: m_Value())))
16806 return RecurKind::Or;
16807 if (match(V: I, P: m_Xor(L: m_Value(), R: m_Value())))
16808 return RecurKind::Xor;
16809 if (match(V: I, P: m_FAdd(L: m_Value(), R: m_Value())))
16810 return RecurKind::FAdd;
16811 if (match(V: I, P: m_FMul(L: m_Value(), R: m_Value())))
16812 return RecurKind::FMul;
16813
16814 if (match(V: I, P: m_Intrinsic<Intrinsic::maxnum>(Op0: m_Value(), Op1: m_Value())))
16815 return RecurKind::FMax;
16816 if (match(V: I, P: m_Intrinsic<Intrinsic::minnum>(Op0: m_Value(), Op1: m_Value())))
16817 return RecurKind::FMin;
16818
16819 if (match(V: I, P: m_Intrinsic<Intrinsic::maximum>(Op0: m_Value(), Op1: m_Value())))
16820 return RecurKind::FMaximum;
16821 if (match(V: I, P: m_Intrinsic<Intrinsic::minimum>(Op0: m_Value(), Op1: m_Value())))
16822 return RecurKind::FMinimum;
16823 // This matches either cmp+select or intrinsics. SLP is expected to handle
16824 // either form.
16825 // TODO: If we are canonicalizing to intrinsics, we can remove several
16826 // special-case paths that deal with selects.
16827 if (match(V: I, P: m_SMax(L: m_Value(), R: m_Value())))
16828 return RecurKind::SMax;
16829 if (match(V: I, P: m_SMin(L: m_Value(), R: m_Value())))
16830 return RecurKind::SMin;
16831 if (match(V: I, P: m_UMax(L: m_Value(), R: m_Value())))
16832 return RecurKind::UMax;
16833 if (match(V: I, P: m_UMin(L: m_Value(), R: m_Value())))
16834 return RecurKind::UMin;
16835
16836 if (auto *Select = dyn_cast<SelectInst>(Val: I)) {
16837 // Try harder: look for min/max pattern based on instructions producing
16838 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
16839 // During the intermediate stages of SLP, it's very common to have
16840 // pattern like this (since optimizeGatherSequence is run only once
16841 // at the end):
16842 // %1 = extractelement <2 x i32> %a, i32 0
16843 // %2 = extractelement <2 x i32> %a, i32 1
16844 // %cond = icmp sgt i32 %1, %2
16845 // %3 = extractelement <2 x i32> %a, i32 0
16846 // %4 = extractelement <2 x i32> %a, i32 1
16847 // %select = select i1 %cond, i32 %3, i32 %4
16848 CmpInst::Predicate Pred;
16849 Instruction *L1;
16850 Instruction *L2;
16851
16852 Value *LHS = Select->getTrueValue();
16853 Value *RHS = Select->getFalseValue();
16854 Value *Cond = Select->getCondition();
16855
16856 // TODO: Support inverse predicates.
16857 if (match(V: Cond, P: m_Cmp(Pred, L: m_Specific(V: LHS), R: m_Instruction(I&: L2)))) {
16858 if (!isa<ExtractElementInst>(Val: RHS) ||
16859 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
16860 return RecurKind::None;
16861 } else if (match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Specific(V: RHS)))) {
16862 if (!isa<ExtractElementInst>(Val: LHS) ||
16863 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)))
16864 return RecurKind::None;
16865 } else {
16866 if (!isa<ExtractElementInst>(Val: LHS) || !isa<ExtractElementInst>(Val: RHS))
16867 return RecurKind::None;
16868 if (!match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Instruction(I&: L2))) ||
16869 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)) ||
16870 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
16871 return RecurKind::None;
16872 }
16873
16874 switch (Pred) {
16875 default:
16876 return RecurKind::None;
16877 case CmpInst::ICMP_SGT:
16878 case CmpInst::ICMP_SGE:
16879 return RecurKind::SMax;
16880 case CmpInst::ICMP_SLT:
16881 case CmpInst::ICMP_SLE:
16882 return RecurKind::SMin;
16883 case CmpInst::ICMP_UGT:
16884 case CmpInst::ICMP_UGE:
16885 return RecurKind::UMax;
16886 case CmpInst::ICMP_ULT:
16887 case CmpInst::ICMP_ULE:
16888 return RecurKind::UMin;
16889 }
16890 }
16891 return RecurKind::None;
16892 }
16893
16894 /// Get the index of the first operand.
16895 static unsigned getFirstOperandIndex(Instruction *I) {
16896 return isCmpSelMinMax(I) ? 1 : 0;
16897 }
16898
16899private:
16900 /// Total number of operands in the reduction operation.
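  /// For a cmp+select min/max such as
  ///   %s = select i1 %c, i32 %x, i32 %y
  /// the select has three operands and the reduction operands start at
  /// index 1 (the condition at index 0 is handled separately); plain binary
  /// reductions have two operands starting at index 0.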
16901 static unsigned getNumberOfOperands(Instruction *I) {
16902 return isCmpSelMinMax(I) ? 3 : 2;
16903 }
16904
16905 /// Checks if the instruction is in basic block \p BB.
16906 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
16907 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16908 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16909 auto *Sel = cast<SelectInst>(Val: I);
16910 auto *Cmp = dyn_cast<Instruction>(Val: Sel->getCondition());
16911 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16912 }
16913 return I->getParent() == BB;
16914 }
16915
  /// Checks that the reduction operation or reduced value \p I has the
  /// expected number of uses.
16917 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16918 if (IsCmpSelMinMax) {
      // The SelectInst must be used twice, while the condition op must have
      // a single use only.
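      // E.g. in a min/max chain
      //   %c = icmp sgt i32 %prev, %x
      //   %cur = select i1 %c, i32 %prev, i32 %x
      // the previous link %prev is used both by the compare and by the
      // select, hence exactly two uses.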
16921 if (auto *Sel = dyn_cast<SelectInst>(Val: I))
16922 return Sel->hasNUses(N: 2) && Sel->getCondition()->hasOneUse();
16923 return I->hasNUses(N: 2);
16924 }
16925
    // An arithmetic reduction operation must have a single use.
16927 return I->hasOneUse();
16928 }
16929
16930 /// Initializes the list of reduction operations.
16931 void initReductionOps(Instruction *I) {
16932 if (isCmpSelMinMax(I))
16933 ReductionOps.assign(NumElts: 2, Elt: ReductionOpsType());
16934 else
16935 ReductionOps.assign(NumElts: 1, Elt: ReductionOpsType());
16936 }
16937
16938 /// Add all reduction operations for the reduction instruction \p I.
16939 void addReductionOps(Instruction *I) {
16940 if (isCmpSelMinMax(I)) {
16941 ReductionOps[0].emplace_back(Args: cast<SelectInst>(Val: I)->getCondition());
16942 ReductionOps[1].emplace_back(Args&: I);
16943 } else {
16944 ReductionOps[0].emplace_back(Args&: I);
16945 }
16946 }
16947
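  /// Returns true if \p Data is worth keeping as its own group of reduced
  /// values: more than one element, a single constant, or a single non-load
  /// instruction whose opcode is valid for alternation.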
16948 static bool isGoodForReduction(ArrayRef<Value *> Data) {
16949 int Sz = Data.size();
16950 auto *I = dyn_cast<Instruction>(Val: Data.front());
16951 return Sz > 1 || isConstant(V: Data.front()) ||
16952 (I && !isa<LoadInst>(Val: I) && isValidForAlternation(Opcode: I->getOpcode()));
16953 }
16954
16955public:
16956 HorizontalReduction() = default;
16957
16958 /// Try to find a reduction tree.
16959 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
16960 ScalarEvolution &SE, const DataLayout &DL,
16961 const TargetLibraryInfo &TLI) {
16962 RdxKind = HorizontalReduction::getRdxKind(V: Root);
16963 if (!isVectorizable(Kind: RdxKind, I: Root))
16964 return false;
16965
16966 // Analyze "regular" integer/FP types for reductions - no target-specific
16967 // types or pointers.
16968 Type *Ty = Root->getType();
16969 if (!isValidElementType(Ty) || Ty->isPointerTy())
16970 return false;
16971
    // Though the ultimate reduction may have multiple uses, its condition
    // must have only a single use.
16974 if (auto *Sel = dyn_cast<SelectInst>(Val: Root))
16975 if (!Sel->getCondition()->hasOneUse())
16976 return false;
16977
16978 ReductionRoot = Root;
16979
16980 // Iterate through all the operands of the possible reduction tree and
16981 // gather all the reduced values, sorting them by their value id.
16982 BasicBlock *BB = Root->getParent();
16983 bool IsCmpSelMinMax = isCmpSelMinMax(I: Root);
16984 SmallVector<Instruction *> Worklist(1, Root);
16985 // Checks if the operands of the \p TreeN instruction are also reduction
16986 // operations or should be treated as reduced values or an extra argument,
16987 // which is not part of the reduction.
16988 auto CheckOperands = [&](Instruction *TreeN,
16989 SmallVectorImpl<Value *> &ExtraArgs,
16990 SmallVectorImpl<Value *> &PossibleReducedVals,
16991 SmallVectorImpl<Instruction *> &ReductionOps) {
16992 for (int I : reverse(C: seq<int>(Begin: getFirstOperandIndex(I: TreeN),
16993 End: getNumberOfOperands(I: TreeN)))) {
16994 Value *EdgeVal = getRdxOperand(I: TreeN, Index: I);
16995 ReducedValsToOps[EdgeVal].push_back(Elt: TreeN);
16996 auto *EdgeInst = dyn_cast<Instruction>(Val: EdgeVal);
16997 // Edge has wrong parent - mark as an extra argument.
16998 if (EdgeInst && !isVectorLikeInstWithConstOps(V: EdgeInst) &&
16999 !hasSameParent(I: EdgeInst, BB)) {
17000 ExtraArgs.push_back(Elt: EdgeVal);
17001 continue;
17002 }
        // If the edge is not an instruction, differs from the main reduction
        // opcode, or has too many uses, treat it as a possible reduced value.
        // Also, do not try to reduce constant values if the operation is not
        // foldable.
17007 if (!EdgeInst || getRdxKind(V: EdgeInst) != RdxKind ||
17008 IsCmpSelMinMax != isCmpSelMinMax(I: EdgeInst) ||
17009 !hasRequiredNumberOfUses(IsCmpSelMinMax, I: EdgeInst) ||
17010 !isVectorizable(Kind: RdxKind, I: EdgeInst) ||
17011 (R.isAnalyzedReductionRoot(I: EdgeInst) &&
17012 all_of(Range: EdgeInst->operands(), P: IsaPred<Constant>))) {
17013 PossibleReducedVals.push_back(Elt: EdgeVal);
17014 continue;
17015 }
17016 ReductionOps.push_back(Elt: EdgeInst);
17017 }
17018 };
    // Try to regroup the reduced values so that reducing them becomes more
    // profitable. Values are grouped by their value ids, instructions by
    // their opcode id and/or alternate opcode id; extra analysis is done for
    // loads (grouping them by the distance between pointers) and for cmp
    // instructions (grouping them by the predicate).
17024 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
17025 PossibleReducedVals;
17026 initReductionOps(I: Root);
17027 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
17028 SmallSet<size_t, 2> LoadKeyUsed;
17029
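    // Subkey generator for loads: a load whose pointer is provably related
    // to an already-seen load from the same underlying object (constant
    // pointer distance, or otherwise compatible pointers) reuses that load's
    // pointer hash as its subkey, so potentially consecutive loads land in
    // the same group of possible reduced values.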
17030 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
17031 Value *Ptr = getUnderlyingObject(V: LI->getPointerOperand());
17032 if (LoadKeyUsed.contains(V: Key)) {
17033 auto LIt = LoadsMap.find(Val: Ptr);
17034 if (LIt != LoadsMap.end()) {
17035 for (LoadInst *RLI : LIt->second) {
17036 if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
17037 ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL, SE,
17038 /*StrictCheck=*/true))
17039 return hash_value(ptr: RLI->getPointerOperand());
17040 }
17041 for (LoadInst *RLI : LIt->second) {
17042 if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
17043 Ptr2: LI->getPointerOperand(), TLI)) {
17044 hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
17045 return SubKey;
17046 }
17047 }
17048 if (LIt->second.size() > 2) {
17049 hash_code SubKey =
17050 hash_value(ptr: LIt->second.back()->getPointerOperand());
17051 return SubKey;
17052 }
17053 }
17054 }
17055 LoadKeyUsed.insert(V: Key);
17056 LoadsMap.try_emplace(Key: Ptr).first->second.push_back(Elt: LI);
17057 return hash_value(ptr: LI->getPointerOperand());
17058 };
17059
17060 while (!Worklist.empty()) {
17061 Instruction *TreeN = Worklist.pop_back_val();
17062 SmallVector<Value *> Args;
17063 SmallVector<Value *> PossibleRedVals;
17064 SmallVector<Instruction *> PossibleReductionOps;
17065 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
      // If the instruction has at most one extra argument, record it as a
      // reduction operation; otherwise (too many extra args) treat the
      // instruction itself as a reduction value, not a reduction operation.
17068 if (Args.size() < 2) {
17069 addReductionOps(I: TreeN);
17070 // Add extra args.
17071 if (!Args.empty()) {
17072 assert(Args.size() == 1 && "Expected only single argument.");
17073 ExtraArgs[TreeN] = Args.front();
17074 }
17075 // Add reduction values. The values are sorted for better vectorization
17076 // results.
17077 for (Value *V : PossibleRedVals) {
17078 size_t Key, Idx;
17079 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
17080 /*AllowAlternate=*/false);
17081 ++PossibleReducedVals[Key][Idx]
17082 .insert(KV: std::make_pair(x&: V, y: 0))
17083 .first->second;
17084 }
17085 Worklist.append(in_start: PossibleReductionOps.rbegin(),
17086 in_end: PossibleReductionOps.rend());
17087 } else {
17088 size_t Key, Idx;
17089 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V: TreeN, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
17090 /*AllowAlternate=*/false);
17091 ++PossibleReducedVals[Key][Idx]
17092 .insert(KV: std::make_pair(x&: TreeN, y: 0))
17093 .first->second;
17094 }
17095 }
17096 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort the values by the total number of value kinds so that the
    // reduction starts from the longest possible sequences of reduced values.
17099 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
17100 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
17101 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
17102 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
17103 It != E; ++It) {
17104 PossibleRedValsVect.emplace_back();
17105 auto RedValsVect = It->second.takeVector();
17106 stable_sort(Range&: RedValsVect, C: llvm::less_second());
17107 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
17108 PossibleRedValsVect.back().append(NumInputs: Data.second, Elt: Data.first);
17109 }
17110 stable_sort(Range&: PossibleRedValsVect, C: [](const auto &P1, const auto &P2) {
17111 return P1.size() > P2.size();
17112 });
17113 int NewIdx = -1;
17114 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
17115 if (NewIdx < 0 ||
17116 (!isGoodForReduction(Data) &&
17117 (!isa<LoadInst>(Val: Data.front()) ||
17118 !isa<LoadInst>(Val: ReducedVals[NewIdx].front()) ||
17119 getUnderlyingObject(
17120 V: cast<LoadInst>(Val: Data.front())->getPointerOperand()) !=
17121 getUnderlyingObject(
17122 V: cast<LoadInst>(Val: ReducedVals[NewIdx].front())
17123 ->getPointerOperand())))) {
17124 NewIdx = ReducedVals.size();
17125 ReducedVals.emplace_back();
17126 }
17127 ReducedVals[NewIdx].append(in_start: Data.rbegin(), in_end: Data.rend());
17128 }
17129 }
    // Sort the groups of reduced values by the number of values with the
    // same/alternate opcode and/or pointer operand.
17132 stable_sort(Range&: ReducedVals, C: [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
17133 return P1.size() > P2.size();
17134 });
17135 return true;
17136 }
17137
17138 /// Attempt to vectorize the tree found by matchAssociativeReduction.
17139 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
17140 const TargetLibraryInfo &TLI) {
17141 constexpr int ReductionLimit = 4;
17142 constexpr unsigned RegMaxNumber = 4;
17143 constexpr unsigned RedValsMaxNumber = 128;
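    // ReductionLimit is the minimum number of reduced values worth
    // vectorizing; RegMaxNumber and RedValsMaxNumber roughly bound how wide
    // the (possibly oversized) reduction vector may get before the backend
    // is relied on to split it.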
17144 // If there are a sufficient number of reduction values, reduce
17145 // to a nearby power-of-2. We can safely generate oversized
17146 // vectors and rely on the backend to split them to legal sizes.
17147 unsigned NumReducedVals =
17148 std::accumulate(first: ReducedVals.begin(), last: ReducedVals.end(), init: 0,
17149 binary_op: [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
17150 if (!isGoodForReduction(Data: Vals))
17151 return Num;
17152 return Num + Vals.size();
17153 });
17154 if (NumReducedVals < ReductionLimit &&
17155 (!AllowHorRdxIdenityOptimization ||
17156 all_of(Range&: ReducedVals, P: [](ArrayRef<Value *> RedV) {
17157 return RedV.size() < 2 || !allConstant(VL: RedV) || !isSplat(VL: RedV);
17158 }))) {
17159 for (ReductionOpsType &RdxOps : ReductionOps)
17160 for (Value *RdxOp : RdxOps)
17161 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
17162 return nullptr;
17163 }
17164
17165 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
17166 TargetFolder(DL));
17167 Builder.SetInsertPoint(cast<Instruction>(Val&: ReductionRoot));
17168
    // Track the reduced values in case they are replaced by extractelement
    // instructions because of the vectorization.
17171 DenseMap<Value *, WeakTrackingVH> TrackedVals(
17172 ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
17173 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
17174 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
17175 ExternallyUsedValues.reserve(NumEntries: ExtraArgs.size() + 1);
17176 // The same extra argument may be used several times, so log each attempt
17177 // to use it.
17178 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
17179 assert(Pair.first && "DebugLoc must be set.");
17180 ExternallyUsedValues[Pair.second].push_back(Elt: Pair.first);
17181 TrackedVals.try_emplace(Key: Pair.second, Args: Pair.second);
17182 }
17183
17184 // The compare instruction of a min/max is the insertion point for new
17185 // instructions and may be replaced with a new compare instruction.
17186 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
17187 assert(isa<SelectInst>(RdxRootInst) &&
17188 "Expected min/max reduction to have select root instruction");
17189 Value *ScalarCond = cast<SelectInst>(Val: RdxRootInst)->getCondition();
17190 assert(isa<Instruction>(ScalarCond) &&
17191 "Expected min/max reduction to have compare condition");
17192 return cast<Instruction>(Val: ScalarCond);
17193 };
17194
17195 // Return new VectorizedTree, based on previous value.
17196 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
17197 if (VectorizedTree) {
17198 // Update the final value in the reduction.
17199 Builder.SetCurrentDebugLocation(
17200 cast<Instruction>(Val: ReductionOps.front().front())->getDebugLoc());
17201 if ((isa<PoisonValue>(Val: VectorizedTree) && !isa<PoisonValue>(Val: Res)) ||
17202 (isGuaranteedNotToBePoison(V: Res) &&
17203 !isGuaranteedNotToBePoison(V: VectorizedTree))) {
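        // Prefer the operand that is known not to be poison as the LHS of
        // the new op: for boolean logic reductions the LHS becomes the
        // select condition of the poison-safe form, where poison would
        // propagate unconditionally.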
17204 auto It = ReducedValsToOps.find(Val: Res);
17205 if (It != ReducedValsToOps.end() &&
17206 any_of(Range&: It->getSecond(),
17207 P: [](Instruction *I) { return isBoolLogicOp(I); }))
17208 std::swap(a&: VectorizedTree, b&: Res);
17209 }
17210
17211 return createOp(Builder, RdxKind, LHS: VectorizedTree, RHS: Res, Name: "op.rdx",
17212 ReductionOps);
17213 }
17214 // Initialize the final value in the reduction.
17215 return Res;
17216 };
17217 bool AnyBoolLogicOp =
17218 any_of(Range&: ReductionOps.back(), P: [](Value *V) {
17219 return isBoolLogicOp(I: cast<Instruction>(Val: V));
17220 });
17221 // The reduction root is used as the insertion point for new instructions,
17222 // so set it as externally used to prevent it from being deleted.
17223 ExternallyUsedValues[ReductionRoot];
17224 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
17225 ReductionOps.front().size());
17226 for (ReductionOpsType &RdxOps : ReductionOps)
17227 for (Value *RdxOp : RdxOps) {
17228 if (!RdxOp)
17229 continue;
17230 IgnoreList.insert(V: RdxOp);
17231 }
17232 // Intersect the fast-math-flags from all reduction operations.
17233 FastMathFlags RdxFMF;
17234 RdxFMF.set();
17235 for (Value *U : IgnoreList)
17236 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: U))
17237 RdxFMF &= FPMO->getFastMathFlags();
17238 bool IsCmpSelMinMax = isCmpSelMinMax(I: cast<Instruction>(Val&: ReductionRoot));
17239
17240 // Need to track reduced vals, they may be changed during vectorization of
17241 // subvectors.
17242 for (ArrayRef<Value *> Candidates : ReducedVals)
17243 for (Value *V : Candidates)
17244 TrackedVals.try_emplace(Key: V, Args&: V);
17245
17246 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // List of the values that were reduced in other trees as part of gather
    // nodes and thus require an extract if they are fully vectorized there.
17249 SmallPtrSet<Value *, 4> RequiredExtract;
17250 Value *VectorizedTree = nullptr;
17251 bool CheckForReusedReductionOps = false;
17252 // Try to vectorize elements based on their type.
17253 SmallVector<InstructionsState> States;
17254 for (ArrayRef<Value *> RV : ReducedVals)
17255 States.push_back(Elt: getSameOpcode(VL: RV, TLI));
17256 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
17257 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
17258 InstructionsState S = States[I];
17259 SmallVector<Value *> Candidates;
17260 Candidates.reserve(N: 2 * OrigReducedVals.size());
17261 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
17262 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
17263 Value *RdxVal = TrackedVals.find(Val: OrigReducedVals[Cnt])->second;
        // Check if the reduction value was overridden by an extractelement
        // instruction because of the vectorization, and exclude it if it is
        // not compatible with the other values.
        // Also check if the instruction was folded to a constant/other value.
17268 auto *Inst = dyn_cast<Instruction>(Val: RdxVal);
17269 if ((Inst && isVectorLikeInstWithConstOps(V: Inst) &&
17270 (!S.getOpcode() || !S.isOpcodeOrAlt(I: Inst))) ||
17271 (S.getOpcode() && !Inst))
17272 continue;
17273 Candidates.push_back(Elt: RdxVal);
17274 TrackedToOrig.try_emplace(Key: RdxVal, Args: OrigReducedVals[Cnt]);
17275 }
17276 bool ShuffledExtracts = false;
17277 // Try to handle shuffled extractelements.
17278 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
17279 I + 1 < E) {
17280 InstructionsState NextS = getSameOpcode(VL: ReducedVals[I + 1], TLI);
17281 if (NextS.getOpcode() == Instruction::ExtractElement &&
17282 !NextS.isAltShuffle()) {
17283 SmallVector<Value *> CommonCandidates(Candidates);
17284 for (Value *RV : ReducedVals[I + 1]) {
17285 Value *RdxVal = TrackedVals.find(Val: RV)->second;
            // Check if the reduction value was overridden by the
            // extractelement instruction because of the vectorization, and
            // exclude it if it is not compatible with the other values.
17289 if (auto *Inst = dyn_cast<Instruction>(Val: RdxVal))
17290 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(I: Inst))
17291 continue;
17292 CommonCandidates.push_back(Elt: RdxVal);
17293 TrackedToOrig.try_emplace(Key: RdxVal, Args&: RV);
17294 }
17295 SmallVector<int> Mask;
17296 if (isFixedVectorShuffle(VL: CommonCandidates, Mask)) {
17297 ++I;
17298 Candidates.swap(RHS&: CommonCandidates);
17299 ShuffledExtracts = true;
17300 }
17301 }
17302 }
17303
17304 // Emit code for constant values.
17305 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
17306 allConstant(VL: Candidates)) {
17307 Value *Res = Candidates.front();
17308 ++VectorizedVals.try_emplace(Key: Candidates.front(), Args: 0).first->getSecond();
17309 for (Value *VC : ArrayRef(Candidates).drop_front()) {
17310 Res = createOp(Builder, RdxKind, LHS: Res, RHS: VC, Name: "const.rdx", ReductionOps);
17311 ++VectorizedVals.try_emplace(Key: VC, Args: 0).first->getSecond();
17312 if (auto *ResI = dyn_cast<Instruction>(Val: Res))
17313 V.analyzedReductionRoot(I: ResI);
17314 }
17315 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17316 continue;
17317 }
17318
17319 unsigned NumReducedVals = Candidates.size();
17320 if (NumReducedVals < ReductionLimit &&
17321 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
17322 !isSplat(VL: Candidates)))
17323 continue;
17324
      // Check if we support the processing of repeated scalar values
      // (optimization of the original scalar identity operations on matched
      // horizontal reductions).
17327 IsSupportedHorRdxIdentityOp =
17328 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
17329 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17330 // Gather same values.
17331 MapVector<Value *, unsigned> SameValuesCounter;
17332 if (IsSupportedHorRdxIdentityOp)
17333 for (Value *V : Candidates)
17334 ++SameValuesCounter.insert(KV: std::make_pair(x&: V, y: 0)).first->second;
      // Used to check if the reduced values are used the same number of
      // times. In this case the compiler may produce better code. E.g. if
      // the reduced values are aabbccdd (8 x values), then the first node of
      // the tree will have a node for 4 x abcd + shuffle <4 x abcd>,
      // <0, 0, 1, 1, 2, 2, 3, 3>, and the final reduction will be performed
      // on <8 x aabbccdd>. Instead, the compiler may build the <4 x abcd>
      // tree immediately and reduce (4 x abcd) * 2.
      // Currently this only handles add/fadd/xor; and/or/min/max do not
      // require this analysis, and other operations may require an extra
      // profitability estimation.
17345 bool SameScaleFactor = false;
17346 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17347 SameValuesCounter.size() != Candidates.size();
17348 if (OptReusedScalars) {
17349 SameScaleFactor =
17350 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17351 RdxKind == RecurKind::Xor) &&
17352 all_of(Range: drop_begin(RangeOrContainer&: SameValuesCounter),
17353 P: [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
17354 return P.second == SameValuesCounter.front().second;
17355 });
17356 Candidates.resize(N: SameValuesCounter.size());
17357 transform(Range&: SameValuesCounter, d_first: Candidates.begin(),
17358 F: [](const auto &P) { return P.first; });
17359 NumReducedVals = Candidates.size();
17360 // Have a reduction of the same element.
17361 if (NumReducedVals == 1) {
17362 Value *OrigV = TrackedToOrig.find(Val: Candidates.front())->second;
17363 unsigned Cnt = SameValuesCounter.lookup(Key: OrigV);
17364 Value *RedVal =
17365 emitScaleForReusedOps(VectorizedValue: Candidates.front(), Builder, Cnt);
17366 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17367 VectorizedVals.try_emplace(Key: OrigV, Args&: Cnt);
17368 continue;
17369 }
17370 }
17371
17372 unsigned MaxVecRegSize = V.getMaxVecRegSize();
17373 unsigned EltSize = V.getVectorElementSize(V: Candidates[0]);
17374 unsigned MaxElts =
17375 RegMaxNumber * llvm::bit_floor(Value: MaxVecRegSize / EltSize);
17376
17377 unsigned ReduxWidth = std::min<unsigned>(
17378 a: llvm::bit_floor(Value: NumReducedVals),
17379 b: std::clamp<unsigned>(val: MaxElts, lo: RedValsMaxNumber,
17380 hi: RegMaxNumber * RedValsMaxNumber));
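      // ReduxWidth is the widest power-of-two slice of the candidates that
      // still fits the (clamped) register budget; on the failure path,
      // AdjustReducedVals halves it once every starting position at the
      // current width has been tried.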
17381 unsigned Start = 0;
17382 unsigned Pos = Start;
17383 // Restarts vectorization attempt with lower vector factor.
17384 unsigned PrevReduxWidth = ReduxWidth;
17385 bool CheckForReusedReductionOpsLocal = false;
17386 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17387 &CheckForReusedReductionOpsLocal,
17388 &PrevReduxWidth, &V,
17389 &IgnoreList](bool IgnoreVL = false) {
17390 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(Vals: IgnoreList);
17391 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, it is
          // worth trying again with a smaller number of reduction ops.
17394 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17395 }
17396 ++Pos;
17397 if (Pos < NumReducedVals - ReduxWidth + 1)
17398 return IsAnyRedOpGathered;
17399 Pos = Start;
17400 ReduxWidth /= 2;
17401 return IsAnyRedOpGathered;
17402 };
17403 bool AnyVectorized = false;
17404 while (Pos < NumReducedVals - ReduxWidth + 1 &&
17405 ReduxWidth >= ReductionLimit) {
17406 // Dependency in tree of the reduction ops - drop this attempt, try
17407 // later.
17408 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17409 Start == 0) {
17410 CheckForReusedReductionOps = true;
17411 break;
17412 }
17413 PrevReduxWidth = ReduxWidth;
17414 ArrayRef<Value *> VL(std::next(x: Candidates.begin(), n: Pos), ReduxWidth);
        // Already being analyzed - skip.
17416 if (V.areAnalyzedReductionVals(VL)) {
17417 (void)AdjustReducedVals(/*IgnoreVL=*/true);
17418 continue;
17419 }
17420 // Early exit if any of the reduction values were deleted during
17421 // previous vectorization attempts.
17422 if (any_of(Range&: VL, P: [&V](Value *RedVal) {
17423 auto *RedValI = dyn_cast<Instruction>(Val: RedVal);
17424 if (!RedValI)
17425 return false;
17426 return V.isDeleted(I: RedValI);
17427 }))
17428 break;
17429 V.buildTree(Roots: VL, UserIgnoreLst: IgnoreList);
17430 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
17431 if (!AdjustReducedVals())
17432 V.analyzedReductionVals(VL);
17433 continue;
17434 }
17435 if (V.isLoadCombineReductionCandidate(RdxKind)) {
17436 if (!AdjustReducedVals())
17437 V.analyzedReductionVals(VL);
17438 continue;
17439 }
17440 V.reorderTopToBottom();
17441 // No need to reorder the root node at all.
17442 V.reorderBottomToTop(/*IgnoreReorder=*/true);
        // Keep the other extracted reduction values if they are used in the
        // vectorization trees.
17445 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
17446 ExternallyUsedValues);
17447 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
17448 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
17449 continue;
17450 for (Value *V : ReducedVals[Cnt])
17451 if (isa<Instruction>(Val: V))
17452 LocalExternallyUsedValues[TrackedVals[V]];
17453 }
17454 if (!IsSupportedHorRdxIdentityOp) {
17455 // Number of uses of the candidates in the vector of values.
17456 assert(SameValuesCounter.empty() &&
17457 "Reused values counter map is not empty");
17458 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17459 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17460 continue;
17461 Value *V = Candidates[Cnt];
17462 Value *OrigV = TrackedToOrig.find(Val: V)->second;
17463 ++SameValuesCounter[OrigV];
17464 }
17465 }
17466 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
17467 // Gather externally used values.
17468 SmallPtrSet<Value *, 4> Visited;
17469 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17470 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17471 continue;
17472 Value *RdxVal = Candidates[Cnt];
17473 if (!Visited.insert(Ptr: RdxVal).second)
17474 continue;
17475 // Check if the scalar was vectorized as part of the vectorization
17476 // tree but not the top node.
17477 if (!VLScalars.contains(Ptr: RdxVal) && V.isVectorized(V: RdxVal)) {
17478 LocalExternallyUsedValues[RdxVal];
17479 continue;
17480 }
17481 Value *OrigV = TrackedToOrig.find(Val: RdxVal)->second;
17482 unsigned NumOps =
17483 VectorizedVals.lookup(Val: RdxVal) + SameValuesCounter[OrigV];
17484 if (NumOps != ReducedValsToOps.find(Val: OrigV)->second.size())
17485 LocalExternallyUsedValues[RdxVal];
17486 }
17487 // Do not need the list of reused scalars in regular mode anymore.
17488 if (!IsSupportedHorRdxIdentityOp)
17489 SameValuesCounter.clear();
17490 for (Value *RdxVal : VL)
17491 if (RequiredExtract.contains(Ptr: RdxVal))
17492 LocalExternallyUsedValues[RdxVal];
17493 // Update LocalExternallyUsedValues for the scalar, replaced by
17494 // extractelement instructions.
17495 DenseMap<Value *, Value *> ReplacementToExternal;
17496 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
17497 ReplacementToExternal.try_emplace(Key: Pair.second, Args: Pair.first);
17498 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
17499 Value *Ext = Pair.first;
17500 auto RIt = ReplacementToExternal.find(Val: Ext);
17501 while (RIt != ReplacementToExternal.end()) {
17502 Ext = RIt->second;
17503 RIt = ReplacementToExternal.find(Val: Ext);
17504 }
17505 auto *It = ExternallyUsedValues.find(Key: Ext);
17506 if (It == ExternallyUsedValues.end())
17507 continue;
17508 LocalExternallyUsedValues[Pair.second].append(RHS: It->second);
17509 }
17510 V.buildExternalUses(ExternallyUsedValues: LocalExternallyUsedValues);
17511
17512 V.computeMinimumValueSizes();
17513 V.transformNodes();
17514
17515 // Estimate cost.
17516 InstructionCost TreeCost = V.getTreeCost(VectorizedVals: VL);
17517 InstructionCost ReductionCost =
17518 getReductionCost(TTI, ReducedVals: VL, IsCmpSelMinMax, ReduxWidth, FMF: RdxFMF);
17519 InstructionCost Cost = TreeCost + ReductionCost;
17520 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
17521 << " for reduction\n");
17522 if (!Cost.isValid())
17523 break;
17524 if (Cost >= -SLPCostThreshold) {
17525 V.getORE()->emit(RemarkBuilder: [&]() {
17526 return OptimizationRemarkMissed(
17527 SV_NAME, "HorSLPNotBeneficial",
17528 ReducedValsToOps.find(Val: VL[0])->second.front())
17529 << "Vectorizing horizontal reduction is possible "
17530 << "but not beneficial with cost " << ore::NV("Cost", Cost)
17531 << " and threshold "
17532 << ore::NV("Threshold", -SLPCostThreshold);
17533 });
17534 if (!AdjustReducedVals())
17535 V.analyzedReductionVals(VL);
17536 continue;
17537 }
17538
17539 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
17540 << Cost << ". (HorRdx)\n");
17541 V.getORE()->emit(RemarkBuilder: [&]() {
17542 return OptimizationRemark(
17543 SV_NAME, "VectorizedHorizontalReduction",
17544 ReducedValsToOps.find(Val: VL[0])->second.front())
17545 << "Vectorized horizontal reduction with cost "
17546 << ore::NV("Cost", Cost) << " and with tree size "
17547 << ore::NV("TreeSize", V.getTreeSize());
17548 });
17549
17550 Builder.setFastMathFlags(RdxFMF);
17551
17552 // Emit a reduction. If the root is a select (min/max idiom), the insert
17553 // point is the compare condition of that select.
17554 Instruction *RdxRootInst = cast<Instruction>(Val&: ReductionRoot);
17555 Instruction *InsertPt = RdxRootInst;
17556 if (IsCmpSelMinMax)
17557 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17558
17559 // Vectorize a tree.
17560 Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues: LocalExternallyUsedValues,
17561 ReplacedExternals, ReductionRoot: InsertPt);
17562
17563 Builder.SetInsertPoint(InsertPt);
17564
17565 // To prevent poison from leaking across what used to be sequential,
17566 // safe, scalar boolean logic operations, the reduction operand must be
17567 // frozen.
17568 if ((isBoolLogicOp(I: RdxRootInst) ||
17569 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17570 !isGuaranteedNotToBePoison(V: VectorizedRoot))
17571 VectorizedRoot = Builder.CreateFreeze(V: VectorizedRoot);
17572
17573 // Emit code to correctly handle reused reduced values, if required.
17574 if (OptReusedScalars && !SameScaleFactor) {
17575 VectorizedRoot = emitReusedOps(VectorizedValue: VectorizedRoot, Builder, R&: V,
17576 SameValuesCounter, TrackedToOrig);
17577 }
17578
17579 Value *ReducedSubTree =
17580 emitReduction(VectorizedValue: VectorizedRoot, Builder, ReduxWidth, TTI);
17581 if (ReducedSubTree->getType() != VL.front()->getType()) {
17582 assert(ReducedSubTree->getType() != VL.front()->getType() &&
17583 "Expected different reduction type.");
17584 ReducedSubTree =
17585 Builder.CreateIntCast(V: ReducedSubTree, DestTy: VL.front()->getType(),
17586 isSigned: V.isSignedMinBitwidthRootNode());
17587 }
17588
17589 // Improved analysis for add/fadd/xor reductions with same scale factor
17590 // for all operands of reductions. We can emit scalar ops for them
17591 // instead.
17592 if (OptReusedScalars && SameScaleFactor)
17593 ReducedSubTree = emitScaleForReusedOps(
17594 VectorizedValue: ReducedSubTree, Builder, Cnt: SameValuesCounter.front().second);
17595
17596 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17597 // Count vectorized reduced values to exclude them from final reduction.
17598 for (Value *RdxVal : VL) {
17599 Value *OrigV = TrackedToOrig.find(Val: RdxVal)->second;
17600 if (IsSupportedHorRdxIdentityOp) {
17601 VectorizedVals.try_emplace(Key: OrigV, Args&: SameValuesCounter[RdxVal]);
17602 continue;
17603 }
17604 ++VectorizedVals.try_emplace(Key: OrigV, Args: 0).first->getSecond();
17605 if (!V.isVectorized(V: RdxVal))
17606 RequiredExtract.insert(Ptr: RdxVal);
17607 }
17608 Pos += ReduxWidth;
17609 Start = Pos;
17610 ReduxWidth = llvm::bit_floor(Value: NumReducedVals - Pos);
17611 AnyVectorized = true;
17612 }
17613 if (OptReusedScalars && !AnyVectorized) {
17614 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
17615 Value *RedVal = emitScaleForReusedOps(VectorizedValue: P.first, Builder, Cnt: P.second);
17616 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17617 Value *OrigV = TrackedToOrig.find(Val: P.first)->second;
17618 VectorizedVals.try_emplace(Key: OrigV, Args: P.second);
17619 }
17620 continue;
17621 }
17622 }
17623 if (VectorizedTree) {
17624 // Reorder operands of bool logical op in the natural order to avoid
17625 // possible problem with poison propagation. If not possible to reorder
17626 // (both operands are originally RHS), emit an extra freeze instruction
17627 // for the LHS operand.
17628 // I.e., if we have original code like this:
17629 // RedOp1 = select i1 ?, i1 LHS, i1 false
17630 // RedOp2 = select i1 RHS, i1 ?, i1 false
17631
17632 // Then, we swap LHS/RHS to create a new op that matches the poison
17633 // semantics of the original code.
17634
17635 // If we have original code like this and both values could be poison:
17636 // RedOp1 = select i1 ?, i1 LHS, i1 false
17637 // RedOp2 = select i1 ?, i1 RHS, i1 false
17638
17639 // Then, we must freeze LHS in the new op.
17640 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
17641 Instruction *RedOp1,
17642 Instruction *RedOp2,
17643 bool InitStep) {
17644 if (!AnyBoolLogicOp)
17645 return;
17646 if (isBoolLogicOp(I: RedOp1) &&
17647 ((!InitStep && LHS == VectorizedTree) ||
17648 getRdxOperand(I: RedOp1, Index: 0) == LHS || isGuaranteedNotToBePoison(V: LHS)))
17649 return;
17650 if (isBoolLogicOp(I: RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
17651 getRdxOperand(I: RedOp2, Index: 0) == RHS ||
17652 isGuaranteedNotToBePoison(V: RHS))) {
17653 std::swap(a&: LHS, b&: RHS);
17654 return;
17655 }
17656 if (LHS != VectorizedTree)
17657 LHS = Builder.CreateFreeze(V: LHS);
17658 };
      // Finish the reduction.
      // The extra arguments and the possible reduction values that were not
      // vectorized still need to be added.
      // Try to avoid dependencies between the scalar remainders after the
      // reductions.
17664 auto FinalGen =
17665 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
17666 bool InitStep) {
17667 unsigned Sz = InstVals.size();
17668 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
17669 Sz % 2);
17670 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
17671 Instruction *RedOp = InstVals[I + 1].first;
17672 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
17673 Value *RdxVal1 = InstVals[I].second;
17674 Value *StableRdxVal1 = RdxVal1;
17675 auto It1 = TrackedVals.find(Val: RdxVal1);
17676 if (It1 != TrackedVals.end())
17677 StableRdxVal1 = It1->second;
17678 Value *RdxVal2 = InstVals[I + 1].second;
17679 Value *StableRdxVal2 = RdxVal2;
17680 auto It2 = TrackedVals.find(Val: RdxVal2);
17681 if (It2 != TrackedVals.end())
17682 StableRdxVal2 = It2->second;
17683 // To prevent poison from leaking across what used to be
17684 // sequential, safe, scalar boolean logic operations, the
17685 // reduction operand must be frozen.
17686 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
17687 RedOp, InitStep);
17688 Value *ExtraRed = createOp(Builder, RdxKind, LHS: StableRdxVal1,
17689 RHS: StableRdxVal2, Name: "op.rdx", ReductionOps);
17690 ExtraReds[I / 2] = std::make_pair(x: InstVals[I].first, y&: ExtraRed);
17691 }
17692 if (Sz % 2 == 1)
17693 ExtraReds[Sz / 2] = InstVals.back();
17694 return ExtraReds;
17695 };
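      // FinalGen halves the worklist on every round, so the leftover scalar
      // reductions form a balanced tree, e.g. ((a op b) op (c op d)) rather
      // than (((a op b) op c) op d), which shortens the dependency chains
      // mentioned above.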
17696 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
17697 ExtraReductions.emplace_back(Args: cast<Instruction>(Val&: ReductionRoot),
17698 Args&: VectorizedTree);
17699 SmallPtrSet<Value *, 8> Visited;
17700 for (ArrayRef<Value *> Candidates : ReducedVals) {
17701 for (Value *RdxVal : Candidates) {
17702 if (!Visited.insert(Ptr: RdxVal).second)
17703 continue;
17704 unsigned NumOps = VectorizedVals.lookup(Val: RdxVal);
17705 for (Instruction *RedOp :
17706 ArrayRef(ReducedValsToOps.find(Val: RdxVal)->second)
17707 .drop_back(N: NumOps))
17708 ExtraReductions.emplace_back(Args&: RedOp, Args&: RdxVal);
17709 }
17710 }
17711 for (auto &Pair : ExternallyUsedValues) {
17712 // Add each externally used value to the final reduction.
17713 for (auto *I : Pair.second)
17714 ExtraReductions.emplace_back(Args&: I, Args&: Pair.first);
17715 }
17716 // Iterate through all not-vectorized reduction values/extra arguments.
17717 bool InitStep = true;
17718 while (ExtraReductions.size() > 1) {
17719 SmallVector<std::pair<Instruction *, Value *>> NewReds =
17720 FinalGen(ExtraReductions, InitStep);
17721 ExtraReductions.swap(RHS&: NewReds);
17722 InitStep = false;
17723 }
17724 VectorizedTree = ExtraReductions.front().second;
17725
17726 ReductionRoot->replaceAllUsesWith(V: VectorizedTree);
17727
      // The original scalar reduction is expected to have no remaining
      // uses outside the reduction tree itself. Assert that we got this
      // correct, replace internal uses with poison, and mark for eventual
      // deletion.
17732#ifndef NDEBUG
17733 SmallSet<Value *, 4> IgnoreSet;
17734 for (ArrayRef<Value *> RdxOps : ReductionOps)
17735 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
17736#endif
17737 for (ArrayRef<Value *> RdxOps : ReductionOps) {
17738 for (Value *Ignore : RdxOps) {
17739 if (!Ignore)
17740 continue;
17741#ifndef NDEBUG
17742 for (auto *U : Ignore->users()) {
17743 assert(IgnoreSet.count(U) &&
                 "All users must be in the reduction ops list.");
17745 }
17746#endif
17747 if (!Ignore->use_empty()) {
17748 Value *P = PoisonValue::get(T: Ignore->getType());
17749 Ignore->replaceAllUsesWith(V: P);
17750 }
17751 }
17752 V.removeInstructionsAndOperands(DeadVals: RdxOps);
17753 }
17754 } else if (!CheckForReusedReductionOps) {
17755 for (ReductionOpsType &RdxOps : ReductionOps)
17756 for (Value *RdxOp : RdxOps)
17757 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
17758 }
17759 return VectorizedTree;
17760 }
17761
17762private:
17763 /// Calculate the cost of a reduction.
17764 InstructionCost getReductionCost(TargetTransformInfo *TTI,
17765 ArrayRef<Value *> ReducedVals,
17766 bool IsCmpSelMinMax, unsigned ReduxWidth,
17767 FastMathFlags FMF) {
17768 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17769 Type *ScalarTy = ReducedVals.front()->getType();
17770 FixedVectorType *VectorTy = getWidenedType(ScalarTy, VF: ReduxWidth);
17771 InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at compile time.
17774 bool AllConsts = allConstant(VL: ReducedVals);
17775 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
17776 InstructionCost Cost = 0;
17777 // Scalar cost is repeated for N-1 elements.
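      // E.g. (illustrative) for a 4-element integer add reduction
      // a + b + c + d, the scalar form needs 3 add instructions.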
17778 int Cnt = ReducedVals.size();
17779 for (Value *RdxVal : ReducedVals) {
17780 if (Cnt == 1)
17781 break;
17782 --Cnt;
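        // A cmp+select min/max reduction uses each reduced value in both the
        // compare and the select, hence the higher use threshold below.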
17783 if (RdxVal->hasNUsesOrMore(N: IsCmpSelMinMax ? 3 : 2)) {
17784 Cost += GenCostFn();
17785 continue;
17786 }
17787 InstructionCost ScalarCost = 0;
17788 for (User *U : RdxVal->users()) {
17789 auto *RdxOp = cast<Instruction>(Val: U);
17790 if (hasRequiredNumberOfUses(IsCmpSelMinMax, I: RdxOp)) {
17791 ScalarCost += TTI->getInstructionCost(U: RdxOp, CostKind);
17792 continue;
17793 }
17794 ScalarCost = InstructionCost::getInvalid();
17795 break;
17796 }
17797 if (ScalarCost.isValid())
17798 Cost += ScalarCost;
17799 else
17800 Cost += GenCostFn();
17801 }
17802 return Cost;
17803 };
17804 switch (RdxKind) {
17805 case RecurKind::Add:
17806 case RecurKind::Mul:
17807 case RecurKind::Or:
17808 case RecurKind::And:
17809 case RecurKind::Xor:
17810 case RecurKind::FAdd:
17811 case RecurKind::FMul: {
17812 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
17813 if (!AllConsts)
17814 VectorCost =
17815 TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VectorTy, FMF, CostKind);
17816 ScalarCost = EvaluateScalarCost([&]() {
17817 return TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: ScalarTy, CostKind);
17818 });
17819 break;
17820 }
17821 case RecurKind::FMax:
17822 case RecurKind::FMin:
17823 case RecurKind::FMaximum:
17824 case RecurKind::FMinimum:
17825 case RecurKind::SMax:
17826 case RecurKind::SMin:
17827 case RecurKind::UMax:
17828 case RecurKind::UMin: {
17829 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
17830 if (!AllConsts)
17831 VectorCost = TTI->getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF, CostKind);
17832 ScalarCost = EvaluateScalarCost([&]() {
17833 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
17834 return TTI->getIntrinsicInstrCost(ICA, CostKind);
17835 });
17836 break;
17837 }
17838 default:
17839 llvm_unreachable("Expected arithmetic or min/max reduction operation");
17840 }
17841
17842 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
17843 << " for reduction of " << shortBundleName(ReducedVals)
17844 << " (It is a splitting reduction)\n");
17845 return VectorCost - ScalarCost;
17846 }
17847
17848 /// Emit a horizontal reduction of the vectorized value.
17849 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
17850 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
17851 assert(VectorizedValue && "Need to have a vectorized tree node");
17852 assert(isPowerOf2_32(ReduxWidth) &&
17853 "We only handle power-of-two reductions for now");
17854 assert(RdxKind != RecurKind::FMulAdd &&
17855 "A call to the llvm.fmuladd intrinsic is not handled yet");
17856
17857 ++NumVectorInstructions;
17858 return createSimpleTargetReduction(B&: Builder, Src: VectorizedValue, RdxKind);
17859 }
17860
17861 /// Emits optimized code for unique scalar value reused \p Cnt times.
17862 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17863 unsigned Cnt) {
17864 assert(IsSupportedHorRdxIdentityOp &&
17865 "The optimization of matched scalar identity horizontal reductions "
17866 "must be supported.");
17867 switch (RdxKind) {
17868 case RecurKind::Add: {
17869 // res = mul vv, n
17870 Value *Scale = ConstantInt::get(Ty: VectorizedValue->getType(), V: Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
                        << VectorizedValue << ". (HorRdx)\n");
17873 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
17874 }
17875 case RecurKind::Xor: {
17876 // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
                        << ". (HorRdx)\n");
17879 if (Cnt % 2 == 0)
17880 return Constant::getNullValue(Ty: VectorizedValue->getType());
17881 return VectorizedValue;
17882 }
17883 case RecurKind::FAdd: {
17884 // res = fmul v, n
17885 Value *Scale = ConstantFP::get(Ty: VectorizedValue->getType(), V: Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
                        << VectorizedValue << ". (HorRdx)\n");
17888 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
17889 }
17890 case RecurKind::And:
17891 case RecurKind::Or:
17892 case RecurKind::SMax:
17893 case RecurKind::SMin:
17894 case RecurKind::UMax:
17895 case RecurKind::UMin:
17896 case RecurKind::FMax:
17897 case RecurKind::FMin:
17898 case RecurKind::FMaximum:
17899 case RecurKind::FMinimum:
17900 // res = vv
17901 return VectorizedValue;
17902 case RecurKind::Mul:
17903 case RecurKind::FMul:
17904 case RecurKind::FMulAdd:
17905 case RecurKind::IAnyOf:
17906 case RecurKind::FAnyOf:
17907 case RecurKind::None:
17908 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
17909 }
17910 return nullptr;
17911 }
17912
17913 /// Emits actual operation for the scalar identity values, found during
17914 /// horizontal reduction analysis.
17915 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17916 BoUpSLP &R,
17917 const MapVector<Value *, unsigned> &SameValuesCounter,
17918 const DenseMap<Value *, Value *> &TrackedToOrig) {
17919 assert(IsSupportedHorRdxIdentityOp &&
17920 "The optimization of matched scalar identity horizontal reductions "
17921 "must be supported.");
17922 ArrayRef<Value *> VL = R.getRootNodeScalars();
17923 auto *VTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
17924 if (VTy->getElementType() != VL.front()->getType()) {
17925 VectorizedValue = Builder.CreateIntCast(
17926 V: VectorizedValue,
17927 DestTy: getWidenedType(ScalarTy: VL.front()->getType(), VF: VTy->getNumElements()),
17928 isSigned: R.isSignedMinBitwidthRootNode());
17929 }
17930 switch (RdxKind) {
17931 case RecurKind::Add: {
17932 // root = mul prev_root, <1, 1, n, 1>
17933 SmallVector<Constant *> Vals;
17934 for (Value *V : VL) {
17935 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.find(Val: V)->second);
17936 Vals.push_back(Elt: ConstantInt::get(Ty: V->getType(), V: Cnt, /*IsSigned=*/false));
17937 }
17938 auto *Scale = ConstantVector::get(V: Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
                        << VectorizedValue << ". (HorRdx)\n");
17941 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
17942 }
17943 case RecurKind::And:
17944 case RecurKind::Or:
17945 // No need for multiple or/and(s).
17946 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
17947 << ". (HorRdx)\n");
17948 return VectorizedValue;
17949 case RecurKind::SMax:
17950 case RecurKind::SMin:
17951 case RecurKind::UMax:
17952 case RecurKind::UMin:
17953 case RecurKind::FMax:
17954 case RecurKind::FMin:
17955 case RecurKind::FMaximum:
17956 case RecurKind::FMinimum:
17957 // No need for multiple min/max(s) of the same value.
17958 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
17959 << ". (HorRdx)\n");
17960 return VectorizedValue;
17961 case RecurKind::Xor: {
17962 // Replace values with even number of repeats with 0, since
17963 // x xor x = 0.
      // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
      // 7>, if the 4th and 6th elements have an even number of repeats.
17966 SmallVector<int> Mask(
17967 cast<FixedVectorType>(Val: VectorizedValue->getType())->getNumElements(),
17968 PoisonMaskElem);
17969 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
17970 bool NeedShuffle = false;
17971 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17972 Value *V = VL[I];
17973 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.find(Val: V)->second);
17974 if (Cnt % 2 == 0) {
17975 Mask[I] = VF;
17976 NeedShuffle = true;
17977 }
17978 }
      LLVM_DEBUG(dbgs() << "SLP: Xor <";
                 for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17983 if (NeedShuffle)
17984 VectorizedValue = Builder.CreateShuffleVector(
17985 V1: VectorizedValue,
17986 V2: ConstantVector::getNullValue(Ty: VectorizedValue->getType()), Mask);
17987 return VectorizedValue;
17988 }
17989 case RecurKind::FAdd: {
17990 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
17991 SmallVector<Constant *> Vals;
17992 for (Value *V : VL) {
17993 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.find(Val: V)->second);
17994 Vals.push_back(Elt: ConstantFP::get(Ty: V->getType(), V: Cnt));
17995 }
17996 auto *Scale = ConstantVector::get(V: Vals);
17997 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
17998 }
17999 case RecurKind::Mul:
18000 case RecurKind::FMul:
18001 case RecurKind::FMulAdd:
18002 case RecurKind::IAnyOf:
18003 case RecurKind::FAnyOf:
18004 case RecurKind::None:
18005 llvm_unreachable("Unexpected reduction kind for reused scalars.");
18006 }
18007 return nullptr;
18008 }
18009};
18010} // end anonymous namespace
18011
18012/// Gets recurrence kind from the specified value.
18013static RecurKind getRdxKind(Value *V) {
18014 return HorizontalReduction::getRdxKind(V);
18015}
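
/// Returns the total number of scalar elements in the homogeneous aggregate
/// (nested structs/arrays/fixed vectors of a single element type) built by
/// \p InsertInst, or std::nullopt if the aggregate is not homogeneous.
/// Illustrative example: {[2 x <2 x float>], [2 x <2 x float>]} yields
/// 2 * 2 * 2 = 8.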
18016static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
18017 if (auto *IE = dyn_cast<InsertElementInst>(Val: InsertInst))
18018 return cast<FixedVectorType>(Val: IE->getType())->getNumElements();
18019
18020 unsigned AggregateSize = 1;
18021 auto *IV = cast<InsertValueInst>(Val: InsertInst);
18022 Type *CurrentType = IV->getType();
18023 do {
18024 if (auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
18025 for (auto *Elt : ST->elements())
18026 if (Elt != ST->getElementType(N: 0)) // check homogeneity
18027 return std::nullopt;
18028 AggregateSize *= ST->getNumElements();
18029 CurrentType = ST->getElementType(N: 0);
18030 } else if (auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
18031 AggregateSize *= AT->getNumElements();
18032 CurrentType = AT->getElementType();
18033 } else if (auto *VT = dyn_cast<FixedVectorType>(Val: CurrentType)) {
18034 AggregateSize *= VT->getNumElements();
18035 return AggregateSize;
18036 } else if (CurrentType->isSingleValueType()) {
18037 return AggregateSize;
18038 } else {
18039 return std::nullopt;
18040 }
18041 } while (true);
18042}
18043
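/// Walks the chain of insertelement/insertvalue instructions starting at
/// \p LastInsertInst towards its aggregate operand, recording each inserted
/// scalar operand and the corresponding insert instruction at the aggregate
/// index it targets (offset by \p OperandOffset), recursing into nested
/// inserts. The walk stops at the first operand that is not a single-use
/// insertelement/insertvalue instruction.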
18044static void findBuildAggregate_rec(Instruction *LastInsertInst,
18045 TargetTransformInfo *TTI,
18046 SmallVectorImpl<Value *> &BuildVectorOpds,
18047 SmallVectorImpl<Value *> &InsertElts,
18048 unsigned OperandOffset) {
18049 do {
18050 Value *InsertedOperand = LastInsertInst->getOperand(i: 1);
18051 std::optional<unsigned> OperandIndex =
18052 getElementIndex(Inst: LastInsertInst, Offset: OperandOffset);
18053 if (!OperandIndex)
18054 return;
18055 if (isa<InsertElementInst, InsertValueInst>(Val: InsertedOperand)) {
18056 findBuildAggregate_rec(LastInsertInst: cast<Instruction>(Val: InsertedOperand), TTI,
18057 BuildVectorOpds, InsertElts, OperandOffset: *OperandIndex);
18058
18059 } else {
18060 BuildVectorOpds[*OperandIndex] = InsertedOperand;
18061 InsertElts[*OperandIndex] = LastInsertInst;
18062 }
18063 LastInsertInst = dyn_cast<Instruction>(Val: LastInsertInst->getOperand(i: 0));
18064 } while (LastInsertInst != nullptr &&
18065 isa<InsertValueInst, InsertElementInst>(Val: LastInsertInst) &&
18066 LastInsertInst->hasOneUse());
18067}
18068
18069/// Recognize construction of vectors like
18070/// %ra = insertelement <4 x float> poison, float %s0, i32 0
18071/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
18072/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
18073/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
18074/// starting from the last insertelement or insertvalue instruction.
18075///
18076/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
18077/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
18078/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
18079///
18080/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
18081///
18082/// \return true if it matches.
18083static bool findBuildAggregate(Instruction *LastInsertInst,
18084 TargetTransformInfo *TTI,
18085 SmallVectorImpl<Value *> &BuildVectorOpds,
18086 SmallVectorImpl<Value *> &InsertElts) {
18087
18088 assert((isa<InsertElementInst>(LastInsertInst) ||
18089 isa<InsertValueInst>(LastInsertInst)) &&
18090 "Expected insertelement or insertvalue instruction!");
18091
18092 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
18093 "Expected empty result vectors!");
18094
18095 std::optional<unsigned> AggregateSize = getAggregateSize(InsertInst: LastInsertInst);
18096 if (!AggregateSize)
18097 return false;
18098 BuildVectorOpds.resize(N: *AggregateSize);
18099 InsertElts.resize(N: *AggregateSize);
18100
18101 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, OperandOffset: 0);
18102 llvm::erase(C&: BuildVectorOpds, V: nullptr);
18103 llvm::erase(C&: InsertElts, V: nullptr);
18104 if (BuildVectorOpds.size() >= 2)
18105 return true;
18106
18107 return false;
18108}
18109
18110/// Try and get a reduction instruction from a phi node.
18111///
18112/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
18113/// if they come from either \p ParentBB or a containing loop latch.
18114///
18115/// \returns A candidate reduction value if possible, or \code nullptr \endcode
18116/// if not possible.
18117static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
18118 BasicBlock *ParentBB, LoopInfo *LI) {
18119 // There are situations where the reduction value is not dominated by the
18120 // reduction phi. Vectorizing such cases has been reported to cause
18121 // miscompiles. See PR25787.
18122 auto DominatedReduxValue = [&](Value *R) {
18123 return isa<Instruction>(Val: R) &&
18124 DT->dominates(A: P->getParent(), B: cast<Instruction>(Val: R)->getParent());
18125 };
18126
18127 Instruction *Rdx = nullptr;
18128
18129 // Return the incoming value if it comes from the same BB as the phi node.
18130 if (P->getIncomingBlock(i: 0) == ParentBB) {
18131 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
18132 } else if (P->getIncomingBlock(i: 1) == ParentBB) {
18133 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
18134 }
18135
18136 if (Rdx && DominatedReduxValue(Rdx))
18137 return Rdx;
18138
18139 // Otherwise, check whether we have a loop latch to look at.
18140 Loop *BBL = LI->getLoopFor(BB: ParentBB);
18141 if (!BBL)
18142 return nullptr;
18143 BasicBlock *BBLatch = BBL->getLoopLatch();
18144 if (!BBLatch)
18145 return nullptr;
18146
18147 // There is a loop latch, return the incoming value if it comes from
18148 // that. This reduction pattern occasionally turns up.
18149 if (P->getIncomingBlock(i: 0) == BBLatch) {
18150 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
18151 } else if (P->getIncomingBlock(i: 1) == BBLatch) {
18152 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
18153 }
18154
18155 if (Rdx && DominatedReduxValue(Rdx))
18156 return Rdx;
18157
18158 return nullptr;
18159}
18160
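/// Matches a candidate reduction operation: either a plain binary operator or
/// one of the two-operand min/max intrinsics (maxnum, minnum, maximum,
/// minimum, smax, smin, umax, umin). On success the two operands are captured
/// in \p V0 and \p V1.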
18161static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
18162 if (match(V: I, P: m_BinOp(L: m_Value(V&: V0), R: m_Value(V&: V1))))
18163 return true;
18164 if (match(V: I, P: m_Intrinsic<Intrinsic::maxnum>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18165 return true;
18166 if (match(V: I, P: m_Intrinsic<Intrinsic::minnum>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18167 return true;
18168 if (match(V: I, P: m_Intrinsic<Intrinsic::maximum>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18169 return true;
18170 if (match(V: I, P: m_Intrinsic<Intrinsic::minimum>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18171 return true;
18172 if (match(V: I, P: m_Intrinsic<Intrinsic::smax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18173 return true;
18174 if (match(V: I, P: m_Intrinsic<Intrinsic::smin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18175 return true;
18176 if (match(V: I, P: m_Intrinsic<Intrinsic::umax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18177 return true;
18178 if (match(V: I, P: m_Intrinsic<Intrinsic::umin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18179 return true;
18180 return false;
18181}
18182
18183/// We could have an initial reduction that is not an add.
18184/// r *= v1 + v2 + v3 + v4
18185/// In such a case start looking for a tree rooted in the first '+'.
18186/// \Returns the new root if found, which may be nullptr if not an instruction.
18187static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
18188 Instruction *Root) {
18189 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
18190 isa<IntrinsicInst>(Root)) &&
18191 "Expected binop, select, or intrinsic for reduction matching");
18192 Value *LHS =
18193 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root));
18194 Value *RHS =
18195 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root) + 1);
18196 if (LHS == Phi)
18197 return dyn_cast<Instruction>(Val: RHS);
18198 if (RHS == Phi)
18199 return dyn_cast<Instruction>(Val: LHS);
18200 return nullptr;
18201}
18202
/// \returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction, returns nullptr.
18205static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
18206 Value *Op0 = nullptr;
18207 Value *Op1 = nullptr;
18208 if (!matchRdxBop(I, V0&: Op0, V1&: Op1))
18209 return nullptr;
18210 return dyn_cast<Instruction>(Val: Op0 == Phi ? Op1 : Op0);
18211}
18212
/// \Returns true if \p I is a candidate instruction for reduction
/// vectorization.
18214static bool isReductionCandidate(Instruction *I) {
18215 bool IsSelect = match(V: I, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value()));
18216 Value *B0 = nullptr, *B1 = nullptr;
18217 bool IsBinop = matchRdxBop(I, V0&: B0, V1&: B1);
18218 return IsBinop || IsSelect;
18219}
18220
18221bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    TargetTransformInfo *TTI,
18223 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
18224 if (!ShouldVectorizeHor)
18225 return false;
18226 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Val: Root);
18227
18228 if (Root->getParent() != BB || isa<PHINode>(Val: Root))
18229 return false;
18230
18231 // If we can find a secondary reduction root, use that instead.
18232 auto SelectRoot = [&]() {
18233 if (TryOperandsAsNewSeeds && isReductionCandidate(I: Root) &&
18234 HorizontalReduction::getRdxKind(V: Root) != RecurKind::None)
18235 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(Phi: P, Root))
18236 return NewRoot;
18237 return Root;
18238 };
18239
  // Start analysis from the Root instruction. If a horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction, or
  // vectorization is not possible or not effective, and the currently analyzed
  // instruction is a binary operation, try to vectorize the operands using
  // pre-order DFS traversal order. If the operands were not vectorized, repeat
  // the same procedure considering each operand as a possible root of the
  // horizontal reduction.
  // Interrupt the process if the Root instruction itself was vectorized or all
  // sub-trees no higher than RecursionMaxDepth were analyzed/vectorized.
  // If a horizontal reduction was not matched or vectorized, we collect
  // instructions for possible later attempts at vectorization.
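  // Illustrative IR sketch of a horizontal reduction matched here:
  //   %add1 = add i32 %a, %b
  //   %add2 = add i32 %add1, %c
  //   %add3 = add i32 %add2, %d   ; root
  // The values %a, %b, %c and %d become the reduced values of the tree.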
18251 std::queue<std::pair<Instruction *, unsigned>> Stack;
18252 Stack.emplace(args: SelectRoot(), args: 0);
18253 SmallPtrSet<Value *, 8> VisitedInstrs;
18254 bool Res = false;
18255 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
18256 if (R.isAnalyzedReductionRoot(I: Inst))
18257 return nullptr;
18258 if (!isReductionCandidate(I: Inst))
18259 return nullptr;
18260 HorizontalReduction HorRdx;
18261 if (!HorRdx.matchAssociativeReduction(R, Root: Inst, SE&: *SE, DL: *DL, TLI: *TLI))
18262 return nullptr;
18263 return HorRdx.tryToReduce(V&: R, DL: *DL, TTI, TLI: *TLI);
18264 };
18265 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
18266 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
18267 FutureSeed = getNonPhiOperand(I: Root, Phi: P);
18268 if (!FutureSeed)
18269 return false;
18270 }
18271 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
18272 // analysis is done separately.
18273 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Val: FutureSeed))
18274 PostponedInsts.push_back(Elt: FutureSeed);
18275 return true;
18276 };
18277
18278 while (!Stack.empty()) {
18279 Instruction *Inst;
18280 unsigned Level;
18281 std::tie(args&: Inst, args&: Level) = Stack.front();
18282 Stack.pop();
18283 // Do not try to analyze instruction that has already been vectorized.
18284 // This may happen when we vectorize instruction operands on a previous
18285 // iteration while stack was populated before that happened.
18286 if (R.isDeleted(I: Inst))
18287 continue;
18288 if (Value *VectorizedV = TryToReduce(Inst)) {
18289 Res = true;
18290 if (auto *I = dyn_cast<Instruction>(Val: VectorizedV)) {
18291 // Try to find another reduction.
18292 Stack.emplace(args&: I, args&: Level);
18293 continue;
18294 }
18295 if (R.isDeleted(I: Inst))
18296 continue;
18297 } else {
18298 // We could not vectorize `Inst` so try to use it as a future seed.
18299 if (!TryAppendToPostponedInsts(Inst)) {
18300 assert(Stack.empty() && "Expected empty stack");
18301 break;
18302 }
18303 }
18304
18305 // Try to vectorize operands.
18306 // Continue analysis for the instruction from the same basic block only to
18307 // save compile time.
18308 if (++Level < RecursionMaxDepth)
18309 for (auto *Op : Inst->operand_values())
18310 if (VisitedInstrs.insert(Ptr: Op).second)
18311 if (auto *I = dyn_cast<Instruction>(Val: Op))
18312 // Do not try to vectorize CmpInst operands, this is done
18313 // separately.
18314 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(Val: I) &&
18315 !R.isDeleted(I) && I->getParent() == BB)
18316 Stack.emplace(args&: I, args&: Level);
18317 }
18318 return Res;
18319}
18320
18321bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
18322 BasicBlock *BB, BoUpSLP &R,
18323 TargetTransformInfo *TTI) {
18324 SmallVector<WeakTrackingVH> PostponedInsts;
18325 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
18326 Res |= tryToVectorize(Insts: PostponedInsts, R);
18327 return Res;
18328}
18329
18330bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
18331 BoUpSLP &R) {
18332 bool Res = false;
18333 for (Value *V : Insts)
18334 if (auto *Inst = dyn_cast<Instruction>(Val: V); Inst && !R.isDeleted(I: Inst))
18335 Res |= tryToVectorize(I: Inst, R);
18336 return Res;
18337}
18338
18339bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
18340 BasicBlock *BB, BoUpSLP &R,
18341 bool MaxVFOnly) {
18342 if (!R.canMapToVector(T: IVI->getType()))
18343 return false;
18344
18345 SmallVector<Value *, 16> BuildVectorOpds;
18346 SmallVector<Value *, 16> BuildVectorInsts;
18347 if (!findBuildAggregate(LastInsertInst: IVI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts))
18348 return false;
18349
18350 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
18351 R.getORE()->emit(RemarkBuilder: [&]() {
18352 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
18353 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
18354 "trying reduction first.";
18355 });
18356 return false;
18357 }
18358 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in a vector register.
18360 return tryToVectorizeList(VL: BuildVectorOpds, R, MaxVFOnly);
18361}
18362
18363bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
18364 BasicBlock *BB, BoUpSLP &R,
18365 bool MaxVFOnly) {
18366 SmallVector<Value *, 16> BuildVectorInsts;
18367 SmallVector<Value *, 16> BuildVectorOpds;
18368 SmallVector<int> Mask;
18369 if (!findBuildAggregate(LastInsertInst: IEI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts) ||
18370 (llvm::all_of(Range&: BuildVectorOpds, P: IsaPred<ExtractElementInst, UndefValue>) &&
18371 isFixedVectorShuffle(VL: BuildVectorOpds, Mask)))
18372 return false;
18373
18374 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
18375 R.getORE()->emit(RemarkBuilder: [&]() {
18376 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
18377 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
18378 "trying reduction first.";
18379 });
18380 return false;
18381 }
18382 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
18383 return tryToVectorizeList(VL: BuildVectorInsts, R, MaxVFOnly);
18384}
18385
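/// Generic driver that sorts \p Incoming with \p Comparator, groups adjacent
/// elements that \p AreCompatible considers compatible, and hands each group
/// of two or more elements to \p TryToVectorizeHelper. Used below for PHIs,
/// compares and stores.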
18386template <typename T>
18387static bool tryToVectorizeSequence(
18388 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
18389 function_ref<bool(T *, T *)> AreCompatible,
18390 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
18391 bool MaxVFOnly, BoUpSLP &R) {
18392 bool Changed = false;
18393 // Sort by type, parent, operands.
18394 stable_sort(Incoming, Comparator);
18395
  // Try to vectorize elements based on their type.
18397 SmallVector<T *> Candidates;
18398 SmallVector<T *> VL;
18399 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
18400 VL.clear()) {
18401 // Look for the next elements with the same type, parent and operand
18402 // kinds.
18403 auto *I = dyn_cast<Instruction>(*IncIt);
18404 if (!I || R.isDeleted(I)) {
18405 ++IncIt;
18406 continue;
18407 }
18408 auto *SameTypeIt = IncIt;
18409 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
18410 R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
18411 AreCompatible(*SameTypeIt, *IncIt))) {
18412 auto *I = dyn_cast<Instruction>(*SameTypeIt);
18413 ++SameTypeIt;
18414 if (I && !R.isDeleted(I))
18415 VL.push_back(cast<T>(I));
18416 }
18417
18418 // Try to vectorize them.
18419 unsigned NumElts = VL.size();
18420 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
18421 << NumElts << ")\n");
    // The vectorization is a 3-state attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with
    //    the size of the maximal register at first.
    // 2. Try to vectorize the remaining instructions with the same type, if
    //    possible. This may give better results than vectorizing only
    //    instructions with the same/alternate opcodes.
    // 3. Make a final attempt to vectorize all instructions with the
    //    same/alternate ops only; this may result in some extra final
    //    vectorization.
18431 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success. Start over because instructions might have been changed.
18433 Changed = true;
18434 VL.swap(Candidates);
18435 Candidates.clear();
18436 for (T *V : VL) {
18437 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18438 Candidates.push_back(V);
18439 }
18440 } else {
18441 /// \Returns the minimum number of elements that we will attempt to
18442 /// vectorize.
18443 auto GetMinNumElements = [&R](Value *V) {
18444 unsigned EltSize = R.getVectorElementSize(V);
18445 return std::max(a: 2U, b: R.getMaxVecRegSize() / EltSize);
18446 };
18447 if (NumElts < GetMinNumElements(*IncIt) &&
18448 (Candidates.empty() ||
18449 Candidates.front()->getType() == (*IncIt)->getType())) {
18450 for (T *V : VL) {
18451 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18452 Candidates.push_back(V);
18453 }
18454 }
18455 }
18456 // Final attempt to vectorize instructions with the same types.
18457 if (Candidates.size() > 1 &&
18458 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18459 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success. Start over because instructions might have been changed.
18461 Changed = true;
18462 } else if (MaxVFOnly) {
18463 // Try to vectorize using small vectors.
18464 SmallVector<T *> VL;
18465 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
18466 VL.clear()) {
18467 auto *I = dyn_cast<Instruction>(*It);
18468 if (!I || R.isDeleted(I)) {
18469 ++It;
18470 continue;
18471 }
18472 auto *SameTypeIt = It;
18473 while (SameTypeIt != End &&
18474 (!isa<Instruction>(*SameTypeIt) ||
18475 R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
18476 AreCompatible(*SameTypeIt, *It))) {
18477 auto *I = dyn_cast<Instruction>(*SameTypeIt);
18478 ++SameTypeIt;
18479 if (I && !R.isDeleted(I))
18480 VL.push_back(cast<T>(I));
18481 }
18482 unsigned NumElts = VL.size();
18483 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
18484 /*MaxVFOnly=*/false))
18485 Changed = true;
18486 It = SameTypeIt;
18487 }
18488 }
18489 Candidates.clear();
18490 }
18491
18492 // Start over at the next instruction of a different type (or the end).
18493 IncIt = SameTypeIt;
18494 }
18495 return Changed;
18496}
18497
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps have the same or swapped predicates and
/// compatible corresponding operands. If IsCompatibility is false, the
/// function implements a strict weak ordering relation between the two cmp
/// instructions, returning true if the first instruction is "less" than the
/// second, i.e. its predicate is less than the predicate of the second or its
/// operand IDs are less than the operand IDs of the second cmp instruction.
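/// Illustrative example: 'icmp sgt %a, %b' and 'icmp slt %b, %a' are treated
/// as compatible here, since they have swapped predicates and swapped
/// operands.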
18505template <bool IsCompatibility>
18506static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
18507 const DominatorTree &DT) {
18508 assert(isValidElementType(V->getType()) &&
18509 isValidElementType(V2->getType()) &&
18510 "Expected valid element types only.");
18511 if (V == V2)
18512 return IsCompatibility;
18513 auto *CI1 = cast<CmpInst>(Val: V);
18514 auto *CI2 = cast<CmpInst>(Val: V2);
18515 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() <
18516 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
18517 return !IsCompatibility;
18518 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() >
18519 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
18520 return false;
18521 CmpInst::Predicate Pred1 = CI1->getPredicate();
18522 CmpInst::Predicate Pred2 = CI2->getPredicate();
18523 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(pred: Pred1);
18524 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(pred: Pred2);
18525 CmpInst::Predicate BasePred1 = std::min(a: Pred1, b: SwapPred1);
18526 CmpInst::Predicate BasePred2 = std::min(a: Pred2, b: SwapPred2);
18527 if (BasePred1 < BasePred2)
18528 return !IsCompatibility;
18529 if (BasePred1 > BasePred2)
18530 return false;
18531 // Compare operands.
18532 bool CI1Preds = Pred1 == BasePred1;
18533 bool CI2Preds = Pred2 == BasePred1;
18534 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
18535 auto *Op1 = CI1->getOperand(i_nocapture: CI1Preds ? I : E - I - 1);
18536 auto *Op2 = CI2->getOperand(i_nocapture: CI2Preds ? I : E - I - 1);
18537 if (Op1 == Op2)
18538 continue;
18539 if (Op1->getValueID() < Op2->getValueID())
18540 return !IsCompatibility;
18541 if (Op1->getValueID() > Op2->getValueID())
18542 return false;
18543 if (auto *I1 = dyn_cast<Instruction>(Val: Op1))
18544 if (auto *I2 = dyn_cast<Instruction>(Val: Op2)) {
18545 if (IsCompatibility) {
18546 if (I1->getParent() != I2->getParent())
18547 return false;
18548 } else {
18549 // Try to compare nodes with same parent.
18550 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(BB: I1->getParent());
18551 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(BB: I2->getParent());
18552 if (!NodeI1)
18553 return NodeI2 != nullptr;
18554 if (!NodeI2)
18555 return false;
18556 assert((NodeI1 == NodeI2) ==
18557 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18558 "Different nodes should have different DFS numbers");
18559 if (NodeI1 != NodeI2)
18560 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18561 }
18562 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI);
18563 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18564 continue;
18565 if (IsCompatibility)
18566 return false;
18567 if (I1->getOpcode() != I2->getOpcode())
18568 return I1->getOpcode() < I2->getOpcode();
18569 }
18570 }
18571 return IsCompatibility;
18572}
18573
18574template <typename ItT>
18575bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
18576 BasicBlock *BB, BoUpSLP &R) {
18577 bool Changed = false;
18578 // Try to find reductions first.
18579 for (CmpInst *I : CmpInsts) {
18580 if (R.isDeleted(I))
18581 continue;
18582 for (Value *Op : I->operands())
18583 if (auto *RootOp = dyn_cast<Instruction>(Val: Op))
18584 Changed |= vectorizeRootInstruction(P: nullptr, Root: RootOp, BB, R, TTI);
18585 }
18586 // Try to vectorize operands as vector bundles.
18587 for (CmpInst *I : CmpInsts) {
18588 if (R.isDeleted(I))
18589 continue;
18590 Changed |= tryToVectorize(I, R);
18591 }
18592 // Try to vectorize list of compares.
18593 // Sort by type, compare predicate, etc.
18594 auto CompareSorter = [&](Value *V, Value *V2) {
18595 if (V == V2)
18596 return false;
18597 return compareCmp<false>(V, V2, TLI&: *TLI, DT: *DT);
18598 };
18599
18600 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
18601 if (V1 == V2)
18602 return true;
18603 return compareCmp<true>(V: V1, V2, TLI&: *TLI, DT: *DT);
18604 };
18605
18606 SmallVector<Value *> Vals;
18607 for (Instruction *V : CmpInsts)
18608 if (!R.isDeleted(I: V) && isValidElementType(Ty: V->getType()))
18609 Vals.push_back(Elt: V);
18610 if (Vals.size() <= 1)
18611 return Changed;
18612 Changed |= tryToVectorizeSequence<Value>(
18613 Vals, CompareSorter, AreCompatibleCompares,
18614 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18615 // Exclude possible reductions from other blocks.
18616 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
18617 return any_of(V->users(), [V](User *U) {
18618 auto *Select = dyn_cast<SelectInst>(Val: U);
18619 return Select &&
18620 Select->getParent() != cast<Instruction>(Val: V)->getParent();
18621 });
18622 });
18623 if (ArePossiblyReducedInOtherBlock)
18624 return false;
18625 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
18626 },
18627 /*MaxVFOnly=*/true, R);
18628 return Changed;
18629}
18630
18631bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18632 BasicBlock *BB, BoUpSLP &R) {
18633 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18634 "This function only accepts Insert instructions");
18635 bool OpsChanged = false;
18636 SmallVector<WeakTrackingVH> PostponedInsts;
18637 for (auto *I : reverse(C&: Instructions)) {
18638 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
18639 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
18640 continue;
18641 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
18642 OpsChanged |=
18643 vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/true);
18644 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
18645 OpsChanged |=
18646 vectorizeInsertElementInst(IEI: LastInsertElem, BB, R, /*MaxVFOnly=*/true);
18647 }
18648 // pass2 - try to vectorize reductions only
18649 if (R.isDeleted(I))
18650 continue;
18651 OpsChanged |= vectorizeHorReduction(P: nullptr, Root: I, BB, R, TTI, PostponedInsts);
18652 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
18653 continue;
18654 // pass3 - try to match and vectorize a buildvector sequence.
18655 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
18656 OpsChanged |=
18657 vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/false);
18658 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
18659 OpsChanged |= vectorizeInsertElementInst(IEI: LastInsertElem, BB, R,
18660 /*MaxVFOnly=*/false);
18661 }
18662 }
18663 // Now try to vectorize postponed instructions.
18664 OpsChanged |= tryToVectorize(Insts: PostponedInsts, R);
18665
18666 Instructions.clear();
18667 return OpsChanged;
18668}
18669
18670bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
18671 bool Changed = false;
18672 SmallVector<Value *, 4> Incoming;
18673 SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. This helps to better identify the chains that can be vectorized.
18677 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
18678 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
18679 assert(isValidElementType(V1->getType()) &&
18680 isValidElementType(V2->getType()) &&
18681 "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers; we don't care about other types.
18684 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
18685 return true;
18686 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
18687 return false;
18688 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18689 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18690 if (Opcodes1.size() < Opcodes2.size())
18691 return true;
18692 if (Opcodes1.size() > Opcodes2.size())
18693 return false;
18694 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18695 {
18696 // Instructions come first.
18697 auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]);
18698 auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I]);
18699 if (I1 && I2) {
18700 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
18701 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
18702 if (!NodeI1)
18703 return NodeI2 != nullptr;
18704 if (!NodeI2)
18705 return false;
18706 assert((NodeI1 == NodeI2) ==
18707 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18708 "Different nodes should have different DFS numbers");
18709 if (NodeI1 != NodeI2)
18710 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18711 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
18712 if (S.getOpcode() && !S.isAltShuffle())
18713 continue;
18714 return I1->getOpcode() < I2->getOpcode();
18715 }
18716 if (I1)
18717 return true;
18718 if (I2)
18719 return false;
18720 }
18721 {
18722 // Non-undef constants come next.
18723 bool C1 = isa<Constant>(Val: Opcodes1[I]) && !isa<UndefValue>(Val: Opcodes1[I]);
18724 bool C2 = isa<Constant>(Val: Opcodes2[I]) && !isa<UndefValue>(Val: Opcodes2[I]);
18725 if (C1 && C2)
18726 continue;
18727 if (C1)
18728 return true;
18729 if (C2)
18730 return false;
18731 }
18732 bool U1 = isa<UndefValue>(Val: Opcodes1[I]);
18733 bool U2 = isa<UndefValue>(Val: Opcodes2[I]);
18734 {
18735 // Non-constant non-instructions come next.
18736 if (!U1 && !U2) {
18737 auto ValID1 = Opcodes1[I]->getValueID();
18738 auto ValID2 = Opcodes2[I]->getValueID();
18739 if (ValID1 == ValID2)
18740 continue;
18741 if (ValID1 < ValID2)
18742 return true;
18743 if (ValID1 > ValID2)
18744 return false;
18745 }
18746 if (!U1)
18747 return true;
18748 if (!U2)
18749 return false;
18750 }
18751 // Undefs come last.
18752 assert(U1 && U2 && "The only thing left should be undef & undef.");
18753 continue;
18754 }
18755 return false;
18756 };
18757 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
18758 if (V1 == V2)
18759 return true;
18760 if (V1->getType() != V2->getType())
18761 return false;
18762 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18763 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18764 if (Opcodes1.size() != Opcodes2.size())
18765 return false;
18766 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18767 // Undefs are compatible with any other value.
18768 if (isa<UndefValue>(Val: Opcodes1[I]) || isa<UndefValue>(Val: Opcodes2[I]))
18769 continue;
18770 if (auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]))
18771 if (auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I])) {
18772 if (R.isDeleted(I: I1) || R.isDeleted(I: I2))
18773 return false;
18774 if (I1->getParent() != I2->getParent())
18775 return false;
18776 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
18777 if (S.getOpcode())
18778 continue;
18779 return false;
18780 }
18781 if (isa<Constant>(Val: Opcodes1[I]) && isa<Constant>(Val: Opcodes2[I]))
18782 continue;
18783 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
18784 return false;
18785 }
18786 return true;
18787 };
18788
18789 bool HaveVectorizedPhiNodes = false;
18790 do {
18791 // Collect the incoming values from the PHIs.
18792 Incoming.clear();
18793 for (Instruction &I : *BB) {
18794 auto *P = dyn_cast<PHINode>(Val: &I);
18795 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
18796 break;
18797
18798 // No need to analyze deleted, vectorized and non-vectorizable
18799 // instructions.
18800 if (!VisitedInstrs.count(Ptr: P) && !R.isDeleted(I: P) &&
18801 isValidElementType(Ty: P->getType()))
18802 Incoming.push_back(Elt: P);
18803 }
18804
18805 if (Incoming.size() <= 1)
18806 break;
18807
18808 // Find the corresponding non-phi nodes for better matching when trying to
18809 // build the tree.
18810 for (Value *V : Incoming) {
18811 SmallVectorImpl<Value *> &Opcodes =
18812 PHIToOpcodes.try_emplace(Key: V).first->getSecond();
18813 if (!Opcodes.empty())
18814 continue;
18815 SmallVector<Value *, 4> Nodes(1, V);
18816 SmallPtrSet<Value *, 4> Visited;
18817 while (!Nodes.empty()) {
18818 auto *PHI = cast<PHINode>(Val: Nodes.pop_back_val());
18819 if (!Visited.insert(Ptr: PHI).second)
18820 continue;
18821 for (Value *V : PHI->incoming_values()) {
18822 if (auto *PHI1 = dyn_cast<PHINode>(Val: (V))) {
18823 Nodes.push_back(Elt: PHI1);
18824 continue;
18825 }
18826 Opcodes.emplace_back(Args&: V);
18827 }
18828 }
18829 }
18830
18831 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18832 Incoming, Comparator: PHICompare, AreCompatible: AreCompatiblePHIs,
18833 TryToVectorizeHelper: [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18834 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
18835 },
18836 /*MaxVFOnly=*/true, R);
18837 Changed |= HaveVectorizedPhiNodes;
18838 if (HaveVectorizedPhiNodes && any_of(Range&: PHIToOpcodes, P: [&](const auto &P) {
18839 auto *PHI = dyn_cast<PHINode>(P.first);
18840 return !PHI || R.isDeleted(I: PHI);
18841 }))
18842 PHIToOpcodes.clear();
18843 VisitedInstrs.insert(I: Incoming.begin(), E: Incoming.end());
18844 } while (HaveVectorizedPhiNodes);
18845
18846 VisitedInstrs.clear();
18847
18848 InstSetVector PostProcessInserts;
18849 SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
  // true, also vectorizes `PostProcessCmps`.
18852 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
18853 bool Changed = vectorizeInserts(Instructions&: PostProcessInserts, BB, R);
18854 if (VectorizeCmps) {
18855 Changed |= vectorizeCmpInsts(CmpInsts: reverse(C&: PostProcessCmps), BB, R);
18856 PostProcessCmps.clear();
18857 }
18858 PostProcessInserts.clear();
18859 return Changed;
18860 };
18861 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
18862 auto IsInPostProcessInstrs = [&](Instruction *I) {
18863 if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
18864 return PostProcessCmps.contains(key: Cmp);
18865 return isa<InsertElementInst, InsertValueInst>(Val: I) &&
18866 PostProcessInserts.contains(key: I);
18867 };
  // Returns true if `I` is an instruction without users, like a terminator, a
  // store, or a function call with an ignored return value. An instruction is
  // treated as unused based on its type (void result), except for CallInst and
  // InvokeInst, which are accepted even if their result is non-void.
18871 auto HasNoUsers = [](Instruction *I) {
18872 return I->use_empty() &&
18873 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(Val: I));
18874 };
18875 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
18876 // Skip instructions with scalable type. The num of elements is unknown at
18877 // compile-time for scalable type.
18878 if (isa<ScalableVectorType>(Val: It->getType()))
18879 continue;
18880
18881 // Skip instructions marked for the deletion.
18882 if (R.isDeleted(I: &*It))
18883 continue;
    // We may go through BB multiple times, so skip instructions we have
    // already visited.
18885 if (!VisitedInstrs.insert(Ptr: &*It).second) {
18886 if (HasNoUsers(&*It) &&
18887 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
18890 Changed = true;
18891 It = BB->begin();
18892 E = BB->end();
18893 }
18894 continue;
18895 }
18896
18897 if (isa<DbgInfoIntrinsic>(Val: It))
18898 continue;
18899
18900 // Try to vectorize reductions that use PHINodes.
18901 if (PHINode *P = dyn_cast<PHINode>(Val&: It)) {
18902 // Check that the PHI is a reduction PHI.
18903 if (P->getNumIncomingValues() == 2) {
18904 // Try to match and vectorize a horizontal reduction.
18905 Instruction *Root = getReductionInstr(DT, P, ParentBB: BB, LI);
18906 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
18907 Changed = true;
18908 It = BB->begin();
18909 E = BB->end();
18910 continue;
18911 }
18912 }
18913 // Try to vectorize the incoming values of the PHI, to catch reductions
18914 // that feed into PHIs.
18915 for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
18916 // Skip if the incoming block is the current BB for now. Also, bypass
18917 // unreachable IR for efficiency and to avoid crashing.
18918 // TODO: Collect the skipped incoming values and try to vectorize them
18919 // after processing BB.
18920 if (BB == P->getIncomingBlock(i: I) ||
18921 !DT->isReachableFromEntry(A: P->getIncomingBlock(i: I)))
18922 continue;
18923
18924 // Postponed instructions should not be vectorized here, delay their
18925 // vectorization.
18926 if (auto *PI = dyn_cast<Instruction>(Val: P->getIncomingValue(i: I));
18927 PI && !IsInPostProcessInstrs(PI)) {
18928 bool Res = vectorizeRootInstruction(P: nullptr, Root: PI,
18929 BB: P->getIncomingBlock(i: I), R, TTI);
18930 Changed |= Res;
18931 if (Res && R.isDeleted(I: P)) {
18932 It = BB->begin();
18933 E = BB->end();
18934 break;
18935 }
18936 }
18937 }
18938 continue;
18939 }
18940
18941 if (HasNoUsers(&*It)) {
18942 bool OpsChanged = false;
18943 auto *SI = dyn_cast<StoreInst>(Val&: It);
18944 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
18945 if (SI) {
18946 auto *I = Stores.find(Key: getUnderlyingObject(V: SI->getPointerOperand()));
        // Try to vectorize the chain feeding the store, if this is the only
        // store to the address in the block.
        // TODO: This is just a temporary solution to save compile time. Need
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
18953 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
18954 SI->getValueOperand()->hasOneUse();
18955 }
18956 if (TryToVectorizeRoot) {
18957 for (auto *V : It->operand_values()) {
18958 // Postponed instructions should not be vectorized here, delay their
18959 // vectorization.
18960 if (auto *VI = dyn_cast<Instruction>(Val: V);
18961 VI && !IsInPostProcessInstrs(VI))
18962 // Try to match and vectorize a horizontal reduction.
18963 OpsChanged |= vectorizeRootInstruction(P: nullptr, Root: VI, BB, R, TTI);
18964 }
18965 }
18966 // Start vectorization of post-process list of instructions from the
18967 // top-tree instructions to try to vectorize as many instructions as
18968 // possible.
18969 OpsChanged |=
18970 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
18971 if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
18974 Changed = true;
18975 It = BB->begin();
18976 E = BB->end();
18977 continue;
18978 }
18979 }
18980
18981 if (isa<InsertElementInst, InsertValueInst>(Val: It))
18982 PostProcessInserts.insert(X: &*It);
18983 else if (isa<CmpInst>(Val: It))
18984 PostProcessCmps.insert(X: cast<CmpInst>(Val: &*It));
18985 }
18986
18987 return Changed;
18988}
18989
18990bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
18991 auto Changed = false;
18992 for (auto &Entry : GEPs) {
18993 // If the getelementptr list has fewer than two elements, there's nothing
18994 // to do.
18995 if (Entry.second.size() < 2)
18996 continue;
18997
18998 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
18999 << Entry.second.size() << ".\n");
19000
19001 // Process the GEP list in chunks suitable for the target's supported
19002 // vector size. If a vector register can't hold 1 element, we are done. We
19003 // are trying to vectorize the index computations, so the maximum number of
19004 // elements is based on the size of the index expression, rather than the
19005 // size of the GEP itself (the target's pointer size).
19006 auto *It = find_if(Range&: Entry.second, P: [&](GetElementPtrInst *GEP) {
19007 return !R.isDeleted(I: GEP);
19008 });
19009 if (It == Entry.second.end())
19010 continue;
19011 unsigned MaxVecRegSize = R.getMaxVecRegSize();
19012 unsigned EltSize = R.getVectorElementSize(V: *(*It)->idx_begin());
19013 if (MaxVecRegSize < EltSize)
19014 continue;
19015
19016 unsigned MaxElts = MaxVecRegSize / EltSize;
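    // E.g. (illustrative): with a 128-bit vector register and 64-bit index
    // expressions, the GEP list is processed in chunks of at most 2.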
19017 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
19018 auto Len = std::min<unsigned>(a: BE - BI, b: MaxElts);
19019 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
19020
      // Initialize a set of candidate getelementptrs. Note that we use a
19022 // SetVector here to preserve program order. If the index computations
19023 // are vectorizable and begin with loads, we want to minimize the chance
19024 // of having to reorder them later.
19025 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
19026
19027 // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index was optimized to a constant
      // value. If so, they are marked as deleted, so remove them from the set
      // of candidates.
19031 Candidates.remove_if(P: [&R](Value *I) {
19032 return R.isDeleted(I: cast<Instruction>(Val: I)) ||
19033 isa<Constant>(Val: cast<GetElementPtrInst>(Val: I)->idx_begin()->get());
19034 });
19035
19036 // Remove from the set of candidates all pairs of getelementptrs with
19037 // constant differences. Such getelementptrs are likely not good
19038 // candidates for vectorization in a bottom-up phase since one can be
19039 // computed from the other. We also ensure all candidate getelementptr
19040 // indices are unique.
19041 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
19042 auto *GEPI = GEPList[I];
19043 if (!Candidates.count(key: GEPI))
19044 continue;
19045 auto *SCEVI = SE->getSCEV(V: GEPList[I]);
19046 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
19047 auto *GEPJ = GEPList[J];
19048 auto *SCEVJ = SE->getSCEV(V: GEPList[J]);
19049 if (isa<SCEVConstant>(Val: SE->getMinusSCEV(LHS: SCEVI, RHS: SCEVJ))) {
19050 Candidates.remove(X: GEPI);
19051 Candidates.remove(X: GEPJ);
19052 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
19053 Candidates.remove(X: GEPJ);
19054 }
19055 }
19056 }
19057
19058 // We break out of the above computation as soon as we know there are
19059 // fewer than two candidates remaining.
19060 if (Candidates.size() < 2)
19061 continue;
19062
19063 // Add the single, non-constant index of each candidate to the bundle. We
19064 // ensured the indices met these constraints when we originally collected
19065 // the getelementptrs.
19066 SmallVector<Value *, 16> Bundle(Candidates.size());
19067 auto BundleIndex = 0u;
19068 for (auto *V : Candidates) {
19069 auto *GEP = cast<GetElementPtrInst>(Val: V);
19070 auto *GEPIdx = GEP->idx_begin()->get();
19071 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
19072 Bundle[BundleIndex++] = GEPIdx;
19073 }
19074
19075 // Try and vectorize the indices. We are currently only interested in
19076 // gather-like cases of the form:
19077 //
19078 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
19079 //
19080 // where the loads of "a", the loads of "b", and the subtractions can be
19081 // performed in parallel. It's likely that detecting this pattern in a
19082 // bottom-up phase will be simpler and less costly than building a
19083 // full-blown top-down phase beginning at the consecutive loads.
19084 Changed |= tryToVectorizeList(VL: Bundle, R);
19085 }
19086 }
19087 return Changed;
19088}
19089
19090bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
19091 bool Changed = false;
  // Sort by type, base pointers and value operands. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
19095 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
19096 if (V->getValueOperand()->getType()->getTypeID() <
19097 V2->getValueOperand()->getType()->getTypeID())
19098 return true;
19099 if (V->getValueOperand()->getType()->getTypeID() >
19100 V2->getValueOperand()->getType()->getTypeID())
19101 return false;
19102 if (V->getPointerOperandType()->getTypeID() <
19103 V2->getPointerOperandType()->getTypeID())
19104 return true;
19105 if (V->getPointerOperandType()->getTypeID() >
19106 V2->getPointerOperandType()->getTypeID())
19107 return false;
    // UndefValues are compatible with all other values, so treat them as
    // equal for ordering purposes.
19109 if (isa<UndefValue>(Val: V->getValueOperand()) ||
19110 isa<UndefValue>(Val: V2->getValueOperand()))
19111 return false;
19112 if (auto *I1 = dyn_cast<Instruction>(Val: V->getValueOperand()))
19113 if (auto *I2 = dyn_cast<Instruction>(Val: V2->getValueOperand())) {
19114 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
19115 DT->getNode(BB: I1->getParent());
19116 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
19117 DT->getNode(BB: I2->getParent());
19118 assert(NodeI1 && "Should only process reachable instructions");
19119 assert(NodeI2 && "Should only process reachable instructions");
19120 assert((NodeI1 == NodeI2) ==
19121 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
19122 "Different nodes should have different DFS numbers");
19123 if (NodeI1 != NodeI2)
19124 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
19125 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
19126 if (S.getOpcode())
19127 return false;
19128 return I1->getOpcode() < I2->getOpcode();
19129 }
19130 if (isa<Constant>(Val: V->getValueOperand()) &&
19131 isa<Constant>(Val: V2->getValueOperand()))
19132 return false;
19133 return V->getValueOperand()->getValueID() <
19134 V2->getValueOperand()->getValueID();
19135 };
19136
19137 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
19138 if (V1 == V2)
19139 return true;
19140 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
19141 return false;
19142 if (V1->getPointerOperandType() != V2->getPointerOperandType())
19143 return false;
19144 // Undefs are compatible with any other value.
19145 if (isa<UndefValue>(Val: V1->getValueOperand()) ||
19146 isa<UndefValue>(Val: V2->getValueOperand()))
19147 return true;
19148 if (auto *I1 = dyn_cast<Instruction>(Val: V1->getValueOperand()))
19149 if (auto *I2 = dyn_cast<Instruction>(Val: V2->getValueOperand())) {
19150 if (I1->getParent() != I2->getParent())
19151 return false;
19152 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
19153 return S.getOpcode() > 0;
19154 }
19155 if (isa<Constant>(Val: V1->getValueOperand()) &&
19156 isa<Constant>(Val: V2->getValueOperand()))
19157 return true;
19158 return V1->getValueOperand()->getValueID() ==
19159 V2->getValueOperand()->getValueID();
19160 };
19161
19162 // Attempt to sort and vectorize each of the store-groups.
19163 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
19164 for (auto &Pair : Stores) {
19165 if (Pair.second.size() < 2)
19166 continue;
19167
19168 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
19169 << Pair.second.size() << ".\n");
19170
19171 if (!isValidElementType(Ty: Pair.second.front()->getValueOperand()->getType()))
19172 continue;
19173
    // Reverse stores to do bottom-to-top analysis. This is important if there
    // are several stores to the same address: in this case we need to follow
    // the store order (reversed to meet the memory dependencies).
19177 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
19178 Pair.second.rend());
19179 Changed |= tryToVectorizeSequence<StoreInst>(
19180 Incoming&: ReversedStores, Comparator: StoreSorter, AreCompatible: AreCompatibleStores,
19181 TryToVectorizeHelper: [&](ArrayRef<StoreInst *> Candidates, bool) {
19182 return vectorizeStores(Stores: Candidates, R, Visited&: Attempted);
19183 },
19184 /*MaxVFOnly=*/false, R);
19185 }
19186 return Changed;
19187}
19188