//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
              "Controls which SLP graphs should be vectorized.");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
    cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<bool> SplitAlternateInstructions(
    "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
    cl::desc("Improve the code quality by splitting alternate instructions"));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand ordering this is less frequently used, hence
// the impact of a higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.
static constexpr int UsesLimit = 64;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// Returns the type of the given value/instruction \p V. If it is a store,
/// returns the type of its value operand; for Cmp - the type of the compare
/// operands; and for insertelement - the type of the inserted operand.
/// Otherwise, just the type of the value is returned.
static Type *getValueType(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  return V->getType();
}

/// \returns the number of elements for Ty.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}

/// \returns the vector type of ScalarTy based on vectorization factor.
static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
  return FixedVectorType::get(ScalarTy->getScalarType(),
                              VF * getNumElements(ScalarTy));
}
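
// Illustrative examples for getWidenedType: getWidenedType(i32, 4) produces
// <4 x i32>; under REVEC, where ScalarTy may itself be a fixed vector,
// getWidenedType(<2 x i8>, 4) produces <8 x i8>.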

/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_ceil(Sz);
  // Find the number of elements, which forms full vectors.
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);
  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}
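
// Worked example for getFullVectorNumberOfElements; the exact result is
// target-dependent. Assuming TTI reports 2 parts for the widened type when
// Sz == 7, the result is bit_ceil(divideCeil(7, 2)) * 2 == 4 * 2 == 8, i.e.
// the 7 scalars are rounded up so each register holds a whole power-of-2
// vector.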

/// Returns the number of elements of the given type \p Ty, not greater than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
                                   unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_floor(Sz);
  // Find the number of elements, which forms full vectors.
  unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
  if (RegVF > Sz)
    return bit_floor(Sz);
  return (Sz / RegVF) * RegVF;
}

static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {
  // The ShuffleBuilder implementation uses shufflevector to splat an
  // "element". But the element has a different meaning for SLP (scalar) and
  // REVEC (vector). We need to expand Mask into masks which shufflevector can
  // use directly.
  SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}
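
// Illustrative example: with VecTyNumElements == 2,
// transformScalarShuffleIndiciesToVector expands Mask == {1, 0} into
// {2, 3, 0, 1}, i.e. each scalar index becomes a run of vector-lane indices.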

/// \returns the number of groups of shufflevector
/// A group has the following features
/// 1. All of the values in a group are shufflevectors.
/// 2. The mask of each shufflevector is an extract-subvector mask
///    (isExtractSubvectorMask).
/// 3. Together, the masks of the shufflevectors in a group use all of the
///    elements of the source.
/// e.g., it is 1 group (%0)
/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
/// it is 2 groups (%3 and %4)
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// it is 0 groups
/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (VL.empty())
    return 0;
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // From the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}

/// \returns a shufflevector mask which is used to vectorize shufflevectors
/// e.g.,
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// the result is
/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

/// Returns the power-of-2 number of elements in a single register (part),
/// given the total number of elements \p Size and number of registers (parts)
/// \p NumParts.
static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
  return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
}

/// Returns the correct remaining number of elements, considering total amount
/// \p Size, (power-of-2 number) of elements in a single register \p
/// PartNumElems and current register (part) \p Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}
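
// Worked example: for Size == 13 and NumParts == 4,
// getPartNumElems(13, 4) == min(13, bit_ceil(divideCeil(13, 4))) == 4, and the
// last register holds getNumElems(13, 4, /*Part=*/3) == min(4, 13 - 12) == 1
// element.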

#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  Instruction *I0 = cast<Instruction>(*It);
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (Value *V : iterator_range(It, VL.end())) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;

    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal
  // integer/FP constants.
  return all_of(VL, isConstant);
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}

/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
/// For BinaryOperator, it also checks if \p InstWithUses is used in specific
/// patterns that make it effectively commutative (like equality comparisons
/// with zero).
/// In most cases, users should not call this function directly (since \p I and
/// \p InstWithUses are the same). However, when analyzing interchangeable
/// instructions, we need to use the converted opcode along with the original
/// uses.
/// \param I The instruction to check for commutativity
/// \param InstWithUses The instruction whose uses are analyzed for special
/// patterns
static bool isCommutative(Instruction *I, Instruction *InstWithUses) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !InstWithUses->hasNUsesOrMore(UsesLimit) &&
            all_of(
                InstWithUses->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpPredicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !InstWithUses->hasNUsesOrMore(UsesLimit) &&
            all_of(InstWithUses->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}

/// This is a helper function to check whether \p I is commutative.
/// This is a convenience wrapper that calls the two-parameter version of
/// isCommutative with the same instruction for both parameters. This is
/// the common case where the instruction being checked for commutativity
/// is the same as the instruction whose uses are analyzed for special
/// patterns (see the two-parameter version above for details).
/// \param I The instruction to check for commutativity
/// \returns true if the instruction is commutative, false otherwise
static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
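
// Illustrative example (hypothetical IR): isCommutative treats the following
// 'sub' as effectively commutative, because its only user is an equality
// compare against zero, so swapping the operands cannot change the result:
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0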

template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns inserting or extracting index of InsertElement, ExtractElement or
/// InsertValue instruction, using Offset as base offset for index.
/// \returns std::nullopt if the index is not an immediate.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;

  int Index = Offset;

  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
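
// Worked example (hypothetical IR): for
//   %agg.new = insertvalue {[2 x i32], [2 x i32]} %agg, i32 %x, 1, 0
// getElementIndex flattens the indices: the outer struct index 1 is scaled by
// the inner array size 2 and the inner index 0 is added, returning 2.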

/// \returns true if all of the values in \p VL use the same opcode.
/// For comparison instructions, also checks if predicates match.
/// PoisonValues are considered matching.
/// Interchangeable instructions are not considered.
static bool allSameOpcode(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return true;
  Instruction *MainOp = cast<Instruction>(*It);
  unsigned Opcode = MainOp->getOpcode();
  bool IsCmpOp = isa<CmpInst>(MainOp);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                        : CmpInst::BAD_ICMP_PREDICATE;
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
  });
}
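
// Illustrative examples: allSameOpcode({add, add, poison}) is true (poison
// matches anything), allSameOpcode({add, sub}) is false, and two icmp
// instructions with different predicates also yield false.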

namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg,    ///< The mask is expected to be for permutation of 1-2 vectors,
               ///< check for the mask elements for the first argument (mask
               ///< indices are in range [0:VF)).
  SecondArg,   ///< The mask is expected to be for permutation of 2 vectors,
               ///< check for the mask elements for the second argument (mask
               ///< indices are in range [VF:2*VF)).
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused since
               ///< they're already marked as used in the mask.
};
} // namespace

/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}
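
// Illustrative example: with VF == 4 and Mask == {0, 5, PoisonMaskElem, 2},
// buildUseMask with FirstArg clears bits 0 and 2, with SecondArg clears
// bit 1 (element 5 - VF), and with UndefsAsMask clears only bit 2 (the
// position holding the poison element).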

/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}

/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
///                                                         i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
                     AssumptionCache *AC) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  bool HasNonUndefVec = any_of(VL, [&](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      return false;
    return isGuaranteedNotToBePoison(Vec, AC);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*IsPoisonOnly=*/true>(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    if (isUndefVector(Vec).all() && HasNonUndefVec)
      continue;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector, otherwise
  // we have a permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static std::optional<unsigned> getExtractIndex(const Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}

namespace {
/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// An example of an unsupported opcode is SDIV, which can potentially cause UB
/// if the "shuffled out" lane would result in division by zero.
bool isValidForAlternation(unsigned Opcode) {
  return !Instruction::isIntDivRem(Opcode);
}

/// Helper class that determines whether VL can use the same opcode.
/// Alternate instructions are supported. In addition, it supports
/// interchangeable instructions. An interchangeable instruction is an
/// instruction that can be converted to another instruction with the same
/// semantics. For example, x << 1 is equal to x * 2. x * 1 is equal to x | 0.
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  /// Keep SupportedOp sorted because it is used by binary_search.
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
  enum : MaskType {
    ShlBIT = 0b1,
    AShrBIT = 0b10,
    MulBIT = 0b100,
    AddBIT = 0b1000,
    SubBIT = 0b10000,
    AndBIT = 0b100000,
    OrBIT = 0b1000000,
    XorBIT = 0b10000000,
    MainOpBIT = 0b100000000,
    LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT)
  };
  /// Return a non-nullptr if either operand of I is a ConstantInt.
  /// The second return value represents the operand position. We check the
  /// right-hand side first (1). If the right-hand side is not a ConstantInt and
  /// the instruction is neither Sub, Shl, nor AShr, we then check the left-hand
  /// side (0).
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
    (void)SupportedOp;
    auto *BinOp = cast<BinaryOperator>(I);
    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
      return {CI, 1};
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
      return {nullptr, 0};
    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
      return {CI, 0};
    return {nullptr, 0};
  }
  struct InterchangeableInfo {
    const Instruction *I = nullptr;
    /// Each set bit represents an opcode that MainOp can be converted to.
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    /// We cannot create an interchangeable instruction that does not exist in
    /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
    /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
    /// 1]. SeenBefore is used to know which operations have been seen before.
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    /// Returning false allows BinOpSameOpcodeHelper to find an alternate
    /// instruction. Directly setting the mask would destroy the mask state,
    /// preventing us from determining which instruction it should convert to.
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
        return true;
      }
      return false;
    }
    bool equal(unsigned Opcode) {
      if (Opcode == I->getOpcode())
        return trySet(MainOpBIT, MainOpBIT);
      return false;
    }
    unsigned getOpcode() const {
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
      llvm_unreachable("Cannot find interchangeable instruction.");
    }
    SmallVector<Value *> getOperand(const Instruction *To) const {
      unsigned ToOpcode = To->getOpcode();
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
        return SmallVector<Value *>(I->operands());
      assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      APInt ToCIValue;
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
          ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
                                          FromCIValue.getZExtValue());
        } else {
          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          ? APInt::getAllOnes(FromCIValueBitWidth)
                          : APInt::getZero(FromCIValueBitWidth);
        }
        break;
      case Instruction::Mul:
        assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
        if (ToOpcode == Instruction::Shl) {
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
        } else {
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          ? APInt::getAllOnes(FromCIValueBitWidth)
                          : APInt::getZero(FromCIValueBitWidth);
        }
        break;
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
          ToCIValue = APInt::getZero(FromCIValueBitWidth);
        } else {
          assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
                 "Cannot convert the instruction.");
          ToCIValue = FromCIValue;
          ToCIValue.negate();
        }
        break;
      case Instruction::And:
        assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::Mul
                        ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
                        : APInt::getZero(FromCIValueBitWidth);
        break;
      default:
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        ToCIValue = APInt::getZero(FromCIValueBitWidth);
        break;
      }
      Value *LHS = I->getOperand(1 - Pos);
      Constant *RHS =
          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
      // constant + x cannot be -constant - x
      // instead, it should be x - -constant
      if (Pos == 1 ||
          (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
        return SmallVector<Value *>({LHS, RHS});
      return SmallVector<Value *>({RHS, LHS});
    }
  };
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());
  }
  bool initializeAltOp(const Instruction *I) {
    if (AltOp.I)
      return true;
    if (!isValidForAlternation(I))
      return false;
    AltOp.I = I;
    return true;
  }

public:
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
    assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
  }
  bool add(const Instruction *I) {
    assert(isa<BinaryOperator>(I) &&
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
    switch (Opcode) {
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
      break;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
      break;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
      break;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
      break;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
      break;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
      break;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
      break;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      break;
    default:
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    }
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
    if (CI) {
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      switch (Opcode) {
      case Instruction::Shl:
        if (CIValue.ult(CIValue.getBitWidth()))
          InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
        break;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
          break;
        }
        if (CIValue.isPowerOf2())
          InterchangeableMask = MulBIT | ShlBIT;
        break;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
        break;
      case Instruction::And:
        if (CIValue.isAllOnes())
          InterchangeableMask = CanBeAll;
        break;
      default:
        if (CIValue.isZero())
          InterchangeableMask = CanBeAll;
        break;
      }
    }
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  }
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
  }
  SmallVector<Value *> getOperand(const Instruction *I) const {
    return MainOp.getOperand(I);
  }
};
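
// Illustrative example: for VL = {x << 1, y * 2} both instructions report the
// interchangeable mask MulBIT | ShlBIT, BinOpSameOpcodeHelper settles on Shl
// as the common opcode, and getOperand() rewrites y * 2 with operands (y, 1)
// so that both lanes can participate in a single shl bundle.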

/// Main data required for vectorization of instructions.
class InstructionsState {
  /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
  /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
  /// (i.e., AltOp is not equal to MainOp; this can be checked using
  /// isAltShuffle).
  /// A rare exception is TrySplitNode, where the InstructionsState is derived
  /// from getMainAltOpsNoStateVL.
  /// For those InstructionsState that use alternate instructions, the resulting
  /// vectorized output ultimately comes from a shufflevector. For example,
  /// given a vector list (VL):
  /// VL[0] = add i32 a, e
  /// VL[1] = sub i32 b, f
  /// VL[2] = add i32 c, g
  /// VL[3] = sub i32 d, h
  /// The vectorized result would be:
  /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// result = shufflevector <4 x i32> intermediated_0,
  ///                        <4 x i32> intermediated_1,
  ///                        <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  /// Since shufflevector is used in the final result, when calculating the cost
  /// (getEntryCost), we must account for the usage of shufflevector in
  /// GetVectorCost.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  /// Checks if the instruction matches either the main or alternate opcode.
  /// \returns
  /// - MainOp if \param I matches MainOp's opcode directly or can be converted
  /// to it
  /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
  /// it
  /// - nullptr if \param I cannot be matched or converted to either opcode
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
      return MainOp;
    // Prefer AltOp instead of an interchangeable instruction of MainOp.
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
      return AltOp;
    if (!I->isBinaryOp())
      return nullptr;
    BinOpSameOpcodeHelper Converter(MainOp);
    if (Converter.add(I) && Converter.add(MainOp) && !Converter.hasAltOp())
      return MainOp;
    return AltOp;
  }

  /// Checks if main/alt instructions are shift operations.
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
  }

  /// Checks if main/alt instructions are bitwise logic operations.
  bool isBitwiseLogicOp() const {
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  }

  /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
    return is_contained(MulDiv, getOpcode()) &&
           is_contained(MulDiv, getAltOpcode());
  }

  /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
        Instruction::FSub};
    return is_contained(AddSub, getOpcode()) &&
           is_contained(AddSub, getAltOpcode());
  }

  /// Checks if main/alt instructions are cmp operations.
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
            getOpcode() == Instruction::FCmp) &&
           getAltOpcode() == getOpcode();
  }

  /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp)
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
};

std::pair<Instruction *, SmallVector<Value *>>
convertTo(Instruction *I, const InstructionsState &S) {
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    BinOpSameOpcodeHelper Converter(I);
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
  }
  return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
}

} // end anonymous namespace

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI);

/// Find an instruction with a specific opcode in VL.
/// \param VL Array of values to search through. Must contain only Instructions
/// and PoisonValues.
/// \param Opcode The instruction opcode to search for
/// \returns
/// - The first instruction found with matching opcode
/// - nullptr if no matching instruction is found
static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
                                              unsigned Opcode) {
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
    auto *Inst = cast<Instruction>(V);
    if (Inst->getOpcode() == Opcode)
      return Inst;
  }
  return nullptr;
}

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI) ||
         getSameOpcode({BaseOp1, Op1}, TLI);
}

/// \returns true if a compare instruction \p CI has similar "look" and
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}
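
// Illustrative example (hypothetical IR): isCmpSameOrSwapped treats
// 'icmp sgt i32 %a, %b' and 'icmp slt i32 %b, %a' as the same comparison,
// since the second is the first with both the predicate and the operands
// swapped.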
1315
1316/// \returns analysis of the Instructions in \p VL described in
1317/// InstructionsState, the Opcode that we suppose the whole list
1318/// could be vectorized even if its structure is diverse.
1319static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1320 const TargetLibraryInfo &TLI) {
1321 // Make sure these are all Instructions.
1322 if (!all_of(Range&: VL, P: IsaPred<Instruction, PoisonValue>))
1323 return InstructionsState::invalid();
1324
1325 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
1326 if (It == VL.end())
1327 return InstructionsState::invalid();
1328
1329 Instruction *MainOp = cast<Instruction>(Val: *It);
1330 unsigned InstCnt = std::count_if(first: It, last: VL.end(), pred: IsaPred<Instruction>);
1331 if ((VL.size() > 2 && !isa<PHINode>(Val: MainOp) && InstCnt < VL.size() / 2) ||
1332 (VL.size() == 2 && InstCnt < 2))
1333 return InstructionsState::invalid();
1334
1335 bool IsCastOp = isa<CastInst>(Val: MainOp);
1336 bool IsBinOp = isa<BinaryOperator>(Val: MainOp);
1337 bool IsCmpOp = isa<CmpInst>(Val: MainOp);
1338 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(Val: MainOp)->getPredicate()
1339 : CmpInst::BAD_ICMP_PREDICATE;
1340 Instruction *AltOp = MainOp;
1341 unsigned Opcode = MainOp->getOpcode();
1342 unsigned AltOpcode = Opcode;
1343
1344 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1345 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1346 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1347 UniquePreds.insert(X: BasePred);
1348 UniqueNonSwappedPreds.insert(X: BasePred);
1349 for (Value *V : VL) {
1350 auto *I = dyn_cast<CmpInst>(Val: V);
1351 if (!I)
1352 return false;
1353 CmpInst::Predicate CurrentPred = I->getPredicate();
1354 CmpInst::Predicate SwappedCurrentPred =
1355 CmpInst::getSwappedPredicate(pred: CurrentPred);
1356 UniqueNonSwappedPreds.insert(X: CurrentPred);
1357 if (!UniquePreds.contains(key: CurrentPred) &&
1358 !UniquePreds.contains(key: SwappedCurrentPred))
1359 UniquePreds.insert(X: CurrentPred);
1360 }
1361 // Total number of predicates > 2, but if consider swapped predicates
1362 // compatible only 2, consider swappable predicates as compatible opcodes,
1363 // not alternate.
1364 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1365 }();
1366 // Check for one alternate opcode from another BinaryOperator.
1367 // TODO - generalize to support all operators (types, calls etc.).
1368 Intrinsic::ID BaseID = 0;
1369 SmallVector<VFInfo> BaseMappings;
1370 if (auto *CallBase = dyn_cast<CallInst>(Val: MainOp)) {
1371 BaseID = getVectorIntrinsicIDForCall(CI: CallBase, TLI: &TLI);
1372 BaseMappings = VFDatabase(*CallBase).getMappings(CI: *CallBase);
1373 if (!isTriviallyVectorizable(ID: BaseID) && BaseMappings.empty())
1374 return InstructionsState::invalid();
1375 }
1376 bool AnyPoison = InstCnt != VL.size();
1377 // Check MainOp too to be sure that it matches the requirements for the
1378 // instructions.
1379 for (Value *V : iterator_range(It, VL.end())) {
1380 auto *I = dyn_cast<Instruction>(Val: V);
1381 if (!I)
1382 continue;
1383
1384 // Cannot combine poison and divisions.
1385 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1386 // intrinsics/functions only.
1387 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(Val: I)))
1388 return InstructionsState::invalid();
1389 unsigned InstOpcode = I->getOpcode();
1390 if (IsBinOp && isa<BinaryOperator>(Val: I)) {
1391 if (BinOpHelper.add(I))
1392 continue;
1393 } else if (IsCastOp && isa<CastInst>(Val: I)) {
1394 Value *Op0 = MainOp->getOperand(i: 0);
1395 Type *Ty0 = Op0->getType();
1396 Value *Op1 = I->getOperand(i: 0);
1397 Type *Ty1 = Op1->getType();
1398 if (Ty0 == Ty1) {
1399 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1400 continue;
1401 if (Opcode == AltOpcode) {
1402 assert(isValidForAlternation(Opcode) &&
1403 isValidForAlternation(InstOpcode) &&
1404 "Cast isn't safe for alternation, logic needs to be updated!");
1405 AltOpcode = InstOpcode;
1406 AltOp = I;
1407 continue;
1408 }
1409 }
1410 } else if (auto *Inst = dyn_cast<CmpInst>(Val: I); Inst && IsCmpOp) {
1411 auto *BaseInst = cast<CmpInst>(Val: MainOp);
1412 Type *Ty0 = BaseInst->getOperand(i_nocapture: 0)->getType();
1413 Type *Ty1 = Inst->getOperand(i_nocapture: 0)->getType();
1414 if (Ty0 == Ty1) {
1415 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1416 assert(InstOpcode == AltOpcode &&
1417 "Alternate instructions are only supported by BinaryOperator "
1418 "and CastInst.");
1419 // Check for compatible operands. If the corresponding operands are not
1420 // compatible, we need to perform alternate vectorization.
1421 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1422 CmpInst::Predicate SwappedCurrentPred =
1423 CmpInst::getSwappedPredicate(pred: CurrentPred);
1424
1425 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1426 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1427 continue;
1428
1429 if (isCmpSameOrSwapped(BaseCI: BaseInst, CI: Inst, TLI))
1430 continue;
1431 auto *AltInst = cast<CmpInst>(Val: AltOp);
1432 if (MainOp != AltOp) {
1433 if (isCmpSameOrSwapped(BaseCI: AltInst, CI: Inst, TLI))
1434 continue;
1435 } else if (BasePred != CurrentPred) {
1436 assert(
1437 isValidForAlternation(InstOpcode) &&
1438 "CmpInst isn't safe for alternation, logic needs to be updated!");
1439 AltOp = I;
1440 continue;
1441 }
1442 CmpInst::Predicate AltPred = AltInst->getPredicate();
1443 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1444 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1445 continue;
1446 }
1447 } else if (InstOpcode == Opcode) {
1448 assert(InstOpcode == AltOpcode &&
1449 "Alternate instructions are only supported by BinaryOperator and "
1450 "CastInst.");
1451 if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
1452 if (Gep->getNumOperands() != 2 ||
1453 Gep->getOperand(i_nocapture: 0)->getType() != MainOp->getOperand(i: 0)->getType())
1454 return InstructionsState::invalid();
1455 } else if (auto *EI = dyn_cast<ExtractElementInst>(Val: I)) {
1456 if (!isVectorLikeInstWithConstOps(V: EI))
1457 return InstructionsState::invalid();
1458 } else if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
1459 auto *BaseLI = cast<LoadInst>(Val: MainOp);
1460 if (!LI->isSimple() || !BaseLI->isSimple())
1461 return InstructionsState::invalid();
1462 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
1463 auto *CallBase = cast<CallInst>(Val: MainOp);
1464 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1465 return InstructionsState::invalid();
1466 if (Call->hasOperandBundles() &&
1467 (!CallBase->hasOperandBundles() ||
1468 !std::equal(first1: Call->op_begin() + Call->getBundleOperandsStartIndex(),
1469 last1: Call->op_begin() + Call->getBundleOperandsEndIndex(),
1470 first2: CallBase->op_begin() +
1471 CallBase->getBundleOperandsStartIndex())))
1472 return InstructionsState::invalid();
1473 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI: &TLI);
1474 if (ID != BaseID)
1475 return InstructionsState::invalid();
1476 if (!ID) {
1477 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(CI: *Call);
1478 if (Mappings.size() != BaseMappings.size() ||
1479 Mappings.front().ISA != BaseMappings.front().ISA ||
1480 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1481 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1482 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1483 Mappings.front().Shape.Parameters !=
1484 BaseMappings.front().Shape.Parameters)
1485 return InstructionsState::invalid();
1486 }
1487 }
1488 continue;
1489 }
1490 return InstructionsState::invalid();
1491 }
1492
1493 if (IsBinOp) {
1494 MainOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getMainOpcode());
1495 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1496 AltOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getAltOpcode());
1497 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1498 }
1499 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1500 "Incorrect implementation of allSameOpcode.");
1501 InstructionsState S(MainOp, AltOp);
1502 assert(all_of(VL,
1503 [&](Value *V) {
1504 return isa<PoisonValue>(V) ||
1505 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1506 }) &&
1507 "Invalid InstructionsState.");
1508 return S;
1509}
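// Illustrative example (hypothetical IR, not taken from this file): for
// VL = { add %a, %b; sub %c, %d; add %e, %f; sub %g, %h } the loop above takes
// the BinaryOperator path, BinOpHelper records the add/sub pair, and the
// returned state pairs one opcode as MainOp and the other as AltOp, i.e. an
// alternate (add/sub) node rather than a single-opcode node.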
1510
1511/// \returns true if all of the values in \p VL have the same type or false
1512/// otherwise.
1513static bool allSameType(ArrayRef<Value *> VL) {
1514 Type *Ty = VL.front()->getType();
1515 return all_of(Range: VL.drop_front(), P: [&](Value *V) { return V->getType() == Ty; });
1516}
1517
1518/// \returns True if an in-tree use also needs an extract. This refers to a
1519/// possible scalar operand of a vectorized instruction.
1520static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1521 TargetLibraryInfo *TLI,
1522 const TargetTransformInfo *TTI) {
1523 if (!UserInst)
1524 return false;
1525 unsigned Opcode = UserInst->getOpcode();
1526 switch (Opcode) {
1527 case Instruction::Load: {
1528 LoadInst *LI = cast<LoadInst>(Val: UserInst);
1529 return (LI->getPointerOperand() == Scalar);
1530 }
1531 case Instruction::Store: {
1532 StoreInst *SI = cast<StoreInst>(Val: UserInst);
1533 return (SI->getPointerOperand() == Scalar);
1534 }
1535 case Instruction::Call: {
1536 CallInst *CI = cast<CallInst>(Val: UserInst);
1537 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1538 return any_of(Range: enumerate(First: CI->args()), P: [&](auto &&Arg) {
1539 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1540 Arg.value().get() == Scalar;
1541 });
1542 }
1543 default:
1544 return false;
1545 }
1546}
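// For illustration (hypothetical user, not taken from the surrounding code): if
// Scalar is %p and UserInst is "store i32 %v, ptr %p", the vectorized store
// still needs %p as a scalar pointer operand, so this returns true; if Scalar
// were only the stored value %v, it would return false.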
1547
1548/// \returns the AA location that is being accessed by the instruction.
1549static MemoryLocation getLocation(Instruction *I) {
1550 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
1551 return MemoryLocation::get(SI);
1552 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
1553 return MemoryLocation::get(LI);
1554 return MemoryLocation();
1555}
1556
1557/// \returns True if the instruction is not a volatile or atomic load/store.
1558static bool isSimple(Instruction *I) {
1559 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
1560 return LI->isSimple();
1561 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
1562 return SI->isSimple();
1563 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: I))
1564 return !MI->isVolatile();
1565 return true;
1566}
1567
1568/// Shuffles \p Mask in accordance with the given \p SubMask.
1569/// \param ExtendingManyInputs If true, supports reshuffling of the mask with
1570/// not only one but two input vectors.
1571static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1572 bool ExtendingManyInputs = false) {
1573 if (SubMask.empty())
1574 return;
1575 assert(
1576 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1577 // Check if input scalars were extended to match the size of other node.
1578 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1579 "SubMask with many inputs support must be larger than the mask.");
1580 if (Mask.empty()) {
1581 Mask.append(in_start: SubMask.begin(), in_end: SubMask.end());
1582 return;
1583 }
1584 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1585 int TermValue = std::min(a: Mask.size(), b: SubMask.size());
1586 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1587 if (SubMask[I] == PoisonMaskElem ||
1588 (!ExtendingManyInputs &&
1589 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1590 continue;
1591 NewMask[I] = Mask[SubMask[I]];
1592 }
1593 Mask.swap(RHS&: NewMask);
1594}
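// Worked example with hypothetical masks: for Mask = {1,0,3,2} and
// SubMask = {2,3,0,1} (ExtendingManyInputs == false), each SubMask element
// indexes into the existing Mask, so NewMask[I] = Mask[SubMask[I]] and Mask
// becomes {3,2,1,0}; PoisonMaskElem entries stay PoisonMaskElem.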
1595
1596/// Order may have elements assigned the special value (size), which is out of
1597/// bounds. Such indices appear only at positions which correspond to undef
1598/// values (see canReuseExtract for details) and are used to prevent undef
1599/// values from affecting the ordering of the operands.
1600/// The first loop below simply finds all unused indices and then the next loop
1601/// nest assigns these indices to the positions of the undef values.
1602/// As an example below Order has two undef positions and they have assigned
1603/// values 3 and 7 respectively:
1604/// before: 6 9 5 4 9 2 1 0
1605/// after: 6 3 5 4 7 2 1 0
1606static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1607 const size_t Sz = Order.size();
1608 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1609 SmallBitVector MaskedIndices(Sz);
1610 for (unsigned I = 0; I < Sz; ++I) {
1611 if (Order[I] < Sz)
1612 UnusedIndices.reset(Idx: Order[I]);
1613 else
1614 MaskedIndices.set(I);
1615 }
1616 if (MaskedIndices.none())
1617 return;
1618 assert(UnusedIndices.count() == MaskedIndices.count() &&
1619 "Non-synced masked/available indices.");
1620 int Idx = UnusedIndices.find_first();
1621 int MIdx = MaskedIndices.find_first();
1622 while (MIdx >= 0) {
1623 assert(Idx >= 0 && "Indices must be synced.");
1624 Order[MIdx] = Idx;
1625 Idx = UnusedIndices.find_next(Prev: Idx);
1626 MIdx = MaskedIndices.find_next(Prev: MIdx);
1627 }
1628}
1629
1630/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1631/// Opcode1.
1632static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1633 unsigned Opcode0, unsigned Opcode1) {
1634 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
1635 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1636 for (unsigned Lane : seq<unsigned>(Size: VL.size())) {
1637 if (isa<PoisonValue>(Val: VL[Lane]))
1638 continue;
1639 if (cast<Instruction>(Val: VL[Lane])->getOpcode() == Opcode1)
1640 OpcodeMask.set(I: Lane * ScalarTyNumElements,
1641 E: Lane * ScalarTyNumElements + ScalarTyNumElements);
1642 }
1643 return OpcodeMask;
1644}
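// For illustration, assuming VL = { add, sub, add, sub }, ScalarTy == i32 (so
// ScalarTyNumElements == 1), Opcode0 == Add and Opcode1 == Sub: lanes 1 and 3
// carry Opcode1, so bits 1 and 3 of the returned mask are set; poison lanes
// are simply skipped and left unset.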
1645
1646/// Replicates the given \p Val \p VF times.
1647static SmallVector<Constant *> replicateMask(ArrayRef<Constant *> Val,
1648 unsigned VF) {
1649 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1650 "Expected scalar constants.");
1651 SmallVector<Constant *> NewVal(Val.size() * VF);
1652 for (auto [I, V] : enumerate(First&: Val))
1653 std::fill_n(first: NewVal.begin() + I * VF, n: VF, value: V);
1654 return NewVal;
1655}
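// For illustration with hypothetical constants: replicateMask({C0, C1}, /*VF=*/3)
// returns {C0, C0, C0, C1, C1, C1}, i.e. each scalar constant is repeated VF
// times in place.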
1656
1657namespace llvm {
1658
1659static void inversePermutation(ArrayRef<unsigned> Indices,
1660 SmallVectorImpl<int> &Mask) {
1661 Mask.clear();
1662 const unsigned E = Indices.size();
1663 Mask.resize(N: E, NV: PoisonMaskElem);
1664 for (unsigned I = 0; I < E; ++I)
1665 Mask[Indices[I]] = I;
1666}
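// For illustration: since Mask[Indices[I]] = I, inversePermutation({2, 0, 1},
// Mask) produces Mask = {1, 2, 0}, the inverse of the permutation described by
// Indices.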
1667
1668/// Reorders the list of scalars in accordance with the given \p Mask.
1669static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1670 ArrayRef<int> Mask) {
1671 assert(!Mask.empty() && "Expected non-empty mask.");
1672 SmallVector<Value *> Prev(Scalars.size(),
1673 PoisonValue::get(T: Scalars.front()->getType()));
1674 Prev.swap(RHS&: Scalars);
1675 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1676 if (Mask[I] != PoisonMaskElem)
1677 Scalars[Mask[I]] = Prev[I];
1678}
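// For illustration with hypothetical scalars: reorderScalars on {a, b, c} with
// Mask = {2, 0, 1} places Prev[I] at position Mask[I], yielding {b, c, a};
// lanes whose mask element is PoisonMaskElem are left as poison of the scalar
// type.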
1679
1680/// Checks if the provided value does not require scheduling. It does not
1681/// require scheduling if this is not an instruction or it is an instruction
1682/// that does not read/write memory and all of its operands are either not
1683/// instructions, or are phi nodes or instructions from different blocks.
1684static bool areAllOperandsNonInsts(Value *V) {
1685 auto *I = dyn_cast<Instruction>(Val: V);
1686 if (!I)
1687 return true;
1688 return !mayHaveNonDefUseDependency(I: *I) &&
1689 all_of(Range: I->operands(), P: [I](Value *V) {
1690 auto *IO = dyn_cast<Instruction>(Val: V);
1691 if (!IO)
1692 return true;
1693 return isa<PHINode>(Val: IO) || IO->getParent() != I->getParent();
1694 });
1695}
1696
1697/// Checks if the provided value does not require scheduling. It does not
1698/// require scheduling if this is not an instruction or it is an instruction
1699/// that does not read/write memory and all of its users are phi nodes or
1700/// instructions from different blocks.
1701static bool isUsedOutsideBlock(Value *V) {
1702 auto *I = dyn_cast<Instruction>(Val: V);
1703 if (!I)
1704 return true;
1705 // Limits the number of uses to save compile time.
1706 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(N: UsesLimit) &&
1707 all_of(Range: I->users(), P: [I](User *U) {
1708 auto *IU = dyn_cast<Instruction>(Val: U);
1709 if (!IU)
1710 return true;
1711 return IU->getParent() != I->getParent() || isa<PHINode>(Val: IU);
1712 });
1713}
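// For illustration (hypothetical IR): an add whose only user is a phi in a
// successor block, and which neither reads nor writes memory, satisfies this
// check; a value feeding a non-phi instruction in its own block does not.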
1714
1715/// Checks if the specified value does not require scheduling. It does not
1716/// require scheduling if all operands and all users do not need to be scheduled
1717/// in the current basic block.
1718static bool doesNotNeedToBeScheduled(Value *V) {
1719 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1720}
1721
1722/// Checks if the specified array of instructions does not require scheduling.
1723/// That is the case if either all instructions have operands that do not
1724/// require scheduling, or all their users do not require scheduling because
1725/// they are phis or live in other basic blocks.
1726static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1727 return !VL.empty() &&
1728 (all_of(Range&: VL, P: isUsedOutsideBlock) || all_of(Range&: VL, P: areAllOperandsNonInsts));
1729}
1730
1731/// Returns true if the widened type of \p Ty elements with size \p Sz
1732/// represents a full vector type, i.e. adding an extra element results in
1733/// extra parts upon type legalization.
1734static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1735 unsigned Sz) {
1736 if (Sz <= 1)
1737 return false;
1738 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Val: Ty))
1739 return false;
1740 if (has_single_bit(Value: Sz))
1741 return true;
1742 const unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
1743 return NumParts > 0 && NumParts < Sz && has_single_bit(Value: Sz / NumParts) &&
1744 Sz % NumParts == 0;
1745}
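// For illustration (target dependent, so only a sketch): any power-of-2 Sz > 1
// with a valid element type returns true immediately. For a non-power-of-2
// size such as Sz == 12 with i32 elements, the answer depends on
// TTI.getNumberOfParts(<12 x i32>): if it were to report 3 parts, then
// 12 / 3 == 4 is a power of 2 and 12 % 3 == 0, so the size is treated as a set
// of full registers and the function returns true.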
1746
1747/// Returns the number of parts the type \p VecTy will be split into at the
1748/// codegen phase. If the type is going to be scalarized or does not use whole
1749/// registers, returns 1.
1750static unsigned
1751getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1752 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1753 unsigned NumParts = TTI.getNumberOfParts(Tp: VecTy);
1754 if (NumParts == 0 || NumParts >= Limit)
1755 return 1;
1756 unsigned Sz = getNumElements(Ty: VecTy);
1757 if (NumParts >= Sz || Sz % NumParts != 0 ||
1758 !hasFullVectorsOrPowerOf2(TTI, Ty: VecTy->getElementType(), Sz: Sz / NumParts))
1759 return 1;
1760 return NumParts;
1761}
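// For illustration (again target dependent): if TTI were to report 2 parts for
// a hypothetical <8 x i64>, each part holds 8 / 2 == 4 elements, which forms a
// full vector, so the function returns 2; if NumParts were 0, >= Limit, or the
// per-part element count did not form full vectors, it would return 1.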
1762
1763namespace slpvectorizer {
1764
1765/// Bottom Up SLP Vectorizer.
1766class BoUpSLP {
1767 class TreeEntry;
1768 class ScheduleEntity;
1769 class ScheduleData;
1770 class ScheduleBundle;
1771 class ShuffleCostEstimator;
1772 class ShuffleInstructionBuilder;
1773
1774public:
1775 /// Tracks the state in which the loads in the given sequence can be represented.
1776 enum class LoadsState {
1777 Gather,
1778 Vectorize,
1779 ScatterVectorize,
1780 StridedVectorize,
1781 CompressVectorize
1782 };
1783
1784 using ValueList = SmallVector<Value *, 8>;
1785 using InstrList = SmallVector<Instruction *, 16>;
1786 using ValueSet = SmallPtrSet<Value *, 16>;
1787 using StoreList = SmallVector<StoreInst *, 8>;
1788 using ExtraValueToDebugLocsMap = SmallDenseSet<Value *, 4>;
1789 using OrdersType = SmallVector<unsigned, 4>;
1790
1791 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1792 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1793 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1794 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1795 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1796 AC(AC), DB(DB), DL(DL), ORE(ORE),
1797 Builder(Se->getContext(), TargetFolder(*DL)) {
1798 CodeMetrics::collectEphemeralValues(L: F, AC, EphValues);
1799 // Use the vector register size specified by the target unless overridden
1800 // by a command-line option.
1801 // TODO: It would be better to limit the vectorization factor based on
1802 // data type rather than just register size. For example, x86 AVX has
1803 // 256-bit registers, but it does not support integer operations
1804 // at that width (that requires AVX2).
1805 if (MaxVectorRegSizeOption.getNumOccurrences())
1806 MaxVecRegSize = MaxVectorRegSizeOption;
1807 else
1808 MaxVecRegSize =
1809 TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
1810 .getFixedValue();
1811
1812 if (MinVectorRegSizeOption.getNumOccurrences())
1813 MinVecRegSize = MinVectorRegSizeOption;
1814 else
1815 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1816 }
1817
1818 /// Vectorize the tree that starts with the elements in \p VL.
1819 /// Returns the vectorized root.
1820 Value *vectorizeTree();
1821
1822 /// Vectorize the tree but with the list of externally used values \p
1823 /// ExternallyUsedValues. Values in this set can be replaced by the
1824 /// generated extract instructions.
1825 Value *vectorizeTree(
1826 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1827 Instruction *ReductionRoot = nullptr,
1828 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
1829
1830 /// \returns the cost incurred by unwanted spills and fills, caused by
1831 /// holding live values over call sites.
1832 InstructionCost getSpillCost();
1833
1834 /// \returns the vectorization cost of the subtree that starts at \p VL.
1835 /// A negative number means that this is profitable.
1836 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
1837 InstructionCost ReductionCost = TTI::TCC_Free);
1838
1839 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1840 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1841 void buildTree(ArrayRef<Value *> Roots,
1842 const SmallDenseSet<Value *> &UserIgnoreLst);
1843
1844 /// Construct a vectorizable tree that starts at \p Roots.
1845 void buildTree(ArrayRef<Value *> Roots);
1846
1847 /// Return the scalars of the root node.
1848 ArrayRef<Value *> getRootNodeScalars() const {
1849 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1850 return VectorizableTree.front()->Scalars;
1851 }
1852
1853 /// Returns the type/is-signed info for the root node in the graph without
1854 /// casting.
1855 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
1856 const TreeEntry &Root = *VectorizableTree.front();
1857 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1858 !Root.Scalars.front()->getType()->isIntegerTy())
1859 return std::nullopt;
1860 auto It = MinBWs.find(Val: &Root);
1861 if (It != MinBWs.end())
1862 return std::make_pair(x: IntegerType::get(C&: Root.Scalars.front()->getContext(),
1863 NumBits: It->second.first),
1864 y: It->second.second);
1865 if (Root.getOpcode() == Instruction::ZExt ||
1866 Root.getOpcode() == Instruction::SExt)
1867 return std::make_pair(x: cast<CastInst>(Val: Root.getMainOp())->getSrcTy(),
1868 y: Root.getOpcode() == Instruction::SExt);
1869 return std::nullopt;
1870 }
1871
1872 /// Checks if the root graph node can be emitted with narrower bitwidth at
1873 /// codegen and returns its signedness, if so.
1874 bool isSignedMinBitwidthRootNode() const {
1875 return MinBWs.at(Val: VectorizableTree.front().get()).second;
1876 }
1877
1878 /// Returns the reduction type after minbitwidth analysis.
1879 FixedVectorType *getReductionType() const {
1880 if (ReductionBitWidth == 0 ||
1881 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
1882 ReductionBitWidth >=
1883 DL->getTypeSizeInBits(
1884 Ty: VectorizableTree.front()->Scalars.front()->getType()))
1885 return getWidenedType(
1886 ScalarTy: VectorizableTree.front()->Scalars.front()->getType(),
1887 VF: VectorizableTree.front()->getVectorFactor());
1888 return getWidenedType(
1889 ScalarTy: IntegerType::get(
1890 C&: VectorizableTree.front()->Scalars.front()->getContext(),
1891 NumBits: ReductionBitWidth),
1892 VF: VectorizableTree.front()->getVectorFactor());
1893 }
1894
1895 /// Builds external uses of the vectorized scalars, i.e. the list of
1896 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1897 /// ExternallyUsedValues contains an additional list of external uses to handle
1898 /// vectorization of reductions.
1899 void
1900 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1901
1902 /// Transforms graph nodes to target specific representations, if profitable.
1903 void transformNodes();
1904
1905 /// Clear the internal data structures that are created by 'buildTree'.
1906 void deleteTree() {
1907 VectorizableTree.clear();
1908 ScalarToTreeEntries.clear();
1909 OperandsToTreeEntry.clear();
1910 ScalarsInSplitNodes.clear();
1911 MustGather.clear();
1912 NonScheduledFirst.clear();
1913 EntryToLastInstruction.clear();
1914 LoadEntriesToVectorize.clear();
1915 IsGraphTransformMode = false;
1916 GatheredLoadsEntriesFirst.reset();
1917 CompressEntryToData.clear();
1918 ExternalUses.clear();
1919 ExternalUsesAsOriginalScalar.clear();
1920 for (auto &Iter : BlocksSchedules) {
1921 BlockScheduling *BS = Iter.second.get();
1922 BS->clear();
1923 }
1924 MinBWs.clear();
1925 ReductionBitWidth = 0;
1926 BaseGraphSize = 1;
1927 CastMaxMinBWSizes.reset();
1928 ExtraBitWidthNodes.clear();
1929 InstrElementSize.clear();
1930 UserIgnoreList = nullptr;
1931 PostponedGathers.clear();
1932 ValueToGatherNodes.clear();
1933 }
1934
1935 unsigned getTreeSize() const { return VectorizableTree.size(); }
1936
1937 /// Returns the base graph size, before any transformations.
1938 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1939
1940 /// Perform LICM and CSE on the newly generated gather sequences.
1941 void optimizeGatherSequence();
1942
1943 /// Does this non-empty order represent an identity order? Identity
1944 /// should be represented as an empty order, so this is used to
1945 /// decide if we can canonicalize a computed order. Undef elements
1946 /// (represented as size) are ignored.
1947 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
1948 assert(!Order.empty() && "expected non-empty order");
1949 const unsigned Sz = Order.size();
1950 return all_of(Range: enumerate(First&: Order), P: [&](const auto &P) {
1951 return P.value() == P.index() || P.value() == Sz;
1952 });
1953 }
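// For illustration: with Sz == 4, Order == {0, 1, 4, 3} counts as an identity
// order because the out-of-bounds value 4 (== Sz) marks an ignored undef
// position, whereas {1, 0, 2, 3} does not.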
1954
1955 /// Checks if the specified gather tree entry \p TE can be represented as a
1956 /// shuffled vector entry + (possibly) permutation with other gathers. It
1957 /// implements the checks only for possibly ordered scalars (Loads,
1958 /// ExtractElement, ExtractValue), which can be part of the graph.
1959 /// \param TopToBottom If true, used for the whole-tree rotation; if false,
1960 /// for sub-tree rotations.
1961 /// \param IgnoreReorder If true, the order of the root node might be ignored.
1962 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
1963 bool TopToBottom,
1964 bool IgnoreReorder);
1965
1966 /// Sort loads into increasing pointer offsets to allow greater clustering.
1967 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1968
1969 /// Gets reordering data for the given tree entry. If the entry is vectorized
1970 /// - just return ReorderIndices, otherwise check if the scalars can be
1971 /// reordered and return the most optimal order.
1972 /// \return std::nullopt if ordering is not important, empty order, if
1973 /// identity order is important, or the actual order.
1974 /// \param TopToBottom If true, include the order of vectorized stores and
1975 /// insertelement nodes, otherwise skip them.
1976 /// \param IgnoreReorder If true, the root node order can be ignored.
1977 std::optional<OrdersType>
1978 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
1979
1980 /// Checks if it is profitable to reorder the current tree.
1981 /// If the tree does not contain many profitably reorderable nodes, it is
1982 /// better to skip reordering to save compile time.
1983 bool isProfitableToReorder() const;
1984
1985 /// Reorders the current graph to the most profitable order starting from the
1986 /// root node to the leaf nodes. The best order is chosen only from the nodes
1987 /// of the same size (vectorization factor). Smaller nodes are considered
1988 /// parts of a subgraph with a smaller VF and are reordered independently. We
1989 /// can do this because we still need to extend smaller nodes to the wider VF
1990 /// and we can merge the reordering shuffles with the widening shuffles.
1991 void reorderTopToBottom();
1992
1993 /// Reorders the current graph to the most profitable order starting from
1994 /// the leaves to the root. It allows rotating small subgraphs and reducing
1995 /// the number of reshuffles if the leaf nodes use the same order. In this
1996 /// case we can merge the orders and just shuffle the user node instead of
1997 /// shuffling its operands. Moreover, even if the leaf nodes have different
1998 /// orders, it allows sinking reordering in the graph closer to the root node
1999 /// and merging it later during analysis.
2000 void reorderBottomToTop(bool IgnoreReorder = false);
2001
2002 /// \return The vector element size in bits to use when vectorizing the
2003 /// expression tree ending at \p V. If V is a store, the size is the width of
2004 /// the stored value. Otherwise, the size is the width of the largest loaded
2005 /// value reaching V. This method is used by the vectorizer to calculate
2006 /// vectorization factors.
2007 unsigned getVectorElementSize(Value *V);
2008
2009 /// Compute the minimum type sizes required to represent the entries in a
2010 /// vectorizable tree.
2011 void computeMinimumValueSizes();
2012
2013 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2014 unsigned getMaxVecRegSize() const {
2015 return MaxVecRegSize;
2016 }
2017
2018 // \returns minimum vector register size as set by cl::opt.
2019 unsigned getMinVecRegSize() const {
2020 return MinVecRegSize;
2021 }
2022
2023 unsigned getMinVF(unsigned Sz) const {
2024 return std::max(a: 2U, b: getMinVecRegSize() / Sz);
2025 }
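// For illustration: with MinVecRegSize == 128 and 32-bit elements,
// getMinVF(32) returns std::max(2U, 128 / 32) == 4; for 128-bit elements the
// ratio is 1, so the result is clamped to the minimum of 2 lanes.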
2026
2027 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2028 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2029 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2030 return MaxVF ? MaxVF : UINT_MAX;
2031 }
2032
2033 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2034 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2035 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2036 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2037 ///
2038 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2039 unsigned canMapToVector(Type *T) const;
2040
2041 /// \returns True if the VectorizableTree is both tiny and not fully
2042 /// vectorizable. We do not vectorize such trees.
2043 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2044
2045 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2046 /// It may happen, if all gather nodes are loads and they cannot be
2047 /// "clusterized". In this case even subgraphs cannot be vectorized more
2048 /// effectively than the base graph.
2049 bool isTreeNotExtendable() const;
2050
2051 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2052 /// can be load combined in the backend. Load combining may not be allowed in
2053 /// the IR optimizer, so we do not want to alter the pattern. For example,
2054 /// partially transforming a scalar bswap() pattern into vector code is
2055 /// effectively impossible for the backend to undo.
2056 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2057 /// may not be necessary.
2058 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2059
2060 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2061 /// can be load combined in the backend. Load combining may not be allowed in
2062 /// the IR optimizer, so we do not want to alter the pattern. For example,
2063 /// partially transforming a scalar bswap() pattern into vector code is
2064 /// effectively impossible for the backend to undo.
2065 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2066 /// may not be necessary.
2067 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2068
2069 /// Checks if the given array of loads can be represented as a vectorized,
2070 /// scatter or just simple gather.
2071 /// \param VL list of loads.
2072 /// \param VL0 main load value.
2073 /// \param Order returned order of load instructions.
2074 /// \param PointerOps returned list of pointer operands.
2075 /// \param BestVF return best vector factor, if recursive check found better
2076 /// vectorization sequences rather than masked gather.
2077 /// \param TryRecursiveCheck used to check if a long masked gather can be
2078 /// represented as a series of loads/insert subvector, if profitable.
2079 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2080 SmallVectorImpl<unsigned> &Order,
2081 SmallVectorImpl<Value *> &PointerOps,
2082 unsigned *BestVF = nullptr,
2083 bool TryRecursiveCheck = true) const;
2084
2085 /// Registers a non-vectorizable sequence of loads.
2086 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2087 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2088 }
2089
2090 /// Checks if the given sequence of loads is known to be non-vectorizable.
2091 template <typename T>
2092 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
2093 return ListOfKnonwnNonVectorizableLoads.contains(V: hash_value(VL));
2094 }
2095
2096 OptimizationRemarkEmitter *getORE() { return ORE; }
2097
2098 /// This structure holds any data we need about the edges being traversed
2099 /// during buildTreeRec(). We keep track of:
2100 /// (i) the user TreeEntry index, and
2101 /// (ii) the index of the edge.
2102 struct EdgeInfo {
2103 EdgeInfo() = default;
2104 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2105 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2106 /// The user TreeEntry.
2107 TreeEntry *UserTE = nullptr;
2108 /// The operand index of the use.
2109 unsigned EdgeIdx = UINT_MAX;
2110#ifndef NDEBUG
2111 friend inline raw_ostream &operator<<(raw_ostream &OS,
2112 const BoUpSLP::EdgeInfo &EI) {
2113 EI.dump(OS);
2114 return OS;
2115 }
2116 /// Debug print.
2117 void dump(raw_ostream &OS) const {
2118 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2119 << " EdgeIdx:" << EdgeIdx << "}";
2120 }
2121 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2122#endif
2123 bool operator == (const EdgeInfo &Other) const {
2124 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2125 }
2126
2127 operator bool() const { return UserTE != nullptr; }
2128 };
2129
2130 /// A helper class used for scoring candidates for two consecutive lanes.
2131 class LookAheadHeuristics {
2132 const TargetLibraryInfo &TLI;
2133 const DataLayout &DL;
2134 ScalarEvolution &SE;
2135 const BoUpSLP &R;
2136 int NumLanes; // Total number of lanes (aka vectorization factor).
2137 int MaxLevel; // The maximum recursion depth for accumulating score.
2138
2139 public:
2140 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2141 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2142 int MaxLevel)
2143 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2144 MaxLevel(MaxLevel) {}
2145
2146 // The hard-coded scores listed here are not very important, though they
2147 // should be higher for better matches to improve the resulting cost. When
2148 // computing the scores of matching one sub-tree with another, we are
2149 // basically counting the number of values that are matching. So even if all
2150 // scores are set to 1, we would still get a decent matching result.
2151 // However, sometimes we have to break ties. For example we may have to
2152 // choose between matching loads vs matching opcodes. This is what these
2153 // scores are helping us with: they provide the order of preference. Also,
2154 // this is important if the scalar is externally used or used in another
2155 // tree entry node in the different lane.
2156
2157 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2158 static const int ScoreConsecutiveLoads = 4;
2159 /// The same load multiple times. This should have a better score than
2160 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2161 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
2162 /// a vector load and 1.0 for a broadcast.
2163 static const int ScoreSplatLoads = 3;
2164 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2165 static const int ScoreReversedLoads = 3;
2166 /// A load candidate for masked gather.
2167 static const int ScoreMaskedGatherCandidate = 1;
2168 /// ExtractElementInst from same vector and consecutive indexes.
2169 static const int ScoreConsecutiveExtracts = 4;
2170 /// ExtractElementInst from same vector and reversed indices.
2171 static const int ScoreReversedExtracts = 3;
2172 /// Constants.
2173 static const int ScoreConstants = 2;
2174 /// Instructions with the same opcode.
2175 static const int ScoreSameOpcode = 2;
2176 /// Instructions with alt opcodes (e.g, add + sub).
2177 static const int ScoreAltOpcodes = 1;
2178 /// Identical instructions (a.k.a. splat or broadcast).
2179 static const int ScoreSplat = 1;
2180 /// Matching with an undef is preferable to failing.
2181 static const int ScoreUndef = 1;
2182 /// Score for failing to find a decent match.
2183 static const int ScoreFail = 0;
2184 /// Score if all users are vectorized.
2185 static const int ScoreAllUserVectorized = 1;
2186
2187 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2188 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2189 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2190 /// MainAltOps.
2191 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2192 ArrayRef<Value *> MainAltOps) const {
2193 if (!isValidElementType(Ty: V1->getType()) ||
2194 !isValidElementType(Ty: V2->getType()))
2195 return LookAheadHeuristics::ScoreFail;
2196
2197 if (V1 == V2) {
2198 if (isa<LoadInst>(Val: V1)) {
2199 // Returns true if the users of V1 and V2 won't need to be extracted.
2200 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2201 // Bail out if we have too many uses to save compilation time.
2202 if (V1->hasNUsesOrMore(N: UsesLimit) || V2->hasNUsesOrMore(N: UsesLimit))
2203 return false;
2204
2205 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2206 return llvm::all_of(Range: V->users(), P: [U1, U2, this](Value *U) {
2207 return U == U1 || U == U2 || R.isVectorized(V: U);
2208 });
2209 };
2210 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2211 };
2212 // A broadcast of a load can be cheaper on some targets.
2213 if (R.TTI->isLegalBroadcastLoad(ElementTy: V1->getType(),
2214 NumElements: ElementCount::getFixed(MinVal: NumLanes)) &&
2215 ((int)V1->getNumUses() == NumLanes ||
2216 AllUsersAreInternal(V1, V2)))
2217 return LookAheadHeuristics::ScoreSplatLoads;
2218 }
2219 return LookAheadHeuristics::ScoreSplat;
2220 }
2221
2222 auto CheckSameEntryOrFail = [&]() {
2223 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V: V1); !TEs1.empty()) {
2224 SmallPtrSet<TreeEntry *, 4> Set(llvm::from_range, TEs1);
2225 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V: V2);
2226 !TEs2.empty() &&
2227 any_of(Range&: TEs2, P: [&](TreeEntry *E) { return Set.contains(Ptr: E); }))
2228 return LookAheadHeuristics::ScoreSplatLoads;
2229 }
2230 return LookAheadHeuristics::ScoreFail;
2231 };
2232
2233 auto *LI1 = dyn_cast<LoadInst>(Val: V1);
2234 auto *LI2 = dyn_cast<LoadInst>(Val: V2);
2235 if (LI1 && LI2) {
2236 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2237 !LI2->isSimple())
2238 return CheckSameEntryOrFail();
2239
2240 std::optional<int64_t> Dist = getPointersDiff(
2241 ElemTyA: LI1->getType(), PtrA: LI1->getPointerOperand(), ElemTyB: LI2->getType(),
2242 PtrB: LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2243 if (!Dist || *Dist == 0) {
2244 if (getUnderlyingObject(V: LI1->getPointerOperand()) ==
2245 getUnderlyingObject(V: LI2->getPointerOperand()) &&
2246 R.TTI->isLegalMaskedGather(
2247 DataType: getWidenedType(ScalarTy: LI1->getType(), VF: NumLanes), Alignment: LI1->getAlign()))
2248 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2249 return CheckSameEntryOrFail();
2250 }
2251 // The distance is too large - still may be profitable to use masked
2252 // loads/gathers.
2253 if (std::abs(i: *Dist) > NumLanes / 2)
2254 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2255 // This still will detect consecutive loads, but we might have "holes"
2256 // in some cases. It is ok for non-power-2 vectorization and may produce
2257 // better results. It should not affect current vectorization.
2258 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2259 : LookAheadHeuristics::ScoreReversedLoads;
2260 }
2261
2262 auto *C1 = dyn_cast<Constant>(Val: V1);
2263 auto *C2 = dyn_cast<Constant>(Val: V2);
2264 if (C1 && C2)
2265 return LookAheadHeuristics::ScoreConstants;
2266
2267 // Extracts from consecutive indexes of the same vector score better, as
2268 // the extracts could be optimized away.
2269 Value *EV1;
2270 ConstantInt *Ex1Idx;
2271 if (match(V: V1, P: m_ExtractElt(Val: m_Value(V&: EV1), Idx: m_ConstantInt(CI&: Ex1Idx)))) {
2272 // Undefs are always profitable for extractelements.
2273 // Compiler can easily combine poison and extractelement <non-poison> or
2274 // undef and extractelement <poison>. But combining undef +
2275 // extractelement <non-poison-but-may-produce-poison> requires some
2276 // extra operations.
2277 if (isa<UndefValue>(Val: V2))
2278 return (isa<PoisonValue>(Val: V2) || isUndefVector(V: EV1).all())
2279 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2280 : LookAheadHeuristics::ScoreSameOpcode;
2281 Value *EV2 = nullptr;
2282 ConstantInt *Ex2Idx = nullptr;
2283 if (match(V: V2,
2284 P: m_ExtractElt(Val: m_Value(V&: EV2), Idx: m_CombineOr(L: m_ConstantInt(CI&: Ex2Idx),
2285 R: m_Undef())))) {
2286 // Undefs are always profitable for extractelements.
2287 if (!Ex2Idx)
2288 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2289 if (isUndefVector(V: EV2).all() && EV2->getType() == EV1->getType())
2290 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2291 if (EV2 == EV1) {
2292 int Idx1 = Ex1Idx->getZExtValue();
2293 int Idx2 = Ex2Idx->getZExtValue();
2294 int Dist = Idx2 - Idx1;
2295 // The distance is too large - still may be profitable to use
2296 // shuffles.
2297 if (std::abs(x: Dist) == 0)
2298 return LookAheadHeuristics::ScoreSplat;
2299 if (std::abs(x: Dist) > NumLanes / 2)
2300 return LookAheadHeuristics::ScoreSameOpcode;
2301 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2302 : LookAheadHeuristics::ScoreReversedExtracts;
2303 }
2304 return LookAheadHeuristics::ScoreAltOpcodes;
2305 }
2306 return CheckSameEntryOrFail();
2307 }
2308
2309 auto *I1 = dyn_cast<Instruction>(Val: V1);
2310 auto *I2 = dyn_cast<Instruction>(Val: V2);
2311 if (I1 && I2) {
2312 if (I1->getParent() != I2->getParent())
2313 return CheckSameEntryOrFail();
2314 SmallVector<Value *, 4> Ops(MainAltOps);
2315 Ops.push_back(Elt: I1);
2316 Ops.push_back(Elt: I2);
2317 InstructionsState S = getSameOpcode(VL: Ops, TLI);
2318 // Note: Only consider instructions with <= 2 operands to avoid
2319 // complexity explosion.
2320 if (S &&
2321 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2322 !S.isAltShuffle()) &&
2323 all_of(Range&: Ops, P: [&S](Value *V) {
2324 return isa<PoisonValue>(Val: V) ||
2325 cast<Instruction>(Val: V)->getNumOperands() ==
2326 S.getMainOp()->getNumOperands();
2327 }))
2328 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2329 : LookAheadHeuristics::ScoreSameOpcode;
2330 }
2331
2332 if (I1 && isa<PoisonValue>(Val: V2))
2333 return LookAheadHeuristics::ScoreSameOpcode;
2334
2335 if (isa<UndefValue>(Val: V2))
2336 return LookAheadHeuristics::ScoreUndef;
2337
2338 return CheckSameEntryOrFail();
2339 }
2340
2341 /// Go through the operands of \p LHS and \p RHS recursively until
2342 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2343 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2344 /// of \p U1 and \p U2), except at the beginning of the recursion where
2345 /// these are set to nullptr.
2346 ///
2347 /// For example:
2348 /// \verbatim
2349 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2350 /// \ / \ / \ / \ /
2351 /// + + + +
2352 /// G1 G2 G3 G4
2353 /// \endverbatim
2354 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2355 /// each level recursively, accumulating the score. It starts from matching
2356 /// the additions at level 0, then moves on to the loads (level 1). The
2357 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2358 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2359 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2360 /// Please note that the order of the operands does not matter, as we
2361 /// evaluate the score of all profitable combinations of operands. In
2362 /// other words the score of G1 and G4 is the same as G1 and G2. This
2363 /// heuristic is based on ideas described in:
2364 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2365 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2366 /// Luís F. W. Góes
2367 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2368 Instruction *U2, int CurrLevel,
2369 ArrayRef<Value *> MainAltOps) const {
2370
2371 // Get the shallow score of V1 and V2.
2372 int ShallowScoreAtThisLevel =
2373 getShallowScore(V1: LHS, V2: RHS, U1, U2, MainAltOps);
2374
2375 // If reached MaxLevel,
2376 // or if V1 and V2 are not instructions,
2377 // or if they are SPLAT,
2378 // or if they are not consecutive,
2379 // or if profitable to vectorize loads or extractelements, early return
2380 // the current cost.
2381 auto *I1 = dyn_cast<Instruction>(Val: LHS);
2382 auto *I2 = dyn_cast<Instruction>(Val: RHS);
2383 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2384 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2385 (((isa<LoadInst>(Val: I1) && isa<LoadInst>(Val: I2)) ||
2386 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2387 (isa<ExtractElementInst>(Val: I1) && isa<ExtractElementInst>(Val: I2))) &&
2388 ShallowScoreAtThisLevel))
2389 return ShallowScoreAtThisLevel;
2390 assert(I1 && I2 && "Should have early exited.");
2391
2392 // Contains the I2 operand indexes that got matched with I1 operands.
2393 SmallSet<unsigned, 4> Op2Used;
2394
2395 // Recursion towards the operands of I1 and I2. We are trying all possible
2396 // operand pairs, and keeping track of the best score.
2397 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2398 OpIdx1 != NumOperands1; ++OpIdx1) {
2399 // Try to pair op1I with the best operand of I2.
2400 int MaxTmpScore = 0;
2401 unsigned MaxOpIdx2 = 0;
2402 bool FoundBest = false;
2403 // If I2 is commutative try all combinations.
2404 unsigned FromIdx = isCommutative(I: I2) ? 0 : OpIdx1;
2405 unsigned ToIdx = isCommutative(I: I2)
2406 ? I2->getNumOperands()
2407 : std::min(a: I2->getNumOperands(), b: OpIdx1 + 1);
2408 assert(FromIdx <= ToIdx && "Bad index");
2409 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2410 // Skip operands already paired with OpIdx1.
2411 if (Op2Used.count(V: OpIdx2))
2412 continue;
2413 // Recursively calculate the cost at each level
2414 int TmpScore =
2415 getScoreAtLevelRec(LHS: I1->getOperand(i: OpIdx1), RHS: I2->getOperand(i: OpIdx2),
2416 U1: I1, U2: I2, CurrLevel: CurrLevel + 1, MainAltOps: {});
2417 // Look for the best score.
2418 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2419 TmpScore > MaxTmpScore) {
2420 MaxTmpScore = TmpScore;
2421 MaxOpIdx2 = OpIdx2;
2422 FoundBest = true;
2423 }
2424 }
2425 if (FoundBest) {
2426 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2427 Op2Used.insert(V: MaxOpIdx2);
2428 ShallowScoreAtThisLevel += MaxTmpScore;
2429 }
2430 }
2431 return ShallowScoreAtThisLevel;
2432 }
2433 };
2434 /// A helper data structure to hold the operands of a vector of instructions.
2435 /// This supports a fixed vector length for all operand vectors.
2436 class VLOperands {
2437 /// For each operand we need (i) the value, and (ii) the opcode that it
2438 /// would be attached to if the expression was in a left-linearized form.
2439 /// This is required to avoid illegal operand reordering.
2440 /// For example:
2441 /// \verbatim
2442 /// 0 Op1
2443 /// |/
2444 /// Op1 Op2 Linearized + Op2
2445 /// \ / ----------> |/
2446 /// - -
2447 ///
2448 /// Op1 - Op2 (0 + Op1) - Op2
2449 /// \endverbatim
2450 ///
2451 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2452 ///
2453 /// Another way to think of this is to track all the operations across the
2454 /// path from the operand all the way to the root of the tree and to
2455 /// calculate the operation that corresponds to this path. For example, the
2456 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2457 /// corresponding operation is a '-' (which matches the one in the
2458 /// linearized tree, as shown above).
2459 ///
2460 /// For lack of a better term, we refer to this operation as Accumulated
2461 /// Path Operation (APO).
2462 struct OperandData {
2463 OperandData() = default;
2464 OperandData(Value *V, bool APO, bool IsUsed)
2465 : V(V), APO(APO), IsUsed(IsUsed) {}
2466 /// The operand value.
2467 Value *V = nullptr;
2468 /// TreeEntries only allow a single opcode, or an alternate sequence of
2469 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
2470 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2471 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2472 /// (e.g., Add/Mul)
2473 bool APO = false;
2474 /// Helper data for the reordering function.
2475 bool IsUsed = false;
2476 };
2477
2478 /// During operand reordering, we are trying to select the operand at lane
2479 /// that matches best with the operand at the neighboring lane. Our
2480 /// selection is based on the type of value we are looking for. For example,
2481 /// if the neighboring lane has a load, we need to look for a load that is
2482 /// accessing a consecutive address. These strategies are summarized in the
2483 /// 'ReorderingMode' enumerator.
2484 enum class ReorderingMode {
2485 Load, ///< Matching loads to consecutive memory addresses
2486 Opcode, ///< Matching instructions based on opcode (same or alternate)
2487 Constant, ///< Matching constants
2488 Splat, ///< Matching the same instruction multiple times (broadcast)
2489 Failed, ///< We failed to create a vectorizable group
2490 };
2491
2492 using OperandDataVec = SmallVector<OperandData, 2>;
2493
2494 /// A vector of operand vectors.
2495 SmallVector<OperandDataVec, 4> OpsVec;
2496 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2497 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2498 unsigned ArgSize = 0;
2499
2500 const TargetLibraryInfo &TLI;
2501 const DataLayout &DL;
2502 ScalarEvolution &SE;
2503 const BoUpSLP &R;
2504 const Loop *L = nullptr;
2505
2506 /// \returns the operand data at \p OpIdx and \p Lane.
2507 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2508 return OpsVec[OpIdx][Lane];
2509 }
2510
2511 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2512 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2513 return OpsVec[OpIdx][Lane];
2514 }
2515
2516 /// Clears the used flag for all entries.
2517 void clearUsed() {
2518 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2519 OpIdx != NumOperands; ++OpIdx)
2520 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2521 ++Lane)
2522 OpsVec[OpIdx][Lane].IsUsed = false;
2523 }
2524
2525 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2526 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2527 std::swap(a&: OpsVec[OpIdx1][Lane], b&: OpsVec[OpIdx2][Lane]);
2528 }
2529
2530 /// \param Lane lane of the operands under analysis.
2531 /// \param OpIdx operand index in \p Lane lane for which we're looking for
2532 /// the best candidate.
2533 /// \param Idx operand index of the current candidate value.
2534 /// \returns The additional score due to possible broadcasting of the
2535 /// elements in the lane. It is more profitable to have a power-of-2 number
2536 /// of unique elements in the lane, since they will be vectorized with higher
2537 /// probability after removing duplicates. Currently the SLP vectorizer
2538 /// supports only vectorization of a power-of-2 number of unique scalars.
2539 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2540 const SmallBitVector &UsedLanes) const {
2541 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
2542 if (!isa<Instruction>(Val: IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2543 isa<ExtractElementInst>(Val: IdxLaneV))
2544 return 0;
2545 SmallDenseMap<Value *, unsigned, 4> Uniques;
2546 for (unsigned Ln : seq<unsigned>(Size: getNumLanes())) {
2547 if (Ln == Lane)
2548 continue;
2549 Value *OpIdxLnV = getData(OpIdx, Lane: Ln).V;
2550 if (!isa<Instruction>(Val: OpIdxLnV))
2551 return 0;
2552 Uniques.try_emplace(Key: OpIdxLnV, Args&: Ln);
2553 }
2554 unsigned UniquesCount = Uniques.size();
2555 auto IdxIt = Uniques.find(Val: IdxLaneV);
2556 unsigned UniquesCntWithIdxLaneV =
2557 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2558 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2559 auto OpIdxIt = Uniques.find(Val: OpIdxLaneV);
2560 unsigned UniquesCntWithOpIdxLaneV =
2561 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2562 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2563 return 0;
2564 return std::min(a: bit_ceil(Value: UniquesCntWithOpIdxLaneV) -
2565 UniquesCntWithOpIdxLaneV,
2566 b: UniquesCntWithOpIdxLaneV -
2567 bit_floor(Value: UniquesCntWithOpIdxLaneV)) -
2568 ((IdxIt != Uniques.end() && UsedLanes.test(Idx: IdxIt->second))
2569 ? UniquesCntWithIdxLaneV - bit_floor(Value: UniquesCntWithIdxLaneV)
2570 : bit_ceil(Value: UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2571 }
2572
2573 /// \param Lane lane of the operands under analysis.
2574 /// \param OpIdx operand index in \p Lane lane for which we're looking for
2575 /// the best candidate.
2576 /// \param Idx operand index of the current candidate value.
2577 /// \returns The additional score for the scalar whose users are all
2578 /// vectorized.
2579 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2580 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
2581 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2582 // Do not care about number of uses for vector-like instructions
2583 // (extractelement/extractvalue with constant indices), they are extracts
2584 // themselves and already externally used. Vectorization of such
2585 // instructions does not add extra extractelement instruction, just may
2586 // remove it.
2587 if (isVectorLikeInstWithConstOps(V: IdxLaneV) &&
2588 isVectorLikeInstWithConstOps(V: OpIdxLaneV))
2589 return LookAheadHeuristics::ScoreAllUserVectorized;
2590 auto *IdxLaneI = dyn_cast<Instruction>(Val: IdxLaneV);
2591 if (!IdxLaneI || !isa<Instruction>(Val: OpIdxLaneV))
2592 return 0;
2593 return R.areAllUsersVectorized(I: IdxLaneI)
2594 ? LookAheadHeuristics::ScoreAllUserVectorized
2595 : 0;
2596 }
2597
2598 /// Score scaling factor for fully compatible instructions but with
2599 /// different number of external uses. Allows better selection of the
2600 /// instructions with fewer external uses.
2601 static const int ScoreScaleFactor = 10;
2602
2603 /// \returns the look-ahead score, which tells us how much the sub-trees
2604 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2605 /// score. This helps break ties in an informed way when we cannot decide on
2606 /// the order of the operands by just considering the immediate
2607 /// predecessors.
2608 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2609 int Lane, unsigned OpIdx, unsigned Idx,
2610 bool &IsUsed, const SmallBitVector &UsedLanes) {
2611 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2612 LookAheadMaxDepth);
2613 // Keep track of the instruction stack as we recurse into the operands
2614 // during the look-ahead score exploration.
2615 int Score =
2616 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2617 /*CurrLevel=*/1, MainAltOps);
2618 if (Score) {
2619 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2620 if (Score <= -SplatScore) {
2621 // Failed score.
2622 Score = 0;
2623 } else {
2624 Score += SplatScore;
2625 // Scale score to see the difference between different operands
2626 // and similar operands but all vectorized/not all vectorized
2627 // uses. It does not affect actual selection of the best
2628 // compatible operand in general, just allows to select the
2629 // operand with all vectorized uses.
2630 Score *= ScoreScaleFactor;
2631 Score += getExternalUseScore(Lane, OpIdx, Idx);
2632 IsUsed = true;
2633 }
2634 }
2635 return Score;
2636 }
2637
2638 /// Best defined scores per lanes between the passes. Used to choose the
2639 /// best operand (with the highest score) between the passes.
2640 /// The key - {Operand Index, Lane}.
2641 /// The value - the best score between the passes for the lane and the
2642 /// operand.
2643 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2644 BestScoresPerLanes;
2645
2646 // Search all operands in Ops[*][Lane] for the one that best matches
2647 // Ops[OpIdx][LastLane] and return its operand index.
2648 // If no good match can be found, return std::nullopt.
2649 std::optional<unsigned>
2650 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2651 ArrayRef<ReorderingMode> ReorderingModes,
2652 ArrayRef<Value *> MainAltOps,
2653 const SmallBitVector &UsedLanes) {
2654 unsigned NumOperands = getNumOperands();
2655
2656 // The operand of the previous lane at OpIdx.
2657 Value *OpLastLane = getData(OpIdx, Lane: LastLane).V;
2658
2659 // Our strategy mode for OpIdx.
2660 ReorderingMode RMode = ReorderingModes[OpIdx];
2661 if (RMode == ReorderingMode::Failed)
2662 return std::nullopt;
2663
2664 // The linearized opcode of the operand at OpIdx, Lane.
2665 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2666
2667 // The best operand index and its score.
2668 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2669 // are using the score to differentiate between the two.
2670 struct BestOpData {
2671 std::optional<unsigned> Idx;
2672 unsigned Score = 0;
2673 } BestOp;
2674 BestOp.Score =
2675 BestScoresPerLanes.try_emplace(Key: std::make_pair(x&: OpIdx, y&: Lane), Args: 0)
2676 .first->second;
2677
2678 // Track if the operand must be marked as used. If the operand is set to
2679 // Score 1 explicitly (because of a non-power-of-2 number of unique scalars),
2680 // we may want to reestimate the operands again on the following iterations.
2681 bool IsUsed = RMode == ReorderingMode::Splat ||
2682 RMode == ReorderingMode::Constant ||
2683 RMode == ReorderingMode::Load;
2684 // Iterate through all unused operands and look for the best.
2685 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2686 // Get the operand at Idx and Lane.
2687 OperandData &OpData = getData(OpIdx: Idx, Lane);
2688 Value *Op = OpData.V;
2689 bool OpAPO = OpData.APO;
2690
2691 // Skip already selected operands.
2692 if (OpData.IsUsed)
2693 continue;
2694
2695 // Skip if we are trying to move the operand to a position with a
2696 // different opcode in the linearized tree form. This would break the
2697 // semantics.
2698 if (OpAPO != OpIdxAPO)
2699 continue;
2700
2701 // Look for an operand that matches the current mode.
2702 switch (RMode) {
2703 case ReorderingMode::Load:
2704 case ReorderingMode::Opcode: {
2705 bool LeftToRight = Lane > LastLane;
2706 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2707 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2708 int Score = getLookAheadScore(LHS: OpLeft, RHS: OpRight, MainAltOps, Lane,
2709 OpIdx, Idx, IsUsed, UsedLanes);
2710 if (Score > static_cast<int>(BestOp.Score) ||
2711 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2712 Idx == OpIdx)) {
2713 BestOp.Idx = Idx;
2714 BestOp.Score = Score;
2715 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] = Score;
2716 }
2717 break;
2718 }
2719 case ReorderingMode::Constant:
2720 if (isa<Constant>(Val: Op) ||
2721 (!BestOp.Score && L && L->isLoopInvariant(V: Op))) {
2722 BestOp.Idx = Idx;
2723 if (isa<Constant>(Val: Op)) {
2724 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2725 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
2726 LookAheadHeuristics::ScoreConstants;
2727 }
2728 if (isa<UndefValue>(Val: Op) || !isa<Constant>(Val: Op))
2729 IsUsed = false;
2730 }
2731 break;
2732 case ReorderingMode::Splat:
2733 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Val: Op))) {
2734 IsUsed = Op == OpLastLane;
2735 if (Op == OpLastLane) {
2736 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2737 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
2738 LookAheadHeuristics::ScoreSplat;
2739 }
2740 BestOp.Idx = Idx;
2741 }
2742 break;
2743 case ReorderingMode::Failed:
2744 llvm_unreachable("Not expected Failed reordering mode.");
2745 }
2746 }
2747
2748 if (BestOp.Idx) {
2749 getData(OpIdx: *BestOp.Idx, Lane).IsUsed = IsUsed;
2750 return BestOp.Idx;
2751 }
2752 // If we could not find a good match return std::nullopt.
2753 return std::nullopt;
2754 }
2755
    /// Helper for reorder().
    /// \returns the lane that we should start reordering from. This is the
    /// one which has the least number of operands that can freely move
    /// about, or which is less profitable to reorder because it already has
    /// the most optimal set of operands.
2760 unsigned getBestLaneToStartReordering() const {
2761 unsigned Min = UINT_MAX;
2762 unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> is used to implement a simple voting
      // algorithm and choose the lane with the least number of operands that
      // can freely move about, or that is less profitable to reorder because
      // it already has the most optimal set of operands. The first unsigned
      // is the vote counter for the operand-ordering hash, the second is the
      // lane that contributed that ordering.
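      // Illustrative note (roughly): lanes first compete on NumOfAPOs
      // (fewer freely movable operands is better), then on the number of
      // operands sharing the same opcode and parent block. Remaining ties
      // are resolved by voting: each lane contributes a hash of its operand
      // ordering, and the selection below picks the lane associated with
      // the hash bucket that received the fewest votes.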
2769 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
2770 // Try to be closer to the original results, if we have multiple lanes
2771 // with same cost. If 2 lanes have the same cost, use the one with the
2772 // highest index.
2773 for (int I = getNumLanes(); I > 0; --I) {
2774 unsigned Lane = I - 1;
2775 OperandsOrderData NumFreeOpsHash =
2776 getMaxNumOperandsThatCanBeReordered(Lane);
2777 // Compare the number of operands that can move and choose the one with
2778 // the least number.
2779 if (NumFreeOpsHash.NumOfAPOs < Min) {
2780 Min = NumFreeOpsHash.NumOfAPOs;
2781 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2782 HashMap.clear();
2783 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
2784 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2785 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2786 // Select the most optimal lane in terms of number of operands that
2787 // should be moved around.
2788 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2789 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
2790 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2791 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2792 auto [It, Inserted] =
2793 HashMap.try_emplace(Key: NumFreeOpsHash.Hash, Args: 1, Args&: Lane);
2794 if (!Inserted)
2795 ++It->second.first;
2796 }
2797 }
2798 // Select the lane with the minimum counter.
2799 unsigned BestLane = 0;
2800 unsigned CntMin = UINT_MAX;
2801 for (const auto &Data : reverse(C&: HashMap)) {
2802 if (Data.second.first < CntMin) {
2803 CntMin = Data.second.first;
2804 BestLane = Data.second.second;
2805 }
2806 }
2807 return BestLane;
2808 }
2809
2810 /// Data structure that helps to reorder operands.
2811 struct OperandsOrderData {
2812 /// The best number of operands with the same APOs, which can be
2813 /// reordered.
2814 unsigned NumOfAPOs = UINT_MAX;
2815 /// Number of operands with the same/alternate instruction opcode and
2816 /// parent.
2817 unsigned NumOpsWithSameOpcodeParent = 0;
2818 /// Hash for the actual operands ordering.
      /// Used to count operands, actually their position ids combined with
      /// opcode values. It is used in the voting mechanism to find the lane
      /// with the least number of operands that can freely move about, or
      /// that is less profitable to reorder because it already has the most
      /// optimal set of operands. Could be replaced with a
      /// SmallVector<unsigned>, but the hash code is faster and requires
      /// less memory.
2825 unsigned Hash = 0;
2826 };
2827 /// \returns the maximum number of operands that are allowed to be reordered
    /// for \p Lane and the number of compatible instructions (with the same
2829 /// parent/opcode). This is used as a heuristic for selecting the first lane
2830 /// to start operand reordering.
2831 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2832 unsigned CntTrue = 0;
2833 unsigned NumOperands = getNumOperands();
2834 // Operands with the same APO can be reordered. We therefore need to count
2835 // how many of them we have for each APO, like this: Cnt[APO] = x.
2836 // Since we only have two APOs, namely true and false, we can avoid using
2837 // a map. Instead we can simply count the number of operands that
2838 // correspond to one of them (in this case the 'true' APO), and calculate
2839 // the other by subtracting it from the total number of operands.
2840 // Operands with the same instruction opcode and parent are more
      // profitable since we don't need to move them in many cases; with high
      // probability such a lane can already be vectorized effectively.
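      // For example, with two operands per lane, a lane like 'b - a' has
      // APOs {false, true}, so CntTrue == 1 and NumOfAPOs becomes
      // max(1, 2 - 1) == 1, while a lane like 'b + c' has APOs
      // {false, false} and NumOfAPOs == 2. The smaller value marks the more
      // constrained lane, which is preferred as the starting point.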
2843 bool AllUndefs = true;
2844 unsigned NumOpsWithSameOpcodeParent = 0;
2845 Instruction *OpcodeI = nullptr;
2846 BasicBlock *Parent = nullptr;
2847 unsigned Hash = 0;
2848 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2849 const OperandData &OpData = getData(OpIdx, Lane);
2850 if (OpData.APO)
2851 ++CntTrue;
2852 // Use Boyer-Moore majority voting for finding the majority opcode and
2853 // the number of times it occurs.
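      // Illustrative sketch: the current majority candidate (OpcodeI,
      // Parent) keeps a counter; an operand with the same opcode/parent
      // increments it, a different instruction decrements it, and when the
      // counter drops to zero the next instruction becomes the new
      // candidate. Non-instruction operands simply do not vote.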
2854 if (auto *I = dyn_cast<Instruction>(Val: OpData.V)) {
2855 if (!OpcodeI || !getSameOpcode(VL: {OpcodeI, I}, TLI) ||
2856 I->getParent() != Parent) {
2857 if (NumOpsWithSameOpcodeParent == 0) {
2858 NumOpsWithSameOpcodeParent = 1;
2859 OpcodeI = I;
2860 Parent = I->getParent();
2861 } else {
2862 --NumOpsWithSameOpcodeParent;
2863 }
2864 } else {
2865 ++NumOpsWithSameOpcodeParent;
2866 }
2867 }
2868 Hash = hash_combine(
2869 args: Hash, args: hash_value(value: (OpIdx + 1) * (OpData.V->getValueID() + 1)));
2870 AllUndefs = AllUndefs && isa<UndefValue>(Val: OpData.V);
2871 }
2872 if (AllUndefs)
2873 return {};
2874 OperandsOrderData Data;
2875 Data.NumOfAPOs = std::max(a: CntTrue, b: NumOperands - CntTrue);
2876 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2877 Data.Hash = Hash;
2878 return Data;
2879 }
2880
2881 /// Go through the instructions in VL and append their operands.
2882 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
2883 const InstructionsState &S) {
2884 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
2885 assert((empty() || all_of(Operands,
2886 [this](const ValueList &VL) {
2887 return VL.size() == getNumLanes();
2888 })) &&
2889 "Expected same number of lanes");
2890 assert(S.valid() && "InstructionsState is invalid.");
2891 // IntrinsicInst::isCommutative returns true if swapping the first "two"
2892 // arguments to the intrinsic produces the same result.
2893 constexpr unsigned IntrinsicNumOperands = 2;
2894 Instruction *MainOp = S.getMainOp();
2895 unsigned NumOperands = MainOp->getNumOperands();
2896 ArgSize = isa<IntrinsicInst>(Val: MainOp) ? IntrinsicNumOperands : NumOperands;
2897 OpsVec.resize(N: ArgSize);
2898 unsigned NumLanes = VL.size();
2899 for (OperandDataVec &Ops : OpsVec)
2900 Ops.resize(N: NumLanes);
2901 for (unsigned Lane : seq<unsigned>(Size: NumLanes)) {
2902 Value *V = VL[Lane];
2903 assert((isa<Instruction>(V) || isa<PoisonValue>(V)) &&
2904 "Expected instruction or poison value");
2905 // Our tree has just 3 nodes: the root and two operands.
2906 // It is therefore trivial to get the APO. We only need to check the
2907 // opcode of V and whether the operand at OpIdx is the LHS or RHS
2908 // operand. The LHS operand of both add and sub is never attached to an
      // inverse operation in the linearized form, therefore its APO is
2910 // false. The RHS is true only if V is an inverse operation.
2911
2912 // Since operand reordering is performed on groups of commutative
2913 // operations or alternating sequences (e.g., +, -), we can safely tell
2914 // the inverse operations by checking commutativity.
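      // For example, for V == 'a - b' the subtraction is not commutative,
      // so IsInverseOperation is true below: operand 0 (a) gets APO == false
      // and operand 1 (b) gets APO == true. For V == 'a + b' both operands
      // get APO == false. (Sketch of the common binary case.)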
2915 if (isa<PoisonValue>(Val: V)) {
2916 for (unsigned OpIdx : seq<unsigned>(Size: NumOperands))
2917 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
2918 continue;
2919 }
2920 auto [SelectedOp, Ops] = convertTo(I: cast<Instruction>(Val: V), S);
2921 // We cannot check commutativity by the converted instruction
2922 // (SelectedOp) because isCommutative also examines def-use
2923 // relationships.
2924 bool IsInverseOperation =
2925 !isCommutative(I: SelectedOp, InstWithUses: cast<Instruction>(Val: V));
2926 for (unsigned OpIdx : seq<unsigned>(Size: ArgSize)) {
2927 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2928 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
2929 }
2930 }
2931 }
2932
2933 /// \returns the number of operands.
2934 unsigned getNumOperands() const { return ArgSize; }
2935
2936 /// \returns the number of lanes.
2937 unsigned getNumLanes() const { return OpsVec[0].size(); }
2938
2939 /// \returns the operand value at \p OpIdx and \p Lane.
2940 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2941 return getData(OpIdx, Lane).V;
2942 }
2943
2944 /// \returns true if the data structure is empty.
2945 bool empty() const { return OpsVec.empty(); }
2946
2947 /// Clears the data.
2948 void clear() { OpsVec.clear(); }
2949
    /// \returns true if there are enough operands identical to \p Op to fill
    /// the whole vector (possibly mixed with constants or loop-invariant
    /// values).
2952 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
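    /// For example, if \p Op appears (unused) as an operand in every other
    /// lane, or the other lanes can contribute constants that may be
    /// permuted into its position, the value can be broadcast and the
    /// remaining elements handled as a shuffle with constants. This is an
    /// informal illustration of the checks performed below.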
2953 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2954 assert(Op == getValue(OpIdx, Lane) &&
2955 "Op is expected to be getValue(OpIdx, Lane).");
2956 // Small number of loads - try load matching.
2957 if (isa<LoadInst>(Val: Op) && getNumLanes() == 2 && getNumOperands() == 2)
2958 return false;
2959 bool OpAPO = getData(OpIdx, Lane).APO;
2960 bool IsInvariant = L && L->isLoopInvariant(V: Op);
2961 unsigned Cnt = 0;
2962 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2963 if (Ln == Lane)
2964 continue;
2965 // This is set to true if we found a candidate for broadcast at Lane.
2966 bool FoundCandidate = false;
2967 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2968 OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
2969 if (Data.APO != OpAPO || Data.IsUsed)
2970 continue;
2971 Value *OpILane = getValue(OpIdx: OpI, Lane);
2972 bool IsConstantOp = isa<Constant>(Val: OpILane);
2973 // Consider the broadcast candidate if:
2974 // 1. Same value is found in one of the operands.
2975 if (Data.V == Op ||
2976 // 2. The operand in the given lane is not constant but there is a
2977 // constant operand in another lane (which can be moved to the
2978 // given lane). In this case we can represent it as a simple
2979 // permutation of constant and broadcast.
2980 (!IsConstantOp &&
2981 ((Lns > 2 && isa<Constant>(Val: Data.V)) ||
                 // 2.1. If we have only 2 lanes, we need to check that the
                 // other operand in that lane does not form a same-opcode
                 // sequence with Op.
2984 (Lns == 2 &&
2985 !getSameOpcode(VL: {Op, getValue(OpIdx: (OpI + 1) % OpE, Lane: Ln)}, TLI) &&
2986 isa<Constant>(Val: Data.V)))) ||
2987 // 3. The operand in the current lane is loop invariant (can be
2988 // hoisted out) and another operand is also a loop invariant
2989 // (though not a constant). In this case the whole vector can be
2990 // hoisted out.
2991 // FIXME: need to teach the cost model about this case for better
2992 // estimation.
2993 (IsInvariant && !isa<Constant>(Val: Data.V) &&
2994 !getSameOpcode(VL: {Op, Data.V}, TLI) &&
2995 L->isLoopInvariant(V: Data.V))) {
2996 FoundCandidate = true;
2997 Data.IsUsed = Data.V == Op;
2998 if (Data.V == Op)
2999 ++Cnt;
3000 break;
3001 }
3002 }
3003 if (!FoundCandidate)
3004 return false;
3005 }
3006 return getNumLanes() == 2 || Cnt > 1;
3007 }
3008
    /// Checks if there is at least one operand in a lane other than \p Lane
    /// that is compatible with the operand \p Op.
3011 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3012 assert(Op == getValue(OpIdx, Lane) &&
3013 "Op is expected to be getValue(OpIdx, Lane).");
3014 bool OpAPO = getData(OpIdx, Lane).APO;
3015 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3016 if (Ln == Lane)
3017 continue;
3018 if (any_of(Range: seq<unsigned>(Size: getNumOperands()), P: [&](unsigned OpI) {
3019 const OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
3020 if (Data.APO != OpAPO || Data.IsUsed)
3021 return true;
3022 Value *OpILn = getValue(OpIdx: OpI, Lane: Ln);
3023 return (L && L->isLoopInvariant(V: OpILn)) ||
3024 (getSameOpcode(VL: {Op, OpILn}, TLI) &&
3025 allSameBlock(VL: {Op, OpILn}));
3026 }))
3027 return true;
3028 }
3029 return false;
3030 }
3031
3032 public:
3033 /// Initialize with all the operands of the instruction vector \p RootVL.
3034 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3035 const InstructionsState &S, const BoUpSLP &R)
3036 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3037 L(R.LI->getLoopFor(BB: S.getMainOp()->getParent())) {
3038 // Append all the operands of RootVL.
3039 appendOperands(VL: RootVL, Operands, S);
3040 }
3041
    /// \returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
3044 ValueList getVL(unsigned OpIdx) const {
3045 ValueList OpVL(OpsVec[OpIdx].size());
3046 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3047 "Expected same num of lanes across all operands");
3048 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3049 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3050 return OpVL;
3051 }
3052
3053 // Performs operand reordering for 2 or more operands.
3054 // The original operands are in OrigOps[OpIdx][Lane].
3055 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
3056 void reorder() {
3057 unsigned NumOperands = getNumOperands();
3058 unsigned NumLanes = getNumLanes();
3059 // Each operand has its own mode. We are using this mode to help us select
3060 // the instructions for each lane, so that they match best with the ones
3061 // we have selected so far.
3062 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3063
3064 // This is a greedy single-pass algorithm. We are going over each lane
3065 // once and deciding on the best order right away with no back-tracking.
3066 // However, in order to increase its effectiveness, we start with the lane
3067 // that has operands that can move the least. For example, given the
3068 // following lanes:
3069 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3070 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3071 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3072 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3073 // we will start at Lane 1, since the operands of the subtraction cannot
3074 // be reordered. Then we will visit the rest of the lanes in a circular
      // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
3076
3077 // Find the first lane that we will start our search from.
3078 unsigned FirstLane = getBestLaneToStartReordering();
3079
3080 // Initialize the modes.
3081 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3082 Value *OpLane0 = getValue(OpIdx, Lane: FirstLane);
3083 // Keep track if we have instructions with all the same opcode on one
3084 // side.
3085 if (auto *OpILane0 = dyn_cast<Instruction>(Val: OpLane0)) {
3086 // Check if OpLane0 should be broadcast.
3087 if (shouldBroadcast(Op: OpLane0, OpIdx, Lane: FirstLane) ||
3088 !canBeVectorized(Op: OpILane0, OpIdx, Lane: FirstLane))
3089 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3090 else if (isa<LoadInst>(Val: OpILane0))
3091 ReorderingModes[OpIdx] = ReorderingMode::Load;
3092 else
3093 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3094 } else if (isa<Constant>(Val: OpLane0)) {
3095 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3096 } else if (isa<Argument>(Val: OpLane0)) {
3097 // Our best hope is a Splat. It may save some cost in some cases.
3098 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3099 } else {
3100 llvm_unreachable("Unexpected value kind.");
3101 }
3102 }
3103
      // Check that we don't have the same operands. There is no need to
      // reorder if the operands are just a perfect diamond or shuffled
      // diamond match. Skipping is not done for possible broadcasts or a
      // non-power-of-2 number of scalars (just for now).
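      // For example, with 4 lanes and operands Op0 == {a, b, c, d} and
      // Op1 == {b, a, d, c}, every value of Op1 already appears in Op0, the
      // number of unique values (4) is not 2 and forms a full/power-of-2
      // vector, so reordering is skipped (a shuffled diamond match).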
3108 auto &&SkipReordering = [this]() {
3109 SmallPtrSet<Value *, 4> UniqueValues;
3110 ArrayRef<OperandData> Op0 = OpsVec.front();
3111 for (const OperandData &Data : Op0)
3112 UniqueValues.insert(Ptr: Data.V);
3113 for (ArrayRef<OperandData> Op :
3114 ArrayRef(OpsVec).slice(N: 1, M: getNumOperands() - 1)) {
3115 if (any_of(Range&: Op, P: [&UniqueValues](const OperandData &Data) {
3116 return !UniqueValues.contains(Ptr: Data.V);
3117 }))
3118 return false;
3119 }
3120 // TODO: Check if we can remove a check for non-power-2 number of
3121 // scalars after full support of non-power-2 vectorization.
3122 return UniqueValues.size() != 2 &&
3123 hasFullVectorsOrPowerOf2(TTI: *R.TTI, Ty: Op0.front().V->getType(),
3124 Sz: UniqueValues.size());
3125 };
3126
3127 // If the initial strategy fails for any of the operand indexes, then we
3128 // perform reordering again in a second pass. This helps avoid assigning
3129 // high priority to the failed strategy, and should improve reordering for
3130 // the non-failed operand indexes.
3131 for (int Pass = 0; Pass != 2; ++Pass) {
        // Check if there is no need to reorder the operands since they are a
        // perfect or shuffled diamond match.
3134 // Need to do it to avoid extra external use cost counting for
3135 // shuffled matches, which may cause regressions.
3136 if (SkipReordering())
3137 break;
3138 // Skip the second pass if the first pass did not fail.
3139 bool StrategyFailed = false;
3140 // Mark all operand data as free to use.
3141 clearUsed();
3142 // We keep the original operand order for the FirstLane, so reorder the
3143 // rest of the lanes. We are visiting the nodes in a circular fashion,
3144 // using FirstLane as the center point and increasing the radius
3145 // distance.
3146 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3147 for (unsigned I = 0; I < NumOperands; ++I)
3148 MainAltOps[I].push_back(Elt: getData(OpIdx: I, Lane: FirstLane).V);
3149
3150 SmallBitVector UsedLanes(NumLanes);
3151 UsedLanes.set(FirstLane);
3152 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3153 // Visit the lane on the right and then the lane on the left.
3154 for (int Direction : {+1, -1}) {
3155 int Lane = FirstLane + Direction * Distance;
3156 if (Lane < 0 || Lane >= (int)NumLanes)
3157 continue;
3158 UsedLanes.set(Lane);
3159 int LastLane = Lane - Direction;
3160 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3161 "Out of bounds");
3162 // Look for a good match for each operand.
3163 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3164 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3165 std::optional<unsigned> BestIdx =
3166 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3167 MainAltOps: MainAltOps[OpIdx], UsedLanes);
3168 // By not selecting a value, we allow the operands that follow to
3169 // select a better matching value. We will get a non-null value in
3170 // the next run of getBestOperand().
3171 if (BestIdx) {
3172 // Swap the current operand with the one returned by
3173 // getBestOperand().
3174 swap(OpIdx1: OpIdx, OpIdx2: *BestIdx, Lane);
3175 } else {
3176 // Enable the second pass.
3177 StrategyFailed = true;
3178 }
3179 // Try to get the alternate opcode and follow it during analysis.
3180 if (MainAltOps[OpIdx].size() != 2) {
3181 OperandData &AltOp = getData(OpIdx, Lane);
3182 InstructionsState OpS =
3183 getSameOpcode(VL: {MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3184 if (OpS && OpS.isAltShuffle())
3185 MainAltOps[OpIdx].push_back(Elt: AltOp.V);
3186 }
3187 }
3188 }
3189 }
3190 // Skip second pass if the strategy did not fail.
3191 if (!StrategyFailed)
3192 break;
3193 }
3194 }
3195
3196#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3197 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3198 switch (RMode) {
3199 case ReorderingMode::Load:
3200 return "Load";
3201 case ReorderingMode::Opcode:
3202 return "Opcode";
3203 case ReorderingMode::Constant:
3204 return "Constant";
3205 case ReorderingMode::Splat:
3206 return "Splat";
3207 case ReorderingMode::Failed:
3208 return "Failed";
3209 }
3210 llvm_unreachable("Unimplemented Reordering Type");
3211 }
3212
3213 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3214 raw_ostream &OS) {
3215 return OS << getModeStr(RMode);
3216 }
3217
3218 /// Debug print.
3219 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3220 printMode(RMode, dbgs());
3221 }
3222
3223 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3224 return printMode(RMode, OS);
3225 }
3226
3227 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3228 const unsigned Indent = 2;
3229 unsigned Cnt = 0;
3230 for (const OperandDataVec &OpDataVec : OpsVec) {
3231 OS << "Operand " << Cnt++ << "\n";
3232 for (const OperandData &OpData : OpDataVec) {
3233 OS.indent(Indent) << "{";
3234 if (Value *V = OpData.V)
3235 OS << *V;
3236 else
3237 OS << "null";
3238 OS << ", APO:" << OpData.APO << "}\n";
3239 }
3240 OS << "\n";
3241 }
3242 return OS;
3243 }
3244
3245 /// Debug print.
3246 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3247#endif
3248 };
3249
  /// Evaluate each pair in \p Candidates and return the index into \p
  /// Candidates of the pair with the highest score, deemed to have the best
  /// chance to form the root of a profitable tree to vectorize. Return
  /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
  /// \param Limit The lower limit of the score considered to be good enough.
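  /// Roughly, each candidate pair is scored with
  /// LookAheadHeuristics::getScoreAtLevelRec, which also looks through the
  /// operands of the pair up to RootLookAheadMaxDepth, and the index of the
  /// highest-scoring pair above \p Limit is returned.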
3255 std::optional<int>
3256 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3257 int Limit = LookAheadHeuristics::ScoreFail) const {
3258 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3259 RootLookAheadMaxDepth);
3260 int BestScore = Limit;
3261 std::optional<int> Index;
3262 for (int I : seq<int>(Begin: 0, End: Candidates.size())) {
3263 int Score = LookAhead.getScoreAtLevelRec(LHS: Candidates[I].first,
3264 RHS: Candidates[I].second,
3265 /*U1=*/nullptr, /*U2=*/nullptr,
3266 /*CurrLevel=*/1, MainAltOps: {});
3267 if (Score > BestScore) {
3268 BestScore = Score;
3269 Index = I;
3270 }
3271 }
3272 return Index;
3273 }
3274
3275 /// Checks if the instruction is marked for deletion.
3276 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(V: I); }
3277
3278 /// Removes an instruction from its block and eventually deletes it.
3279 /// It's like Instruction::eraseFromParent() except that the actual deletion
3280 /// is delayed until BoUpSLP is destructed.
3281 void eraseInstruction(Instruction *I) {
3282 DeletedInstructions.insert(V: I);
3283 }
3284
3285 /// Remove instructions from the parent function and clear the operands of \p
3286 /// DeadVals instructions, marking for deletion trivially dead operands.
3287 template <typename T>
3288 void removeInstructionsAndOperands(
3289 ArrayRef<T *> DeadVals,
3290 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
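    // Rough outline of the steps below: (1) mark all DeadVals as deleted;
    // (2) salvage their debug info, collect single-use operands that become
    // trivially dead into a worklist and drop all references; (3) detach the
    // instructions from their blocks and invalidate SCEV; (4) drain the
    // worklist, nulling operands and deleting newly dead instructions, while
    // skipping vector values tracked in VectorValuesAndScales.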
3291 SmallVector<WeakTrackingVH> DeadInsts;
3292 for (T *V : DeadVals) {
3293 auto *I = cast<Instruction>(V);
3294 eraseInstruction(I);
3295 }
3296 DenseSet<Value *> Processed;
3297 for (T *V : DeadVals) {
3298 if (!V || !Processed.insert(V).second)
3299 continue;
3300 auto *I = cast<Instruction>(V);
3301 salvageDebugInfo(*I);
3302 ArrayRef<TreeEntry *> Entries = getTreeEntries(V: I);
3303 for (Use &U : I->operands()) {
3304 if (auto *OpI = dyn_cast_if_present<Instruction>(Val: U.get());
3305 OpI && !DeletedInstructions.contains(V: OpI) && OpI->hasOneUser() &&
3306 wouldInstructionBeTriviallyDead(I: OpI, TLI) &&
3307 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3308 return Entry->VectorizedValue == OpI;
3309 })))
3310 DeadInsts.push_back(Elt: OpI);
3311 }
3312 I->dropAllReferences();
3313 }
3314 for (T *V : DeadVals) {
3315 auto *I = cast<Instruction>(V);
3316 if (!I->getParent())
3317 continue;
3318 assert((I->use_empty() || all_of(I->uses(),
3319 [&](Use &U) {
3320 return isDeleted(
3321 cast<Instruction>(U.getUser()));
3322 })) &&
3323 "trying to erase instruction with users.");
3324 I->removeFromParent();
3325 SE->forgetValue(V: I);
3326 }
3327 // Process the dead instruction list until empty.
3328 while (!DeadInsts.empty()) {
3329 Value *V = DeadInsts.pop_back_val();
3330 Instruction *VI = cast_or_null<Instruction>(Val: V);
3331 if (!VI || !VI->getParent())
3332 continue;
3333 assert(isInstructionTriviallyDead(VI, TLI) &&
3334 "Live instruction found in dead worklist!");
3335 assert(VI->use_empty() && "Instructions with uses are not dead.");
3336
3337 // Don't lose the debug info while deleting the instructions.
3338 salvageDebugInfo(I&: *VI);
3339
3340 // Null out all of the instruction's operands to see if any operand
3341 // becomes dead as we go.
3342 for (Use &OpU : VI->operands()) {
3343 Value *OpV = OpU.get();
3344 if (!OpV)
3345 continue;
3346 OpU.set(nullptr);
3347
3348 if (!OpV->use_empty())
3349 continue;
3350
3351 // If the operand is an instruction that became dead as we nulled out
3352 // the operand, and if it is 'trivially' dead, delete it in a future
3353 // loop iteration.
3354 if (auto *OpI = dyn_cast<Instruction>(Val: OpV))
3355 if (!DeletedInstructions.contains(V: OpI) &&
3356 (!OpI->getType()->isVectorTy() ||
3357 none_of(VectorValuesAndScales,
3358 [&](const std::tuple<Value *, unsigned, bool> &V) {
3359 return std::get<0>(t: V) == OpI;
3360 })) &&
3361 isInstructionTriviallyDead(I: OpI, TLI))
3362 DeadInsts.push_back(Elt: OpI);
3363 }
3364
3365 VI->removeFromParent();
3366 eraseInstruction(I: VI);
3367 SE->forgetValue(V: VI);
3368 }
3369 }
3370
3371 /// Checks if the instruction was already analyzed for being possible
3372 /// reduction root.
3373 bool isAnalyzedReductionRoot(Instruction *I) const {
3374 return AnalyzedReductionsRoots.count(Ptr: I);
3375 }
3376 /// Register given instruction as already analyzed for being possible
3377 /// reduction root.
3378 void analyzedReductionRoot(Instruction *I) {
3379 AnalyzedReductionsRoots.insert(Ptr: I);
3380 }
3381 /// Checks if the provided list of reduced values was checked already for
3382 /// vectorization.
3383 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3384 return AnalyzedReductionVals.contains(V: hash_value(S: VL));
3385 }
  /// Adds the list of reduced values to the list of values already checked
  /// for vectorization.
3388 void analyzedReductionVals(ArrayRef<Value *> VL) {
3389 AnalyzedReductionVals.insert(V: hash_value(S: VL));
3390 }
3391 /// Clear the list of the analyzed reduction root instructions.
3392 void clearReductionData() {
3393 AnalyzedReductionsRoots.clear();
3394 AnalyzedReductionVals.clear();
3395 AnalyzedMinBWVals.clear();
3396 }
3397 /// Checks if the given value is gathered in one of the nodes.
3398 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3399 return any_of(Range: MustGather, P: [&](Value *V) { return Vals.contains(V); });
3400 }
3401 /// Checks if the given value is gathered in one of the nodes.
3402 bool isGathered(const Value *V) const {
3403 return MustGather.contains(Ptr: V);
3404 }
  /// Checks if the specified value was not scheduled.
3406 bool isNotScheduled(const Value *V) const {
3407 return NonScheduledFirst.contains(Ptr: V);
3408 }
3409
3410 /// Check if the value is vectorized in the tree.
3411 bool isVectorized(const Value *V) const {
3412 assert(V && "V cannot be nullptr.");
3413 return ScalarToTreeEntries.contains(Val: V);
3414 }
3415
3416 ~BoUpSLP();
3417
3418private:
  /// Determine if a node \p E can be demoted to a smaller type with a
3420 /// truncation. We collect the entries that will be demoted in ToDemote.
3421 /// \param E Node for analysis
3422 /// \param ToDemote indices of the nodes to be demoted.
3423 bool collectValuesToDemote(
3424 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3425 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3426 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3427 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3428
3429 /// Builds the list of reorderable operands on the edges \p Edges of the \p
  /// UserTE, which allow reordering (i.e. the operands can be reordered because
  /// they have only one user and are themselves reorderable).
  /// \param ReorderableGathers List of all gather nodes that require reordering
  /// (e.g., gathers of extractelements or partially vectorizable loads).
3434 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3435 /// reordering, subset of \p NonVectorized.
3436 void buildReorderableOperands(
3437 TreeEntry *UserTE,
3438 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3439 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3440 SmallVectorImpl<TreeEntry *> &GatherOps);
3441
3442 /// Checks if the given \p TE is a gather node with clustered reused scalars
3443 /// and reorders it per given \p Mask.
3444 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3445
3446 /// Checks if all users of \p I are the part of the vectorization tree.
3447 bool areAllUsersVectorized(
3448 Instruction *I,
3449 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3450
3451 /// Return information about the vector formed for the specified index
3452 /// of a vector of (the same) instruction.
3453 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3454
3455 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3456 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3457 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3458 return const_cast<TreeEntry *>(
3459 getOperandEntry(E: const_cast<const TreeEntry *>(E), Idx));
3460 }
3461
3462 /// Gets the root instruction for the given node. If the node is a strided
3463 /// load/store node with the reverse order, the root instruction is the last
3464 /// one.
3465 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3466
3467 /// \returns Cast context for the given graph node.
3468 TargetTransformInfo::CastContextHint
3469 getCastContextHint(const TreeEntry &TE) const;
3470
3471 /// \returns the cost of the vectorizable entry.
3472 InstructionCost getEntryCost(const TreeEntry *E,
3473 ArrayRef<Value *> VectorizedVals,
3474 SmallPtrSetImpl<Value *> &CheckedExtracts);
3475
3476 /// Checks if it is legal and profitable to build SplitVectorize node for the
3477 /// given \p VL.
3478 /// \param Op1 first homogeneous scalars.
3479 /// \param Op2 second homogeneous scalars.
3480 /// \param ReorderIndices indices to reorder the scalars.
3481 /// \returns true if the node was successfully built.
3482 bool canBuildSplitNode(ArrayRef<Value *> VL,
3483 const InstructionsState &LocalState,
3484 SmallVectorImpl<Value *> &Op1,
3485 SmallVectorImpl<Value *> &Op2,
3486 OrdersType &ReorderIndices) const;
3487
3488 /// This is the recursive part of buildTree.
3489 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3490 unsigned InterleaveFactor = 0);
3491
3492 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3493 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3494 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3495 /// returns false, setting \p CurrentOrder to either an empty vector or a
3496 /// non-identity permutation that allows to reuse extract instructions.
3497 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3498 /// extract order.
3499 bool canReuseExtract(ArrayRef<Value *> VL,
3500 SmallVectorImpl<unsigned> &CurrentOrder,
3501 bool ResizeAllowed = false) const;
3502
3503 /// Vectorize a single entry in the tree.
3504 Value *vectorizeTree(TreeEntry *E);
3505
3506 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3507 /// \p E.
3508 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3509
3510 /// Create a new vector from a list of scalar values. Produces a sequence
3511 /// which exploits values reused across lanes, and arranges the inserts
3512 /// for ease of later optimization.
3513 template <typename BVTy, typename ResTy, typename... Args>
3514 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3515
3516 /// Create a new vector from a list of scalar values. Produces a sequence
3517 /// which exploits values reused across lanes, and arranges the inserts
3518 /// for ease of later optimization.
3519 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3520
3521 /// Returns the instruction in the bundle, which can be used as a base point
3522 /// for scheduling. Usually it is the last instruction in the bundle, except
3523 /// for the case when all operands are external (in this case, it is the first
3524 /// instruction in the list).
3525 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3526
3527 /// Tries to find extractelement instructions with constant indices from fixed
  /// vector type and gather such instructions into a bunch, which will most
  /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3530 /// was successful, the matched scalars are replaced by poison values in \p VL
3531 /// for future analysis.
3532 std::optional<TargetTransformInfo::ShuffleKind>
3533 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3534 SmallVectorImpl<int> &Mask) const;
3535
3536 /// Tries to find extractelement instructions with constant indices from fixed
  /// vector type and gather such instructions into a bunch, which will most
  /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3539 /// was successful, the matched scalars are replaced by poison values in \p VL
3540 /// for future analysis.
3541 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3542 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3543 SmallVectorImpl<int> &Mask,
3544 unsigned NumParts) const;
3545
3546 /// Checks if the gathered \p VL can be represented as a single register
3547 /// shuffle(s) of previous tree entries.
3548 /// \param TE Tree entry checked for permutation.
  /// \param VL List of scalars (a subset of the TE scalars), checked for
  /// permutations. Must form a single-register vector.
3551 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3552 /// commands to build the mask using the original vector value, without
3553 /// relying on the potential reordering.
3554 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3555 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3556 std::optional<TargetTransformInfo::ShuffleKind>
3557 isGatherShuffledSingleRegisterEntry(
3558 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3559 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3560 bool ForOrder);
3561
3562 /// Checks if the gathered \p VL can be represented as multi-register
3563 /// shuffle(s) of previous tree entries.
3564 /// \param TE Tree entry checked for permutation.
  /// \param VL List of scalars (a subset of the TE scalars), checked for
3566 /// permutations.
3567 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3568 /// commands to build the mask using the original vector value, without
3569 /// relying on the potential reordering.
3570 /// \returns per-register series of ShuffleKind, if gathered values can be
3571 /// represented as shuffles of previous tree entries. \p Mask is filled with
3572 /// the shuffle mask (also on per-register base).
3573 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3574 isGatherShuffledEntry(
3575 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3576 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3577 unsigned NumParts, bool ForOrder = false);
3578
3579 /// \returns the cost of gathering (inserting) the values in \p VL into a
3580 /// vector.
3581 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3582 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3583 Type *ScalarTy) const;
3584
3585 /// Set the Builder insert point to one after the last instruction in
3586 /// the bundle
3587 void setInsertPointAfterBundle(const TreeEntry *E);
3588
  /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3590 /// specified, the starting vector value is poison.
3591 Value *
3592 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3593 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3594
3595 /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
3597 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3598
3599 /// Run through the list of all gathered loads in the graph and try to find
3600 /// vector loads/masked gathers instead of regular gathers. Later these loads
  /// are reshuffled to build the final gathered nodes.
3602 void tryToVectorizeGatheredLoads(
3603 const SmallMapVector<
3604 std::tuple<BasicBlock *, Value *, Type *>,
3605 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3606 &GatheredLoads);
3607
  /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
  /// users of \p TE and collects the stores, grouped into bundles of related
  /// store pointers.
3611 SmallVector<SmallVector<StoreInst *>>
3612 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3613
3614 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3615 /// stores in \p StoresVec can form a vector instruction. If so it returns
3616 /// true and populates \p ReorderIndices with the shuffle indices of the
3617 /// stores when compared to the sorted vector.
3618 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3619 OrdersType &ReorderIndices) const;
3620
3621 /// Iterates through the users of \p TE, looking for scalar stores that can be
3622 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3623 /// their order and builds an order index vector for each store bundle. It
3624 /// returns all these order vectors found.
3625 /// We run this after the tree has formed, otherwise we may come across user
3626 /// instructions that are not yet in the tree.
3627 SmallVector<OrdersType, 1>
3628 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3629
3630 /// Tries to reorder the gathering node for better vectorization
3631 /// opportunities.
3632 void reorderGatherNode(TreeEntry &TE);
3633
3634 class TreeEntry {
3635 public:
3636 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3637 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3638
3639 /// \returns Common mask for reorder indices and reused scalars.
3640 SmallVector<int> getCommonMask() const {
3641 if (State == TreeEntry::SplitVectorize)
3642 return {};
3643 SmallVector<int> Mask;
3644 inversePermutation(Indices: ReorderIndices, Mask);
3645 ::addMask(Mask, SubMask: ReuseShuffleIndices);
3646 return Mask;
3647 }
3648
3649 /// \returns The mask for split nodes.
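    /// For example, with 6 scalars split at offset 2 the common width is
    /// max(2, 6 - 2) == 4, so lanes taken from the first sub-node keep
    /// indices 0..1 while lanes from the second sub-node are shifted by
    /// 4 - 2 == 2, i.e. map to 4..7 in the concatenation of the two
    /// sub-vectors (an illustrative sketch of the computation below).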
3650 SmallVector<int> getSplitMask() const {
3651 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3652 "Expected only split vectorize node.");
3653 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3654 unsigned CommonVF = std::max<unsigned>(
3655 a: CombinedEntriesWithIndices.back().second,
3656 b: Scalars.size() - CombinedEntriesWithIndices.back().second);
3657 for (auto [Idx, I] : enumerate(First: ReorderIndices))
3658 Mask[I] =
3659 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3660 ? CommonVF - CombinedEntriesWithIndices.back().second
3661 : 0);
3662 return Mask;
3663 }
3664
3665 /// Updates (reorders) SplitVectorize node according to the given mask \p
3666 /// Mask and order \p MaskOrder.
3667 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3668 ArrayRef<int> MaskOrder);
3669
3670 /// \returns true if the scalars in VL are equal to this entry.
3671 bool isSame(ArrayRef<Value *> VL) const {
3672 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3673 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3674 return std::equal(first1: VL.begin(), last1: VL.end(), first2: Scalars.begin());
3675 return VL.size() == Mask.size() &&
3676 std::equal(first1: VL.begin(), last1: VL.end(), first2: Mask.begin(),
3677 binary_pred: [Scalars](Value *V, int Idx) {
3678 return (isa<UndefValue>(Val: V) &&
3679 Idx == PoisonMaskElem) ||
3680 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3681 });
3682 };
3683 if (!ReorderIndices.empty()) {
3684 // TODO: implement matching if the nodes are just reordered, still can
3685 // treat the vector as the same if the list of scalars matches VL
3686 // directly, without reordering.
3687 SmallVector<int> Mask;
3688 inversePermutation(Indices: ReorderIndices, Mask);
3689 if (VL.size() == Scalars.size())
3690 return IsSame(Scalars, Mask);
3691 if (VL.size() == ReuseShuffleIndices.size()) {
3692 ::addMask(Mask, SubMask: ReuseShuffleIndices);
3693 return IsSame(Scalars, Mask);
3694 }
3695 return false;
3696 }
3697 return IsSame(Scalars, ReuseShuffleIndices);
3698 }
3699
3700 /// \returns true if current entry has same operands as \p TE.
3701 bool hasEqualOperands(const TreeEntry &TE) const {
3702 if (TE.getNumOperands() != getNumOperands())
3703 return false;
3704 SmallBitVector Used(getNumOperands());
3705 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3706 unsigned PrevCount = Used.count();
3707 for (unsigned K = 0; K < E; ++K) {
3708 if (Used.test(Idx: K))
3709 continue;
3710 if (getOperand(OpIdx: K) == TE.getOperand(OpIdx: I)) {
3711 Used.set(K);
3712 break;
3713 }
3714 }
3715 // Check if we actually found the matching operand.
3716 if (PrevCount == Used.count())
3717 return false;
3718 }
3719 return true;
3720 }
3721
3722 /// \return Final vectorization factor for the node. Defined by the total
    /// number of vectorized scalars, including those used several times in the
    /// entry and counted in the \a ReuseShuffleIndices, if any.
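    /// For example, an entry with Scalars == {a, b} and ReuseShuffleIndices
    /// == {0, 1, 0, 1} has a vectorization factor of 4, not 2.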
3725 unsigned getVectorFactor() const {
3726 if (!ReuseShuffleIndices.empty())
3727 return ReuseShuffleIndices.size();
3728 return Scalars.size();
3729 };
3730
3731 /// Checks if the current node is a gather node.
3732 bool isGather() const { return State == NeedToGather; }
3733
3734 /// A vector of scalars.
3735 ValueList Scalars;
3736
3737 /// The Scalars are vectorized into this value. It is initialized to Null.
3738 WeakTrackingVH VectorizedValue = nullptr;
3739
3740 /// Do we need to gather this sequence or vectorize it
3741 /// (either with vector instruction or with scatter/gather
3742 /// intrinsics for store/load)?
3743 enum EntryState {
3744 Vectorize, ///< The node is regularly vectorized.
3745 ScatterVectorize, ///< Masked scatter/gather node.
3746 StridedVectorize, ///< Strided loads (and stores)
3747 CompressVectorize, ///< (Masked) load with compress.
3748 NeedToGather, ///< Gather/buildvector node.
3749 CombinedVectorize, ///< Vectorized node, combined with its user into more
3750 ///< complex node like select/cmp to minmax, mul/add to
3751 ///< fma, etc. Must be used for the following nodes in
3752 ///< the pattern, not the very first one.
3753 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
3754 ///< independently and then combines back.
3755 };
3756 EntryState State;
3757
3758 /// List of combined opcodes supported by the vectorizer.
3759 enum CombinedOpcode {
3760 NotCombinedOp = -1,
3761 MinMax = Instruction::OtherOpsEnd + 1,
3762 };
3763 CombinedOpcode CombinedOp = NotCombinedOp;
3764
3765 /// Does this sequence require some shuffling?
3766 SmallVector<int, 4> ReuseShuffleIndices;
3767
3768 /// Does this entry require reordering?
3769 SmallVector<unsigned, 4> ReorderIndices;
3770
3771 /// Points back to the VectorizableTree.
3772 ///
3773 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3774 /// to be a pointer and needs to be able to initialize the child iterator.
3775 /// Thus we need a reference back to the container to translate the indices
3776 /// to entries.
3777 VecTreeTy &Container;
3778
3779 /// The TreeEntry index containing the user of this entry.
3780 EdgeInfo UserTreeIndex;
3781
3782 /// The index of this treeEntry in VectorizableTree.
3783 unsigned Idx = 0;
3784
3785 /// For gather/buildvector/alt opcode nodes, which are combined from
3786 /// other nodes as a series of insertvector instructions.
3787 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3788
3789 private:
3790 /// The operands of each instruction in each lane Operands[op_index][lane].
3791 /// Note: This helps avoid the replication of the code that performs the
3792 /// reordering of operands during buildTreeRec() and vectorizeTree().
3793 SmallVector<ValueList, 2> Operands;
3794
3795 /// MainOp and AltOp are recorded inside. S should be obtained from
3796 /// newTreeEntry.
3797 InstructionsState S = InstructionsState::invalid();
3798
3799 /// Interleaving factor for interleaved loads Vectorize nodes.
3800 unsigned InterleaveFactor = 0;
3801
3802 /// True if the node does not require scheduling.
3803 bool DoesNotNeedToSchedule = false;
3804
3805 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3806 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3807 if (Operands.size() < OpIdx + 1)
3808 Operands.resize(N: OpIdx + 1);
3809 assert(Operands[OpIdx].empty() && "Already resized?");
3810 assert(OpVL.size() <= Scalars.size() &&
3811 "Number of operands is greater than the number of scalars.");
3812 Operands[OpIdx].resize(N: OpVL.size());
3813 copy(Range&: OpVL, Out: Operands[OpIdx].begin());
3814 }
3815
3816 public:
3817 /// Returns interleave factor for interleave nodes.
3818 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3819 /// Sets interleaving factor for the interleaving nodes.
3820 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3821
3822 /// Marks the node as one that does not require scheduling.
3823 void setDoesNotNeedToSchedule() {
3824 assert(::doesNotNeedToSchedule(Scalars) &&
3825 "Expected to not need scheduling");
3826 DoesNotNeedToSchedule = true;
3827 }
3828 /// Returns true if the node is marked as one that does not require
3829 /// scheduling.
3830 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
3831
3832 /// Set this bundle's operands from \p Operands.
3833 void setOperands(ArrayRef<ValueList> Operands) {
3834 for (unsigned I : seq<unsigned>(Size: Operands.size()))
3835 setOperand(OpIdx: I, OpVL: Operands[I]);
3836 }
3837
3838 /// Reorders operands of the node to the given mask \p Mask.
3839 void reorderOperands(ArrayRef<int> Mask) {
3840 for (ValueList &Operand : Operands)
3841 reorderScalars(Scalars&: Operand, Mask);
3842 }
3843
3844 /// \returns the \p OpIdx operand of this TreeEntry.
3845 ValueList &getOperand(unsigned OpIdx) {
3846 assert(OpIdx < Operands.size() && "Off bounds");
3847 return Operands[OpIdx];
3848 }
3849
3850 /// \returns the \p OpIdx operand of this TreeEntry.
3851 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3852 assert(OpIdx < Operands.size() && "Off bounds");
3853 return Operands[OpIdx];
3854 }
3855
3856 /// \returns the number of operands.
3857 unsigned getNumOperands() const { return Operands.size(); }
3858
3859 /// \return the single \p OpIdx operand.
3860 Value *getSingleOperand(unsigned OpIdx) const {
3861 assert(OpIdx < Operands.size() && "Off bounds");
3862 assert(!Operands[OpIdx].empty() && "No operand available");
3863 return Operands[OpIdx][0];
3864 }
3865
3866 /// Some of the instructions in the list have alternate opcodes.
3867 bool isAltShuffle() const { return S.isAltShuffle(); }
3868
3869 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
3870 return S.getMatchingMainOpOrAltOp(I);
3871 }
3872
    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as the main operation of the bundle, the key is \p
    /// Op. Otherwise the key is the main operation itself.
3876 Value *isOneOf(Value *Op) const {
3877 auto *I = dyn_cast<Instruction>(Val: Op);
3878 if (I && getMatchingMainOpOrAltOp(I))
3879 return Op;
3880 return S.getMainOp();
3881 }
3882
3883 void setOperations(const InstructionsState &S) {
3884 assert(S && "InstructionsState is invalid.");
3885 this->S = S;
3886 }
3887
3888 Instruction *getMainOp() const { return S.getMainOp(); }
3889
3890 Instruction *getAltOp() const { return S.getAltOp(); }
3891
3892 /// The main/alternate opcodes for the list of instructions.
3893 unsigned getOpcode() const { return S.getOpcode(); }
3894
3895 unsigned getAltOpcode() const { return S.getAltOpcode(); }
3896
3897 bool hasState() const { return S.valid(); }
3898
    /// When ReorderIndices/ReuseShuffleIndices are empty it just returns the
    /// position of \p V within the vector of Scalars. Otherwise, remaps it
    /// via the reorder and reuse indices.
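    /// For example, if \p V is found at position 2 of Scalars and
    /// ReorderIndices == {1, 2, 3, 0}, the lane becomes ReorderIndices[2]
    /// == 3; with a non-empty ReuseShuffleIndices the result is further
    /// remapped to the first reuse-index position equal to 3.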
3901 int findLaneForValue(Value *V) const {
3902 unsigned FoundLane = getVectorFactor();
3903 for (auto *It = find(Range: Scalars, Val: V), *End = Scalars.end(); It != End;
3904 std::advance(i&: It, n: 1)) {
3905 if (*It != V)
3906 continue;
3907 FoundLane = std::distance(first: Scalars.begin(), last: It);
3908 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3909 if (!ReorderIndices.empty())
3910 FoundLane = ReorderIndices[FoundLane];
3911 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3912 if (ReuseShuffleIndices.empty())
3913 break;
3914 if (auto *RIt = find(Range: ReuseShuffleIndices, Val: FoundLane);
3915 RIt != ReuseShuffleIndices.end()) {
3916 FoundLane = std::distance(first: ReuseShuffleIndices.begin(), last: RIt);
3917 break;
3918 }
3919 }
3920 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
3921 return FoundLane;
3922 }
3923
3924 /// Build a shuffle mask for graph entry which represents a merge of main
3925 /// and alternate operations.
3926 void
3927 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3928 SmallVectorImpl<int> &Mask,
3929 SmallVectorImpl<Value *> *OpScalars = nullptr,
3930 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3931
3932 /// Return true if this is a non-power-of-2 node.
3933 bool isNonPowOf2Vec() const {
3934 bool IsNonPowerOf2 = !has_single_bit(Value: Scalars.size());
3935 return IsNonPowerOf2;
3936 }
3937
3938 /// Return true if this is a node, which tries to vectorize number of
3939 /// elements, forming whole vectors.
3940 bool
3941 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3942 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3943 TTI, Ty: getValueType(V: Scalars.front()), Sz: Scalars.size());
3944 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3945 "Reshuffling not supported with non-power-of-2 vectors yet.");
3946 return IsNonPowerOf2;
3947 }
3948
3949 Value *getOrdered(unsigned Idx) const {
3950 assert(isGather() && "Must be used only for buildvectors/gathers.");
3951 if (ReorderIndices.empty())
3952 return Scalars[Idx];
3953 SmallVector<int> Mask;
3954 inversePermutation(Indices: ReorderIndices, Mask);
3955 return Scalars[Mask[Idx]];
3956 }
3957
3958#ifndef NDEBUG
3959 /// Debug printer.
3960 LLVM_DUMP_METHOD void dump() const {
3961 dbgs() << Idx << ".\n";
3962 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3963 dbgs() << "Operand " << OpI << ":\n";
3964 for (const Value *V : Operands[OpI])
3965 dbgs().indent(2) << *V << "\n";
3966 }
3967 dbgs() << "Scalars: \n";
3968 for (Value *V : Scalars)
3969 dbgs().indent(2) << *V << "\n";
3970 dbgs() << "State: ";
3971 switch (State) {
3972 case Vectorize:
3973 if (InterleaveFactor > 0) {
3974 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3975 << "\n";
3976 } else {
3977 dbgs() << "Vectorize\n";
3978 }
3979 break;
3980 case ScatterVectorize:
3981 dbgs() << "ScatterVectorize\n";
3982 break;
3983 case StridedVectorize:
3984 dbgs() << "StridedVectorize\n";
3985 break;
3986 case CompressVectorize:
3987 dbgs() << "CompressVectorize\n";
3988 break;
3989 case NeedToGather:
3990 dbgs() << "NeedToGather\n";
3991 break;
3992 case CombinedVectorize:
3993 dbgs() << "CombinedVectorize\n";
3994 break;
3995 case SplitVectorize:
3996 dbgs() << "SplitVectorize\n";
3997 break;
3998 }
3999 if (S) {
4000 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4001 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4002 } else {
4003 dbgs() << "MainOp: NULL\n";
4004 dbgs() << "AltOp: NULL\n";
4005 }
4006 dbgs() << "VectorizedValue: ";
4007 if (VectorizedValue)
4008 dbgs() << *VectorizedValue << "\n";
4009 else
4010 dbgs() << "NULL\n";
4011 dbgs() << "ReuseShuffleIndices: ";
4012 if (ReuseShuffleIndices.empty())
4013 dbgs() << "Empty";
4014 else
4015 for (int ReuseIdx : ReuseShuffleIndices)
4016 dbgs() << ReuseIdx << ", ";
4017 dbgs() << "\n";
4018 dbgs() << "ReorderIndices: ";
4019 for (unsigned ReorderIdx : ReorderIndices)
4020 dbgs() << ReorderIdx << ", ";
4021 dbgs() << "\n";
4022 dbgs() << "UserTreeIndex: ";
4023 if (UserTreeIndex)
4024 dbgs() << UserTreeIndex;
4025 else
4026 dbgs() << "<invalid>";
4027 dbgs() << "\n";
4028 if (!CombinedEntriesWithIndices.empty()) {
4029 dbgs() << "Combined entries: ";
4030 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4031 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4032 });
4033 dbgs() << "\n";
4034 }
4035 }
4036#endif
4037 };
4038
4039#ifndef NDEBUG
4040 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4041 InstructionCost VecCost, InstructionCost ScalarCost,
4042 StringRef Banner) const {
4043 dbgs() << "SLP: " << Banner << ":\n";
4044 E->dump();
4045 dbgs() << "SLP: Costs:\n";
4046 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4047 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4048 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4049 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4050 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4051 }
4052#endif
4053
4054 /// Create a new gather TreeEntry
4055 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4056 const InstructionsState &S,
4057 const EdgeInfo &UserTreeIdx,
4058 ArrayRef<int> ReuseShuffleIndices = {}) {
4059 auto Invalid = ScheduleBundle::invalid();
4060 return newTreeEntry(VL, Bundle&: Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4061 }
4062
4063 /// Create a new VectorizableTree entry.
4064 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4065 const InstructionsState &S,
4066 const EdgeInfo &UserTreeIdx,
4067 ArrayRef<int> ReuseShuffleIndices = {},
4068 ArrayRef<unsigned> ReorderIndices = {},
4069 unsigned InterleaveFactor = 0) {
4070 TreeEntry::EntryState EntryState =
4071 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4072 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4073 ReuseShuffleIndices, ReorderIndices);
4074 if (E && InterleaveFactor > 0)
4075 E->setInterleave(InterleaveFactor);
4076 return E;
4077 }
4078
4079 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4080 TreeEntry::EntryState EntryState,
4081 ScheduleBundle &Bundle, const InstructionsState &S,
4082 const EdgeInfo &UserTreeIdx,
4083 ArrayRef<int> ReuseShuffleIndices = {},
4084 ArrayRef<unsigned> ReorderIndices = {}) {
4085 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4086 EntryState == TreeEntry::SplitVectorize)) ||
4087 (Bundle && EntryState != TreeEntry::NeedToGather &&
4088 EntryState != TreeEntry::SplitVectorize)) &&
4089 "Need to vectorize gather entry?");
    // Are the gathered loads still gathered? Do not create a new entry, use
    // the original one.
4091 if (GatheredLoadsEntriesFirst.has_value() &&
4092 EntryState == TreeEntry::NeedToGather && S &&
4093 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4094 !UserTreeIdx.UserTE)
4095 return nullptr;
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4103 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4104 // for non-power-of-two vectors.
4105 assert(
4106 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4107 ReuseShuffleIndices.empty()) &&
4108 "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      if (S)
        Last->setOperations(S);
    } else {
      // Reorder scalars and build final mask.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(),
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  return VL[Idx];
                });
      InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
      if (S)
        Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I)
          continue;
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
    } else if (!Last->isGather()) {
      if (doesNotNeedToSchedule(VL))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (isa<PoisonValue>(V))
          continue;
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
4164 // Update the scheduler bundle to point to this TreeEntry.
4165 assert((!Bundle.getBundle().empty() || isa<PHINode>(S.getMainOp()) ||
4166 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4167 Last->doesNotNeedToSchedule()) &&
4168 "Bundle and VL out of sync");
4169 if (!Bundle.getBundle().empty()) {
4170#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4171 auto *BundleMember = Bundle.getBundle().begin();
4172 SmallPtrSet<Value *, 4> Processed;
4173 for (Value *V : VL) {
4174 if (doesNotNeedToBeScheduled(V) || !Processed.insert(V).second)
4175 continue;
4176 ++BundleMember;
4177 }
4178 assert(BundleMember == Bundle.getBundle().end() &&
4179 "Bundle and VL out of sync");
4180#endif
4181 Bundle.setTreeEntry(Last);
4182 }
4183 } else {
      // Build a map for gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL)
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
        }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
4198 }
4199
4200 if (UserTreeIdx.UserTE)
4201 Last->UserTreeIndex = UserTreeIdx;
4202 return Last;
4203 }
4204
4205 /// -- Vectorization State --
4206 /// Holds all of the tree entries.
4207 TreeEntry::VecTreeTy VectorizableTree;
4208
4209#ifndef NDEBUG
4210 /// Debug printer.
4211 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4212 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4213 VectorizableTree[Id]->dump();
4214 dbgs() << "\n";
4215 }
4216 }
4217#endif
4218
4219 /// Get list of vector entries, associated with the value \p V.
4220 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4221 assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
4223 if (It == ScalarToTreeEntries.end())
4224 return {};
4225 return It->getSecond();
4226 }
4227
4228 /// Get list of split vector entries, associated with the value \p V.
4229 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4230 assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
4232 if (It == ScalarsInSplitNodes.end())
4233 return {};
4234 return It->getSecond();
4235 }
4236
4237 /// Returns first vector node for value \p V, matching values \p VL.
4238 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4239 bool SameVF = false) const {
4240 assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4242 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4243 return TE;
4244 return nullptr;
4245 }
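  // Illustrative sketch of how the accessors above are typically combined
  // (hypothetical caller; "reuse" is a placeholder, not a function in this
  // file):
  //   if (TreeEntry *TE = getSameValuesTreeEntry(V, VL, /*SameVF=*/true))
  //     reuse(TE); // same scalars and same vector factor as VL
  //   else if (!getTreeEntries(V).empty())
  //     ...;       // V is vectorized, but in a differently shaped node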
4246
  /// Checks that the operand node of an alternate node does not generate a
  /// buildvector sequence. If it does, it is probably not worth building an
  /// alternate shuffle: the number of buildvector operands plus the alternate
  /// instruction may exceed the number of buildvector instructions.
4251 /// \param S the instructions state of the analyzed values.
4252 /// \param VL list of the instructions with alternate opcodes.
4253 bool areAltOperandsProfitable(const InstructionsState &S,
4254 ArrayRef<Value *> VL) const;
4255
4256 /// Contains all the outputs of legality analysis for a list of values to
4257 /// vectorize.
4258 class ScalarsVectorizationLegality {
4259 InstructionsState S;
4260 bool IsLegal;
4261 bool TryToFindDuplicates;
4262 bool TrySplitVectorize;
4263
4264 public:
4265 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4266 bool TryToFindDuplicates = true,
4267 bool TrySplitVectorize = false)
4268 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4269 TrySplitVectorize(TrySplitVectorize) {
4270 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4271 "Inconsistent state");
4272 }
4273 const InstructionsState &getInstructionsState() const { return S; };
4274 bool isLegal() const { return IsLegal; }
4275 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4276 bool trySplitVectorize() const { return TrySplitVectorize; }
4277 };
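  // Illustrative sketch of how a legality result is typically consumed
  // (hypothetical control flow, not a quote of the actual tree builder):
  //   ScalarsVectorizationLegality Legality =
  //       getScalarsVectorizationLegality(VL, Depth, UserTreeIdx);
  //   if (!Legality.isLegal()) {
  //     if (Legality.trySplitVectorize())
  //       ...; // try to build a SplitVectorize node instead
  //     else if (Legality.tryToFindDuplicates())
  //       ...; // gather, possibly after de-duplicating the scalars
  //     return;
  //   }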
4278
4279 /// Checks if the specified list of the instructions/values can be vectorized
4280 /// in general.
4281 ScalarsVectorizationLegality
4282 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4283 const EdgeInfo &UserTreeIdx) const;
4284
4285 /// Checks if the specified list of the instructions/values can be vectorized
4286 /// and fills required data before actual scheduling of the instructions.
4287 TreeEntry::EntryState
4288 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
4289 bool IsScatterVectorizeUserTE,
4290 OrdersType &CurrentOrder,
4291 SmallVectorImpl<Value *> &PointerOps);
4292
4293 /// Maps a specific scalar to its tree entry(ies).
4294 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4295
4296 /// Maps the operand index and entry to the corresponding tree entry.
4297 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4298 OperandsToTreeEntry;
4299
4300 /// Scalars, used in split vectorize nodes.
4301 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4302
4303 /// Maps a value to the proposed vectorizable size.
4304 SmallDenseMap<Value *, unsigned> InstrElementSize;
4305
4306 /// A list of scalars that we found that we need to keep as scalars.
4307 ValueSet MustGather;
4308
4309 /// A set of first non-schedulable values.
4310 ValueSet NonScheduledFirst;
4311
  /// A map between the vectorized entries and the last instructions in the
  /// bundles. The bundles are built in use order, not in the def order of the
  /// instructions, so we cannot rely on the last instruction in the bundle
  /// being the last instruction in program order during vectorization. The
  /// basic blocks are modified in the process, so the last instructions need
  /// to be pre-gathered beforehand.
4318 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4319
  /// List of gather nodes that depend on other gather/vector nodes and should
  /// be emitted after the vector instruction emission process, to correctly
  /// handle the order of the vector instructions and shuffles.
4323 SetVector<const TreeEntry *> PostponedGathers;
4324
4325 using ValueToGatherNodesMap =
4326 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4327 ValueToGatherNodesMap ValueToGatherNodes;
4328
  /// A list of the load entries (node indices) that can be vectorized using a
  /// strided or masked gather approach, but which we first attempt to
  /// represent as contiguous loads.
4332 SetVector<unsigned> LoadEntriesToVectorize;
4333
4334 /// true if graph nodes transforming mode is on.
4335 bool IsGraphTransformMode = false;
4336
4337 /// The index of the first gathered load entry in the VectorizeTree.
4338 std::optional<unsigned> GatheredLoadsEntriesFirst;
4339
4340 /// Maps compress entries to their mask data for the final codegen.
4341 SmallDenseMap<const TreeEntry *,
4342 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4343 CompressEntryToData;
4344
4345 /// This POD struct describes one external user in the vectorized tree.
4346 struct ExternalUser {
4347 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L)
4348 : Scalar(S), User(U), E(E), Lane(L) {}
4349
4350 /// Which scalar in our function.
4351 Value *Scalar = nullptr;
4352
4353 /// Which user that uses the scalar.
4354 llvm::User *User = nullptr;
4355
4356 /// Vector node, the value is part of.
4357 const TreeEntry &E;
4358
4359 /// Which lane does the scalar belong to.
4360 int Lane;
4361 };
4362 using UserList = SmallVector<ExternalUser, 16>;
4363
4364 /// Checks if two instructions may access the same memory.
4365 ///
4366 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4367 /// is invariant in the calling loop.
4368 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4369 Instruction *Inst2) {
4370 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
    if (!isSimple(Inst2))
      return true;
    // First check if the result is already in the cache.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
    if (!Res.second)
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache.
    Res.first->getSecond() = Aliased;
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
4382 return Aliased;
4383 }
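  // Illustrative note: the cache is filled for both orderings of the pair, so
  // a later query with the operands swapped is answered without invoking
  // alias analysis again. Hypothetical simple memory instructions I1 and I2:
  //   bool A = isAliased(MemoryLocation::get(I1), I1, I2); // queries BatchAA
  //   bool B = isAliased(MemoryLocation::get(I2), I2, I1); // cache hit
  //   assert(A == B);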
4384
4385 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4386
4387 /// Cache for alias results.
4388 /// TODO: consider moving this to the AliasAnalysis itself.
4389 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4390
4391 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4392 // globally through SLP because we don't perform any action which
4393 // invalidates capture results.
4394 BatchAAResults BatchAA;
4395
4396 /// Temporary store for deleted instructions. Instructions will be deleted
4397 /// eventually when the BoUpSLP is destructed. The deferral is required to
4398 /// ensure that there are no incorrect collisions in the AliasCache, which
4399 /// can happen if a new instruction is allocated at the same address as a
4400 /// previously deleted instruction.
4401 DenseSet<Instruction *> DeletedInstructions;
4402
4403 /// Set of the instruction, being analyzed already for reductions.
4404 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4405
4406 /// Set of hashes for the list of reduction values already being analyzed.
4407 DenseSet<size_t> AnalyzedReductionVals;
4408
  /// Values that have already been analyzed for minimal bitwidth and found
  /// to be non-profitable.
4411 DenseSet<Value *> AnalyzedMinBWVals;
4412
  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User). External User
  /// can be nullptr, meaning that this Internal Scalar will be used later,
  /// after vectorization.
4417 UserList ExternalUses;
4418
  /// A list of GEPs which can be replaced by scalar GEPs instead of
  /// extractelement instructions.
4421 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4422
4423 /// Values used only by @llvm.assume calls.
4424 SmallPtrSet<const Value *, 32> EphValues;
4425
4426 /// Holds all of the instructions that we gathered, shuffle instructions and
4427 /// extractelements.
4428 SetVector<Instruction *> GatherShuffleExtractSeq;
4429
4430 /// A list of blocks that we are going to CSE.
4431 DenseSet<BasicBlock *> CSEBlocks;
4432
  /// List of hashes of vectors of loads, which are known to be
  /// non-vectorizable.
4434 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4435
4436 /// Represents a scheduling entity, either ScheduleData or ScheduleBundle.
  /// ScheduleData is used to gather dependencies for a single instruction,
  /// while ScheduleBundle represents a batch of instructions that are going
  /// to be grouped together.
4440 class ScheduleEntity {
4441 friend class ScheduleBundle;
4442 friend class ScheduleData;
4443
4444 protected:
4445 enum class Kind { ScheduleData, ScheduleBundle };
4446 Kind getKind() const { return K; }
4447 ScheduleEntity(Kind K) : K(K) {}
4448
4449 private:
4450 /// Used for getting a "good" final ordering of instructions.
4451 int SchedulingPriority = 0;
4452 /// True if this instruction (or bundle) is scheduled (or considered as
4453 /// scheduled in the dry-run).
4454 bool IsScheduled = false;
4455 /// The kind of the ScheduleEntity.
4456 const Kind K = Kind::ScheduleData;
4457
4458 public:
4459 ScheduleEntity() = delete;
4460 /// Gets/sets the scheduling priority.
4461 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4462 int getSchedulingPriority() const { return SchedulingPriority; }
4463 bool isReady() const {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->isReady();
      return cast<ScheduleBundle>(this)->isReady();
4467 }
4468 /// Gets/sets if the bundle is scheduled.
4469 bool isScheduled() const { return IsScheduled; }
4470 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4471
4472 static bool classof(const ScheduleEntity *) { return true; }
4473 };
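  // The Kind tag above enables LLVM-style RTTI, so scheduling code can branch
  // on the concrete entity without virtual dispatch. Illustrative sketch
  // ("visit" is a hypothetical helper, not a function in this file):
  //   void visit(ScheduleEntity *E) {
  //     if (auto *SD = dyn_cast<ScheduleData>(E))
  //       ...; // a single instruction
  //     else
  //       ...; // cast<ScheduleBundle>(E): a group of instructions
  //   }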
4474
4475 /// Contains all scheduling relevant data for an instruction.
4476 /// A ScheduleData either represents a single instruction or a member of an
4477 /// instruction bundle (= a group of instructions which is combined into a
4478 /// vector instruction).
4479 class ScheduleData final : public ScheduleEntity {
4480 public:
4481 // The initial value for the dependency counters. It means that the
4482 // dependencies are not calculated yet.
4483 enum { InvalidDeps = -1 };
4484
4485 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4486 static bool classof(const ScheduleEntity *Entity) {
4487 return Entity->getKind() == Kind::ScheduleData;
4488 }
4489
4490 void init(int BlockSchedulingRegionID, Instruction *I) {
4491 NextLoadStore = nullptr;
4492 IsScheduled = false;
4493 SchedulingRegionID = BlockSchedulingRegionID;
4494 clearDependencies();
4495 Inst = I;
4496 }
4497
4498 /// Verify basic self consistency properties
4499 void verify() {
4500 if (hasValidDependencies()) {
4501 assert(UnscheduledDeps <= Dependencies && "invariant");
4502 } else {
4503 assert(UnscheduledDeps == Dependencies && "invariant");
4504 }
4505
4506 if (IsScheduled) {
4507 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4508 "unexpected scheduled state");
4509 }
4510 }
4511
    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
4515 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4516
4517 /// Returns true if it is ready for scheduling, i.e. it has no more
4518 /// unscheduled depending instructions/bundles.
4519 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4520
4521 /// Modifies the number of unscheduled dependencies for this instruction,
4522 /// and returns the number of remaining dependencies for the containing
4523 /// bundle.
4524 int incrementUnscheduledDeps(int Incr) {
4525 assert(hasValidDependencies() &&
4526 "increment of unscheduled deps would be meaningless");
4527 UnscheduledDeps += Incr;
4528 return UnscheduledDeps;
4529 }
4530
4531 /// Sets the number of unscheduled dependencies to the number of
4532 /// dependencies.
4533 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4534
4535 /// Clears all dependency information.
4536 void clearDependencies() {
4537 Dependencies = InvalidDeps;
4538 resetUnscheduledDeps();
4539 MemoryDependencies.clear();
4540 ControlDependencies.clear();
4541 IsScheduled = false;
4542 }
4543
4544 /// Gets the number of unscheduled dependencies.
4545 int getUnscheduledDeps() const { return UnscheduledDeps; }
4546 /// Gets the number of dependencies.
4547 int getDependencies() const { return Dependencies; }
4548 /// Initializes the number of dependencies.
4549 void initDependencies() { Dependencies = 0; }
4550 /// Increments the number of dependencies.
4551 void incDependencies() { Dependencies++; }
4552
4553 /// Gets scheduling region ID.
4554 int getSchedulingRegionID() const { return SchedulingRegionID; }
4555
4556 /// Gets the instruction.
4557 Instruction *getInst() const { return Inst; }
4558
4559 /// Gets the list of memory dependencies.
4560 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4561 return MemoryDependencies;
4562 }
4563 /// Adds a memory dependency.
4564 void addMemoryDependency(ScheduleData *Dep) {
      MemoryDependencies.push_back(Dep);
4566 }
4567 /// Gets the list of control dependencies.
4568 ArrayRef<ScheduleData *> getControlDependencies() const {
4569 return ControlDependencies;
4570 }
4571 /// Adds a control dependency.
4572 void addControlDependency(ScheduleData *Dep) {
      ControlDependencies.push_back(Dep);
4574 }
4575 /// Gets/sets the next load/store instruction in the block.
4576 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4577 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4578
4579 void dump(raw_ostream &OS) const { OS << *Inst; }
4580
4581 LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
4583 dbgs() << '\n';
4584 }
4585
4586 private:
4587 Instruction *Inst = nullptr;
4588
4589 /// Single linked list of all memory instructions (e.g. load, store, call)
4590 /// in the block - until the end of the scheduling region.
4591 ScheduleData *NextLoadStore = nullptr;
4592
4593 /// The dependent memory instructions.
4594 /// This list is derived on demand in calculateDependencies().
4595 SmallVector<ScheduleData *> MemoryDependencies;
4596
4597 /// List of instructions which this instruction could be control dependent
4598 /// on. Allowing such nodes to be scheduled below this one could introduce
4599 /// a runtime fault which didn't exist in the original program.
    /// For example, a load or udiv following a readonly call which
    /// infinitely loops.
4601 SmallVector<ScheduleData *> ControlDependencies;
4602
4603 /// This ScheduleData is in the current scheduling region if this matches
4604 /// the current SchedulingRegionID of BlockScheduling.
4605 int SchedulingRegionID = 0;
4606
    /// The number of dependencies. Consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
4609 /// This value is calculated on demand.
4610 /// If InvalidDeps, the number of dependencies is not calculated yet.
4611 int Dependencies = InvalidDeps;
4612
4613 /// The number of dependencies minus the number of dependencies of scheduled
4614 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4615 /// for scheduling.
4616 /// Note that this is negative as long as Dependencies is not calculated.
4617 int UnscheduledDeps = InvalidDeps;
4618 };
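  // Illustrative lifecycle of the two counters above (hypothetical numbers):
  // once the dependencies have been computed and, say, 3 were found,
  // Dependencies == 3 and UnscheduledDeps == 3. Each time a dependency gets
  // scheduled, incrementUnscheduledDeps(-1) is called; when UnscheduledDeps
  // reaches 0 the instruction reports isReady(). resetUnscheduledDeps()
  // restores UnscheduledDeps back to Dependencies so the counters can be
  // reused for another scheduling attempt.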
4619
4620#ifndef NDEBUG
4621 friend inline raw_ostream &operator<<(raw_ostream &OS,
4622 const BoUpSLP::ScheduleData &SD) {
4623 SD.dump(OS);
4624 return OS;
4625 }
4626#endif
4627
4628 class ScheduleBundle final : public ScheduleEntity {
4629 /// The schedule data for the instructions in the bundle.
4630 SmallVector<ScheduleData *> Bundle;
4631 /// True if this bundle is valid.
4632 bool IsValid = true;
4633 /// The TreeEntry that this instruction corresponds to.
4634 TreeEntry *TE = nullptr;
4635 ScheduleBundle(bool IsValid)
4636 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4637
4638 public:
4639 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4640 static bool classof(const ScheduleEntity *Entity) {
4641 return Entity->getKind() == Kind::ScheduleBundle;
4642 }
4643
4644 /// Verify basic self consistency properties
4645 void verify() const {
4646 for (const ScheduleData *SD : Bundle) {
4647 if (SD->hasValidDependencies()) {
4648 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
4649 "invariant");
4650 } else {
4651 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
4652 "invariant");
4653 }
4654
4655 if (isScheduled()) {
4656 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
4657 "unexpected scheduled state");
4658 }
4659 }
4660 }
4661
4662 /// Returns the number of unscheduled dependencies in the bundle.
4663 int unscheduledDepsInBundle() const {
4664 assert(*this && "bundle must not be empty");
4665 int Sum = 0;
4666 for (const ScheduleData *BundleMember : Bundle) {
4667 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
4668 return ScheduleData::InvalidDeps;
4669 Sum += BundleMember->getUnscheduledDeps();
4670 }
4671 return Sum;
4672 }
4673
    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
4677 bool hasValidDependencies() const {
      return all_of(Bundle, [](const ScheduleData *SD) {
4679 return SD->hasValidDependencies();
4680 });
4681 }
4682
4683 /// Returns true if it is ready for scheduling, i.e. it has no more
4684 /// unscheduled depending instructions/bundles.
4685 bool isReady() const {
4686 assert(*this && "bundle must not be empty");
4687 return unscheduledDepsInBundle() == 0 && !isScheduled();
4688 }
4689
4690 /// Returns the bundle of scheduling data, associated with the current
4691 /// instruction.
4692 ArrayRef<ScheduleData *> getBundle() { return Bundle; }
4693 ArrayRef<const ScheduleData *> getBundle() const { return Bundle; }
4694 /// Adds an instruction to the bundle.
    void add(ScheduleData *SD) { Bundle.push_back(SD); }
4696
4697 /// Gets/sets the associated tree entry.
4698 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
4699 TreeEntry *getTreeEntry() const { return TE; }
4700
4701 static ScheduleBundle invalid() { return {false}; }
4702
4703 operator bool() const { return IsValid; }
4704
4705#ifndef NDEBUG
4706 void dump(raw_ostream &OS) const {
4707 if (!*this) {
4708 OS << "[]";
4709 return;
4710 }
4711 OS << '[';
4712 interleaveComma(Bundle, OS,
4713 [&](const ScheduleData *SD) { OS << *SD->getInst(); });
4714 OS << ']';
4715 }
4716
4717 LLVM_DUMP_METHOD void dump() const {
4718 dump(dbgs());
4719 dbgs() << '\n';
4720 }
4721#endif // NDEBUG
4722 };
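  // Illustrative note on bundle readiness (hypothetical per-member values):
  // for a bundle whose members report unscheduled deps {0, 2, 1},
  // unscheduledDepsInBundle() returns 3, so the bundle is not ready yet; if
  // any member still reports InvalidDeps, the whole bundle reports
  // InvalidDeps as well.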
4723
4724#ifndef NDEBUG
4725 friend inline raw_ostream &operator<<(raw_ostream &OS,
4726 const BoUpSLP::ScheduleBundle &Bundle) {
4727 Bundle.dump(OS);
4728 return OS;
4729 }
4730#endif
4731
4732 friend struct GraphTraits<BoUpSLP *>;
4733 friend struct DOTGraphTraits<BoUpSLP *>;
4734
  /// Contains all scheduling data for a basic block.
  /// It does not schedule instructions that are not memory read/write
  /// instructions and whose operands are either constants, arguments, phis,
  /// or instructions from other blocks, or whose users are phis or live in
  /// other blocks. The resulting vector instructions can be placed at the
  /// beginning of the basic block without scheduling (if their operands do
  /// not need to be scheduled) or at the end of the block (if their users are
  /// outside of the block). This saves some compile time and memory used by
  /// the compiler.
  /// ScheduleData is assigned to each instruction in between the boundaries
  /// of the tree entry, even for those that are not part of the graph. It is
  /// required to correctly follow the dependencies between the instructions
  /// and their correct scheduling. ScheduleData is not allocated for
  /// instructions that do not require scheduling, like phis, nodes containing
  /// only extractelements/insertelements, or nodes whose instructions have
  /// uses/operands outside of the block.
4751 struct BlockScheduling {
4752 BlockScheduling(BasicBlock *BB)
4753 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4754
4755 void clear() {
4756 ScheduledBundles.clear();
4757 ScheduledBundlesList.clear();
4758 ReadyInsts.clear();
4759 ScheduleStart = nullptr;
4760 ScheduleEnd = nullptr;
4761 FirstLoadStoreInRegion = nullptr;
4762 LastLoadStoreInRegion = nullptr;
4763 RegionHasStackSave = false;
4764
4765 // Reduce the maximum schedule region size by the size of the
4766 // previous scheduling run.
4767 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4768 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
4769 ScheduleRegionSizeLimit = MinScheduleRegionSize;
4770 ScheduleRegionSize = 0;
4771
4772 // Make a new scheduling region, i.e. all existing ScheduleData is not
4773 // in the new region yet.
4774 ++SchedulingRegionID;
4775 }
4776
4777 ScheduleData *getScheduleData(Instruction *I) {
4778 if (!I)
4779 return nullptr;
4780 if (BB != I->getParent())
4781 // Avoid lookup if can't possibly be in map.
4782 return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
4784 if (SD && isInSchedulingRegion(SD))
4785 return SD;
4786 return nullptr;
4787 }
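    // Illustrative note on the SchedulingRegionID trick: clear() only bumps
    // the ID instead of erasing ScheduleDataMap, so stale ScheduleData from a
    // previous region is filtered out cheaply:
    //   ScheduleData *SD = ScheduleDataMap.lookup(I);
    //   if (SD && SD->getSchedulingRegionID() == SchedulingRegionID)
    //     ...; // SD belongs to the current region
    // This is the same check performed by isInSchedulingRegion(SD).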
4788
4789 ScheduleData *getScheduleData(Value *V) {
      return getScheduleData(dyn_cast<Instruction>(V));
4791 }
4792
4793 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        return {};
      auto It = ScheduledBundles.find(I);
4798 if (It == ScheduledBundles.end())
4799 return {};
4800 return It->getSecond();
4801 }
4802
4803 bool isInSchedulingRegion(ScheduleData *SD) const {
4804 return SD->getSchedulingRegionID() == SchedulingRegionID;
4805 }
4806
4807 bool isInSchedulingRegion(const ScheduleBundle &Bundle) const {
      return all_of(Bundle.getBundle(), [&](const ScheduleData *BundleMember) {
4809 return BundleMember->getSchedulingRegionID() == SchedulingRegionID;
4810 });
4811 }
4812
4813 /// Marks an instruction as scheduled and puts all dependent ready
4814 /// instructions into the ready-list.
4815 template <typename ReadyListType>
4816 void schedule(ScheduleEntity *Data, ReadyListType &ReadyList) {
4817 auto ProcessBundleMember = [&](ScheduleData *BundleMember,
4818 ScheduleBundle *Bundle) {
4819 // Handle the def-use chain dependencies.
4820
4821 // Decrement the unscheduled counter and insert to ready list if ready.
4822 auto DecrUnsched = [&](ScheduleData *Data, bool IsControl = false) {
4823 if ((IsControl || Data->hasValidDependencies()) &&
              Data->incrementUnscheduledDeps(-1) == 0) {
4825 // There are no more unscheduled dependencies after
4826 // decrementing, so we can put the dependent instruction
4827 // into the ready list.
            if (ArrayRef<ScheduleBundle *> Bundles =
                    getScheduleBundles(Data->getInst());
4830 !Bundles.empty()) {
4831 for (ScheduleBundle *Bundle : Bundles) {
4832 if (Bundle->unscheduledDepsInBundle() == 0) {
4833 assert(!Bundle->isScheduled() &&
4834 "already scheduled bundle gets ready");
4835 ReadyList.insert(Bundle);
4836 LLVM_DEBUG(dbgs()
4837 << "SLP: gets ready: " << *Bundle << "\n");
4838 }
4839 }
4840 return;
4841 }
4842 assert(!Data->isScheduled() &&
4843 "already scheduled bundle gets ready");
4844 ReadyList.insert(Data);
4845 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
4846 }
4847 };
4848
4849 auto DecrUnschedForInst = [&](Instruction *I) {
4850 if (ScheduleData *OpSD = getScheduleData(I))
4851 DecrUnsched(OpSD, /*IsControl=*/false);
4852 };
4853
4854 // If BundleMember is a vector bundle, its operands may have been
4855 // reordered during buildTree(). We therefore need to get its operands
4856 // through the TreeEntry.
4857 if (Bundle) {
4858 // Need to search for the lane since the tree entry can be reordered.
4859 auto *In = BundleMember->getInst();
          int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
                                   find(Bundle->getTreeEntry()->Scalars, In));
4862 assert(Lane >= 0 && "Lane not set");
4863
4864 // Since vectorization tree is being built recursively this assertion
4865 // ensures that the tree entry has all operands set before reaching
4866 // this code. Couple of exceptions known at the moment are extracts
          // this code. A couple of known exceptions at the moment are
          // extracts, where their second (immediate) operand is not added.
          // Since immediates do not affect scheduler behavior this is
          // considered okay.
4870 assert(In &&
4871 (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
4872 In->getNumOperands() ==
4873 Bundle->getTreeEntry()->getNumOperands()) &&
4874 "Missed TreeEntry operands?");
4875
          for (unsigned OpIdx :
               seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
            if (auto *I = dyn_cast<Instruction>(
                    Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
4880 LLVM_DEBUG(dbgs()
4881 << "SLP: check for readiness (def): " << *I << "\n");
4882 DecrUnschedForInst(I);
4883 }
4884 } else {
4885 // If BundleMember is a stand-alone instruction, no operand reordering
4886 // has taken place, so we directly access its operands.
4887 for (Use &U : BundleMember->getInst()->operands())
          if (auto *I = dyn_cast<Instruction>(U.get())) {
4889 LLVM_DEBUG(dbgs()
4890 << "SLP: check for readiness (def): " << *I << "\n");
4891 DecrUnschedForInst(I);
4892 }
4893 }
4894 // Handle the memory dependencies.
4895 for (ScheduleData *MemoryDep : BundleMember->getMemoryDependencies()) {
4896 // There are no more unscheduled dependencies after decrementing,
4897 // so we can put the dependent instruction into the ready list.
4898 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
4899 << *MemoryDep << "\n");
4900 DecrUnsched(MemoryDep);
4901 }
4902 // Handle the control dependencies.
4903 for (ScheduleData *Dep : BundleMember->getControlDependencies()) {
4904 // There are no more unscheduled dependencies after decrementing,
4905 // so we can put the dependent instruction into the ready list.
4906 LLVM_DEBUG(dbgs()
4907 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
4908 DecrUnsched(Dep, /*IsControl=*/true);
4909 }
4910 };
      if (auto *SD = dyn_cast<ScheduleData>(Data)) {
4912 SD->setScheduled(/*Scheduled=*/true);
4913 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
4914 ProcessBundleMember(SD, nullptr);
4915 } else {
        ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
4917 Bundle.setScheduled(/*Scheduled=*/true);
4918 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
4919 auto AreAllBundlesScheduled = [&](const ScheduleData *SD) {
          ArrayRef<ScheduleBundle *> SDBundles =
              getScheduleBundles(SD->getInst());
4922 return !SDBundles.empty() &&
4923 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
4924 return SDBundle->isScheduled();
4925 });
4926 };
4927 for (ScheduleData *SD : Bundle.getBundle()) {
4928 if (AreAllBundlesScheduled(SD)) {
4929 SD->setScheduled(/*Scheduled=*/true);
4930 ProcessBundleMember(SD, &Bundle);
4931 }
4932 }
4933 }
4934 }
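    // ReadyListType only needs to provide insert(ScheduleEntity *) for this
    // function; the ReadyInsts SetVector declared below satisfies that. A
    // minimal illustrative stand-in (hypothetical, for exposition only):
    //   struct DummyReadyList {
    //     SmallVector<ScheduleEntity *> Items;
    //     void insert(ScheduleEntity *E) { Items.push_back(E); }
    //   };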
4935
4936 /// Verify basic self consistency properties of the data structure.
4937 void verify() {
4938 if (!ScheduleStart)
4939 return;
4940
4941 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4942 ScheduleStart->comesBefore(ScheduleEnd) &&
4943 "Not a valid scheduling region?");
4944
4945 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
4947 if (!Bundles.empty()) {
4948 for (ScheduleBundle *Bundle : Bundles) {
4949 assert(isInSchedulingRegion(*Bundle) &&
4950 "primary schedule data not in window?");
4951 Bundle->verify();
4952 }
4953 continue;
4954 }
4955 auto *SD = getScheduleData(I);
4956 if (!SD)
4957 continue;
4958 assert(isInSchedulingRegion(SD) &&
4959 "primary schedule data not in window?");
4960 SD->verify();
4961 }
4962
4963 assert(all_of(ReadyInsts,
4964 [](const ScheduleEntity *Bundle) {
4965 return Bundle->isReady();
4966 }) &&
4967 "item in ready list not ready?");
4968 }
4969
4970 /// Put all instructions into the ReadyList which are ready for scheduling.
4971 template <typename ReadyListType>
4972 void initialFillReadyList(ReadyListType &ReadyList) {
4973 SmallPtrSet<ScheduleBundle *, 16> Visited;
4974 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4975 ScheduleData *SD = getScheduleData(I);
4976 if (SD && SD->hasValidDependencies() && SD->isReady()) {
          if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
4978 !Bundles.empty()) {
4979 for (ScheduleBundle *Bundle : Bundles) {
              if (!Visited.insert(Bundle).second)
4981 continue;
4982 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
4983 ReadyList.insert(Bundle);
4984 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
4985 << *Bundle << "\n");
4986 }
4987 }
4988 continue;
4989 }
4990 ReadyList.insert(SD);
4991 LLVM_DEBUG(dbgs()
4992 << "SLP: initially in ready list: " << *SD << "\n");
4993 }
4994 }
4995 }
4996
4997 /// Build a bundle from the ScheduleData nodes corresponding to the
4998 /// scalar instruction for each lane.
4999 ScheduleBundle &buildBundle(ArrayRef<Value *> VL);
5000
5001 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5002 /// cyclic dependencies. This is only a dry-run, no instructions are
5003 /// actually moved at this stage.
5004 /// \returns the scheduling bundle. The returned Optional value is not
5005 /// std::nullopt if \p VL is allowed to be scheduled.
5006 std::optional<ScheduleBundle *>
5007 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5008 const InstructionsState &S);
5009
5010 /// Allocates schedule data chunk.
5011 ScheduleData *allocateScheduleDataChunks();
5012
5013 /// Extends the scheduling region so that V is inside the region.
5014 /// \returns true if the region size is within the limit.
5015 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5016
5017 /// Initialize the ScheduleData structures for new instructions in the
5018 /// scheduling region.
5019 void initScheduleData(Instruction *FromI, Instruction *ToI,
5020 ScheduleData *PrevLoadStore,
5021 ScheduleData *NextLoadStore);
5022
5023 /// Updates the dependency information of a bundle and of all instructions/
5024 /// bundles which depend on the original bundle.
5025 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5026 BoUpSLP *SLP);
5027
    /// Sets all instructions in the scheduling region to un-scheduled.
5029 void resetSchedule();
5030
5031 BasicBlock *BB;
5032
5033 /// Simple memory allocation for ScheduleData.
5034 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
5035
5036 /// The size of a ScheduleData array in ScheduleDataChunks.
5037 int ChunkSize;
5038
5039 /// The allocator position in the current chunk, which is the last entry
5040 /// of ScheduleDataChunks.
5041 int ChunkPos;
5042
5043 /// Attaches ScheduleData to Instruction.
5044 /// Note that the mapping survives during all vectorization iterations, i.e.
5045 /// ScheduleData structures are recycled.
5046 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5047
5048 /// Attaches ScheduleBundle to Instruction.
5049 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5050 ScheduledBundles;
5051 /// The list of ScheduleBundles.
5052 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5053
5054 /// The ready-list for scheduling (only used for the dry-run).
5055 SetVector<ScheduleEntity *> ReadyInsts;
5056
5057 /// The first instruction of the scheduling region.
5058 Instruction *ScheduleStart = nullptr;
5059
5060 /// The first instruction _after_ the scheduling region.
5061 Instruction *ScheduleEnd = nullptr;
5062
5063 /// The first memory accessing instruction in the scheduling region
5064 /// (can be null).
5065 ScheduleData *FirstLoadStoreInRegion = nullptr;
5066
5067 /// The last memory accessing instruction in the scheduling region
5068 /// (can be null).
5069 ScheduleData *LastLoadStoreInRegion = nullptr;
5070
5071 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5072 /// region? Used to optimize the dependence calculation for the
5073 /// common case where there isn't.
5074 bool RegionHasStackSave = false;
5075
5076 /// The current size of the scheduling region.
5077 int ScheduleRegionSize = 0;
5078
5079 /// The maximum size allowed for the scheduling region.
5080 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
5081
5082 /// The ID of the scheduling region. For a new vectorization iteration this
5083 /// is incremented which "removes" all ScheduleData from the region.
5084 /// Make sure that the initial SchedulingRegionID is greater than the
5085 /// initial SchedulingRegionID in ScheduleData (which is 0).
5086 int SchedulingRegionID = 1;
5087 };
5088
5089 /// Attaches the BlockScheduling structures to basic blocks.
5090 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5091
5092 /// Performs the "real" scheduling. Done before vectorization is actually
5093 /// performed in a basic block.
5094 void scheduleBlock(BlockScheduling *BS);
5095
5096 /// List of users to ignore during scheduling and that don't need extracting.
5097 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
5098
5099 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
5100 /// sorted SmallVectors of unsigned.
5101 struct OrdersTypeDenseMapInfo {
5102 static OrdersType getEmptyKey() {
5103 OrdersType V;
      V.push_back(~1U);
5105 return V;
5106 }
5107
5108 static OrdersType getTombstoneKey() {
5109 OrdersType V;
      V.push_back(~2U);
5111 return V;
5112 }
5113
5114 static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V));
5116 }
5117
5118 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
5119 return LHS == RHS;
5120 }
5121 };
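  // Illustrative usage (hypothetical map, not a member of this class): the
  // traits above allow order vectors to be used directly as hash-map keys:
  //   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrderCounters;
  //   OrdersType Order = {1, 0, 3, 2};
  //   ++OrderCounters[Order];
  // The reserved keys {~1U} and {~2U} must never be inserted by clients.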
5122
5123 // Analysis and block reference.
5124 Function *F;
5125 ScalarEvolution *SE;
5126 TargetTransformInfo *TTI;
5127 TargetLibraryInfo *TLI;
5128 LoopInfo *LI;
5129 DominatorTree *DT;
5130 AssumptionCache *AC;
5131 DemandedBits *DB;
5132 const DataLayout *DL;
5133 OptimizationRemarkEmitter *ORE;
5134
5135 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
5136 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
5137
5138 /// Instruction builder to construct the vectorized tree.
5139 IRBuilder<TargetFolder> Builder;
5140
5141 /// A map of scalar integer values to the smallest bit width with which they
5142 /// can legally be represented. The values map to (width, signed) pairs,
5143 /// where "width" indicates the minimum bit width and "signed" is True if the
5144 /// value must be signed-extended, rather than zero-extended, back to its
5145 /// original width.
5146 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
5147
5148 /// Final size of the reduced vector, if the current graph represents the
5149 /// input for the reduction and it was possible to narrow the size of the
5150 /// reduction.
5151 unsigned ReductionBitWidth = 0;
5152
5153 /// Canonical graph size before the transformations.
5154 unsigned BaseGraphSize = 1;
5155
5156 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
5157 /// type sizes, used in the tree.
5158 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
5159
  /// Indices of the vectorized nodes, which are supposed to be the roots of
  /// the new bitwidth analysis attempt, like trunc, IToFP or ICmp.
5162 DenseSet<unsigned> ExtraBitWidthNodes;
5163};
5164
5165} // end namespace slpvectorizer
5166
5167template <> struct GraphTraits<BoUpSLP *> {
5168 using TreeEntry = BoUpSLP::TreeEntry;
5169
5170 /// NodeRef has to be a pointer per the GraphWriter.
5171 using NodeRef = TreeEntry *;
5172
5173 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
5174
5175 /// Add the VectorizableTree to the index iterator to be able to return
5176 /// TreeEntry pointers.
5177 struct ChildIteratorType
5178 : public iterator_adaptor_base<
5179 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
5180 ContainerTy &VectorizableTree;
5181
5182 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
5183 ContainerTy &VT)
5184 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
5185
5186 NodeRef operator*() { return I->UserTE; }
5187 };
5188
5189 static NodeRef getEntryNode(BoUpSLP &R) {
5190 return R.VectorizableTree[0].get();
5191 }
5192
5193 static ChildIteratorType child_begin(NodeRef N) {
5194 return {&N->UserTreeIndex, N->Container};
5195 }
5196
5197 static ChildIteratorType child_end(NodeRef N) {
5198 return {&N->UserTreeIndex + 1, N->Container};
5199 }
5200
5201 /// For the node iterator we just need to turn the TreeEntry iterator into a
5202 /// TreeEntry* iterator so that it dereferences to NodeRef.
5203 class nodes_iterator {
5204 using ItTy = ContainerTy::iterator;
5205 ItTy It;
5206
5207 public:
5208 nodes_iterator(const ItTy &It2) : It(It2) {}
5209 NodeRef operator*() { return It->get(); }
5210 nodes_iterator operator++() {
5211 ++It;
5212 return *this;
5213 }
5214 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
5215 };
5216
5217 static nodes_iterator nodes_begin(BoUpSLP *R) {
5218 return nodes_iterator(R->VectorizableTree.begin());
5219 }
5220
5221 static nodes_iterator nodes_end(BoUpSLP *R) {
5222 return nodes_iterator(R->VectorizableTree.end());
5223 }
5224
5225 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
5226};
5227
5228template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
5229 using TreeEntry = BoUpSLP::TreeEntry;
5230
5231 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
5232
5233 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
5234 std::string Str;
5235 raw_string_ostream OS(Str);
5236 OS << Entry->Idx << ".\n";
    if (isSplat(Entry->Scalars))
5238 OS << "<splat> ";
5239 for (auto *V : Entry->Scalars) {
5240 OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
5242 return EU.Scalar == V;
5243 }))
5244 OS << " <extract>";
5245 OS << "\n";
5246 }
5247 return Str;
5248 }
5249
5250 static std::string getNodeAttributes(const TreeEntry *Entry,
5251 const BoUpSLP *) {
5252 if (Entry->isGather())
5253 return "color=red";
5254 if (Entry->State == TreeEntry::ScatterVectorize ||
5255 Entry->State == TreeEntry::StridedVectorize ||
5256 Entry->State == TreeEntry::CompressVectorize)
5257 return "color=blue";
5258 return "";
5259 }
5260};
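// With the GraphTraits and DOTGraphTraits specializations above, the generic
// graph utilities from GraphWriter.h can be applied to a BoUpSLP instance.
// Illustrative debug snippet (hypothetical call site inside the pass):
//   ViewGraph(this, "slp-tree");
// This renders the vectorizable tree as a DOT graph, with gather nodes drawn
// in red and scatter/strided/compress nodes in blue, per getNodeAttributes().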
5261
5262} // end namespace llvm
5263
5264BoUpSLP::~BoUpSLP() {
5265 SmallVector<WeakTrackingVH> DeadInsts;
5266 for (auto *I : DeletedInstructions) {
5267 if (!I->getParent()) {
      // Temporarily insert instructions back into a block so they can be
      // erased from their parent and from memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
5283 }
5284 I->dropAllReferences();
5285 }
5286 for (auto *I : DeletedInstructions) {
5287 assert(I->use_empty() &&
5288 "trying to erase instruction with users.");
5289 I->eraseFromParent();
5290 }
5291
5292 // Cleanup any dead scalar code feeding the vectorized instructions
5293 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
5294
5295#ifdef EXPENSIVE_CHECKS
5296 // If we could guarantee that this call is not extremely slow, we could
5297 // remove the ifdef limitation (see PR47712).
5298 assert(!verifyFunction(*F, &dbgs()));
5299#endif
5300}
5301
/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains the original mask for the scalars reused in the node. The
/// procedure transforms this mask in accordance with the given \p Mask.
5305static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
5306 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
5307 "Expected non-empty mask.");
5308 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
5310 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
5311 if (Mask[I] != PoisonMaskElem)
5312 Reuses[Mask[I]] = Prev[I];
5313}
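// Illustrative example (hypothetical values): with Reuses = {0, 1, 2, 3} and
// Mask = {2, 0, 3, 1}, each previous element Prev[I] is moved to slot Mask[I],
// giving Reuses = {1, 3, 0, 2}. A PoisonMaskElem entry in Mask leaves the
// corresponding previous value unplaced.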
5314
/// Reorders the given \p Order according to the given \p Mask. \p Order is
/// the original order of the scalars. The procedure transforms the provided
/// order in accordance with the given \p Mask. If the resulting \p Order is
/// just an identity order, \p Order is cleared.
5319static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
5320 bool BottomOrder = false) {
5321 assert(!Mask.empty() && "Expected non-empty mask.");
5322 unsigned Sz = Mask.size();
5323 if (BottomOrder) {
5324 SmallVector<unsigned> PrevOrder;
5325 if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
5336 return Data.value() == Sz || Data.index() == Data.value();
5337 })) {
5338 Order.clear();
5339 return;
5340 }
5341 fixupOrderingIndices(Order);
5342 return;
5343 }
5344 SmallVector<int> MaskOrder;
5345 if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
5357 for (unsigned I = 0; I < Sz; ++I)
5358 if (MaskOrder[I] != PoisonMaskElem)
5359 Order[MaskOrder[I]] = I;
5360 fixupOrderingIndices(Order);
5361}
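// Illustrative example (hypothetical values, BottomOrder == false): for an
// empty Order and Mask = {1, 0, 3, 2}, MaskOrder starts as the identity, is
// permuted by reorderReuses() to {1, 0, 3, 2}, and the final Order becomes
// {1, 0, 3, 2}. If the permuted MaskOrder had been an identity mask, Order
// would have been cleared instead.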
5362
5363std::optional<BoUpSLP::OrdersType>
5364BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
5365 bool TopToBottom, bool IgnoreReorder) {
5366 assert(TE.isGather() && "Expected gather node only.");
5367 // Try to find subvector extract/insert patterns and reorder only such
5368 // patterns.
5369 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
5370 Type *ScalarTy = GatheredScalars.front()->getType();
5371 size_t NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
5383 /*ForOrder=*/true);
5384 // No shuffled operands - ignore.
5385 if (GatherShuffles.empty() && ExtractShuffles.empty())
5386 return std::nullopt;
5387 OrdersType CurrentOrder(NumScalars, NumScalars);
5388 if (GatherShuffles.size() == 1 &&
5389 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
5390 Entries.front().front()->isSame(VL: TE.Scalars)) {
5391 // If the full matched node in whole tree rotation - no need to consider the
5392 // matching order, rotating the whole tree.
5393 if (TopToBottom)
5394 return std::nullopt;
5395 // No need to keep the order for the same user node.
5396 if (Entries.front().front()->UserTreeIndex.UserTE ==
5397 TE.UserTreeIndex.UserTE)
5398 return std::nullopt;
5399 // No need to keep the order for the matched root node, if it can be freely
5400 // reordered.
5401 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
5402 return std::nullopt;
5403 // If shuffling 2 elements only and the matching node has reverse reuses -
5404 // no need to count order, both work fine.
5405 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
5406 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
5407 any_of(Range: enumerate(First: Entries.front().front()->ReuseShuffleIndices),
5408 P: [](const auto &P) {
5409 return P.value() % 2 != static_cast<int>(P.index()) % 2;
5410 }))
5411 return std::nullopt;
5412
5413 // Perfect match in the graph, will reuse the previously vectorized
5414 // node. Cost is 0.
5415 std::iota(first: CurrentOrder.begin(), last: CurrentOrder.end(), value: 0);
5416 return CurrentOrder;
5417 }
5418 auto IsSplatMask = [](ArrayRef<int> Mask) {
5419 int SingleElt = PoisonMaskElem;
5420 return all_of(Range&: Mask, P: [&](int I) {
5421 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
5422 SingleElt = I;
5423 return I == PoisonMaskElem || I == SingleElt;
5424 });
5425 };
5426 // Exclusive broadcast mask - ignore.
5427 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
5428 (Entries.size() != 1 ||
5429 Entries.front().front()->ReorderIndices.empty())) ||
5430 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
5431 return std::nullopt;
5432 SmallBitVector ShuffledSubMasks(NumParts);
5433 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
5434 ArrayRef<int> Mask, int PartSz, int NumParts,
5435 function_ref<unsigned(unsigned)> GetVF) {
5436 for (int I : seq<int>(Begin: 0, End: NumParts)) {
5437 if (ShuffledSubMasks.test(Idx: I))
5438 continue;
5439 const int VF = GetVF(I);
5440 if (VF == 0)
5441 continue;
5442 unsigned Limit = getNumElems(Size: CurrentOrder.size(), PartNumElems: PartSz, Part: I);
5443 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(N: I * PartSz, M: Limit);
5444 // Shuffle of at least 2 vectors - ignore.
5445 if (any_of(Range&: Slice, P: [&](unsigned I) { return I != NumScalars; })) {
5446 llvm::fill(Range&: Slice, Value&: NumScalars);
5447 ShuffledSubMasks.set(I);
5448 continue;
5449 }
5450 // Try to include as much elements from the mask as possible.
5451 int FirstMin = INT_MAX;
5452 int SecondVecFound = false;
5453 for (int K : seq<int>(Size: Limit)) {
5454 int Idx = Mask[I * PartSz + K];
5455 if (Idx == PoisonMaskElem) {
5456 Value *V = GatheredScalars[I * PartSz + K];
5457 if (isConstant(V) && !isa<PoisonValue>(Val: V)) {
5458 SecondVecFound = true;
5459 break;
5460 }
5461 continue;
5462 }
5463 if (Idx < VF) {
5464 if (FirstMin > Idx)
5465 FirstMin = Idx;
5466 } else {
5467 SecondVecFound = true;
5468 break;
5469 }
5470 }
5471 FirstMin = (FirstMin / PartSz) * PartSz;
5472 // Shuffle of at least 2 vectors - ignore.
5473 if (SecondVecFound) {
5474 llvm::fill(Range&: Slice, Value&: NumScalars);
5475 ShuffledSubMasks.set(I);
5476 continue;
5477 }
5478 for (int K : seq<int>(Size: Limit)) {
5479 int Idx = Mask[I * PartSz + K];
5480 if (Idx == PoisonMaskElem)
5481 continue;
5482 Idx -= FirstMin;
5483 if (Idx >= PartSz) {
5484 SecondVecFound = true;
5485 break;
5486 }
5487 if (CurrentOrder[I * PartSz + Idx] >
5488 static_cast<unsigned>(I * PartSz + K) &&
5489 CurrentOrder[I * PartSz + Idx] !=
5490 static_cast<unsigned>(I * PartSz + Idx))
5491 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
5492 }
5493 // Shuffle of at least 2 vectors - ignore.
5494 if (SecondVecFound) {
5495 llvm::fill(Range&: Slice, Value&: NumScalars);
5496 ShuffledSubMasks.set(I);
5497 continue;
5498 }
5499 }
5500 };
5501 int PartSz = getPartNumElems(Size: NumScalars, NumParts);
5502 if (!ExtractShuffles.empty())
5503 TransformMaskToOrder(
5504 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
5505 if (!ExtractShuffles[I])
5506 return 0U;
5507 unsigned VF = 0;
5508 unsigned Sz = getNumElems(Size: TE.getVectorFactor(), PartNumElems: PartSz, Part: I);
5509 for (unsigned Idx : seq<unsigned>(Size: Sz)) {
5510 int K = I * PartSz + Idx;
5511 if (ExtractMask[K] == PoisonMaskElem)
5512 continue;
5513 if (!TE.ReuseShuffleIndices.empty())
5514 K = TE.ReuseShuffleIndices[K];
5515 if (K == PoisonMaskElem)
5516 continue;
5517 if (!TE.ReorderIndices.empty())
5518 K = std::distance(first: TE.ReorderIndices.begin(),
5519 last: find(Range: TE.ReorderIndices, Val: K));
5520 auto *EI = dyn_cast<ExtractElementInst>(Val: TE.Scalars[K]);
5521 if (!EI)
5522 continue;
5523 VF = std::max(a: VF, b: cast<VectorType>(Val: EI->getVectorOperandType())
5524 ->getElementCount()
5525 .getKnownMinValue());
5526 }
5527 return VF;
5528 });
5529 // Check special corner case - single shuffle of the same entry.
5530 if (GatherShuffles.size() == 1 && NumParts != 1) {
5531 if (ShuffledSubMasks.any())
5532 return std::nullopt;
5533 PartSz = NumScalars;
5534 NumParts = 1;
5535 }
5536 if (!Entries.empty())
5537 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
5538 if (!GatherShuffles[I])
5539 return 0U;
5540 return std::max(a: Entries[I].front()->getVectorFactor(),
5541 b: Entries[I].back()->getVectorFactor());
5542 });
5543 unsigned NumUndefs =
5544 count_if(Range&: CurrentOrder, P: [&](unsigned Idx) { return Idx == NumScalars; });
5545 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
5546 return std::nullopt;
5547 return std::move(CurrentOrder);
5548}
5549
5550static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
5551 const TargetLibraryInfo &TLI,
5552 bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
      getUnderlyingObject(Ptr2, RecursionMaxDepth))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
5565}
5566
5567/// Calculates minimal alignment as a common alignment.
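/// For example (illustrative), for a bundle of loads with alignments
/// {16, 8, 4} this returns Align(4), i.e. the minimum alignment over the
/// whole bundle.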
5568template <typename T>
5569static Align computeCommonAlignment(ArrayRef<Value *> VL) {
5570 Align CommonAlignment = cast<T>(VL.front())->getAlign();
5571 for (Value *V : VL.drop_front())
5572 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
5573 return CommonAlignment;
5574}
5575
5576/// Check if \p Order represents reverse order.
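/// For example (illustrative), for an order of size 4, {3, 2, 1, 0} is a
/// reverse order; an entry equal to the order size (e.g. {3, 4, 1, 0}) is
/// treated as unspecified and does not break the pattern.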
5577static bool isReverseOrder(ArrayRef<unsigned> Order) {
5578 assert(!Order.empty() &&
5579 "Order is empty. Please check it before using isReverseOrder.");
5580 unsigned Sz = Order.size();
5581 return all_of(Range: enumerate(First&: Order), P: [&](const auto &Pair) {
5582 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
5583 });
5584}
5585
5586 /// Checks if the provided list of pointers \p PointerOps represents
5587 /// strided pointers for type ElemTy. If they do not, std::nullopt is
5588 /// returned. Otherwise, if \p Inst is not specified, an engaged optional
5589 /// value is returned to show that the pointers are strided. If \p Inst is
5590 /// specified, the runtime stride is materialized before the given \p Inst.
5591 /// \returns std::nullopt if the pointers have no runtime stride; otherwise
5592 /// nullptr or the actual stride value.
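/// For example (an illustrative sketch): for i32 elements and pointers at
/// byte offsets %p, %p + 4 * %n, %p + 8 * %n, %p + 12 * %n, the runtime
/// stride is %n elements, and SortedIndices stays empty because the pointers
/// are already in stride order.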
5593static std::optional<Value *>
5594calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
5595 const DataLayout &DL, ScalarEvolution &SE,
5596 SmallVectorImpl<unsigned> &SortedIndices,
5597 Instruction *Inst = nullptr) {
5598 SmallVector<const SCEV *> SCEVs;
5599 const SCEV *PtrSCEVLowest = nullptr;
5600 const SCEV *PtrSCEVHighest = nullptr;
5601 // Find the lower/upper pointers from PointerOps (i.e. the ones with the
5602 // lowest and highest addresses).
5603 for (Value *Ptr : PointerOps) {
5604 const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
5605 if (!PtrSCEV)
5606 return std::nullopt;
5607 SCEVs.push_back(Elt: PtrSCEV);
5608 if (!PtrSCEVLowest && !PtrSCEVHighest) {
5609 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
5610 continue;
5611 }
5612 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
5613 if (isa<SCEVCouldNotCompute>(Val: Diff))
5614 return std::nullopt;
5615 if (Diff->isNonConstantNegative()) {
5616 PtrSCEVLowest = PtrSCEV;
5617 continue;
5618 }
5619 const SCEV *Diff1 = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEV);
5620 if (isa<SCEVCouldNotCompute>(Val: Diff1))
5621 return std::nullopt;
5622 if (Diff1->isNonConstantNegative()) {
5623 PtrSCEVHighest = PtrSCEV;
5624 continue;
5625 }
5626 }
5627 // Dist = PtrSCEVHighest - PtrSCEVLowest;
5628 const SCEV *Dist = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEVLowest);
5629 if (isa<SCEVCouldNotCompute>(Val: Dist))
5630 return std::nullopt;
5631 int Size = DL.getTypeStoreSize(Ty: ElemTy);
5632 auto TryGetStride = [&](const SCEV *Dist,
5633 const SCEV *Multiplier) -> const SCEV * {
5634 if (const auto *M = dyn_cast<SCEVMulExpr>(Val: Dist)) {
5635 if (M->getOperand(i: 0) == Multiplier)
5636 return M->getOperand(i: 1);
5637 if (M->getOperand(i: 1) == Multiplier)
5638 return M->getOperand(i: 0);
5639 return nullptr;
5640 }
5641 if (Multiplier == Dist)
5642 return SE.getConstant(Ty: Dist->getType(), V: 1);
5643 return SE.getUDivExactExpr(LHS: Dist, RHS: Multiplier);
5644 };
5645 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
5646 const SCEV *Stride = nullptr;
5647 if (Size != 1 || SCEVs.size() > 2) {
5648 const SCEV *Sz = SE.getConstant(Ty: Dist->getType(), V: Size * (SCEVs.size() - 1));
5649 Stride = TryGetStride(Dist, Sz);
5650 if (!Stride)
5651 return std::nullopt;
5652 }
5653 if (!Stride || isa<SCEVConstant>(Val: Stride))
5654 return std::nullopt;
5655 // Iterate through all pointers and check if all distances are
5656 // unique multiples of Stride.
5657 using DistOrdPair = std::pair<int64_t, int>;
5658 auto Compare = llvm::less_first();
5659 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
5660 int Cnt = 0;
5661 bool IsConsecutive = true;
5662 for (const SCEV *PtrSCEV : SCEVs) {
5663 unsigned Dist = 0;
5664 if (PtrSCEV != PtrSCEVLowest) {
5665 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
5666 const SCEV *Coeff = TryGetStride(Diff, Stride);
5667 if (!Coeff)
5668 return std::nullopt;
5669 const auto *SC = dyn_cast<SCEVConstant>(Val: Coeff);
5670 if (!SC || isa<SCEVCouldNotCompute>(Val: SC))
5671 return std::nullopt;
5672 if (!SE.getMinusSCEV(LHS: PtrSCEV, RHS: SE.getAddExpr(LHS: PtrSCEVLowest,
5673 RHS: SE.getMulExpr(LHS: Stride, RHS: SC)))
5674 ->isZero())
5675 return std::nullopt;
5676 Dist = SC->getAPInt().getZExtValue();
5677 }
5678 // If the strides are not the same or repeated, we can't vectorize.
5679 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
5680 return std::nullopt;
5681 auto Res = Offsets.emplace(args&: Dist, args&: Cnt);
5682 if (!Res.second)
5683 return std::nullopt;
5684 // Consecutive order if the inserted element is the last one.
5685 IsConsecutive = IsConsecutive && std::next(x: Res.first) == Offsets.end();
5686 ++Cnt;
5687 }
5688 if (Offsets.size() != SCEVs.size())
5689 return std::nullopt;
5690 SortedIndices.clear();
5691 if (!IsConsecutive) {
5692 // Fill SortedIndices array only if it is non-consecutive.
5693 SortedIndices.resize(N: PointerOps.size());
5694 Cnt = 0;
5695 for (const std::pair<int64_t, int> &Pair : Offsets) {
5696 SortedIndices[Cnt] = Pair.second;
5697 ++Cnt;
5698 }
5699 }
5700 if (!Inst)
5701 return nullptr;
5702 SCEVExpander Expander(SE, DL, "strided-load-vec");
5703 return Expander.expandCodeFor(SH: Stride, Ty: Stride->getType(), I: Inst);
5704}
5705
5706static std::pair<InstructionCost, InstructionCost>
5707getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
5708 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
5709 Type *ScalarTy, VectorType *VecTy);
5710
5711/// Returns the cost of the shuffle instructions with the given \p Kind, vector
5712 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
5713/// subvector pattern.
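/// For example (illustrative), with two 8-element sources the mask
/// {0, 1, 8, 9, 4, 5, 6, 7} matches an insert-subvector pattern: a 2-element
/// subvector taken from the second source is inserted at index 2 of the
/// first source.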
5714static InstructionCost
5715getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
5716 VectorType *Tp, ArrayRef<int> Mask = {},
5717 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
5718 int Index = 0, VectorType *SubTp = nullptr,
5719 ArrayRef<const Value *> Args = {}) {
5720 VectorType *DstTy = Tp;
5721 if (!Mask.empty())
5722 DstTy = FixedVectorType::get(ElementType: Tp->getScalarType(), NumElts: Mask.size());
5723
5724 if (Kind != TTI::SK_PermuteTwoSrc)
5725 return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp,
5726 Args);
5727 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
5728 int NumSubElts;
5729 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
5730 Mask, NumSrcElts, NumSubElts, Index)) {
5731 if (Index + NumSubElts > NumSrcElts &&
5732 Index + NumSrcElts <= static_cast<int>(Mask.size()))
5733 return TTI.getShuffleCost(Kind: TTI::SK_InsertSubvector, DstTy, SrcTy: Tp, Mask,
5734 CostKind: TTI::TCK_RecipThroughput, Index, SubTp: Tp);
5735 }
5736 return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp,
5737 Args);
5738}
5739
5740/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
5741/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
5742/// instead of a scalar.
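/// For example (illustrative), with ScalarTy == <4 x i32> and Ty == <16 x i32>
/// each demanded "element" is a whole 4-wide subvector, so the overhead is
/// modeled with insert/extract-subvector shuffles at indices 0, 4, 8 and 12
/// instead of insertelement/extractelement costs.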
5743static InstructionCost
5744getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
5745 VectorType *Ty, const APInt &DemandedElts, bool Insert,
5746 bool Extract, TTI::TargetCostKind CostKind,
5747 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
5748 assert(!isa<ScalableVectorType>(Ty) &&
5749 "ScalableVectorType is not supported.");
5750 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
5751 getNumElements(Ty) &&
5752 "Incorrect usage.");
5753 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
5754 assert(SLPReVec && "Only supported by REVEC.");
5755 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
5756 // of CreateInsertElement.
5757 unsigned ScalarTyNumElements = VecTy->getNumElements();
5758 InstructionCost Cost = 0;
5759 for (unsigned I : seq(Size: DemandedElts.getBitWidth())) {
5760 if (!DemandedElts[I])
5761 continue;
5762 if (Insert)
5763 Cost += getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: Ty, Mask: {}, CostKind,
5764 Index: I * ScalarTyNumElements, SubTp: VecTy);
5765 if (Extract)
5766 Cost += getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: Ty, Mask: {}, CostKind,
5767 Index: I * ScalarTyNumElements, SubTp: VecTy);
5768 }
5769 return Cost;
5770 }
5771 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
5772 CostKind, ForPoisonSrc, VL);
5773}
5774
5775/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
5776/// is a FixedVectorType, a vector will be extracted instead of a scalar.
5777static InstructionCost getVectorInstrCost(
5778 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
5779 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
5780 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
5781 if (Opcode == Instruction::ExtractElement) {
5782 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
5783 assert(SLPReVec && "Only supported by REVEC.");
5784 assert(isa<VectorType>(Val) && "Val must be a vector type.");
5785 return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector,
5786 Tp: cast<VectorType>(Val), Mask: {}, CostKind,
5787 Index: Index * VecTy->getNumElements(), SubTp: VecTy);
5788 }
5789 }
5790 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
5791 ScalarUserAndIdx);
5792}
5793
5794/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
5795/// is a FixedVectorType, a vector will be extracted instead of a scalar.
5796static InstructionCost getExtractWithExtendCost(
5797 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
5798 VectorType *VecTy, unsigned Index,
5799 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
5800 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Val: Dst)) {
5801 assert(SLPReVec && "Only supported by REVEC.");
5802 auto *SubTp =
5803 getWidenedType(ScalarTy: VecTy->getElementType(), VF: ScalarTy->getNumElements());
5804 return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: VecTy, Mask: {}, CostKind,
5805 Index: Index * ScalarTy->getNumElements(), SubTp) +
5806 TTI.getCastInstrCost(Opcode, Dst, Src: SubTp, CCH: TTI::CastContextHint::None,
5807 CostKind);
5808 }
5809 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
5810}
5811
5812 /// Correctly creates insert_subvector, checking that the index is a multiple
5813 /// of the subvector length. Otherwise, generates a shuffle using \p Generator
5814 /// or a default shuffle.
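/// For example (illustrative), inserting a <2 x float> value into an
/// <8 x float> vector at index 4 uses insert_subvector directly, while index 3
/// is not a multiple of 2 and is lowered to a shuffle with the mask
/// {0, 1, 2, 8, 9, 5, 6, 7}.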
5815static Value *createInsertVector(
5816 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
5817 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
5818 const unsigned SubVecVF = getNumElements(Ty: V->getType());
5819 if (Index % SubVecVF == 0) {
5820 Vec = Builder.CreateInsertVector(DstType: Vec->getType(), SrcVec: Vec, SubVec: V, Idx: Index);
5821 } else {
5822 // Create shuffle, insertvector requires that index is multiple of
5823 // the subvector length.
5824 const unsigned VecVF = getNumElements(Ty: Vec->getType());
5825 SmallVector<int> Mask(VecVF, PoisonMaskElem);
5826 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
5827 for (unsigned I : seq<unsigned>(Size: SubVecVF))
5828 Mask[I + Index] = I + VecVF;
5829 if (Generator) {
5830 Vec = Generator(Vec, V, Mask);
5831 } else {
5832 // 1. Resize V to the size of Vec.
5833 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
5834 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: SubVecVF), value: 0);
5835 V = Builder.CreateShuffleVector(V, Mask: ResizeMask);
5836 Vec = Builder.CreateShuffleVector(V1: Vec, V2: V, Mask);
5837 }
5838 }
5839 return Vec;
5840}
5841
5842 /// Correctly creates extract_subvector, checking that the index is a multiple
5843 /// of the subvector length. Otherwise, generates a shuffle to extract the
5844 /// subvector.
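/// For example (illustrative), extracting a 2-element subvector from an
/// 8-element vector at index 4 uses extract_subvector directly, while index 3
/// is lowered to a single-source shuffle with the mask {3, 4}.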
5845static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
5846 unsigned SubVecVF, unsigned Index) {
5847 if (Index % SubVecVF == 0) {
5848 VectorType *SubVecTy =
5849 getWidenedType(ScalarTy: Vec->getType()->getScalarType(), VF: SubVecVF);
5850 return Builder.CreateExtractVector(DstType: SubVecTy, SrcVec: Vec, Idx: Index);
5851 }
5852 // Create shuffle, extract_subvector requires that index is multiple of
5853 // the subvector length.
5854 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
5855 std::iota(first: Mask.begin(), last: Mask.end(), value: Index);
5856 return Builder.CreateShuffleVector(V: Vec, Mask);
5857}
5858
5859 /// Builds a compress-like shuffle mask for the given \p PointerOps, ordered
5860 /// with \p Order.
5861 /// \return true if the mask represents strided access, false otherwise.
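/// For example (illustrative), pointers at element distances {0, 2, 4, 6} from
/// the first (ordered) pointer produce CompressMask == {0, 2, 4, 6} and the
/// function returns true (stride 2), while distances {0, 1, 3, 7} produce
/// CompressMask == {0, 1, 3, 7} and the function returns false.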
5862static bool buildCompressMask(ArrayRef<Value *> PointerOps,
5863 ArrayRef<unsigned> Order, Type *ScalarTy,
5864 const DataLayout &DL, ScalarEvolution &SE,
5865 SmallVectorImpl<int> &CompressMask) {
5866 const unsigned Sz = PointerOps.size();
5867 CompressMask.assign(NumElts: Sz, Elt: PoisonMaskElem);
5868 // The first element is always set.
5869 CompressMask[0] = 0;
5870 // Check if the mask represents strided access.
5871 std::optional<unsigned> Stride = 0;
5872 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
5873 for (unsigned I : seq<unsigned>(Begin: 1, End: Sz)) {
5874 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
5875 std::optional<int64_t> OptPos =
5876 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL, SE);
5877 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
5878 return false;
5879 unsigned Pos = static_cast<unsigned>(*OptPos);
5880 CompressMask[I] = Pos;
5881 if (!Stride)
5882 continue;
5883 if (*Stride == 0) {
5884 *Stride = Pos;
5885 continue;
5886 }
5887 if (Pos != *Stride * I)
5888 Stride.reset();
5889 }
5890 return Stride.has_value();
5891}
5892
5893 /// Checks if the \p VL can be transformed into a (masked) load + compress or
5894/// (masked) interleaved load.
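/// For example (an illustrative sketch), four i32 loads at element offsets
/// {0, 2, 4, 6} may be emitted as one (masked) load of 7 consecutive elements
/// followed by a compressing shuffle with the mask {0, 2, 4, 6}, provided the
/// cost model prefers this over gathering the scalars.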
5895static bool isMaskedLoadCompress(
5896 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
5897 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
5898 const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
5899 const DominatorTree &DT, const TargetLibraryInfo &TLI,
5900 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
5901 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
5902 VectorType *&LoadVecTy) {
5903 InterleaveFactor = 0;
5904 Type *ScalarTy = VL.front()->getType();
5905 const size_t Sz = VL.size();
5906 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
5907 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5908 SmallVector<int> Mask;
5909 if (!Order.empty())
5910 inversePermutation(Indices: Order, Mask);
5911 // Check external uses.
5912 for (const auto [I, V] : enumerate(First&: VL)) {
5913 if (AreAllUsersVectorized(V))
5914 continue;
5915 InstructionCost ExtractCost =
5916 TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, CostKind,
5917 Index: Mask.empty() ? I : Mask[I]);
5918 InstructionCost ScalarCost =
5919 TTI.getInstructionCost(U: cast<Instruction>(Val: V), CostKind);
5920 if (ExtractCost <= ScalarCost)
5921 return false;
5922 }
5923 Value *Ptr0;
5924 Value *PtrN;
5925 if (Order.empty()) {
5926 Ptr0 = PointerOps.front();
5927 PtrN = PointerOps.back();
5928 } else {
5929 Ptr0 = PointerOps[Order.front()];
5930 PtrN = PointerOps[Order.back()];
5931 }
5932 std::optional<int64_t> Diff =
5933 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL, SE);
5934 if (!Diff)
5935 return false;
5936 const size_t MaxRegSize =
5937 TTI.getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
5938 .getFixedValue();
5939 // Check for very large distances between elements.
5940 if (*Diff / Sz >= MaxRegSize / 8)
5941 return false;
5942 LoadVecTy = getWidenedType(ScalarTy, VF: *Diff + 1);
5943 auto *LI = cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()]);
5944 Align CommonAlignment = LI->getAlign();
5945 IsMasked = !isSafeToLoadUnconditionally(
5946 V: Ptr0, Ty: LoadVecTy, Alignment: CommonAlignment, DL,
5947 ScanFrom: cast<LoadInst>(Val: Order.empty() ? VL.back() : VL[Order.back()]), AC: &AC, DT: &DT,
5948 TLI: &TLI);
5949 if (IsMasked && !TTI.isLegalMaskedLoad(DataType: LoadVecTy, Alignment: CommonAlignment,
5950 AddressSpace: LI->getPointerAddressSpace()))
5951 return false;
5952 // TODO: perform the analysis of each scalar load for better
5953 // safe-load-unconditionally analysis.
5954 bool IsStrided =
5955 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
5956 assert(CompressMask.size() >= 2 && "At least two elements are required");
5957 SmallVector<Value *> OrderedPointerOps(PointerOps);
5958 if (!Order.empty())
5959 reorderScalars(Scalars&: OrderedPointerOps, Mask);
5960 auto [ScalarGEPCost, VectorGEPCost] =
5961 getGEPCosts(TTI, Ptrs: OrderedPointerOps, BasePtr: OrderedPointerOps.front(),
5962 Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy: LoadVecTy);
5963 // The cost of scalar loads.
5964 InstructionCost ScalarLoadsCost =
5965 std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(),
5966 binary_op: [&](InstructionCost C, Value *V) {
5967 return C + TTI.getInstructionCost(U: cast<Instruction>(Val: V),
5968 CostKind);
5969 }) +
5970 ScalarGEPCost;
5971 APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
5972 InstructionCost GatherCost =
5973 getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
5974 /*Insert=*/true,
5975 /*Extract=*/false, CostKind) +
5976 ScalarLoadsCost;
5977 InstructionCost LoadCost = 0;
5978 if (IsMasked) {
5979 LoadCost =
5980 TTI.getMaskedMemoryOpCost(Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
5981 AddressSpace: LI->getPointerAddressSpace(), CostKind);
5982 } else {
5983 LoadCost =
5984 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
5985 AddressSpace: LI->getPointerAddressSpace(), CostKind);
5986 }
5987 if (IsStrided && !IsMasked && Order.empty()) {
5988 // Check for potential segmented (interleaved) loads.
5989 VectorType *AlignedLoadVecTy = getWidenedType(
5990 ScalarTy, VF: getFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: *Diff + 1));
5991 if (!isSafeToLoadUnconditionally(V: Ptr0, Ty: AlignedLoadVecTy, Alignment: CommonAlignment,
5992 DL, ScanFrom: cast<LoadInst>(Val: VL.back()), AC: &AC, DT: &DT,
5993 TLI: &TLI))
5994 AlignedLoadVecTy = LoadVecTy;
5995 if (TTI.isLegalInterleavedAccessType(VTy: AlignedLoadVecTy, Factor: CompressMask[1],
5996 Alignment: CommonAlignment,
5997 AddrSpace: LI->getPointerAddressSpace())) {
5998 InstructionCost InterleavedCost =
5999 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6000 Opcode: Instruction::Load, VecTy: AlignedLoadVecTy,
6001 Factor: CompressMask[1], Indices: {}, Alignment: CommonAlignment,
6002 AddressSpace: LI->getPointerAddressSpace(), CostKind, UseMaskForCond: IsMasked);
6003 if (InterleavedCost < GatherCost) {
6004 InterleaveFactor = CompressMask[1];
6005 LoadVecTy = AlignedLoadVecTy;
6006 return true;
6007 }
6008 }
6009 }
6010 InstructionCost CompressCost = ::getShuffleCost(
6011 TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: LoadVecTy, Mask: CompressMask, CostKind);
6012 if (!Order.empty()) {
6013 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6014 for (unsigned I : seq<unsigned>(Size: Sz)) {
6015 NewMask[I] = CompressMask[Mask[I]];
6016 }
6017 CompressMask.swap(RHS&: NewMask);
6018 }
6019 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6020 return TotalVecCost < GatherCost;
6021}
6022
6023 /// Checks if the \p VL can be transformed into a (masked) load + compress or
6024/// (masked) interleaved load.
6025static bool
6026isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6027 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6028 const DataLayout &DL, ScalarEvolution &SE,
6029 AssumptionCache &AC, const DominatorTree &DT,
6030 const TargetLibraryInfo &TLI,
6031 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6032 bool IsMasked;
6033 unsigned InterleaveFactor;
6034 SmallVector<int> CompressMask;
6035 VectorType *LoadVecTy;
6036 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6037 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6038 CompressMask, LoadVecTy);
6039}
6040
6041/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6042/// PointerOps:
6043/// 1. Target with strided load support is detected.
6044/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6045/// potential stride <= MaxProfitableLoadStride and the potential stride is
6046/// power-of-2 (to avoid perf regressions for the very small number of loads)
6047/// and max distance > number of loads, or potential stride is -1.
6048/// 3. The loads are ordered, or number of unordered loads <=
6049/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
6050/// to avoid extra costs for very expensive shuffles).
6051 /// 4. Any pointer operand is an instruction with users outside of the
6052 /// current graph (for masked gathers, extra extractelement instructions
6053/// might be required).
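/// For example (an illustrative sketch), eight loads whose distance between
/// the first and the last sorted pointer is 56 elements have a potential
/// stride of 56 / (8 - 1) == 8 elements; the candidate is kept only if every
/// pointer lies at a distinct multiple of that stride, the target supports
/// strided load/store for the widened type, and one of the profitability
/// conditions above holds.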
6054static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6055 ArrayRef<unsigned> Order,
6056 const TargetTransformInfo &TTI, const DataLayout &DL,
6057 ScalarEvolution &SE,
6058 const bool IsAnyPointerUsedOutGraph,
6059 const int64_t Diff) {
6060 const size_t Sz = VL.size();
6061 const uint64_t AbsoluteDiff = std::abs(i: Diff);
6062 Type *ScalarTy = VL.front()->getType();
6063 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
6064 if (IsAnyPointerUsedOutGraph ||
6065 (AbsoluteDiff > Sz &&
6066 (Sz > MinProfitableStridedLoads ||
6067 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
6068 AbsoluteDiff % Sz == 0 && has_single_bit(Value: AbsoluteDiff / Sz)))) ||
6069 Diff == -(static_cast<int64_t>(Sz) - 1)) {
6070 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6071 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
6072 return false;
6073 Align Alignment =
6074 cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()])
6075 ->getAlign();
6076 if (!TTI.isLegalStridedLoadStore(DataType: VecTy, Alignment))
6077 return false;
6078 Value *Ptr0;
6079 Value *PtrN;
6080 if (Order.empty()) {
6081 Ptr0 = PointerOps.front();
6082 PtrN = PointerOps.back();
6083 } else {
6084 Ptr0 = PointerOps[Order.front()];
6085 PtrN = PointerOps[Order.back()];
6086 }
6087 // Iterate through all pointers and check if all distances are
6088 // unique multiples of Stride.
6089 SmallSet<int64_t, 4> Dists;
6090 for (Value *Ptr : PointerOps) {
6091 int64_t Dist = 0;
6092 if (Ptr == PtrN)
6093 Dist = Diff;
6094 else if (Ptr != Ptr0)
6095 Dist = *getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL, SE);
6096 // If the strides are not the same or repeated, we can't
6097 // vectorize.
6098 if (((Dist / Stride) * Stride) != Dist || !Dists.insert(V: Dist).second)
6099 break;
6100 }
6101 if (Dists.size() == Sz)
6102 return true;
6103 }
6104 return false;
6105}
6106
6107BoUpSLP::LoadsState
6108BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
6109 SmallVectorImpl<unsigned> &Order,
6110 SmallVectorImpl<Value *> &PointerOps,
6111 unsigned *BestVF, bool TryRecursiveCheck) const {
6112 // Check that a vectorized load would load the same memory as a scalar
6113 // load. For example, we don't want to vectorize loads that are smaller
6114 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6115 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6116 // from such a struct, we read/write packed bits disagreeing with the
6117 // unvectorized version.
6118 if (BestVF)
6119 *BestVF = 0;
6120 if (areKnownNonVectorizableLoads(VL))
6121 return LoadsState::Gather;
6122 Type *ScalarTy = VL0->getType();
6123
6124 if (DL->getTypeSizeInBits(Ty: ScalarTy) != DL->getTypeAllocSizeInBits(Ty: ScalarTy))
6125 return LoadsState::Gather;
6126
6127 // Make sure all loads in the bundle are simple - we can't vectorize
6128 // atomic or volatile loads.
6129 PointerOps.clear();
6130 const size_t Sz = VL.size();
6131 PointerOps.resize(N: Sz);
6132 auto *POIter = PointerOps.begin();
6133 for (Value *V : VL) {
6134 auto *L = dyn_cast<LoadInst>(Val: V);
6135 if (!L || !L->isSimple())
6136 return LoadsState::Gather;
6137 *POIter = L->getPointerOperand();
6138 ++POIter;
6139 }
6140
6141 Order.clear();
6142 // Check the order of pointer operands or that all pointers are the same.
6143 bool IsSorted = sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order);
6144
6145 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
6146 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
6147 if (!IsSorted) {
6148 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(Ty: VecTy)) {
6149 if (TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment) &&
6150 calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
6151 return LoadsState::StridedVectorize;
6152 }
6153
6154 if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) ||
6155 TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment))
6156 return LoadsState::Gather;
6157
6158 if (!all_of(Range&: PointerOps, P: [&](Value *P) {
6159 return arePointersCompatible(Ptr1: P, Ptr2: PointerOps.front(), TLI: *TLI);
6160 }))
6161 return LoadsState::Gather;
6162
6163 } else {
6164 Value *Ptr0;
6165 Value *PtrN;
6166 if (Order.empty()) {
6167 Ptr0 = PointerOps.front();
6168 PtrN = PointerOps.back();
6169 } else {
6170 Ptr0 = PointerOps[Order.front()];
6171 PtrN = PointerOps[Order.back()];
6172 }
6173 std::optional<int64_t> Diff =
6174 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
6175 // Check that the sorted loads are consecutive.
6176 if (static_cast<uint64_t>(*Diff) == Sz - 1)
6177 return LoadsState::Vectorize;
6178 if (isMaskedLoadCompress(VL, PointerOps, Order, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT,
6179 TLI: *TLI, AreAllUsersVectorized: [&](Value *V) {
6180 return areAllUsersVectorized(
6181 I: cast<Instruction>(Val: V), VectorizedVals: UserIgnoreList);
6182 }))
6183 return LoadsState::CompressVectorize;
6184 // Simple check if not a strided access - clear order.
6185 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
6186 // Try to generate strided load node.
6187 auto IsAnyPointerUsedOutGraph =
6188 IsPossibleStrided && any_of(Range&: PointerOps, P: [&](Value *V) {
6189 return isa<Instruction>(Val: V) && any_of(Range: V->users(), P: [&](User *U) {
6190 return !isVectorized(V: U) && !MustGather.contains(Ptr: U);
6191 });
6192 });
6193 if (IsPossibleStrided &&
6194 isStridedLoad(VL, PointerOps, Order, TTI: *TTI, DL: *DL, SE&: *SE,
6195 IsAnyPointerUsedOutGraph, Diff: *Diff))
6196 return LoadsState::StridedVectorize;
6197 }
6198 if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) ||
6199 TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment))
6200 return LoadsState::Gather;
6201 // Compare the cost of loads + shuffles against the cost of
6202 // strided/masked gather loads. Returns true if the vectorized + shuffles
6203 // representation is better than just gather.
6204 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
6205 unsigned *BestVF,
6206 bool ProfitableGatherPointers) {
6207 if (BestVF)
6208 *BestVF = 0;
6209 // Compare masked gather cost and loads + insert subvector costs.
6210 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6211 auto [ScalarGEPCost, VectorGEPCost] =
6212 getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: PointerOps.front(),
6213 Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
6214 // Estimate the cost of masked gather GEP. If not a splat, roughly
6215 // estimate as a buildvector, otherwise estimate as splat.
6216 APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
6217 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
6218 VectorType *PtrVecTy = getWidenedType(ScalarTy: PtrScalarTy, VF: Sz);
6219 if (static_cast<unsigned>(count_if(
6220 Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
6221 any_of(Range&: PointerOps, P: [&](Value *V) {
6222 return getUnderlyingObject(V) !=
6223 getUnderlyingObject(V: PointerOps.front());
6224 }))
6225 VectorGEPCost += getScalarizationOverhead(TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy,
6226 DemandedElts, /*Insert=*/true,
6227 /*Extract=*/false, CostKind);
6228 else
6229 VectorGEPCost +=
6230 getScalarizationOverhead(
6231 TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy, DemandedElts: APInt::getOneBitSet(numBits: Sz, BitNo: 0),
6232 /*Insert=*/true, /*Extract=*/false, CostKind) +
6233 ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: PtrVecTy, Mask: {}, CostKind);
6234 // The cost of scalar loads.
6235 InstructionCost ScalarLoadsCost =
6236 std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(),
6237 binary_op: [&](InstructionCost C, Value *V) {
6238 return C + TTI.getInstructionCost(
6239 U: cast<Instruction>(Val: V), CostKind);
6240 }) +
6241 ScalarGEPCost;
6242 // The cost of masked gather.
6243 InstructionCost MaskedGatherCost =
6244 TTI.getGatherScatterOpCost(
6245 Opcode: Instruction::Load, DataTy: VecTy, Ptr: cast<LoadInst>(Val: VL0)->getPointerOperand(),
6246 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind) +
6247 (ProfitableGatherPointers ? 0 : VectorGEPCost);
6248 InstructionCost GatherCost =
6249 getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
6250 /*Insert=*/true,
6251 /*Extract=*/false, CostKind) +
6252 ScalarLoadsCost;
6253 // The list of loads is small, or a partial check was already performed -
6254 // directly compare the masked gather cost and the gather cost.
6255 constexpr unsigned ListLimit = 4;
6256 if (!TryRecursiveCheck || VL.size() < ListLimit)
6257 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
6258
6259 // FIXME: The following code has not been updated for non-power-of-2
6260 // vectors (and not whole registers). The splitting logic here does not
6261 // cover the original vector if the vector factor is not a power of two.
6262 if (!hasFullVectorsOrPowerOf2(TTI, Ty: ScalarTy, Sz: VL.size()))
6263 return false;
6264
6265 unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
6266 unsigned MinVF = getMinVF(Sz: 2 * Sz);
6267 DemandedElts.clearAllBits();
6268 // Iterate through possible vectorization factors and check if vectorized +
6269 // shuffles is better than just gather.
6270 for (unsigned VF =
6271 getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VL.size() - 1);
6272 VF >= MinVF;
6273 VF = getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VF - 1)) {
6274 SmallVector<LoadsState> States;
6275 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
6276 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
6277 SmallVector<unsigned> Order;
6278 SmallVector<Value *> PointerOps;
6279 LoadsState LS =
6280 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps, BestVF,
6281 /*TryRecursiveCheck=*/false);
6282 // Check that the sorted loads are consecutive.
6283 if (LS == LoadsState::Gather) {
6284 if (BestVF) {
6285 DemandedElts.setAllBits();
6286 break;
6287 }
6288 DemandedElts.setBits(loBit: Cnt, hiBit: Cnt + VF);
6289 continue;
6290 }
6291 // If reordering is needed - consider it as a high-cost masked gather for now.
6292 if ((LS == LoadsState::Vectorize ||
6293 LS == LoadsState::StridedVectorize ||
6294 LS == LoadsState::CompressVectorize) &&
6295 !Order.empty() && !isReverseOrder(Order))
6296 LS = LoadsState::ScatterVectorize;
6297 States.push_back(Elt: LS);
6298 }
6299 if (DemandedElts.isAllOnes())
6300 // All loads gathered - try smaller VF.
6301 continue;
6302 // Can be vectorized later as a series of loads/insertelements.
6303 InstructionCost VecLdCost = 0;
6304 if (!DemandedElts.isZero()) {
6305 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
6306 /*Insert=*/true,
6307 /*Extract=*/false, CostKind) +
6308 ScalarGEPCost;
6309 for (unsigned Idx : seq<unsigned>(Size: VL.size()))
6310 if (DemandedElts[Idx])
6311 VecLdCost +=
6312 TTI.getInstructionCost(U: cast<Instruction>(Val: VL[Idx]), CostKind);
6313 }
6314 auto *SubVecTy = getWidenedType(ScalarTy, VF);
6315 for (auto [I, LS] : enumerate(First&: States)) {
6316 auto *LI0 = cast<LoadInst>(Val: VL[I * VF]);
6317 InstructionCost VectorGEPCost =
6318 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
6319 ? 0
6320 : getGEPCosts(TTI, Ptrs: ArrayRef(PointerOps).slice(N: I * VF, M: VF),
6321 BasePtr: LI0->getPointerOperand(),
6322 Opcode: Instruction::GetElementPtr, CostKind, ScalarTy,
6323 VecTy: SubVecTy)
6324 .second;
6325 if (LS == LoadsState::ScatterVectorize) {
6326 if (static_cast<unsigned>(
6327 count_if(Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) <
6328 PointerOps.size() - 1 ||
6329 any_of(Range&: PointerOps, P: [&](Value *V) {
6330 return getUnderlyingObject(V) !=
6331 getUnderlyingObject(V: PointerOps.front());
6332 }))
6333 VectorGEPCost += getScalarizationOverhead(
6334 TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getAllOnes(numBits: VF),
6335 /*Insert=*/true, /*Extract=*/false, CostKind);
6336 else
6337 VectorGEPCost +=
6338 getScalarizationOverhead(
6339 TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getOneBitSet(numBits: VF, BitNo: 0),
6340 /*Insert=*/true, /*Extract=*/false, CostKind) +
6341 ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: SubVecTy, Mask: {},
6342 CostKind);
6343 }
6344 switch (LS) {
6345 case LoadsState::Vectorize:
6346 VecLdCost +=
6347 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: SubVecTy, Alignment: LI0->getAlign(),
6348 AddressSpace: LI0->getPointerAddressSpace(), CostKind,
6349 OpdInfo: TTI::OperandValueInfo()) +
6350 VectorGEPCost;
6351 break;
6352 case LoadsState::StridedVectorize:
6353 VecLdCost += TTI.getStridedMemoryOpCost(Opcode: Instruction::Load, DataTy: SubVecTy,
6354 Ptr: LI0->getPointerOperand(),
6355 /*VariableMask=*/false,
6356 Alignment: CommonAlignment, CostKind) +
6357 VectorGEPCost;
6358 break;
6359 case LoadsState::CompressVectorize:
6360 VecLdCost += TTI.getMaskedMemoryOpCost(
6361 Opcode: Instruction::Load, Src: SubVecTy, Alignment: CommonAlignment,
6362 AddressSpace: LI0->getPointerAddressSpace(), CostKind) +
6363 VectorGEPCost +
6364 ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: SubVecTy,
6365 Mask: {}, CostKind);
6366 break;
6367 case LoadsState::ScatterVectorize:
6368 VecLdCost += TTI.getGatherScatterOpCost(Opcode: Instruction::Load, DataTy: SubVecTy,
6369 Ptr: LI0->getPointerOperand(),
6370 /*VariableMask=*/false,
6371 Alignment: CommonAlignment, CostKind) +
6372 VectorGEPCost;
6373 break;
6374 case LoadsState::Gather:
6375 // Gathers are already calculated - ignore.
6376 continue;
6377 }
6378 SmallVector<int> ShuffleMask(VL.size());
6379 for (int Idx : seq<int>(Begin: 0, End: VL.size()))
6380 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
6381 if (I > 0)
6382 VecLdCost +=
6383 ::getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: ShuffleMask,
6384 CostKind, Index: I * VF, SubTp: SubVecTy);
6385 }
6386 // If the masked gather cost is higher, it is better to vectorize, so
6387 // consider it as a gather node. It will be better estimated
6388 // later.
6389 if (MaskedGatherCost >= VecLdCost &&
6390 VecLdCost - GatherCost < -SLPCostThreshold) {
6391 if (BestVF)
6392 *BestVF = VF;
6393 return true;
6394 }
6395 }
6396 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
6397 };
6398 // TODO: need to improve analysis of the pointers; if not all of them are
6399 // GEPs or they have > 2 operands, we end up with a gather node, which just
6400 // increases the cost.
6401 Loop *L = LI->getLoopFor(BB: cast<LoadInst>(Val: VL0)->getParent());
6402 bool ProfitableGatherPointers =
6403 L && Sz > 2 && static_cast<unsigned>(count_if(Range&: PointerOps, P: [L](Value *V) {
6404 return L->isLoopInvariant(V);
6405 })) <= Sz / 2;
6406 if (ProfitableGatherPointers || all_of(Range&: PointerOps, P: [](Value *P) {
6407 auto *GEP = dyn_cast<GetElementPtrInst>(Val: P);
6408 return (!GEP && doesNotNeedToBeScheduled(V: P)) ||
6409 (GEP && GEP->getNumOperands() == 2 &&
6410 isa<Constant, Instruction>(Val: GEP->getOperand(i_nocapture: 1)));
6411 })) {
6412 // Check if a potential masked gather can be represented as a series
6413 // of loads + insertsubvectors.
6414 // If the masked gather cost is higher, it is better to vectorize, so
6415 // consider it as a gather node. It will be better estimated
6416 // later.
6417 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
6418 ProfitableGatherPointers))
6419 return LoadsState::ScatterVectorize;
6420 }
6421
6422 return LoadsState::Gather;
6423}
6424
6425static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
6426 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
6427 const DataLayout &DL, ScalarEvolution &SE,
6428 SmallVectorImpl<unsigned> &SortedIndices) {
6429 assert(
6430 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
6431 "Expected list of pointer operands.");
6432 // Map each base to vectors of (Ptr, Offset, OrigIdx) tuples. Insert each
6433 // Ptr into the vector for its base, sort by offset, and return the sorted
6434 // indices so that pointers with the same base end up next to one another.
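// For example (illustrative), for pointers {A, B, A+1, B+1, A+2, B+2, A+3,
// B+3} with two distinct bases A and B in the same basic block, the returned
// indices are {0, 2, 4, 6, 1, 3, 5, 7}: all A-based pointers first, then all
// B-based ones, each group sorted by offset.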
6435 SmallMapVector<
6436 std::pair<BasicBlock *, Value *>,
6437 SmallVector<SmallVector<std::tuple<Value *, int64_t, unsigned>>>, 8>
6438 Bases;
6439 Bases
6440 .try_emplace(Key: std::make_pair(
6441 x: BBs.front(), y: getUnderlyingObject(V: VL.front(), MaxLookup: RecursionMaxDepth)))
6442 .first->second.emplace_back().emplace_back(Args: VL.front(), Args: 0U, Args: 0U);
6443
6444 SortedIndices.clear();
6445 for (auto [Cnt, Ptr] : enumerate(First: VL.drop_front())) {
6446 auto Key = std::make_pair(x: BBs[Cnt + 1],
6447 y: getUnderlyingObject(V: Ptr, MaxLookup: RecursionMaxDepth));
6448 bool Found = any_of(Range&: Bases.try_emplace(Key).first->second,
6449 P: [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
6450 std::optional<int64_t> Diff =
6451 getPointersDiff(ElemTy, std::get<0>(Base.front()),
6452 ElemTy, Ptr, DL, SE,
6453 /*StrictCheck=*/true);
6454 if (!Diff)
6455 return false;
6456
6457 Base.emplace_back(Ptr, *Diff, Cnt + 1);
6458 return true;
6459 });
6460
6461 if (!Found) {
6462 // If we haven't found enough to usefully cluster, return early.
6463 if (Bases.size() > VL.size() / 2 - 1)
6464 return false;
6465
6466 // Not found already - add a new Base
6467 Bases.find(Key)->second.emplace_back().emplace_back(Args: Ptr, Args: 0, Args: Cnt + 1);
6468 }
6469 }
6470
6471 if (Bases.size() == VL.size())
6472 return false;
6473
6474 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
6475 Bases.front().second.size() == VL.size()))
6476 return false;
6477
6478 // For each of the bases, sort the pointers by Offset and check if any of
6479 // the bases contain consecutively allocated pointers.
6480 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
6481 SmallPtrSet<Value *, 13> FirstPointers;
6482 SmallPtrSet<Value *, 13> SecondPointers;
6483 Value *P1 = Ptr1;
6484 Value *P2 = Ptr2;
6485 unsigned Depth = 0;
6486 while (!FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1)) {
6487 if (P1 == P2 || Depth > RecursionMaxDepth)
6488 return false;
6489 FirstPointers.insert(Ptr: P1);
6490 SecondPointers.insert(Ptr: P2);
6491 P1 = getUnderlyingObject(V: P1, /*MaxLookup=*/1);
6492 P2 = getUnderlyingObject(V: P2, /*MaxLookup=*/1);
6493 ++Depth;
6494 }
6495 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
6496 "Unable to find matching root.");
6497 return FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1);
6498 };
6499 for (auto &Base : Bases) {
6500 for (auto &Vec : Base.second) {
6501 if (Vec.size() > 1) {
6502 stable_sort(Range&: Vec, C: llvm::less_second());
6503 int64_t InitialOffset = std::get<1>(t&: Vec[0]);
6504 bool AnyConsecutive =
6505 all_of(Range: enumerate(First&: Vec), P: [InitialOffset](const auto &P) {
6506 return std::get<1>(P.value()) ==
6507 int64_t(P.index()) + InitialOffset;
6508 });
6509 // Fill the SortedIndices array only if it looks worthwhile to sort the
6510 // pointers.
6511 if (!AnyConsecutive)
6512 return false;
6513 }
6514 }
6515 stable_sort(Range&: Base.second, C: [&](const auto &V1, const auto &V2) {
6516 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
6517 });
6518 }
6519
6520 for (auto &T : Bases)
6521 for (const auto &Vec : T.second)
6522 for (const auto &P : Vec)
6523 SortedIndices.push_back(Elt: std::get<2>(t: P));
6524
6525 assert(SortedIndices.size() == VL.size() &&
6526 "Expected SortedIndices to be the size of VL");
6527 return true;
6528}
6529
6530std::optional<BoUpSLP::OrdersType>
6531BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
6532 assert(TE.isGather() && "Expected gather node only.");
6533 Type *ScalarTy = TE.Scalars[0]->getType();
6534
6535 SmallVector<Value *> Ptrs;
6536 Ptrs.reserve(N: TE.Scalars.size());
6537 SmallVector<BasicBlock *> BBs;
6538 BBs.reserve(N: TE.Scalars.size());
6539 for (Value *V : TE.Scalars) {
6540 auto *L = dyn_cast<LoadInst>(Val: V);
6541 if (!L || !L->isSimple())
6542 return std::nullopt;
6543 Ptrs.push_back(Elt: L->getPointerOperand());
6544 BBs.push_back(Elt: L->getParent());
6545 }
6546
6547 BoUpSLP::OrdersType Order;
6548 if (!LoadEntriesToVectorize.contains(key: TE.Idx) &&
6549 clusterSortPtrAccesses(VL: Ptrs, BBs, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
6550 return std::move(Order);
6551 return std::nullopt;
6552}
6553
6554/// Check if two insertelement instructions are from the same buildvector.
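/// For example (illustrative), in the chain
///   %v0 = insertelement <2 x float> poison, float %a, i32 0
///   %v1 = insertelement <2 x float> %v0, float %b, i32 1
/// both instructions belong to the same buildvector sequence, assuming
/// GetBaseOperand returns the vector operand (operand 0) of each insert.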
6555static bool areTwoInsertFromSameBuildVector(
6556 InsertElementInst *VU, InsertElementInst *V,
6557 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
6558 // Instructions must be from the same basic blocks.
6559 if (VU->getParent() != V->getParent())
6560 return false;
6561 // Checks if 2 insertelements are from the same buildvector.
6562 if (VU->getType() != V->getType())
6563 return false;
6564 // Multiple used inserts are separate nodes.
6565 if (!VU->hasOneUse() && !V->hasOneUse())
6566 return false;
6567 auto *IE1 = VU;
6568 auto *IE2 = V;
6569 std::optional<unsigned> Idx1 = getElementIndex(Inst: IE1);
6570 std::optional<unsigned> Idx2 = getElementIndex(Inst: IE2);
6571 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
6572 return false;
6573 // Go through the vector operand of insertelement instructions trying to find
6574 // either VU as the original vector for IE2 or V as the original vector for
6575 // IE1.
6576 SmallBitVector ReusedIdx(
6577 cast<VectorType>(Val: VU->getType())->getElementCount().getKnownMinValue());
6578 bool IsReusedIdx = false;
6579 do {
6580 if (IE2 == VU && !IE1)
6581 return VU->hasOneUse();
6582 if (IE1 == V && !IE2)
6583 return V->hasOneUse();
6584 if (IE1 && IE1 != V) {
6585 unsigned Idx1 = getElementIndex(Inst: IE1).value_or(u&: *Idx2);
6586 IsReusedIdx |= ReusedIdx.test(Idx: Idx1);
6587 ReusedIdx.set(Idx1);
6588 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
6589 IE1 = nullptr;
6590 else
6591 IE1 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE1));
6592 }
6593 if (IE2 && IE2 != VU) {
6594 unsigned Idx2 = getElementIndex(Inst: IE2).value_or(u&: *Idx1);
6595 IsReusedIdx |= ReusedIdx.test(Idx: Idx2);
6596 ReusedIdx.set(Idx2);
6597 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
6598 IE2 = nullptr;
6599 else
6600 IE2 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE2));
6601 }
6602 } while (!IsReusedIdx && (IE1 || IE2));
6603 return false;
6604}
6605
6606/// Checks if the specified instruction \p I is an alternate operation for
6607/// the given \p MainOp and \p AltOp instructions.
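/// For example (illustrative), in an alternating add/sub node with
/// MainOp == add and AltOp == sub, every sub instruction in the bundle is an
/// alternate operation and every add is a main operation.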
6608static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
6609 Instruction *AltOp,
6610 const TargetLibraryInfo &TLI);
6611
6612std::optional<BoUpSLP::OrdersType>
6613BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
6614 bool IgnoreReorder) {
6615 // No need to reorder if we need to shuffle reuses; the node still has to
6616 // be shuffled.
6617 if (!TE.ReuseShuffleIndices.empty()) {
6618 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
6619 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
6620 "Reshuffling scalars not yet supported for nodes with padding");
6621
6622 if (isSplat(VL: TE.Scalars))
6623 return std::nullopt;
6624 // Check if reuse shuffle indices can be improved by reordering.
6625 // For this, check that the reuse mask is "clustered", i.e. each scalar value
6626 // is used once in each submask of size <number_of_scalars>.
6627 // Example: 4 scalar values.
6628 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
6629 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
6630 // element 3 is used twice in the second submask.
6631 unsigned Sz = TE.Scalars.size();
6632 if (TE.isGather()) {
6633 if (std::optional<OrdersType> CurrentOrder =
6634 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
6635 SmallVector<int> Mask;
6636 fixupOrderingIndices(Order: *CurrentOrder);
6637 inversePermutation(Indices: *CurrentOrder, Mask);
6638 ::addMask(Mask, SubMask: TE.ReuseShuffleIndices);
6639 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
6640 unsigned Sz = TE.Scalars.size();
6641 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
6642 for (auto [I, Idx] : enumerate(First: ArrayRef(Mask).slice(N: K * Sz, M: Sz)))
6643 if (Idx != PoisonMaskElem)
6644 Res[Idx + K * Sz] = I + K * Sz;
6645 }
6646 return std::move(Res);
6647 }
6648 }
6649 if (Sz == 2 && TE.getVectorFactor() == 4 &&
6650 ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: TE.Scalars.front()->getType(),
6651 VF: 2 * TE.getVectorFactor())) == 1)
6652 return std::nullopt;
6653 if (TE.ReuseShuffleIndices.size() % Sz != 0)
6654 return std::nullopt;
6655 if (!ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
6656 VF: Sz)) {
6657 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
6658 if (TE.ReorderIndices.empty())
6659 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
6660 else
6661 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
6662 ::addMask(Mask&: ReorderMask, SubMask: TE.ReuseShuffleIndices);
6663 unsigned VF = ReorderMask.size();
6664 OrdersType ResOrder(VF, VF);
6665 unsigned NumParts = divideCeil(Numerator: VF, Denominator: Sz);
6666 SmallBitVector UsedVals(NumParts);
6667 for (unsigned I = 0; I < VF; I += Sz) {
6668 int Val = PoisonMaskElem;
6669 unsigned UndefCnt = 0;
6670 unsigned Limit = std::min(a: Sz, b: VF - I);
6671 if (any_of(Range: ArrayRef(ReorderMask).slice(N: I, M: Limit),
6672 P: [&](int Idx) {
6673 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
6674 Val = Idx;
6675 if (Idx == PoisonMaskElem)
6676 ++UndefCnt;
6677 return Idx != PoisonMaskElem && Idx != Val;
6678 }) ||
6679 Val >= static_cast<int>(NumParts) || UsedVals.test(Idx: Val) ||
6680 UndefCnt > Sz / 2)
6681 return std::nullopt;
6682 UsedVals.set(Val);
6683 for (unsigned K = 0; K < NumParts; ++K) {
6684 unsigned Idx = Val + Sz * K;
6685 if (Idx < VF && I + K < VF)
6686 ResOrder[Idx] = I + K;
6687 }
6688 }
6689 return std::move(ResOrder);
6690 }
6691 unsigned VF = TE.getVectorFactor();
6692 // Try to build the correct order for extractelement instructions.
6693 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
6694 TE.ReuseShuffleIndices.end());
6695 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
6696 all_of(Range: TE.Scalars, P: [Sz](Value *V) {
6697 if (isa<PoisonValue>(Val: V))
6698 return true;
6699 std::optional<unsigned> Idx = getExtractIndex(E: cast<Instruction>(Val: V));
6700 return Idx && *Idx < Sz;
6701 })) {
6702 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
6703 "by BinaryOperator and CastInst.");
6704 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
6705 if (TE.ReorderIndices.empty())
6706 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
6707 else
6708 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
6709 for (unsigned I = 0; I < VF; ++I) {
6710 int &Idx = ReusedMask[I];
6711 if (Idx == PoisonMaskElem)
6712 continue;
6713 Value *V = TE.Scalars[ReorderMask[Idx]];
6714 std::optional<unsigned> EI = getExtractIndex(E: cast<Instruction>(Val: V));
6715 Idx = std::distance(first: ReorderMask.begin(), last: find(Range&: ReorderMask, Val: *EI));
6716 }
6717 }
6718 // Build the order of VF size; reuse shuffles need to be reordered, as they
6719 // are always of VF size.
6720 OrdersType ResOrder(VF);
6721 std::iota(first: ResOrder.begin(), last: ResOrder.end(), value: 0);
6722 auto *It = ResOrder.begin();
6723 for (unsigned K = 0; K < VF; K += Sz) {
6724 OrdersType CurrentOrder(TE.ReorderIndices);
6725 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(N: K, M: Sz)};
6726 if (SubMask.front() == PoisonMaskElem)
6727 std::iota(first: SubMask.begin(), last: SubMask.end(), value: 0);
6728 reorderOrder(Order&: CurrentOrder, Mask: SubMask);
6729 transform(Range&: CurrentOrder, d_first: It, F: [K](unsigned Pos) { return Pos + K; });
6730 std::advance(i&: It, n: Sz);
6731 }
6732 if (TE.isGather() && all_of(Range: enumerate(First&: ResOrder), P: [](const auto &Data) {
6733 return Data.index() == Data.value();
6734 }))
6735 return std::nullopt; // No need to reorder.
6736 return std::move(ResOrder);
6737 }
6738 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
6739 (!TE.UserTreeIndex ||
6740 !Instruction::isBinaryOp(Opcode: TE.UserTreeIndex.UserTE->getOpcode())) &&
6741 (TE.ReorderIndices.empty() || isReverseOrder(Order: TE.ReorderIndices)))
6742 return std::nullopt;
6743 if (TE.State == TreeEntry::SplitVectorize ||
6744 ((TE.State == TreeEntry::Vectorize ||
6745 TE.State == TreeEntry::StridedVectorize ||
6746 TE.State == TreeEntry::CompressVectorize) &&
6747 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: TE.getMainOp()) ||
6748 (TopToBottom && isa<StoreInst, InsertElementInst>(Val: TE.getMainOp()))))) {
6749 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
6750 "Alternate instructions are only supported by "
6751 "BinaryOperator and CastInst.");
6752 return TE.ReorderIndices;
6753 }
6754 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
6755 TE.isAltShuffle()) {
6756 assert(TE.ReuseShuffleIndices.empty() &&
6757 "ReuseShuffleIndices should be "
6758 "empty for alternate instructions.");
6759 SmallVector<int> Mask;
6760 TE.buildAltOpShuffleMask(
6761 IsAltOp: [&](Instruction *I) {
6762 assert(TE.getMatchingMainOpOrAltOp(I) &&
6763 "Unexpected main/alternate opcode");
6764 return isAlternateInstruction(I, MainOp: TE.getMainOp(), AltOp: TE.getAltOp(), TLI: *TLI);
6765 },
6766 Mask);
6767 const int VF = TE.getVectorFactor();
6768 OrdersType ResOrder(VF, VF);
6769 for (unsigned I : seq<unsigned>(Size: VF)) {
6770 if (Mask[I] == PoisonMaskElem)
6771 continue;
6772 ResOrder[Mask[I] % VF] = I;
6773 }
6774 return std::move(ResOrder);
6775 }
6776 if (!TE.ReorderIndices.empty())
6777 return TE.ReorderIndices;
6778 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
6779 if (!TE.ReorderIndices.empty())
6780 return TE.ReorderIndices;
6781
6782 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
6783 for (auto [I, V] : zip(t&: UserBVHead, u: TE.Scalars)) {
6784 if (isa<Constant>(Val: V) || !V->hasNUsesOrMore(N: 1))
6785 continue;
6786 auto *II = dyn_cast<InsertElementInst>(Val: *V->user_begin());
6787 if (!II)
6788 continue;
6789 Instruction *BVHead = nullptr;
6790 BasicBlock *BB = II->getParent();
6791 while (II && II->hasOneUse() && II->getParent() == BB) {
6792 BVHead = II;
6793 II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
6794 }
6795 I = BVHead;
6796 }
6797
6798 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
6799 assert(BB1 != BB2 && "Expected different basic blocks.");
6800 if (!DT->isReachableFromEntry(A: BB1))
6801 return false;
6802 if (!DT->isReachableFromEntry(A: BB2))
6803 return true;
6804 auto *NodeA = DT->getNode(BB: BB1);
6805 auto *NodeB = DT->getNode(BB: BB2);
6806 assert(NodeA && "Should only process reachable instructions");
6807 assert(NodeB && "Should only process reachable instructions");
6808 assert((NodeA == NodeB) ==
6809 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
6810 "Different nodes should have different DFS numbers");
6811 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
6812 };
6813 auto PHICompare = [&](unsigned I1, unsigned I2) {
6814 Value *V1 = TE.Scalars[I1];
6815 Value *V2 = TE.Scalars[I2];
6816 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
6817 return false;
6818 if (isa<PoisonValue>(Val: V1))
6819 return true;
6820 if (isa<PoisonValue>(Val: V2))
6821 return false;
6822 if (V1->getNumUses() < V2->getNumUses())
6823 return true;
6824 if (V1->getNumUses() > V2->getNumUses())
6825 return false;
6826 auto *FirstUserOfPhi1 = cast<Instruction>(Val: *V1->user_begin());
6827 auto *FirstUserOfPhi2 = cast<Instruction>(Val: *V2->user_begin());
6828 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
6829 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
6830 FirstUserOfPhi2->getParent());
6831 auto *IE1 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi1);
6832 auto *IE2 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi2);
6833 auto *EE1 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi1);
6834 auto *EE2 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi2);
6835 if (IE1 && !IE2)
6836 return true;
6837 if (!IE1 && IE2)
6838 return false;
6839 if (IE1 && IE2) {
6840 if (UserBVHead[I1] && !UserBVHead[I2])
6841 return true;
6842 if (!UserBVHead[I1])
6843 return false;
6844 if (UserBVHead[I1] == UserBVHead[I2])
6845 return getElementIndex(Inst: IE1) < getElementIndex(Inst: IE2);
6846 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
6847 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
6848 UserBVHead[I2]->getParent());
6849 return UserBVHead[I1]->comesBefore(Other: UserBVHead[I2]);
6850 }
6851 if (EE1 && !EE2)
6852 return true;
6853 if (!EE1 && EE2)
6854 return false;
6855 if (EE1 && EE2) {
6856 auto *Inst1 = dyn_cast<Instruction>(Val: EE1->getOperand(i_nocapture: 0));
6857 auto *Inst2 = dyn_cast<Instruction>(Val: EE2->getOperand(i_nocapture: 0));
6858 auto *P1 = dyn_cast<Argument>(Val: EE1->getOperand(i_nocapture: 0));
6859 auto *P2 = dyn_cast<Argument>(Val: EE2->getOperand(i_nocapture: 0));
6860 if (!Inst2 && !P2)
6861 return Inst1 || P1;
6862 if (EE1->getOperand(i_nocapture: 0) == EE2->getOperand(i_nocapture: 0))
6863 return getElementIndex(Inst: EE1) < getElementIndex(Inst: EE2);
6864 if (!Inst1 && Inst2)
6865 return false;
6866 if (Inst1 && Inst2) {
6867 if (Inst1->getParent() != Inst2->getParent())
6868 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
6869 return Inst1->comesBefore(Other: Inst2);
6870 }
6871 if (!P1 && P2)
6872 return false;
6873 assert(P1 && P2 &&
6874 "Expected either instructions or arguments vector operands.");
6875 return P1->getArgNo() < P2->getArgNo();
6876 }
6877 return false;
6878 };
6879 OrdersType Phis(TE.Scalars.size());
6880 std::iota(first: Phis.begin(), last: Phis.end(), value: 0);
6881 stable_sort(Range&: Phis, C: PHICompare);
6882 if (isIdentityOrder(Order: Phis))
6883 return std::nullopt; // No need to reorder.
6884 return std::move(Phis);
6885 }
6886 if (TE.isGather() &&
6887 (!TE.hasState() || !TE.isAltShuffle() ||
6888 ScalarsInSplitNodes.contains(Val: TE.getMainOp())) &&
6889 allSameType(VL: TE.Scalars)) {
6890 // TODO: add analysis of other gather nodes with extractelement
6891 // instructions and other values/instructions, not only undefs.
6892 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
6893 (all_of(Range: TE.Scalars, P: IsaPred<UndefValue, ExtractElementInst>) &&
6894 any_of(Range: TE.Scalars, P: IsaPred<ExtractElementInst>))) &&
6895 all_of(Range: TE.Scalars, P: [](Value *V) {
6896 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
6897 return !EE || isa<FixedVectorType>(Val: EE->getVectorOperandType());
6898 })) {
6899 // Check that gather of extractelements can be represented as
6900 // just a shuffle of a single vector.
6901 OrdersType CurrentOrder;
6902 bool Reuse =
6903 canReuseExtract(VL: TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
6904 if (Reuse || !CurrentOrder.empty())
6905 return std::move(CurrentOrder);
6906 }
6907 // If the gather node is <undef, v, .., poison> and
6908 // insertelement poison, v, 0 [+ permute]
6909 // is cheaper than
6910 // insertelement poison, v, n - try to reorder.
6911 // If rotating the whole graph, exclude the permute cost, the whole graph
6912 // might be transformed.
6913 int Sz = TE.Scalars.size();
6914 if (isSplat(VL: TE.Scalars) && !allConstant(VL: TE.Scalars) &&
6915 count_if(Range: TE.Scalars, P: IsaPred<UndefValue>) == Sz - 1) {
6916 const auto *It = find_if_not(Range: TE.Scalars, P: isConstant);
6917 if (It == TE.Scalars.begin())
6918 return OrdersType();
6919 auto *Ty = getWidenedType(ScalarTy: TE.Scalars.front()->getType(), VF: Sz);
6920 if (It != TE.Scalars.end()) {
6921 OrdersType Order(Sz, Sz);
6922 unsigned Idx = std::distance(first: TE.Scalars.begin(), last: It);
6923 Order[Idx] = 0;
6924 fixupOrderingIndices(Order);
6925 SmallVector<int> Mask;
6926 inversePermutation(Indices: Order, Mask);
6927 InstructionCost PermuteCost =
6928 TopToBottom
6929 ? 0
6930 : ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: Ty, Mask);
6931 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
6932 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: 0,
6933 Op0: PoisonValue::get(T: Ty), Op1: *It);
6934 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
6935 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: Idx,
6936 Op0: PoisonValue::get(T: Ty), Op1: *It);
6937 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
6938 OrdersType Order(Sz, Sz);
6939 Order[Idx] = 0;
6940 return std::move(Order);
6941 }
6942 }
6943 }
6944 if (isSplat(VL: TE.Scalars))
6945 return std::nullopt;
6946 if (TE.Scalars.size() >= 3)
6947 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
6948 return Order;
    // Check if the order of the vectorized loads can be included. For masked
    // gathers do extra analysis later, so include such nodes into a special
    // list.
6951 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
6952 SmallVector<Value *> PointerOps;
6953 OrdersType CurrentOrder;
6954 LoadsState Res = canVectorizeLoads(VL: TE.Scalars, VL0: TE.Scalars.front(),
6955 Order&: CurrentOrder, PointerOps);
6956 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
6957 Res == LoadsState::CompressVectorize)
6958 return std::move(CurrentOrder);
6959 }
    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been audited for correctness with non-power-of-two vectors.
6962 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI))
6963 if (std::optional<OrdersType> CurrentOrder =
6964 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
6965 return CurrentOrder;
6966 }
6967 return std::nullopt;
6968}
6969
6970/// Checks if the given mask is a "clustered" mask with the same clusters of
6971/// size \p Sz, which are not identity submasks.
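/// E.g., for \p Sz == 2 the mask <1, 0, 1, 0, 1, 0> is the repeated
/// non-identity cluster <1, 0>, while <0, 1, 2, 3> is rejected because its
/// first cluster <0, 1> is an identity submask.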
6972static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
6973 unsigned Sz) {
6974 ArrayRef<int> FirstCluster = Mask.slice(N: 0, M: Sz);
6975 if (ShuffleVectorInst::isIdentityMask(Mask: FirstCluster, NumSrcElts: Sz))
6976 return false;
6977 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
6978 ArrayRef<int> Cluster = Mask.slice(N: I, M: Sz);
6979 if (Cluster != FirstCluster)
6980 return false;
6981 }
6982 return true;
6983}
6984
6985void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
6986 // Reorder reuses mask.
6987 reorderReuses(Reuses&: TE.ReuseShuffleIndices, Mask);
6988 const unsigned Sz = TE.Scalars.size();
  // For vectorized nodes and non-clustered reuses no need to do anything else.
6990 if (!TE.isGather() ||
6991 !ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
6992 VF: Sz) ||
6993 !isRepeatedNonIdentityClusteredMask(Mask: TE.ReuseShuffleIndices, Sz))
6994 return;
6995 SmallVector<int> NewMask;
6996 inversePermutation(Indices: TE.ReorderIndices, Mask&: NewMask);
6997 addMask(Mask&: NewMask, SubMask: TE.ReuseShuffleIndices);
6998 // Clear reorder since it is going to be applied to the new mask.
6999 TE.ReorderIndices.clear();
7000 // Try to improve gathered nodes with clustered reuses, if possible.
7001 ArrayRef<int> Slice = ArrayRef(NewMask).slice(N: 0, M: Sz);
7002 SmallVector<unsigned> NewOrder(Slice);
7003 inversePermutation(Indices: NewOrder, Mask&: NewMask);
7004 reorderScalars(Scalars&: TE.Scalars, Mask: NewMask);
7005 // Fill the reuses mask with the identity submasks.
7006 for (auto *It = TE.ReuseShuffleIndices.begin(),
7007 *End = TE.ReuseShuffleIndices.end();
7008 It != End; std::advance(i&: It, n: Sz))
7009 std::iota(first: It, last: std::next(x: It, n: Sz), value: 0);
7010}
7011
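/// Merges \p SecondaryOrder into \p Order: slots of \p Order that are still
/// unset (equal to the order size) are filled either from \p SecondaryOrder,
/// or with the identity index when no secondary order is given, but only if
/// the chosen index is not already used by \p Order.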
7012static void combineOrders(MutableArrayRef<unsigned> Order,
7013 ArrayRef<unsigned> SecondaryOrder) {
7014 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
7015 "Expected same size of orders");
7016 size_t Sz = Order.size();
7017 SmallBitVector UsedIndices(Sz);
7018 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz)) {
7019 if (Order[Idx] != Sz)
7020 UsedIndices.set(Order[Idx]);
7021 }
7022 if (SecondaryOrder.empty()) {
7023 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
7024 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
7025 Order[Idx] = Idx;
7026 } else {
7027 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
7028 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7029 !UsedIndices.test(Idx: SecondaryOrder[Idx]))
7030 Order[Idx] = SecondaryOrder[Idx];
7031 }
7032}
7033
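/// Heuristically decides whether reordering the whole graph is worthwhile.
/// Small graphs are always considered profitable; for larger graphs rooted at
/// a store, phi or tiny icmp/ptrtoint node the check returns false when the
/// graph is essentially a set of phi nodes (possibly combined with geps or
/// binary operators) feeding the root, or when a phi node has too many
/// operands, since reordering such graphs tends not to pay off.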
7034bool BoUpSLP::isProfitableToReorder() const {
7035 constexpr unsigned TinyVF = 2;
7036 constexpr unsigned TinyTree = 10;
7037 constexpr unsigned PhiOpsLimit = 12;
7038 constexpr unsigned GatherLoadsLimit = 2;
7039 if (VectorizableTree.size() <= TinyTree)
7040 return true;
7041 if (VectorizableTree.front()->hasState() &&
7042 !VectorizableTree.front()->isGather() &&
7043 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7044 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7045 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7046 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7047 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7048 VectorizableTree.front()->ReorderIndices.empty()) {
    // Check if the tree has only a single store and a single (unordered) load
    // node, while the other nodes are phis or geps/binops combined with phis,
    // and/or a single gather load node.
7052 bool HasPhis = false;
7053 if (VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7054 VectorizableTree.front()->Scalars.size() == TinyVF &&
7055 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7056 return false;
7057 bool HasLoad = true;
7058 unsigned GatherLoads = 0;
7059 for (const std::unique_ptr<TreeEntry> &TE :
7060 ArrayRef(VectorizableTree).drop_front()) {
7061 if (!TE->hasState()) {
7062 if (all_of(Range&: TE->Scalars, P: IsaPred<Constant, PHINode>) ||
7063 all_of(Range&: TE->Scalars, P: IsaPred<BinaryOperator, PHINode>))
7064 continue;
7065 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7066 any_of(Range&: TE->Scalars, P: IsaPred<PHINode, GEPOperator>))
7067 continue;
7068 return true;
7069 }
7070 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7071 if (!TE->isGather()) {
7072 HasLoad = false;
7073 continue;
7074 }
7075 if (HasLoad)
7076 return true;
7077 ++GatherLoads;
7078 if (GatherLoads >= GatherLoadsLimit)
7079 return true;
7080 }
7081 if (TE->getOpcode() == Instruction::GetElementPtr ||
7082 Instruction::isBinaryOp(Opcode: TE->getOpcode()))
7083 continue;
7084 if (TE->getOpcode() != Instruction::PHI)
7085 return true;
7086 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7087 TE->getNumOperands() > PhiOpsLimit)
7088 return false;
7089 HasPhis = true;
7090 }
7091 return !HasPhis;
7092 }
7093 return true;
7094}
7095
7096void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
7097 ArrayRef<int> MaskOrder) {
7098 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
7099 SmallVector<int> NewMask(getVectorFactor());
7100 SmallVector<int> NewMaskOrder(getVectorFactor());
7101 std::iota(first: NewMask.begin(), last: NewMask.end(), value: 0);
7102 std::iota(first: NewMaskOrder.begin(), last: NewMaskOrder.end(), value: 0);
7103 if (Idx == 0) {
7104 copy(Range&: Mask, Out: NewMask.begin());
7105 copy(Range&: MaskOrder, Out: NewMaskOrder.begin());
7106 } else {
7107 assert(Idx == 1 && "Expected either 0 or 1 index.");
7108 unsigned Offset = CombinedEntriesWithIndices.back().second;
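    // The second operand of a split node occupies the upper lanes, so shift
    // both the mask and the mask order by the recorded lane offset.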
7109 for (unsigned I : seq<unsigned>(Size: Mask.size())) {
7110 NewMask[I + Offset] = Mask[I] + Offset;
7111 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
7112 }
7113 }
7114 reorderScalars(Scalars, Mask: NewMask);
7115 reorderOrder(Order&: ReorderIndices, Mask: NewMaskOrder, /*BottomOrder=*/true);
7116 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(Order: ReorderIndices))
7117 ReorderIndices.clear();
7118}
7119
7120void BoUpSLP::reorderTopToBottom() {
7121 // Maps VF to the graph nodes.
7122 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
7123 // ExtractElement gather nodes which can be vectorized and need to handle
7124 // their ordering.
7125 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
7126
  // Phi nodes can have a preferred ordering based on their result users.
7128 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
7129
7130 // AltShuffles can also have a preferred ordering that leads to fewer
7131 // instructions, e.g., the addsub instruction in x86.
7132 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
7133
7134 // Maps a TreeEntry to the reorder indices of external users.
7135 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
7136 ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores, loads, extracts + some gathering of
  // extracts.
7140 for_each(Range&: VectorizableTree, F: [&, &TTIRef = *TTI](
7141 const std::unique_ptr<TreeEntry> &TE) {
7142 // Look for external users that will probably be vectorized.
7143 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
7144 findExternalStoreUsersReorderIndices(TE: TE.get());
7145 if (!ExternalUserReorderIndices.empty()) {
7146 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
7147 ExternalUserReorderMap.try_emplace(Key: TE.get(),
7148 Args: std::move(ExternalUserReorderIndices));
7149 }
7150
7151 // Patterns like [fadd,fsub] can be combined into a single instruction in
7152 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
7153 // to take into account their order when looking for the most used order.
7154 if (TE->hasState() && TE->isAltShuffle() &&
7155 TE->State != TreeEntry::SplitVectorize) {
7156 Type *ScalarTy = TE->Scalars[0]->getType();
7157 VectorType *VecTy = getWidenedType(ScalarTy, VF: TE->Scalars.size());
7158 unsigned Opcode0 = TE->getOpcode();
7159 unsigned Opcode1 = TE->getAltOpcode();
7160 SmallBitVector OpcodeMask(
7161 getAltInstrMask(VL: TE->Scalars, ScalarTy, Opcode0, Opcode1));
7162 // If this pattern is supported by the target then we consider the order.
7163 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
7164 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
7165 AltShufflesToOrders.try_emplace(Key: TE.get(), Args: OrdersType());
7166 }
7167 // TODO: Check the reverse order too.
7168 }
7169
7170 bool IgnoreReorder =
7171 !UserIgnoreList && VectorizableTree.front()->hasState() &&
7172 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
7173 VectorizableTree.front()->getOpcode() == Instruction::Store);
7174 if (std::optional<OrdersType> CurrentOrder =
7175 getReorderingData(TE: *TE, /*TopToBottom=*/true, IgnoreReorder)) {
      // Do not include ordering for nodes used in the alt opcode
      // vectorization; it is better to reorder them during the bottom-to-top
      // stage. If we follow the order here, it causes reordering of the whole
      // graph, though actually it is profitable just to reorder the subgraph
      // that starts from the alternate opcode vectorization node. Such nodes
      // already end up with a shuffle instruction and it is enough to change
      // this shuffle rather than rotate the scalars for the whole graph.
7183 unsigned Cnt = 0;
7184 const TreeEntry *UserTE = TE.get();
7185 while (UserTE && Cnt < RecursionMaxDepth) {
7186 if (!UserTE->UserTreeIndex)
7187 break;
7188 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7189 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
7190 UserTE->UserTreeIndex.UserTE->Idx != 0)
7191 return;
7192 UserTE = UserTE->UserTreeIndex.UserTE;
7193 ++Cnt;
7194 }
7195 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
7196 if (!(TE->State == TreeEntry::Vectorize ||
7197 TE->State == TreeEntry::StridedVectorize ||
7198 TE->State == TreeEntry::SplitVectorize ||
7199 TE->State == TreeEntry::CompressVectorize) ||
7200 !TE->ReuseShuffleIndices.empty())
7201 GathersToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
7202 if (TE->State == TreeEntry::Vectorize &&
7203 TE->getOpcode() == Instruction::PHI)
7204 PhisToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
7205 }
7206 });
7207
7208 // Reorder the graph nodes according to their vectorization factor.
7209 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
7210 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
7211 auto It = VFToOrderedEntries.find(Val: VF);
7212 if (It == VFToOrderedEntries.end())
7213 continue;
    // Try to find the most profitable order. We are just looking for the most
    // used order and reorder the scalar elements in the nodes according to
    // this most used order.
7217 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
7218 // Delete VF entry upon exit.
7219 auto Cleanup = make_scope_exit(F: [&]() { VFToOrderedEntries.erase(I: It); });
7220
7221 // All operands are reordered and used only in this node - propagate the
7222 // most used order to the user node.
7223 MapVector<OrdersType, unsigned,
7224 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
7225 OrdersUses;
7226 for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes; we still need to extend and to use a
      // shuffle, just merge the reordering shuffle and the reuse shuffle.
7229 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(Val: OpTE) &&
7230 OpTE->State != TreeEntry::SplitVectorize)
7231 continue;
7232 // Count number of orders uses.
7233 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
7234 &PhisToOrders]() -> const OrdersType & {
7235 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
7236 auto It = GathersToOrders.find(Val: OpTE);
7237 if (It != GathersToOrders.end())
7238 return It->second;
7239 }
7240 if (OpTE->hasState() && OpTE->isAltShuffle()) {
7241 auto It = AltShufflesToOrders.find(Val: OpTE);
7242 if (It != AltShufflesToOrders.end())
7243 return It->second;
7244 }
7245 if (OpTE->State == TreeEntry::Vectorize &&
7246 OpTE->getOpcode() == Instruction::PHI) {
7247 auto It = PhisToOrders.find(Val: OpTE);
7248 if (It != PhisToOrders.end())
7249 return It->second;
7250 }
7251 return OpTE->ReorderIndices;
7252 }();
7253 // First consider the order of the external scalar users.
7254 auto It = ExternalUserReorderMap.find(Val: OpTE);
7255 if (It != ExternalUserReorderMap.end()) {
7256 const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars - use the natural
        // order; it is an attempt to reorder a node with reused scalars but
        // with external uses.
7260 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
7261 OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0)).first->second +=
7262 ExternalUserReorderIndices.size();
7263 } else {
7264 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
7265 ++OrdersUses.insert(KV: std::make_pair(x: ExtOrder, y: 0)).first->second;
7266 }
7267 // No other useful reorder data in this entry.
7268 if (Order.empty())
7269 continue;
7270 }
7271 // Stores actually store the mask, not the order, need to invert.
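      // E.g., the stored mask <2, 0, 1> corresponds to the order <1, 2, 0>.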
7272 if (OpTE->State == TreeEntry::Vectorize &&
7273 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
7274 assert(!OpTE->isAltShuffle() &&
7275 "Alternate instructions are only supported by BinaryOperator "
7276 "and CastInst.");
7277 SmallVector<int> Mask;
7278 inversePermutation(Indices: Order, Mask);
7279 unsigned E = Order.size();
7280 OrdersType CurrentOrder(E, E);
7281 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
7282 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
7283 });
7284 fixupOrderingIndices(Order: CurrentOrder);
7285 ++OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second;
7286 } else {
7287 ++OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second;
7288 }
7289 }
7290 if (OrdersUses.empty())
7291 continue;
7292 // Choose the most used order.
7293 unsigned IdentityCnt = 0;
7294 unsigned FilledIdentityCnt = 0;
7295 OrdersType IdentityOrder(VF, VF);
7296 for (auto &Pair : OrdersUses) {
7297 if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) {
7298 if (!Pair.first.empty())
7299 FilledIdentityCnt += Pair.second;
7300 IdentityCnt += Pair.second;
7301 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
7302 }
7303 }
7304 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
7305 unsigned Cnt = IdentityCnt;
7306 for (auto &Pair : OrdersUses) {
      // Prefer the identity order. But if a filled (non-empty) identity order
      // is found with the same number of uses as the new candidate order, we
      // can choose this candidate order.
7310 if (Cnt < Pair.second ||
7311 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
7312 Cnt == Pair.second && !BestOrder.empty() &&
7313 isIdentityOrder(Order: BestOrder))) {
7314 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
7315 BestOrder = Pair.first;
7316 Cnt = Pair.second;
7317 } else {
7318 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
7319 }
7320 }
7321 // Set order of the user node.
7322 if (isIdentityOrder(Order: BestOrder))
7323 continue;
7324 fixupOrderingIndices(Order: BestOrder);
7325 SmallVector<int> Mask;
7326 inversePermutation(Indices: BestOrder, Mask);
7327 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
7328 unsigned E = BestOrder.size();
7329 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
7330 return I < E ? static_cast<int>(I) : PoisonMaskElem;
7331 });
7332 // Do an actual reordering, if profitable.
7333 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7334 // Just do the reordering for the nodes with the given VF.
7335 if (TE->Scalars.size() != VF) {
7336 if (TE->ReuseShuffleIndices.size() == VF) {
7337 assert(TE->State != TreeEntry::SplitVectorize &&
7338 "Split vectorized not expected.");
7339 // Need to reorder the reuses masks of the operands with smaller VF to
7340 // be able to find the match between the graph nodes and scalar
7341 // operands of the given node during vectorization/cost estimation.
7342 assert(
7343 (!TE->UserTreeIndex ||
7344 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
7345 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
7346 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
7347 "All users must be of VF size.");
7348 if (SLPReVec) {
7349 assert(SLPReVec && "Only supported by REVEC.");
7350 // ShuffleVectorInst does not do reorderOperands (and it should not
7351 // because ShuffleVectorInst supports only a limited set of
7352 // patterns). Only do reorderNodeWithReuses if the user is not
7353 // ShuffleVectorInst.
7354 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
7355 isa<ShuffleVectorInst>(Val: TE->UserTreeIndex.UserTE->getMainOp()))
7356 continue;
7357 }
7358 // Update ordering of the operands with the smaller VF than the given
7359 // one.
7360 reorderNodeWithReuses(TE&: *TE, Mask);
7361 // Update orders in user split vectorize nodes.
7362 if (TE->UserTreeIndex &&
7363 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
7364 TE->UserTreeIndex.UserTE->reorderSplitNode(
7365 Idx: TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
7366 }
7367 continue;
7368 }
7369 if ((TE->State == TreeEntry::SplitVectorize &&
7370 TE->ReuseShuffleIndices.empty()) ||
7371 ((TE->State == TreeEntry::Vectorize ||
7372 TE->State == TreeEntry::StridedVectorize ||
7373 TE->State == TreeEntry::CompressVectorize) &&
7374 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
7375 InsertElementInst>(Val: TE->getMainOp()) ||
7376 (SLPReVec && isa<ShuffleVectorInst>(Val: TE->getMainOp()))))) {
7377 assert(
7378 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
7379 TE->ReuseShuffleIndices.empty())) &&
7380 "Alternate instructions are only supported by BinaryOperator "
7381 "and CastInst.");
7382 // Build correct orders for extract{element,value}, loads,
7383 // stores and alternate (split) nodes.
7384 reorderOrder(Order&: TE->ReorderIndices, Mask);
7385 if (isa<InsertElementInst, StoreInst>(Val: TE->getMainOp()))
7386 TE->reorderOperands(Mask);
7387 } else {
7388 // Reorder the node and its operands.
7389 TE->reorderOperands(Mask);
7390 assert(TE->ReorderIndices.empty() &&
7391 "Expected empty reorder sequence.");
7392 reorderScalars(Scalars&: TE->Scalars, Mask);
7393 }
7394 if (!TE->ReuseShuffleIndices.empty()) {
7395 // Apply reversed order to keep the original ordering of the reused
7396 // elements to avoid extra reorder indices shuffling.
7397 OrdersType CurrentOrder;
7398 reorderOrder(Order&: CurrentOrder, Mask: MaskOrder);
7399 SmallVector<int> NewReuses;
7400 inversePermutation(Indices: CurrentOrder, Mask&: NewReuses);
7401 addMask(Mask&: NewReuses, SubMask: TE->ReuseShuffleIndices);
7402 TE->ReuseShuffleIndices.swap(RHS&: NewReuses);
7403 } else if (TE->UserTreeIndex &&
7404 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
7405 // Update orders in user split vectorize nodes.
7406 TE->UserTreeIndex.UserTE->reorderSplitNode(Idx: TE->UserTreeIndex.EdgeIdx,
7407 Mask, MaskOrder);
7408 }
7409 }
7410}
7411
7412void BoUpSLP::buildReorderableOperands(
7413 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
7414 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
7415 SmallVectorImpl<TreeEntry *> &GatherOps) {
7416 for (unsigned I : seq<unsigned>(Size: UserTE->getNumOperands())) {
7417 if (any_of(Range&: Edges, P: [I](const std::pair<unsigned, TreeEntry *> &OpData) {
7418 return OpData.first == I &&
7419 (OpData.second->State == TreeEntry::Vectorize ||
7420 OpData.second->State == TreeEntry::StridedVectorize ||
7421 OpData.second->State == TreeEntry::CompressVectorize ||
7422 OpData.second->State == TreeEntry::SplitVectorize);
7423 }))
7424 continue;
7425 // Do not request operands, if they do not exist.
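    // E.g., the pointer operand of vectorized loads and stores and the
    // destination vector of an insertelement do not get their own reorderable
    // tree entries.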
7426 if (UserTE->hasState()) {
7427 if (UserTE->getOpcode() == Instruction::ExtractElement ||
7428 UserTE->getOpcode() == Instruction::ExtractValue)
7429 continue;
7430 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
7431 continue;
7432 if (UserTE->getOpcode() == Instruction::Store &&
7433 UserTE->State == TreeEntry::Vectorize && I == 1)
7434 continue;
7435 if (UserTE->getOpcode() == Instruction::Load &&
7436 (UserTE->State == TreeEntry::Vectorize ||
7437 UserTE->State == TreeEntry::StridedVectorize ||
7438 UserTE->State == TreeEntry::CompressVectorize))
7439 continue;
7440 }
7441 TreeEntry *TE = getOperandEntry(E: UserTE, Idx: I);
7442 assert(TE && "Expected operand entry.");
7443 if (!TE->isGather()) {
7444 // Add the node to the list of the ordered nodes with the identity
7445 // order.
7446 Edges.emplace_back(Args&: I, Args&: TE);
7447 // Add ScatterVectorize nodes to the list of operands, where just
7448 // reordering of the scalars is required. Similar to the gathers, so
7449 // simply add to the list of gathered ops.
7450 // If there are reused scalars, process this node as a regular vectorize
7451 // node, just reorder reuses mask.
7452 if (TE->State == TreeEntry::ScatterVectorize &&
7453 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
7454 GatherOps.push_back(Elt: TE);
7455 continue;
7456 }
7457 if (ReorderableGathers.contains(Ptr: TE))
7458 GatherOps.push_back(Elt: TE);
7459 }
7460}
7461
7462void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
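  // Order queue entries by the index of their user node (falling back to
  // their own index), so that operands of the same user node are popped from
  // the queue consecutively and can be processed together.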
7463 struct TreeEntryCompare {
7464 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
7465 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
7466 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
7467 return LHS->Idx < RHS->Idx;
7468 }
7469 };
7470 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
7471 DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads, extracts without alternate operands
  // + some gathering of extracts.
7475 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
7476 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7477 if (TE->State != TreeEntry::Vectorize &&
7478 TE->State != TreeEntry::StridedVectorize &&
7479 TE->State != TreeEntry::CompressVectorize &&
7480 TE->State != TreeEntry::SplitVectorize)
7481 NonVectorized.insert(Ptr: TE.get());
7482 if (std::optional<OrdersType> CurrentOrder =
7483 getReorderingData(TE: *TE, /*TopToBottom=*/false, IgnoreReorder)) {
7484 Queue.push(x: TE.get());
7485 if (!(TE->State == TreeEntry::Vectorize ||
7486 TE->State == TreeEntry::StridedVectorize ||
7487 TE->State == TreeEntry::CompressVectorize ||
7488 TE->State == TreeEntry::SplitVectorize) ||
7489 !TE->ReuseShuffleIndices.empty())
7490 GathersToOrders.insert(V: TE.get());
7491 }
7492 }
7493
  // 1. Propagate the order to the graph nodes which use only reordered nodes.
  // I.e., if the node has operands that are reordered, try to keep at least
  // one operand in the natural order and reorder the others + reorder the
  // user node itself.
7498 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
7499 while (!Queue.empty()) {
7500 // 1. Filter out only reordered nodes.
7501 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
7502 TreeEntry *TE = Queue.top();
7503 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
7504 Queue.pop();
7505 SmallVector<TreeEntry *> OrderedOps(1, TE);
7506 while (!Queue.empty()) {
7507 TE = Queue.top();
7508 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
7509 break;
7510 Queue.pop();
7511 OrderedOps.push_back(Elt: TE);
7512 }
7513 for (TreeEntry *TE : OrderedOps) {
7514 if (!(TE->State == TreeEntry::Vectorize ||
7515 TE->State == TreeEntry::StridedVectorize ||
7516 TE->State == TreeEntry::CompressVectorize ||
7517 TE->State == TreeEntry::SplitVectorize ||
7518 (TE->isGather() && GathersToOrders.contains(V: TE))) ||
7519 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
7520 !Visited.insert(Ptr: TE).second)
7521 continue;
      // Build a map between user nodes and their operand orders to speed up
      // the search. The graph currently does not provide this dependency
      // directly.
7524 Users.first = TE->UserTreeIndex.UserTE;
7525 Users.second.emplace_back(Args&: TE->UserTreeIndex.EdgeIdx, Args&: TE);
7526 }
7527 if (Users.first) {
7528 auto &Data = Users;
7529 if (Data.first->State == TreeEntry::SplitVectorize) {
7530 assert(
7531 Data.second.size() <= 2 &&
7532 "Expected not greater than 2 operands for split vectorize node.");
7533 if (any_of(Range&: Data.second,
7534 P: [](const auto &Op) { return !Op.second->UserTreeIndex; }))
7535 continue;
7536 // Update orders in user split vectorize nodes.
7537 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
7538 "Expected exactly 2 entries.");
7539 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
7540 TreeEntry &OpTE = *VectorizableTree[P.first];
7541 OrdersType Order = OpTE.ReorderIndices;
7542 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
7543 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
7544 continue;
7545 const auto BestOrder =
7546 getReorderingData(TE: OpTE, /*TopToBottom=*/false, IgnoreReorder);
7547 if (!BestOrder || BestOrder->empty() || isIdentityOrder(Order: *BestOrder))
7548 continue;
7549 Order = *BestOrder;
7550 }
7551 fixupOrderingIndices(Order);
7552 SmallVector<int> Mask;
7553 inversePermutation(Indices: Order, Mask);
7554 const unsigned E = Order.size();
7555 SmallVector<int> MaskOrder(E, PoisonMaskElem);
7556 transform(Range&: Order, d_first: MaskOrder.begin(), F: [E](unsigned I) {
7557 return I < E ? static_cast<int>(I) : PoisonMaskElem;
7558 });
7559 Data.first->reorderSplitNode(Idx: P.second ? 1 : 0, Mask, MaskOrder);
7560 // Clear ordering of the operand.
7561 if (!OpTE.ReorderIndices.empty()) {
7562 OpTE.ReorderIndices.clear();
7563 } else if (!OpTE.ReuseShuffleIndices.empty()) {
7564 reorderReuses(Reuses&: OpTE.ReuseShuffleIndices, Mask);
7565 } else {
7566 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
7567 reorderScalars(Scalars&: OpTE.Scalars, Mask);
7568 }
7569 }
7570 if (Data.first->ReuseShuffleIndices.empty() &&
7571 !Data.first->ReorderIndices.empty()) {
7572 // Insert user node to the list to try to sink reordering deeper in
7573 // the graph.
7574 Queue.push(x: Data.first);
7575 }
7576 continue;
7577 }
7578 // Check that operands are used only in the User node.
7579 SmallVector<TreeEntry *> GatherOps;
7580 buildReorderableOperands(UserTE: Data.first, Edges&: Data.second, ReorderableGathers: NonVectorized,
7581 GatherOps);
7582 // All operands are reordered and used only in this node - propagate the
7583 // most used order to the user node.
7584 MapVector<OrdersType, unsigned,
7585 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
7586 OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though it might not be
      // profitable.
7590 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
7591 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
7592 for (const auto &Op : Data.second) {
7593 TreeEntry *OpTE = Op.second;
7594 if (!VisitedOps.insert(Ptr: OpTE).second)
7595 continue;
7596 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(V: OpTE))
7597 continue;
7598 const auto Order = [&]() -> const OrdersType {
7599 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
7600 return getReorderingData(TE: *OpTE, /*TopToBottom=*/false,
7601 IgnoreReorder)
7602 .value_or(u: OrdersType(1));
7603 return OpTE->ReorderIndices;
7604 }();
7605 // The order is partially ordered, skip it in favor of fully non-ordered
7606 // orders.
7607 if (Order.size() == 1)
7608 continue;
7609
        // Check that the reordering does not increase the number of shuffles,
        // i.e. same-values nodes have the same parents or their parents have
        // the same parents.
7612 if (!Order.empty() && !isIdentityOrder(Order)) {
7613 Value *Root = OpTE->hasState()
7614 ? OpTE->getMainOp()
7615 : *find_if_not(Range&: OpTE->Scalars, P: isConstant);
7616 auto GetSameNodesUsers = [&](Value *Root) {
7617 SmallSetVector<TreeEntry *, 4> Res;
7618 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Val: Root)) {
7619 if (TE != OpTE && TE->UserTreeIndex &&
7620 TE->getVectorFactor() == OpTE->getVectorFactor() &&
7621 TE->Scalars.size() == OpTE->Scalars.size() &&
7622 ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) ||
7623 (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars))))
7624 Res.insert(X: TE->UserTreeIndex.UserTE);
7625 }
7626 for (const TreeEntry *TE : getTreeEntries(V: Root)) {
7627 if (TE != OpTE && TE->UserTreeIndex &&
7628 TE->getVectorFactor() == OpTE->getVectorFactor() &&
7629 TE->Scalars.size() == OpTE->Scalars.size() &&
7630 ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) ||
7631 (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars))))
7632 Res.insert(X: TE->UserTreeIndex.UserTE);
7633 }
7634 return Res.takeVector();
7635 };
7636 auto GetNumOperands = [](const TreeEntry *TE) {
7637 if (TE->State == TreeEntry::SplitVectorize)
7638 return TE->getNumOperands();
7639 if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI)
7640 return CI->arg_size();
7641 return TE->getNumOperands();
7642 };
7643 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
7644 const TreeEntry *TE) {
7645 Intrinsic::ID ID = Intrinsic::not_intrinsic;
7646 if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI)
7647 ID = getVectorIntrinsicIDForCall(CI, TLI);
7648 for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(TE))) {
7649 if (ID != Intrinsic::not_intrinsic &&
7650 isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI))
7651 continue;
7652 const TreeEntry *Op = getOperandEntry(E: TE, Idx);
7653 if (Op->isGather() && Op->hasState()) {
7654 const TreeEntry *VecOp =
7655 getSameValuesTreeEntry(V: Op->getMainOp(), VL: Op->Scalars);
7656 if (VecOp)
7657 Op = VecOp;
7658 }
7659 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
7660 return false;
7661 }
7662 return true;
7663 };
7664 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
7665 if (!Users.empty() && !all_of(Range&: Users, P: [&](TreeEntry *UTE) {
7666 if (!RevisitedOps.insert(Ptr: UTE).second)
7667 return false;
7668 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
7669 !UTE->ReuseShuffleIndices.empty() ||
7670 (UTE->UserTreeIndex &&
7671 UTE->UserTreeIndex.UserTE == Data.first) ||
7672 (Data.first->UserTreeIndex &&
7673 Data.first->UserTreeIndex.UserTE == UTE) ||
7674 (IgnoreReorder && UTE->UserTreeIndex &&
7675 UTE->UserTreeIndex.UserTE->Idx == 0) ||
7676 NodeShouldBeReorderedWithOperands(UTE);
7677 }))
7678 continue;
7679 for (TreeEntry *UTE : Users) {
7680 Intrinsic::ID ID = Intrinsic::not_intrinsic;
7681 if (auto *CI = dyn_cast<CallInst>(Val: UTE->getMainOp()); CI)
7682 ID = getVectorIntrinsicIDForCall(CI, TLI);
7683 for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(UTE))) {
7684 if (ID != Intrinsic::not_intrinsic &&
7685 isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI))
7686 continue;
7687 const TreeEntry *Op = getOperandEntry(E: UTE, Idx);
7688 Visited.erase(Ptr: Op);
7689 Queue.push(x: const_cast<TreeEntry *>(Op));
7690 }
7691 }
7692 }
7693 unsigned NumOps = count_if(
7694 Range&: Data.second, P: [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
7695 return P.second == OpTE;
7696 });
7697 // Stores actually store the mask, not the order, need to invert.
7698 if (OpTE->State == TreeEntry::Vectorize &&
7699 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
7700 assert(!OpTE->isAltShuffle() &&
7701 "Alternate instructions are only supported by BinaryOperator "
7702 "and CastInst.");
7703 SmallVector<int> Mask;
7704 inversePermutation(Indices: Order, Mask);
7705 unsigned E = Order.size();
7706 OrdersType CurrentOrder(E, E);
7707 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
7708 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
7709 });
7710 fixupOrderingIndices(Order: CurrentOrder);
7711 OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second +=
7712 NumOps;
7713 } else {
7714 OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second += NumOps;
7715 }
7716 auto Res = OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0));
7717 const auto AllowsReordering = [&](const TreeEntry *TE) {
7718 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
7719 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
7720 (IgnoreReorder && TE->Idx == 0))
7721 return true;
7722 if (TE->isGather()) {
7723 if (GathersToOrders.contains(V: TE))
7724 return !getReorderingData(TE: *TE, /*TopToBottom=*/false,
7725 IgnoreReorder)
7726 .value_or(u: OrdersType(1))
7727 .empty();
7728 return true;
7729 }
7730 return false;
7731 };
7732 if (OpTE->UserTreeIndex) {
7733 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
7734 if (!VisitedUsers.insert(Ptr: UserTE).second)
7735 continue;
7736 // May reorder user node if it requires reordering, has reused
7737 // scalars, is an alternate op vectorize node or its op nodes require
7738 // reordering.
7739 if (AllowsReordering(UserTE))
7740 continue;
        // Check if the users allow reordering.
        // Currently we look up just 1 level of operands to avoid an increase
        // in compile time.
        // It is profitable to reorder if definitely more operands allow
        // reordering than operands with the natural order.
7746 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users.second;
7747 if (static_cast<unsigned>(count_if(
7748 Range&: Ops, P: [UserTE, &AllowsReordering](
7749 const std::pair<unsigned, TreeEntry *> &Op) {
7750 return AllowsReordering(Op.second) &&
7751 Op.second->UserTreeIndex.UserTE == UserTE;
7752 })) <= Ops.size() / 2)
7753 ++Res.first->second;
7754 }
7755 }
7756 if (OrdersUses.empty()) {
7757 Visited.insert_range(R: llvm::make_second_range(c&: Data.second));
7758 continue;
7759 }
7760 // Choose the most used order.
7761 unsigned IdentityCnt = 0;
7762 unsigned VF = Data.second.front().second->getVectorFactor();
7763 OrdersType IdentityOrder(VF, VF);
7764 for (auto &Pair : OrdersUses) {
7765 if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) {
7766 IdentityCnt += Pair.second;
7767 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
7768 }
7769 }
7770 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
7771 unsigned Cnt = IdentityCnt;
7772 for (auto &Pair : OrdersUses) {
        // Prefer the identity order. But if a filled (non-empty) identity
        // order is found with the same number of uses as the new candidate
        // order, we can choose this candidate order.
7776 if (Cnt < Pair.second) {
7777 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
7778 BestOrder = Pair.first;
7779 Cnt = Pair.second;
7780 } else {
7781 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
7782 }
7783 }
7784 // Set order of the user node.
7785 if (isIdentityOrder(Order: BestOrder)) {
7786 Visited.insert_range(R: llvm::make_second_range(c&: Data.second));
7787 continue;
7788 }
7789 fixupOrderingIndices(Order: BestOrder);
7790 // Erase operands from OrderedEntries list and adjust their orders.
7791 VisitedOps.clear();
7792 SmallVector<int> Mask;
7793 inversePermutation(Indices: BestOrder, Mask);
7794 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
7795 unsigned E = BestOrder.size();
7796 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
7797 return I < E ? static_cast<int>(I) : PoisonMaskElem;
7798 });
7799 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
7800 TreeEntry *TE = Op.second;
7801 if (!VisitedOps.insert(Ptr: TE).second)
7802 continue;
7803 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
7804 reorderNodeWithReuses(TE&: *TE, Mask);
7805 continue;
7806 }
7807 // Gathers are processed separately.
7808 if (TE->State != TreeEntry::Vectorize &&
7809 TE->State != TreeEntry::StridedVectorize &&
7810 TE->State != TreeEntry::CompressVectorize &&
7811 TE->State != TreeEntry::SplitVectorize &&
7812 (TE->State != TreeEntry::ScatterVectorize ||
7813 TE->ReorderIndices.empty()))
7814 continue;
7815 assert((BestOrder.size() == TE->ReorderIndices.size() ||
7816 TE->ReorderIndices.empty()) &&
7817 "Non-matching sizes of user/operand entries.");
7818 reorderOrder(Order&: TE->ReorderIndices, Mask);
7819 if (IgnoreReorder && TE == VectorizableTree.front().get())
7820 IgnoreReorder = false;
7821 }
7822 // For gathers just need to reorder its scalars.
7823 for (TreeEntry *Gather : GatherOps) {
7824 assert(Gather->ReorderIndices.empty() &&
7825 "Unexpected reordering of gathers.");
7826 if (!Gather->ReuseShuffleIndices.empty()) {
7827 // Just reorder reuses indices.
7828 reorderReuses(Reuses&: Gather->ReuseShuffleIndices, Mask);
7829 continue;
7830 }
7831 reorderScalars(Scalars&: Gather->Scalars, Mask);
7832 Visited.insert(Ptr: Gather);
7833 }
7834 // Reorder operands of the user node and set the ordering for the user
7835 // node itself.
7836 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
7837 return TE.isAltShuffle() &&
7838 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
7839 TE.ReorderIndices.empty());
7840 };
7841 if (Data.first->State != TreeEntry::Vectorize ||
7842 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
7843 Val: Data.first->getMainOp()) ||
7844 IsNotProfitableAltCodeNode(*Data.first))
7845 Data.first->reorderOperands(Mask);
7846 if (!isa<InsertElementInst, StoreInst>(Val: Data.first->getMainOp()) ||
7847 IsNotProfitableAltCodeNode(*Data.first) ||
7848 Data.first->State == TreeEntry::StridedVectorize ||
7849 Data.first->State == TreeEntry::CompressVectorize) {
7850 reorderScalars(Scalars&: Data.first->Scalars, Mask);
7851 reorderOrder(Order&: Data.first->ReorderIndices, Mask: MaskOrder,
7852 /*BottomOrder=*/true);
7853 if (Data.first->ReuseShuffleIndices.empty() &&
7854 !Data.first->ReorderIndices.empty() &&
7855 !IsNotProfitableAltCodeNode(*Data.first)) {
7856 // Insert user node to the list to try to sink reordering deeper in
7857 // the graph.
7858 Queue.push(x: Data.first);
7859 }
7860 } else {
7861 reorderOrder(Order&: Data.first->ReorderIndices, Mask);
7862 }
7863 }
7864 }
7865 // If the reordering is unnecessary, just remove the reorder.
7866 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
7867 VectorizableTree.front()->ReuseShuffleIndices.empty())
7868 VectorizableTree.front()->ReorderIndices.clear();
7869}
7870
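/// Returns the instruction, if any, that acts as the root scalar of \p Entry:
/// for reversed strided loads and stores this is the scalar indexed by the
/// front of the reorder indices, otherwise the first scalar of the entry.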
7871Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
7872 if ((Entry.getOpcode() == Instruction::Store ||
7873 Entry.getOpcode() == Instruction::Load) &&
7874 Entry.State == TreeEntry::StridedVectorize &&
7875 !Entry.ReorderIndices.empty() && isReverseOrder(Order: Entry.ReorderIndices))
7876 return dyn_cast<Instruction>(Val: Entry.Scalars[Entry.ReorderIndices.front()]);
7877 return dyn_cast<Instruction>(Val: Entry.Scalars.front());
7878}
7879
7880void BoUpSLP::buildExternalUses(
7881 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
7882 DenseMap<Value *, unsigned> ScalarToExtUses;
7883 // Collect the values that we need to extract from the tree.
7884 for (auto &TEPtr : VectorizableTree) {
7885 TreeEntry *Entry = TEPtr.get();
7886
7887 // No need to handle users of gathered values.
7888 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
7889 continue;
7890
7891 // For each lane:
7892 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
7893 Value *Scalar = Entry->Scalars[Lane];
7894 if (!isa<Instruction>(Val: Scalar))
7895 continue;
      // All uses have already been replaced? No need to do it again.
7897 auto It = ScalarToExtUses.find(Val: Scalar);
7898 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
7899 continue;
7900
7901 // Check if the scalar is externally used as an extra arg.
7902 const auto ExtI = ExternallyUsedValues.find(V: Scalar);
7903 if (ExtI != ExternallyUsedValues.end()) {
7904 int FoundLane = Entry->findLaneForValue(V: Scalar);
7905 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
7906 << FoundLane << " from " << *Scalar << ".\n");
7907 ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size());
7908 ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: *Entry, Args&: FoundLane);
7909 continue;
7910 }
7911 for (User *U : Scalar->users()) {
7912 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
7913
7914 Instruction *UserInst = dyn_cast<Instruction>(Val: U);
7915 if (!UserInst || isDeleted(I: UserInst))
7916 continue;
7917
7918 // Ignore users in the user ignore list.
7919 if (UserIgnoreList && UserIgnoreList->contains(V: UserInst))
7920 continue;
7921
7922 // Skip in-tree scalars that become vectors
7923 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(V: U);
7924 !UseEntries.empty()) {
7925 // Some in-tree scalars will remain as scalar in vectorized
7926 // instructions. If that is the case, the one in FoundLane will
7927 // be used.
7928 if (all_of(Range&: UseEntries, P: [&](TreeEntry *UseEntry) {
7929 return UseEntry->State == TreeEntry::ScatterVectorize ||
7930 !doesInTreeUserNeedToExtract(
7931 Scalar, UserInst: getRootEntryInstruction(Entry: *UseEntry), TLI,
7932 TTI);
7933 })) {
7934 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
7935 << ".\n");
7936 assert(none_of(UseEntries,
7937 [](TreeEntry *UseEntry) {
7938 return UseEntry->isGather();
7939 }) &&
7940 "Bad state");
7941 continue;
7942 }
7943 U = nullptr;
7944 if (It != ScalarToExtUses.end()) {
7945 ExternalUses[It->second].User = nullptr;
7946 break;
7947 }
7948 }
7949
7950 if (U && Scalar->hasNUsesOrMore(N: UsesLimit))
7951 U = nullptr;
7952 int FoundLane = Entry->findLaneForValue(V: Scalar);
7953 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
7954 << " from lane " << FoundLane << " from " << *Scalar
7955 << ".\n");
7956 It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
7957 ExternalUses.emplace_back(Args&: Scalar, Args&: U, Args&: *Entry, Args&: FoundLane);
7958 if (!U)
7959 break;
7960 }
7961 }
7962 }
7963}
7964
7965SmallVector<SmallVector<StoreInst *>>
7966BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
7967 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
7968 SmallVector<StoreInst *>, 8>
7969 PtrToStoresMap;
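  // Stores are bucketed by (parent block, stored value type, underlying
  // pointer object), so only stores that could possibly form a single vector
  // store end up in the same bucket.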
7970 for (unsigned Lane : seq<unsigned>(Begin: 0, End: TE->Scalars.size())) {
7971 Value *V = TE->Scalars[Lane];
7972 // Don't iterate over the users of constant data.
7973 if (!isa<Instruction>(Val: V))
7974 continue;
7975 // To save compilation time we don't visit if we have too many users.
7976 if (V->hasNUsesOrMore(N: UsesLimit))
7977 break;
7978
7979 // Collect stores per pointer object.
7980 for (User *U : V->users()) {
7981 auto *SI = dyn_cast<StoreInst>(Val: U);
7982 // Test whether we can handle the store. V might be a global, which could
7983 // be used in a different function.
7984 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
7985 !isValidElementType(Ty: SI->getValueOperand()->getType()))
7986 continue;
      // Skip the store if it has already been vectorized.
7988 if (isVectorized(V: U))
7989 continue;
7990
7991 Value *Ptr =
7992 getUnderlyingObject(V: SI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
7993 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
7994 SI->getValueOperand()->getType(), Ptr}];
7995 // For now just keep one store per pointer object per lane.
7996 // TODO: Extend this to support multiple stores per pointer per lane
7997 if (StoresVec.size() > Lane)
7998 continue;
7999 if (!StoresVec.empty()) {
8000 std::optional<int64_t> Diff = getPointersDiff(
8001 ElemTyA: SI->getValueOperand()->getType(), PtrA: SI->getPointerOperand(),
8002 ElemTyB: SI->getValueOperand()->getType(),
8003 PtrB: StoresVec.front()->getPointerOperand(), DL: *DL, SE&: *SE,
8004 /*StrictCheck=*/true);
8005 // We failed to compare the pointers so just abandon this store.
8006 if (!Diff)
8007 continue;
8008 }
8009 StoresVec.push_back(Elt: SI);
8010 }
8011 }
8012 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
8013 unsigned I = 0;
8014 for (auto &P : PtrToStoresMap) {
8015 Res[I].swap(RHS&: P.second);
8016 ++I;
8017 }
8018 return Res;
8019}
8020
8021bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
8022 OrdersType &ReorderIndices) const {
  // We check whether the stores in StoresVec can form a vector by sorting
  // them and checking whether they are consecutive.
8025
8026 // To avoid calling getPointersDiff() while sorting we create a vector of
8027 // pairs {store, offset from first} and sort this instead.
8028 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
8029 StoreInst *S0 = StoresVec[0];
8030 StoreOffsetVec.emplace_back(Args: 0, Args: 0);
8031 Type *S0Ty = S0->getValueOperand()->getType();
8032 Value *S0Ptr = S0->getPointerOperand();
8033 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoresVec.size())) {
8034 StoreInst *SI = StoresVec[Idx];
8035 std::optional<int64_t> Diff =
8036 getPointersDiff(ElemTyA: S0Ty, PtrA: S0Ptr, ElemTyB: SI->getValueOperand()->getType(),
8037 PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
8038 /*StrictCheck=*/true);
8039 StoreOffsetVec.emplace_back(Args&: *Diff, Args&: Idx);
8040 }
8041
8042 // Check if the stores are consecutive by checking if their difference is 1.
8043 if (StoreOffsetVec.size() != StoresVec.size())
8044 return false;
8045 sort(C&: StoreOffsetVec, Comp: llvm::less_first());
8046 unsigned Idx = 0;
8047 int64_t PrevDist = 0;
8048 for (const auto &P : StoreOffsetVec) {
8049 if (Idx > 0 && P.first != PrevDist + 1)
8050 return false;
8051 PrevDist = P.first;
8052 ++Idx;
8053 }
8054
8055 // Calculate the shuffle indices according to their offset against the sorted
8056 // StoreOffsetVec.
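  // E.g., stores whose offsets from the first store are {0, 2, 1, 3} get the
  // reorder indices <0, 2, 1, 3>, i.e. each store is assigned its rank in the
  // sorted offset sequence (which, for consecutive stores, equals its offset
  // from the first store).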
8057 ReorderIndices.assign(NumElts: StoresVec.size(), Elt: 0);
8058 bool IsIdentity = true;
8059 for (auto [I, P] : enumerate(First&: StoreOffsetVec)) {
8060 ReorderIndices[P.second] = I;
8061 IsIdentity &= P.second == I;
8062 }
8063 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
8064 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
8065 // same convention here.
8066 if (IsIdentity)
8067 ReorderIndices.clear();
8068
8069 return true;
8070}
8071
8072#ifndef NDEBUG
8073LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
8074 for (unsigned Idx : Order)
8075 dbgs() << Idx << ", ";
8076 dbgs() << "\n";
8077}
8078#endif
8079
8080SmallVector<BoUpSLP::OrdersType, 1>
8081BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
8082 unsigned NumLanes = TE->Scalars.size();
8083
8084 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
8085
8086 // Holds the reorder indices for each candidate store vector that is a user of
8087 // the current TreeEntry.
8088 SmallVector<OrdersType, 1> ExternalReorderIndices;
8089
8090 // Now inspect the stores collected per pointer and look for vectorization
8091 // candidates. For each candidate calculate the reorder index vector and push
8092 // it into `ExternalReorderIndices`
8093 for (ArrayRef<StoreInst *> StoresVec : Stores) {
8094 // If we have fewer than NumLanes stores, then we can't form a vector.
8095 if (StoresVec.size() != NumLanes)
8096 continue;
8097
8098 // If the stores are not consecutive then abandon this StoresVec.
8099 OrdersType ReorderIndices;
8100 if (!canFormVector(StoresVec, ReorderIndices))
8101 continue;
8102
8103 // We now know that the scalars in StoresVec can form a vector instruction,
8104 // so set the reorder indices.
8105 ExternalReorderIndices.push_back(Elt: ReorderIndices);
8106 }
8107 return ExternalReorderIndices;
8108}
8109
8110void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
8111 const SmallDenseSet<Value *> &UserIgnoreLst) {
8112 deleteTree();
8113 UserIgnoreList = &UserIgnoreLst;
8114 if (!allSameType(VL: Roots))
8115 return;
8116 buildTreeRec(Roots, Depth: 0, EI: EdgeInfo());
8117}
8118
8119void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
8120 deleteTree();
8121 if (!allSameType(VL: Roots))
8122 return;
8123 buildTreeRec(Roots, Depth: 0, EI: EdgeInfo());
8124}
8125
8126/// Tries to find subvector of loads and builds new vector of only loads if can
8127/// be profitable.
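/// Loads are first clustered by parent block, type and underlying pointer
/// object, keeping at most one load per constant pointer distance, and each
/// cluster is then merged into an existing \p GatheredLoads group when a
/// constant pointer distance to that group can be computed (otherwise a new
/// group is started).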
8128static void gatherPossiblyVectorizableLoads(
8129 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
8130 ScalarEvolution &SE, const TargetTransformInfo &TTI,
8131 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
8132 bool AddNew = true) {
8133 if (VL.empty())
8134 return;
8135 Type *ScalarTy = getValueType(V: VL.front());
8136 if (!isValidElementType(Ty: ScalarTy))
8137 return;
8138 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
8139 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
8140 for (Value *V : VL) {
8141 auto *LI = dyn_cast<LoadInst>(Val: V);
8142 if (!LI)
8143 continue;
8144 if (R.isDeleted(I: LI) || R.isVectorized(V: LI) || !LI->isSimple())
8145 continue;
8146 bool IsFound = false;
8147 for (auto [Map, Data] : zip(t&: ClusteredDistToLoad, u&: ClusteredLoads)) {
8148 assert(LI->getParent() == Data.front().first->getParent() &&
8149 LI->getType() == Data.front().first->getType() &&
8150 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
8151 getUnderlyingObject(Data.front().first->getPointerOperand(),
8152 RecursionMaxDepth) &&
8153 "Expected loads with the same type, same parent and same "
8154 "underlying pointer.");
8155 std::optional<int64_t> Dist = getPointersDiff(
8156 ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(), ElemTyB: Data.front().first->getType(),
8157 PtrB: Data.front().first->getPointerOperand(), DL, SE,
8158 /*StrictCheck=*/true);
8159 if (!Dist)
8160 continue;
8161 auto It = Map.find(Val: *Dist);
8162 if (It != Map.end() && It->second != LI)
8163 continue;
8164 if (It == Map.end()) {
8165 Data.emplace_back(Args&: LI, Args&: *Dist);
8166 Map.try_emplace(Key: *Dist, Args&: LI);
8167 }
8168 IsFound = true;
8169 break;
8170 }
8171 if (!IsFound) {
8172 ClusteredLoads.emplace_back().emplace_back(Args&: LI, Args: 0);
8173 ClusteredDistToLoad.emplace_back().try_emplace(Key: 0, Args&: LI);
8174 }
8175 }
8176 auto FindMatchingLoads =
8177 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
8178 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
8179 &GatheredLoads,
8180 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
8181 int64_t &Offset, unsigned &Start) {
8182 if (Loads.empty())
8183 return GatheredLoads.end();
8184 LoadInst *LI = Loads.front().first;
8185 for (auto [Idx, Data] : enumerate(First&: GatheredLoads)) {
8186 if (Idx < Start)
8187 continue;
8188 ToAdd.clear();
8189 if (LI->getParent() != Data.front().first->getParent() ||
8190 LI->getType() != Data.front().first->getType())
8191 continue;
8192 std::optional<int64_t> Dist =
8193 getPointersDiff(ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(),
8194 ElemTyB: Data.front().first->getType(),
8195 PtrB: Data.front().first->getPointerOperand(), DL, SE,
8196 /*StrictCheck=*/true);
8197 if (!Dist)
8198 continue;
8199 SmallSet<int64_t, 4> DataDists;
8200 SmallPtrSet<LoadInst *, 4> DataLoads;
8201 for (std::pair<LoadInst *, int64_t> P : Data) {
8202 DataDists.insert(V: P.second);
8203 DataLoads.insert(Ptr: P.first);
8204 }
8205 // Found matching gathered loads - check if all loads are unique or
8206 // can be effectively vectorized.
8207 unsigned NumUniques = 0;
8208 for (auto [Cnt, Pair] : enumerate(First&: Loads)) {
8209 bool Used = DataLoads.contains(Ptr: Pair.first);
8210 if (!Used && !DataDists.contains(V: *Dist + Pair.second)) {
8211 ++NumUniques;
8212 ToAdd.insert(X: Cnt);
8213 } else if (Used) {
8214 Repeated.insert(X: Cnt);
8215 }
8216 }
8217 if (NumUniques > 0 &&
8218 (Loads.size() == NumUniques ||
8219 (Loads.size() - NumUniques >= 2 &&
8220 Loads.size() - NumUniques >= Loads.size() / 2 &&
8221 (has_single_bit(Value: Data.size() + NumUniques) ||
8222 bit_ceil(Value: Data.size()) <
8223 bit_ceil(Value: Data.size() + NumUniques))))) {
8224 Offset = *Dist;
8225 Start = Idx + 1;
8226 return std::next(x: GatheredLoads.begin(), n: Idx);
8227 }
8228 }
8229 ToAdd.clear();
8230 return GatheredLoads.end();
8231 };
8232 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
8233 unsigned Start = 0;
8234 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
8235 int64_t Offset = 0;
8236 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
8237 Offset, Start);
8238 while (It != GatheredLoads.end()) {
8239 assert(!LocalToAdd.empty() && "Expected some elements to add.");
8240 for (unsigned Idx : LocalToAdd)
8241 It->emplace_back(Args: Data[Idx].first, Args: Data[Idx].second + Offset);
8242 ToAdd.insert_range(R&: LocalToAdd);
8243 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
8244 Start);
8245 }
8246 if (any_of(Range: seq<unsigned>(Size: Data.size()), P: [&](unsigned Idx) {
8247 return !ToAdd.contains(key: Idx) && !Repeated.contains(key: Idx);
8248 })) {
8249 auto AddNewLoads =
8250 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
8251 for (unsigned Idx : seq<unsigned>(Size: Data.size())) {
8252 if (ToAdd.contains(key: Idx) || Repeated.contains(key: Idx))
8253 continue;
8254 Loads.push_back(Elt: Data[Idx]);
8255 }
8256 };
8257 if (!AddNew) {
8258 LoadInst *LI = Data.front().first;
8259 It = find_if(
8260 Range&: GatheredLoads, P: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
8261 return PD.front().first->getParent() == LI->getParent() &&
8262 PD.front().first->getType() == LI->getType();
8263 });
8264 while (It != GatheredLoads.end()) {
8265 AddNewLoads(*It);
8266 It = std::find_if(
8267 first: std::next(x: It), last: GatheredLoads.end(),
8268 pred: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
8269 return PD.front().first->getParent() == LI->getParent() &&
8270 PD.front().first->getType() == LI->getType();
8271 });
8272 }
8273 }
8274 GatheredLoads.emplace_back().append(in_start: Data.begin(), in_end: Data.end());
8275 AddNewLoads(GatheredLoads.emplace_back());
8276 }
8277 }
8278}
8279
8280void BoUpSLP::tryToVectorizeGatheredLoads(
8281 const SmallMapVector<
8282 std::tuple<BasicBlock *, Value *, Type *>,
8283 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
8284 &GatheredLoads) {
8285 GatheredLoadsEntriesFirst = VectorizableTree.size();
8286
8287 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
8288 LoadEntriesToVectorize.size());
8289 for (auto [Idx, Set] : zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize))
8290 Set.insert_range(R&: VectorizableTree[Idx]->Scalars);
8291
8292 // Sort loads by distance.
8293 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
8294 const std::pair<LoadInst *, int64_t> &L2) {
8295 return L1.second > L2.second;
8296 };
8297
8298 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
8299 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
8300 Loads.size());
8301 Align Alignment = computeCommonAlignment<LoadInst>(VL: Values);
8302 auto *Ty = getWidenedType(ScalarTy: Loads.front()->getType(), VF: Loads.size());
8303 return TTI->isLegalMaskedGather(DataType: Ty, Alignment) &&
8304 !TTI->forceScalarizeMaskedGather(Type: Ty, Alignment);
8305 };
8306
8307 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
8308 BoUpSLP::ValueSet &VectorizedLoads,
8309 SmallVectorImpl<LoadInst *> &NonVectorized,
8310 bool Final, unsigned MaxVF) {
8311 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
8312 unsigned StartIdx = 0;
8313 SmallVector<int> CandidateVFs;
8314 if (VectorizeNonPowerOf2 && has_single_bit(Value: MaxVF + 1))
8315 CandidateVFs.push_back(Elt: MaxVF);
8316 for (int NumElts = getFloorFullVectorNumberOfElements(
8317 TTI: *TTI, Ty: Loads.front()->getType(), Sz: MaxVF);
8318 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
8319 TTI: *TTI, Ty: Loads.front()->getType(), Sz: NumElts - 1)) {
8320 CandidateVFs.push_back(Elt: NumElts);
8321 if (VectorizeNonPowerOf2 && NumElts > 2)
8322 CandidateVFs.push_back(Elt: NumElts - 1);
8323 }
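    // For illustration (assumed full-register widths): with MaxVF = 16 and
    // only power-of-2 vectorization, the candidate VFs collected above are
    // {16, 8, 4, 2}; with non-power-of-2 vectorization enabled, {15, 7, 3}
    // are tried as well.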
8324
8325 if (Final && CandidateVFs.empty())
8326 return Results;
8327
8328 unsigned BestVF = Final ? CandidateVFs.back() : 0;
8329 for (unsigned NumElts : CandidateVFs) {
8330 if (Final && NumElts > BestVF)
8331 continue;
8332 SmallVector<unsigned> MaskedGatherVectorized;
8333      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
8335 ArrayRef<LoadInst *> Slice =
8336 ArrayRef(Loads).slice(N: Cnt, M: std::min(a: NumElts, b: E - Cnt));
8337 if (VectorizedLoads.count(Ptr: Slice.front()) ||
8338 VectorizedLoads.count(Ptr: Slice.back()) ||
8339 areKnownNonVectorizableLoads(VL: Slice))
8340 continue;
8341        // Check if it is profitable to try vectorizing gathered loads. It is
8342        // profitable if we have more than 3 consecutive loads or if we have
8343        // fewer, but all their users are vectorized or deleted.
8344 bool AllowToVectorize = false;
8345        // Check if it is profitable to vectorize 2-element loads.
8346 if (NumElts == 2) {
8347 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
8348 ElementTy: Slice.front()->getType(), NumElements: ElementCount::getFixed(MinVal: NumElts));
8349 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
8350 for (LoadInst *LI : Slice) {
8351 // If single use/user - allow to vectorize.
8352 if (LI->hasOneUse())
8353 continue;
8354              // 1. The number of uses equals the number of users.
8355              // 2. All users are deleted.
8356              // 3. Load broadcasts are not legal or the load is not used as
8357              // a broadcast.
8358 if (static_cast<unsigned int>(std::distance(
8359 first: LI->user_begin(), last: LI->user_end())) != LI->getNumUses())
8360 return false;
8361 if (!IsLegalBroadcastLoad)
8362 continue;
8363 if (LI->hasNUsesOrMore(N: UsesLimit))
8364 return false;
8365 for (User *U : LI->users()) {
8366 if (auto *UI = dyn_cast<Instruction>(Val: U); UI && isDeleted(I: UI))
8367 continue;
8368 for (const TreeEntry *UTE : getTreeEntries(V: U)) {
8369 for (int I : seq<int>(Size: UTE->getNumOperands())) {
8370 if (all_of(Range: UTE->getOperand(OpIdx: I), P: [LI](Value *V) {
8371 return V == LI || isa<PoisonValue>(Val: V);
8372 }))
8373 // Found legal broadcast - do not vectorize.
8374 return false;
8375 }
8376 }
8377 }
8378 }
8379 return true;
8380 };
8381 AllowToVectorize = CheckIfAllowed(Slice);
8382 } else {
8383 AllowToVectorize =
8384 (NumElts >= 3 ||
8385 any_of(Range: ValueToGatherNodes.at(Val: Slice.front()),
8386 P: [=](const TreeEntry *TE) {
8387 return TE->Scalars.size() == 2 &&
8388 ((TE->Scalars.front() == Slice.front() &&
8389 TE->Scalars.back() == Slice.back()) ||
8390 (TE->Scalars.front() == Slice.back() &&
8391 TE->Scalars.back() == Slice.front()));
8392 })) &&
8393 hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(),
8394 Sz: Slice.size());
8395 }
8396 if (AllowToVectorize) {
8397 SmallVector<Value *> PointerOps;
8398 OrdersType CurrentOrder;
8399 // Try to build vector load.
8400 ArrayRef<Value *> Values(
8401 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
8402 LoadsState LS = canVectorizeLoads(VL: Values, VL0: Slice.front(), Order&: CurrentOrder,
8403 PointerOps, BestVF: &BestVF);
8404 if (LS != LoadsState::Gather ||
8405 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
8406 if (LS == LoadsState::ScatterVectorize) {
8407 if (MaskedGatherVectorized.empty() ||
8408 Cnt >= MaskedGatherVectorized.back() + NumElts)
8409 MaskedGatherVectorized.push_back(Elt: Cnt);
8410 continue;
8411 }
8412 if (LS != LoadsState::Gather) {
8413 Results.emplace_back(Args&: Values, Args&: LS);
8414 VectorizedLoads.insert_range(R&: Slice);
8415 // If we vectorized initial block, no need to try to vectorize it
8416 // again.
8417 if (Cnt == StartIdx)
8418 StartIdx += NumElts;
8419 }
8420 // Check if the whole array was vectorized already - exit.
8421 if (StartIdx >= Loads.size())
8422 break;
8423 // Erase last masked gather candidate, if another candidate within
8424 // the range is found to be better.
8425 if (!MaskedGatherVectorized.empty() &&
8426 Cnt < MaskedGatherVectorized.back() + NumElts)
8427 MaskedGatherVectorized.pop_back();
8428 Cnt += NumElts - 1;
8429 continue;
8430 }
8431 }
8432 if (!AllowToVectorize || BestVF == 0)
8433 registerNonVectorizableLoads(VL: Slice);
8434 }
8435      // Mark masked gather candidates as vectorized, if any.
8436 for (unsigned Cnt : MaskedGatherVectorized) {
8437 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
8438 N: Cnt, M: std::min<unsigned>(a: NumElts, b: Loads.size() - Cnt));
8439 ArrayRef<Value *> Values(
8440 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
8441 Results.emplace_back(Args&: Values, Args: LoadsState::ScatterVectorize);
8442 VectorizedLoads.insert_range(R&: Slice);
8443 // If we vectorized initial block, no need to try to vectorize it again.
8444 if (Cnt == StartIdx)
8445 StartIdx += NumElts;
8446 }
8447 }
8448 for (LoadInst *LI : Loads) {
8449 if (!VectorizedLoads.contains(Ptr: LI))
8450 NonVectorized.push_back(Elt: LI);
8451 }
8452 return Results;
8453 };
8454 auto ProcessGatheredLoads =
8455 [&, &TTI = *TTI](
8456 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
8457 bool Final = false) {
8458 SmallVector<LoadInst *> NonVectorized;
8459 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
8460 GatheredLoads) {
8461 if (LoadsDists.size() <= 1) {
8462 NonVectorized.push_back(Elt: LoadsDists.back().first);
8463 continue;
8464 }
8465 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
8466 LoadsDists);
8467 SmallVector<LoadInst *> OriginalLoads(make_first_range(c&: LoadsDists));
8468 stable_sort(Range&: LocalLoadsDists, C: LoadSorter);
8469 SmallVector<LoadInst *> Loads;
8470 unsigned MaxConsecutiveDistance = 0;
8471 unsigned CurrentConsecutiveDist = 1;
8472 int64_t LastDist = LocalLoadsDists.front().second;
8473 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
8474 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
8475 if (isVectorized(V: L.first))
8476 continue;
8477 assert(LastDist >= L.second &&
8478 "Expected first distance always not less than second");
8479 if (static_cast<uint64_t>(LastDist - L.second) ==
8480 CurrentConsecutiveDist) {
8481 ++CurrentConsecutiveDist;
8482 MaxConsecutiveDistance =
8483 std::max(a: MaxConsecutiveDistance, b: CurrentConsecutiveDist);
8484 Loads.push_back(Elt: L.first);
8485 continue;
8486 }
8487 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
8488 !Loads.empty())
8489 Loads.pop_back();
8490 CurrentConsecutiveDist = 1;
8491 LastDist = L.second;
8492 Loads.push_back(Elt: L.first);
8493 }
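          // E.g. (illustrative distances): after the descending sort, the
          // distances {7, 6, 5, 3, 2} form a consecutive run {7, 6, 5}
          // (MaxConsecutiveDistance == 3) followed by a second run {3, 2}.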
8494 if (Loads.size() <= 1)
8495 continue;
8496 if (AllowMaskedGather)
8497 MaxConsecutiveDistance = Loads.size();
8498 else if (MaxConsecutiveDistance < 2)
8499 continue;
8500 BoUpSLP::ValueSet VectorizedLoads;
8501 SmallVector<LoadInst *> SortedNonVectorized;
8502 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
8503 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
8504 Final, MaxConsecutiveDistance);
8505 if (!Results.empty() && !SortedNonVectorized.empty() &&
8506 OriginalLoads.size() == Loads.size() &&
8507 MaxConsecutiveDistance == Loads.size() &&
8508 all_of(Range&: Results,
8509 P: [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
8510 return P.second == LoadsState::ScatterVectorize;
8511 })) {
8512 VectorizedLoads.clear();
8513 SmallVector<LoadInst *> UnsortedNonVectorized;
8514 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
8515 UnsortedResults =
8516 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
8517 UnsortedNonVectorized, Final,
8518 OriginalLoads.size());
8519 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
8520 SortedNonVectorized.swap(RHS&: UnsortedNonVectorized);
8521 Results.swap(RHS&: UnsortedResults);
8522 }
8523 }
8524 for (auto [Slice, _] : Results) {
8525 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
8526 << Slice.size() << ")\n");
8527 if (any_of(Range&: Slice, P: [&](Value *V) { return isVectorized(V); })) {
8528 for (Value *L : Slice)
8529 if (!isVectorized(V: L))
8530 SortedNonVectorized.push_back(Elt: cast<LoadInst>(Val: L));
8531 continue;
8532 }
8533
8534            // Select the maximum VF as the maximum of the user gather nodes'
8535            // sizes and the distance between scalar loads in these nodes.
8536 unsigned MaxVF = Slice.size();
8537 unsigned UserMaxVF = 0;
8538 unsigned InterleaveFactor = 0;
8539 if (MaxVF == 2) {
8540 UserMaxVF = MaxVF;
8541 } else {
8542              // Find the distance between segments of the interleaved loads.
8543 std::optional<unsigned> InterleavedLoadsDistance = 0;
8544 unsigned Order = 0;
8545 std::optional<unsigned> CommonVF = 0;
8546 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
8547 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
8548 for (auto [Idx, V] : enumerate(First&: Slice)) {
8549 for (const TreeEntry *E : ValueToGatherNodes.at(Val: V)) {
8550 UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: E->Scalars.size());
8551 unsigned Pos =
8552 EntryToPosition.try_emplace(Key: E, Args&: Idx).first->second;
8553 UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: Idx - Pos + 1);
8554 if (CommonVF) {
8555 if (*CommonVF == 0) {
8556 CommonVF = E->Scalars.size();
8557 continue;
8558 }
8559 if (*CommonVF != E->Scalars.size())
8560 CommonVF.reset();
8561 }
8562                  // Check if the load is part of an interleaved load.
8563 if (Pos != Idx && InterleavedLoadsDistance) {
8564 if (!DeinterleavedNodes.contains(Ptr: E) &&
8565 any_of(Range: E->Scalars, P: [&, Slice = Slice](Value *V) {
8566 if (isa<Constant>(Val: V))
8567 return false;
8568 if (isVectorized(V))
8569 return true;
8570 const auto &Nodes = ValueToGatherNodes.at(Val: V);
8571 return (Nodes.size() != 1 || !Nodes.contains(key: E)) &&
8572 !is_contained(Range: Slice, Element: V);
8573 })) {
8574 InterleavedLoadsDistance.reset();
8575 continue;
8576 }
8577 DeinterleavedNodes.insert(Ptr: E);
8578 if (*InterleavedLoadsDistance == 0) {
8579 InterleavedLoadsDistance = Idx - Pos;
8580 continue;
8581 }
8582 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
8583 (Idx - Pos) / *InterleavedLoadsDistance < Order)
8584 InterleavedLoadsDistance.reset();
8585 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(u: 1);
8586 }
8587 }
8588 }
8589 DeinterleavedNodes.clear();
8590              // Check if the large load represents an interleaved load operation.
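              // Illustrative example (assumed gather nodes): consecutive
              // loads a[0..3] feeding two 2-element gather nodes
              // {a[0], a[2]} and {a[1], a[3]} give
              // InterleavedLoadsDistance == 2 and CommonVF == 2, i.e. a
              // 2-way interleaved access that is vectorized with
              // UserMaxVF == 4 if the target supports it.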
8591 if (InterleavedLoadsDistance.value_or(u: 0) > 1 &&
8592 CommonVF.value_or(u: 0) != 0) {
8593 InterleaveFactor = bit_ceil(Value: *InterleavedLoadsDistance);
8594 unsigned VF = *CommonVF;
8595 OrdersType Order;
8596 SmallVector<Value *> PointerOps;
8597 // Segmented load detected - vectorize at maximum vector factor.
8598 if (InterleaveFactor <= Slice.size() &&
8599 TTI.isLegalInterleavedAccessType(
8600 VTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF),
8601 Factor: InterleaveFactor,
8602 Alignment: cast<LoadInst>(Val: Slice.front())->getAlign(),
8603 AddrSpace: cast<LoadInst>(Val: Slice.front())
8604 ->getPointerAddressSpace()) &&
8605 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order,
8606 PointerOps) == LoadsState::Vectorize) {
8607 UserMaxVF = InterleaveFactor * VF;
8608 } else {
8609 InterleaveFactor = 0;
8610 }
8611 }
8612 // Cannot represent the loads as consecutive vectorizable nodes -
8613 // just exit.
8614 unsigned ConsecutiveNodesSize = 0;
8615 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
8616 any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize),
8617 P: [&, Slice = Slice](const auto &P) {
8618 const auto *It = find_if(Slice, [&](Value *V) {
8619 return std::get<1>(P).contains(V);
8620 });
8621 if (It == Slice.end())
8622 return false;
8623 const TreeEntry &TE =
8624 *VectorizableTree[std::get<0>(P)];
8625 ArrayRef<Value *> VL = TE.Scalars;
8626 OrdersType Order;
8627 SmallVector<Value *> PointerOps;
8628 LoadsState State = canVectorizeLoads(
8629 VL, VL0: VL.front(), Order, PointerOps);
8630 if (State == LoadsState::ScatterVectorize ||
8631 State == LoadsState::CompressVectorize)
8632 return false;
8633 ConsecutiveNodesSize += VL.size();
8634 unsigned Start = std::distance(Slice.begin(), It);
8635 unsigned Sz = Slice.size() - Start;
8636 return Sz < VL.size() ||
8637 Slice.slice(std::distance(Slice.begin(), It),
8638 VL.size()) != VL;
8639 }))
8640 continue;
8641 // Try to build long masked gather loads.
8642 UserMaxVF = bit_ceil(Value: UserMaxVF);
8643 if (InterleaveFactor == 0 &&
8644 any_of(Range: seq<unsigned>(Size: Slice.size() / UserMaxVF),
8645 P: [&, Slice = Slice](unsigned Idx) {
8646 OrdersType Order;
8647 SmallVector<Value *> PointerOps;
8648 return canVectorizeLoads(
8649 VL: Slice.slice(N: Idx * UserMaxVF, M: UserMaxVF),
8650 VL0: Slice[Idx * UserMaxVF], Order,
8651 PointerOps) ==
8652 LoadsState::ScatterVectorize;
8653 }))
8654 UserMaxVF = MaxVF;
8655 if (Slice.size() != ConsecutiveNodesSize)
8656 MaxVF = std::min<unsigned>(a: MaxVF, b: UserMaxVF);
8657 }
8658 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
8659 bool IsVectorized = true;
8660 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
8661 ArrayRef<Value *> SubSlice =
8662 Slice.slice(N: I, M: std::min(a: VF, b: E - I));
8663 if (isVectorized(V: SubSlice.front()))
8664 continue;
8665                // Check if the subslice belongs to a to-be-vectorized entry
8666                // that is not equal to this entry.
8667 if (any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize),
8668 P: [&](const auto &P) {
8669 return !SubSlice.equals(
8670 RHS: VectorizableTree[std::get<0>(P)]
8671 ->Scalars) &&
8672 set_is_subset(SubSlice, std::get<1>(P));
8673 }))
8674 continue;
8675 unsigned Sz = VectorizableTree.size();
8676 buildTreeRec(Roots: SubSlice, Depth: 0, EI: EdgeInfo(), InterleaveFactor);
8677 if (Sz == VectorizableTree.size()) {
8678 IsVectorized = false;
8679 // Try non-interleaved vectorization with smaller vector
8680 // factor.
8681 if (InterleaveFactor > 0) {
8682 VF = 2 * (MaxVF / InterleaveFactor);
8683 InterleaveFactor = 0;
8684 }
8685 continue;
8686 }
8687 }
8688 if (IsVectorized)
8689 break;
8690 }
8691 }
8692 NonVectorized.append(RHS: SortedNonVectorized);
8693 }
8694 return NonVectorized;
8695 };
8696 for (const auto &GLs : GatheredLoads) {
8697 const auto &Ref = GLs.second;
8698 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
8699 if (!Ref.empty() && !NonVectorized.empty() &&
8700 std::accumulate(
8701 first: Ref.begin(), last: Ref.end(), init: 0u,
8702 binary_op: [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
8703 -> unsigned { return S + LoadsDists.size(); }) !=
8704 NonVectorized.size() &&
8705 IsMaskedGatherSupported(NonVectorized)) {
8706 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
8707 FinalGatheredLoads;
8708 for (LoadInst *LI : NonVectorized) {
8709        // Reinsert non-vectorized loads into other lists of loads with the same
8710        // base pointers.
8711 gatherPossiblyVectorizableLoads(R: *this, VL: LI, DL: *DL, SE&: *SE, TTI: *TTI,
8712 GatheredLoads&: FinalGatheredLoads,
8713 /*AddNew=*/false);
8714 }
8715 // Final attempt to vectorize non-vectorized loads.
8716 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
8717 }
8718 }
8719 // Try to vectorize postponed load entries, previously marked as gathered.
8720 for (unsigned Idx : LoadEntriesToVectorize) {
8721 const TreeEntry &E = *VectorizableTree[Idx];
8722 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
8723 // Avoid reordering, if possible.
8724 if (!E.ReorderIndices.empty()) {
8725 // Build a mask out of the reorder indices and reorder scalars per this
8726 // mask.
8727 SmallVector<int> ReorderMask;
8728 inversePermutation(Indices: E.ReorderIndices, Mask&: ReorderMask);
8729 reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
8730 }
8731 buildTreeRec(Roots: GatheredScalars, Depth: 0, EI: EdgeInfo());
8732 }
8733  // If no new entries were created, there are no gathered-load entries to
8734  // handle.
8735 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
8736 VectorizableTree.size())
8737 GatheredLoadsEntriesFirst.reset();
8738}
8739
8740/// Generates a key/subkey pair for the given value to provide effective
8741/// sorting of the values and better detection of vectorizable value
8742/// sequences. The keys can be used for sorting the values themselves (keys)
8743/// and within value subgroups (subkeys).
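/// For example (illustrative): two simple loads of the same type share the
/// same key (derived from the Load opcode and type) and get subkeys from
/// \p LoadsSubkeyGenerator (typically based on pointer distance), while an
/// sdiv by a non-constant divisor receives a per-instruction subkey and is
/// kept apart.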
8744static std::pair<size_t, size_t> generateKeySubkey(
8745 Value *V, const TargetLibraryInfo *TLI,
8746 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
8747 bool AllowAlternate) {
8748 hash_code Key = hash_value(value: V->getValueID() + 2);
8749 hash_code SubKey = hash_value(value: 0);
8750 // Sort the loads by the distance between the pointers.
8751 if (auto *LI = dyn_cast<LoadInst>(Val: V)) {
8752 Key = hash_combine(args: LI->getType(), args: hash_value(value: Instruction::Load), args: Key);
8753 if (LI->isSimple())
8754 SubKey = hash_value(code: LoadsSubkeyGenerator(Key, LI));
8755 else
8756 Key = SubKey = hash_value(ptr: LI);
8757 } else if (isVectorLikeInstWithConstOps(V)) {
8758 // Sort extracts by the vector operands.
8759 if (isa<ExtractElementInst, UndefValue>(Val: V))
8760 Key = hash_value(value: Value::UndefValueVal + 1);
8761 if (auto *EI = dyn_cast<ExtractElementInst>(Val: V)) {
8762 if (!isUndefVector(V: EI->getVectorOperand()).all() &&
8763 !isa<UndefValue>(Val: EI->getIndexOperand()))
8764 SubKey = hash_value(ptr: EI->getVectorOperand());
8765 }
8766 } else if (auto *I = dyn_cast<Instruction>(Val: V)) {
8767 // Sort other instructions just by the opcodes except for CMPInst.
8768 // For CMP also sort by the predicate kind.
8769 if ((isa<BinaryOperator, CastInst>(Val: I)) &&
8770 isValidForAlternation(Opcode: I->getOpcode())) {
8771 if (AllowAlternate)
8772 Key = hash_value(value: isa<BinaryOperator>(Val: I) ? 1 : 0);
8773 else
8774 Key = hash_combine(args: hash_value(value: I->getOpcode()), args: Key);
8775 SubKey = hash_combine(
8776 args: hash_value(value: I->getOpcode()), args: hash_value(ptr: I->getType()),
8777 args: hash_value(ptr: isa<BinaryOperator>(Val: I)
8778 ? I->getType()
8779 : cast<CastInst>(Val: I)->getOperand(i_nocapture: 0)->getType()));
8780 // For casts, look through the only operand to improve compile time.
8781 if (isa<CastInst>(Val: I)) {
8782 std::pair<size_t, size_t> OpVals =
8783 generateKeySubkey(V: I->getOperand(i: 0), TLI, LoadsSubkeyGenerator,
8784 /*AllowAlternate=*/true);
8785 Key = hash_combine(args: OpVals.first, args: Key);
8786 SubKey = hash_combine(args: OpVals.first, args: SubKey);
8787 }
8788 } else if (auto *CI = dyn_cast<CmpInst>(Val: I)) {
8789 CmpInst::Predicate Pred = CI->getPredicate();
8790 if (CI->isCommutative())
8791 Pred = std::min(a: Pred, b: CmpInst::getInversePredicate(pred: Pred));
8792 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(pred: Pred);
8793 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: Pred),
8794 args: hash_value(value: SwapPred),
8795 args: hash_value(ptr: CI->getOperand(i_nocapture: 0)->getType()));
8796 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
8797 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI);
8798 if (isTriviallyVectorizable(ID)) {
8799 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: ID));
8800 } else if (!VFDatabase(*Call).getMappings(CI: *Call).empty()) {
8801 SubKey = hash_combine(args: hash_value(value: I->getOpcode()),
8802 args: hash_value(ptr: Call->getCalledFunction()));
8803 } else {
8804 Key = hash_combine(args: hash_value(ptr: Call), args: Key);
8805 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(ptr: Call));
8806 }
8807 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
8808 SubKey = hash_combine(args: hash_value(value: Op.Begin), args: hash_value(value: Op.End),
8809 args: hash_value(ptr: Op.Tag), args: SubKey);
8810 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
8811 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Val: Gep->getOperand(i_nocapture: 1)))
8812 SubKey = hash_value(ptr: Gep->getPointerOperand());
8813 else
8814 SubKey = hash_value(ptr: Gep);
8815 } else if (BinaryOperator::isIntDivRem(Opcode: I->getOpcode()) &&
8816 !isa<ConstantInt>(Val: I->getOperand(i: 1))) {
8817 // Do not try to vectorize instructions with potentially high cost.
8818 SubKey = hash_value(ptr: I);
8819 } else {
8820 SubKey = hash_value(value: I->getOpcode());
8821 }
8822 Key = hash_combine(args: hash_value(ptr: I->getParent()), args: Key);
8823 }
8824 return std::make_pair(x&: Key, y&: SubKey);
8825}
8826
8827/// Checks if the specified instruction \p I is a main operation for the given
8828/// \p MainOp and \p AltOp instructions.
8829static bool isMainInstruction(Instruction *I, Instruction *MainOp,
8830 Instruction *AltOp, const TargetLibraryInfo &TLI);
8831
8832bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
8833 ArrayRef<Value *> VL) const {
8834 Type *ScalarTy = S.getMainOp()->getType();
8835 unsigned Opcode0 = S.getOpcode();
8836 unsigned Opcode1 = S.getAltOpcode();
8837 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
8838 // If this pattern is supported by the target then consider it profitable.
8839 if (TTI->isLegalAltInstr(VecTy: getWidenedType(ScalarTy, VF: VL.size()), Opcode0,
8840 Opcode1, OpcodeMask))
8841 return true;
8842 SmallVector<ValueList> Operands;
8843 for (unsigned I : seq<unsigned>(Size: S.getMainOp()->getNumOperands())) {
8844 Operands.emplace_back();
8845 // Prepare the operand vector.
8846 for (Value *V : VL) {
8847 if (isa<PoisonValue>(Val: V)) {
8848 Operands.back().push_back(
8849 Elt: PoisonValue::get(T: S.getMainOp()->getOperand(i: I)->getType()));
8850 continue;
8851 }
8852 Operands.back().push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
8853 }
8854 }
8855 if (Operands.size() == 2) {
8856    // Try to find the best operand candidates.
8857 for (unsigned I : seq<unsigned>(Begin: 0, End: VL.size() - 1)) {
8858 SmallVector<std::pair<Value *, Value *>> Candidates(3);
8859 Candidates[0] = std::make_pair(x&: Operands[0][I], y&: Operands[0][I + 1]);
8860 Candidates[1] = std::make_pair(x&: Operands[0][I], y&: Operands[1][I + 1]);
8861 Candidates[2] = std::make_pair(x&: Operands[1][I], y&: Operands[0][I + 1]);
8862 std::optional<int> Res = findBestRootPair(Candidates);
8863 switch (Res.value_or(u: 0)) {
8864 case 0:
8865 break;
8866 case 1:
8867 std::swap(a&: Operands[0][I + 1], b&: Operands[1][I + 1]);
8868 break;
8869 case 2:
8870 std::swap(a&: Operands[0][I], b&: Operands[1][I]);
8871 break;
8872 default:
8873 llvm_unreachable("Unexpected index.");
8874 }
8875 }
8876 }
8877 DenseSet<unsigned> UniqueOpcodes;
8878 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
8879 unsigned NonInstCnt = 0;
8880  // Estimate the number of instructions required for the vectorized node and
8881  // for the buildvector node.
8882 unsigned UndefCnt = 0;
8883  // Count the number of extra shuffles required for the vector nodes.
8884 unsigned ExtraShuffleInsts = 0;
8885  // Check that the operands do not contain the same values and form either a
8886  // perfect diamond match or a shuffled match.
8887 if (Operands.size() == 2) {
8888 // Do not count same operands twice.
8889 if (Operands.front() == Operands.back()) {
8890 Operands.erase(CI: Operands.begin());
8891 } else if (!allConstant(VL: Operands.front()) &&
8892 all_of(Range&: Operands.front(), P: [&](Value *V) {
8893 return is_contained(Range&: Operands.back(), Element: V);
8894 })) {
8895 Operands.erase(CI: Operands.begin());
8896 ++ExtraShuffleInsts;
8897 }
8898 }
8899 const Loop *L = LI->getLoopFor(BB: S.getMainOp()->getParent());
8900  // Vectorize the node if:
8901  // 1. At least a single operand is constant or a splat.
8902  // 2. Operands have many loop invariants (while the instructions themselves
8903  // are not loop invariant).
8904  // 3. At least a single unique operand is expected to be vectorized.
8905 return none_of(Range&: Operands,
8906 P: [&](ArrayRef<Value *> Op) {
8907 if (allConstant(VL: Op) ||
8908 (!isSplat(VL: Op) && allSameBlock(VL: Op) && allSameType(VL: Op) &&
8909 getSameOpcode(VL: Op, TLI: *TLI)))
8910 return false;
8911 DenseMap<Value *, unsigned> Uniques;
8912 for (Value *V : Op) {
8913 if (isa<Constant, ExtractElementInst>(Val: V) ||
8914 isVectorized(V) || (L && L->isLoopInvariant(V))) {
8915 if (isa<UndefValue>(Val: V))
8916 ++UndefCnt;
8917 continue;
8918 }
8919 auto Res = Uniques.try_emplace(Key: V, Args: 0);
8920 // Found first duplicate - need to add shuffle.
8921 if (!Res.second && Res.first->second == 1)
8922 ++ExtraShuffleInsts;
8923 ++Res.first->getSecond();
8924 if (auto *I = dyn_cast<Instruction>(Val: V))
8925 UniqueOpcodes.insert(V: I->getOpcode());
8926 else if (Res.second)
8927 ++NonInstCnt;
8928 }
8929 return none_of(Range&: Uniques, P: [&](const auto &P) {
8930 return P.first->hasNUsesOrMore(P.second + 1) &&
8931 none_of(P.first->users(), [&](User *U) {
8932 return isVectorized(V: U) || Uniques.contains(Val: U);
8933 });
8934 });
8935 }) ||
8936         // Do not vectorize the node if the estimated number of vector
8937         // instructions exceeds the estimated number of buildvector
8938         // instructions. The number of vector operations is the number of
8939         // vector instructions plus the vector instructions for the operands
8940         // (buildvectors); the buildvector estimate is number_of_operands * number_of_scalars.
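         // For instance (illustrative numbers): with 4 scalars of 2 operands
         // each, the buildvector estimate is 2 * 4 = 8, so this fallback check
         // accepts the node only if UniqueOpcodes + NonInstCnt +
         // ExtraShuffleInsts + 3 (main + alt + shuffle) stays below 8 and
         // undefs do not dominate the operands.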
8941 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
8942 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
8943 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
8944}
8945
8946/// Builds the argument types vector for the given call instruction with the
8947/// given \p ID for the specified vector factor.
8948static SmallVector<Type *>
8949buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
8950 const unsigned VF, unsigned MinBW,
8951 const TargetTransformInfo *TTI) {
8952 SmallVector<Type *> ArgTys;
8953 for (auto [Idx, Arg] : enumerate(First: CI->args())) {
8954 if (ID != Intrinsic::not_intrinsic) {
8955 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI)) {
8956 ArgTys.push_back(Elt: Arg->getType());
8957 continue;
8958 }
8959 if (MinBW > 0) {
8960 ArgTys.push_back(
8961 Elt: getWidenedType(ScalarTy: IntegerType::get(C&: CI->getContext(), NumBits: MinBW), VF));
8962 continue;
8963 }
8964 }
8965 ArgTys.push_back(Elt: getWidenedType(ScalarTy: Arg->getType(), VF));
8966 }
8967 return ArgTys;
8968}
8969
8970/// Calculates the costs of the vectorized intrinsic call (if possible) and the
8971/// vectorized library function call (if possible). Returns an invalid cost for
8972/// the corresponding call if it cannot be vectorized or will be scalarized.
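/// For example (hypothetical scenario): for a bundle of 4 calls to
/// llvm.fabs.f32, the cost of the llvm.fabs.v4f32 intrinsic is computed and,
/// if TLI provides a vector library mapping, the cost of that library call is
/// returned as the second member; the caller treats the bundle as
/// vectorizable if either cost is valid.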
8973static std::pair<InstructionCost, InstructionCost>
8974getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
8975 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8976 ArrayRef<Type *> ArgTys) {
8977 auto Shape = VFShape::get(FTy: CI->getFunctionType(),
8978 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
8979 HasGlobalPred: false /*HasGlobalPred*/);
8980 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
8981 auto LibCost = InstructionCost::getInvalid();
8982 if (!CI->isNoBuiltin() && VecFunc) {
8983 // Calculate the cost of the vector library call.
8984 // If the corresponding vector call is cheaper, return its cost.
8985 LibCost =
8986 TTI->getCallInstrCost(F: nullptr, RetTy: VecTy, Tys: ArgTys, CostKind: TTI::TCK_RecipThroughput);
8987 }
8988 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8989
8990 // Calculate the cost of the vector intrinsic call.
8991 FastMathFlags FMF;
8992 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: CI))
8993 FMF = FPCI->getFastMathFlags();
8994 const InstructionCost ScalarLimit = 10000;
8995 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
8996 LibCost.isValid() ? LibCost : ScalarLimit);
8997 auto IntrinsicCost =
8998 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind: TTI::TCK_RecipThroughput);
8999 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
9000 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
9001 IntrinsicCost = InstructionCost::getInvalid();
9002
9003 return {IntrinsicCost, LibCost};
9004}
9005
9006BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9007 const InstructionsState &S, ArrayRef<Value *> VL,
9008 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9009 SmallVectorImpl<Value *> &PointerOps) {
9010 assert(S.getMainOp() &&
9011 "Expected instructions with same/alternate opcodes only.");
9012
9013 unsigned ShuffleOrOp =
9014 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9015 Instruction *VL0 = S.getMainOp();
9016 switch (ShuffleOrOp) {
9017 case Instruction::PHI: {
9018 // Too many operands - gather, most probably won't be vectorized.
9019 if (VL0->getNumOperands() > MaxPHINumOperands)
9020 return TreeEntry::NeedToGather;
9021 // Check for terminator values (e.g. invoke).
9022 for (Value *V : VL) {
9023 auto *PHI = dyn_cast<PHINode>(Val: V);
9024 if (!PHI)
9025 continue;
9026 for (Value *Incoming : PHI->incoming_values()) {
9027 Instruction *Term = dyn_cast<Instruction>(Val: Incoming);
9028 if (Term && Term->isTerminator()) {
9029 LLVM_DEBUG(dbgs()
9030 << "SLP: Need to swizzle PHINodes (terminator use).\n");
9031 return TreeEntry::NeedToGather;
9032 }
9033 }
9034 }
9035
9036 return TreeEntry::Vectorize;
9037 }
9038 case Instruction::ExtractElement:
9039 if (any_of(Range&: VL, P: [&](Value *V) {
9040 auto *EI = dyn_cast<ExtractElementInst>(Val: V);
9041 if (!EI)
9042 return true;
9043 return isVectorized(V: EI->getOperand(i_nocapture: 0));
9044 }))
9045 return TreeEntry::NeedToGather;
9046 [[fallthrough]];
9047 case Instruction::ExtractValue: {
9048 bool Reuse = canReuseExtract(VL, CurrentOrder);
9049 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
9050 // non-full registers).
9051 if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: VL0->getType(), Sz: VL.size()))
9052 return TreeEntry::NeedToGather;
9053 if (Reuse || !CurrentOrder.empty())
9054 return TreeEntry::Vectorize;
9055 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
9056 return TreeEntry::NeedToGather;
9057 }
9058 case Instruction::InsertElement: {
9059 // Check that we have a buildvector and not a shuffle of 2 or more
9060 // different vectors.
9061 ValueSet SourceVectors;
9062 for (Value *V : VL) {
9063 SourceVectors.insert(Ptr: cast<Instruction>(Val: V)->getOperand(i: 0));
9064 assert(getElementIndex(V) != std::nullopt &&
9065 "Non-constant or undef index?");
9066 }
9067
9068 if (count_if(Range&: VL, P: [&SourceVectors](Value *V) {
9069 return !SourceVectors.contains(Ptr: V);
9070 }) >= 2) {
9071 // Found 2nd source vector - cancel.
9072 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9073 "different source vectors.\n");
9074 return TreeEntry::NeedToGather;
9075 }
9076
9077 if (any_of(Range&: VL, P: [&SourceVectors](Value *V) {
9078 // The last InsertElement can have multiple uses.
9079 return SourceVectors.contains(Ptr: V) && !V->hasOneUse();
9080 })) {
9081 assert(SLPReVec && "Only supported by REVEC.");
9082 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9083 "multiple uses.\n");
9084 return TreeEntry::NeedToGather;
9085 }
9086
9087 return TreeEntry::Vectorize;
9088 }
9089 case Instruction::Load: {
9090 // Check that a vectorized load would load the same memory as a scalar
9091 // load. For example, we don't want to vectorize loads that are smaller
9092 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
9093 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
9094 // from such a struct, we read/write packed bits disagreeing with the
9095 // unvectorized version.
9096 auto IsGatheredNode = [&]() {
9097 if (!GatheredLoadsEntriesFirst)
9098 return false;
9099 return all_of(Range&: VL, P: [&](Value *V) {
9100 if (isa<PoisonValue>(Val: V))
9101 return true;
9102 return any_of(Range: getTreeEntries(V), P: [&](const TreeEntry *TE) {
9103 return TE->Idx >= *GatheredLoadsEntriesFirst;
9104 });
9105 });
9106 };
9107 switch (canVectorizeLoads(VL, VL0, Order&: CurrentOrder, PointerOps)) {
9108 case LoadsState::Vectorize:
9109 return TreeEntry::Vectorize;
9110 case LoadsState::CompressVectorize:
9111 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9112 // Delay slow vectorized nodes for better vectorization attempts.
9113 LoadEntriesToVectorize.insert(X: VectorizableTree.size());
9114 return TreeEntry::NeedToGather;
9115 }
9116 return IsGatheredNode() ? TreeEntry::NeedToGather
9117 : TreeEntry::CompressVectorize;
9118 case LoadsState::ScatterVectorize:
9119 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9120 // Delay slow vectorized nodes for better vectorization attempts.
9121 LoadEntriesToVectorize.insert(X: VectorizableTree.size());
9122 return TreeEntry::NeedToGather;
9123 }
9124 return IsGatheredNode() ? TreeEntry::NeedToGather
9125 : TreeEntry::ScatterVectorize;
9126 case LoadsState::StridedVectorize:
9127 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
9128 // Delay slow vectorized nodes for better vectorization attempts.
9129 LoadEntriesToVectorize.insert(X: VectorizableTree.size());
9130 return TreeEntry::NeedToGather;
9131 }
9132 return IsGatheredNode() ? TreeEntry::NeedToGather
9133 : TreeEntry::StridedVectorize;
9134 case LoadsState::Gather:
9135#ifndef NDEBUG
9136 Type *ScalarTy = VL0->getType();
9137 if (DL->getTypeSizeInBits(ScalarTy) !=
9138 DL->getTypeAllocSizeInBits(ScalarTy))
9139 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
9140 else if (any_of(VL, [](Value *V) {
9141 auto *LI = dyn_cast<LoadInst>(V);
9142 return !LI || !LI->isSimple();
9143 }))
9144 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
9145 else
9146 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
9147#endif // NDEBUG
9148 registerNonVectorizableLoads(VL);
9149 return TreeEntry::NeedToGather;
9150 }
9151 llvm_unreachable("Unexpected state of loads");
9152 }
9153 case Instruction::ZExt:
9154 case Instruction::SExt:
9155 case Instruction::FPToUI:
9156 case Instruction::FPToSI:
9157 case Instruction::FPExt:
9158 case Instruction::PtrToInt:
9159 case Instruction::IntToPtr:
9160 case Instruction::SIToFP:
9161 case Instruction::UIToFP:
9162 case Instruction::Trunc:
9163 case Instruction::FPTrunc:
9164 case Instruction::BitCast: {
9165 Type *SrcTy = VL0->getOperand(i: 0)->getType();
9166 for (Value *V : VL) {
9167 if (isa<PoisonValue>(Val: V))
9168 continue;
9169 Type *Ty = cast<Instruction>(Val: V)->getOperand(i: 0)->getType();
9170 if (Ty != SrcTy || !isValidElementType(Ty)) {
9171 LLVM_DEBUG(
9172 dbgs() << "SLP: Gathering casts with different src types.\n");
9173 return TreeEntry::NeedToGather;
9174 }
9175 }
9176 return TreeEntry::Vectorize;
9177 }
9178 case Instruction::ICmp:
9179 case Instruction::FCmp: {
9180 // Check that all of the compares have the same predicate.
9181 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
9182 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(pred: P0);
9183 Type *ComparedTy = VL0->getOperand(i: 0)->getType();
9184 for (Value *V : VL) {
9185 if (isa<PoisonValue>(Val: V))
9186 continue;
9187 auto *Cmp = cast<CmpInst>(Val: V);
9188 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
9189 Cmp->getOperand(i_nocapture: 0)->getType() != ComparedTy) {
9190 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
9191 return TreeEntry::NeedToGather;
9192 }
9193 }
9194 return TreeEntry::Vectorize;
9195 }
9196 case Instruction::Select:
9197 case Instruction::FNeg:
9198 case Instruction::Add:
9199 case Instruction::FAdd:
9200 case Instruction::Sub:
9201 case Instruction::FSub:
9202 case Instruction::Mul:
9203 case Instruction::FMul:
9204 case Instruction::UDiv:
9205 case Instruction::SDiv:
9206 case Instruction::FDiv:
9207 case Instruction::URem:
9208 case Instruction::SRem:
9209 case Instruction::FRem:
9210 case Instruction::Shl:
9211 case Instruction::LShr:
9212 case Instruction::AShr:
9213 case Instruction::And:
9214 case Instruction::Or:
9215 case Instruction::Xor:
9216 case Instruction::Freeze:
9217 if (S.getMainOp()->getType()->isFloatingPointTy() &&
9218 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) {
9219 auto *I = dyn_cast<Instruction>(Val: V);
9220 return I && I->isBinaryOp() && !I->isFast();
9221 }))
9222 return TreeEntry::NeedToGather;
9223 return TreeEntry::Vectorize;
9224 case Instruction::GetElementPtr: {
9225 // We don't combine GEPs with complicated (nested) indexing.
9226 for (Value *V : VL) {
9227 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
9228 if (!I)
9229 continue;
9230 if (I->getNumOperands() != 2) {
9231 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
9232 return TreeEntry::NeedToGather;
9233 }
9234 }
9235
9236 // We can't combine several GEPs into one vector if they operate on
9237 // different types.
9238 Type *Ty0 = cast<GEPOperator>(Val: VL0)->getSourceElementType();
9239 for (Value *V : VL) {
9240 auto *GEP = dyn_cast<GEPOperator>(Val: V);
9241 if (!GEP)
9242 continue;
9243 Type *CurTy = GEP->getSourceElementType();
9244 if (Ty0 != CurTy) {
9245 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
9246 return TreeEntry::NeedToGather;
9247 }
9248 }
9249
9250 // We don't combine GEPs with non-constant indexes.
9251 Type *Ty1 = VL0->getOperand(i: 1)->getType();
9252 for (Value *V : VL) {
9253 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
9254 if (!I)
9255 continue;
9256 auto *Op = I->getOperand(i_nocapture: 1);
9257 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
9258 (Op->getType() != Ty1 &&
9259 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
9260 Op->getType()->getScalarSizeInBits() >
9261 DL->getIndexSizeInBits(
9262 AS: V->getType()->getPointerAddressSpace())))) {
9263 LLVM_DEBUG(
9264 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
9265 return TreeEntry::NeedToGather;
9266 }
9267 }
9268
9269 return TreeEntry::Vectorize;
9270 }
9271 case Instruction::Store: {
9272 // Check if the stores are consecutive or if we need to swizzle them.
9273 llvm::Type *ScalarTy = cast<StoreInst>(Val: VL0)->getValueOperand()->getType();
9274 // Avoid types that are padded when being allocated as scalars, while
9275 // being packed together in a vector (such as i1).
9276 if (DL->getTypeSizeInBits(Ty: ScalarTy) !=
9277 DL->getTypeAllocSizeInBits(Ty: ScalarTy)) {
9278 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
9279 return TreeEntry::NeedToGather;
9280 }
9281 // Make sure all stores in the bundle are simple - we can't vectorize
9282 // atomic or volatile stores.
9283 for (Value *V : VL) {
9284 auto *SI = cast<StoreInst>(Val: V);
9285 if (!SI->isSimple()) {
9286 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
9287 return TreeEntry::NeedToGather;
9288 }
9289 PointerOps.push_back(Elt: SI->getPointerOperand());
9290 }
9291
9292 // Check the order of pointer operands.
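    // E.g. (illustrative): stores to p+2, p+0, p+3, p+1 sort to p+0 .. p+3,
    // so the distance between the first and last pointer is 3 == VL.size() - 1
    // and the bundle is treated as consecutive.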
9293 if (llvm::sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: CurrentOrder)) {
9294 Value *Ptr0;
9295 Value *PtrN;
9296 if (CurrentOrder.empty()) {
9297 Ptr0 = PointerOps.front();
9298 PtrN = PointerOps.back();
9299 } else {
9300 Ptr0 = PointerOps[CurrentOrder.front()];
9301 PtrN = PointerOps[CurrentOrder.back()];
9302 }
9303 std::optional<int64_t> Dist =
9304 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
9305 // Check that the sorted pointer operands are consecutive.
9306 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
9307 return TreeEntry::Vectorize;
9308 }
9309
9310 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
9311 return TreeEntry::NeedToGather;
9312 }
9313 case Instruction::Call: {
9314 if (S.getMainOp()->getType()->isFloatingPointTy() &&
9315 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) {
9316 auto *I = dyn_cast<Instruction>(Val: V);
9317 return I && !I->isFast();
9318 }))
9319 return TreeEntry::NeedToGather;
9320 // Check if the calls are all to the same vectorizable intrinsic or
9321 // library function.
9322 CallInst *CI = cast<CallInst>(Val: VL0);
9323 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9324
9325 VFShape Shape = VFShape::get(
9326 FTy: CI->getFunctionType(),
9327 EC: ElementCount::getFixed(MinVal: static_cast<unsigned int>(VL.size())),
9328 HasGlobalPred: false /*HasGlobalPred*/);
9329 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9330
9331 if (!VecFunc && !isTriviallyVectorizable(ID)) {
9332 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
9333 return TreeEntry::NeedToGather;
9334 }
9335 Function *F = CI->getCalledFunction();
9336 unsigned NumArgs = CI->arg_size();
9337 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
9338 for (unsigned J = 0; J != NumArgs; ++J)
9339 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI))
9340 ScalarArgs[J] = CI->getArgOperand(i: J);
9341 for (Value *V : VL) {
9342 CallInst *CI2 = dyn_cast<CallInst>(Val: V);
9343 if (!CI2 || CI2->getCalledFunction() != F ||
9344 getVectorIntrinsicIDForCall(CI: CI2, TLI) != ID ||
9345 (VecFunc &&
9346 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
9347 !CI->hasIdenticalOperandBundleSchema(Other: *CI2)) {
9348 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
9349 << "\n");
9350 return TreeEntry::NeedToGather;
9351 }
9352      // Some intrinsics have scalar arguments, and those arguments must be the
9353      // same across the bundle for the calls to be vectorized.
9354 for (unsigned J = 0; J != NumArgs; ++J) {
9355 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI)) {
9356 Value *A1J = CI2->getArgOperand(i: J);
9357 if (ScalarArgs[J] != A1J) {
9358 LLVM_DEBUG(dbgs()
9359 << "SLP: mismatched arguments in call:" << *CI
9360 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
9361 return TreeEntry::NeedToGather;
9362 }
9363 }
9364 }
9365 // Verify that the bundle operands are identical between the two calls.
9366 if (CI->hasOperandBundles() &&
9367 !std::equal(first1: CI->op_begin() + CI->getBundleOperandsStartIndex(),
9368 last1: CI->op_begin() + CI->getBundleOperandsEndIndex(),
9369 first2: CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
9370 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
9371 << "!=" << *V << '\n');
9372 return TreeEntry::NeedToGather;
9373 }
9374 }
9375 SmallVector<Type *> ArgTys =
9376 buildIntrinsicArgTypes(CI, ID, VF: VL.size(), MinBW: 0, TTI);
9377 auto *VecTy = getWidenedType(ScalarTy: S.getMainOp()->getType(), VF: VL.size());
9378 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9379 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
9380 return TreeEntry::NeedToGather;
9381
9382 return TreeEntry::Vectorize;
9383 }
9384 case Instruction::ShuffleVector: {
9385 if (!S.isAltShuffle()) {
9386      // REVEC can support non-alternate shuffles.
9387 if (SLPReVec && getShufflevectorNumGroups(VL))
9388 return TreeEntry::Vectorize;
9389      // If this is not an alternate sequence of opcodes, like add-sub,
9390      // then do not vectorize this instruction.
9391 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
9392 return TreeEntry::NeedToGather;
9393 }
9394 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
9395 LLVM_DEBUG(
9396 dbgs()
9397 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
9398 "the whole alt sequence is not profitable.\n");
9399 return TreeEntry::NeedToGather;
9400 }
9401
9402 return TreeEntry::Vectorize;
9403 }
9404 default:
9405 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
9406 return TreeEntry::NeedToGather;
9407 }
9408}
9409
9410namespace {
9411/// Allows correct handling of the phi node operands based on the \p Main
9412/// PHINode's order of incoming basic blocks/values.
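/// For example (illustrative IR): for \p Main
///   %p = phi i32 [%a, %bb0], [%b, %bb1]
/// and a second phi
///   %q = phi i32 [%d, %bb1], [%c, %bb0],
/// buildOperands() lines the incoming values up by \p Main's block order,
/// yielding operands {%a, %c} for %bb0 and {%b, %d} for %bb1.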
9413class PHIHandler {
9414 DominatorTree &DT;
9415 PHINode *Main = nullptr;
9416 SmallVector<Value *> Phis;
9417 SmallVector<SmallVector<Value *>> Operands;
9418
9419public:
9420 PHIHandler() = delete;
9421 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
9422 : DT(DT), Main(Main), Phis(Phis),
9423 Operands(Main->getNumIncomingValues(),
9424 SmallVector<Value *>(Phis.size(), nullptr)) {}
9425 void buildOperands() {
9426 constexpr unsigned FastLimit = 4;
9427 if (Main->getNumIncomingValues() <= FastLimit) {
9428 for (unsigned I : seq<unsigned>(Begin: 0, End: Main->getNumIncomingValues())) {
9429 BasicBlock *InBB = Main->getIncomingBlock(i: I);
9430 if (!DT.isReachableFromEntry(A: InBB)) {
9431 Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
9432 continue;
9433 }
9434 // Prepare the operand vector.
9435 for (auto [Idx, V] : enumerate(First&: Phis)) {
9436 auto *P = dyn_cast<PHINode>(Val: V);
9437 if (!P) {
9438 assert(isa<PoisonValue>(V) &&
9439 "Expected isa instruction or poison value.");
9440 Operands[I][Idx] = V;
9441 continue;
9442 }
9443 if (P->getIncomingBlock(i: I) == InBB)
9444 Operands[I][Idx] = P->getIncomingValue(i: I);
9445 else
9446 Operands[I][Idx] = P->getIncomingValueForBlock(BB: InBB);
9447 }
9448 }
9449 return;
9450 }
9451    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
9453 for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues())) {
9454 BasicBlock *InBB = Main->getIncomingBlock(i: I);
9455 if (!DT.isReachableFromEntry(A: InBB)) {
9456 Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
9457 continue;
9458 }
9459 Blocks.try_emplace(Key: InBB).first->second.push_back(Elt: I);
9460 }
9461 for (auto [Idx, V] : enumerate(First&: Phis)) {
9462 if (isa<PoisonValue>(Val: V)) {
9463 for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues()))
9464 Operands[I][Idx] = V;
9465 continue;
9466 }
9467 auto *P = cast<PHINode>(Val: V);
9468 for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
9469 BasicBlock *InBB = P->getIncomingBlock(i: I);
9470 if (InBB == Main->getIncomingBlock(i: I)) {
9471 if (isa_and_nonnull<PoisonValue>(Val: Operands[I][Idx]))
9472 continue;
9473 Operands[I][Idx] = P->getIncomingValue(i: I);
9474 continue;
9475 }
9476 auto *It = Blocks.find(Key: InBB);
9477 if (It == Blocks.end())
9478 continue;
9479 Operands[It->second.front()][Idx] = P->getIncomingValue(i: I);
9480 }
9481 }
9482 for (const auto &P : Blocks) {
9483 ArrayRef<unsigned> IncomingValues = P.second;
9484 if (IncomingValues.size() <= 1)
9485 continue;
9486 unsigned BasicI = IncomingValues.front();
9487 for (unsigned I : IncomingValues.drop_front()) {
9488 assert(all_of(enumerate(Operands[I]),
9489 [&](const auto &Data) {
9490 return !Data.value() ||
9491 Data.value() == Operands[BasicI][Data.index()];
9492 }) &&
9493 "Expected empty operands list.");
9494 Operands[I] = Operands[BasicI];
9495 }
9496 }
9497 }
9498 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
9499};
9500} // namespace
9501
9502/// Returns the main/alternate instructions for the given \p VL. Unlike
9503/// getSameOpcode, it supports non-compatible instructions for better
9504/// SplitVectorize node support.
9505/// \returns the first main/alt instructions if \p VL contains only poisons and
9506/// instructions with exactly 2 opcodes. Returns a pair of nullptrs otherwise.
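/// For example (illustrative): {add, poison, sub, add} from a single block
/// yields {add, sub}, while {add, sub, mul} or instructions from different
/// blocks yield {nullptr, nullptr}.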
9507static std::pair<Instruction *, Instruction *>
9508getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
9509 Instruction *MainOp = nullptr;
9510 Instruction *AltOp = nullptr;
9511 for (Value *V : VL) {
9512 if (isa<PoisonValue>(Val: V))
9513 continue;
9514 auto *I = dyn_cast<Instruction>(Val: V);
9515 if (!I)
9516 return {};
9517 if (!MainOp) {
9518 MainOp = I;
9519 continue;
9520 }
9521 if (MainOp->getOpcode() == I->getOpcode()) {
9522 if (I->getParent() != MainOp->getParent())
9523 return {};
9524 continue;
9525 }
9526 if (!AltOp) {
9527 AltOp = I;
9528 continue;
9529 }
9530 if (AltOp->getOpcode() == I->getOpcode()) {
9531 if (I->getParent() != AltOp->getParent())
9532 return {};
9533 continue;
9534 }
9535 return {};
9536 }
9537 if (!AltOp)
9538 return {};
9539 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
9540 "Expected different main and alt instructions.");
9541 return std::make_pair(x&: MainOp, y&: AltOp);
9542}
9543
9544/// Checks that every instruction appears once in the list and if not, packs
9545/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
9546/// unique scalars is extended by poison values to the whole register size.
9547///
9548/// \returns false if \p VL could not be uniquified, in which case \p VL is
9549/// unchanged and \p ReuseShuffleIndices is empty.
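/// For example (illustrative): VL = {a, b, a, b} is packed into {a, b} with
/// \p ReuseShuffleIndices = {0, 1, 0, 1}; with \p TryPad, 8 scalars with only
/// 3 unique instructions may instead become {a, b, c, poison} on a target
/// where 4 elements form a full register.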
9550static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
9551 SmallVectorImpl<int> &ReuseShuffleIndices,
9552 const TargetTransformInfo &TTI,
9553 const TargetLibraryInfo &TLI,
9554 const InstructionsState &S,
9555 const BoUpSLP::EdgeInfo &UserTreeIdx,
9556 bool TryPad = false) {
9557 // Check that every instruction appears once in this bundle.
9558 SmallVector<Value *> UniqueValues;
9559 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
9560 for (Value *V : VL) {
9561 if (isConstant(V)) {
9562 // Constants are always considered distinct, even if the same constant
9563 // appears multiple times in VL.
9564 ReuseShuffleIndices.emplace_back(
9565 Args: isa<PoisonValue>(Val: V) ? PoisonMaskElem : UniqueValues.size());
9566 UniqueValues.emplace_back(Args&: V);
9567 continue;
9568 }
9569 auto Res = UniquePositions.try_emplace(Key: V, Args: UniqueValues.size());
9570 ReuseShuffleIndices.emplace_back(Args&: Res.first->second);
9571 if (Res.second)
9572 UniqueValues.emplace_back(Args&: V);
9573 }
9574
9575 // Easy case: VL has unique values and a "natural" size
9576 size_t NumUniqueScalarValues = UniqueValues.size();
9577 bool IsFullVectors = hasFullVectorsOrPowerOf2(
9578 TTI, Ty: getValueType(V: UniqueValues.front()), Sz: NumUniqueScalarValues);
9579 if (NumUniqueScalarValues == VL.size() &&
9580 (VectorizeNonPowerOf2 || IsFullVectors)) {
9581 ReuseShuffleIndices.clear();
9582 return true;
9583 }
9584
9585  // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
9586 if ((UserTreeIdx.UserTE &&
9587 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
9588 !hasFullVectorsOrPowerOf2(TTI, Ty: getValueType(V: VL.front()), Sz: VL.size())) {
9589 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
9590 "for nodes with padding.\n");
9591 ReuseShuffleIndices.clear();
9592 return false;
9593 }
9594
9595 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
9596 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
9597 (UniquePositions.size() == 1 && all_of(Range&: UniqueValues, P: [](Value *V) {
9598 return isa<UndefValue>(Val: V) || !isConstant(V);
9599 }))) {
9600 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
9601 S.getMainOp()->isSafeToRemove() &&
9602 all_of(Range&: UniqueValues, P: IsaPred<Instruction, PoisonValue>)) {
9603 // Find the number of elements, which forms full vectors.
9604 unsigned PWSz = getFullVectorNumberOfElements(
9605 TTI, Ty: UniqueValues.front()->getType(), Sz: UniqueValues.size());
9606 PWSz = std::min<unsigned>(a: PWSz, b: VL.size());
9607 if (PWSz == VL.size()) {
9608 // We ended up with the same size after removing duplicates and
9609 // upgrading the resulting vector size to a "nice size". Just keep
9610 // the initial VL then.
9611 ReuseShuffleIndices.clear();
9612 } else {
9613 // Pad unique values with poison to grow the vector to a "nice" size
9614 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
9615 UniqueValues.end());
9616 PaddedUniqueValues.append(
9617 NumInputs: PWSz - UniqueValues.size(),
9618 Elt: PoisonValue::get(T: UniqueValues.front()->getType()));
9619        // Check that the operations extended with poisons are still valid for
9620        // vectorization (div/rem are not allowed).
9621 if (!getSameOpcode(VL: PaddedUniqueValues, TLI).valid()) {
9622 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
9623 ReuseShuffleIndices.clear();
9624 return false;
9625 }
9626 VL = std::move(PaddedUniqueValues);
9627 }
9628 return true;
9629 }
9630 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
9631 ReuseShuffleIndices.clear();
9632 return false;
9633 }
9634 VL = std::move(UniqueValues);
9635 return true;
9636}
9637
9638bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
9639 const InstructionsState &LocalState,
9640 SmallVectorImpl<Value *> &Op1,
9641 SmallVectorImpl<Value *> &Op2,
9642 OrdersType &ReorderIndices) const {
9643 constexpr unsigned SmallNodeSize = 4;
9644 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
9645 !SplitAlternateInstructions)
9646 return false;
9647
9648 // Check if this is a duplicate of another split entry.
9649 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
9650 << ".\n");
9651 for (TreeEntry *E : getSplitTreeEntries(V: LocalState.getMainOp())) {
9652 if (E->isSame(VL)) {
9653 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
9654 << *LocalState.getMainOp() << ".\n");
9655 return false;
9656 }
9657 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
9658 if (all_of(Range&: VL, P: [&](Value *V) {
9659 return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V);
9660 })) {
9661 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
9662 return false;
9663 }
9664 }
9665
9666 ReorderIndices.assign(NumElts: VL.size(), Elt: VL.size());
9667 SmallBitVector Op1Indices(VL.size());
9668 for (auto [Idx, V] : enumerate(First&: VL)) {
9669 auto *I = dyn_cast<Instruction>(Val: V);
9670 if (!I) {
9671 Op1.push_back(Elt: V);
9672 Op1Indices.set(Idx);
9673 continue;
9674 }
9675 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
9676 isMainInstruction(I, MainOp: LocalState.getMainOp(), AltOp: LocalState.getAltOp(),
9677 TLI: *TLI)) ||
9678 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
9679 !isAlternateInstruction(I, MainOp: LocalState.getMainOp(),
9680 AltOp: LocalState.getAltOp(), TLI: *TLI))) {
9681 Op1.push_back(Elt: V);
9682 Op1Indices.set(Idx);
9683 continue;
9684 }
9685 Op2.push_back(Elt: V);
9686 }
9687 Type *ScalarTy = getValueType(V: VL.front());
9688 VectorType *VecTy = getWidenedType(ScalarTy, VF: VL.size());
9689 unsigned Opcode0 = LocalState.getOpcode();
9690 unsigned Opcode1 = LocalState.getAltOpcode();
9691 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9692 // Enable the split node only if the scalars do not form a legal alternate
9693 // instruction (like X86 addsub).
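// E.g., a bundle that alternates fadd/fsub across lanes can often be lowered
// as a single addsub-style instruction on X86, so it is better kept as one
// alternate node than split into separate fadd and fsub nodes.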
9694 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
9695 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
9696 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
9697 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
9698 !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op1.front()->getType(), Sz: Op1.size()) ||
9699 !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op2.front()->getType(), Sz: Op2.size()))
9700 return false;
9701 // Build the reorder indices: all Op1 lanes come first, followed by all Op2 lanes.
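// Illustrative example (hypothetical 8-lane bundle alternating add/sub):
// Op1 holds the adds from lanes {0, 2, 4, 6} and Op2 the subs from lanes
// {1, 3, 5, 7}, so ReorderIndices becomes {0, 2, 4, 6, 1, 3, 5, 7}.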
9702 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
9703 for (unsigned Idx : seq<unsigned>(Size: VL.size())) {
9704 if (Op1Indices.test(Idx)) {
9705 ReorderIndices[Op1Cnt] = Idx;
9706 ++Op1Cnt;
9707 } else {
9708 ReorderIndices[Op2Cnt] = Idx;
9709 ++Op2Cnt;
9710 }
9711 }
9712 if (isIdentityOrder(Order: ReorderIndices))
9713 ReorderIndices.clear();
9714 SmallVector<int> Mask;
9715 if (!ReorderIndices.empty())
9716 inversePermutation(Indices: ReorderIndices, Mask);
9717 unsigned NumParts = TTI->getNumberOfParts(Tp: VecTy);
9718 VectorType *Op1VecTy = getWidenedType(ScalarTy, VF: Op1.size());
9719 VectorType *Op2VecTy = getWidenedType(ScalarTy, VF: Op2.size());
9720 // Check for non-profitable single-register ops, which are better represented
9721 // as alternate ops.
9722 if (NumParts >= VL.size())
9723 return false;
9724 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
9725 InstructionCost InsertCost = ::getShuffleCost(
9726 TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind: Kind, Index: Op1.size(), SubTp: Op2VecTy);
9727 FixedVectorType *SubVecTy =
9728 getWidenedType(ScalarTy, VF: std::max(a: Op1.size(), b: Op2.size()));
9729 InstructionCost NewShuffleCost =
9730 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: SubVecTy, Mask, CostKind: Kind);
9731 if (!LocalState.isCmpOp() && NumParts <= 1 &&
9732 (Mask.empty() || InsertCost >= NewShuffleCost))
9733 return false;
9734 if ((LocalState.getMainOp()->isBinaryOp() &&
9735 LocalState.getAltOp()->isBinaryOp() &&
9736 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
9737 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
9738 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
9739 (LocalState.getMainOp()->isUnaryOp() &&
9740 LocalState.getAltOp()->isUnaryOp())) {
9741 InstructionCost OriginalVecOpsCost =
9742 TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: VecTy, CostKind: Kind) +
9743 TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: VecTy, CostKind: Kind);
9744 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
9745 for (unsigned Idx : seq<unsigned>(Size: VL.size())) {
9746 if (isa<PoisonValue>(Val: VL[Idx]))
9747 continue;
9748 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
9749 }
9750 InstructionCost OriginalCost =
9751 OriginalVecOpsCost + ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc,
9752 Tp: VecTy, Mask: OriginalMask, CostKind: Kind);
9753 InstructionCost NewVecOpsCost =
9754 TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: Op1VecTy, CostKind: Kind) +
9755 TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: Op2VecTy, CostKind: Kind);
9756 InstructionCost NewCost =
9757 NewVecOpsCost + InsertCost +
9758 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
9759 VectorizableTree.front()->getOpcode() == Instruction::Store
9760 ? NewShuffleCost
9761 : 0);
9762 // If splitting is not profitable, exit.
9763 if (NewCost >= OriginalCost)
9764 return false;
9765 }
9766 return true;
9767}
9768
9769namespace {
9770/// Class accepts incoming list of values and generates the list of values
9771/// for scheduling and list of operands for the new nodes.
9772class InstructionsCompatibilityAnalysis {
9773 DominatorTree &DT;
9774 const DataLayout &DL;
9775 const TargetTransformInfo &TTI;
9776 const TargetLibraryInfo &TLI;
9777
9778 /// Builds operands for the original instructions.
9779 void
9780 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
9781 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
9782
9783 unsigned ShuffleOrOp =
9784 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9785 Instruction *VL0 = S.getMainOp();
9786
9787 switch (ShuffleOrOp) {
9788 case Instruction::PHI: {
9789 auto *PH = cast<PHINode>(Val: VL0);
9790
9791 // Keeps the reordered operands to avoid code duplication.
9792 PHIHandler Handler(DT, PH, VL);
9793 Handler.buildOperands();
9794 Operands.assign(NumElts: PH->getNumOperands(), Elt: {});
9795 for (unsigned I : seq<unsigned>(Size: PH->getNumOperands()))
9796 Operands[I].assign(in_start: Handler.getOperands(I).begin(),
9797 in_end: Handler.getOperands(I).end());
9798 return;
9799 }
9800 case Instruction::ExtractValue:
9801 case Instruction::ExtractElement:
9802 // This is a special case, as it does not gather, but at the same time
9803 // we are not extending buildTreeRec() towards the operands.
9804 Operands.assign(NumElts: 1, Elt: {VL.size(), VL0->getOperand(i: 0)});
9805 return;
9806 case Instruction::InsertElement:
9807 Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr});
9808 for (auto [Idx, V] : enumerate(First&: VL)) {
9809 auto *IE = cast<InsertElementInst>(Val: V);
9810 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
9811 Ops[Idx] = IE->getOperand(i_nocapture: OpIdx);
9812 }
9813 return;
9814 case Instruction::Load:
9815 Operands.assign(
9816 NumElts: 1, Elt: {VL.size(),
9817 PoisonValue::get(T: cast<LoadInst>(Val: VL0)->getPointerOperandType())});
9818 for (auto [V, Op] : zip(t&: VL, u&: Operands.back())) {
9819 auto *LI = dyn_cast<LoadInst>(Val: V);
9820 if (!LI)
9821 continue;
9822 Op = LI->getPointerOperand();
9823 }
9824 return;
9825 case Instruction::ZExt:
9826 case Instruction::SExt:
9827 case Instruction::FPToUI:
9828 case Instruction::FPToSI:
9829 case Instruction::FPExt:
9830 case Instruction::PtrToInt:
9831 case Instruction::IntToPtr:
9832 case Instruction::SIToFP:
9833 case Instruction::UIToFP:
9834 case Instruction::Trunc:
9835 case Instruction::FPTrunc:
9836 case Instruction::BitCast:
9837 case Instruction::ICmp:
9838 case Instruction::FCmp:
9839 case Instruction::Select:
9840 case Instruction::FNeg:
9841 case Instruction::Add:
9842 case Instruction::FAdd:
9843 case Instruction::Sub:
9844 case Instruction::FSub:
9845 case Instruction::Mul:
9846 case Instruction::FMul:
9847 case Instruction::UDiv:
9848 case Instruction::SDiv:
9849 case Instruction::FDiv:
9850 case Instruction::URem:
9851 case Instruction::SRem:
9852 case Instruction::FRem:
9853 case Instruction::Shl:
9854 case Instruction::LShr:
9855 case Instruction::AShr:
9856 case Instruction::And:
9857 case Instruction::Or:
9858 case Instruction::Xor:
9859 case Instruction::Freeze:
9860 case Instruction::Store:
9861 case Instruction::ShuffleVector:
9862 Operands.assign(NumElts: VL0->getNumOperands(), Elt: {VL.size(), nullptr});
9863 for (auto [Idx, V] : enumerate(First&: VL)) {
9864 auto *I = dyn_cast<Instruction>(Val: V);
9865 if (!I) {
9866 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
9867 Ops[Idx] = PoisonValue::get(T: VL0->getOperand(i: OpIdx)->getType());
9868 continue;
9869 }
9870 auto [Op, ConvertedOps] = convertTo(I, S);
9871 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
9872 Ops[Idx] = ConvertedOps[OpIdx];
9873 }
9874 return;
9875 case Instruction::GetElementPtr: {
9876 Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr});
9877 // Need to cast all indices to the same type before vectorization to
9878 // avoid a crash.
9879 // Required to be able to find correct matches between different gather
9880 // nodes and reuse the vectorized values rather than trying to gather them
9881 // again.
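// E.g., if one GEP in the bundle indexes with i32 and another with i64, the
// DataLayout index type is chosen and the constant indices are folded into
// it, so all lanes end up with index operands of the same type.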
9882 const unsigned IndexIdx = 1;
9883 Type *VL0Ty = VL0->getOperand(i: IndexIdx)->getType();
9884 Type *Ty =
9885 all_of(Range&: VL,
9886 P: [&](Value *V) {
9887 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
9888 return !GEP || VL0Ty == GEP->getOperand(i_nocapture: IndexIdx)->getType();
9889 })
9890 ? VL0Ty
9891 : DL.getIndexType(PtrTy: cast<GetElementPtrInst>(Val: VL0)
9892 ->getPointerOperandType()
9893 ->getScalarType());
9894 for (auto [Idx, V] : enumerate(First&: VL)) {
9895 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
9896 if (!GEP) {
9897 Operands[0][Idx] = V;
9898 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
9899 continue;
9900 }
9901 Operands[0][Idx] = GEP->getPointerOperand();
9902 auto *Op = GEP->getOperand(i_nocapture: IndexIdx);
9903 auto *CI = dyn_cast<ConstantInt>(Val: Op);
9904 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
9905 C: CI, DestTy: Ty, IsSigned: CI->getValue().isSignBitSet(), DL)
9906 : Op;
9907 }
9908 return;
9909 }
9910 case Instruction::Call: {
9911 auto *CI = cast<CallInst>(Val: VL0);
9912 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI: &TLI);
9913 for (unsigned Idx : seq<unsigned>(Size: CI->arg_size())) {
9914 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI: &TTI))
9915 continue;
9916 auto &Ops = Operands.emplace_back();
9917 for (Value *V : VL) {
9918 auto *I = dyn_cast<Instruction>(Val: V);
9919 Ops.push_back(Elt: I ? I->getOperand(i: Idx)
9920 : PoisonValue::get(T: VL0->getOperand(i: Idx)->getType()));
9921 }
9922 }
9923 return;
9924 }
9925 default:
9926 break;
9927 }
9928 llvm_unreachable("Unexpected vectorization of the instructions.");
9929 }
9930
9931public:
9932 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
9933 const TargetTransformInfo &TTI,
9934 const TargetLibraryInfo &TLI)
9935 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
9936
9937 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
9938 ArrayRef<Value *> VL) {
9939 assert(S && "Invalid state!");
9940 SmallVector<BoUpSLP::ValueList> Operands;
9941 buildOriginalOperands(S, VL, Operands);
9942 return Operands;
9943 }
9944};
9945} // namespace
9946
9947BoUpSLP::ScalarsVectorizationLegality
9948BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
9949 const EdgeInfo &UserTreeIdx) const {
9950 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
9951
9952 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
9953
9954 // Don't go into catchswitch blocks, which can happen with PHIs.
9955 // Such blocks can only have PHIs and the catchswitch. There is no
9956 // place to insert a shuffle if we need to, so just avoid that issue.
9957 if (S && isa<CatchSwitchInst>(Val: S.getMainOp()->getParent()->getTerminator())) {
9958 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
9959 // Do not try to pack to avoid extra instructions here.
9960 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
9961 /*TryToFindDuplicates=*/false);
9962 }
9963
9964 // Check if this is a duplicate of another entry.
9965 if (S) {
9966 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
9967 for (TreeEntry *E : getTreeEntries(V: S.getMainOp())) {
9968 if (E->isSame(VL)) {
9969 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
9970 << ".\n");
9971 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
9972 }
9973 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
9974 if (all_of(Range&: VL, P: [&](Value *V) {
9975 return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V);
9976 })) {
9977 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
9978 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
9979 }
9980 }
9981 }
9982
9983 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
9984 // a load), in which case peek through to include it in the tree, without
9985 // ballooning over-budget.
9986 if (Depth >= RecursionMaxDepth &&
9987 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
9988 (match(V: S.getMainOp(), P: m_Load(Op: m_Value())) ||
9989 all_of(Range&: VL, P: [&S](const Value *I) {
9990 return match(V: I,
9991 P: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_OneUse(SubPattern: m_Load(Op: m_Value()))))) &&
9992 cast<Instruction>(Val: I)->getOpcode() == S.getOpcode();
9993 })))) {
9994 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
9995 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
9996 }
9997
9998 // Don't handle scalable vectors.
9999 if (S && S.getOpcode() == Instruction::ExtractElement &&
10000 isa<ScalableVectorType>(
10001 Val: cast<ExtractElementInst>(Val: S.getMainOp())->getVectorOperandType())) {
10002 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
10003 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
10004 }
10005
10006 // Don't handle vectors.
10007 if (!SLPReVec && getValueType(V: VL.front())->isVectorTy()) {
10008 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
10009 // Do not try to pack to avoid extra instructions here.
10010 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
10011 /*TryToFindDuplicates=*/false);
10012 }
10013
10014 // If all of the operands are identical or constant, we have a simple solution.
10015 // If we deal with insert/extract instructions, they all must have constant
10016 // indices, otherwise we should gather them, not try to vectorize.
10017 // If this is an alternate op node with 2 elements and gathered operands, do
10018 // not vectorize.
10019 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
10020 if (!S || !S.isAltShuffle() || VL.size() > 2)
10021 return false;
10022 if (VectorizableTree.size() < MinTreeSize)
10023 return false;
10024 if (Depth >= RecursionMaxDepth - 1)
10025 return true;
10026 // Check if all operands are extracts, part of vector node or can build a
10027 // regular vectorize node.
10028 SmallVector<unsigned, 8> InstsCount;
10029 for (Value *V : VL) {
10030 auto *I = cast<Instruction>(Val: V);
10031 InstsCount.push_back(Elt: count_if(Range: I->operand_values(), P: [](Value *Op) {
10032 return isa<Instruction>(Val: Op) || isVectorLikeInstWithConstOps(V: Op);
10033 }));
10034 }
10035 bool IsCommutative =
10036 isCommutative(I: S.getMainOp()) || isCommutative(I: S.getAltOp());
10037 if ((IsCommutative &&
10038 std::accumulate(first: InstsCount.begin(), last: InstsCount.end(), init: 0) < 2) ||
10039 (!IsCommutative &&
10040 all_of(Range&: InstsCount, P: [](unsigned ICnt) { return ICnt < 2; })))
10041 return true;
10042 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
10043 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
10044 auto *I1 = cast<Instruction>(Val: VL.front());
10045 auto *I2 = cast<Instruction>(Val: VL.back());
10046 for (int Op : seq<int>(Size: S.getMainOp()->getNumOperands()))
10047 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
10048 Args: I2->getOperand(i: Op));
10049 if (static_cast<unsigned>(count_if(
10050 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
10051 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
10052 })) >= S.getMainOp()->getNumOperands() / 2)
10053 return false;
10054 if (S.getMainOp()->getNumOperands() > 2)
10055 return true;
10056 if (IsCommutative) {
10057 // Check permuted operands.
10058 Candidates.clear();
10059 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
10060 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
10061 Args: I2->getOperand(i: (Op + 1) % E));
10062 if (any_of(
10063 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
10064 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
10065 }))
10066 return false;
10067 }
10068 return true;
10069 };
10070 SmallVector<unsigned> SortedIndices;
10071 BasicBlock *BB = nullptr;
10072 bool IsScatterVectorizeUserTE =
10073 UserTreeIdx.UserTE &&
10074 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
10075 bool AreAllSameBlock = S && allSameBlock(VL);
10076 bool AreScatterAllGEPSameBlock =
10077 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
10078 VL.size() > 2 &&
10079 all_of(Range&: VL,
10080 P: [&BB](Value *V) {
10081 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
10082 if (!I)
10083 return doesNotNeedToBeScheduled(V);
10084 if (!BB)
10085 BB = I->getParent();
10086 return BB == I->getParent() && I->getNumOperands() == 2;
10087 }) &&
10088 BB &&
10089 sortPtrAccesses(VL, ElemTy: UserTreeIdx.UserTE->getMainOp()->getType(), DL: *DL, SE&: *SE,
10090 SortedIndices));
10091 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
10092 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
10093 (S &&
10094 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
10095 Val: S.getMainOp()) &&
10096 !all_of(Range&: VL, P: isVectorLikeInstWithConstOps)) ||
10097 NotProfitableForVectorization(VL)) {
10098 if (!S) {
10099 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
10100 "C,S,B,O, small shuffle. \n");
10101 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
10102 /*TryToFindDuplicates=*/true,
10103 /*TrySplitVectorize=*/true);
10104 }
10105 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
10106 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
10107 }
10108
10109 // Don't vectorize ephemeral values.
10110 if (S && !EphValues.empty()) {
10111 for (Value *V : VL) {
10112 if (EphValues.count(Ptr: V)) {
10113 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
10114 << ") is ephemeral.\n");
10115 // Do not try to pack to avoid extra instructions here.
10116 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
10117 /*TryToFindDuplicates=*/false);
10118 }
10119 }
10120 }
10121
10122 // We now know that this is a vector of instructions of the same type from
10123 // the same block.
10124
10125 // Check that none of the instructions in the bundle are already in the tree,
10126 // and that vectorizing the bundle as a small alternate node is still
10127 // profitable.
10128 if (S && S.isAltShuffle()) {
10129 auto GetNumVectorizedExtracted = [&]() {
10130 APInt Extracted = APInt::getZero(numBits: VL.size());
10131 APInt Vectorized = APInt::getAllOnes(numBits: VL.size());
10132 for (auto [Idx, V] : enumerate(First&: VL)) {
10133 auto *I = dyn_cast<Instruction>(Val: V);
10134 if (!I || doesNotNeedToBeScheduled(V: I) ||
10135 all_of(Range: I->operands(), P: [&](const Use &U) {
10136 return isa<ExtractElementInst>(Val: U.get());
10137 }))
10138 continue;
10139 if (isVectorized(V: I))
10140 Vectorized.clearBit(BitPosition: Idx);
10141 else if (!I->hasOneUser() && !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList))
10142 Extracted.setBit(Idx);
10143 }
10144 return std::make_pair(x&: Vectorized, y&: Extracted);
10145 };
10146 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
10147 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10148 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
10149 if (!Vectorized.isAllOnes() && !PreferScalarize) {
10150 // Rough cost estimation to check whether the vector code (+ potential
10151 // extracts) is more profitable than the scalar code + buildvector.
10152 Type *ScalarTy = VL.front()->getType();
10153 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
10154 InstructionCost VectorizeCostEstimate =
10155 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: {}, CostKind: Kind) +
10156 ::getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Extracted,
10157 /*Insert=*/false, /*Extract=*/true, CostKind: Kind);
10158 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
10159 TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Vectorized,
10160 /*Insert=*/true, /*Extract=*/false, CostKind: Kind, /*ForPoisonSrc=*/false);
10161 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
10162 }
10163 if (PreferScalarize) {
10164 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
10165 "node is not profitable.\n");
10166 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
10167 }
10168 }
10169
10170 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
10171 if (UserIgnoreList && !UserIgnoreList->empty()) {
10172 for (Value *V : VL) {
10173 if (UserIgnoreList->contains(V)) {
10174 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
10175 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
10176 }
10177 }
10178 }
10179
10180 // Special processing for sorted pointers for a ScatterVectorize node with
10181 // constant indices only.
10182 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
10183 assert(VL.front()->getType()->isPointerTy() &&
10184 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
10185 "Expected pointers only.");
10186 // Reset S to make it GetElementPtr kind of node.
10187 const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
10188 assert(It != VL.end() && "Expected at least one GEP.");
10189 S = getSameOpcode(VL: *It, TLI: *TLI);
10190 }
10191
10192 // Check that all of the users of the scalars that we want to vectorize are
10193 // schedulable.
10194 Instruction *VL0 = S.getMainOp();
10195 BB = VL0->getParent();
10196
10197 if (S &&
10198 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator()) ||
10199 !DT->isReachableFromEntry(A: BB))) {
10200 // Don't go into unreachable blocks. They may contain instructions with
10201 // dependency cycles which confuse the final scheduling.
10202 // Do not vectorize EH and non-returning blocks, not profitable in most
10203 // cases.
10204 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
10205 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
10206 }
10207 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
10208}
10209
10210void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
10211 const EdgeInfo &UserTreeIdx,
10212 unsigned InterleaveFactor) {
10213 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
10214
10215 SmallVector<int> ReuseShuffleIndices;
10216 SmallVector<Value *> VL(VLRef);
10217
10218 // Tries to build split node.
10219 auto TrySplitNode = [&](const InstructionsState &LocalState) {
10220 SmallVector<Value *> Op1, Op2;
10221 OrdersType ReorderIndices;
10222 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
10223 return false;
10224
10225 SmallVector<Value *> NewVL(VL.size());
10226 copy(Range&: Op1, Out: NewVL.begin());
10227 copy(Range&: Op2, Out: std::next(x: NewVL.begin(), n: Op1.size()));
10228 auto Invalid = ScheduleBundle::invalid();
10229 auto *TE = newTreeEntry(VL, EntryState: TreeEntry::SplitVectorize, Bundle&: Invalid, S: LocalState,
10230 UserTreeIdx, ReuseShuffleIndices: {}, ReorderIndices);
10231 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
10232 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
10233 InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI);
10234 if (S && (isa<LoadInst>(Val: S.getMainOp()) ||
10235 getSameValuesTreeEntry(V: S.getMainOp(), VL: Op, /*SameVF=*/true))) {
10236 // Build gather node for loads, they will be gathered later.
10237 TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(),
10238 Args: Idx == 0 ? 0 : Op1.size());
10239 (void)newTreeEntry(VL: Op, EntryState: TreeEntry::NeedToGather, Bundle&: Invalid, S, UserTreeIdx: {TE, Idx});
10240 } else {
10241 TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(),
10242 Args: Idx == 0 ? 0 : Op1.size());
10243 buildTreeRec(VLRef: Op, Depth, UserTreeIdx: {TE, Idx});
10244 }
10245 };
10246 AddNode(Op1, 0);
10247 AddNode(Op2, 1);
10248 return true;
10249 };
10250
10251 ScalarsVectorizationLegality Legality =
10252 getScalarsVectorizationLegality(VL, Depth, UserTreeIdx);
10253 const InstructionsState &S = Legality.getInstructionsState();
10254 if (!Legality.isLegal()) {
10255 if (Legality.trySplitVectorize()) {
10256 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
10257 // Last chance to try to vectorize alternate node.
10258 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
10259 return;
10260 }
10261 if (Legality.tryToFindDuplicates())
10262 tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S, UserTreeIdx);
10263
10264 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
10265 return;
10266 }
10267
10268 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
10269 if (S.isAltShuffle() && TrySplitNode(S))
10270 return;
10271
10272 // Check that every instruction appears once in this bundle.
10273 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S, UserTreeIdx,
10274 /*TryPad=*/true)) {
10275 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
10276 return;
10277 }
10278
10279 // Perform specific checks for each particular instruction kind.
10280 bool IsScatterVectorizeUserTE =
10281 UserTreeIdx.UserTE &&
10282 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
10283 OrdersType CurrentOrder;
10284 SmallVector<Value *> PointerOps;
10285 TreeEntry::EntryState State = getScalarsVectorizationState(
10286 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
10287 if (State == TreeEntry::NeedToGather) {
10288 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
10289 return;
10290 }
10291
10292 Instruction *VL0 = S.getMainOp();
10293 BasicBlock *BB = VL0->getParent();
10294 auto &BSRef = BlocksSchedules[BB];
10295 if (!BSRef)
10296 BSRef = std::make_unique<BlockScheduling>(args&: BB);
10297
10298 BlockScheduling &BS = *BSRef;
10299
10300 SetVector<Value *> UniqueValues(llvm::from_range, VL);
10301 std::optional<ScheduleBundle *> BundlePtr =
10302 BS.tryScheduleBundle(VL: UniqueValues.getArrayRef(), SLP: this, S);
10303#ifdef EXPENSIVE_CHECKS
10304 // Make sure we didn't break any internal invariants
10305 BS.verify();
10306#endif
10307 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
10308 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
10309 // Last chance to try to vectorize alternate node.
10310 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
10311 return;
10312 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
10313 NonScheduledFirst.insert(Ptr: VL.front());
10314 if (S.getOpcode() == Instruction::Load &&
10315 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
10316 registerNonVectorizableLoads(VL: ArrayRef(VL));
10317 return;
10318 }
10319 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
10320 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
10321 ScheduleBundle Empty;
10322 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
10323 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
10324
10325 unsigned ShuffleOrOp =
10326 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10327 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
10328 // Postpone building tree nodes for PHI operands.
10329 SmallVector<unsigned> PHIOps;
10330 for (unsigned I : seq<unsigned>(Operands.size())) {
10331 ArrayRef<Value *> Op = Operands[I];
10332 if (Op.empty())
10333 continue;
10334 InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI);
10335 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
10336 buildTreeRec(VLRef: Op, Depth: Depth + 1, UserTreeIdx: {TE, I});
10337 else
10338 PHIOps.push_back(Elt: I);
10339 }
10340 for (unsigned I : PHIOps)
10341 buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
10342 };
10343 switch (ShuffleOrOp) {
10344 case Instruction::PHI: {
10345 TreeEntry *TE =
10346 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
10347 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
10348 TE->dump());
10349
10350 TE->setOperands(Operands);
10351 CreateOperandNodes(TE, Operands);
10352 return;
10353 }
10354 case Instruction::ExtractValue:
10355 case Instruction::ExtractElement: {
10356 if (CurrentOrder.empty()) {
10357 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
10358 } else {
10359 LLVM_DEBUG({
10360 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
10361 "with order";
10362 for (unsigned Idx : CurrentOrder)
10363 dbgs() << " " << Idx;
10364 dbgs() << "\n";
10365 });
10366 fixupOrderingIndices(Order: CurrentOrder);
10367 }
10368 // Create the tree entry for the extract sequence, recording the computed
10369 // extract order, if any.
10370 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10371 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
10372 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
10373 "(ExtractValueInst/ExtractElementInst).\n";
10374 TE->dump());
10375 // This is a special case, as it does not gather, but at the same time
10376 // we are not extending buildTreeRec() towards the operands.
10377 TE->setOperands(Operands);
10378 return;
10379 }
10380 case Instruction::InsertElement: {
10381 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
10382
10383 auto OrdCompare = [](const std::pair<int, int> &P1,
10384 const std::pair<int, int> &P2) {
10385 return P1.first > P2.first;
10386 };
10387 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
10388 decltype(OrdCompare)>
10389 Indices(OrdCompare);
10390 for (int I = 0, E = VL.size(); I < E; ++I) {
10391 unsigned Idx = *getElementIndex(Inst: VL[I]);
10392 Indices.emplace(args&: Idx, args&: I);
10393 }
10394 OrdersType CurrentOrder(VL.size(), VL.size());
10395 bool IsIdentity = true;
10396 for (int I = 0, E = VL.size(); I < E; ++I) {
10397 CurrentOrder[Indices.top().second] = I;
10398 IsIdentity &= Indices.top().second == I;
10399 Indices.pop();
10400 }
10401 if (IsIdentity)
10402 CurrentOrder.clear();
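    // Illustrative example (hypothetical bundle): if the inserts in VL write
    // vector elements {2, 0, 3, 1} (in VL order), CurrentOrder becomes
    // {2, 0, 3, 1}; if they write elements in increasing order, the order is
    // the identity and is dropped.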
10403 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10404 ReuseShuffleIndices: {}, ReorderIndices: CurrentOrder);
10405 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
10406 TE->dump());
10407
10408 TE->setOperands(Operands);
10409 buildTreeRec(VLRef: TE->getOperand(OpIdx: 1), Depth: Depth + 1, UserTreeIdx: {TE, 1});
10410 return;
10411 }
10412 case Instruction::Load: {
10413 // Check that a vectorized load would load the same memory as a scalar
10414 // load. For example, we don't want to vectorize loads that are smaller
10415 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
10416 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10417 // from such a struct, we read/write packed bits disagreeing with the
10418 // unvectorized version.
10419 TreeEntry *TE = nullptr;
10420 fixupOrderingIndices(Order: CurrentOrder);
10421 switch (State) {
10422 case TreeEntry::Vectorize:
10423 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10424 ReuseShuffleIndices, ReorderIndices: CurrentOrder, InterleaveFactor);
10425 if (CurrentOrder.empty())
10426 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
10427 TE->dump());
10428 else
10429 LLVM_DEBUG(dbgs()
10430 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
10431 TE->dump());
10432 break;
10433 case TreeEntry::CompressVectorize:
10434 // Vectorizing non-consecutive loads with a (possibly masked) load + compress.
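// E.g., loads of a[0], a[2], a[3] and a[5] may be emitted as one wide
// (possibly masked) load of a[0..5] followed by a shuffle/compress that
// selects lanes {0, 2, 3, 5}.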
10435 TE = newTreeEntry(VL, EntryState: TreeEntry::CompressVectorize, Bundle, S,
10436 UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
10437 LLVM_DEBUG(
10438 dbgs()
10439 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
10440 TE->dump());
10441 break;
10442 case TreeEntry::StridedVectorize:
10443 // Vectorizing non-consecutive loads as strided loads.
10444 TE = newTreeEntry(VL, EntryState: TreeEntry::StridedVectorize, Bundle, S,
10445 UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
10446 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
10447 TE->dump());
10448 break;
10449 case TreeEntry::ScatterVectorize:
10450 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
10451 TE = newTreeEntry(VL, EntryState: TreeEntry::ScatterVectorize, Bundle, S,
10452 UserTreeIdx, ReuseShuffleIndices);
10453 LLVM_DEBUG(
10454 dbgs()
10455 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
10456 TE->dump());
10457 break;
10458 case TreeEntry::CombinedVectorize:
10459 case TreeEntry::SplitVectorize:
10460 case TreeEntry::NeedToGather:
10461 llvm_unreachable("Unexpected loads state.");
10462 }
10463 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
10464 assert(Operands.size() == 1 && "Expected a single operand only");
10465 SmallVector<int> Mask;
10466 inversePermutation(Indices: CurrentOrder, Mask);
10467 reorderScalars(Scalars&: Operands.front(), Mask);
10468 }
10469 TE->setOperands(Operands);
10470 if (State == TreeEntry::ScatterVectorize)
10471 buildTreeRec(VLRef: PointerOps, Depth: Depth + 1, UserTreeIdx: {TE, 0});
10472 return;
10473 }
10474 case Instruction::ZExt:
10475 case Instruction::SExt:
10476 case Instruction::FPToUI:
10477 case Instruction::FPToSI:
10478 case Instruction::FPExt:
10479 case Instruction::PtrToInt:
10480 case Instruction::IntToPtr:
10481 case Instruction::SIToFP:
10482 case Instruction::UIToFP:
10483 case Instruction::Trunc:
10484 case Instruction::FPTrunc:
10485 case Instruction::BitCast: {
10486 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
10487 u: std::make_pair(x: std::numeric_limits<unsigned>::min(),
10488 y: std::numeric_limits<unsigned>::max()));
10489 if (ShuffleOrOp == Instruction::ZExt ||
10490 ShuffleOrOp == Instruction::SExt) {
10491 CastMaxMinBWSizes = std::make_pair(
10492 x: std::max<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
10493 b: PrevMaxBW),
10494 y: std::min<unsigned>(
10495 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
10496 b: PrevMinBW));
10497 } else if (ShuffleOrOp == Instruction::Trunc) {
10498 CastMaxMinBWSizes = std::make_pair(
10499 x: std::max<unsigned>(
10500 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
10501 b: PrevMaxBW),
10502 y: std::min<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
10503 b: PrevMinBW));
10504 }
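    // Illustrative example (assumed types): a bundle of zext i8 -> i32 records
    // {MaxBW, MinBW} = {32, 8}; a later trunc i64 -> i16 bundle widens this to
    // {64, 8}. These bounds are consulted by the later bitwidth-minimization
    // analysis.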
10505 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10506 ReuseShuffleIndices);
10507 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
10508 TE->dump());
10509
10510 TE->setOperands(Operands);
10511 for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
10512 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
10513 if (ShuffleOrOp == Instruction::Trunc) {
10514 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
10515 } else if (ShuffleOrOp == Instruction::SIToFP ||
10516 ShuffleOrOp == Instruction::UIToFP) {
10517 unsigned NumSignBits =
10518 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
10519 if (auto *OpI = dyn_cast<Instruction>(Val: VL0->getOperand(i: 0))) {
10520 APInt Mask = DB->getDemandedBits(I: OpI);
10521 NumSignBits = std::max(a: NumSignBits, b: Mask.countl_zero());
10522 }
10523 if (NumSignBits * 2 >=
10524 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
10525 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
10526 }
10527 return;
10528 }
10529 case Instruction::ICmp:
10530 case Instruction::FCmp: {
10531 // Check that all of the compares have the same predicate.
10532 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
10533 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10534 ReuseShuffleIndices);
10535 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
10536 TE->dump());
10537
10538 VLOperands Ops(VL, Operands, S, *this);
10539 if (cast<CmpInst>(Val: VL0)->isCommutative()) {
10540 // Commutative predicate - collect + sort operands of the instructions
10541 // so that each side is more likely to have the same opcode.
10542 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
10543 "Commutative Predicate mismatch");
10544 Ops.reorder();
10545 Operands.front() = Ops.getVL(OpIdx: 0);
10546 Operands.back() = Ops.getVL(OpIdx: 1);
10547 } else {
10548 // Collect operands - commute if it uses the swapped predicate.
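        // E.g., if P0 is slt and this lane is written as icmp sgt %a, %b, its
        // operands are stored as (%b, %a) so that it reads as icmp slt %b, %a,
        // matching the rest of the bundle.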
10549 for (auto [Idx, V] : enumerate(First&: VL)) {
10550 if (isa<PoisonValue>(Val: V))
10551 continue;
10552 auto *Cmp = cast<CmpInst>(Val: V);
10553 if (Cmp->getPredicate() != P0)
10554 std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
10555 }
10556 }
10557 TE->setOperands(Operands);
10558 buildTreeRec(VLRef: Operands.front(), Depth: Depth + 1, UserTreeIdx: {TE, 0});
10559 buildTreeRec(VLRef: Operands.back(), Depth: Depth + 1, UserTreeIdx: {TE, 1});
10560 if (ShuffleOrOp == Instruction::ICmp) {
10561 unsigned NumSignBits0 =
10562 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
10563 if (NumSignBits0 * 2 >=
10564 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
10565 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
10566 unsigned NumSignBits1 =
10567 ComputeNumSignBits(Op: VL0->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT);
10568 if (NumSignBits1 * 2 >=
10569 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 1)->getType()))
10570 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 1)->Idx);
10571 }
10572 return;
10573 }
10574 case Instruction::Select:
10575 case Instruction::FNeg:
10576 case Instruction::Add:
10577 case Instruction::FAdd:
10578 case Instruction::Sub:
10579 case Instruction::FSub:
10580 case Instruction::Mul:
10581 case Instruction::FMul:
10582 case Instruction::UDiv:
10583 case Instruction::SDiv:
10584 case Instruction::FDiv:
10585 case Instruction::URem:
10586 case Instruction::SRem:
10587 case Instruction::FRem:
10588 case Instruction::Shl:
10589 case Instruction::LShr:
10590 case Instruction::AShr:
10591 case Instruction::And:
10592 case Instruction::Or:
10593 case Instruction::Xor:
10594 case Instruction::Freeze: {
10595 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10596 ReuseShuffleIndices);
10597 LLVM_DEBUG(
10598 dbgs() << "SLP: added a new TreeEntry "
10599 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
10600 TE->dump());
10601
10602 if (isa<BinaryOperator>(Val: VL0) && isCommutative(I: VL0)) {
10603 VLOperands Ops(VL, Operands, S, *this);
10604 Ops.reorder();
10605 Operands[0] = Ops.getVL(OpIdx: 0);
10606 Operands[1] = Ops.getVL(OpIdx: 1);
10607 }
10608 TE->setOperands(Operands);
10609 for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
10610 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
10611 return;
10612 }
10613 case Instruction::GetElementPtr: {
10614 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10615 ReuseShuffleIndices);
10616 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
10617 TE->dump());
10618 TE->setOperands(Operands);
10619
10620 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
10621 buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
10622 return;
10623 }
10624 case Instruction::Store: {
10625 bool Consecutive = CurrentOrder.empty();
10626 if (!Consecutive)
10627 fixupOrderingIndices(Order: CurrentOrder);
10628 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10629 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
10630 if (Consecutive)
10631 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
10632 TE->dump());
10633 else
10634 LLVM_DEBUG(
10635 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
10636 TE->dump());
10637 TE->setOperands(Operands);
10638 buildTreeRec(VLRef: TE->getOperand(OpIdx: 0), Depth: Depth + 1, UserTreeIdx: {TE, 0});
10639 return;
10640 }
10641 case Instruction::Call: {
10642 // Check if the calls are all to the same vectorizable intrinsic or
10643 // library function.
10644 CallInst *CI = cast<CallInst>(Val: VL0);
10645 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10646
10647 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10648 ReuseShuffleIndices);
10649 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
10650 TE->dump());
10651 if (isCommutative(I: VL0)) {
10652 VLOperands Ops(VL, Operands, S, *this);
10653 Ops.reorder();
10654 Operands[0] = Ops.getVL(OpIdx: 0);
10655 Operands[1] = Ops.getVL(OpIdx: 1);
10656 }
10657 TE->setOperands(Operands);
10658 for (unsigned I : seq<unsigned>(Size: CI->arg_size())) {
10659 // For scalar operands there is no need to create an entry since we do not
10660 // vectorize them.
10661 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI))
10662 continue;
10663 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
10664 }
10665 return;
10666 }
10667 case Instruction::ShuffleVector: {
10668 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10669 ReuseShuffleIndices);
10670 if (S.isAltShuffle()) {
10671 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
10672 TE->dump());
10673 } else {
10674 assert(SLPReVec && "Only supported by REVEC.");
10675 LLVM_DEBUG(
10676 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
10677 TE->dump());
10678 }
10679
10680 // Reorder operands if reordering would enable vectorization.
10681 auto *CI = dyn_cast<CmpInst>(Val: VL0);
10682 if (CI && any_of(Range&: VL, P: [](Value *V) {
10683 return !isa<PoisonValue>(Val: V) && !cast<CmpInst>(Val: V)->isCommutative();
10684 })) {
10685 auto *MainCI = cast<CmpInst>(Val: S.getMainOp());
10686 auto *AltCI = cast<CmpInst>(Val: S.getAltOp());
10687 CmpInst::Predicate MainP = MainCI->getPredicate();
10688 CmpInst::Predicate AltP = AltCI->getPredicate();
10689 assert(MainP != AltP &&
10690 "Expected different main/alternate predicates.");
10691 // Collect operands - commute if it uses the swapped predicate or
10692 // alternate operation.
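        // E.g., if the main predicate is slt and a lane classified as the main
        // opcode is written as icmp sgt %a, %b, its operands are stored as
        // (%b, %a) so that every main lane reads as an slt comparison.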
10693 for (auto [Idx, V] : enumerate(First&: VL)) {
10694 if (isa<PoisonValue>(Val: V))
10695 continue;
10696 auto *Cmp = cast<CmpInst>(Val: V);
10697
10698 if (isAlternateInstruction(I: Cmp, MainOp: MainCI, AltOp: AltCI, TLI: *TLI)) {
10699 if (AltP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
10700 std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
10701 } else {
10702 if (MainP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
10703 std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
10704 }
10705 }
10706 TE->setOperands(Operands);
10707 buildTreeRec(VLRef: Operands.front(), Depth: Depth + 1, UserTreeIdx: {TE, 0});
10708 buildTreeRec(VLRef: Operands.back(), Depth: Depth + 1, UserTreeIdx: {TE, 1});
10709 return;
10710 }
10711
10712 if (isa<BinaryOperator>(Val: VL0) || CI) {
10713 VLOperands Ops(VL, Operands, S, *this);
10714 Ops.reorder();
10715 Operands[0] = Ops.getVL(OpIdx: 0);
10716 Operands[1] = Ops.getVL(OpIdx: 1);
10717 }
10718 TE->setOperands(Operands);
10719 for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
10720 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
10721 return;
10722 }
10723 default:
10724 break;
10725 }
10726 llvm_unreachable("Unexpected vectorization of the instructions.");
10727}
10728
10729unsigned BoUpSLP::canMapToVector(Type *T) const {
10730 unsigned N = 1;
10731 Type *EltTy = T;
10732
10733 while (isa<StructType, ArrayType, FixedVectorType>(Val: EltTy)) {
10734 if (EltTy->isEmptyTy())
10735 return 0;
10736 if (auto *ST = dyn_cast<StructType>(Val: EltTy)) {
10737 // Check that the struct is homogeneous.
10738 for (const auto *Ty : ST->elements())
10739 if (Ty != *ST->element_begin())
10740 return 0;
10741 N *= ST->getNumElements();
10742 EltTy = *ST->element_begin();
10743 } else if (auto *AT = dyn_cast<ArrayType>(Val: EltTy)) {
10744 N *= AT->getNumElements();
10745 EltTy = AT->getElementType();
10746 } else {
10747 auto *VT = cast<FixedVectorType>(Val: EltTy);
10748 N *= VT->getNumElements();
10749 EltTy = VT->getElementType();
10750 }
10751 }
10752
10753 if (!isValidElementType(Ty: EltTy))
10754 return 0;
10755 size_t VTSize = DL->getTypeStoreSizeInBits(Ty: getWidenedType(ScalarTy: EltTy, VF: N));
10756 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
10757 VTSize != DL->getTypeStoreSizeInBits(Ty: T))
10758 return 0;
10759 return N;
10760}
10761
10762bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
10763 SmallVectorImpl<unsigned> &CurrentOrder,
10764 bool ResizeAllowed) const {
10765 const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst, ExtractValueInst>);
10766 assert(It != VL.end() && "Expected at least one extract instruction.");
10767 auto *E0 = cast<Instruction>(Val: *It);
10768 assert(
10769 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
10770 "Invalid opcode");
10771 // Check if all of the extracts come from the same vector and from the
10772 // correct offset.
10773 Value *Vec = E0->getOperand(i: 0);
10774
10775 CurrentOrder.clear();
10776
10777 // We have to extract from a vector/aggregate with the same number of elements.
10778 unsigned NElts;
10779 if (E0->getOpcode() == Instruction::ExtractValue) {
10780 NElts = canMapToVector(T: Vec->getType());
10781 if (!NElts)
10782 return false;
10783 // Check if the load can be rewritten as a load of a vector.
10784 LoadInst *LI = dyn_cast<LoadInst>(Val: Vec);
10785 if (!LI || !LI->isSimple() || !LI->hasNUses(N: VL.size()))
10786 return false;
10787 } else {
10788 NElts = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
10789 }
10790
10791 unsigned E = VL.size();
10792 if (!ResizeAllowed && NElts != E)
10793 return false;
10794 SmallVector<int> Indices(E, PoisonMaskElem);
10795 unsigned MinIdx = NElts, MaxIdx = 0;
10796 for (auto [I, V] : enumerate(First&: VL)) {
10797 auto *Inst = dyn_cast<Instruction>(Val: V);
10798 if (!Inst)
10799 continue;
10800 if (Inst->getOperand(i: 0) != Vec)
10801 return false;
10802 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst))
10803 if (isa<UndefValue>(Val: EE->getIndexOperand()))
10804 continue;
10805 std::optional<unsigned> Idx = getExtractIndex(E: Inst);
10806 if (!Idx)
10807 return false;
10808 const unsigned ExtIdx = *Idx;
10809 if (ExtIdx >= NElts)
10810 continue;
10811 Indices[I] = ExtIdx;
10812 if (MinIdx > ExtIdx)
10813 MinIdx = ExtIdx;
10814 if (MaxIdx < ExtIdx)
10815 MaxIdx = ExtIdx;
10816 }
10817 if (MaxIdx - MinIdx + 1 > E)
10818 return false;
10819 if (MaxIdx + 1 <= E)
10820 MinIdx = 0;
10821
10822 // Check that all of the indices extract from the correct offset.
10823 bool ShouldKeepOrder = true;
10824 // Assign to all items the initial value E so we can check if the extract
10825 // instruction index was used already.
10826 // Also, later we can check that all the indices are used and we have a
10827 // consecutive access in the extract instructions, by checking that no
10828 // element of CurrentOrder still has the value E.
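// Illustrative example (hypothetical extracts): for rebased extract indices
// {2, 0, 1, 3} this produces CurrentOrder = {1, 2, 0, 3} and returns false;
// for {0, 1, 2, 3} the order is the identity, CurrentOrder is cleared and
// the function returns true.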
10829 CurrentOrder.assign(NumElts: E, Elt: E);
10830 for (unsigned I = 0; I < E; ++I) {
10831 if (Indices[I] == PoisonMaskElem)
10832 continue;
10833 const unsigned ExtIdx = Indices[I] - MinIdx;
10834 if (CurrentOrder[ExtIdx] != E) {
10835 CurrentOrder.clear();
10836 return false;
10837 }
10838 ShouldKeepOrder &= ExtIdx == I;
10839 CurrentOrder[ExtIdx] = I;
10840 }
10841 if (ShouldKeepOrder)
10842 CurrentOrder.clear();
10843
10844 return ShouldKeepOrder;
10845}
10846
10847bool BoUpSLP::areAllUsersVectorized(
10848 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
10849 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(V: I))) ||
10850 all_of(Range: I->users(), P: [this](User *U) {
10851 return isVectorized(V: U) || isVectorLikeInstWithConstOps(V: U) ||
10852 (isa<ExtractElementInst>(Val: U) && MustGather.contains(Ptr: U));
10853 });
10854}
10855
10856void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
10857 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
10858 SmallVectorImpl<Value *> *OpScalars,
10859 SmallVectorImpl<Value *> *AltScalars) const {
10860 unsigned Sz = Scalars.size();
10861 Mask.assign(NumElts: Sz, Elt: PoisonMaskElem);
10862 SmallVector<int> OrderMask;
10863 if (!ReorderIndices.empty())
10864 inversePermutation(Indices: ReorderIndices, Mask&: OrderMask);
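  // Illustrative example (hypothetical bundle): for Scalars = {add0, sub0,
  // add1, sub1} with no reordering, where IsAltOp matches the subs, the
  // resulting Mask is {0, 5, 2, 7}: even lanes come from the main (add)
  // vector and odd lanes from the alternate (sub) vector.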
10865 for (unsigned I = 0; I < Sz; ++I) {
10866 unsigned Idx = I;
10867 if (!ReorderIndices.empty())
10868 Idx = OrderMask[I];
10869 if (isa<PoisonValue>(Val: Scalars[Idx]))
10870 continue;
10871 auto *OpInst = cast<Instruction>(Val: Scalars[Idx]);
10872 if (IsAltOp(OpInst)) {
10873 Mask[I] = Sz + Idx;
10874 if (AltScalars)
10875 AltScalars->push_back(Elt: OpInst);
10876 } else {
10877 Mask[I] = Idx;
10878 if (OpScalars)
10879 OpScalars->push_back(Elt: OpInst);
10880 }
10881 }
10882 if (!ReuseShuffleIndices.empty()) {
10883 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
10884 transform(Range: ReuseShuffleIndices, d_first: NewMask.begin(), F: [&Mask](int Idx) {
10885 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
10886 });
10887 Mask.swap(RHS&: NewMask);
10888 }
10889}
10890
10891static bool isMainInstruction(Instruction *I, Instruction *MainOp,
10892 Instruction *AltOp,
10893 const TargetLibraryInfo &TLI) {
10894 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
10895}
10896
10897static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
10898 Instruction *AltOp,
10899 const TargetLibraryInfo &TLI) {
10900 if (auto *MainCI = dyn_cast<CmpInst>(Val: MainOp)) {
10901 auto *AltCI = cast<CmpInst>(Val: AltOp);
10902 CmpInst::Predicate MainP = MainCI->getPredicate();
10903 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
10904 assert(MainP != AltP && "Expected different main/alternate predicates.");
10905 auto *CI = cast<CmpInst>(Val: I);
10906 if (isCmpSameOrSwapped(BaseCI: MainCI, CI, TLI))
10907 return false;
10908 if (isCmpSameOrSwapped(BaseCI: AltCI, CI, TLI))
10909 return true;
10910 CmpInst::Predicate P = CI->getPredicate();
10911 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(pred: P);
10912
10913 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
10914 "CmpInst expected to match either main or alternate predicate or "
10915 "their swap.");
10916 return MainP != P && MainP != SwappedP;
10917 }
10918 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
10919}
10920
10921TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
10922 assert(!Ops.empty());
10923 const auto *Op0 = Ops.front();
10924
10925 const bool IsConstant = all_of(Range&: Ops, P: [](Value *V) {
10926 // TODO: We should allow undef elements here
10927 return isConstant(V) && !isa<UndefValue>(Val: V);
10928 });
10929 const bool IsUniform = all_of(Range&: Ops, P: [=](Value *V) {
10930 // TODO: We should allow undef elements here
10931 return V == Op0;
10932 });
10933 const bool IsPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
10934 // TODO: We should allow undef elements here
10935 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
10936 return CI->getValue().isPowerOf2();
10937 return false;
10938 });
10939 const bool IsNegatedPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
10940 // TODO: We should allow undef elements here
10941 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
10942 return CI->getValue().isNegatedPowerOf2();
10943 return false;
10944 });
10945
10946 TTI::OperandValueKind VK = TTI::OK_AnyValue;
10947 if (IsConstant && IsUniform)
10948 VK = TTI::OK_UniformConstantValue;
10949 else if (IsConstant)
10950 VK = TTI::OK_NonUniformConstantValue;
10951 else if (IsUniform)
10952 VK = TTI::OK_UniformValue;
10953
10954 TTI::OperandValueProperties VP = TTI::OP_None;
10955 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
10956 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
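  // E.g., if every operand is the constant 8, the result is
  // {OK_UniformConstantValue, OP_PowerOf2}; for distinct non-constant operands
  // it is {OK_AnyValue, OP_None}.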
10957
10958 return {.Kind: VK, .Properties: VP};
10959}
10960
10961namespace {
10962/// The base class for shuffle instruction emission and shuffle cost estimation.
10963class BaseShuffleAnalysis {
10964protected:
10965 Type *ScalarTy = nullptr;
10966
10967 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
10968
10969 /// V is expected to be a vectorized value.
10970 /// When REVEC is disabled, there is no difference between VF and
10971 /// VNumElements.
10972 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
10973 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
10974 /// of 8.
10975 unsigned getVF(Value *V) const {
10976 assert(V && "V cannot be nullptr");
10977 assert(isa<FixedVectorType>(V->getType()) &&
10978 "V does not have FixedVectorType");
10979 assert(ScalarTy && "ScalarTy cannot be nullptr");
10980 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
10981 unsigned VNumElements =
10982 cast<FixedVectorType>(Val: V->getType())->getNumElements();
10983 assert(VNumElements > ScalarTyNumElements &&
10984 "the number of elements of V is not large enough");
10985 assert(VNumElements % ScalarTyNumElements == 0 &&
10986 "the number of elements of V is not a vectorized value");
10987 return VNumElements / ScalarTyNumElements;
10988 }
10989
10990 /// Checks if the mask is an identity mask.
10991 /// \param IsStrict if it is true, the function returns false if the mask
10992 /// size does not match the vector size.
10993 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
10994 bool IsStrict) {
10995 int Limit = Mask.size();
10996 int VF = VecTy->getNumElements();
10997 int Index = -1;
10998 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Limit))
10999 return true;
11000 if (!IsStrict) {
11001 // Consider extract subvector starting from index 0.
11002 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
11003 Index == 0)
11004 return true;
11005 // All VF-size submasks are identity (e.g.
11006 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
11007 if (Limit % VF == 0 && all_of(Range: seq<int>(Begin: 0, End: Limit / VF), P: [=](int Idx) {
11008 ArrayRef<int> Slice = Mask.slice(N: Idx * VF, M: VF);
11009 return all_of(Range&: Slice, P: [](int I) { return I == PoisonMaskElem; }) ||
11010 ShuffleVectorInst::isIdentityMask(Mask: Slice, NumSrcElts: VF);
11011 }))
11012 return true;
11013 }
11014 return false;
11015 }
11016
11017 /// Tries to combine 2 different masks into single one.
11018 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
11019 /// change the size of the vector, \p LocalVF is the original size of the
11020 /// shuffled vector.
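  /// For example (illustrative), with \p LocalVF 2, \p Mask <1, 0> and
  /// \p ExtMask <0, 1, 2, 3>, the resulting combined mask is <1, 0, 1, 0>.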
11021 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
11022 ArrayRef<int> ExtMask) {
11023 unsigned VF = Mask.size();
11024 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11025 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11026 if (ExtMask[I] == PoisonMaskElem)
11027 continue;
11028 int MaskedIdx = Mask[ExtMask[I] % VF];
11029 NewMask[I] =
11030 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
11031 }
11032 Mask.swap(RHS&: NewMask);
11033 }
11034
11035 /// Looks through shuffles trying to reduce final number of shuffles in the
11036 /// code. The function looks through the previously emitted shuffle
11037 /// instructions and properly marks indices in the mask as undef.
11038 /// For example, given the code
11039 /// \code
11040 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11041 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11042 /// \endcode
11043 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
11044 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
11045 /// <0, 1, 2, 3> for the shuffle.
11046 /// If 2 operands are of different size, the smallest one will be resized and
11047 /// the mask recalculated properly.
11048 /// For example, given the code
11049 /// \code
11050 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11051 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11052 /// \endcode
  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>,
  /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// So, it tries to transform permutations into a simple vector merge, if
  /// possible.
11058 /// \param V The input vector which must be shuffled using the given \p Mask.
11059 /// If the better candidate is found, \p V is set to this best candidate
11060 /// vector.
11061 /// \param Mask The input mask for the shuffle. If the best candidate is found
11062 /// during looking-through-shuffles attempt, it is updated accordingly.
11063 /// \param SinglePermute true if the shuffle operation is originally a
11064 /// single-value-permutation. In this case the look-through-shuffles procedure
11065 /// may look for resizing shuffles as the best candidates.
  /// \return true if the shuffle results in a non-resizing identity shuffle
  /// (and thus can be ignored), false otherwise.
11068 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
11069 bool SinglePermute) {
11070 Value *Op = V;
11071 ShuffleVectorInst *IdentityOp = nullptr;
11072 SmallVector<int> IdentityMask;
11073 while (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Op)) {
11074 // Exit if not a fixed vector type or changing size shuffle.
11075 auto *SVTy = dyn_cast<FixedVectorType>(Val: SV->getType());
11076 if (!SVTy)
11077 break;
11078 // Remember the identity or broadcast mask, if it is not a resizing
11079 // shuffle. If no better candidates are found, this Op and Mask will be
11080 // used in the final shuffle.
11081 if (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/false)) {
11082 if (!IdentityOp || !SinglePermute ||
11083 (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/true) &&
11084 !ShuffleVectorInst::isZeroEltSplatMask(Mask: IdentityMask,
11085 NumSrcElts: IdentityMask.size()))) {
11086 IdentityOp = SV;
          // Store the current mask in IdentityMask so that we do not lose
          // this info later if IdentityOp is selected as the best candidate
          // for the permutation.
11090 IdentityMask.assign(RHS: Mask);
11091 }
11092 }
11093 // Remember the broadcast mask. If no better candidates are found, this Op
11094 // and Mask will be used in the final shuffle.
      // A zero splat can be used as an identity too, since it might be used
      // with the mask <0, 1, 2, ...>, i.e. an identity mask without extra
      // reshuffling.
      // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>,
      // which is expensive, and the analysis finds out that the source vector
      // is just a broadcast, this original mask can be transformed into the
      // identity mask <0, 1, 2, 3>.
11101 // \code
      // %0 = shuffle %v, poison, zeroinitializer
11103 // %res = shuffle %0, poison, <3, 1, 2, 0>
11104 // \endcode
11105 // may be transformed to
11106 // \code
      // %0 = shuffle %v, poison, zeroinitializer
11108 // %res = shuffle %0, poison, <0, 1, 2, 3>
11109 // \endcode
11110 if (SV->isZeroEltSplat()) {
11111 IdentityOp = SV;
11112 IdentityMask.assign(RHS: Mask);
11113 }
11114 int LocalVF = Mask.size();
11115 if (auto *SVOpTy =
11116 dyn_cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType()))
11117 LocalVF = SVOpTy->getNumElements();
11118 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
11119 for (auto [Idx, I] : enumerate(First&: Mask)) {
11120 if (I == PoisonMaskElem ||
11121 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
11122 continue;
11123 ExtMask[Idx] = SV->getMaskValue(Elt: I);
11124 }
11125 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
11126 V: SV->getOperand(i_nocapture: 0),
11127 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::FirstArg))
11128 .all();
11129 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
11130 V: SV->getOperand(i_nocapture: 1),
11131 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::SecondArg))
11132 .all();
11133 if (!IsOp1Undef && !IsOp2Undef) {
11134 // Update mask and mark undef elems.
11135 for (int &I : Mask) {
11136 if (I == PoisonMaskElem)
11137 continue;
11138 if (SV->getMaskValue(Elt: I % SV->getShuffleMask().size()) ==
11139 PoisonMaskElem)
11140 I = PoisonMaskElem;
11141 }
11142 break;
11143 }
11144 SmallVector<int> ShuffleMask(SV->getShuffleMask());
11145 combineMasks(LocalVF, Mask&: ShuffleMask, ExtMask: Mask);
11146 Mask.swap(RHS&: ShuffleMask);
11147 if (IsOp2Undef)
11148 Op = SV->getOperand(i_nocapture: 0);
11149 else
11150 Op = SV->getOperand(i_nocapture: 1);
11151 }
11152 if (auto *OpTy = dyn_cast<FixedVectorType>(Val: Op->getType());
11153 !OpTy || !isIdentityMask(Mask, VecTy: OpTy, IsStrict: SinglePermute) ||
11154 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())) {
11155 if (IdentityOp) {
11156 V = IdentityOp;
11157 assert(Mask.size() == IdentityMask.size() &&
11158 "Expected masks of same sizes.");
11159 // Clear known poison elements.
11160 for (auto [I, Idx] : enumerate(First&: Mask))
11161 if (Idx == PoisonMaskElem)
11162 IdentityMask[I] = PoisonMaskElem;
11163 Mask.swap(RHS&: IdentityMask);
11164 auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: V);
11165 return SinglePermute &&
11166 (isIdentityMask(Mask, VecTy: cast<FixedVectorType>(Val: V->getType()),
11167 /*IsStrict=*/true) ||
11168 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
11169 Shuffle->isZeroEltSplat() &&
11170 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size()) &&
11171 all_of(Range: enumerate(First&: Mask), P: [&](const auto &P) {
11172 return P.value() == PoisonMaskElem ||
11173 Shuffle->getShuffleMask()[P.index()] == 0;
11174 })));
11175 }
11176 V = Op;
11177 return false;
11178 }
11179 V = Op;
11180 return true;
11181 }
11182
  /// Smart shuffle instruction emission: walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
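  /// A rough sketch of the interface \p Builder is expected to provide, as
  /// used below (T is the result type; ShuffleCostBuilder further down is one
  /// concrete, cost-modelling implementation):
  /// \code
  ///   struct SomeShuffleBuilder {
  ///     void resizeToMatch(Value *&V1, Value *&V2);
  ///     T createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask);
  ///     T createShuffleVector(Value *V1, ArrayRef<int> Mask);
  ///     T createIdentity(Value *V);
  ///     T createPoison(Type *EltTy, unsigned VF);
  ///   };
  /// \endcode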
11186 template <typename T, typename ShuffleBuilderTy>
11187 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
11188 ShuffleBuilderTy &Builder, Type *ScalarTy) {
11189 assert(V1 && "Expected at least one vector value.");
11190 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
11191 SmallVector<int> NewMask(Mask);
11192 if (ScalarTyNumElements != 1) {
11193 assert(SLPReVec && "FixedVectorType is not expected.");
11194 transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask&: NewMask);
11195 Mask = NewMask;
11196 }
11197 if (V2)
11198 Builder.resizeToMatch(V1, V2);
11199 int VF = Mask.size();
11200 if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
11201 VF = FTy->getNumElements();
11202 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
11203 V: V2, UseMask: buildUseMask(VF, Mask, MaskArg: UseMask::SecondArg))
11204 .all()) {
11205 // Peek through shuffles.
11206 Value *Op1 = V1;
11207 Value *Op2 = V2;
11208 int VF =
11209 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
11210 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
11211 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
11212 for (int I = 0, E = Mask.size(); I < E; ++I) {
11213 if (Mask[I] < VF)
11214 CombinedMask1[I] = Mask[I];
11215 else
11216 CombinedMask2[I] = Mask[I] - VF;
11217 }
11218 Value *PrevOp1;
11219 Value *PrevOp2;
11220 do {
11221 PrevOp1 = Op1;
11222 PrevOp2 = Op2;
11223 (void)peekThroughShuffles(V&: Op1, Mask&: CombinedMask1, /*SinglePermute=*/false);
11224 (void)peekThroughShuffles(V&: Op2, Mask&: CombinedMask2, /*SinglePermute=*/false);
11225 // Check if we have 2 resizing shuffles - need to peek through operands
11226 // again.
11227 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Val: Op1))
11228 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Val: Op2)) {
11229 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
11230 for (auto [Idx, I] : enumerate(First&: CombinedMask1)) {
11231 if (I == PoisonMaskElem)
11232 continue;
11233 ExtMask1[Idx] = SV1->getMaskValue(Elt: I);
11234 }
11235 SmallBitVector UseMask1 = buildUseMask(
11236 VF: cast<FixedVectorType>(Val: SV1->getOperand(i_nocapture: 1)->getType())
11237 ->getNumElements(),
11238 Mask: ExtMask1, MaskArg: UseMask::SecondArg);
11239 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
11240 for (auto [Idx, I] : enumerate(First&: CombinedMask2)) {
11241 if (I == PoisonMaskElem)
11242 continue;
11243 ExtMask2[Idx] = SV2->getMaskValue(Elt: I);
11244 }
11245 SmallBitVector UseMask2 = buildUseMask(
11246 VF: cast<FixedVectorType>(Val: SV2->getOperand(i_nocapture: 1)->getType())
11247 ->getNumElements(),
11248 Mask: ExtMask2, MaskArg: UseMask::SecondArg);
11249 if (SV1->getOperand(i_nocapture: 0)->getType() ==
11250 SV2->getOperand(i_nocapture: 0)->getType() &&
11251 SV1->getOperand(i_nocapture: 0)->getType() != SV1->getType() &&
11252 isUndefVector(V: SV1->getOperand(i_nocapture: 1), UseMask: UseMask1).all() &&
11253 isUndefVector(V: SV2->getOperand(i_nocapture: 1), UseMask: UseMask2).all()) {
11254 Op1 = SV1->getOperand(i_nocapture: 0);
11255 Op2 = SV2->getOperand(i_nocapture: 0);
11256 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
11257 int LocalVF = ShuffleMask1.size();
11258 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op1->getType()))
11259 LocalVF = FTy->getNumElements();
11260 combineMasks(LocalVF, Mask&: ShuffleMask1, ExtMask: CombinedMask1);
11261 CombinedMask1.swap(RHS&: ShuffleMask1);
11262 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
11263 LocalVF = ShuffleMask2.size();
11264 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op2->getType()))
11265 LocalVF = FTy->getNumElements();
11266 combineMasks(LocalVF, Mask&: ShuffleMask2, ExtMask: CombinedMask2);
11267 CombinedMask2.swap(RHS&: ShuffleMask2);
11268 }
11269 }
11270 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
11271 Builder.resizeToMatch(Op1, Op2);
11272 VF = std::max(a: cast<VectorType>(Val: Op1->getType())
11273 ->getElementCount()
11274 .getKnownMinValue(),
11275 b: cast<VectorType>(Val: Op2->getType())
11276 ->getElementCount()
11277 .getKnownMinValue());
11278 for (int I = 0, E = Mask.size(); I < E; ++I) {
11279 if (CombinedMask2[I] != PoisonMaskElem) {
11280 assert(CombinedMask1[I] == PoisonMaskElem &&
11281 "Expected undefined mask element");
11282 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
11283 }
11284 }
11285 if (Op1 == Op2 &&
11286 (ShuffleVectorInst::isIdentityMask(Mask: CombinedMask1, NumSrcElts: VF) ||
11287 (ShuffleVectorInst::isZeroEltSplatMask(Mask: CombinedMask1, NumSrcElts: VF) &&
11288 isa<ShuffleVectorInst>(Val: Op1) &&
11289 cast<ShuffleVectorInst>(Val: Op1)->getShuffleMask() ==
11290 ArrayRef(CombinedMask1))))
11291 return Builder.createIdentity(Op1);
11292 return Builder.createShuffleVector(
11293 Op1, Op1 == Op2 ? PoisonValue::get(T: Op1->getType()) : Op2,
11294 CombinedMask1);
11295 }
11296 if (isa<PoisonValue>(Val: V1))
11297 return Builder.createPoison(
11298 cast<VectorType>(Val: V1->getType())->getElementType(), Mask.size());
11299 bool IsIdentity = peekThroughShuffles(V&: V1, Mask&: NewMask, /*SinglePermute=*/true);
11300 assert(V1 && "Expected non-null value after looking through shuffles.");
11301
11302 if (!IsIdentity)
11303 return Builder.createShuffleVector(V1, NewMask);
11304 return Builder.createIdentity(V1);
11305 }
11306
  /// Transforms the mask \p CommonMask per the given \p Mask, so that after
  /// the shuffle emission every used lane refers to the emitted result by its
  /// own index.
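  /// E.g. if \p Mask == <2, poison, 0, poison>, lanes 0 and 2 are now
  /// produced by the just-emitted shuffle, so CommonMask becomes
  /// <0, x, 2, x>, where x stands for whatever those lanes held before.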
11309 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11310 ArrayRef<int> Mask) {
11311 for (unsigned I : seq<unsigned>(Size: CommonMask.size()))
11312 if (Mask[I] != PoisonMaskElem)
11313 CommonMask[I] = I;
11314 }
11315};
11316} // namespace
11317
/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
11319static std::pair<InstructionCost, InstructionCost>
11320getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
11321 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
11322 Type *ScalarTy, VectorType *VecTy) {
11323 InstructionCost ScalarCost = 0;
11324 InstructionCost VecCost = 0;
11325 // Here we differentiate two cases: (1) when Ptrs represent a regular
11326 // vectorization tree node (as they are pointer arguments of scattered
11327 // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as a plain wide unit-stride load/store since all the
11329 // loads/stores are known to be from/to adjacent locations.
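  // Illustrative example: the GEPs feeding four consecutive i32 loads that
  // are folded into a single wide load fall under case (2); the pointer
  // operands of loads that become a masked gather fall under case (1).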
11330 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
11331 // Case 2: estimate costs for pointer related costs when vectorizing to
11332 // a wide load/store.
11333 // Scalar cost is estimated as a set of pointers with known relationship
11334 // between them.
11335 // For vector code we will use BasePtr as argument for the wide load/store
11336 // but we also need to account all the instructions which are going to
11337 // stay in vectorized code due to uses outside of these scalar
11338 // loads/stores.
11339 ScalarCost = TTI.getPointersChainCost(
11340 Ptrs, Base: BasePtr, Info: TTI::PointersChainInfo::getUnitStride(), AccessTy: ScalarTy,
11341 CostKind);
11342
11343 SmallVector<const Value *> PtrsRetainedInVecCode;
11344 for (Value *V : Ptrs) {
11345 if (V == BasePtr) {
11346 PtrsRetainedInVecCode.push_back(Elt: V);
11347 continue;
11348 }
11349 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
      // For simplicity, assume Ptr stays in vectorized code if it's not a
      // GEP instruction. We don't care, since its cost is considered free.
11352 // TODO: We should check for any uses outside of vectorizable tree
11353 // rather than just single use.
11354 if (!Ptr || !Ptr->hasOneUse())
11355 PtrsRetainedInVecCode.push_back(Elt: V);
11356 }
11357
11358 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
11359 // If all pointers stay in vectorized code then we don't have
11360 // any savings on that.
11361 return std::make_pair(x: TTI::TCC_Free, y: TTI::TCC_Free);
11362 }
11363 VecCost = TTI.getPointersChainCost(Ptrs: PtrsRetainedInVecCode, Base: BasePtr,
11364 Info: TTI::PointersChainInfo::getKnownStride(),
11365 AccessTy: VecTy, CostKind);
11366 } else {
11367 // Case 1: Ptrs are the arguments of loads that we are going to transform
11368 // into masked gather load intrinsic.
11369 // All the scalar GEPs will be removed as a result of vectorization.
11370 // For any external uses of some lanes extract element instructions will
11371 // be generated (which cost is estimated separately).
11372 TTI::PointersChainInfo PtrsInfo =
11373 all_of(Range&: Ptrs,
11374 P: [](const Value *V) {
11375 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
11376 return Ptr && !Ptr->hasAllConstantIndices();
11377 })
11378 ? TTI::PointersChainInfo::getUnknownStride()
11379 : TTI::PointersChainInfo::getKnownStride();
11380
11381 ScalarCost =
11382 TTI.getPointersChainCost(Ptrs, Base: BasePtr, Info: PtrsInfo, AccessTy: ScalarTy, CostKind);
11383 auto *BaseGEP = dyn_cast<GEPOperator>(Val: BasePtr);
11384 if (!BaseGEP) {
11385 auto *It = find_if(Range&: Ptrs, P: IsaPred<GEPOperator>);
11386 if (It != Ptrs.end())
11387 BaseGEP = cast<GEPOperator>(Val: *It);
11388 }
11389 if (BaseGEP) {
11390 SmallVector<const Value *> Indices(BaseGEP->indices());
11391 VecCost = TTI.getGEPCost(PointeeType: BaseGEP->getSourceElementType(),
11392 Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: VecTy,
11393 CostKind);
11394 }
11395 }
11396
11397 return std::make_pair(x&: ScalarCost, y&: VecCost);
11398}
11399
11400void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
11401 assert(TE.isGather() && TE.ReorderIndices.empty() &&
11402 "Expected gather node without reordering.");
11403 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
11404 SmallSet<size_t, 2> LoadKeyUsed;
11405
  // Do not reorder nodes if the node is small (just 2 elements), all-constant,
  // or if all instructions already have the same opcode.
11408 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
11409 all_of(Range&: TE.Scalars, P: isConstant))
11410 return;
11411
11412 if (any_of(Range: seq<unsigned>(Size: TE.Idx), P: [&](unsigned Idx) {
11413 return VectorizableTree[Idx]->isSame(VL: TE.Scalars);
11414 }))
11415 return;
11416
11417 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
11418 Key = hash_combine(args: hash_value(ptr: LI->getParent()), args: Key);
11419 Value *Ptr =
11420 getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
11421 if (LoadKeyUsed.contains(V: Key)) {
11422 auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr));
11423 if (LIt != LoadsMap.end()) {
11424 for (LoadInst *RLI : LIt->second) {
11425 if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
11426 ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL: *DL, SE&: *SE,
11427 /*StrictCheck=*/true))
11428 return hash_value(ptr: RLI->getPointerOperand());
11429 }
11430 for (LoadInst *RLI : LIt->second) {
11431 if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
11432 Ptr2: LI->getPointerOperand(), TLI: *TLI)) {
11433 hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
11434 return SubKey;
11435 }
11436 }
11437 if (LIt->second.size() > 2) {
11438 hash_code SubKey =
11439 hash_value(ptr: LIt->second.back()->getPointerOperand());
11440 return SubKey;
11441 }
11442 }
11443 }
11444 LoadKeyUsed.insert(V: Key);
11445 LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr)).first->second.push_back(Elt: LI);
11446 return hash_value(ptr: LI->getPointerOperand());
11447 };
11448 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
11449 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
11450 bool IsOrdered = true;
11451 unsigned NumInstructions = 0;
11452 // Try to "cluster" scalar instructions, to be able to build extra vectorized
11453 // nodes.
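  // Illustrative example: for a gather like <a0, b0, a1, b1> (hypothetical
  // scalars), where the a* and b* values hash to different (Key, Idx) pairs,
  // the node may be reordered to <a0, a1, b0, b1> so that each group can
  // later be built as its own subvector (recorded in SubVectors below).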
11454 for (auto [I, V] : enumerate(First&: TE.Scalars)) {
11455 size_t Key = 1, Idx = 1;
11456 if (auto *Inst = dyn_cast<Instruction>(Val: V);
11457 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(Val: V) &&
11458 !isDeleted(I: Inst) && !isVectorized(V)) {
11459 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
11460 /*AllowAlternate=*/false);
11461 ++NumInstructions;
11462 }
11463 auto &Container = SortedValues[Key];
11464 if (IsOrdered && !KeyToIndex.contains(Val: V) &&
11465 !(isa<Constant, ExtractElementInst>(Val: V) ||
11466 isVectorLikeInstWithConstOps(V)) &&
11467 ((Container.contains(Key: Idx) &&
11468 KeyToIndex.at(Val: Container[Idx].back()).back() != I - 1) ||
11469 (!Container.empty() && !Container.contains(Key: Idx) &&
11470 KeyToIndex.at(Val: Container.back().second.back()).back() != I - 1)))
11471 IsOrdered = false;
11472 auto &KTI = KeyToIndex[V];
11473 if (KTI.empty())
11474 Container[Idx].push_back(Elt: V);
11475 KTI.push_back(Elt: I);
11476 }
11477 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
11478 APInt DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size());
11479 if (!IsOrdered && NumInstructions > 1) {
11480 unsigned Cnt = 0;
11481 TE.ReorderIndices.resize(N: TE.Scalars.size(), NV: TE.Scalars.size());
11482 for (const auto &D : SortedValues) {
11483 for (const auto &P : D.second) {
11484 unsigned Sz = 0;
11485 for (Value *V : P.second) {
11486 ArrayRef<unsigned> Indices = KeyToIndex.at(Val: V);
11487 for (auto [K, Idx] : enumerate(First&: Indices)) {
11488 TE.ReorderIndices[Cnt + K] = Idx;
11489 TE.Scalars[Cnt + K] = V;
11490 }
11491 Sz += Indices.size();
11492 Cnt += Indices.size();
11493 }
11494 if (Sz > 1 && isa<Instruction>(Val: P.second.front())) {
11495 const unsigned SubVF = getFloorFullVectorNumberOfElements(
11496 TTI: *TTI, Ty: TE.Scalars.front()->getType(), Sz);
11497 SubVectors.emplace_back(Args: Cnt - Sz, Args: SubVF);
11498 for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt - Sz + SubVF))
11499 DemandedElts.clearBit(BitPosition: I);
11500 } else if (!P.second.empty() && isConstant(V: P.second.front())) {
11501 for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt))
11502 DemandedElts.clearBit(BitPosition: I);
11503 }
11504 }
11505 }
11506 }
11507 // Reuses always require shuffles, so consider it as profitable.
11508 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
11509 return;
11510 // Do simple cost estimation.
11511 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11512 InstructionCost Cost = 0;
11513 auto *ScalarTy = TE.Scalars.front()->getType();
11514 auto *VecTy = getWidenedType(ScalarTy, VF: TE.Scalars.size());
11515 for (auto [Idx, Sz] : SubVectors) {
11516 Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind,
11517 Index: Idx, SubTp: getWidenedType(ScalarTy, VF: Sz));
11518 }
11519 Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
11520 /*Insert=*/true,
11521 /*Extract=*/false, CostKind);
11522 int Sz = TE.Scalars.size();
11523 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
11524 TE.ReorderIndices.end());
11525 for (unsigned I : seq<unsigned>(Size: Sz)) {
11526 Value *V = TE.getOrdered(Idx: I);
11527 if (isa<PoisonValue>(Val: V)) {
11528 ReorderMask[I] = PoisonMaskElem;
11529 } else if (isConstant(V) || DemandedElts[I]) {
11530 ReorderMask[I] = I + TE.ReorderIndices.size();
11531 }
11532 }
11533 Cost += ::getShuffleCost(TTI: *TTI,
11534 Kind: any_of(Range&: ReorderMask, P: [&](int I) { return I >= Sz; })
11535 ? TTI::SK_PermuteTwoSrc
11536 : TTI::SK_PermuteSingleSrc,
11537 Tp: VecTy, Mask: ReorderMask);
11538 DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size());
11539 ReorderMask.assign(NumElts: Sz, Elt: PoisonMaskElem);
11540 for (unsigned I : seq<unsigned>(Size: Sz)) {
11541 Value *V = TE.getOrdered(Idx: I);
11542 if (isConstant(V)) {
11543 DemandedElts.clearBit(BitPosition: I);
11544 if (!isa<PoisonValue>(Val: V))
11545 ReorderMask[I] = I;
11546 } else {
11547 ReorderMask[I] = I + Sz;
11548 }
11549 }
11550 InstructionCost BVCost =
11551 getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
11552 /*Insert=*/true, /*Extract=*/false, CostKind);
11553 if (!DemandedElts.isAllOnes())
11554 BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: ReorderMask);
11555 if (Cost >= BVCost) {
11556 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
11557 reorderScalars(Scalars&: TE.Scalars, Mask);
11558 TE.ReorderIndices.clear();
11559 }
11560}
11561
11562void BoUpSLP::transformNodes() {
11563 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11564 BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on, and turn it off when done.
  class GraphTransformModeRAII {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAII(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAII() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // Operands are profitable if they are:
  // 1. At least one constant
  // or
  // 2. Splats
  // or
  // 3. A good vectorization opportunity, i.e. they may generate vector
  //    nodes and reduce the cost of the graph (see the example below).
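  // E.g. for the hypothetical instruction pair (x + 1, x + 2) the first
  // operands are the same value and the second operands are constants, so
  // every operand pair passes; for unrelated non-constant operands the
  // look-ahead heuristic (findBestRootPair) decides.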
11583 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
11584 const InstructionsState &S) {
11585 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11586 for (unsigned Op : seq<unsigned>(Size: S.getMainOp()->getNumOperands()))
11587 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
11588 Args: I2->getOperand(i: Op));
11589 return all_of(
11590 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11591 return all_of(Range&: Cand,
11592 P: [](const std::pair<Value *, Value *> &P) {
11593 return isa<Constant>(Val: P.first) ||
11594 isa<Constant>(Val: P.second) || P.first == P.second;
11595 }) ||
11596 findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplatLoads);
11597 });
11598 };
11599
11600 // Try to reorder gather nodes for better vectorization opportunities.
11601 for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) {
11602 TreeEntry &E = *VectorizableTree[Idx];
11603 if (E.isGather())
11604 reorderGatherNode(TE&: E);
11605 }
11606
  // Better to use the full gathered-loads analysis if there are only 2
  // gathered load nodes, each having fewer than 16 elements.
11609 constexpr unsigned VFLimit = 16;
11610 bool ForceLoadGather =
11611 count_if(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
11612 return TE->isGather() && TE->hasState() &&
11613 TE->getOpcode() == Instruction::Load &&
11614 TE->getVectorFactor() < VFLimit;
11615 }) == 2;
11616
  // Checks if the scalars are used in another node.
11618 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
11619 function_ref<bool(Value *)> CheckContainer) {
11620 return TE->isSame(VL) || all_of(Range&: VL, P: [&](Value *V) {
11621 if (isa<PoisonValue>(Val: V))
11622 return true;
11623 auto *I = dyn_cast<Instruction>(Val: V);
11624 if (!I)
11625 return false;
11626 return is_contained(Range: TE->Scalars, Element: I) || CheckContainer(I);
11627 });
11628 };
11629 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
11630 if (E.hasState()) {
11631 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: E.getMainOp());
11632 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
11633 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
11634 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
11635 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
11636 return is_contained(Range&: TEs, Element: TE);
11637 });
11638 });
11639 }))
11640 return true;
11642 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: E.getMainOp());
11643 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
11644 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
11645 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
11646 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
11647 return is_contained(Range&: TEs, Element: TE);
11648 });
11649 });
11650 }))
11651 return true;
11652 } else {
      // Check if the gather node is a full copy of a split node.
11654 auto *It = find_if(Range: E.Scalars, P: IsaPred<Instruction>);
11655 if (It != E.Scalars.end()) {
11656 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: *It);
11657 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
11658 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
11659 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
11660 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
11661 return is_contained(Range&: TEs, Element: TE);
11662 });
11663 });
11664 }))
11665 return true;
11666 }
11667 }
11668 return false;
11669 };
  // The tree may grow here, so iterate over the nodes built before.
11671 for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) {
11672 TreeEntry &E = *VectorizableTree[Idx];
11673 if (E.isGather()) {
11674 ArrayRef<Value *> VL = E.Scalars;
11675 const unsigned Sz = getVectorElementSize(V: VL.front());
11676 unsigned MinVF = getMinVF(Sz: 2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2 elements), for
      // nodes with the same opcode and same parent block, or for all-constant
      // nodes.
11679 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(key: Idx) ||
11680 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            // We use allSameOpcode instead of isAltShuffle because we don't
            // want to use interchangeable instructions here.
11683 !allSameOpcode(VL) || !allSameBlock(VL)) ||
11684 allConstant(VL) || isSplat(VL))
11685 continue;
11686 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
11687 continue;
11688 // Check if the node is a copy of other vector nodes.
11689 if (CheckForSameVectorNodes(E))
11690 continue;
11691 // Try to find vectorizable sequences and transform them into a series of
11692 // insertvector instructions.
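      // Illustrative example: in a 6-element gather where elements 0..3 form
      // a vectorizable slice, that slice is built as its own subtree (via
      // buildTreeRec below) and later combined back into the full vector,
      // while the remaining elements either form a smaller slice or stay
      // gathered.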
11693 unsigned StartIdx = 0;
11694 unsigned End = VL.size();
11695 for (unsigned VF = getFloorFullVectorNumberOfElements(
11696 TTI: *TTI, Ty: VL.front()->getType(), Sz: VL.size() - 1);
11697 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
11698 TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) {
11699 if (StartIdx + VF > End)
11700 continue;
11701 SmallVector<std::pair<unsigned, unsigned>> Slices;
11702 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
11703 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
11704 // If any instruction is vectorized already - do not try again.
11705 // Reuse the existing node, if it fully matches the slice.
11706 if (isVectorized(V: Slice.front()) &&
11707 !getSameValuesTreeEntry(V: Slice.front(), VL: Slice, /*SameVF=*/true))
11708 continue;
          // Constants are already handled effectively - skip.
11710 if (allConstant(VL: Slice))
11711 continue;
          // Do not try to vectorize small splats (smaller than a vector
          // register and with only a single non-undef element).
11714 bool IsSplat = isSplat(VL: Slice);
11715 bool IsTwoRegisterSplat = true;
11716 if (IsSplat && VF == 2) {
11717 unsigned NumRegs2VF = ::getNumberOfParts(
11718 TTI: *TTI, VecTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF: 2 * VF));
11719 IsTwoRegisterSplat = NumRegs2VF == 2;
11720 }
11721 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
11722 count(Range&: Slice, Element: Slice.front()) ==
11723 static_cast<long>(isa<UndefValue>(Val: Slice.front()) ? VF - 1
11724 : 1)) {
11725 if (IsSplat)
11726 continue;
11727 InstructionsState S = getSameOpcode(VL: Slice, TLI: *TLI);
11728 if (!S || !allSameOpcode(VL: Slice) || !allSameBlock(VL: Slice) ||
11729 (S.getOpcode() == Instruction::Load &&
11730 areKnownNonVectorizableLoads(VL: Slice)) ||
11731 (S.getOpcode() != Instruction::Load &&
11732 !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(), Sz: VF)))
11733 continue;
11734 if (VF == 2) {
11735 // Try to vectorize reduced values or if all users are vectorized.
11736 // For expensive instructions extra extracts might be profitable.
11737 if ((!UserIgnoreList || E.Idx != 0) &&
11738 TTI->getInstructionCost(U: S.getMainOp(), CostKind) <
11739 TTI::TCC_Expensive &&
11740 !all_of(Range&: Slice, P: [&](Value *V) {
11741 if (isa<PoisonValue>(Val: V))
11742 return true;
11743 return areAllUsersVectorized(I: cast<Instruction>(Val: V),
11744 VectorizedVals: UserIgnoreList);
11745 }))
11746 continue;
11747 if (S.getOpcode() == Instruction::Load) {
11748 OrdersType Order;
11749 SmallVector<Value *> PointerOps;
11750 LoadsState Res =
11751 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps);
11752 // Do not vectorize gathers.
11753 if (Res == LoadsState::ScatterVectorize ||
11754 Res == LoadsState::Gather) {
11755 if (Res == LoadsState::Gather) {
11756 registerNonVectorizableLoads(VL: Slice);
                  // If vectorizing reductions and the scalars come from the
                  // root node - mark them as non-vectorizable reduction
                  // values.
11759 if (UserIgnoreList && E.Idx == 0)
11760 analyzedReductionVals(VL: Slice);
11761 }
11762 continue;
11763 }
11764 } else if (S.getOpcode() == Instruction::ExtractElement ||
11765 (TTI->getInstructionCost(U: S.getMainOp(), CostKind) <
11766 TTI::TCC_Expensive &&
11767 !CheckOperandsProfitability(
11768 S.getMainOp(),
11769 cast<Instruction>(Val: *find_if(Range: reverse(C&: Slice),
11770 P: IsaPred<Instruction>)),
11771 S))) {
              // Do not vectorize extractelements (handled effectively
              // already). Do not vectorize non-profitable instructions (with
              // low cost and non-vectorizable operands).
11775 continue;
11776 }
11777 }
11778 }
11779 Slices.emplace_back(Args&: Cnt, Args: Slice.size());
11780 }
11781 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
11782 E.CombinedEntriesWithIndices.emplace_back(Args&: Idx, Args&: Cnt);
11783 if (StartIdx == Cnt)
11784 StartIdx = Cnt + Sz;
11785 if (End == Cnt + Sz)
11786 End = Cnt;
11787 };
11788 for (auto [Cnt, Sz] : Slices) {
11789 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: Sz);
11790 const TreeEntry *SameTE = nullptr;
11791 if (const auto *It = find_if(Range&: Slice, P: IsaPred<Instruction>);
11792 It != Slice.end()) {
11793 // If any instruction is vectorized already - do not try again.
11794 SameTE = getSameValuesTreeEntry(V: *It, VL: Slice);
11795 }
11796 unsigned PrevSize = VectorizableTree.size();
11797 [[maybe_unused]] unsigned PrevEntriesSize =
11798 LoadEntriesToVectorize.size();
11799 buildTreeRec(VLRef: Slice, Depth: 0, UserTreeIdx: EdgeInfo(&E, UINT_MAX));
11800 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
11801 VectorizableTree[PrevSize]->isGather() &&
11802 VectorizableTree[PrevSize]->hasState() &&
11803 VectorizableTree[PrevSize]->getOpcode() !=
11804 Instruction::ExtractElement &&
11805 !isSplat(VL: Slice)) {
11806 if (UserIgnoreList && E.Idx == 0 && VF == 2)
11807 analyzedReductionVals(VL: Slice);
11808 VectorizableTree.pop_back();
11809 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
11810 "LoadEntriesToVectorize expected to remain the same");
11811 continue;
11812 }
11813 AddCombinedNode(PrevSize, Cnt, Sz);
11814 }
11815 }
11816 // Restore ordering, if no extra vectorization happened.
11817 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
11818 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
11819 reorderScalars(Scalars&: E.Scalars, Mask);
11820 E.ReorderIndices.clear();
11821 }
11822 }
11823 if (!E.hasState())
11824 continue;
11825 switch (E.getOpcode()) {
11826 case Instruction::Load: {
11827 // No need to reorder masked gather loads, just reorder the scalar
11828 // operands.
11829 if (E.State != TreeEntry::Vectorize)
11830 break;
11831 Type *ScalarTy = E.getMainOp()->getType();
11832 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
11833 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E.Scalars);
      // Check if it is profitable to represent a consecutive load + reverse
      // as a strided load with stride -1.
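      // Illustrative example: loads of a[3], a[2], a[1], a[0] (for some
      // array a) would normally become a consecutive <4 x ...> load plus a
      // reverse shuffle; if the target supports strided accesses and it is
      // cheaper, the node is turned into a single strided load with stride -1
      // instead.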
11836 if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) &&
11837 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
11838 SmallVector<int> Mask;
11839 inversePermutation(Indices: E.ReorderIndices, Mask);
11840 auto *BaseLI = cast<LoadInst>(Val: E.Scalars.back());
11841 InstructionCost OriginalVecCost =
11842 TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: BaseLI->getAlign(),
11843 AddressSpace: BaseLI->getPointerAddressSpace(), CostKind,
11844 OpdInfo: TTI::OperandValueInfo()) +
11845 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
11846 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
11847 Opcode: Instruction::Load, DataTy: VecTy, Ptr: BaseLI->getPointerOperand(),
11848 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: BaseLI);
11849 if (StridedCost < OriginalVecCost)
11850 // Strided load is more profitable than consecutive load + reverse -
11851 // transform the node to strided load.
11852 E.State = TreeEntry::StridedVectorize;
11853 }
11854 break;
11855 }
11856 case Instruction::Store: {
11857 Type *ScalarTy =
11858 cast<StoreInst>(Val: E.getMainOp())->getValueOperand()->getType();
11859 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
11860 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E.Scalars);
      // Check if it is profitable to represent a consecutive store + reverse
      // as a strided store with stride -1.
11863 if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) &&
11864 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
11865 SmallVector<int> Mask;
11866 inversePermutation(Indices: E.ReorderIndices, Mask);
11867 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.back());
11868 InstructionCost OriginalVecCost =
11869 TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
11870 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind,
11871 OpdInfo: TTI::OperandValueInfo()) +
11872 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
11873 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
11874 Opcode: Instruction::Store, DataTy: VecTy, Ptr: BaseSI->getPointerOperand(),
11875 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: BaseSI);
11876 if (StridedCost < OriginalVecCost)
11877 // Strided store is more profitable than reverse + consecutive store -
11878 // transform the node to strided store.
11879 E.State = TreeEntry::StridedVectorize;
11880 } else if (!E.ReorderIndices.empty()) {
11881 // Check for interleaved stores.
11882 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
11883 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.front());
11884 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
11885 if (Mask.size() < 4)
11886 return 0u;
11887 for (unsigned Factor : seq<unsigned>(Begin: 2, End: Mask.size() / 2 + 1)) {
11888 if (ShuffleVectorInst::isInterleaveMask(
11889 Mask, Factor, NumInputElts: VecTy->getElementCount().getFixedValue()) &&
11890 TTI.isLegalInterleavedAccessType(
11891 VTy: VecTy, Factor, Alignment: BaseSI->getAlign(),
11892 AddrSpace: BaseSI->getPointerAddressSpace()))
11893 return Factor;
11894 }
11895
11896 return 0u;
11897 };
11898 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
11899 unsigned InterleaveFactor = IsInterleaveMask(Mask);
11900 if (InterleaveFactor != 0)
11901 E.setInterleave(InterleaveFactor);
11902 }
11903 break;
11904 }
11905 case Instruction::Select: {
11906 if (E.State != TreeEntry::Vectorize)
11907 break;
11908 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: E.Scalars);
11909 if (MinMaxID == Intrinsic::not_intrinsic)
11910 break;
11911 // This node is a minmax node.
11912 E.CombinedOp = TreeEntry::MinMax;
11913 TreeEntry *CondEntry = getOperandEntry(E: &E, Idx: 0);
11914 if (SelectOnly && CondEntry->UserTreeIndex &&
11915 CondEntry->State == TreeEntry::Vectorize) {
11916 // The condition node is part of the combined minmax node.
11917 CondEntry->State = TreeEntry::CombinedVectorize;
11918 }
11919 break;
11920 }
11921 default:
11922 break;
11923 }
11924 }
11925
11926 if (LoadEntriesToVectorize.empty()) {
11927 // Single load node - exit.
11928 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
11929 VectorizableTree.front()->getOpcode() == Instruction::Load)
11930 return;
11931 // Small graph with small VF - exit.
11932 constexpr unsigned SmallTree = 3;
11933 constexpr unsigned SmallVF = 2;
11934 if ((VectorizableTree.size() <= SmallTree &&
11935 VectorizableTree.front()->Scalars.size() == SmallVF) ||
11936 (VectorizableTree.size() <= 2 && UserIgnoreList))
11937 return;
11938
11939 if (VectorizableTree.front()->isNonPowOf2Vec() &&
11940 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
11941 getCanonicalGraphSize() <= SmallTree &&
11942 count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()),
11943 P: [](const std::unique_ptr<TreeEntry> &TE) {
11944 return TE->isGather() && TE->hasState() &&
11945 TE->getOpcode() == Instruction::Load &&
11946 !allSameBlock(VL: TE->Scalars);
11947 }) == 1)
11948 return;
11949 }
11950
11951 // A list of loads to be gathered during the vectorization process. We can
11952 // try to vectorize them at the end, if profitable.
11953 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
11954 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
11955 GatheredLoads;
11956
11957 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11958 TreeEntry &E = *TE;
11959 if (E.isGather() &&
11960 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
11961 (!E.hasState() && any_of(Range&: E.Scalars,
11962 P: [&](Value *V) {
11963 return isa<LoadInst>(Val: V) &&
11964 !isVectorized(V) &&
11965 !isDeleted(I: cast<Instruction>(Val: V));
11966 }))) &&
11967 !isSplat(VL: E.Scalars)) {
11968 for (Value *V : E.Scalars) {
11969 auto *LI = dyn_cast<LoadInst>(Val: V);
11970 if (!LI)
11971 continue;
11972 if (isDeleted(I: LI) || isVectorized(V: LI) || !LI->isSimple())
11973 continue;
11974 gatherPossiblyVectorizableLoads(
11975 R: *this, VL: V, DL: *DL, SE&: *SE, TTI: *TTI,
11976 GatheredLoads&: GatheredLoads[std::make_tuple(
11977 args: LI->getParent(),
11978 args: getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth),
11979 args: LI->getType())]);
11980 }
11981 }
11982 }
11983 // Try to vectorize gathered loads if this is not just a gather of loads.
11984 if (!GatheredLoads.empty())
11985 tryToVectorizeGatheredLoads(GatheredLoads);
11986}
11987
/// Merges shuffle masks and emits the final shuffle instruction, if required.
/// It supports shuffling of 2 input vectors. It implements lazy shuffle
/// emission: the actual shuffle instruction is generated only if it is really
/// required. Otherwise, the shuffle instruction emission is delayed until the
/// end of the process, to reduce the number of emitted instructions and to
/// simplify further analysis/transformations.
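/// Rough flow, as implemented below: the source vectors/tree entries are
/// accumulated in InVectors and their combined permutation in CommonMask; the
/// (virtual) shuffle is only costed via createShuffle when a non-matching node
/// arrives or when the estimation is finalized.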
11994class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
11995 bool IsFinalized = false;
11996 SmallVector<int> CommonMask;
11997 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
11998 const TargetTransformInfo &TTI;
11999 InstructionCost Cost = 0;
12000 SmallDenseSet<Value *> VectorizedVals;
12001 BoUpSLP &R;
12002 SmallPtrSetImpl<Value *> &CheckedExtracts;
12003 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, we are still trying to estimate the cost for the same nodes
  /// and can delay the actual cost estimation (virtual shuffle instruction
  /// emission). This may help to better estimate the cost if the same nodes
  /// must be permuted and allows moving most of the long shuffle cost
  /// estimation to TTI.
12008 bool SameNodesEstimated = true;
12009
12010 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
12011 if (Ty->getScalarType()->isPointerTy()) {
12012 Constant *Res = ConstantExpr::getIntToPtr(
12013 C: ConstantInt::getAllOnesValue(
12014 Ty: IntegerType::get(C&: Ty->getContext(),
12015 NumBits: DL.getTypeStoreSizeInBits(Ty: Ty->getScalarType()))),
12016 Ty: Ty->getScalarType());
12017 if (auto *VTy = dyn_cast<VectorType>(Val: Ty))
12018 Res = ConstantVector::getSplat(EC: VTy->getElementCount(), Elt: Res);
12019 return Res;
12020 }
12021 return Constant::getAllOnesValue(Ty);
12022 }
12023
12024 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
12025 if ((!Root && allConstant(VL)) || all_of(Range&: VL, P: IsaPred<UndefValue>))
12026 return TTI::TCC_Free;
12027 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
12028 InstructionCost GatherCost = 0;
12029 SmallVector<Value *> Gathers(VL);
12030 if (!Root && isSplat(VL)) {
      // Found a broadcast of a single scalar - calculate the cost as
      // the broadcast.
12033 const auto *It = find_if_not(Range&: VL, P: IsaPred<UndefValue>);
12034 assert(It != VL.end() && "Expected at least one non-undef value.");
12035 // Add broadcast for non-identity shuffle only.
12036 bool NeedShuffle =
12037 count(Range&: VL, Element: *It) > 1 &&
12038 (VL.front() != *It || !all_of(Range: VL.drop_front(), P: IsaPred<UndefValue>));
12039 if (!NeedShuffle) {
12040 if (isa<FixedVectorType>(Val: ScalarTy)) {
12041 assert(SLPReVec && "FixedVectorType is not expected.");
12042 return TTI.getShuffleCost(
12043 Kind: TTI::SK_InsertSubvector, DstTy: VecTy, SrcTy: VecTy, Mask: {}, CostKind,
12044 Index: std::distance(first: VL.begin(), last: It) * getNumElements(Ty: ScalarTy),
12045 SubTp: cast<FixedVectorType>(Val: ScalarTy));
12046 }
12047 return TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy,
12048 CostKind, Index: std::distance(first: VL.begin(), last: It),
12049 Op0: PoisonValue::get(T: VecTy), Op1: *It);
12050 }
12051
12052 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
12053 transform(Range&: VL, d_first: ShuffleMask.begin(), F: [](Value *V) {
12054 return isa<PoisonValue>(Val: V) ? PoisonMaskElem : 0;
12055 });
12056 InstructionCost InsertCost =
12057 TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: 0,
12058 Op0: PoisonValue::get(T: VecTy), Op1: *It);
12059 return InsertCost + ::getShuffleCost(TTI,
12060 Kind: TargetTransformInfo::SK_Broadcast,
12061 Tp: VecTy, Mask: ShuffleMask, CostKind,
12062 /*Index=*/0, /*SubTp=*/nullptr,
12063 /*Args=*/*It);
12064 }
12065 return GatherCost +
12066 (all_of(Range&: Gathers, P: IsaPred<UndefValue>)
12067 ? TTI::TCC_Free
12068 : R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root && VL.equals(RHS: Gathers),
12069 ScalarTy));
12070 };
12071
12072 /// Compute the cost of creating a vector containing the extracted values from
12073 /// \p VL.
12074 InstructionCost
12075 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
12076 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
12077 unsigned NumParts) {
12078 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
12079 unsigned NumElts =
12080 std::accumulate(first: VL.begin(), last: VL.end(), init: 0, binary_op: [](unsigned Sz, Value *V) {
12081 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
12082 if (!EE)
12083 return Sz;
12084 auto *VecTy = dyn_cast<FixedVectorType>(Val: EE->getVectorOperandType());
12085 if (!VecTy)
12086 return Sz;
12087 return std::max(a: Sz, b: VecTy->getNumElements());
12088 });
12089 // FIXME: this must be moved to TTI for better estimation.
12090 unsigned EltsPerVector = getPartNumElems(Size: VL.size(), NumParts);
12091 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
12092 SmallVectorImpl<unsigned> &Indices,
12093 SmallVectorImpl<unsigned> &SubVecSizes)
12094 -> std::optional<TTI::ShuffleKind> {
12095 if (NumElts <= EltsPerVector)
12096 return std::nullopt;
12097 int OffsetReg0 =
12098 alignDown(Value: std::accumulate(first: Mask.begin(), last: Mask.end(), INT_MAX,
12099 binary_op: [](int S, int I) {
12100 if (I == PoisonMaskElem)
12101 return S;
12102 return std::min(a: S, b: I);
12103 }),
12104 Align: EltsPerVector);
12105 int OffsetReg1 = OffsetReg0;
12106 DenseSet<int> RegIndices;
      // Check whether we are trying to permute the same single or two input
      // vectors.
12108 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
12109 int FirstRegId = -1;
12110 Indices.assign(NumElts: 1, Elt: OffsetReg0);
12111 for (auto [Pos, I] : enumerate(First&: Mask)) {
12112 if (I == PoisonMaskElem)
12113 continue;
12114 int Idx = I - OffsetReg0;
12115 int RegId =
12116 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
12117 if (FirstRegId < 0)
12118 FirstRegId = RegId;
12119 RegIndices.insert(V: RegId);
12120 if (RegIndices.size() > 2)
12121 return std::nullopt;
12122 if (RegIndices.size() == 2) {
12123 ShuffleKind = TTI::SK_PermuteTwoSrc;
12124 if (Indices.size() == 1) {
12125 OffsetReg1 = alignDown(
12126 Value: std::accumulate(
12127 first: std::next(x: Mask.begin(), n: Pos), last: Mask.end(), INT_MAX,
12128 binary_op: [&](int S, int I) {
12129 if (I == PoisonMaskElem)
12130 return S;
12131 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
12132 ((I - OffsetReg0) % NumElts) / EltsPerVector;
12133 if (RegId == FirstRegId)
12134 return S;
12135 return std::min(a: S, b: I);
12136 }),
12137 Align: EltsPerVector);
12138 unsigned Index = OffsetReg1 % NumElts;
12139 Indices.push_back(Elt: Index);
12140 SubVecSizes.push_back(Elt: std::min(a: NumElts - Index, b: EltsPerVector));
12141 }
12142 Idx = I - OffsetReg1;
12143 }
12144 I = (Idx % NumElts) % EltsPerVector +
12145 (RegId == FirstRegId ? 0 : EltsPerVector);
12146 }
12147 return ShuffleKind;
12148 };
12149 InstructionCost Cost = 0;
12150
12151 // Process extracts in blocks of EltsPerVector to check if the source vector
12152 // operand can be re-used directly. If not, add the cost of creating a
12153 // shuffle to extract the values into a vector register.
12154 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
12155 if (!ShuffleKinds[Part])
12156 continue;
12157 ArrayRef<int> MaskSlice = Mask.slice(
12158 N: Part * EltsPerVector, M: getNumElems(Size: Mask.size(), PartNumElems: EltsPerVector, Part));
12159 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
12160 copy(Range&: MaskSlice, Out: SubMask.begin());
12161 SmallVector<unsigned, 2> Indices;
12162 SmallVector<unsigned, 2> SubVecSizes;
12163 std::optional<TTI::ShuffleKind> RegShuffleKind =
12164 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
12165 if (!RegShuffleKind) {
12166 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
12167 !ShuffleVectorInst::isIdentityMask(
12168 Mask: MaskSlice, NumSrcElts: std::max<unsigned>(a: NumElts, b: MaskSlice.size())))
12169 Cost +=
12170 ::getShuffleCost(TTI, Kind: *ShuffleKinds[Part],
12171 Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: MaskSlice);
12172 continue;
12173 }
12174 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
12175 !ShuffleVectorInst::isIdentityMask(Mask: SubMask, NumSrcElts: EltsPerVector)) {
12176 Cost +=
12177 ::getShuffleCost(TTI, Kind: *RegShuffleKind,
12178 Tp: getWidenedType(ScalarTy, VF: EltsPerVector), Mask: SubMask);
12179 }
12180 const unsigned BaseVF = getFullVectorNumberOfElements(
12181 TTI: *R.TTI, Ty: VL.front()->getType(), Sz: alignTo(Value: NumElts, Align: EltsPerVector));
12182 for (const auto [Idx, SubVecSize] : zip(t&: Indices, u&: SubVecSizes)) {
12183 assert((Idx + SubVecSize) <= BaseVF &&
12184 "SK_ExtractSubvector index out of range");
12185 Cost += ::getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector,
12186 Tp: getWidenedType(ScalarTy, VF: BaseVF), Mask: {}, CostKind,
12187 Index: Idx, SubTp: getWidenedType(ScalarTy, VF: SubVecSize));
12188 }
      // Second attempt: check if just a permute is estimated to be cheaper
      // than the subvector extract.
12191 SubMask.assign(NumElts, Elt: PoisonMaskElem);
12192 copy(Range&: MaskSlice, Out: SubMask.begin());
12193 InstructionCost OriginalCost = ::getShuffleCost(
12194 TTI, Kind: *ShuffleKinds[Part], Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: SubMask);
12195 if (OriginalCost < Cost)
12196 Cost = OriginalCost;
12197 }
12198 return Cost;
12199 }
  /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the
  /// given mask \p Mask and register number \p Part, which includes
  /// \p SliceSize elements.
12203 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
12204 ArrayRef<int> Mask, unsigned Part,
12205 unsigned SliceSize) {
12206 if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are being reshuffled.
      // If we already requested the cost of reshuffling E1 and E2 before,
      // there is no need to estimate another cost with the sub-Mask; instead,
      // include this sub-Mask into the CommonMask to estimate it later and
      // avoid double cost estimation.
12212 if ((InVectors.size() == 2 &&
12213 cast<const TreeEntry *>(Val&: InVectors.front()) == &E1 &&
12214 cast<const TreeEntry *>(Val&: InVectors.back()) == E2) ||
12215 (!E2 && cast<const TreeEntry *>(Val&: InVectors.front()) == &E1)) {
12216 unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part);
12217 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
12218 [](int Idx) { return Idx == PoisonMaskElem; }) &&
12219 "Expected all poisoned elements.");
12220 ArrayRef<int> SubMask = ArrayRef(Mask).slice(N: Part * SliceSize, M: Limit);
12221 copy(Range&: SubMask, Out: std::next(x: CommonMask.begin(), n: SliceSize * Part));
12222 return;
12223 }
12224 // Found non-matching nodes - need to estimate the cost for the matched
12225 // and transform mask.
12226 Cost += createShuffle(P1: InVectors.front(),
12227 P2: InVectors.size() == 1 ? nullptr : InVectors.back(),
12228 Mask: CommonMask);
12229 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12230 } else if (InVectors.size() == 2) {
12231 Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
12232 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12233 }
12234 SameNodesEstimated = false;
12235 if (!E2 && InVectors.size() == 1) {
12236 unsigned VF = E1.getVectorFactor();
12237 if (Value *V1 = dyn_cast<Value *>(Val&: InVectors.front())) {
12238 VF = std::max(a: VF, b: getVF(V: V1));
12239 } else {
12240 const auto *E = cast<const TreeEntry *>(Val&: InVectors.front());
12241 VF = std::max(a: VF, b: E->getVectorFactor());
12242 }
12243 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12244 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
12245 CommonMask[Idx] = Mask[Idx] + VF;
12246 Cost += createShuffle(P1: InVectors.front(), P2: &E1, Mask: CommonMask);
12247 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12248 } else {
12249 auto P = InVectors.front();
12250 Cost += createShuffle(P1: &E1, P2: E2, Mask);
12251 unsigned VF = Mask.size();
12252 if (Value *V1 = dyn_cast<Value *>(Val&: P)) {
12253 VF = std::max(a: VF,
12254 b: getNumElements(Ty: V1->getType()));
12255 } else {
12256 const auto *E = cast<const TreeEntry *>(Val&: P);
12257 VF = std::max(a: VF, b: E->getVectorFactor());
12258 }
12259 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12260 if (Mask[Idx] != PoisonMaskElem)
12261 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
12262 Cost += createShuffle(P1: P, P2: InVectors.front(), Mask: CommonMask);
12263 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12264 }
12265 }
12266
12267 class ShuffleCostBuilder {
12268 const TargetTransformInfo &TTI;
12269
12270 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
12271 int Index = -1;
12272 return Mask.empty() ||
12273 (VF == Mask.size() &&
12274 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) ||
12275 (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
12276 Index == 0);
12277 }
12278
12279 public:
12280 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
12281 ~ShuffleCostBuilder() = default;
12282 InstructionCost createShuffleVector(Value *V1, Value *,
12283 ArrayRef<int> Mask) const {
12284 // Empty mask or identity mask are free.
12285 unsigned VF =
12286 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
12287 if (isEmptyOrIdentity(Mask, VF))
12288 return TTI::TCC_Free;
12289 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
12290 Tp: cast<VectorType>(Val: V1->getType()), Mask);
12291 }
12292 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
12293 // Empty mask or identity mask are free.
12294 unsigned VF =
12295 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
12296 if (isEmptyOrIdentity(Mask, VF))
12297 return TTI::TCC_Free;
12298 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc,
12299 Tp: cast<VectorType>(Val: V1->getType()), Mask);
12300 }
12301 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
12302 InstructionCost createPoison(Type *Ty, unsigned VF) const {
12303 return TTI::TCC_Free;
12304 }
12305 void resizeToMatch(Value *&, Value *&) const {}
12306 };
12307
  /// Smart shuffle instruction emission: walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
12311 InstructionCost
12312 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
12313 const PointerUnion<Value *, const TreeEntry *> &P2,
12314 ArrayRef<int> Mask) {
12315 ShuffleCostBuilder Builder(TTI);
12316 SmallVector<int> CommonMask(Mask);
12317 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
12318 unsigned CommonVF = Mask.size();
12319 InstructionCost ExtraCost = 0;
12320 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
12321 unsigned VF) -> InstructionCost {
12322 if (E.isGather() && allConstant(VL: E.Scalars))
12323 return TTI::TCC_Free;
12324 Type *EScalarTy = E.Scalars.front()->getType();
12325 bool IsSigned = true;
12326 if (auto It = R.MinBWs.find(Val: &E); It != R.MinBWs.end()) {
12327 EScalarTy = IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
12328 IsSigned = It->second.second;
12329 }
12330 if (EScalarTy != ScalarTy) {
12331 unsigned CastOpcode = Instruction::Trunc;
12332 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
12333 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
12334 if (DstSz > SrcSz)
12335 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
12336 return TTI.getCastInstrCost(Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF),
12337 Src: getWidenedType(ScalarTy: EScalarTy, VF),
12338 CCH: TTI::CastContextHint::None, CostKind);
12339 }
12340 return TTI::TCC_Free;
12341 };
12342 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
12343 if (isa<Constant>(Val: V))
12344 return TTI::TCC_Free;
12345 auto *VecTy = cast<VectorType>(Val: V->getType());
12346 Type *EScalarTy = VecTy->getElementType();
12347 if (EScalarTy != ScalarTy) {
12348 bool IsSigned = !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL));
12349 unsigned CastOpcode = Instruction::Trunc;
12350 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
12351 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
12352 if (DstSz > SrcSz)
12353 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
12354 return TTI.getCastInstrCost(
12355 Opcode: CastOpcode, Dst: VectorType::get(ElementType: ScalarTy, EC: VecTy->getElementCount()),
12356 Src: VecTy, CCH: TTI::CastContextHint::None, CostKind);
12357 }
12358 return TTI::TCC_Free;
12359 };
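// Illustrative example (assumed bit widths): if a node was demoted to i16 in
// MinBWs while ScalarTy is i32, the lambdas above charge a sext/zext from
// <VF x i16> to <VF x i32> (or a trunc in the opposite direction); matching
// types, or all-constant gather nodes, add no extra cost.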
12360 if (!V1 && !V2 && !P2.isNull()) {
12361 // Shuffle 2 entry nodes.
12362 const TreeEntry *E = cast<const TreeEntry *>(Val: P1);
12363 unsigned VF = E->getVectorFactor();
12364 const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2);
12365 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
12366 assert(all_of(Mask,
12367 [=](int Idx) {
12368 return Idx < 2 * static_cast<int>(CommonVF);
12369 }) &&
12370 "All elements in mask must be less than 2 * CommonVF.");
12371 if (E->Scalars.size() == E2->Scalars.size()) {
12372 SmallVector<int> EMask = E->getCommonMask();
12373 SmallVector<int> E2Mask = E2->getCommonMask();
12374 if (!EMask.empty() || !E2Mask.empty()) {
12375 for (int &Idx : CommonMask) {
12376 if (Idx == PoisonMaskElem)
12377 continue;
12378 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
12379 Idx = EMask[Idx];
12380 else if (Idx >= static_cast<int>(CommonVF))
12381 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
12382 E->Scalars.size();
12383 }
12384 }
12385 CommonVF = E->Scalars.size();
12386 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
12387 GetNodeMinBWAffectedCost(*E2, CommonVF);
12388 } else {
12389 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
12390 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
12391 }
12392 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
12393 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
12394 } else if (!V1 && P2.isNull()) {
12395 // Shuffle single entry node.
12396 const TreeEntry *E = cast<const TreeEntry *>(Val: P1);
12397 unsigned VF = E->getVectorFactor();
12398 CommonVF = VF;
12399 assert(
12400 all_of(Mask,
12401 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
12402 "All elements in mask must be less than CommonVF.");
12403 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
12404 SmallVector<int> EMask = E->getCommonMask();
12405 assert(!EMask.empty() && "Expected non-empty common mask.");
12406 for (int &Idx : CommonMask) {
12407 if (Idx != PoisonMaskElem)
12408 Idx = EMask[Idx];
12409 }
12410 CommonVF = E->Scalars.size();
12411 } else if (unsigned Factor = E->getInterleaveFactor();
12412 Factor > 0 && E->Scalars.size() != Mask.size() &&
12413 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask: CommonMask,
12414 Factor)) {
12415 // Deinterleaved nodes are free.
12416 std::iota(first: CommonMask.begin(), last: CommonMask.end(), value: 0);
12417 }
12418 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
12419 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
12420 // Not identity/broadcast? Try to see if the original vector is better.
12421 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
12422 CommonVF == CommonMask.size() &&
12423 any_of(Range: enumerate(First&: CommonMask),
12424 P: [](const auto &&P) {
12425 return P.value() != PoisonMaskElem &&
12426 static_cast<unsigned>(P.value()) != P.index();
12427 }) &&
12428 any_of(Range&: CommonMask,
12429 P: [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
12430 SmallVector<int> ReorderMask;
12431 inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
12432 ::addMask(Mask&: CommonMask, SubMask: ReorderMask);
12433 }
12434 } else if (V1 && P2.isNull()) {
12435 // Shuffle single vector.
12436 ExtraCost += GetValueMinBWAffectedCost(V1);
12437 CommonVF = getVF(V: V1);
12438 assert(
12439 all_of(Mask,
12440 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
12441 "All elements in mask must be less than CommonVF.");
12442 } else if (V1 && !V2) {
12443 // Shuffle vector and tree node.
12444 unsigned VF = getVF(V: V1);
12445 const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2);
12446 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
12447 assert(all_of(Mask,
12448 [=](int Idx) {
12449 return Idx < 2 * static_cast<int>(CommonVF);
12450 }) &&
12451 "All elements in mask must be less than 2 * CommonVF.");
12452 if (E2->Scalars.size() == VF && VF != CommonVF) {
12453 SmallVector<int> E2Mask = E2->getCommonMask();
12454 assert(!E2Mask.empty() && "Expected non-empty common mask.");
12455 for (int &Idx : CommonMask) {
12456 if (Idx == PoisonMaskElem)
12457 continue;
12458 if (Idx >= static_cast<int>(CommonVF))
12459 Idx = E2Mask[Idx - CommonVF] + VF;
12460 }
12461 CommonVF = VF;
12462 }
12463 ExtraCost += GetValueMinBWAffectedCost(V1);
12464 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
12465 ExtraCost += GetNodeMinBWAffectedCost(
12466 *E2, std::min(a: CommonVF, b: E2->getVectorFactor()));
12467 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
12468 } else if (!V1 && V2) {
12469 // Shuffle vector and tree node.
12470 unsigned VF = getVF(V: V2);
12471 const TreeEntry *E1 = cast<const TreeEntry *>(Val: P1);
12472 CommonVF = std::max(a: VF, b: E1->getVectorFactor());
12473 assert(all_of(Mask,
12474 [=](int Idx) {
12475 return Idx < 2 * static_cast<int>(CommonVF);
12476 }) &&
12477 "All elements in mask must be less than 2 * CommonVF.");
12478 if (E1->Scalars.size() == VF && VF != CommonVF) {
12479 SmallVector<int> E1Mask = E1->getCommonMask();
12480 assert(!E1Mask.empty() && "Expected non-empty common mask.");
12481 for (int &Idx : CommonMask) {
12482 if (Idx == PoisonMaskElem)
12483 continue;
12484 if (Idx >= static_cast<int>(CommonVF))
12485 Idx = E1Mask[Idx - CommonVF] + VF;
12486 else
12487 Idx = E1Mask[Idx];
12488 }
12489 CommonVF = VF;
12490 }
12491 ExtraCost += GetNodeMinBWAffectedCost(
12492 *E1, std::min(a: CommonVF, b: E1->getVectorFactor()));
12493 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
12494 ExtraCost += GetValueMinBWAffectedCost(V2);
12495 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
12496 } else {
12497 assert(V1 && V2 && "Expected both vectors.");
12498 unsigned VF = getVF(V: V1);
12499 CommonVF = std::max(a: VF, b: getVF(V: V2));
12500 assert(all_of(Mask,
12501 [=](int Idx) {
12502 return Idx < 2 * static_cast<int>(CommonVF);
12503 }) &&
12504 "All elements in mask must be less than 2 * CommonVF.");
12505 ExtraCost +=
12506 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
12507 if (V1->getType() != V2->getType()) {
12508 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
12509 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
12510 } else {
12511 if (cast<VectorType>(Val: V1->getType())->getElementType() != ScalarTy)
12512 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
12513 if (cast<VectorType>(Val: V2->getType())->getElementType() != ScalarTy)
12514 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
12515 }
12516 }
12517 InVectors.front() =
12518 Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
12519 if (InVectors.size() == 2)
12520 InVectors.pop_back();
12521 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
12522 V1, V2, Mask: CommonMask, Builder, ScalarTy);
12523 }
12524
12525public:
12526 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
12527 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
12528 SmallPtrSetImpl<Value *> &CheckedExtracts)
12529 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
12530 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
12531 CheckedExtracts(CheckedExtracts) {}
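/// Illustrative note for adjustExtracts below (simplified, %v is a
/// hypothetical source vector): for a gather of
/// {extractelement %v, 0, extractelement %v, 1}, extracts that become dead
/// after vectorization get their extraction cost credited back, and the
/// gather itself may then be modeled as a shuffle of %v rather than as a
/// rebuilt vector.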
12532 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
12533 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
12534 unsigned NumParts, bool &UseVecBaseAsInput) {
12535 UseVecBaseAsInput = false;
12536 if (Mask.empty())
12537 return nullptr;
12538 Value *VecBase = nullptr;
12539 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
12540 if (!E->ReorderIndices.empty()) {
12541 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
12542 E->ReorderIndices.end());
12543 reorderScalars(Scalars&: VL, Mask: ReorderMask);
12544 }
12545 // Check if this can be considered reused, i.e. the same extractelements
12546 // were already vectorized.
12547 bool PrevNodeFound = any_of(
12548 Range: ArrayRef(R.VectorizableTree).take_front(N: E->Idx),
12549 P: [&](const std::unique_ptr<TreeEntry> &TE) {
12550 return ((TE->hasState() && !TE->isAltShuffle() &&
12551 TE->getOpcode() == Instruction::ExtractElement) ||
12552 TE->isGather()) &&
12553 all_of(Range: enumerate(First&: TE->Scalars), P: [&](auto &&Data) {
12554 return VL.size() > Data.index() &&
12555 (Mask[Data.index()] == PoisonMaskElem ||
12556 isa<UndefValue>(VL[Data.index()]) ||
12557 Data.value() == VL[Data.index()]);
12558 });
12559 });
12560 SmallPtrSet<Value *, 4> UniqueBases;
12561 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
12562 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
12563 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
12564 unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
12565 ArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
12566 for (auto [I, V] :
12567 enumerate(First: ArrayRef(VL).slice(N: Part * SliceSize, M: Limit))) {
12568 // Ignore non-extractelement scalars.
12569 if (isa<UndefValue>(Val: V) ||
12570 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
12571 continue;
12572 // If all users of the instruction are going to be vectorized and this
12573 // instruction itself is not going to be vectorized, consider this
12574 // instruction as dead and remove its cost from the final cost of the
12575 // vectorized tree.
12576 // Also, avoid adjusting the cost for extractelements with multiple uses
12577 // in different graph entries.
12578 auto *EE = cast<ExtractElementInst>(Val: V);
12579 VecBase = EE->getVectorOperand();
12580 UniqueBases.insert(Ptr: VecBase);
12581 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
12582 if (!CheckedExtracts.insert(Ptr: V).second ||
12583 !R.areAllUsersVectorized(I: cast<Instruction>(Val: V), VectorizedVals: &VectorizedVals) ||
12584 any_of(Range: EE->users(),
12585 P: [&](User *U) {
12586 return isa<GetElementPtrInst>(Val: U) &&
12587 !R.areAllUsersVectorized(I: cast<Instruction>(Val: U),
12588 VectorizedVals: &VectorizedVals);
12589 }) ||
12590 (!VEs.empty() && !is_contained(Range&: VEs, Element: E)))
12591 continue;
12592 std::optional<unsigned> EEIdx = getExtractIndex(E: EE);
12593 if (!EEIdx)
12594 continue;
12595 unsigned Idx = *EEIdx;
12596 // Take credit for the instruction that will become dead.
12597 if (EE->hasOneUse() || !PrevNodeFound) {
12598 Instruction *Ext = EE->user_back();
12599 if (isa<SExtInst, ZExtInst>(Val: Ext) &&
12600 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
12601 // Use getExtractWithExtendCost() to calculate the cost of
12602 // extractelement/ext pair.
12603 Cost -= TTI.getExtractWithExtendCost(
12604 Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: EE->getVectorOperandType(),
12605 Index: Idx, CostKind);
12606 // Add back the cost of s|zext which is subtracted separately.
12607 Cost += TTI.getCastInstrCost(
12608 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: EE->getType(),
12609 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
12610 continue;
12611 }
12612 }
12613 APInt &DemandedElts =
12614 VectorOpsToExtracts
12615 .try_emplace(Key: VecBase,
12616 Args: APInt::getZero(numBits: getNumElements(Ty: VecBase->getType())))
12617 .first->getSecond();
12618 DemandedElts.setBit(Idx);
12619 }
12620 }
12621 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
12622 Cost -= TTI.getScalarizationOverhead(Ty: cast<VectorType>(Val: Vec->getType()),
12623 DemandedElts, /*Insert=*/false,
12624 /*Extract=*/true, CostKind);
12625 // Check that the gather of extractelements can be represented as just
12626 // a shuffle of one or two vectors from which the scalars are extracted.
12627 // Found a bunch of extractelement instructions that must be gathered
12628 // into a vector and can be represented as a permutation of elements of
12629 // a single input vector or of 2 input vectors.
12630 // Skipped for reuse if the same extractelements were already vectorized.
12631 if (!PrevNodeFound)
12632 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
12633 InVectors.assign(NumElts: 1, Elt: E);
12634 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
12635 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12636 SameNodesEstimated = false;
12637 if (NumParts != 1 && UniqueBases.size() != 1) {
12638 UseVecBaseAsInput = true;
12639 VecBase =
12640 Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
12641 }
12642 return VecBase;
12643 }
12644 /// Checks if the specified entry \p E needs to be delayed because of its
12645 /// dependency nodes.
12646 std::optional<InstructionCost>
12647 needToDelay(const TreeEntry *,
12648 ArrayRef<SmallVector<const TreeEntry *>>) const {
12649 // No need to delay the cost estimation during analysis.
12650 return std::nullopt;
12651 }
12652 /// Reset the builder to handle a perfect diamond match.
12653 void resetForSameNode() {
12654 IsFinalized = false;
12655 CommonMask.clear();
12656 InVectors.clear();
12657 Cost = 0;
12658 VectorizedVals.clear();
12659 SameNodesEstimated = true;
12660 }
12661 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
12662 if (&E1 == &E2) {
12663 assert(all_of(Mask,
12664 [&](int Idx) {
12665 return Idx < static_cast<int>(E1.getVectorFactor());
12666 }) &&
12667 "Expected single vector shuffle mask.");
12668 add(E1, Mask);
12669 return;
12670 }
12671 if (InVectors.empty()) {
12672 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
12673 InVectors.assign(IL: {&E1, &E2});
12674 return;
12675 }
12676 assert(!CommonMask.empty() && "Expected non-empty common mask.");
12677 auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
12678 unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size());
12679 unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
12680 const auto *It =
12681 find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; });
12682 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
12683 estimateNodesPermuteCost(E1, E2: &E2, Mask, Part, SliceSize);
12684 }
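// Illustrative example for the part selection in the add overloads (assumed
// register shape): with a 16-element mask split into NumParts == 2 parts,
// SliceSize == 8; if the first non-poison mask element sits at position 11,
// the shuffle is attributed to Part == 1 of the mask.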
12685 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
12686 if (InVectors.empty()) {
12687 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
12688 InVectors.assign(NumElts: 1, Elt: &E1);
12689 return;
12690 }
12691 assert(!CommonMask.empty() && "Expected non-empty common mask.");
12692 auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
12693 unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size());
12694 unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
12695 const auto *It =
12696 find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; });
12697 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
12698 estimateNodesPermuteCost(E1, E2: nullptr, Mask, Part, SliceSize);
12699 if (!SameNodesEstimated && InVectors.size() == 1)
12700 InVectors.emplace_back(Args: &E1);
12701 }
12702 /// Adds 2 input vectors and the mask for their shuffling.
12703 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
12704 // This may be reached only when shuffling 2 vectors of extractelements,
12705 // which were already handled in adjustExtracts.
12706 assert(InVectors.size() == 1 &&
12707 all_of(enumerate(CommonMask),
12708 [&](auto P) {
12709 if (P.value() == PoisonMaskElem)
12710 return Mask[P.index()] == PoisonMaskElem;
12711 auto *EI = cast<ExtractElementInst>(
12712 cast<const TreeEntry *>(InVectors.front())
12713 ->getOrdered(P.index()));
12714 return EI->getVectorOperand() == V1 ||
12715 EI->getVectorOperand() == V2;
12716 }) &&
12717 "Expected extractelement vectors.");
12718 }
12719 /// Adds one more input vector and the mask for the shuffling.
12720 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
12721 if (InVectors.empty()) {
12722 assert(CommonMask.empty() && !ForExtracts &&
12723 "Expected empty input mask/vectors.");
12724 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
12725 InVectors.assign(NumElts: 1, Elt: V1);
12726 return;
12727 }
12728 if (ForExtracts) {
12729 // No need to add vectors here; they were already handled in adjustExtracts.
12730 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
12731 !CommonMask.empty() &&
12732 all_of(enumerate(CommonMask),
12733 [&](auto P) {
12734 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
12735 ->getOrdered(P.index());
12736 if (P.value() == PoisonMaskElem)
12737 return P.value() == Mask[P.index()] ||
12738 isa<UndefValue>(Scalar);
12739 if (isa<Constant>(V1))
12740 return true;
12741 auto *EI = cast<ExtractElementInst>(Scalar);
12742 return EI->getVectorOperand() == V1;
12743 }) &&
12744 "Expected only tree entry for extractelement vectors.");
12745 return;
12746 }
12747 assert(!InVectors.empty() && !CommonMask.empty() &&
12748 "Expected only tree entries from extracts/reused buildvectors.");
12749 unsigned VF = getVF(V: V1);
12750 if (InVectors.size() == 2) {
12751 Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
12752 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12753 VF = std::max<unsigned>(a: VF, b: CommonMask.size());
12754 } else if (const auto *InTE =
12755 InVectors.front().dyn_cast<const TreeEntry *>()) {
12756 VF = std::max(a: VF, b: InTE->getVectorFactor());
12757 } else {
12758 VF = std::max(
12759 a: VF, b: cast<FixedVectorType>(Val: cast<Value *>(Val&: InVectors.front())->getType())
12760 ->getNumElements());
12761 }
12762 InVectors.push_back(Elt: V1);
12763 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12764 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
12765 CommonMask[Idx] = Mask[Idx] + VF;
12766 }
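// Illustrative example for the add(V1, Mask) overload above (assumed sizes):
// if the already accumulated input has VF == 8 and the new vector contributes
// lanes {0, 1}, those entries become {8, 9} in CommonMask, i.e. they are
// rebased past the first input so a two-source shuffle can address both.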
12767 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
12768 Value *Root = nullptr) {
12769 Cost += getBuildVectorCost(VL, Root);
12770 if (!Root) {
12771 // FIXME: Need to find a way to avoid use of getNullValue here.
12772 SmallVector<Constant *> Vals;
12773 unsigned VF = VL.size();
12774 if (MaskVF != 0)
12775 VF = std::min(a: VF, b: MaskVF);
12776 Type *VLScalarTy = VL.front()->getType();
12777 for (Value *V : VL.take_front(N: VF)) {
12778 Type *ScalarTy = VLScalarTy->getScalarType();
12779 if (isa<PoisonValue>(Val: V)) {
12780 Vals.push_back(Elt: PoisonValue::get(T: ScalarTy));
12781 continue;
12782 }
12783 if (isa<UndefValue>(Val: V)) {
12784 Vals.push_back(Elt: UndefValue::get(T: ScalarTy));
12785 continue;
12786 }
12787 Vals.push_back(Elt: Constant::getNullValue(Ty: ScalarTy));
12788 }
12789 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: VLScalarTy)) {
12790 assert(SLPReVec && "FixedVectorType is not expected.");
12791 // When REVEC is enabled, we need to expand vector types into scalar
12792 // types.
12793 Vals = replicateMask(Val: Vals, VF: VecTy->getNumElements());
12794 }
12795 return ConstantVector::get(V: Vals);
12796 }
12797 return ConstantVector::getSplat(
12798 EC: ElementCount::getFixed(
12799 MinVal: cast<FixedVectorType>(Val: Root->getType())->getNumElements()),
12800 Elt: getAllOnesValue(DL: *R.DL, Ty: ScalarTy->getScalarType()));
12801 }
12802 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
12803 /// Finalize emission of the shuffles.
12804 InstructionCost finalize(
12805 ArrayRef<int> ExtMask,
12806 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
12807 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
12808 function_ref<void(Value *&, SmallVectorImpl<int> &,
12809 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
12810 Action = {}) {
12811 IsFinalized = true;
12812 if (Action) {
12813 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
12814 if (InVectors.size() == 2)
12815 Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
12816 else
12817 Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
12818 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12819 assert(VF > 0 &&
12820 "Expected vector length for the final value before action.");
12821 Value *V = cast<Value *>(Val: Vec);
12822 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
12823 Cost += createShuffle(P1: V1, P2: V2, Mask);
12824 return V1;
12825 });
12826 InVectors.front() = V;
12827 }
12828 if (!SubVectors.empty()) {
12829 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
12830 if (InVectors.size() == 2)
12831 Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
12832 else
12833 Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
12834 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12835 // Add subvectors permutation cost.
12836 if (!SubVectorsMask.empty()) {
12837 assert(SubVectorsMask.size() <= CommonMask.size() &&
12838 "Expected same size of masks for subvectors and common mask.");
12839 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
12840 copy(Range&: SubVectorsMask, Out: SVMask.begin());
12841 for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) {
12842 if (I2 != PoisonMaskElem) {
12843 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
12844 I1 = I2 + CommonMask.size();
12845 }
12846 }
12847 Cost += ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
12848 Tp: getWidenedType(ScalarTy, VF: CommonMask.size()),
12849 Mask: SVMask, CostKind);
12850 }
12851 for (auto [E, Idx] : SubVectors) {
12852 Type *EScalarTy = E->Scalars.front()->getType();
12853 bool IsSigned = true;
12854 if (auto It = R.MinBWs.find(Val: E); It != R.MinBWs.end()) {
12855 EScalarTy =
12856 IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
12857 IsSigned = It->second.second;
12858 }
12859 if (ScalarTy != EScalarTy) {
12860 unsigned CastOpcode = Instruction::Trunc;
12861 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
12862 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
12863 if (DstSz > SrcSz)
12864 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
12865 Cost += TTI.getCastInstrCost(
12866 Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF: E->getVectorFactor()),
12867 Src: getWidenedType(ScalarTy: EScalarTy, VF: E->getVectorFactor()),
12868 CCH: TTI::CastContextHint::Normal, CostKind);
12869 }
12870 Cost += ::getShuffleCost(
12871 TTI, Kind: TTI::SK_InsertSubvector,
12872 Tp: getWidenedType(ScalarTy, VF: CommonMask.size()), Mask: {}, CostKind, Index: Idx,
12873 SubTp: getWidenedType(ScalarTy, VF: E->getVectorFactor()));
12874 if (!CommonMask.empty()) {
12875 std::iota(first: std::next(x: CommonMask.begin(), n: Idx),
12876 last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()),
12877 value: Idx);
12878 }
12879 }
12880 }
12881
12882 if (!ExtMask.empty()) {
12883 if (CommonMask.empty()) {
12884 CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
12885 } else {
12886 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12887 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12888 if (ExtMask[I] == PoisonMaskElem)
12889 continue;
12890 NewMask[I] = CommonMask[ExtMask[I]];
12891 }
12892 CommonMask.swap(RHS&: NewMask);
12893 }
12894 }
12895 if (CommonMask.empty()) {
12896 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
12897 return Cost;
12898 }
12899 return Cost +
12900 createShuffle(P1: InVectors.front(),
12901 P2: InVectors.size() == 2 ? InVectors.back() : nullptr,
12902 Mask: CommonMask);
12903 }
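// Illustrative note on the ExtMask composition in finalize above (assumed
// values): with CommonMask == {2, 0, 1, 3} and ExtMask == {1, 3}, the
// resulting mask is {CommonMask[1], CommonMask[3]} == {0, 3}, i.e. ExtMask
// selects lanes of the already shuffled value.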
12904
12905 ~ShuffleCostEstimator() {
12906 assert((IsFinalized || CommonMask.empty()) &&
12907 "Shuffle construction must be finalized.");
12908 }
12909};
12910
12911const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
12912 unsigned Idx) const {
12913 TreeEntry *Op = OperandsToTreeEntry.at(Val: {E, Idx});
12914 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
12915 return Op;
12916}
12917
12918TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
12919 if (TE.State == TreeEntry::ScatterVectorize ||
12920 TE.State == TreeEntry::StridedVectorize)
12921 return TTI::CastContextHint::GatherScatter;
12922 if (TE.State == TreeEntry::CompressVectorize)
12923 return TTI::CastContextHint::Masked;
12924 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
12925 !TE.isAltShuffle()) {
12926 if (TE.ReorderIndices.empty())
12927 return TTI::CastContextHint::Normal;
12928 SmallVector<int> Mask;
12929 inversePermutation(Indices: TE.ReorderIndices, Mask);
12930 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts: Mask.size()))
12931 return TTI::CastContextHint::Reversed;
12932 }
12933 return TTI::CastContextHint::None;
12934}
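// Illustrative example (assumed order): a vectorized load whose ReorderIndices
// are {3, 2, 1, 0} yields a reverse mask after inversePermutation, so the cast
// context hint is Reversed; scatter/strided loads report GatherScatter above.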
12935
12936InstructionCost
12937BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
12938 SmallPtrSetImpl<Value *> &CheckedExtracts) {
12939 ArrayRef<Value *> VL = E->Scalars;
12940
12941 Type *ScalarTy = getValueType(V: VL[0]);
12942 if (!isValidElementType(Ty: ScalarTy))
12943 return InstructionCost::getInvalid();
12944 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12945
12946 // If we have computed a smaller type for the expression, update VecTy so
12947 // that the costs will be accurate.
12948 auto It = MinBWs.find(Val: E);
12949 Type *OrigScalarTy = ScalarTy;
12950 if (It != MinBWs.end()) {
12951 auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy);
12952 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
12953 if (VecTy)
12954 ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements());
12955 }
12956 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
12957 unsigned EntryVF = E->getVectorFactor();
12958 auto *FinalVecTy = getWidenedType(ScalarTy, VF: EntryVF);
12959
12960 if (E->isGather()) {
12961 if (allConstant(VL))
12962 return 0;
12963 if (isa<InsertElementInst>(Val: VL[0]))
12964 return InstructionCost::getInvalid();
12965 if (isa<CmpInst>(Val: VL.front()))
12966 ScalarTy = VL.front()->getType();
12967 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
12968 E, ScalarTy, Params&: *TTI, Params&: VectorizedVals, Params&: *this, Params&: CheckedExtracts);
12969 }
12970 if (E->State == TreeEntry::SplitVectorize) {
12971 assert(E->CombinedEntriesWithIndices.size() == 2 &&
12972 "Expected exactly 2 combined entries.");
12973 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
12974 InstructionCost VectorCost = 0;
12975 if (E->ReorderIndices.empty()) {
12976 VectorCost = ::getShuffleCost(
12977 TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: FinalVecTy, Mask: {}, CostKind,
12978 Index: E->CombinedEntriesWithIndices.back().second,
12979 SubTp: getWidenedType(
12980 ScalarTy,
12981 VF: VectorizableTree[E->CombinedEntriesWithIndices.back().first]
12982 ->getVectorFactor()));
12983 } else {
12984 unsigned CommonVF =
12985 std::max(a: VectorizableTree[E->CombinedEntriesWithIndices.front().first]
12986 ->getVectorFactor(),
12987 b: VectorizableTree[E->CombinedEntriesWithIndices.back().first]
12988 ->getVectorFactor());
12989 VectorCost = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc,
12990 Tp: getWidenedType(ScalarTy, VF: CommonVF),
12991 Mask: E->getSplitMask(), CostKind);
12992 }
12993 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
12994 return VectorCost;
12995 }
12996 InstructionCost CommonCost = 0;
12997 SmallVector<int> Mask;
12998 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
12999 (E->State != TreeEntry::StridedVectorize ||
13000 !isReverseOrder(Order: E->ReorderIndices))) {
13001 SmallVector<int> NewMask;
13002 if (E->getOpcode() == Instruction::Store) {
13003 // For stores the order is actually a mask.
13004 NewMask.resize(N: E->ReorderIndices.size());
13005 copy(Range: E->ReorderIndices, Out: NewMask.begin());
13006 } else {
13007 inversePermutation(Indices: E->ReorderIndices, Mask&: NewMask);
13008 }
13009 ::addMask(Mask, SubMask: NewMask);
13010 }
13011 if (!E->ReuseShuffleIndices.empty())
13012 ::addMask(Mask, SubMask: E->ReuseShuffleIndices);
13013 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
13014 CommonCost =
13015 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
13016 assert((E->State == TreeEntry::Vectorize ||
13017 E->State == TreeEntry::ScatterVectorize ||
13018 E->State == TreeEntry::StridedVectorize ||
13019 E->State == TreeEntry::CompressVectorize) &&
13020 "Unhandled state");
13021 assert(E->getOpcode() &&
13022 ((allSameType(VL) && allSameBlock(VL)) ||
13023 (E->getOpcode() == Instruction::GetElementPtr &&
13024 E->getMainOp()->getType()->isPointerTy())) &&
13025 "Invalid VL");
13026 Instruction *VL0 = E->getMainOp();
13027 unsigned ShuffleOrOp =
13028 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
13029 if (E->CombinedOp != TreeEntry::NotCombinedOp)
13030 ShuffleOrOp = E->CombinedOp;
13031 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
13032 const unsigned Sz = UniqueValues.size();
13033 SmallBitVector UsedScalars(Sz, false);
13034 for (unsigned I = 0; I < Sz; ++I) {
13035 if (isa<Instruction>(Val: UniqueValues[I]) &&
13036 getTreeEntries(V: UniqueValues[I]).front() == E)
13037 continue;
13038 UsedScalars.set(I);
13039 }
13040 auto GetCastContextHint = [&](Value *V) {
13041 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
13042 return getCastContextHint(TE: *OpTEs.front());
13043 InstructionsState SrcState = getSameOpcode(VL: E->getOperand(OpIdx: 0), TLI: *TLI);
13044 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
13045 !SrcState.isAltShuffle())
13046 return TTI::CastContextHint::GatherScatter;
13047 return TTI::CastContextHint::None;
13048 };
13049 auto GetCostDiff =
13050 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
13051 function_ref<InstructionCost(InstructionCost)> VectorCost) {
13052 // Calculate the cost of this instruction.
13053 InstructionCost ScalarCost = 0;
13054 if (isa<CastInst, CallInst>(Val: VL0)) {
13055 // For some of the instructions there is no need to calculate the cost
13056 // for each particular instruction; we can use the cost of a single
13057 // instruction multiplied by the total number of scalar instructions.
13058 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
13059 } else {
13060 for (unsigned I = 0; I < Sz; ++I) {
13061 if (UsedScalars.test(Idx: I))
13062 continue;
13063 ScalarCost += ScalarEltCost(I);
13064 }
13065 }
13066
13067 InstructionCost VecCost = VectorCost(CommonCost);
13068 // Check if the current node must be resized, in case the parent node
13069 // has not been resized.
13070 if (It != MinBWs.end() && !UnaryInstruction::isCast(Opcode: E->getOpcode()) &&
13071 E->Idx != 0 &&
13072 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
13073 const EdgeInfo &EI = E->UserTreeIndex;
13074 if (!EI.UserTE->hasState() ||
13075 EI.UserTE->getOpcode() != Instruction::Select ||
13076 EI.EdgeIdx != 0) {
13077 auto UserBWIt = MinBWs.find(Val: EI.UserTE);
13078 Type *UserScalarTy =
13079 (EI.UserTE->isGather() ||
13080 EI.UserTE->State == TreeEntry::SplitVectorize)
13081 ? EI.UserTE->Scalars.front()->getType()
13082 : EI.UserTE->getOperand(OpIdx: EI.EdgeIdx).front()->getType();
13083 if (UserBWIt != MinBWs.end())
13084 UserScalarTy = IntegerType::get(C&: ScalarTy->getContext(),
13085 NumBits: UserBWIt->second.first);
13086 if (ScalarTy != UserScalarTy) {
13087 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
13088 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: UserScalarTy);
13089 unsigned VecOpcode;
13090 auto *UserVecTy = getWidenedType(ScalarTy: UserScalarTy, VF: E->Scalars.size());
13091 if (BWSz > SrcBWSz)
13092 VecOpcode = Instruction::Trunc;
13093 else
13094 VecOpcode =
13095 It->second.second ? Instruction::SExt : Instruction::ZExt;
13096 TTI::CastContextHint CCH = GetCastContextHint(VL0);
13097 VecCost += TTI->getCastInstrCost(Opcode: VecOpcode, Dst: UserVecTy, Src: VecTy, CCH,
13098 CostKind);
13099 }
13100 }
13101 }
13102 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
13103 ScalarCost, "Calculated costs for Tree"));
13104 return VecCost - ScalarCost;
13105 };
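// Illustrative note on the GetCostDiff helper above (assumed numbers): if the
// scalar instructions cost 4 in total and the vector form costs 1 plus a
// CommonCost of 1 for reordering, the returned difference is (1 + 1) - 4 ==
// -2, i.e. vectorizing this node is estimated to save 2.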
13106 // Calculate the cost difference from vectorizing a set of GEPs.
13107 // A negative value means vectorizing is profitable.
13108 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
13109 assert((E->State == TreeEntry::Vectorize ||
13110 E->State == TreeEntry::StridedVectorize ||
13111 E->State == TreeEntry::CompressVectorize) &&
13112 "Entry state expected to be Vectorize, StridedVectorize or "
13113 "CompressVectorize here.");
13114 InstructionCost ScalarCost = 0;
13115 InstructionCost VecCost = 0;
13116 std::tie(args&: ScalarCost, args&: VecCost) = getGEPCosts(
13117 TTI: *TTI, Ptrs, BasePtr, Opcode: E->getOpcode(), CostKind, ScalarTy: OrigScalarTy, VecTy);
13118 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
13119 "Calculated GEPs cost for Tree"));
13120
13121 return VecCost - ScalarCost;
13122 };
13123
13124 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
13125 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: VI ? VI : VL);
13126 if (MinMaxID == Intrinsic::not_intrinsic)
13127 return InstructionCost::getInvalid();
13128 Type *CanonicalType = Ty;
13129 if (CanonicalType->isPtrOrPtrVectorTy())
13130 CanonicalType = CanonicalType->getWithNewType(EltTy: IntegerType::get(
13131 C&: CanonicalType->getContext(),
13132 NumBits: DL->getTypeSizeInBits(Ty: CanonicalType->getScalarType())));
13133
13134 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
13135 {CanonicalType, CanonicalType});
13136 InstructionCost IntrinsicCost =
13137 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
13138 // If the selects are the only users of the compares, the compares will
13139 // be dead and we can subtract their cost.
13140 if (VI && SelectOnly) {
13141 assert((!Ty->isVectorTy() || SLPReVec) &&
13142 "Expected only for scalar type.");
13143 auto *CI = cast<CmpInst>(Val: VI->getOperand(i: 0));
13144 IntrinsicCost -= TTI->getCmpSelInstrCost(
13145 Opcode: CI->getOpcode(), ValTy: Ty, CondTy: Builder.getInt1Ty(), VecPred: CI->getPredicate(),
13146 CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
13147 Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CI);
13148 }
13149 return IntrinsicCost;
13150 };
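// Illustrative example for GetMinMaxCost above (simplified): a pattern such as
//   %c = icmp slt i32 %a, %b
//   %s = select i1 %c, i32 %a, i32 %b
// maps to the smin intrinsic; when the select is the compare's only user, the
// compare's cost is subtracted because it dies together with the select.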
13151 switch (ShuffleOrOp) {
13152 case Instruction::PHI: {
13153 // Count reused scalars.
13154 InstructionCost ScalarCost = 0;
13155 SmallPtrSet<const TreeEntry *, 4> CountedOps;
13156 for (Value *V : UniqueValues) {
13157 auto *PHI = dyn_cast<PHINode>(Val: V);
13158 if (!PHI)
13159 continue;
13160
13161 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
13162 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
13163 Value *Op = PHI->getIncomingValue(i: I);
13164 Operands[I] = Op;
13165 }
13166 if (const TreeEntry *OpTE =
13167 getSameValuesTreeEntry(V: Operands.front(), VL: Operands))
13168 if (CountedOps.insert(Ptr: OpTE).second &&
13169 !OpTE->ReuseShuffleIndices.empty())
13170 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
13171 OpTE->Scalars.size());
13172 }
13173
13174 return CommonCost - ScalarCost;
13175 }
13176 case Instruction::ExtractValue:
13177 case Instruction::ExtractElement: {
13178 APInt DemandedElts;
13179 VectorType *SrcVecTy = nullptr;
13180 auto GetScalarCost = [&](unsigned Idx) {
13181 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
13182 return InstructionCost(TTI::TCC_Free);
13183
13184 auto *I = cast<Instruction>(Val: UniqueValues[Idx]);
13185 if (!SrcVecTy) {
13186 if (ShuffleOrOp == Instruction::ExtractElement) {
13187 auto *EE = cast<ExtractElementInst>(Val: I);
13188 SrcVecTy = EE->getVectorOperandType();
13189 } else {
13190 auto *EV = cast<ExtractValueInst>(Val: I);
13191 Type *AggregateTy = EV->getAggregateOperand()->getType();
13192 unsigned NumElts;
13193 if (auto *ATy = dyn_cast<ArrayType>(Val: AggregateTy))
13194 NumElts = ATy->getNumElements();
13195 else
13196 NumElts = AggregateTy->getStructNumElements();
13197 SrcVecTy = getWidenedType(ScalarTy: OrigScalarTy, VF: NumElts);
13198 }
13199 }
13200 if (I->hasOneUse()) {
13201 Instruction *Ext = I->user_back();
13202 if ((isa<SExtInst>(Val: Ext) || isa<ZExtInst>(Val: Ext)) &&
13203 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
13204 // Use getExtractWithExtendCost() to calculate the cost of
13205 // extractelement/ext pair.
13206 InstructionCost Cost = TTI->getExtractWithExtendCost(
13207 Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: SrcVecTy, Index: *getExtractIndex(E: I),
13208 CostKind);
13209 // Subtract the cost of s|zext, which is accounted for separately.
13210 Cost -= TTI->getCastInstrCost(
13211 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: I->getType(),
13212 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
13213 return Cost;
13214 }
13215 }
13216 if (DemandedElts.isZero())
13217 DemandedElts = APInt::getZero(numBits: getNumElements(Ty: SrcVecTy));
13218 DemandedElts.setBit(*getExtractIndex(E: I));
13219 return InstructionCost(TTI::TCC_Free);
13220 };
13221 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
13222 return CommonCost - (DemandedElts.isZero()
13223 ? TTI::TCC_Free
13224 : TTI.getScalarizationOverhead(
13225 Ty: SrcVecTy, DemandedElts, /*Insert=*/false,
13226 /*Extract=*/true, CostKind));
13227 };
13228 return GetCostDiff(GetScalarCost, GetVectorCost);
13229 }
13230 case Instruction::InsertElement: {
13231 assert(E->ReuseShuffleIndices.empty() &&
13232 "Only unique insertelements are expected.");
13233 auto *SrcVecTy = cast<FixedVectorType>(Val: VL0->getType());
13234 unsigned const NumElts = SrcVecTy->getNumElements();
13235 unsigned const NumScalars = VL.size();
13236
13237 unsigned NumOfParts = ::getNumberOfParts(TTI: *TTI, VecTy: SrcVecTy);
13238
13239 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
13240 unsigned OffsetBeg = *getElementIndex(Inst: VL.front());
13241 unsigned OffsetEnd = OffsetBeg;
13242 InsertMask[OffsetBeg] = 0;
13243 for (auto [I, V] : enumerate(First: VL.drop_front())) {
13244 unsigned Idx = *getElementIndex(Inst: V);
13245 if (OffsetBeg > Idx)
13246 OffsetBeg = Idx;
13247 else if (OffsetEnd < Idx)
13248 OffsetEnd = Idx;
13249 InsertMask[Idx] = I + 1;
13250 }
13251 unsigned VecScalarsSz = PowerOf2Ceil(A: NumElts);
13252 if (NumOfParts > 0 && NumOfParts < NumElts)
13253 VecScalarsSz = PowerOf2Ceil(A: (NumElts + NumOfParts - 1) / NumOfParts);
13254 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
13255 VecScalarsSz;
13256 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
13257 unsigned InsertVecSz = std::min<unsigned>(
13258 a: PowerOf2Ceil(A: OffsetEnd - OffsetBeg + 1),
13259 b: ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
13260 bool IsWholeSubvector =
13261 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
13262 // Check if we can safely insert a subvector. If it is not possible, just
13263 // generate a whole-sized vector and shuffle the source vector and the new
13264 // subvector.
13265 if (OffsetBeg + InsertVecSz > VecSz) {
13266 // Align OffsetBeg to generate correct mask.
13267 OffsetBeg = alignDown(Value: OffsetBeg, Align: VecSz, Skew: Offset);
13268 InsertVecSz = VecSz;
13269 }
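// Illustrative example (assumed indices): inserting scalars at positions
// {2, 3, 4, 5} of an 8-wide destination gives OffsetBeg == 2, OffsetEnd == 5,
// VecScalarsSz == 8, VecSz == 8, Offset == 0 and InsertVecSz == 4 (rounded up
// to a power of two), i.e. the inserts are modeled as a 4-wide subvector.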
13270
13271 APInt DemandedElts = APInt::getZero(numBits: NumElts);
13272 // TODO: Add support for Instruction::InsertValue.
13273 SmallVector<int> Mask;
13274 if (!E->ReorderIndices.empty()) {
13275 inversePermutation(Indices: E->ReorderIndices, Mask);
13276 Mask.append(NumInputs: InsertVecSz - Mask.size(), Elt: PoisonMaskElem);
13277 } else {
13278 Mask.assign(NumElts: VecSz, Elt: PoisonMaskElem);
13279 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: InsertVecSz), value: 0);
13280 }
13281 bool IsIdentity = true;
13282 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
13283 Mask.swap(RHS&: PrevMask);
13284 for (unsigned I = 0; I < NumScalars; ++I) {
13285 unsigned InsertIdx = *getElementIndex(Inst: VL[PrevMask[I]]);
13286 DemandedElts.setBit(InsertIdx);
13287 IsIdentity &= InsertIdx - OffsetBeg == I;
13288 Mask[InsertIdx - OffsetBeg] = I;
13289 }
13290 assert(Offset < NumElts && "Failed to find vector index offset");
13291
13292 InstructionCost Cost = 0;
13293 Cost -=
13294 getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: SrcVecTy, DemandedElts,
13295 /*Insert*/ true, /*Extract*/ false, CostKind);
13296
13297 // First cost - resize to the actual vector size if this is not an
13298 // identity shuffle or the vector needs to be shifted.
13299 // Do not calculate the cost if the actual size is the register size and
13300 // we can merge this shuffle with the following SK_Select.
13301 auto *InsertVecTy = getWidenedType(ScalarTy, VF: InsertVecSz);
13302 if (!IsIdentity)
13303 Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc,
13304 Tp: InsertVecTy, Mask);
13305 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: [E](Value *V) {
13306 return !is_contained(Range: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
13307 }));
13308 // Second cost - permutation with a subvector, if some elements are from
13309 // the initial vector or a subvector is being inserted.
13310 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
13311 // subvector of ActualVecTy.
13312 SmallBitVector InMask =
13313 isUndefVector(V: FirstInsert->getOperand(i: 0),
13314 UseMask: buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask));
13315 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
13316 if (InsertVecSz != VecSz) {
13317 auto *ActualVecTy = getWidenedType(ScalarTy, VF: VecSz);
13318 Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: ActualVecTy, Mask: {},
13319 CostKind, Index: OffsetBeg - Offset, SubTp: InsertVecTy);
13320 } else {
13321 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
13322 Mask[I] = InMask.test(Idx: I) ? PoisonMaskElem : I;
13323 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
13324 I <= End; ++I)
13325 if (Mask[I] != PoisonMaskElem)
13326 Mask[I] = I + VecSz;
13327 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
13328 Mask[I] =
13329 ((I >= InMask.size()) || InMask.test(Idx: I)) ? PoisonMaskElem : I;
13330 Cost +=
13331 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: InsertVecTy, Mask);
13332 }
13333 }
13334 return Cost;
13335 }
13336 case Instruction::ZExt:
13337 case Instruction::SExt:
13338 case Instruction::FPToUI:
13339 case Instruction::FPToSI:
13340 case Instruction::FPExt:
13341 case Instruction::PtrToInt:
13342 case Instruction::IntToPtr:
13343 case Instruction::SIToFP:
13344 case Instruction::UIToFP:
13345 case Instruction::Trunc:
13346 case Instruction::FPTrunc:
13347 case Instruction::BitCast: {
13348 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
13349 Type *SrcScalarTy = VL0->getOperand(i: 0)->getType();
13350 auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size());
13351 unsigned Opcode = ShuffleOrOp;
13352 unsigned VecOpcode = Opcode;
13353 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
13354 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
13355 // Check if the values are candidates to demote.
13356 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy->getScalarType());
13357 if (SrcIt != MinBWs.end()) {
13358 SrcBWSz = SrcIt->second.first;
13359 unsigned SrcScalarTyNumElements = getNumElements(Ty: SrcScalarTy);
13360 SrcScalarTy = IntegerType::get(C&: F->getContext(), NumBits: SrcBWSz);
13361 SrcVecTy =
13362 getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size() * SrcScalarTyNumElements);
13363 }
13364 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType());
13365 if (BWSz == SrcBWSz) {
13366 VecOpcode = Instruction::BitCast;
13367 } else if (BWSz < SrcBWSz) {
13368 VecOpcode = Instruction::Trunc;
13369 } else if (It != MinBWs.end()) {
13370 assert(BWSz > SrcBWSz && "Invalid cast!");
13371 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
13372 } else if (SrcIt != MinBWs.end()) {
13373 assert(BWSz > SrcBWSz && "Invalid cast!");
13374 VecOpcode =
13375 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
13376 }
13377 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
13378 !SrcIt->second.second) {
13379 VecOpcode = Instruction::UIToFP;
13380 }
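// Illustrative example for the VecOpcode selection above (assumed bit widths):
// for a zext from i8 to i32 where MinBWs shows only 8 bits of the result are
// needed, BWSz == SrcBWSz == 8 and the vector cast degenerates to a free
// bitcast; if 16 bits are still needed, it becomes a zext/sext to i16 instead.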
13381 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
13382 assert(Idx == 0 && "Expected 0 index only");
13383 return TTI->getCastInstrCost(Opcode, Dst: VL0->getType(),
13384 Src: VL0->getOperand(i: 0)->getType(),
13385 CCH: TTI::getCastContextHint(I: VL0), CostKind, I: VL0);
13386 };
13387 auto GetVectorCost = [=](InstructionCost CommonCost) {
13388 // Do not count cost here if minimum bitwidth is in effect and it is just
13389 // a bitcast (here it is just a noop).
13390 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
13391 return CommonCost;
13392 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
13393 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(i: 0));
13394
13395 bool IsArithmeticExtendedReduction =
13396 E->Idx == 0 && UserIgnoreList &&
13397 all_of(Range: *UserIgnoreList, P: [](Value *V) {
13398 auto *I = cast<Instruction>(Val: V);
13399 return is_contained(Set: {Instruction::Add, Instruction::FAdd,
13400 Instruction::Mul, Instruction::FMul,
13401 Instruction::And, Instruction::Or,
13402 Instruction::Xor},
13403 Element: I->getOpcode());
13404 });
13405 if (IsArithmeticExtendedReduction &&
13406 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
13407 return CommonCost;
13408 return CommonCost +
13409 TTI->getCastInstrCost(Opcode: VecOpcode, Dst: VecTy, Src: SrcVecTy, CCH, CostKind,
13410 I: VecOpcode == Opcode ? VI : nullptr);
13411 };
13412 return GetCostDiff(GetScalarCost, GetVectorCost);
13413 }
13414 case Instruction::FCmp:
13415 case Instruction::ICmp:
13416 case Instruction::Select: {
13417 CmpPredicate VecPred, SwappedVecPred;
13418 auto MatchCmp = m_Cmp(Pred&: VecPred, L: m_Value(), R: m_Value());
13419 if (match(V: VL0, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) ||
13420 match(V: VL0, P: MatchCmp))
13421 SwappedVecPred = CmpInst::getSwappedPredicate(pred: VecPred);
13422 else
13423 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
13424 ? CmpInst::BAD_FCMP_PREDICATE
13425 : CmpInst::BAD_ICMP_PREDICATE;
13426 auto GetScalarCost = [&](unsigned Idx) {
13427 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
13428 return InstructionCost(TTI::TCC_Free);
13429
13430 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
13431 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
13432 ? CmpInst::BAD_FCMP_PREDICATE
13433 : CmpInst::BAD_ICMP_PREDICATE;
13434 auto MatchCmp = m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value());
13435 if ((!match(V: VI, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) &&
13436 !match(V: VI, P: MatchCmp)) ||
13437 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
13438 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
13439 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
13440 ? CmpInst::BAD_FCMP_PREDICATE
13441 : CmpInst::BAD_ICMP_PREDICATE;
13442
13443 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
13444 Opcode: E->getOpcode(), ValTy: OrigScalarTy, CondTy: Builder.getInt1Ty(), VecPred: CurrentPred,
13445 CostKind, Op1Info: getOperandInfo(Ops: VI->getOperand(i: 0)),
13446 Op2Info: getOperandInfo(Ops: VI->getOperand(i: 1)), I: VI);
13447 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
13448 if (IntrinsicCost.isValid())
13449 ScalarCost = IntrinsicCost;
13450
13451 return ScalarCost;
13452 };
13453 auto GetVectorCost = [&](InstructionCost CommonCost) {
13454 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
13455
13456 InstructionCost VecCost =
13457 TTI->getCmpSelInstrCost(Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred,
13458 CostKind, Op1Info: getOperandInfo(Ops: E->getOperand(OpIdx: 0)),
13459 Op2Info: getOperandInfo(Ops: E->getOperand(OpIdx: 1)), I: VL0);
13460 if (auto *SI = dyn_cast<SelectInst>(Val: VL0)) {
13461 auto *CondType =
13462 getWidenedType(ScalarTy: SI->getCondition()->getType(), VF: VL.size());
13463 unsigned CondNumElements = CondType->getNumElements();
13464 unsigned VecTyNumElements = getNumElements(Ty: VecTy);
13465 assert(VecTyNumElements >= CondNumElements &&
13466 VecTyNumElements % CondNumElements == 0 &&
13467 "Cannot vectorize Instruction::Select");
13468 if (CondNumElements != VecTyNumElements) {
13469 // When the return type is i1 but the source is a fixed vector type, we
13470 // need to duplicate the condition value.
13471 VecCost += ::getShuffleCost(
13472 TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: CondType,
13473 Mask: createReplicatedMask(ReplicationFactor: VecTyNumElements / CondNumElements,
13474 VF: CondNumElements));
13475 }
13476 }
13477 return VecCost + CommonCost;
13478 };
13479 return GetCostDiff(GetScalarCost, GetVectorCost);
13480 }
13481 case TreeEntry::MinMax: {
13482 auto GetScalarCost = [&](unsigned Idx) {
13483 return GetMinMaxCost(OrigScalarTy);
13484 };
13485 auto GetVectorCost = [&](InstructionCost CommonCost) {
13486 InstructionCost VecCost = GetMinMaxCost(VecTy);
13487 return VecCost + CommonCost;
13488 };
13489 return GetCostDiff(GetScalarCost, GetVectorCost);
13490 }
13491 case Instruction::FNeg:
13492 case Instruction::Add:
13493 case Instruction::FAdd:
13494 case Instruction::Sub:
13495 case Instruction::FSub:
13496 case Instruction::Mul:
13497 case Instruction::FMul:
13498 case Instruction::UDiv:
13499 case Instruction::SDiv:
13500 case Instruction::FDiv:
13501 case Instruction::URem:
13502 case Instruction::SRem:
13503 case Instruction::FRem:
13504 case Instruction::Shl:
13505 case Instruction::LShr:
13506 case Instruction::AShr:
13507 case Instruction::And:
13508 case Instruction::Or:
13509 case Instruction::Xor: {
13510 auto GetScalarCost = [&](unsigned Idx) {
13511 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
13512 return InstructionCost(TTI::TCC_Free);
13513
13514 // We cannot retrieve the operand from UniqueValues[Idx] because an
13515 // interchangeable instruction may be used. The order and the actual
13516 // operand might differ from what is retrieved from UniqueValues[Idx].
13517 Value *Op1 = E->getOperand(OpIdx: 0)[Idx];
13518 Value *Op2;
13519 SmallVector<const Value *, 2> Operands(1, Op1);
13520 if (isa<UnaryOperator>(Val: UniqueValues[Idx])) {
13521 Op2 = Op1;
13522 } else {
13523 Op2 = E->getOperand(OpIdx: 1)[Idx];
13524 Operands.push_back(Elt: Op2);
13525 }
13526 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(V: Op1);
13527 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(V: Op2);
13528 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: OrigScalarTy, CostKind,
13529 Opd1Info: Op1Info, Opd2Info: Op2Info, Args: Operands);
13530 };
13531 auto GetVectorCost = [=](InstructionCost CommonCost) {
13532 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
13533 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
13534 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
13535 if (all_of(Range&: Ops, P: [&](Value *Op) {
13536 auto *CI = dyn_cast<ConstantInt>(Val: Op);
13537 return CI && CI->getValue().countr_one() >= It->second.first;
13538 }))
13539 return CommonCost;
13540 }
13541 }
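// Illustrative example (assumed constants): if the expression was demoted to
// i16 and an operand of the 'and' is a constant such as 0xFFFF (at least 16
// trailing ones), the mask cannot change the demoted bits, so only CommonCost
// is charged for the vector 'and'.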
13542 unsigned OpIdx = isa<UnaryOperator>(Val: VL0) ? 0 : 1;
13543 TTI::OperandValueInfo Op1Info = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
13544 TTI::OperandValueInfo Op2Info = getOperandInfo(Ops: E->getOperand(OpIdx));
13545 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: VecTy, CostKind, Opd1Info: Op1Info,
13546 Opd2Info: Op2Info, Args: {}, CxtI: nullptr, TLibInfo: TLI) +
13547 CommonCost;
13548 };
13549 return GetCostDiff(GetScalarCost, GetVectorCost);
13550 }
13551 case Instruction::GetElementPtr: {
13552 return CommonCost + GetGEPCostDiff(VL, VL0);
13553 }
13554 case Instruction::Load: {
13555 auto GetScalarCost = [&](unsigned Idx) {
13556 auto *VI = cast<LoadInst>(Val: UniqueValues[Idx]);
13557 return TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: OrigScalarTy,
13558 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
13559 CostKind, OpdInfo: TTI::OperandValueInfo(), I: VI);
13560 };
13561 auto *LI0 = cast<LoadInst>(Val: VL0);
13562 auto GetVectorCost = [&](InstructionCost CommonCost) {
13563 InstructionCost VecLdCost;
13564 switch (E->State) {
13565 case TreeEntry::Vectorize:
13566 if (unsigned Factor = E->getInterleaveFactor()) {
13567 VecLdCost = TTI->getInterleavedMemoryOpCost(
13568 Opcode: Instruction::Load, VecTy, Factor, Indices: {}, Alignment: LI0->getAlign(),
13569 AddressSpace: LI0->getPointerAddressSpace(), CostKind);
13570
13571 } else {
13572 VecLdCost = TTI->getMemoryOpCost(
13573 Opcode: Instruction::Load, Src: VecTy, Alignment: LI0->getAlign(),
13574 AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
13575 }
13576 break;
13577 case TreeEntry::StridedVectorize: {
13578 Align CommonAlignment =
13579 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
13580 VecLdCost = TTI->getStridedMemoryOpCost(
13581 Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(),
13582 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
13583 break;
13584 }
13585 case TreeEntry::CompressVectorize: {
13586 bool IsMasked;
13587 unsigned InterleaveFactor;
13588 SmallVector<int> CompressMask;
13589 VectorType *LoadVecTy;
13590 SmallVector<Value *> Scalars(VL);
13591 if (!E->ReorderIndices.empty()) {
13592 SmallVector<int> Mask(E->ReorderIndices.begin(),
13593 E->ReorderIndices.end());
13594 reorderScalars(Scalars, Mask);
13595 }
13596 SmallVector<Value *> PointerOps(Scalars.size());
13597 for (auto [I, V] : enumerate(First&: Scalars))
13598 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
13599 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
13600 VL: Scalars, PointerOps, Order: E->ReorderIndices, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT,
13601 TLI: *TLI, AreAllUsersVectorized: [](Value *) { return true; }, IsMasked, InterleaveFactor,
13602 CompressMask, LoadVecTy);
13603 assert(IsVectorized && "Failed to vectorize load");
13604 CompressEntryToData.try_emplace(Key: E, Args&: CompressMask, Args&: LoadVecTy,
13605 Args&: InterleaveFactor, Args&: IsMasked);
13606 Align CommonAlignment = LI0->getAlign();
13607 if (InterleaveFactor) {
13608 VecLdCost = TTI->getInterleavedMemoryOpCost(
13609 Opcode: Instruction::Load, VecTy: LoadVecTy, Factor: InterleaveFactor, Indices: {},
13610 Alignment: CommonAlignment, AddressSpace: LI0->getPointerAddressSpace(), CostKind);
13611 } else if (IsMasked) {
13612 VecLdCost = TTI->getMaskedMemoryOpCost(
13613 Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
13614 AddressSpace: LI0->getPointerAddressSpace(), CostKind);
13615 // TODO: include this cost into CommonCost.
13616 VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
13617 Tp: LoadVecTy, Mask: CompressMask, CostKind);
13618 } else {
13619 VecLdCost = TTI->getMemoryOpCost(
13620 Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
13621 AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
13622 // TODO: include this cost into CommonCost.
13623 VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
13624 Tp: LoadVecTy, Mask: CompressMask, CostKind);
13625 }
13626 break;
13627 }
13628 case TreeEntry::ScatterVectorize: {
13629 Align CommonAlignment =
13630 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
13631 VecLdCost = TTI->getGatherScatterOpCost(
13632 Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(),
13633 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
13634 break;
13635 }
13636 case TreeEntry::CombinedVectorize:
13637 case TreeEntry::SplitVectorize:
13638 case TreeEntry::NeedToGather:
13639 llvm_unreachable("Unexpected vectorization state.");
13640 }
13641 return VecLdCost + CommonCost;
13642 };
13643
13644 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
// If this node generates a masked gather load, it is not a terminal node;
// hence the address operand cost is estimated separately.
13647 if (E->State == TreeEntry::ScatterVectorize)
13648 return Cost;
13649
13650 // Estimate cost of GEPs since this tree node is a terminator.
13651 SmallVector<Value *> PointerOps(VL.size());
13652 for (auto [I, V] : enumerate(First&: VL))
13653 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
13654 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
13655 }
13656 case Instruction::Store: {
13657 bool IsReorder = !E->ReorderIndices.empty();
13658 auto GetScalarCost = [=](unsigned Idx) {
13659 auto *VI = cast<StoreInst>(Val: VL[Idx]);
13660 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: VI->getValueOperand());
13661 return TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: OrigScalarTy,
13662 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
13663 CostKind, OpdInfo: OpInfo, I: VI);
13664 };
13665 auto *BaseSI =
13666 cast<StoreInst>(Val: IsReorder ? VL[E->ReorderIndices.front()] : VL0);
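// The vectorized store is either strided, interleaved, or a plain
// consecutive vector store, depending on the node state and interleave
// factor.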
13667 auto GetVectorCost = [=](InstructionCost CommonCost) {
13668 // We know that we can merge the stores. Calculate the cost.
13669 InstructionCost VecStCost;
13670 if (E->State == TreeEntry::StridedVectorize) {
13671 Align CommonAlignment =
13672 computeCommonAlignment<StoreInst>(VL: UniqueValues.getArrayRef());
13673 VecStCost = TTI->getStridedMemoryOpCost(
13674 Opcode: Instruction::Store, DataTy: VecTy, Ptr: BaseSI->getPointerOperand(),
13675 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
13676 } else {
13677 assert(E->State == TreeEntry::Vectorize &&
13678 "Expected either strided or consecutive stores.");
13679 if (unsigned Factor = E->getInterleaveFactor()) {
13680 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
13681 "No reused shuffles expected");
13682 CommonCost = 0;
13683 VecStCost = TTI->getInterleavedMemoryOpCost(
13684 Opcode: Instruction::Store, VecTy, Factor, Indices: {}, Alignment: BaseSI->getAlign(),
13685 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind);
13686 } else {
13687 TTI::OperandValueInfo OpInfo = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
13688 VecStCost = TTI->getMemoryOpCost(
13689 Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
13690 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind, OpdInfo: OpInfo);
13691 }
13692 }
13693 return VecStCost + CommonCost;
13694 };
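// Collect the pointer operands in vectorized order (applying the reorder
// indices, if any) so the GEP cost difference is computed against the
// vectorized store layout.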
13695 SmallVector<Value *> PointerOps(VL.size());
13696 for (auto [I, V] : enumerate(First&: VL)) {
13697 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
13698 PointerOps[Idx] = cast<StoreInst>(Val: V)->getPointerOperand();
13699 }
13700
13701 return GetCostDiff(GetScalarCost, GetVectorCost) +
13702 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
13703 }
13704 case Instruction::Call: {
13705 auto GetScalarCost = [&](unsigned Idx) {
13706 auto *CI = cast<CallInst>(Val: UniqueValues[Idx]);
13707 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13708 if (ID != Intrinsic::not_intrinsic) {
13709 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
13710 return TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
13711 }
13712 return TTI->getCallInstrCost(F: CI->getCalledFunction(),
13713 RetTy: CI->getFunctionType()->getReturnType(),
13714 Tys: CI->getFunctionType()->params(), CostKind);
13715 };
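// For the vector form, take the cheaper of a vector intrinsic and a vector
// library call, as computed by getVectorCallCosts.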
13716 auto GetVectorCost = [=](InstructionCost CommonCost) {
13717 auto *CI = cast<CallInst>(Val: VL0);
13718 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13719 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
13720 CI, ID, VF: VecTy->getNumElements(),
13721 MinBW: It != MinBWs.end() ? It->second.first : 0, TTI);
13722 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
13723 return std::min(a: VecCallCosts.first, b: VecCallCosts.second) + CommonCost;
13724 };
13725 return GetCostDiff(GetScalarCost, GetVectorCost);
13726 }
13727 case Instruction::ShuffleVector: {
13728 if (!SLPReVec || E->isAltShuffle())
13729 assert(E->isAltShuffle() &&
13730 ((Instruction::isBinaryOp(E->getOpcode()) &&
13731 Instruction::isBinaryOp(E->getAltOpcode())) ||
13732 (Instruction::isCast(E->getOpcode()) &&
13733 Instruction::isCast(E->getAltOpcode())) ||
13734 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13735 "Invalid Shuffle Vector Operand");
13736 // Try to find the previous shuffle node with the same operands and same
13737 // main/alternate ops.
13738 auto TryFindNodeWithEqualOperands = [=]() {
13739 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13740 if (TE.get() == E)
13741 break;
13742 if (TE->hasState() && TE->isAltShuffle() &&
13743 ((TE->getOpcode() == E->getOpcode() &&
13744 TE->getAltOpcode() == E->getAltOpcode()) ||
13745 (TE->getOpcode() == E->getAltOpcode() &&
13746 TE->getAltOpcode() == E->getOpcode())) &&
13747 TE->hasEqualOperands(TE: *E))
13748 return true;
13749 }
13750 return false;
13751 };
13752 auto GetScalarCost = [&](unsigned Idx) {
13753 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
13754 return InstructionCost(TTI::TCC_Free);
13755
13756 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
13757 assert(E->getMatchingMainOpOrAltOp(VI) &&
13758 "Unexpected main/alternate opcode");
13759 (void)E;
13760 return TTI->getInstructionCost(U: VI, CostKind);
13761 };
// Need to clear CommonCost since the final shuffle cost is included in the
// vector cost.
13764 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
13765 // VecCost is equal to sum of the cost of creating 2 vectors
13766 // and the cost of creating shuffle.
13767 InstructionCost VecCost = 0;
13768 if (TryFindNodeWithEqualOperands()) {
13769 LLVM_DEBUG({
13770 dbgs() << "SLP: diamond match for alternate node found.\n";
13771 E->dump();
13772 });
// No need to add new vector costs here since we're going to reuse the same
// main/alternate vector ops and just do different shuffling.
13775 } else if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
13776 VecCost =
13777 TTIRef.getArithmeticInstrCost(Opcode: E->getOpcode(), Ty: VecTy, CostKind);
13778 VecCost +=
13779 TTIRef.getArithmeticInstrCost(Opcode: E->getAltOpcode(), Ty: VecTy, CostKind);
13780 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
13781 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
VecCost = TTIRef.getCmpSelInstrCost(
    E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
    {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
    VL0);
VecCost += TTIRef.getCmpSelInstrCost(
    E->getOpcode(), VecTy, MaskTy,
    cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
    {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
    E->getAltOp());
13791 } else {
13792 Type *SrcSclTy = E->getMainOp()->getOperand(i: 0)->getType();
13793 auto *SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
13794 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
13795 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
13796 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
13797 unsigned SrcBWSz =
13798 DL->getTypeSizeInBits(Ty: E->getMainOp()->getOperand(i: 0)->getType());
13799 if (SrcIt != MinBWs.end()) {
13800 SrcBWSz = SrcIt->second.first;
13801 SrcSclTy = IntegerType::get(C&: SrcSclTy->getContext(), NumBits: SrcBWSz);
13802 SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
13803 }
13804 if (BWSz <= SrcBWSz) {
13805 if (BWSz < SrcBWSz)
13806 VecCost =
13807 TTIRef.getCastInstrCost(Opcode: Instruction::Trunc, Dst: VecTy, Src: SrcTy,
13808 CCH: TTI::CastContextHint::None, CostKind);
13809 LLVM_DEBUG({
13810 dbgs()
13811 << "SLP: alternate extension, which should be truncated.\n";
13812 E->dump();
13813 });
13814 return VecCost;
13815 }
13816 }
13817 VecCost = TTIRef.getCastInstrCost(Opcode: E->getOpcode(), Dst: VecTy, Src: SrcTy,
13818 CCH: TTI::CastContextHint::None, CostKind);
13819 VecCost +=
13820 TTIRef.getCastInstrCost(Opcode: E->getAltOpcode(), Dst: VecTy, Src: SrcTy,
13821 CCH: TTI::CastContextHint::None, CostKind);
13822 }
13823 SmallVector<int> Mask;
13824 E->buildAltOpShuffleMask(
13825 IsAltOp: [&](Instruction *I) {
13826 assert(E->getMatchingMainOpOrAltOp(I) &&
13827 "Unexpected main/alternate opcode");
13828 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
13829 TLI: *TLI);
13830 },
13831 Mask);
13832 VecCost += ::getShuffleCost(TTI: TTIRef, Kind: TargetTransformInfo::SK_PermuteTwoSrc,
13833 Tp: FinalVecTy, Mask, CostKind);
// Patterns like [fadd,fsub] can be combined into a single instruction on
// x86. Reordering them into [fsub,fadd] blocks this pattern. So we need to
// take their order into account when looking for the most used order.
13838 unsigned Opcode0 = E->getOpcode();
13839 unsigned Opcode1 = E->getAltOpcode();
13840 SmallBitVector OpcodeMask(
13841 getAltInstrMask(VL: E->Scalars, ScalarTy, Opcode0, Opcode1));
13842 // If this pattern is supported by the target then we consider the
13843 // order.
13844 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
13845 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
13846 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
13847 return AltVecCost < VecCost ? AltVecCost : VecCost;
13848 }
13849 // TODO: Check the reverse order too.
13850 return VecCost;
13851 };
13852 if (SLPReVec && !E->isAltShuffle())
13853 return GetCostDiff(
13854 GetScalarCost, [&](InstructionCost) -> InstructionCost {
// If a group uses its mask in order, the shufflevector can be eliminated by
// instcombine and the cost is 0.
13857 assert(isa<ShuffleVectorInst>(VL.front()) &&
13858 "Not supported shufflevector usage.");
13859 auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
13860 unsigned SVNumElements =
13861 cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())
13862 ->getNumElements();
13863 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
13864 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
13865 ArrayRef<Value *> Group = VL.slice(N: I, M: GroupSize);
13866 int NextIndex = 0;
13867 if (!all_of(Range&: Group, P: [&](Value *V) {
13868 assert(isa<ShuffleVectorInst>(V) &&
13869 "Not supported shufflevector usage.");
13870 auto *SV = cast<ShuffleVectorInst>(Val: V);
13871 int Index;
13872 [[maybe_unused]] bool IsExtractSubvectorMask =
13873 SV->isExtractSubvectorMask(Index);
13874 assert(IsExtractSubvectorMask &&
13875 "Not supported shufflevector usage.");
13876 if (NextIndex != Index)
13877 return false;
13878 NextIndex += SV->getShuffleMask().size();
13879 return true;
13880 }))
13881 return ::getShuffleCost(
13882 TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: VecTy,
13883 Mask: calculateShufflevectorMask(VL: E->Scalars));
13884 }
13885 return TTI::TCC_Free;
13886 });
13887 return GetCostDiff(GetScalarCost, GetVectorCost);
13888 }
13889 case Instruction::Freeze:
13890 return CommonCost;
13891 default:
13892 llvm_unreachable("Unknown instruction");
13893 }
13894}
13895
13896bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
13897 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
<< VectorizableTree.size() << " is fully vectorizable.\n");
13899
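// A gather node is considered vectorizable here if it has no ephemeral
// values and is either all-constant, a splat, small enough, a shuffle of
// extractelements, or contains (vectorizable) loads.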
13900 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
13901 SmallVector<int> Mask;
13902 return TE->isGather() &&
13903 !any_of(Range: TE->Scalars,
13904 P: [this](Value *V) { return EphValues.contains(Ptr: V); }) &&
13905 (allConstant(VL: TE->Scalars) || isSplat(VL: TE->Scalars) ||
13906 TE->Scalars.size() < Limit ||
13907 (((TE->hasState() &&
13908 TE->getOpcode() == Instruction::ExtractElement) ||
13909 all_of(Range: TE->Scalars, P: IsaPred<ExtractElementInst, UndefValue>)) &&
13910 isFixedVectorShuffle(VL: TE->Scalars, Mask, AC)) ||
13911 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
13912 !TE->isAltShuffle()) ||
13913 any_of(Range: TE->Scalars, P: IsaPred<LoadInst>));
13914 };
13915
13916 // We only handle trees of heights 1 and 2.
13917 if (VectorizableTree.size() == 1 &&
13918 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
13919 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
13920 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
13921 (ForReduction &&
13922 AreVectorizableGathers(VectorizableTree[0].get(),
13923 VectorizableTree[0]->Scalars.size()) &&
13924 VectorizableTree[0]->getVectorFactor() > 2)))
13925 return true;
13926
13927 if (VectorizableTree.size() != 2)
13928 return false;
13929
// Handle splat and all-constant stores. Also try to vectorize tiny trees
// whose second node is a gather with fewer scalar operands than the initial
// tree element (it may be profitable to shuffle the second gather), or whose
// scalars are extractelements that form a shuffle.
13934 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
13935 AreVectorizableGathers(VectorizableTree[1].get(),
13936 VectorizableTree[0]->Scalars.size()))
13937 return true;
13938
13939 // Gathering cost would be too much for tiny trees.
13940 if (VectorizableTree[0]->isGather() ||
13941 (VectorizableTree[1]->isGather() &&
13942 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
13943 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
13944 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
13945 return false;
13946
13947 return true;
13948}
13949
13950static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
13951 TargetTransformInfo *TTI,
13952 bool MustMatchOrInst) {
13953 // Look past the root to find a source value. Arbitrarily follow the
13954 // path through operand 0 of any 'or'. Also, peek through optional
13955 // shift-left-by-multiple-of-8-bits.
13956 Value *ZextLoad = Root;
13957 const APInt *ShAmtC;
13958 bool FoundOr = false;
13959 while (!isa<ConstantExpr>(Val: ZextLoad) &&
13960 (match(V: ZextLoad, P: m_Or(L: m_Value(), R: m_Value())) ||
13961 (match(V: ZextLoad, P: m_Shl(L: m_Value(), R: m_APInt(Res&: ShAmtC))) &&
13962 ShAmtC->urem(RHS: 8) == 0))) {
13963 auto *BinOp = cast<BinaryOperator>(Val: ZextLoad);
13964 ZextLoad = BinOp->getOperand(i_nocapture: 0);
13965 if (BinOp->getOpcode() == Instruction::Or)
13966 FoundOr = true;
13967 }
13968 // Check if the input is an extended load of the required or/shift expression.
13969 Value *Load;
13970 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
13971 !match(V: ZextLoad, P: m_ZExt(Op: m_Value(V&: Load))) || !isa<LoadInst>(Val: Load))
13972 return false;
13973
13974 // Require that the total load bit width is a legal integer type.
13975 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
13976 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
13977 Type *SrcTy = Load->getType();
13978 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
13979 if (!TTI->isTypeLegal(Ty: IntegerType::get(C&: Root->getContext(), NumBits: LoadBitWidth)))
13980 return false;
13981
13982 // Everything matched - assume that we can fold the whole sequence using
13983 // load combining.
13984 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
13985 << *(cast<Instruction>(Root)) << "\n");
13986
13987 return true;
13988}
13989
13990bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
13991 if (RdxKind != RecurKind::Or)
13992 return false;
13993
13994 unsigned NumElts = VectorizableTree[0]->Scalars.size();
13995 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
13996 return isLoadCombineCandidateImpl(Root: FirstReduced, NumElts, TTI,
13997 /* MatchOr */ MustMatchOrInst: false);
13998}
13999
14000bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
14001 // Peek through a final sequence of stores and check if all operations are
14002 // likely to be load-combined.
14003 unsigned NumElts = Stores.size();
14004 for (Value *Scalar : Stores) {
14005 Value *X;
14006 if (!match(V: Scalar, P: m_Store(ValueOp: m_Value(V&: X), PointerOp: m_Value())) ||
14007 !isLoadCombineCandidateImpl(Root: X, NumElts, TTI, /* MatchOr */ MustMatchOrInst: true))
14008 return false;
14009 }
14010 return true;
14011}
14012
14013bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
14014 if (!DebugCounter::shouldExecute(CounterName: VectorizedGraphs))
14015 return true;
14016
14017 // Graph is empty - do nothing.
14018 if (VectorizableTree.empty()) {
14019 assert(ExternalUses.empty() && "We shouldn't have any external users");
14020
14021 return true;
14022 }
14023
14024 // No need to vectorize inserts of gathered values.
14025 if (VectorizableTree.size() == 2 &&
14026 isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
14027 VectorizableTree[1]->isGather() &&
14028 (VectorizableTree[1]->getVectorFactor() <= 2 ||
14029 !(isSplat(VL: VectorizableTree[1]->Scalars) ||
14030 allConstant(VL: VectorizableTree[1]->Scalars))))
14031 return true;
14032
// If the graph includes only PHI nodes and gathers, it is definitely not
// profitable for vectorization and we can skip it, provided the cost
// threshold is the default. The cost of vectorized PHI nodes is almost always
// 0 plus the cost of gathers/buildvectors.
14037 constexpr int Limit = 4;
14038 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
14039 !VectorizableTree.empty() &&
14040 all_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
14041 return (TE->isGather() &&
14042 (!TE->hasState() ||
14043 TE->getOpcode() != Instruction::ExtractElement) &&
14044 count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <= Limit) ||
14045 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
14046 }))
14047 return true;
14048
14049 // Do not vectorize small tree of phis only, if all vector phis are also
14050 // gathered.
14051 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
14052 VectorizableTree.size() <= Limit &&
14053 all_of(Range: VectorizableTree,
14054 P: [&](const std::unique_ptr<TreeEntry> &TE) {
14055 return (TE->isGather() &&
14056 (!TE->hasState() ||
14057 TE->getOpcode() != Instruction::ExtractElement) &&
14058 count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <=
14059 Limit) ||
14060 (TE->hasState() &&
14061 (TE->getOpcode() == Instruction::InsertElement ||
14062 (TE->getOpcode() == Instruction::PHI &&
14063 all_of(Range&: TE->Scalars, P: [&](Value *V) {
14064 return isa<PoisonValue>(Val: V) || MustGather.contains(Ptr: V);
14065 }))));
14066 }) &&
14067 any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
14068 return TE->State == TreeEntry::Vectorize &&
14069 TE->getOpcode() == Instruction::PHI;
14070 }))
14071 return true;
14072
14073 // If the tree contains only phis, buildvectors, split nodes and
14074 // small nodes with reuses, we can skip it.
14075 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
14076 all_of(Range: VectorizableTree, P: [](const std::unique_ptr<TreeEntry> &TE) {
14077 return TE->State == TreeEntry::SplitVectorize ||
14078 (TE->isGather() &&
14079 none_of(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>)) ||
14080 (TE->hasState() && (TE->getOpcode() == Instruction::PHI ||
14081 (!TE->ReuseShuffleIndices.empty() &&
14082 TE->Scalars.size() == 2)));
14083 }))
14084 return true;
14085
14086 // We can vectorize the tree if its size is greater than or equal to the
14087 // minimum size specified by the MinTreeSize command line option.
14088 if (VectorizableTree.size() >= MinTreeSize)
14089 return false;
14090
14091 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
14092 // can vectorize it if we can prove it fully vectorizable.
14093 if (isFullyVectorizableTinyTree(ForReduction))
14094 return false;
14095
14096 // Check if any of the gather node forms an insertelement buildvector
14097 // somewhere.
14098 bool IsAllowedSingleBVNode =
14099 VectorizableTree.size() > 1 ||
14100 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
14101 !VectorizableTree.front()->isAltShuffle() &&
14102 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
14103 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
14104 allSameBlock(VL: VectorizableTree.front()->Scalars));
14105 if (any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
14106 return TE->isGather() && all_of(Range&: TE->Scalars, P: [&](Value *V) {
14107 return isa<ExtractElementInst, Constant>(Val: V) ||
14108 (IsAllowedSingleBVNode &&
14109 !V->hasNUsesOrMore(N: UsesLimit) &&
14110 any_of(Range: V->users(), P: IsaPred<InsertElementInst>));
14111 });
14112 }))
14113 return false;
14114
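// If the last node is an alternate-opcode gather whose buildvector (element
// insertion) overhead alone already exceeds the cost threshold, consider the
// tree worth vectorizing.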
14115 if (VectorizableTree.back()->isGather() &&
14116 VectorizableTree.back()->hasState() &&
14117 VectorizableTree.back()->isAltShuffle() &&
14118 VectorizableTree.back()->getVectorFactor() > 2 &&
14119 allSameBlock(VL: VectorizableTree.back()->Scalars) &&
14120 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
14121 TTI->getScalarizationOverhead(
14122 Ty: getWidenedType(ScalarTy: VectorizableTree.back()->Scalars.front()->getType(),
14123 VF: VectorizableTree.back()->getVectorFactor()),
14124 DemandedElts: APInt::getAllOnes(numBits: VectorizableTree.back()->getVectorFactor()),
14125 /*Insert=*/true, /*Extract=*/false,
14126 CostKind: TTI::TCK_RecipThroughput) > -SLPCostThreshold)
14127 return false;
14128
14129 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
14130 // vectorizable.
14131 return true;
14132}
14133
14134bool BoUpSLP::isTreeNotExtendable() const {
14135 if (getCanonicalGraphSize() != getTreeSize()) {
14136 constexpr unsigned SmallTree = 3;
14137 if (VectorizableTree.front()->isNonPowOf2Vec() &&
14138 getCanonicalGraphSize() <= SmallTree &&
14139 count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()),
14140 P: [](const std::unique_ptr<TreeEntry> &TE) {
14141 return TE->isGather() && TE->hasState() &&
14142 TE->getOpcode() == Instruction::Load &&
14143 !allSameBlock(VL: TE->Scalars);
14144 }) == 1)
14145 return true;
14146 return false;
14147 }
14148 bool Res = false;
14149 for (unsigned Idx : seq<unsigned>(Size: getTreeSize())) {
14150 TreeEntry &E = *VectorizableTree[Idx];
14151 if (E.State == TreeEntry::SplitVectorize)
14152 return false;
14153 if (!E.isGather())
14154 continue;
14155 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
14156 (!E.hasState() &&
14157 all_of(Range&: E.Scalars, P: IsaPred<ExtractElementInst, LoadInst>)) ||
14158 (isa<ExtractElementInst>(Val: E.Scalars.front()) &&
14159 getSameOpcode(VL: ArrayRef(E.Scalars).drop_front(), TLI: *TLI).valid()))
14160 return false;
14161 if (isSplat(VL: E.Scalars) || allConstant(VL: E.Scalars))
14162 continue;
14163 Res = true;
14164 }
14165 return Res;
14166}
14167
14168InstructionCost BoUpSLP::getSpillCost() {
14169 // Walk from the bottom of the tree to the top, tracking which values are
14170 // live. When we see a call instruction that is not part of our tree,
14171 // query TTI to see if there is a cost to keeping values live over it
14172 // (for example, if spills and fills are required).
14173
14174 const TreeEntry *Root = VectorizableTree.front().get();
14175 if (Root->isGather())
14176 return 0;
14177
14178 InstructionCost Cost = 0;
14179 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
14180 EntriesToOperands;
14181 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
14182 SmallPtrSet<const Instruction *, 8> LastInstructions;
14183 for (const auto &TEPtr : VectorizableTree) {
14184 if (!TEPtr->isGather()) {
14185 Instruction *LastInst = &getLastInstructionInBundle(E: TEPtr.get());
14186 EntriesToLastInstruction.try_emplace(Key: TEPtr.get(), Args&: LastInst);
14187 LastInstructions.insert(Ptr: LastInst);
14188 }
14189 if (TEPtr->UserTreeIndex)
14190 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(Elt: TEPtr.get());
14191 }
14192
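// Returns true for intrinsics that are not expected to be lowered to an
// actual call (assume-like intrinsics, or intrinsics cheaper than a call)
// and therefore should not trigger spill costs.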
14193 auto NoCallIntrinsic = [this](const Instruction *I) {
14194 const auto *II = dyn_cast<IntrinsicInst>(Val: I);
14195 if (!II)
14196 return false;
14197 if (II->isAssumeLikeIntrinsic())
14198 return true;
14199 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
14200 InstructionCost IntrCost =
14201 TTI->getIntrinsicInstrCost(ICA, CostKind: TTI::TCK_RecipThroughput);
14202 InstructionCost CallCost = TTI->getCallInstrCost(
14203 F: nullptr, RetTy: II->getType(), Tys: ICA.getArgTypes(), CostKind: TTI::TCK_RecipThroughput);
14204 return IntrCost < CallCost;
14205 };
14206
// Maps the last instruction in an entry to the last instruction of one of
// its operand entries and a flag. If the flag is true, there are no calls in
// between these instructions.
14210 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
14211 CheckedInstructions;
14212 unsigned Budget = 0;
14213 const unsigned BudgetLimit =
14214 ScheduleRegionSizeBudget / VectorizableTree.size();
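// Scans the instructions between First and Last (which must be in the same
// block) and returns true if no non-vectorized, spill-inducing call is found
// and the scan stays within the budget; results are memoized in
// CheckedInstructions.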
14215 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
14216 const Instruction *Last) {
14217 assert(First->getParent() == Last->getParent() &&
14218 "Expected instructions in same block.");
14219 if (auto It = CheckedInstructions.find(Val: Last);
14220 It != CheckedInstructions.end()) {
14221 const Instruction *Checked = It->second.getPointer();
14222 if (Checked == First || Checked->comesBefore(Other: First))
14223 return It->second.getInt() != 0;
14224 Last = Checked;
14225 } else if (Last == First || Last->comesBefore(Other: First)) {
14226 return true;
14227 }
14228 BasicBlock::const_reverse_iterator InstIt =
14229 ++First->getIterator().getReverse(),
14230 PrevInstIt =
14231 Last->getIterator().getReverse();
14232 SmallVector<const Instruction *> LastInstsInRange;
14233 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
14234 // Debug information does not impact spill cost.
14235 // Vectorized calls, represented as vector intrinsics, do not impact spill
14236 // cost.
14237 if (const auto *CB = dyn_cast<CallBase>(Val: &*PrevInstIt);
14238 CB && !NoCallIntrinsic(CB) && !isVectorized(V: CB)) {
14239 for (const Instruction *LastInst : LastInstsInRange)
14240 CheckedInstructions.try_emplace(Key: LastInst, Args: &*PrevInstIt, Args: 0);
14241 return false;
14242 }
14243 if (LastInstructions.contains(Ptr: &*PrevInstIt))
14244 LastInstsInRange.push_back(Elt: &*PrevInstIt);
14245
14246 ++PrevInstIt;
14247 ++Budget;
14248 }
14249 for (const Instruction *LastInst : LastInstsInRange)
14250 CheckedInstructions.try_emplace(
14251 Key: LastInst, Args: PrevInstIt == InstIt ? First : &*PrevInstIt,
14252 Args: Budget <= BudgetLimit ? 1 : 0);
14253 return Budget <= BudgetLimit;
14254 };
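// Adds the cost of keeping the operand entry's vector value live across a
// call (and, for REVEC, subtracts the cost of the now-dead scalar vectors).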
14255 auto AddCosts = [&](const TreeEntry *Op) {
14256 Type *ScalarTy = Op->Scalars.front()->getType();
14257 auto It = MinBWs.find(Val: Op);
14258 if (It != MinBWs.end())
14259 ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
14260 auto *VecTy = getWidenedType(ScalarTy, VF: Op->getVectorFactor());
14261 Cost += TTI->getCostOfKeepingLiveOverCall(Tys: VecTy);
14262 if (ScalarTy->isVectorTy()) {
14263 // Handle revec dead vector instructions.
14264 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(Tys: ScalarTy);
14265 }
14266 };
// Memoize the relationship between blocks, i.e. whether there is (at least
// one) non-vectorized call between them. This allows skipping the analysis
// of the same block paths multiple times.
14270 SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
14271 ParentOpParentToPreds;
14272 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
14273 BasicBlock *OpParent) {
14274 auto Key = std::make_pair(x&: Root, y&: OpParent);
14275 if (auto It = ParentOpParentToPreds.find(Val: Key);
14276 It != ParentOpParentToPreds.end())
14277 return It->second;
14278 SmallVector<BasicBlock *> Worklist;
14279 if (Pred)
14280 Worklist.push_back(Elt: Pred);
14281 else
14282 Worklist.append(in_start: pred_begin(BB: Root), in_end: pred_end(BB: Root));
14283 SmallPtrSet<const BasicBlock *, 16> Visited;
14284 SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
14285 ParentsPairsToAdd;
14286 bool Res = false;
14287 auto Cleanup = make_scope_exit(F: [&]() {
14288 for (const auto &KeyPair : ParentsPairsToAdd) {
14289 assert(!ParentOpParentToPreds.contains(KeyPair) &&
14290 "Should not have been added before.");
14291 ParentOpParentToPreds.try_emplace(Key: KeyPair, Args&: Res);
14292 }
14293 });
14294 while (!Worklist.empty()) {
14295 BasicBlock *BB = Worklist.pop_back_val();
14296 if (BB == OpParent || !Visited.insert(Ptr: BB).second)
14297 continue;
14298 auto Pair = std::make_pair(x&: BB, y&: OpParent);
14299 if (auto It = ParentOpParentToPreds.find(Val: Pair);
14300 It != ParentOpParentToPreds.end()) {
14301 Res = It->second;
14302 return Res;
14303 }
14304 ParentsPairsToAdd.insert(V: Pair);
14305 unsigned BlockSize = BB->size();
14306 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
14307 return Res;
14308 Budget += BlockSize;
14309 if (Budget > BudgetLimit)
14310 return Res;
14311 if (!isa<CatchSwitchInst>(Val: BB->getTerminator()) &&
14312 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
14313 BB->getTerminator()))
14314 return Res;
14315 Worklist.append(in_start: pred_begin(BB), in_end: pred_end(BB));
14316 }
14317 Res = true;
14318 return Res;
14319 };
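// Walk the tree from the root and, for each vectorized entry, check whether
// a non-vectorized call may occur between the entry and each of its
// operands; if so, add the spill cost for keeping that operand's vector
// value live.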
14320 SmallVector<const TreeEntry *> LiveEntries(1, Root);
14321 while (!LiveEntries.empty()) {
14322 const TreeEntry *Entry = LiveEntries.pop_back_val();
14323 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Val: Entry);
14324 if (Operands.empty())
14325 continue;
14326 Instruction *LastInst = EntriesToLastInstruction.at(Val: Entry);
14327 BasicBlock *Parent = LastInst->getParent();
14328 for (const TreeEntry *Op : Operands) {
14329 if (!Op->isGather())
14330 LiveEntries.push_back(Elt: Op);
14331 if (Entry->State == TreeEntry::SplitVectorize ||
14332 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
14333 (Op->isGather() && allConstant(VL: Op->Scalars)))
14334 continue;
14335 Budget = 0;
14336 BasicBlock *Pred = nullptr;
14337 if (auto *Phi = dyn_cast<PHINode>(Val: Entry->getMainOp()))
14338 Pred = Phi->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx);
14339 BasicBlock *OpParent;
14340 Instruction *OpLastInst;
14341 if (Op->isGather()) {
14342 assert(Entry->getOpcode() == Instruction::PHI &&
14343 "Expected phi node only.");
14344 OpParent = cast<PHINode>(Val: Entry->getMainOp())
14345 ->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx);
14346 OpLastInst = OpParent->getTerminator();
14347 for (Value *V : Op->Scalars) {
14348 auto *Inst = dyn_cast<Instruction>(Val: V);
14349 if (!Inst)
14350 continue;
14351 if (isVectorized(V)) {
14352 OpParent = Inst->getParent();
14353 OpLastInst = Inst;
14354 break;
14355 }
14356 }
14357 } else {
14358 OpLastInst = EntriesToLastInstruction.at(Val: Op);
14359 OpParent = OpLastInst->getParent();
14360 }
14361 // Check the call instructions within the same basic blocks.
14362 if (OpParent == Parent) {
14363 if (Entry->getOpcode() == Instruction::PHI) {
14364 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
14365 AddCosts(Op);
14366 continue;
14367 }
14368 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
14369 AddCosts(Op);
14370 continue;
14371 }
14372 // Check for call instruction in between blocks.
14373 // 1. Check entry's block to the head.
14374 if (Entry->getOpcode() != Instruction::PHI &&
14375 !CheckForNonVecCallsInSameBlock(
14376 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
14377 LastInst)) {
14378 AddCosts(Op);
14379 continue;
14380 }
14381 // 2. Check op's block from the end.
14382 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
14383 OpParent->getTerminator())) {
14384 AddCosts(Op);
14385 continue;
14386 }
14387 // 3. Check the predecessors of entry's block till op's block.
14388 if (!CheckPredecessors(Parent, Pred, OpParent)) {
14389 AddCosts(Op);
14390 continue;
14391 }
14392 }
14393 }
14394
14395 return Cost;
14396}
14397
/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
/// the buildvector sequence.
14400static bool isFirstInsertElement(const InsertElementInst *IE1,
14401 const InsertElementInst *IE2) {
14402 if (IE1 == IE2)
14403 return false;
14404 const auto *I1 = IE1;
14405 const auto *I2 = IE2;
14406 const InsertElementInst *PrevI1;
14407 const InsertElementInst *PrevI2;
14408 unsigned Idx1 = *getElementIndex(Inst: IE1);
14409 unsigned Idx2 = *getElementIndex(Inst: IE2);
14410 do {
14411 if (I2 == IE1)
14412 return true;
14413 if (I1 == IE2)
14414 return false;
14415 PrevI1 = I1;
14416 PrevI2 = I2;
14417 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
14418 getElementIndex(Inst: I1).value_or(u&: Idx2) != Idx2)
14419 I1 = dyn_cast<InsertElementInst>(Val: I1->getOperand(i_nocapture: 0));
14420 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
14421 getElementIndex(Inst: I2).value_or(u&: Idx1) != Idx1)
14422 I2 = dyn_cast<InsertElementInst>(Val: I2->getOperand(i_nocapture: 0));
14423 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
14424 llvm_unreachable("Two different buildvectors not expected.");
14425}
14426
14427namespace {
/// Returns the incoming Value * if the requested type is Value * too, or a
/// default value otherwise.
14430struct ValueSelect {
14431 template <typename U>
14432 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
14433 return V;
14434 }
14435 template <typename U>
14436 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
14437 return U();
14438 }
14439};
14440} // namespace
14441
14442/// Does the analysis of the provided shuffle masks and performs the requested
14443/// actions on the vectors with the given shuffle masks. It tries to do it in
14444/// several steps.
/// 1. If the Base vector is not an undef vector, resize the very first mask
/// to have a common VF and perform the action for 2 input vectors (including
/// the non-undef Base). Other shuffle masks are combined with the result of
/// the first stage and processed as a shuffle of 2 elements.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
/// the action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combining the masks properly between the steps.
14454template <typename T>
14455static T *performExtractsShuffleAction(
14456 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
14457 function_ref<unsigned(T *)> GetVF,
14458 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
14459 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
14460 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
14461 SmallVector<int> Mask(ShuffleMask.begin()->second);
14462 auto VMIt = std::next(ShuffleMask.begin());
14463 T *Prev = nullptr;
14464 SmallBitVector UseMask =
14465 buildUseMask(VF: Mask.size(), Mask, MaskArg: UseMask::UndefsAsMask);
14466 SmallBitVector IsBaseUndef = isUndefVector(V: Base, UseMask);
14467 if (!IsBaseUndef.all()) {
14468 // Base is not undef, need to combine it with the next subvectors.
14469 std::pair<T *, bool> Res =
14470 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
14471 SmallBitVector IsBasePoison = isUndefVector<true>(V: Base, UseMask);
14472 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
14473 if (Mask[Idx] == PoisonMaskElem)
14474 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
14475 else
14476 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
14477 }
14478 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
14479 assert((!V || GetVF(V) == Mask.size()) &&
14480 "Expected base vector of VF number of elements.");
14481 Prev = Action(Mask, {nullptr, Res.first});
14482 } else if (ShuffleMask.size() == 1) {
// Base is undef and only 1 vector is shuffled - perform the action only for
// a single vector, if the mask is not the identity mask.
14485 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
14486 /*ForSingleMask=*/true);
14487 if (Res.second)
14488 // Identity mask is found.
14489 Prev = Res.first;
14490 else
14491 Prev = Action(Mask, {ShuffleMask.begin()->first});
14492 } else {
14493 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
14494 // shuffles step by step, combining shuffle between the steps.
14495 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
14496 unsigned Vec2VF = GetVF(VMIt->first);
14497 if (Vec1VF == Vec2VF) {
// No need to resize the input vectors since they are of the same size; we
// can shuffle them directly.
14500 ArrayRef<int> SecMask = VMIt->second;
14501 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
14502 if (SecMask[I] != PoisonMaskElem) {
14503 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
14504 Mask[I] = SecMask[I] + Vec1VF;
14505 }
14506 }
14507 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
14508 } else {
14509 // Vectors of different sizes - resize and reshuffle.
14510 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
14511 /*ForSingleMask=*/false);
14512 std::pair<T *, bool> Res2 =
14513 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
14514 ArrayRef<int> SecMask = VMIt->second;
14515 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
14516 if (Mask[I] != PoisonMaskElem) {
14517 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
14518 if (Res1.second)
14519 Mask[I] = I;
14520 } else if (SecMask[I] != PoisonMaskElem) {
14521 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
14522 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
14523 }
14524 }
14525 Prev = Action(Mask, {Res1.first, Res2.first});
14526 }
14527 VMIt = std::next(VMIt);
14528 }
14529 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
14530 // Perform requested actions for the remaining masks/vectors.
14531 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
14532 // Shuffle other input vectors, if any.
14533 std::pair<T *, bool> Res =
14534 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
14535 ArrayRef<int> SecMask = VMIt->second;
14536 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
14537 if (SecMask[I] != PoisonMaskElem) {
14538 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
14539 "Multiple uses of scalars.");
14540 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
14541 } else if (Mask[I] != PoisonMaskElem) {
14542 Mask[I] = I;
14543 }
14544 }
14545 Prev = Action(Mask, {Prev, Res.first});
14546 }
14547 return Prev;
14548}
14549
14550namespace {
14551/// Data type for handling buildvector sequences with the reused scalars from
14552/// other tree entries.
14553template <typename T> struct ShuffledInsertData {
14554 /// List of insertelements to be replaced by shuffles.
14555 SmallVector<InsertElementInst *> InsertElements;
14556 /// The parent vectors and shuffle mask for the given list of inserts.
14557 MapVector<T, SmallVector<int>> ValueMasks;
14558};
14559} // namespace
14560
14561InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
14562 InstructionCost ReductionCost) {
14563 InstructionCost Cost = ReductionCost;
14564 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
14565 << VectorizableTree.size() << ".\n");
14566
14567 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
14568
14569 SmallPtrSet<Value *, 4> CheckedExtracts;
14570 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
14571 TreeEntry &TE = *VectorizableTree[I];
// No need to count the cost for combined entries; they are combined, so
// just skip their cost.
14574 if (TE.State == TreeEntry::CombinedVectorize) {
14575 LLVM_DEBUG(
14576 dbgs() << "SLP: Skipping cost for combined node that starts with "
14577 << *TE.Scalars[0] << ".\n";
14578 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
14579 continue;
14580 }
14581 if (TE.hasState() &&
14582 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
14583 if (const TreeEntry *E =
14584 getSameValuesTreeEntry(V: TE.getMainOp(), VL: TE.Scalars);
14585 E && E->getVectorFactor() == TE.getVectorFactor()) {
14586 // Some gather nodes might be absolutely the same as some vectorizable
14587 // nodes after reordering, need to handle it.
14588 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
14589 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
14590 << "SLP: Current total cost = " << Cost << "\n");
14591 continue;
14592 }
14593 }
14594
// Exclude the cost of gather load nodes which are not used. These nodes
// were built as part of the final attempt to vectorize gathered loads.
14597 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
14598 "Expected gather nodes with users only.");
14599
14600 InstructionCost C = getEntryCost(E: &TE, VectorizedVals, CheckedExtracts);
14601 Cost += C;
14602 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
14603 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
14604 << "SLP: Current total cost = " << Cost << "\n");
14605 }
14606
14607 if (Cost >= -SLPCostThreshold &&
14608 none_of(Range&: ExternalUses, P: [](const ExternalUser &EU) {
14609 return isa_and_nonnull<InsertElementInst>(Val: EU.User);
14610 }))
14611 return Cost;
14612
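// Estimate the cost of extracting the externally used scalars from the
// vectorized tree, folding insertelement users into final shuffles where
// possible.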
14613 SmallPtrSet<Value *, 16> ExtractCostCalculated;
14614 InstructionCost ExtractCost = 0;
14615 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
14616 SmallVector<APInt> DemandedElts;
14617 SmallDenseSet<Value *, 4> UsedInserts;
14618 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
14619 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
14620 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
14621 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
// Keep track of each {Scalar, Index, User} tuple.
// On AArch64, this helps fuse the mov instruction associated with an
// extractelement with an fmul in the backend, so that the extractelement is
// free.
14625 SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
14626 for (ExternalUser &EU : ExternalUses) {
14627 ScalarUserAndIdx.emplace_back(Args&: EU.Scalar, Args&: EU.User, Args&: EU.Lane);
14628 }
14629 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
14630 for (ExternalUser &EU : ExternalUses) {
14631 // Uses by ephemeral values are free (because the ephemeral value will be
14632 // removed prior to code generation, and so the extraction will be
14633 // removed as well).
14634 if (EphValues.count(Ptr: EU.User))
14635 continue;
14636
// Check if the scalar for the given user, or for all users, is already
// accounted for.
14638 if (!CheckedScalarUser.insert(V: std::make_pair(x&: EU.Scalar, y&: EU.User)).second ||
14639 (EU.User &&
14640 CheckedScalarUser.contains(V: std::make_pair(x&: EU.Scalar, y: nullptr))))
14641 continue;
14642
// Skip users in unreachable blocks or in EH pads (rarely executed), or in
// blocks terminated with an unreachable instruction.
14645 if (BasicBlock *UserParent =
14646 EU.User ? cast<Instruction>(Val: EU.User)->getParent() : nullptr;
14647 UserParent &&
14648 (!DT->isReachableFromEntry(A: UserParent) || UserParent->isEHPad() ||
14649 isa_and_present<UnreachableInst>(Val: UserParent->getTerminator())))
14650 continue;
14651
14652 // We only add extract cost once for the same scalar.
14653 if (!isa_and_nonnull<InsertElementInst>(Val: EU.User) &&
14654 !ExtractCostCalculated.insert(Ptr: EU.Scalar).second)
14655 continue;
14656
// No extract cost for a vector "scalar" if REVEC is disabled.
14658 if (!SLPReVec && isa<FixedVectorType>(Val: EU.Scalar->getType()))
14659 continue;
14660
// If the found user is an insertelement, do not calculate the extract cost
// but try to detect it as a final shuffled/identity match.
// TODO: what if a user is an insertvalue when REVEC is enabled?
14664 if (auto *VU = dyn_cast_or_null<InsertElementInst>(Val: EU.User);
14665 VU && VU->getOperand(i_nocapture: 1) == EU.Scalar) {
14666 if (auto *FTy = dyn_cast<FixedVectorType>(Val: VU->getType())) {
14667 if (!UsedInserts.insert(V: VU).second)
14668 continue;
14669 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
14670 if (InsertIdx) {
14671 const TreeEntry *ScalarTE = &EU.E;
14672 auto *It = find_if(
14673 Range&: ShuffledInserts,
14674 P: [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
14675 // Checks if 2 insertelements are from the same buildvector.
14676 InsertElementInst *VecInsert = Data.InsertElements.front();
14677 return areTwoInsertFromSameBuildVector(
14678 VU, V: VecInsert, GetBaseOperand: [this](InsertElementInst *II) -> Value * {
14679 Value *Op0 = II->getOperand(i_nocapture: 0);
14680 if (isVectorized(V: II) && !isVectorized(V: Op0))
14681 return nullptr;
14682 return Op0;
14683 });
14684 });
14685 int VecId = -1;
14686 if (It == ShuffledInserts.end()) {
14687 auto &Data = ShuffledInserts.emplace_back();
14688 Data.InsertElements.emplace_back(Args&: VU);
14689 DemandedElts.push_back(Elt: APInt::getZero(numBits: FTy->getNumElements()));
14690 VecId = ShuffledInserts.size() - 1;
14691 auto It = MinBWs.find(Val: ScalarTE);
14692 if (It != MinBWs.end() &&
14693 VectorCasts
14694 .insert(V: std::make_pair(x&: ScalarTE, y: FTy->getElementType()))
14695 .second) {
14696 unsigned BWSz = It->second.first;
14697 unsigned DstBWSz = DL->getTypeSizeInBits(Ty: FTy->getElementType());
14698 unsigned VecOpcode;
14699 if (DstBWSz < BWSz)
14700 VecOpcode = Instruction::Trunc;
14701 else
14702 VecOpcode =
14703 It->second.second ? Instruction::SExt : Instruction::ZExt;
14704 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14705 InstructionCost C = TTI->getCastInstrCost(
14706 Opcode: VecOpcode, Dst: FTy,
14707 Src: getWidenedType(ScalarTy: IntegerType::get(C&: FTy->getContext(), NumBits: BWSz),
14708 VF: FTy->getNumElements()),
14709 CCH: TTI::CastContextHint::None, CostKind);
14710 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
14711 << " for extending externally used vector with "
14712 "non-equal minimum bitwidth.\n");
14713 Cost += C;
14714 }
14715 } else {
14716 if (isFirstInsertElement(IE1: VU, IE2: It->InsertElements.front()))
14717 It->InsertElements.front() = VU;
14718 VecId = std::distance(first: ShuffledInserts.begin(), last: It);
14719 }
14720 int InIdx = *InsertIdx;
14721 SmallVectorImpl<int> &Mask =
14722 ShuffledInserts[VecId].ValueMasks[ScalarTE];
14723 if (Mask.empty())
14724 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
14725 Mask[InIdx] = EU.Lane;
14726 DemandedElts[VecId].setBit(InIdx);
14727 continue;
14728 }
14729 }
14730 }
14731
14732 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14733 // If we plan to rewrite the tree in a smaller type, we will need to sign
14734 // extend the extracted value back to the original type. Here, we account
14735 // for the extract and the added cost of the sign extend if needed.
14736 InstructionCost ExtraCost = TTI::TCC_Free;
14737 auto *ScalarTy = EU.Scalar->getType();
14738 auto *VecTy = getWidenedType(ScalarTy, VF: BundleWidth);
14739 const TreeEntry *Entry = &EU.E;
14740 auto It = MinBWs.find(Val: Entry);
14741 if (It != MinBWs.end()) {
14742 Type *MinTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
14743 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy))
14744 MinTy = getWidenedType(ScalarTy: MinTy, VF: VecTy->getNumElements());
14745 unsigned Extend = isKnownNonNegative(V: EU.Scalar, SQ: SimplifyQuery(*DL))
14746 ? Instruction::ZExt
14747 : Instruction::SExt;
14748 VecTy = getWidenedType(ScalarTy: MinTy, VF: BundleWidth);
14749 ExtraCost =
14750 getExtractWithExtendCost(TTI: *TTI, Opcode: Extend, Dst: ScalarTy, VecTy, Index: EU.Lane);
14751 } else {
14752 ExtraCost =
14753 getVectorInstrCost(TTI: *TTI, ScalarTy, Opcode: Instruction::ExtractElement, Val: VecTy,
14754 CostKind, Index: EU.Lane, Scalar: EU.Scalar, ScalarUserAndIdx);
14755 }
14756 // Leave the scalar instructions as is if they are cheaper than extracts.
14757 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
14758 Entry->getOpcode() == Instruction::Load) {
// Checks if the user of the external scalar is a phi in a loop body.
14760 auto IsPhiInLoop = [&](const ExternalUser &U) {
14761 if (auto *Phi = dyn_cast_if_present<PHINode>(Val: U.User)) {
14762 auto *I = cast<Instruction>(Val: U.Scalar);
14763 const Loop *L = LI->getLoopFor(BB: Phi->getParent());
14764 return L && (Phi->getParent() == I->getParent() ||
14765 L == LI->getLoopFor(BB: I->getParent()));
14766 }
14767 return false;
14768 };
14769 if (!ValueToExtUses) {
14770 ValueToExtUses.emplace();
14771 for (const auto &P : enumerate(First&: ExternalUses)) {
14772 // Ignore phis in loops.
14773 if (IsPhiInLoop(P.value()))
14774 continue;
14775
14776 ValueToExtUses->try_emplace(Key: P.value().Scalar, Args: P.index());
14777 }
14778 }
// Can use the original instruction if no operands are vectorized or they
// are already marked as externally used.
14781 auto *Inst = cast<Instruction>(Val: EU.Scalar);
14782 InstructionCost ScalarCost = TTI->getInstructionCost(U: Inst, CostKind);
14783 auto OperandIsScalar = [&](Value *V) {
14784 if (!isVectorized(V)) {
// Some extractelements might not be vectorized, but instead
// transformed into a shuffle and removed from the function;
// consider that here.
14788 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V))
14789 return !EE->hasOneUse() || !MustGather.contains(Ptr: EE);
14790 return true;
14791 }
14792 return ValueToExtUses->contains(Val: V);
14793 };
14794 bool CanBeUsedAsScalar = all_of(Range: Inst->operands(), P: OperandIsScalar);
14795 bool CanBeUsedAsScalarCast = false;
14796 if (auto *CI = dyn_cast<CastInst>(Val: Inst); CI && !CanBeUsedAsScalar) {
14797 if (auto *Op = dyn_cast<Instruction>(Val: CI->getOperand(i_nocapture: 0));
14798 Op && all_of(Range: Op->operands(), P: OperandIsScalar)) {
14799 InstructionCost OpCost =
14800 (isVectorized(V: Op) && !ValueToExtUses->contains(Val: Op))
14801 ? TTI->getInstructionCost(U: Op, CostKind)
14802 : 0;
14803 if (ScalarCost + OpCost <= ExtraCost) {
14804 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
14805 ScalarCost += OpCost;
14806 }
14807 }
14808 }
14809 if (CanBeUsedAsScalar) {
14810 bool KeepScalar = ScalarCost <= ExtraCost;
// Try to keep the original scalar if the user is a phi node from the same
// block as the currently vectorized root phis. This keeps better ordering
// info for the PHIs being vectorized.
14814 bool IsProfitablePHIUser =
14815 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
14816 VectorizableTree.front()->Scalars.size() > 2)) &&
14817 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
14818 !Inst->hasNUsesOrMore(N: UsesLimit) &&
14819 none_of(Range: Inst->users(),
14820 P: [&](User *U) {
14821 auto *PHIUser = dyn_cast<PHINode>(Val: U);
14822 return (!PHIUser ||
14823 PHIUser->getParent() !=
14824 cast<Instruction>(
14825 Val: VectorizableTree.front()->getMainOp())
14826 ->getParent()) &&
14827 !isVectorized(V: U);
14828 }) &&
14829 count_if(Range: Entry->Scalars, P: [&](Value *V) {
14830 return ValueToExtUses->contains(Val: V);
14831 }) <= 2;
14832 if (IsProfitablePHIUser) {
14833 KeepScalar = true;
14834 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
14835 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
14836 (!GatheredLoadsEntriesFirst.has_value() ||
14837 Entry->Idx < *GatheredLoadsEntriesFirst)) {
14838 unsigned ScalarUsesCount = count_if(Range: Entry->Scalars, P: [&](Value *V) {
14839 return ValueToExtUses->contains(Val: V);
14840 });
14841 auto It = ExtractsCount.find(Val: Entry);
14842 if (It != ExtractsCount.end()) {
14843 assert(ScalarUsesCount >= It->getSecond().size() &&
14844 "Expected total number of external uses not less than "
14845 "number of scalar uses.");
14846 ScalarUsesCount -= It->getSecond().size();
14847 }
// Keep the original scalar if the number of externally used instructions
// in the same entry is not a power of 2. It may help to do some extra
// vectorization for now.
14851 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(Value: ScalarUsesCount);
14852 }
14853 if (KeepScalar) {
14854 ExternalUsesAsOriginalScalar.insert(Ptr: EU.Scalar);
14855 for (Value *V : Inst->operands()) {
14856 auto It = ValueToExtUses->find(Val: V);
14857 if (It != ValueToExtUses->end()) {
14858 // Replace all uses to avoid compiler crash.
14859 ExternalUses[It->second].User = nullptr;
14860 }
14861 }
14862 ExtraCost = ScalarCost;
14863 if (!IsPhiInLoop(EU))
14864 ExtractsCount[Entry].insert(V: Inst);
14865 if (CanBeUsedAsScalarCast) {
14866 ScalarOpsFromCasts.insert(Ptr: Inst->getOperand(i: 0));
14867 // Update the users of the operands of the cast operand to avoid
14868 // compiler crash.
14869 if (auto *IOp = dyn_cast<Instruction>(Val: Inst->getOperand(i: 0))) {
14870 for (Value *V : IOp->operands()) {
14871 auto It = ValueToExtUses->find(Val: V);
14872 if (It != ValueToExtUses->end()) {
14873 // Replace all uses to avoid compiler crash.
14874 ExternalUses[It->second].User = nullptr;
14875 }
14876 }
14877 }
14878 }
14879 }
14880 }
14881 }
14882
14883 ExtractCost += ExtraCost;
14884 }
// Insert externals for the extracts of operands of casts to be emitted as
// scalars instead of extractelement.
14887 for (Value *V : ScalarOpsFromCasts) {
14888 ExternalUsesAsOriginalScalar.insert(Ptr: V);
14889 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
14890 ExternalUses.emplace_back(Args&: V, Args: nullptr, Args&: *TEs.front(),
14891 Args: TEs.front()->findLaneForValue(V));
14892 }
14893 }
14894 // Add reduced value cost, if resized.
14895 if (!VectorizedVals.empty()) {
14896 const TreeEntry &Root = *VectorizableTree.front();
14897 auto BWIt = MinBWs.find(Val: &Root);
14898 if (BWIt != MinBWs.end()) {
14899 Type *DstTy = Root.Scalars.front()->getType();
14900 unsigned OriginalSz = DL->getTypeSizeInBits(Ty: DstTy->getScalarType());
14901 unsigned SrcSz =
14902 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
14903 if (OriginalSz != SrcSz) {
14904 unsigned Opcode = Instruction::Trunc;
14905 if (OriginalSz > SrcSz)
14906 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
14907 Type *SrcTy = IntegerType::get(C&: DstTy->getContext(), NumBits: SrcSz);
14908 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DstTy)) {
14909 assert(SLPReVec && "Only supported by REVEC.");
14910 SrcTy = getWidenedType(ScalarTy: SrcTy, VF: VecTy->getNumElements());
14911 }
14912 Cost += TTI->getCastInstrCost(Opcode, Dst: DstTy, Src: SrcTy,
14913 CCH: TTI::CastContextHint::None,
14914 CostKind: TTI::TCK_RecipThroughput);
14915 }
14916 }
14917 }
14918
14919 Cost += ExtractCost;
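// ResizeToVF estimates the cost of resizing/reshuffling a tree entry's
// vector to the VF required by the insertelement users (shaped like the
// ResizeAction callback of performExtractsShuffleAction above).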
14920 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
14921 bool ForSingleMask) {
14922 InstructionCost C = 0;
14923 unsigned VF = Mask.size();
14924 unsigned VecVF = TE->getVectorFactor();
14925 bool HasLargeIndex =
14926 any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); });
14927 if ((VF != VecVF && HasLargeIndex) ||
14928 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) {
14929
14930 if (HasLargeIndex) {
14931 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
14932 std::copy(first: Mask.begin(), last: std::next(x: Mask.begin(), n: std::min(a: VF, b: VecVF)),
14933 result: OrigMask.begin());
14934 C = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
14935 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF),
14936 Mask: OrigMask);
14937 LLVM_DEBUG(
14938 dbgs() << "SLP: Adding cost " << C
14939 << " for final shuffle of insertelement external users.\n";
14940 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
14941 Cost += C;
14942 return std::make_pair(x&: TE, y: true);
14943 }
14944
14945 if (!ForSingleMask) {
14946 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14947 for (unsigned I = 0; I < VF; ++I) {
14948 if (Mask[I] != PoisonMaskElem)
14949 ResizeMask[Mask[I]] = Mask[I];
14950 }
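        // Illustrative example: for Mask = {1, 0, PoisonMaskElem, 3} and
        // VF = 4, ResizeMask becomes {0, 1, PoisonMaskElem, 3}, which is an
        // identity mask (poison lanes ignored), so no extra shuffle cost is
        // added.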
14951 if (!ShuffleVectorInst::isIdentityMask(Mask: ResizeMask, NumSrcElts: VF))
14952 C = ::getShuffleCost(
14953 TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
14954 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF), Mask: ResizeMask);
14955 LLVM_DEBUG(
14956 dbgs() << "SLP: Adding cost " << C
14957 << " for final shuffle of insertelement external users.\n";
14958 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
14959
14960 Cost += C;
14961 }
14962 }
14963 return std::make_pair(x&: TE, y: false);
14964 };
14965 // Calculate the cost of the reshuffled vectors, if any.
14966 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
14967 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(i_nocapture: 0);
14968 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
14969 unsigned VF = 0;
14970 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
14971 ArrayRef<const TreeEntry *> TEs) {
14972 assert((TEs.size() == 1 || TEs.size() == 2) &&
14973 "Expected exactly 1 or 2 tree entries.");
14974 if (TEs.size() == 1) {
14975 if (VF == 0)
14976 VF = TEs.front()->getVectorFactor();
14977 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
14978 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF) &&
14979 !all_of(Range: enumerate(First&: Mask), P: [=](const auto &Data) {
14980 return Data.value() == PoisonMaskElem ||
14981 (Data.index() < VF &&
14982 static_cast<int>(Data.index()) == Data.value());
14983 })) {
14984 InstructionCost C =
14985 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FTy, Mask);
14986 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
14987 << " for final shuffle of insertelement "
14988 "external users.\n";
14989 TEs.front()->dump();
14990 dbgs() << "SLP: Current total cost = " << Cost << "\n");
14991 Cost += C;
14992 }
14993 } else {
14994 if (VF == 0) {
14995 if (TEs.front() &&
14996 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
14997 VF = TEs.front()->getVectorFactor();
14998 else
14999 VF = Mask.size();
15000 }
15001 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
15002 InstructionCost C =
15003 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: FTy, Mask);
15004 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
15005 << " for final shuffle of vector node and external "
15006 "insertelement users.\n";
15007 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
15008 dbgs() << "SLP: Current total cost = " << Cost << "\n");
15009 Cost += C;
15010 }
15011 VF = Mask.size();
15012 return TEs.back();
15013 };
15014 (void)performExtractsShuffleAction<const TreeEntry>(
15015 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()), Base,
15016 GetVF: [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeAction: ResizeToVF,
15017 Action: EstimateShufflesCost);
15018 InstructionCost InsertCost = TTI->getScalarizationOverhead(
15019 Ty: cast<FixedVectorType>(
15020 Val: ShuffledInserts[I].InsertElements.front()->getType()),
15021 DemandedElts: DemandedElts[I],
15022 /*Insert*/ true, /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
15023 Cost -= InsertCost;
15024 }
15025
15026 // Add the cost for reduced value resize (if required).
15027 if (ReductionBitWidth != 0) {
15028 assert(UserIgnoreList && "Expected reduction tree.");
15029 const TreeEntry &E = *VectorizableTree.front();
15030 auto It = MinBWs.find(Val: &E);
15031 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
15032 unsigned SrcSize = It->second.first;
15033 unsigned DstSize = ReductionBitWidth;
15034 unsigned Opcode = Instruction::Trunc;
15035 if (SrcSize < DstSize) {
15036 bool IsArithmeticExtendedReduction =
15037 all_of(Range: *UserIgnoreList, P: [](Value *V) {
15038 auto *I = cast<Instruction>(Val: V);
15039 return is_contained(Set: {Instruction::Add, Instruction::FAdd,
15040 Instruction::Mul, Instruction::FMul,
15041 Instruction::And, Instruction::Or,
15042 Instruction::Xor},
15043 Element: I->getOpcode());
15044 });
15045 if (IsArithmeticExtendedReduction)
15046 Opcode =
15047 Instruction::BitCast; // Handle it by getExtendedReductionCost
15048 else
15049 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15050 }
15051 if (Opcode != Instruction::BitCast) {
15052 auto *SrcVecTy =
15053 getWidenedType(ScalarTy: Builder.getIntNTy(N: SrcSize), VF: E.getVectorFactor());
15054 auto *DstVecTy =
15055 getWidenedType(ScalarTy: Builder.getIntNTy(N: DstSize), VF: E.getVectorFactor());
15056 TTI::CastContextHint CCH = getCastContextHint(TE: E);
15057 InstructionCost CastCost;
15058 switch (E.getOpcode()) {
15059 case Instruction::SExt:
15060 case Instruction::ZExt:
15061 case Instruction::Trunc: {
15062 const TreeEntry *OpTE = getOperandEntry(E: &E, Idx: 0);
15063 CCH = getCastContextHint(TE: *OpTE);
15064 break;
15065 }
15066 default:
15067 break;
15068 }
15069 CastCost += TTI->getCastInstrCost(Opcode, Dst: DstVecTy, Src: SrcVecTy, CCH,
15070 CostKind: TTI::TCK_RecipThroughput);
15071 Cost += CastCost;
15072 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
15073 << " for final resize for reduction from " << SrcVecTy
15074 << " to " << DstVecTy << "\n";
15075 dbgs() << "SLP: Current total cost = " << Cost << "\n");
15076 }
15077 }
15078 }
15079
15080 std::optional<InstructionCost> SpillCost;
15081 if (Cost < -SLPCostThreshold) {
15082 SpillCost = getSpillCost();
15083 Cost += *SpillCost;
15084 }
15085#ifndef NDEBUG
15086 SmallString<256> Str;
15087 {
15088 raw_svector_ostream OS(Str);
15089 OS << "SLP: Spill Cost = ";
15090 if (SpillCost)
15091 OS << *SpillCost;
15092 else
15093 OS << "<skipped>";
15094 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
15095 << "SLP: Total Cost = " << Cost << ".\n";
15096 }
15097 LLVM_DEBUG(dbgs() << Str);
15098 if (ViewSLPTree)
15099 ViewGraph(this, "SLP" + F->getName(), false, Str);
15100#endif
15101
15102 return Cost;
15103}
15104
/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gathers such instructions into a bunch, which is
/// highly likely to be detected as a shuffle of one or two input vectors. If
/// the attempt is successful, the matched scalars are replaced by poison
/// values in \p VL for future analysis.
15110std::optional<TTI::ShuffleKind>
15111BoUpSLP::tryToGatherSingleRegisterExtractElements(
15112 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
15113 // Scan list of gathered scalars for extractelements that can be represented
15114 // as shuffles.
15115 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
15116 SmallVector<int> UndefVectorExtracts;
15117 for (int I = 0, E = VL.size(); I < E; ++I) {
15118 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
15119 if (!EI) {
15120 if (isa<UndefValue>(Val: VL[I]))
15121 UndefVectorExtracts.push_back(Elt: I);
15122 continue;
15123 }
15124 auto *VecTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
15125 if (!VecTy || !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()))
15126 continue;
15127 std::optional<unsigned> Idx = getExtractIndex(E: EI);
15128 // Undefined index.
15129 if (!Idx) {
15130 UndefVectorExtracts.push_back(Elt: I);
15131 continue;
15132 }
15133 if (Idx >= VecTy->getNumElements()) {
15134 UndefVectorExtracts.push_back(Elt: I);
15135 continue;
15136 }
15137 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
15138 ExtractMask.reset(Idx: *Idx);
15139 if (isUndefVector(V: EI->getVectorOperand(), UseMask: ExtractMask).all()) {
15140 UndefVectorExtracts.push_back(Elt: I);
15141 continue;
15142 }
15143 VectorOpToIdx[EI->getVectorOperand()].push_back(Elt: I);
15144 }
15145 // Sort the vector operands by the maximum number of uses in extractelements.
15146 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
15147 VectorOpToIdx.takeVector();
15148 stable_sort(Range&: Vectors, C: [](const auto &P1, const auto &P2) {
15149 return P1.second.size() > P2.second.size();
15150 });
15151 // Find the best pair of the vectors or a single vector.
15152 const int UndefSz = UndefVectorExtracts.size();
15153 unsigned SingleMax = 0;
15154 unsigned PairMax = 0;
15155 if (!Vectors.empty()) {
15156 SingleMax = Vectors.front().second.size() + UndefSz;
15157 if (Vectors.size() > 1) {
15158 auto *ItNext = std::next(x: Vectors.begin());
15159 PairMax = SingleMax + ItNext->second.size();
15160 }
15161 }
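  // Illustrative example: for VL = {extractelement %a, 0; extractelement %a,
  // 1; extractelement %b, 0; poison}, SingleMax = 2 + 1 = 3 (two extracts
  // from %a plus one undef lane) and PairMax = 3 + 1 = 4, so the pair
  // {%a, %b} is preferred below.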
15162 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
15163 return std::nullopt;
  // Check whether it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
15166 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
15167 SmallVector<Value *> GatheredExtracts(
15168 VL.size(), PoisonValue::get(T: VL.front()->getType()));
15169 if (SingleMax >= PairMax && SingleMax) {
15170 for (int Idx : Vectors.front().second)
15171 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
15172 } else if (!Vectors.empty()) {
    for (unsigned VecIdx : {0, 1})
      for (int Idx : Vectors[VecIdx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
15176 }
15177 // Add extracts from undefs too.
15178 for (int Idx : UndefVectorExtracts)
15179 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of one or two of the vectors the scalars are extracted from.
15182 std::optional<TTI::ShuffleKind> Res =
15183 isFixedVectorShuffle(VL: GatheredExtracts, Mask, AC);
15184 if (!Res || all_of(Range&: Mask, P: [](int Idx) { return Idx == PoisonMaskElem; })) {
15185 // TODO: try to check other subsets if possible.
15186 // Restore the original VL if attempt was not successful.
15187 copy(Range&: SavedVL, Out: VL.begin());
15188 return std::nullopt;
15189 }
15190 // Restore unused scalars from mask, if some of the extractelements were not
15191 // selected for shuffle.
15192 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
15193 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(Val: GatheredExtracts[I]) &&
15194 isa<UndefValue>(Val: GatheredExtracts[I])) {
15195 std::swap(a&: VL[I], b&: GatheredExtracts[I]);
15196 continue;
15197 }
15198 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
15199 if (!EI || !isa<FixedVectorType>(Val: EI->getVectorOperandType()) ||
15200 !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()) ||
15201 is_contained(Range&: UndefVectorExtracts, Element: I))
15202 continue;
15203 }
15204 return Res;
15205}
15206
/// Same as tryToGatherSingleRegisterExtractElements, but splits \p VL into
/// \p NumParts slices (one per SLP register) and tries to match each slice as
/// a shuffle of one or two input vectors. The matched scalars are replaced by
/// poison values in \p VL for future analysis.
15212SmallVector<std::optional<TTI::ShuffleKind>>
15213BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
15214 SmallVectorImpl<int> &Mask,
15215 unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
15217 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
15218 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
15219 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
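  // Illustrative example: with VL.size() == 8 and NumParts == 2, each part
  // analyzes its own 4-element slice of VL and records a shuffle kind for that
  // slice (or std::nullopt if no extractelement-based shuffle was found).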
15220 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
15221 // Scan list of gathered scalars for extractelements that can be represented
15222 // as shuffles.
15223 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
15224 N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part));
15225 SmallVector<int> SubMask;
15226 std::optional<TTI::ShuffleKind> Res =
15227 tryToGatherSingleRegisterExtractElements(VL: SubVL, Mask&: SubMask);
15228 ShufflesRes[Part] = Res;
15229 copy(Range&: SubMask, Out: std::next(x: Mask.begin(), n: Part * SliceSize));
15230 }
15231 if (none_of(Range&: ShufflesRes, P: [](const std::optional<TTI::ShuffleKind> &Res) {
15232 return Res.has_value();
15233 }))
15234 ShufflesRes.clear();
15235 return ShufflesRes;
15236}
15237
15238std::optional<TargetTransformInfo::ShuffleKind>
15239BoUpSLP::isGatherShuffledSingleRegisterEntry(
15240 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
15241 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
15242 Entries.clear();
15243 // TODO: currently checking only for Scalars in the tree entry, need to count
15244 // reused elements too for better cost estimation.
15245 auto GetUserEntry = [&](const TreeEntry *TE) {
15246 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
15247 TE = TE->UserTreeIndex.UserTE;
15248 if (TE == VectorizableTree.front().get())
15249 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
15250 return TE->UserTreeIndex;
15251 };
15252 auto HasGatherUser = [&](const TreeEntry *TE) {
15253 while (TE->Idx != 0 && TE->UserTreeIndex) {
15254 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
15255 return true;
15256 TE = TE->UserTreeIndex.UserTE;
15257 }
15258 return false;
15259 };
15260 const EdgeInfo TEUseEI = GetUserEntry(TE);
15261 if (!TEUseEI)
15262 return std::nullopt;
15263 const Instruction *TEInsertPt = &getLastInstructionInBundle(E: TEUseEI.UserTE);
15264 const BasicBlock *TEInsertBlock = nullptr;
15265 // Main node of PHI entries keeps the correct order of operands/incoming
15266 // blocks.
15267 if (auto *PHI = dyn_cast<PHINode>(Val: TEUseEI.UserTE->getMainOp());
15268 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
15269 TEInsertBlock = PHI->getIncomingBlock(i: TEUseEI.EdgeIdx);
15270 TEInsertPt = TEInsertBlock->getTerminator();
15271 } else {
15272 TEInsertBlock = TEInsertPt->getParent();
15273 }
15274 if (!DT->isReachableFromEntry(A: TEInsertBlock))
15275 return std::nullopt;
15276 auto *NodeUI = DT->getNode(BB: TEInsertBlock);
15277 assert(NodeUI && "Should only process reachable instructions");
15278 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
15279 auto CheckOrdering = [&](const Instruction *InsertPt) {
    // Argument InsertPt is an instruction where the vector code for some other
    // tree entry (one that shares one or more scalars with TE) is going to be
    // generated. This lambda returns true if the insertion point of the vector
    // code for TE dominates that point (otherwise the dependency is the other
    // way around). The other node is not limited to being of a gather kind.
    // Gather nodes are not scheduled and their vector code is inserted before
    // their first user. If the user is a PHI, that point is supposed to be at
    // the end of a predecessor block. Otherwise it is the last instruction
    // among the scalars of the user node. So, instead of checking the
    // dependency between the instructions themselves, we check the dependency
    // between their insertion points for vector code (since each scalar
    // instruction ends up as a lane of a vector instruction).
15292 const BasicBlock *InsertBlock = InsertPt->getParent();
15293 auto *NodeEUI = DT->getNode(BB: InsertBlock);
15294 if (!NodeEUI)
15295 return false;
15296 assert((NodeUI == NodeEUI) ==
15297 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
15298 "Different nodes should have different DFS numbers");
15299 // Check the order of the gather nodes users.
15300 if (TEInsertPt->getParent() != InsertBlock &&
15301 (DT->dominates(A: NodeUI, B: NodeEUI) || !DT->dominates(A: NodeEUI, B: NodeUI)))
15302 return false;
15303 if (TEInsertPt->getParent() == InsertBlock &&
15304 TEInsertPt->comesBefore(Other: InsertPt))
15305 return false;
15306 return true;
15307 };
  // Find all tree entries used by the gathered values. If no common entries
  // are found, this is not a shuffle.
  // Here we build a set of tree nodes for each gathered value and try to
  // find the intersection between these sets. If we have at least one common
  // tree node for each gathered value, we have just a permutation of a
  // single vector. If we have 2 different sets, we're in a situation where we
  // have a permutation of 2 input vectors.
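  // Illustrative example: for VL = {a, b, c, d}, if {a, b} belong to tree
  // entry TE1 and {c, d} to tree entry TE2, UsedTEs ends up with two sets
  // ({TE1} and {TE2}) and the gather is modelled as a two-source permutation
  // of the vectors built for TE1 and TE2.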
15315 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
15316 SmallDenseMap<Value *, int> UsedValuesEntry;
15317 SmallPtrSet<const Value *, 16> VisitedValue;
15318 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
15319 // The node is reused - exit.
15320 if ((TEPtr->getVectorFactor() != VL.size() &&
15321 TEPtr->Scalars.size() != VL.size()) ||
15322 (!TEPtr->isSame(VL) && !TEPtr->isSame(VL: TE->Scalars)))
15323 return false;
15324 UsedTEs.clear();
15325 UsedTEs.emplace_back().insert(Ptr: TEPtr);
15326 for (Value *V : VL) {
15327 if (isConstant(V))
15328 continue;
15329 UsedValuesEntry.try_emplace(Key: V, Args: 0);
15330 }
15331 return true;
15332 };
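  // CheckParentNodes: walk up the user chains of the two user nodes; at the
  // first common ancestor, compare the operand (edge) indices through which
  // each side is reached and report whether User1's side hangs off a smaller
  // operand index (i.e. is expected to be emitted first).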
15333 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
15334 unsigned EdgeIdx) {
15335 const TreeEntry *Ptr1 = User1;
15336 const TreeEntry *Ptr2 = User2;
15337 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
15338 while (Ptr2) {
15339 PtrToIdx.try_emplace(Key: Ptr2, Args&: EdgeIdx);
15340 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
15341 Ptr2 = Ptr2->UserTreeIndex.UserTE;
15342 }
15343 while (Ptr1) {
15344 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
15345 Ptr1 = Ptr1->UserTreeIndex.UserTE;
15346 if (auto It = PtrToIdx.find(Val: Ptr1); It != PtrToIdx.end())
15347 return Idx < It->second;
15348 }
15349 return false;
15350 };
15351 for (Value *V : VL) {
15352 if (isConstant(V) || !VisitedValue.insert(Ptr: V).second)
15353 continue;
15354 // Build a list of tree entries where V is used.
15355 SmallPtrSet<const TreeEntry *, 4> VToTEs;
15356 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(Val: V)) {
15357 if (TEPtr == TE || TEPtr->Idx == 0)
15358 continue;
15359 assert(any_of(TEPtr->Scalars,
15360 [&](Value *V) { return GatheredScalars.contains(V); }) &&
15361 "Must contain at least single gathered value.");
15362 assert(TEPtr->UserTreeIndex &&
15363 "Expected only single user of a gather node.");
15364 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
15365
15366 PHINode *UserPHI = UseEI.UserTE->State != TreeEntry::SplitVectorize
15367 ? dyn_cast<PHINode>(Val: UseEI.UserTE->getMainOp())
15368 : nullptr;
15369 Instruction *InsertPt =
15370 UserPHI ? UserPHI->getIncomingBlock(i: UseEI.EdgeIdx)->getTerminator()
15371 : &getLastInstructionInBundle(E: UseEI.UserTE);
15372 if (TEInsertPt == InsertPt) {
15373 // Check nodes, which might be emitted first.
15374 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
15375 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
15376 TEUseEI.UserTE->isAltShuffle()) &&
15377 all_of(Range&: TEUseEI.UserTE->Scalars, P: isUsedOutsideBlock)) {
15378 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
15379 (UseEI.UserTE->getOpcode() == Instruction::PHI &&
15380 !UseEI.UserTE->isAltShuffle()) ||
15381 !all_of(Range&: UseEI.UserTE->Scalars, P: isUsedOutsideBlock))
15382 continue;
15383 }
15384
        // If the schedulable insertion point is used in multiple entries,
        // just exit: there is no known ordering at this point, it becomes
        // available only after real scheduling.
15388 if (!doesNotNeedToBeScheduled(V: InsertPt) &&
15389 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
15390 continue;
15391 // If the users are the PHI nodes with the same incoming blocks - skip.
15392 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
15393 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
15394 UseEI.UserTE->State == TreeEntry::Vectorize &&
15395 UseEI.UserTE->getOpcode() == Instruction::PHI &&
15396 TEUseEI.UserTE != UseEI.UserTE)
15397 continue;
15398 // If 2 gathers are operands of the same entry (regardless of whether
15399 // user is PHI or else), compare operands indices, use the earlier one
15400 // as the base.
15401 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
15402 continue;
15403 // If the user instruction is used for some reason in different
15404 // vectorized nodes - make it depend on index.
15405 if (TEUseEI.UserTE != UseEI.UserTE &&
15406 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
15407 HasGatherUser(TEUseEI.UserTE)))
15408 continue;
15409 // If the user node is the operand of the other user node - skip.
15410 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
15411 continue;
15412 }
15413
15414 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
15415 TEUseEI.UserTE->doesNotNeedToSchedule() !=
15416 UseEI.UserTE->doesNotNeedToSchedule() &&
15417 is_contained(Range&: UseEI.UserTE->Scalars, Element: TEInsertPt))
15418 continue;
15419 // Check if the user node of the TE comes after user node of TEPtr,
15420 // otherwise TEPtr depends on TE.
15421 if ((TEInsertBlock != InsertPt->getParent() ||
15422 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
15423 !CheckOrdering(InsertPt))
15424 continue;
15425 // The node is reused - exit.
15426 if (CheckAndUseSameNode(TEPtr))
15427 break;
15428 VToTEs.insert(Ptr: TEPtr);
15429 }
15430 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
15431 const auto *It = find_if(
15432 Range&: VTEs, P: [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
15433 if (It != VTEs.end()) {
15434 const TreeEntry *VTE = *It;
15435 if (none_of(Range: TE->CombinedEntriesWithIndices,
15436 P: [&](const auto &P) { return P.first == VTE->Idx; })) {
15437 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
15438 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
15439 continue;
15440 }
15441 // The node is reused - exit.
15442 if (CheckAndUseSameNode(VTE))
15443 break;
15444 VToTEs.insert(Ptr: VTE);
15445 }
15446 }
15447 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
15448 const TreeEntry *VTE = VTEs.front();
15449 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(u: 0) &&
15450 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
15451 VTEs = VTEs.drop_front();
15452 // Iterate through all vectorized nodes.
15453 const auto *MIt = find_if(Range&: VTEs, P: [](const TreeEntry *MTE) {
15454 return MTE->State == TreeEntry::Vectorize;
15455 });
15456 if (MIt == VTEs.end())
15457 continue;
15458 VTE = *MIt;
15459 }
15460 if (none_of(Range: TE->CombinedEntriesWithIndices,
15461 P: [&](const auto &P) { return P.first == VTE->Idx; })) {
15462 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
15463 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
15464 continue;
15465 }
15466 // The node is reused - exit.
15467 if (CheckAndUseSameNode(VTE))
15468 break;
15469 VToTEs.insert(Ptr: VTE);
15470 }
15471 if (VToTEs.empty())
15472 continue;
15473 if (UsedTEs.empty()) {
15474 // The first iteration, just insert the list of nodes to vector.
15475 UsedTEs.push_back(Elt: VToTEs);
15476 UsedValuesEntry.try_emplace(Key: V, Args: 0);
15477 } else {
      // Need to check if there are any previously used tree nodes which use V.
      // If there are no such nodes, consider that we have one more input
      // vector.
15481 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
15482 unsigned Idx = 0;
15483 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
15484 // Do we have a non-empty intersection of previously listed tree entries
15485 // and tree entries using current V?
15486 set_intersect(S1&: VToTEs, S2: Set);
15487 if (!VToTEs.empty()) {
15488 // Yes, write the new subset and continue analysis for the next
15489 // scalar.
15490 Set.swap(RHS&: VToTEs);
15491 break;
15492 }
15493 VToTEs = SavedVToTEs;
15494 ++Idx;
15495 }
15496 // No non-empty intersection found - need to add a second set of possible
15497 // source vectors.
15498 if (Idx == UsedTEs.size()) {
        // If the number of input vectors is greater than 2, this is not a
        // permutation; fall back to the regular gather.
15501 // TODO: support multiple reshuffled nodes.
15502 if (UsedTEs.size() == 2)
15503 continue;
15504 UsedTEs.push_back(Elt: SavedVToTEs);
15505 Idx = UsedTEs.size() - 1;
15506 }
15507 UsedValuesEntry.try_emplace(Key: V, Args&: Idx);
15508 }
15509 }
15510
15511 if (UsedTEs.empty()) {
15512 Entries.clear();
15513 return std::nullopt;
15514 }
15515
15516 unsigned VF = 0;
15517 if (UsedTEs.size() == 1) {
15518 // Keep the order to avoid non-determinism.
15519 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
15520 UsedTEs.front().end());
15521 sort(C&: FirstEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
15522 return TE1->Idx < TE2->Idx;
15523 });
15524 // Try to find the perfect match in another gather node at first.
15525 auto *It = find_if(Range&: FirstEntries, P: [=](const TreeEntry *EntryPtr) {
15526 return EntryPtr->isSame(VL) || EntryPtr->isSame(VL: TE->Scalars);
15527 });
15528 if (It != FirstEntries.end() &&
15529 ((*It)->getVectorFactor() == VL.size() ||
15530 ((*It)->getVectorFactor() == TE->Scalars.size() &&
15531 TE->ReuseShuffleIndices.size() == VL.size() &&
15532 (*It)->isSame(VL: TE->Scalars)))) {
15533 Entries.push_back(Elt: *It);
15534 if ((*It)->getVectorFactor() == VL.size()) {
15535 std::iota(first: std::next(x: Mask.begin(), n: Part * VL.size()),
15536 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: 0);
15537 } else {
15538 SmallVector<int> CommonMask = TE->getCommonMask();
15539 copy(Range&: CommonMask, Out: Mask.begin());
15540 }
15541 // Clear undef scalars.
15542 for (unsigned I : seq<unsigned>(Size: VL.size()))
15543 if (isa<PoisonValue>(Val: VL[I]))
15544 Mask[Part * VL.size() + I] = PoisonMaskElem;
15545 return TargetTransformInfo::SK_PermuteSingleSrc;
15546 }
    // No perfect match, just shuffle, so choose the first entry from the
    // tree.
15549 Entries.push_back(Elt: FirstEntries.front());
15550 // Update mapping between values and corresponding tree entries.
15551 for (auto &P : UsedValuesEntry)
15552 P.second = 0;
15553 VF = FirstEntries.front()->getVectorFactor();
15554 } else {
15555 // Try to find nodes with the same vector factor.
15556 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
15557 // Keep the order of tree nodes to avoid non-determinism.
15558 DenseMap<int, const TreeEntry *> VFToTE;
15559 for (const TreeEntry *TE : UsedTEs.front()) {
15560 unsigned VF = TE->getVectorFactor();
15561 auto It = VFToTE.find(Val: VF);
15562 if (It != VFToTE.end()) {
15563 if (It->second->Idx > TE->Idx)
15564 It->getSecond() = TE;
15565 continue;
15566 }
15567 VFToTE.try_emplace(Key: VF, Args&: TE);
15568 }
15569 // Same, keep the order to avoid non-determinism.
15570 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
15571 UsedTEs.back().end());
15572 sort(C&: SecondEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
15573 return TE1->Idx < TE2->Idx;
15574 });
15575 for (const TreeEntry *TE : SecondEntries) {
15576 auto It = VFToTE.find(Val: TE->getVectorFactor());
15577 if (It != VFToTE.end()) {
15578 VF = It->first;
15579 Entries.push_back(Elt: It->second);
15580 Entries.push_back(Elt: TE);
15581 break;
15582 }
15583 }
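    // Illustrative example: if the first set contains entries with vector
    // factors {4, 8} and the second set contains an entry with vector factor
    // 8, the two 8-wide entries are paired, so a single two-source shuffle of
    // equally sized vectors can be used.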
15584 // No 2 source vectors with the same vector factor - just choose 2 with max
15585 // index.
15586 if (Entries.empty()) {
15587 Entries.push_back(Elt: *llvm::max_element(
15588 Range&: UsedTEs.front(), C: [](const TreeEntry *TE1, const TreeEntry *TE2) {
15589 return TE1->Idx < TE2->Idx;
15590 }));
15591 Entries.push_back(Elt: SecondEntries.front());
15592 VF = std::max(a: Entries.front()->getVectorFactor(),
15593 b: Entries.back()->getVectorFactor());
15594 } else {
15595 VF = Entries.front()->getVectorFactor();
15596 }
15597 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
15598 for (const TreeEntry *E : Entries)
15599 ValuesToEntries.emplace_back().insert(I: E->Scalars.begin(),
15600 E: E->Scalars.end());
15601 // Update mapping between values and corresponding tree entries.
15602 for (auto &P : UsedValuesEntry) {
15603 for (unsigned Idx : seq<unsigned>(Size: ValuesToEntries.size()))
15604 if (ValuesToEntries[Idx].contains(Ptr: P.first)) {
15605 P.second = Idx;
15606 break;
15607 }
15608 }
15609 }
15610
15611 bool IsSplatOrUndefs = isSplat(VL) || all_of(Range&: VL, P: IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible, i.e. there is a high chance that they
  // can be vectorized together.
15614 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
15615 auto *PHI = cast<PHINode>(Val: V);
15616 auto *PHI1 = cast<PHINode>(Val: V1);
    // Check that all incoming values are compatible/from the same parent (if
    // they are instructions).
    // The incoming values are compatible if they all are constants, or
    // instructions with the same/alternate opcodes from the same basic block.
15621 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
15622 Value *In = PHI->getIncomingValue(i: I);
15623 Value *In1 = PHI1->getIncomingValue(i: I);
15624 if (isConstant(V: In) && isConstant(V: In1))
15625 continue;
15626 if (!getSameOpcode(VL: {In, In1}, TLI: *TLI))
15627 return false;
15628 if (cast<Instruction>(Val: In)->getParent() !=
15629 cast<Instruction>(Val: In1)->getParent())
15630 return false;
15631 }
15632 return true;
15633 };
  // Check if the value can be ignored during analysis for shuffled gathers.
  // We suppose it is better to ignore instructions which do not form splats,
  // are not vectorized/not extractelements (these instructions will be handled
  // by extractelements processing) or may form a vector node in the future.
15638 auto MightBeIgnored = [=](Value *V) {
15639 auto *I = dyn_cast<Instruction>(Val: V);
15640 return I && !IsSplatOrUndefs && !isVectorized(V: I) &&
15641 !isVectorLikeInstWithConstOps(V: I) &&
15642 !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList) && isSimple(I);
15643 };
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V. It is possible if they have the same/alternate
  // opcode and the same parent basic block.
15647 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
15648 Value *V1 = VL[Idx];
15649 bool UsedInSameVTE = false;
15650 auto It = UsedValuesEntry.find(Val: V1);
15651 if (It != UsedValuesEntry.end())
15652 UsedInSameVTE = It->second == UsedValuesEntry.find(Val: V)->second;
15653 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
15654 getSameOpcode(VL: {V, V1}, TLI: *TLI) &&
15655 cast<Instruction>(Val: V)->getParent() ==
15656 cast<Instruction>(Val: V1)->getParent() &&
15657 (!isa<PHINode>(Val: V1) || AreCompatiblePHIs(V, V1));
15658 };
15659 // Build a shuffle mask for better cost estimation and vector emission.
15660 SmallBitVector UsedIdxs(Entries.size());
15661 SmallVector<std::pair<unsigned, int>> EntryLanes;
15662 for (int I = 0, E = VL.size(); I < E; ++I) {
15663 Value *V = VL[I];
15664 auto It = UsedValuesEntry.find(Val: V);
15665 if (It == UsedValuesEntry.end())
15666 continue;
    // Do not try to shuffle scalars if they are constants, or instructions
    // that can be vectorized as a result of subsequent buildvector
    // vectorization.
15670 if (isConstant(V) || (MightBeIgnored(V) &&
15671 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
15672 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
15673 continue;
15674 unsigned Idx = It->second;
15675 EntryLanes.emplace_back(Args&: Idx, Args&: I);
15676 UsedIdxs.set(Idx);
15677 }
15678 // Iterate through all shuffled scalars and select entries, which can be used
15679 // for final shuffle.
15680 SmallVector<const TreeEntry *> TempEntries;
15681 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
15682 if (!UsedIdxs.test(Idx: I))
15683 continue;
15684 // Fix the entry number for the given scalar. If it is the first entry, set
15685 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
15686 // These indices are used when calculating final shuffle mask as the vector
15687 // offset.
15688 for (std::pair<unsigned, int> &Pair : EntryLanes)
15689 if (Pair.first == I)
15690 Pair.first = TempEntries.size();
15691 TempEntries.push_back(Elt: Entries[I]);
15692 }
15693 Entries.swap(RHS&: TempEntries);
15694 if (EntryLanes.size() == Entries.size() &&
15695 !VL.equals(RHS: ArrayRef(TE->Scalars)
15696 .slice(N: Part * VL.size(),
15697 M: std::min<int>(a: VL.size(), b: TE->Scalars.size())))) {
    // We may have only 1 or 2 entries here. If the number of scalars is equal
    // to the number of entries, there is no need to do the analysis, it is not
    // very profitable. Since VL is not the same as TE->Scalars, it means we
    // already have some shuffles before. Cut off the unprofitable case.
15702 Entries.clear();
15703 return std::nullopt;
15704 }
15705 // Build the final mask, check for the identity shuffle, if possible.
15706 bool IsIdentity = Entries.size() == 1;
15707 // Pair.first is the offset to the vector, while Pair.second is the index of
15708 // scalar in the list.
15709 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
15710 unsigned Idx = Part * VL.size() + Pair.second;
15711 Mask[Idx] =
15712 Pair.first * VF +
15713 (ForOrder ? std::distance(
15714 first: Entries[Pair.first]->Scalars.begin(),
15715 last: find(Range: Entries[Pair.first]->Scalars, Val: VL[Pair.second]))
15716 : Entries[Pair.first]->findLaneForValue(V: VL[Pair.second]));
15717 IsIdentity &= Mask[Idx] == Pair.second;
15718 }
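  // Illustrative example: with two selected entries of vector factor 4, a pair
  // (1, 2) for scalar VL[2] found at lane 3 of the second entry yields
  // Mask[Part * VL.size() + 2] = 1 * 4 + 3 = 7.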
15719 if (ForOrder || IsIdentity || Entries.empty()) {
15720 switch (Entries.size()) {
15721 case 1:
15722 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
15723 return TargetTransformInfo::SK_PermuteSingleSrc;
15724 break;
15725 case 2:
15726 if (EntryLanes.size() > 2 || VL.size() <= 2)
15727 return TargetTransformInfo::SK_PermuteTwoSrc;
15728 break;
15729 default:
15730 break;
15731 }
15732 } else if (!isa<VectorType>(Val: VL.front()->getType()) &&
15733 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Do the cost estimation to check whether a shuffle is more beneficial
    // than a buildvector.
15735 SmallVector<int> SubMask(std::next(x: Mask.begin(), n: Part * VL.size()),
15736 std::next(x: Mask.begin(), n: (Part + 1) * VL.size()));
15737 int MinElement = SubMask.front(), MaxElement = SubMask.front();
15738 for (int Idx : SubMask) {
15739 if (Idx == PoisonMaskElem)
15740 continue;
15741 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
15742 MinElement = Idx;
15743 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
15744 MaxElement = Idx;
15745 }
15746 assert(MaxElement >= 0 && MinElement >= 0 &&
15747 MaxElement % VF >= MinElement % VF &&
15748 "Expected at least single element.");
15749 unsigned NewVF = std::max<unsigned>(
15750 a: VL.size(), b: getFullVectorNumberOfElements(TTI: *TTI, Ty: VL.front()->getType(),
15751 Sz: (MaxElement % VF) -
15752 (MinElement % VF) + 1));
15753 if (NewVF < VF) {
15754 for (int &Idx : SubMask) {
15755 if (Idx == PoisonMaskElem)
15756 continue;
15757 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
15758 (Idx >= static_cast<int>(VF) ? NewVF : 0);
15759 }
15760 } else {
15761 NewVF = VF;
15762 }
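    // Illustrative example: for VF == 16, VL.size() == 4, and a submask that
    // only touches lanes 8..11 of the single source, NewVF may shrink to 4
    // (assuming 4 elements fill whole registers for this type) and the submask
    // indices are rebased from 8..11 to 0..3.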
15763
15764 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15765 auto *VecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: NewVF);
15766 auto *MaskVecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: SubMask.size());
15767 auto GetShuffleCost = [&,
15768 &TTI = *TTI](ArrayRef<int> Mask,
15769 ArrayRef<const TreeEntry *> Entries,
15770 VectorType *VecTy) -> InstructionCost {
15771 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
15772 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
15773 Mask, Factor: Entries.front()->getInterleaveFactor()))
15774 return TTI::TCC_Free;
15775 return ::getShuffleCost(TTI,
15776 Kind: Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
15777 : TTI::SK_PermuteSingleSrc,
15778 Tp: VecTy, Mask, CostKind);
15779 };
15780 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
15781 InstructionCost FirstShuffleCost = 0;
15782 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
15783 if (Entries.size() == 1 || !Entries[0]->isGather()) {
15784 FirstShuffleCost = ShuffleCost;
15785 } else {
      // Transform the mask to include only the first entry.
15787 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
15788 bool IsIdentity = true;
15789 for (auto [I, Idx] : enumerate(First&: FirstMask)) {
15790 if (Idx >= static_cast<int>(NewVF)) {
15791 Idx = PoisonMaskElem;
15792 } else {
15793 DemandedElts.clearBit(BitPosition: I);
15794 if (Idx != PoisonMaskElem)
15795 IsIdentity &= static_cast<int>(I) == Idx;
15796 }
15797 }
15798 if (!IsIdentity)
15799 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
15800 FirstShuffleCost += getScalarizationOverhead(
15801 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
15802 /*Extract=*/false, CostKind);
15803 }
15804 InstructionCost SecondShuffleCost = 0;
15805 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
15806 if (Entries.size() == 1 || !Entries[1]->isGather()) {
15807 SecondShuffleCost = ShuffleCost;
15808 } else {
      // Transform the mask to include only the second entry.
15810 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
15811 bool IsIdentity = true;
15812 for (auto [I, Idx] : enumerate(First&: SecondMask)) {
15813 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
15814 Idx = PoisonMaskElem;
15815 } else {
15816 DemandedElts.clearBit(BitPosition: I);
15817 if (Idx != PoisonMaskElem) {
15818 Idx -= NewVF;
15819 IsIdentity &= static_cast<int>(I) == Idx;
15820 }
15821 }
15822 }
15823 if (!IsIdentity)
15824 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
15825 SecondShuffleCost += getScalarizationOverhead(
15826 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
15827 /*Extract=*/false, CostKind);
15828 }
15829 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
15830 for (auto [I, Idx] : enumerate(First&: SubMask))
15831 if (Idx == PoisonMaskElem)
15832 DemandedElts.clearBit(BitPosition: I);
15833 InstructionCost BuildVectorCost = getScalarizationOverhead(
15834 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
15835 /*Extract=*/false, CostKind);
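    // Three strategies are compared below: a shuffle of both entries, a
    // shuffle of only one entry plus insertion of the remaining scalars, and
    // building the whole subvector from scalars. The cheapest one wins; if the
    // buildvector is cheapest, the entries are dropped and a regular gather is
    // used instead.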
15836 const TreeEntry *BestEntry = nullptr;
15837 if (FirstShuffleCost < ShuffleCost) {
15838 std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()),
15839 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()),
15840 f: [&](int &Idx) {
15841 if (Idx >= static_cast<int>(VF))
15842 Idx = PoisonMaskElem;
15843 });
15844 BestEntry = Entries.front();
15845 ShuffleCost = FirstShuffleCost;
15846 }
15847 if (SecondShuffleCost < ShuffleCost) {
15848 std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()),
15849 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()),
15850 f: [&](int &Idx) {
15851 if (Idx < static_cast<int>(VF))
15852 Idx = PoisonMaskElem;
15853 else
15854 Idx -= VF;
15855 });
15856 BestEntry = Entries[1];
15857 ShuffleCost = SecondShuffleCost;
15858 }
15859 if (BuildVectorCost >= ShuffleCost) {
15860 if (BestEntry) {
15861 Entries.clear();
15862 Entries.push_back(Elt: BestEntry);
15863 }
15864 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
15865 : TargetTransformInfo::SK_PermuteSingleSrc;
15866 }
15867 }
15868 Entries.clear();
15869 // Clear the corresponding mask elements.
15870 std::fill(first: std::next(x: Mask.begin(), n: Part * VL.size()),
15871 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: PoisonMaskElem);
15872 return std::nullopt;
15873}
15874
15875SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
15876BoUpSLP::isGatherShuffledEntry(
15877 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
15878 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
15879 bool ForOrder) {
15880 assert(NumParts > 0 && NumParts < VL.size() &&
15881 "Expected positive number of registers.");
15882 Entries.clear();
15883 // No need to check for the topmost gather node.
15884 if (TE == VectorizableTree.front().get() &&
15885 (!GatheredLoadsEntriesFirst.has_value() ||
15886 none_of(Range: ArrayRef(VectorizableTree).drop_front(),
15887 P: [](const std::unique_ptr<TreeEntry> &TE) {
15888 return !TE->isGather();
15889 })))
15890 return {};
  // FIXME: Gathering for non-power-of-2 (non-whole-register) nodes is not
  // implemented yet.
15893 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI))
15894 return {};
15895 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
15896 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
15897 "Expected only single user of the gather node.");
15898 assert(VL.size() % NumParts == 0 &&
15899 "Number of scalars must be divisible by NumParts.");
15900 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
15901 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
15902 (TE->Idx == 0 ||
15903 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
15904 isSplat(VL: TE->Scalars) ||
15905 (TE->hasState() &&
15906 getSameValuesTreeEntry(V: TE->getMainOp(), VL: TE->Scalars))))
15907 return {};
15908 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
15909 SmallVector<std::optional<TTI::ShuffleKind>> Res;
15910 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
15911 ArrayRef<Value *> SubVL =
15912 VL.slice(N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part));
15913 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
15914 std::optional<TTI::ShuffleKind> SubRes =
15915 isGatherShuffledSingleRegisterEntry(TE, VL: SubVL, Mask, Entries&: SubEntries, Part,
15916 ForOrder);
15917 if (!SubRes)
15918 SubEntries.clear();
15919 Res.push_back(Elt: SubRes);
15920 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
15921 SubEntries.front()->getVectorFactor() == VL.size() &&
15922 (SubEntries.front()->isSame(VL: TE->Scalars) ||
15923 SubEntries.front()->isSame(VL))) {
15924 SmallVector<const TreeEntry *> LocalSubEntries;
15925 LocalSubEntries.swap(RHS&: SubEntries);
15926 Entries.clear();
15927 Res.clear();
15928 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
15929 // Clear undef scalars.
15930 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
15931 if (isa<PoisonValue>(Val: VL[I]))
15932 Mask[I] = PoisonMaskElem;
15933 Entries.emplace_back(Args: 1, Args&: LocalSubEntries.front());
15934 Res.push_back(Elt: TargetTransformInfo::SK_PermuteSingleSrc);
15935 return Res;
15936 }
15937 }
15938 if (all_of(Range&: Res,
15939 P: [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
15940 Entries.clear();
15941 return {};
15942 }
15943 return Res;
15944}
15945
15946InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
15947 Type *ScalarTy) const {
15948 const unsigned VF = VL.size();
15949 auto *VecTy = getWidenedType(ScalarTy, VF);
15950 // Find the cost of inserting/extracting values from the vector.
15951 // Check if the same elements are inserted several times and count them as
15952 // shuffle candidates.
15953 APInt DemandedElements = APInt::getZero(numBits: VF);
15954 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15955 InstructionCost Cost;
15956 auto EstimateInsertCost = [&](unsigned I, Value *V) {
15957 DemandedElements.setBit(I);
15958 if (V->getType() != ScalarTy)
15959 Cost += TTI->getCastInstrCost(Opcode: Instruction::Trunc, Dst: ScalarTy, Src: V->getType(),
15960 CCH: TTI::CastContextHint::None, CostKind);
15961 };
15962 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
15963 std::iota(first: ConstantShuffleMask.begin(), last: ConstantShuffleMask.end(), value: 0);
15964 for (auto [I, V] : enumerate(First&: VL)) {
15965 // No need to shuffle duplicates for constants.
15966 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(Val: V))
15967 continue;
15968
15969 if (isConstant(V)) {
15970 ConstantShuffleMask[I] = I + VF;
15971 continue;
15972 }
15973 EstimateInsertCost(I, V);
15974 }
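  // Illustrative example (assuming ForPoisonSrc is false): for
  // VL = {%x, 7, poison, %y}, lanes 0 and 3 are counted as scalar insertions,
  // lane 1 sets ConstantShuffleMask[1] to 1 + VF so the constant is taken from
  // a materialized constant vector by the two-source shuffle below, and the
  // poison lane is ignored.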
15975 // FIXME: add a cost for constant vector materialization.
15976 bool IsAnyNonUndefConst =
15977 any_of(Range&: VL, P: [](Value *V) { return !isa<UndefValue>(Val: V) && isConstant(V); });
15978 // 1. Shuffle input source vector and constant vector.
15979 if (!ForPoisonSrc && IsAnyNonUndefConst) {
15980 Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: VecTy,
15981 Mask: ConstantShuffleMask);
15982 }
15983
15984 // 2. Insert unique non-constants.
15985 if (!DemandedElements.isZero())
15986 Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: DemandedElements,
15987 /*Insert=*/true,
15988 /*Extract=*/false, CostKind,
15989 ForPoisonSrc: ForPoisonSrc && !IsAnyNonUndefConst, VL);
15990 return Cost;
15991}
15992
15993Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
15994 auto It = EntryToLastInstruction.find(Val: E);
15995 if (It != EntryToLastInstruction.end())
15996 return *cast<Instruction>(Val&: It->second);
15997 Instruction *Res = nullptr;
15998 // Get the basic block this bundle is in. All instructions in the bundle
15999 // should be in this block (except for extractelement-like instructions with
16000 // constant indices or gathered loads).
16001 auto *Front = E->getMainOp();
16002 auto *BB = Front->getParent();
16003 assert(((GatheredLoadsEntriesFirst.has_value() &&
16004 E->getOpcode() == Instruction::Load && E->isGather() &&
16005 E->Idx < *GatheredLoadsEntriesFirst) ||
16006 E->State == TreeEntry::SplitVectorize ||
16007 all_of(E->Scalars,
16008 [=](Value *V) -> bool {
16009 if (E->getOpcode() == Instruction::GetElementPtr &&
16010 !isa<GetElementPtrInst>(V))
16011 return true;
16012 auto *I = dyn_cast<Instruction>(V);
16013 return !I || !E->getMatchingMainOpOrAltOp(I) ||
16014 I->getParent() == BB ||
16015 isVectorLikeInstWithConstOps(I);
16016 })) &&
16017 "Expected gathered loads or GEPs or instructions from same basic "
16018 "block.");
16019
16020 auto FindLastInst = [&]() {
16021 Instruction *LastInst = Front;
16022 for (Value *V : E->Scalars) {
16023 auto *I = dyn_cast<Instruction>(Val: V);
16024 if (!I)
16025 continue;
16026 if (LastInst->getParent() == I->getParent()) {
16027 if (LastInst->comesBefore(Other: I))
16028 LastInst = I;
16029 continue;
16030 }
16031 assert(((E->getOpcode() == Instruction::GetElementPtr &&
16032 !isa<GetElementPtrInst>(I)) ||
16033 E->State == TreeEntry::SplitVectorize ||
16034 (isVectorLikeInstWithConstOps(LastInst) &&
16035 isVectorLikeInstWithConstOps(I)) ||
16036 (GatheredLoadsEntriesFirst.has_value() &&
16037 E->getOpcode() == Instruction::Load && E->isGather() &&
16038 E->Idx < *GatheredLoadsEntriesFirst)) &&
16039 "Expected vector-like or non-GEP in GEP node insts only.");
16040 if (!DT->isReachableFromEntry(A: LastInst->getParent())) {
16041 LastInst = I;
16042 continue;
16043 }
16044 if (!DT->isReachableFromEntry(A: I->getParent()))
16045 continue;
16046 auto *NodeA = DT->getNode(BB: LastInst->getParent());
16047 auto *NodeB = DT->getNode(BB: I->getParent());
16048 assert(NodeA && "Should only process reachable instructions");
16049 assert(NodeB && "Should only process reachable instructions");
16050 assert((NodeA == NodeB) ==
16051 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
16052 "Different nodes should have different DFS numbers");
16053 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
16054 LastInst = I;
16055 }
16056 BB = LastInst->getParent();
16057 return LastInst;
16058 };
16059
16060 auto FindFirstInst = [&]() {
16061 Instruction *FirstInst = Front;
16062 for (Value *V : E->Scalars) {
16063 auto *I = dyn_cast<Instruction>(Val: V);
16064 if (!I)
16065 continue;
16066 if (FirstInst->getParent() == I->getParent()) {
16067 if (I->comesBefore(Other: FirstInst))
16068 FirstInst = I;
16069 continue;
16070 }
16071 assert(((E->getOpcode() == Instruction::GetElementPtr &&
16072 !isa<GetElementPtrInst>(I)) ||
16073 (isVectorLikeInstWithConstOps(FirstInst) &&
16074 isVectorLikeInstWithConstOps(I))) &&
16075 "Expected vector-like or non-GEP in GEP node insts only.");
16076 if (!DT->isReachableFromEntry(A: FirstInst->getParent())) {
16077 FirstInst = I;
16078 continue;
16079 }
16080 if (!DT->isReachableFromEntry(A: I->getParent()))
16081 continue;
16082 auto *NodeA = DT->getNode(BB: FirstInst->getParent());
16083 auto *NodeB = DT->getNode(BB: I->getParent());
16084 assert(NodeA && "Should only process reachable instructions");
16085 assert(NodeB && "Should only process reachable instructions");
16086 assert((NodeA == NodeB) ==
16087 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
16088 "Different nodes should have different DFS numbers");
16089 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
16090 FirstInst = I;
16091 }
16092 return FirstInst;
16093 };
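  // Both helpers order instructions from different blocks by their
  // dominator-tree DFS-in numbers: a block with a larger DFS-in number is
  // treated as coming later, which serves as a proxy for program order across
  // blocks here.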
16094
16095 if (E->State == TreeEntry::SplitVectorize) {
16096 Res = FindLastInst();
16097 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V: Res); !Entries.empty()) {
16098 for (auto *E : Entries) {
16099 auto *I = dyn_cast_or_null<Instruction>(Val&: E->VectorizedValue);
16100 if (!I)
16101 I = &getLastInstructionInBundle(E);
16102 if (Res->getParent() == I->getParent() && Res->comesBefore(Other: I))
16103 Res = I;
16104 }
16105 }
16106 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
16107 return *Res;
16108 }
16109
16110 // Set insertpoint for gathered loads to the very first load.
16111 if (GatheredLoadsEntriesFirst.has_value() &&
16112 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
16113 E->getOpcode() == Instruction::Load) {
16114 Res = FindFirstInst();
16115 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
16116 return *Res;
16117 }
16118
16119 // Set the insert point to the beginning of the basic block if the entry
16120 // should not be scheduled.
16121 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
16122 if (E->isGather())
16123 return nullptr;
    // Found previously that the instruction does not need to be scheduled.
16125 const auto *It = BlocksSchedules.find(Key: BB);
16126 if (It == BlocksSchedules.end())
16127 return nullptr;
16128 for (Value *V : E->Scalars) {
16129 auto *I = dyn_cast<Instruction>(Val: V);
16130 if (!I || isa<PHINode>(Val: I) || doesNotNeedToBeScheduled(V: I))
16131 continue;
16132 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(V: I);
16133 if (Bundles.empty())
16134 continue;
16135 const auto *It = find_if(
16136 Range&: Bundles, P: [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
16137 if (It != Bundles.end())
16138 return *It;
16139 }
16140 return nullptr;
16141 };
16142 const ScheduleBundle *Bundle = FindScheduleBundle(E);
16143 if (!E->isGather() && !Bundle) {
16144 if ((E->getOpcode() == Instruction::GetElementPtr &&
16145 any_of(Range: E->Scalars,
16146 P: [](Value *V) {
16147 return !isa<GetElementPtrInst>(Val: V) && isa<Instruction>(Val: V);
16148 })) ||
16149 all_of(Range: E->Scalars, P: [](Value *V) {
16150 return isa<PoisonValue>(Val: V) ||
16151 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
16152 }))
16153 Res = FindLastInst();
16154 else
16155 Res = FindFirstInst();
16156 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
16157 return *Res;
16158 }
16159
  // Find the last instruction. The common case should be that BB has been
  // scheduled, in which case the last instruction of the bundle is simply the
  // last member of the schedule bundle built for this tree entry.
16164 if (Bundle) {
16165 assert(!E->isGather() && "Gathered instructions should not be scheduled");
16166 Res = Bundle->getBundle().back()->getInst();
16167 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
16168 return *Res;
16169 }
16170
16171 // LastInst can still be null at this point if there's either not an entry
16172 // for BB in BlocksSchedules or there's no ScheduleData available for
16173 // VL.back(). This can be the case if buildTreeRec aborts for various
16174 // reasons (e.g., the maximum recursion depth is reached, the maximum region
16175 // size is reached, etc.). ScheduleData is initialized in the scheduling
16176 // "dry-run".
16177 //
16178 // If this happens, we can still find the last instruction by brute force. We
16179 // iterate forwards from Front (inclusive) until we either see all
16180 // instructions in the bundle or reach the end of the block. If Front is the
16181 // last instruction in program order, LastInst will be set to Front, and we
16182 // will visit all the remaining instructions in the block.
16183 //
16184 // One of the reasons we exit early from buildTreeRec is to place an upper
16185 // bound on compile-time. Thus, taking an additional compile-time hit here is
16186 // not ideal. However, this should be exceedingly rare since it requires that
16187 // we both exit early from buildTreeRec and that the bundle be out-of-order
16188 // (causing us to iterate all the way to the end of the block).
16189 if (!Res)
16190 Res = FindLastInst();
16191 assert(Res && "Failed to find last instruction in bundle");
16192 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
16193 return *Res;
16194}
16195
16196void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
16197 auto *Front = E->getMainOp();
16198 Instruction *LastInst = &getLastInstructionInBundle(E);
16199 assert(LastInst && "Failed to find last instruction in bundle");
16200 BasicBlock::iterator LastInstIt = LastInst->getIterator();
16201 // If the instruction is PHI, set the insert point after all the PHIs.
16202 bool IsPHI = isa<PHINode>(Val: LastInst);
16203 if (IsPHI) {
16204 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
16205 if (LastInstIt != LastInst->getParent()->end() &&
16206 LastInstIt->getParent()->isLandingPad())
16207 LastInstIt = std::next(x: LastInstIt);
16208 }
16209 if (IsPHI ||
16210 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
16211 E->doesNotNeedToSchedule()) ||
16212 (GatheredLoadsEntriesFirst.has_value() &&
16213 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
16214 E->getOpcode() == Instruction::Load)) {
16215 Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: LastInstIt);
16216 } else {
16217 // Set the insertion point after the last instruction in the bundle. Set the
16218 // debug location to Front.
16219 Builder.SetInsertPoint(
16220 TheBB: LastInst->getParent(),
16221 IP: LastInst->getNextNonDebugInstruction()->getIterator());
16222 }
16223 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
16224}
16225
16226Value *BoUpSLP::gather(
16227 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
16228 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from the current block and/or the blocks which
  // are part of the current loop. These instructions will be inserted at the
  // end to make it possible to optimize loops and hoist invariant instructions
  // out of the loop's body with better chances for success.
16233 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
16234 SmallSet<int, 4> PostponedIndices;
16235 Loop *L = LI->getLoopFor(BB: Builder.GetInsertBlock());
16236 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
16237 SmallPtrSet<BasicBlock *, 4> Visited;
16238 while (InsertBB && InsertBB != InstBB && Visited.insert(Ptr: InsertBB).second)
16239 InsertBB = InsertBB->getSinglePredecessor();
16240 return InsertBB && InsertBB == InstBB;
16241 };
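  // CheckPredecessor returns true if InstBB is InsertBB itself or is reached
  // from InsertBB by repeatedly following unique single predecessors, i.e. the
  // instruction's block lies on a straight-line CFG path leading into the
  // insertion block.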
16242 for (int I = 0, E = VL.size(); I < E; ++I) {
16243 if (auto *Inst = dyn_cast<Instruction>(Val: VL[I]))
16244 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
16245 isVectorized(V: Inst) ||
16246 (L && (!Root || L->isLoopInvariant(V: Root)) && L->contains(Inst))) &&
16247 PostponedIndices.insert(V: I).second)
16248 PostponedInsts.emplace_back(Args&: Inst, Args&: I);
16249 }
16250
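  // Creates an insertelement of scalar V into Vec at position Pos. If the
  // scalar type differs from Ty, the value is int-casted first; for
  // revectorized scalars (vectors themselves) an insertvector is used instead.
  // The created instruction is recorded for later CSE and, if the scalar is
  // part of a vectorized node, registered as an external use.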
16251 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
16252 Type *Ty) {
16253 Value *Scalar = V;
16254 if (Scalar->getType() != Ty) {
16255 assert(Scalar->getType()->isIntOrIntVectorTy() &&
16256 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
16257 Value *V = Scalar;
16258 if (auto *CI = dyn_cast<CastInst>(Val: Scalar);
16259 isa_and_nonnull<SExtInst, ZExtInst>(Val: CI)) {
16260 Value *Op = CI->getOperand(i_nocapture: 0);
16261 if (auto *IOp = dyn_cast<Instruction>(Val: Op);
16262 !IOp || !(isDeleted(I: IOp) || isVectorized(V: IOp)))
16263 V = Op;
16264 }
16265 Scalar = Builder.CreateIntCast(
16266 V, DestTy: Ty, isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
16267 }
16268
16269 Instruction *InsElt;
16270 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Scalar->getType())) {
16271 assert(SLPReVec && "FixedVectorType is not expected.");
16272 Vec =
16273 createInsertVector(Builder, Vec, V: Scalar, Index: Pos * getNumElements(Ty: VecTy));
16274 auto *II = dyn_cast<IntrinsicInst>(Val: Vec);
16275 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
16276 return Vec;
16277 InsElt = II;
16278 } else {
16279 Vec = Builder.CreateInsertElement(Vec, NewElt: Scalar, Idx: Builder.getInt32(C: Pos));
16280 InsElt = dyn_cast<InsertElementInst>(Val: Vec);
16281 if (!InsElt)
16282 return Vec;
16283 }
16284 GatherShuffleExtractSeq.insert(X: InsElt);
16285 CSEBlocks.insert(V: InsElt->getParent());
16286 // Add to our 'need-to-extract' list.
16287 if (isa<Instruction>(Val: V)) {
16288 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
16289 // Find which lane we need to extract.
16290 User *UserOp = nullptr;
16291 if (Scalar != V) {
16292 if (auto *SI = dyn_cast<Instruction>(Val: Scalar))
16293 UserOp = SI;
16294 } else {
16295 UserOp = InsElt;
16296 }
16297 if (UserOp) {
16298 unsigned FoundLane = Entries.front()->findLaneForValue(V);
16299 ExternalUses.emplace_back(Args&: V, Args&: UserOp, Args&: *Entries.front(), Args&: FoundLane);
16300 }
16301 }
16302 }
16303 return Vec;
16304 };
16305 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
16306 Value *Vec = PoisonValue::get(T: VecTy);
16307 SmallVector<int> NonConsts;
16308 SmallVector<int> Mask(VL.size());
16309 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
16310 Value *OriginalRoot = Root;
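  // If the root is a single-source shuffle of a vector of the expected type,
  // look through it: gather on top of its source operand and adopt its mask.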
16311 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Val: Root);
16312 SV && isa<PoisonValue>(Val: SV->getOperand(i_nocapture: 1)) &&
16313 SV->getOperand(i_nocapture: 0)->getType() == VecTy) {
16314 Root = SV->getOperand(i_nocapture: 0);
16315 Mask.assign(in_start: SV->getShuffleMask().begin(), in_end: SV->getShuffleMask().end());
16316 }
  // Insert constant values first.
16318 for (int I = 0, E = VL.size(); I < E; ++I) {
16319 if (PostponedIndices.contains(V: I))
16320 continue;
16321 if (!isConstant(V: VL[I])) {
16322 NonConsts.push_back(Elt: I);
16323 continue;
16324 }
16325 if (isa<PoisonValue>(Val: VL[I]))
16326 continue;
16327 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
16328 Mask[I] = I + E;
16329 }
16330 if (Root) {
16331 if (isa<PoisonValue>(Val: Vec)) {
16332 Vec = OriginalRoot;
16333 } else {
16334 Vec = CreateShuffle(Root, Vec, Mask);
16335 if (auto *OI = dyn_cast<Instruction>(Val: OriginalRoot);
16336 OI && OI->use_empty() &&
16337 none_of(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
16338 return TE->VectorizedValue == OI;
16339 }))
16340 eraseInstruction(I: OI);
16341 }
16342 }
16343 // Insert non-constant values.
16344 for (int I : NonConsts)
16345 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions which are/may be part of the loop at the end, to make
  // it possible to hoist non-loop-based instructions.
16348 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
16349 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
16350
16351 return Vec;
16352}
16353
/// Merges shuffle masks and emits the final shuffle instruction, if required.
/// It supports shuffling of 2 input vectors. It implements lazy shuffle
/// emission: the actual shuffle instruction is generated only if it is really
/// required. Otherwise, the shuffle instruction emission is delayed until the
/// end of the process, to reduce the number of emitted instructions and to
/// simplify further analysis/transformations.
/// The class will also look through previously emitted shuffle instructions
/// and properly mark indices in the mask as undef.
16362/// For example, given the code
16363/// \code
16364/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
16365/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
16366/// \endcode
/// and we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
16368/// look through %s1 and %s2 and emit
16369/// \code
16370/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
16371/// \endcode
16372/// instead.
/// If the 2 operands are of different sizes, the smaller one will be resized
/// and the mask recalculated accordingly.
16375/// For example, given the code
16376/// \code
16377/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
16378/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
16379/// \endcode
/// and we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
16381/// look through %s1 and %s2 and emit
16382/// \code
16383/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
16384/// \endcode
16385/// instead.
16386class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
16387 bool IsFinalized = false;
16388 /// Combined mask for all applied operands and masks. It is built during
16389 /// analysis and actual emission of shuffle vector instructions.
16390 SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at most 2
  /// operands. If a 3rd one is going to be added, the first 2 are combined into
  /// a shuffle with the \p CommonMask mask, the first operand is set to the
  /// resulting shuffle and the second operand is set to the newly added
  /// operand. \p CommonMask is transformed accordingly after that.
16396 SmallVector<Value *, 2> InVectors;
16397 IRBuilderBase &Builder;
16398 BoUpSLP &R;
16399
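  /// Helper that emits the actual shufflevector (and cast) instructions for
  /// BaseShuffleAnalysis::createShuffle and records them for later CSE.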
16400 class ShuffleIRBuilder {
16401 IRBuilderBase &Builder;
16402 /// Holds all of the instructions that we gathered.
16403 SetVector<Instruction *> &GatherShuffleExtractSeq;
16404 /// A list of blocks that we are going to CSE.
16405 DenseSet<BasicBlock *> &CSEBlocks;
16406 /// Data layout.
16407 const DataLayout &DL;
16408
16409 public:
16410 ShuffleIRBuilder(IRBuilderBase &Builder,
16411 SetVector<Instruction *> &GatherShuffleExtractSeq,
16412 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
16413 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
16414 CSEBlocks(CSEBlocks), DL(DL) {}
16415 ~ShuffleIRBuilder() = default;
16416 /// Creates shufflevector for the 2 operands with the given mask.
16417 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
16418 if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
16422 if (V1->getType() != V2->getType()) {
16423 if (cast<VectorType>(Val: V2->getType())
16424 ->getElementType()
16425 ->getIntegerBitWidth() < cast<VectorType>(Val: V1->getType())
16426 ->getElementType()
16427 ->getIntegerBitWidth())
16428 V2 = Builder.CreateIntCast(
16429 V: V2, DestTy: V1->getType(), isSigned: !isKnownNonNegative(V: V2, SQ: SimplifyQuery(DL)));
16430 else
16431 V1 = Builder.CreateIntCast(
16432 V: V1, DestTy: V2->getType(), isSigned: !isKnownNonNegative(V: V1, SQ: SimplifyQuery(DL)));
16433 }
16434 }
16435 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
16436 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
16437 GatherShuffleExtractSeq.insert(X: I);
16438 CSEBlocks.insert(V: I->getParent());
16439 }
16440 return Vec;
16441 }
    /// Creates a permutation of the single vector operand with the given
    /// mask, if it is not an identity mask.
16444 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
16445 if (Mask.empty())
16446 return V1;
16447 unsigned VF = Mask.size();
16448 unsigned LocalVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
16449 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))
16450 return V1;
16451 Value *Vec = Builder.CreateShuffleVector(V: V1, Mask);
16452 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
16453 GatherShuffleExtractSeq.insert(X: I);
16454 CSEBlocks.insert(V: I->getParent());
16455 }
16456 return Vec;
16457 }
16458 Value *createIdentity(Value *V) { return V; }
16459 Value *createPoison(Type *Ty, unsigned VF) {
16460 return PoisonValue::get(T: getWidenedType(ScalarTy: Ty, VF));
16461 }
    /// Resizes the 2 input vectors to match in size, if they are not equal
    /// yet. The smaller vector is resized to the size of the larger vector.
16464 void resizeToMatch(Value *&V1, Value *&V2) {
16465 if (V1->getType() == V2->getType())
16466 return;
16467 int V1VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
16468 int V2VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
16469 int VF = std::max(a: V1VF, b: V2VF);
16470 int MinVF = std::min(a: V1VF, b: V2VF);
16471 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
16472 std::iota(first: IdentityMask.begin(), last: std::next(x: IdentityMask.begin(), n: MinVF),
16473 value: 0);
16474 Value *&Op = MinVF == V1VF ? V1 : V2;
16475 Op = Builder.CreateShuffleVector(V: Op, Mask: IdentityMask);
16476 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
16477 GatherShuffleExtractSeq.insert(X: I);
16478 CSEBlocks.insert(V: I->getParent());
16479 }
16480 if (MinVF == V1VF)
16481 V1 = Op;
16482 else
16483 V2 = Op;
16484 }
16485 };
16486
  /// Smart shuffle instruction emission: walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
16490 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
16491 assert(V1 && "Expected at least one vector value.");
16492 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
16493 R.CSEBlocks, *R.DL);
16494 return BaseShuffleAnalysis::createShuffle<Value *>(
16495 V1, V2, Mask, Builder&: ShuffleBuilder, ScalarTy);
16496 }
16497
16498 /// Cast value \p V to the vector type with the same number of elements, but
16499 /// the base type \p ScalarTy.
16500 Value *castToScalarTyElem(Value *V,
16501 std::optional<bool> IsSigned = std::nullopt) {
16502 auto *VecTy = cast<VectorType>(Val: V->getType());
16503 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
16504 if (VecTy->getElementType() == ScalarTy->getScalarType())
16505 return V;
16506 return Builder.CreateIntCast(
16507 V, DestTy: VectorType::get(ElementType: ScalarTy->getScalarType(), EC: VecTy->getElementCount()),
16508 isSigned: IsSigned.value_or(u: !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL))));
16509 }
16510
16511public:
16512 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
16513 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
16514
16515 /// Adjusts extractelements after reusing them.
16516 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
16517 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
16518 unsigned NumParts, bool &UseVecBaseAsInput) {
16519 UseVecBaseAsInput = false;
16520 SmallPtrSet<Value *, 4> UniqueBases;
16521 Value *VecBase = nullptr;
16522 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
16523 if (!E->ReorderIndices.empty()) {
16524 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
16525 E->ReorderIndices.end());
16526 reorderScalars(Scalars&: VL, Mask: ReorderMask);
16527 }
16528 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
16529 int Idx = Mask[I];
16530 if (Idx == PoisonMaskElem)
16531 continue;
16532 auto *EI = cast<ExtractElementInst>(Val: VL[I]);
16533 VecBase = EI->getVectorOperand();
16534 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecBase); !TEs.empty())
16535 VecBase = TEs.front()->VectorizedValue;
16536 assert(VecBase && "Expected vectorized value.");
16537 UniqueBases.insert(Ptr: VecBase);
      // If the only use is vectorized - we can delete the extractelement
      // itself.
16540 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(Ptr: EI) ||
16541 (NumParts != 1 && count(Range&: VL, Element: EI) > 1) ||
16542 any_of(Range: EI->users(), P: [&](User *U) {
16543 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(V: U);
16544 return UTEs.empty() || UTEs.size() > 1 ||
16545 (isa<GetElementPtrInst>(Val: U) &&
16546 !R.areAllUsersVectorized(I: cast<Instruction>(Val: U))) ||
16547 (!UTEs.empty() &&
16548 count_if(Range&: R.VectorizableTree,
16549 P: [&](const std::unique_ptr<TreeEntry> &TE) {
16550 return TE->UserTreeIndex.UserTE ==
16551 UTEs.front() &&
16552 is_contained(Range&: VL, Element: EI);
16553 }) != 1);
16554 }))
16555 continue;
16556 R.eraseInstruction(I: EI);
16557 }
16558 if (NumParts == 1 || UniqueBases.size() == 1) {
16559 assert(VecBase && "Expected vectorized value.");
16560 return castToScalarTyElem(V: VecBase);
16561 }
16562 UseVecBaseAsInput = true;
16563 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
16564 for (auto [I, Idx] : enumerate(First&: Mask))
16565 if (Idx != PoisonMaskElem)
16566 Idx = I;
16567 };
    // Perform a multi-register vector shuffle, joining the parts into a single
    // virtual long vector.
    // Need to shuffle each part independently and then insert all these parts
    // into a long virtual vector register, forming the original vector.
16572 Value *Vec = nullptr;
16573 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
16574 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
16575 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
16576 unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
16577 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(N: Part * SliceSize, M: Limit);
16578 MutableArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
16579 constexpr int MaxBases = 2;
16580 SmallVector<Value *, MaxBases> Bases(MaxBases);
16581 auto VLMask = zip(t&: SubVL, u&: SubMask);
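      // Width of the widest source vector among the extracts in this slice;
      // it determines which of the (at most 2) bases each lane belongs to.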
16582 const unsigned VF = std::accumulate(
16583 first: VLMask.begin(), last: VLMask.end(), init: 0U, binary_op: [&](unsigned S, const auto &D) {
16584 if (std::get<1>(D) == PoisonMaskElem)
16585 return S;
16586 Value *VecOp =
16587 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
16588 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp);
16589 !TEs.empty())
16590 VecOp = TEs.front()->VectorizedValue;
16591 assert(VecOp && "Expected vectorized value.");
16592 const unsigned Size =
16593 cast<FixedVectorType>(Val: VecOp->getType())->getNumElements();
16594 return std::max(a: S, b: Size);
16595 });
16596 for (const auto [V, I] : VLMask) {
16597 if (I == PoisonMaskElem)
16598 continue;
16599 Value *VecOp = cast<ExtractElementInst>(Val: V)->getVectorOperand();
16600 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp); !TEs.empty())
16601 VecOp = TEs.front()->VectorizedValue;
16602 assert(VecOp && "Expected vectorized value.");
16603 VecOp = castToScalarTyElem(V: VecOp);
16604 Bases[I / VF] = VecOp;
16605 }
16606 if (!Bases.front())
16607 continue;
16608 Value *SubVec;
16609 if (Bases.back()) {
16610 SubVec = createShuffle(V1: Bases.front(), V2: Bases.back(), Mask: SubMask);
16611 TransformToIdentity(SubMask);
16612 } else {
16613 SubVec = Bases.front();
16614 }
16615 if (!Vec) {
16616 Vec = SubVec;
16617 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
16618 [&](unsigned P) {
16619 ArrayRef<int> SubMask =
16620 Mask.slice(P * SliceSize,
16621 getNumElems(Mask.size(),
16622 SliceSize, P));
16623 return all_of(SubMask, [](int Idx) {
16624 return Idx == PoisonMaskElem;
16625 });
16626 })) &&
16627 "Expected first part or all previous parts masked.");
16628 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
16629 } else {
16630 unsigned NewVF =
16631 cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
16632 if (Vec->getType() != SubVec->getType()) {
16633 unsigned SubVecVF =
16634 cast<FixedVectorType>(Val: SubVec->getType())->getNumElements();
16635 NewVF = std::max(a: NewVF, b: SubVecVF);
16636 }
16637 // Adjust SubMask.
16638 for (int &Idx : SubMask)
16639 if (Idx != PoisonMaskElem)
16640 Idx += NewVF;
16641 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
16642 Vec = createShuffle(V1: Vec, V2: SubVec, Mask: VecMask);
16643 TransformToIdentity(VecMask);
16644 }
16645 }
16646 copy(Range&: VecMask, Out: Mask.begin());
16647 return Vec;
16648 }
16649 /// Checks if the specified entry \p E needs to be delayed because of its
16650 /// dependency nodes.
16651 std::optional<Value *>
16652 needToDelay(const TreeEntry *E,
16653 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
16654 // No need to delay emission if all deps are ready.
16655 if (all_of(Range&: Deps, P: [](ArrayRef<const TreeEntry *> TEs) {
16656 return all_of(
16657 Range&: TEs, P: [](const TreeEntry *TE) { return TE->VectorizedValue; });
16658 }))
16659 return std::nullopt;
16660 // Postpone gather emission, will be emitted after the end of the
16661 // process to keep correct order.
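    // Return a placeholder value (a load of the widened type from a poison
    // pointer); it stands in for the gather and is expected to be replaced
    // once the delayed entries have been vectorized.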
16662 auto *ResVecTy = getWidenedType(ScalarTy, VF: E->getVectorFactor());
16663 return Builder.CreateAlignedLoad(
16664 Ty: ResVecTy,
16665 Ptr: PoisonValue::get(T: PointerType::getUnqual(C&: ScalarTy->getContext())),
16666 Align: MaybeAlign());
16667 }
16668 /// Reset the builder to handle perfect diamond match.
16669 void resetForSameNode() {
16670 IsFinalized = false;
16671 CommonMask.clear();
16672 InVectors.clear();
16673 }
  /// Adds 2 input vectors (in the form of tree entries) and the mask for their
16675 /// shuffling.
16676 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
16677 Value *V1 = E1.VectorizedValue;
16678 if (V1->getType()->isIntOrIntVectorTy())
16679 V1 = castToScalarTyElem(V: V1, IsSigned: any_of(Range: E1.Scalars, P: [&](Value *V) {
16680 if (isa<PoisonValue>(Val: V))
16681 return false;
16682 return !isKnownNonNegative(
16683 V, SQ: SimplifyQuery(*R.DL));
16684 }));
16685 Value *V2 = E2.VectorizedValue;
16686 if (V2->getType()->isIntOrIntVectorTy())
16687 V2 = castToScalarTyElem(V: V2, IsSigned: any_of(Range: E2.Scalars, P: [&](Value *V) {
16688 if (isa<PoisonValue>(Val: V))
16689 return false;
16690 return !isKnownNonNegative(
16691 V, SQ: SimplifyQuery(*R.DL));
16692 }));
16693 add(V1, V2, Mask);
16694 }
  /// Adds a single input vector (in the form of a tree entry) and the mask
  /// for its shuffling.
16697 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
16698 Value *V1 = E1.VectorizedValue;
16699 if (V1->getType()->isIntOrIntVectorTy())
16700 V1 = castToScalarTyElem(V: V1, IsSigned: any_of(Range: E1.Scalars, P: [&](Value *V) {
16701 if (isa<PoisonValue>(Val: V))
16702 return false;
16703 return !isKnownNonNegative(
16704 V, SQ: SimplifyQuery(*R.DL));
16705 }));
16706 add(V1, Mask);
16707 }
16708 /// Adds 2 input vectors and the mask for their shuffling.
16709 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
16710 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
16711 assert(isa<FixedVectorType>(V1->getType()) &&
16712 isa<FixedVectorType>(V2->getType()) &&
16713 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
16714 V1 = castToScalarTyElem(V: V1);
16715 V2 = castToScalarTyElem(V: V2);
16716 if (InVectors.empty()) {
16717 InVectors.push_back(Elt: V1);
16718 InVectors.push_back(Elt: V2);
16719 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
16720 return;
16721 }
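    // There are already accumulated operands: fold them into a single vector
    // (normalizing its width to the mask size if needed), shuffle the new pair
    // together and keep it as the second operand, pointing the affected lanes
    // of CommonMask at it.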
16722 Value *Vec = InVectors.front();
16723 if (InVectors.size() == 2) {
16724 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
16725 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
16726 } else if (cast<FixedVectorType>(Val: Vec->getType())->getNumElements() !=
16727 Mask.size()) {
16728 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
16729 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
16730 }
16731 V1 = createShuffle(V1, V2, Mask);
16732 unsigned VF = std::max(a: getVF(V: V1), b: getVF(V: Vec));
16733 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
16734 if (Mask[Idx] != PoisonMaskElem)
16735 CommonMask[Idx] = Idx + VF;
16736 InVectors.front() = Vec;
16737 if (InVectors.size() == 2)
16738 InVectors.back() = V1;
16739 else
16740 InVectors.push_back(Elt: V1);
16741 }
  /// Adds one more input vector and the mask for its shuffling.
16743 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
16744 assert(isa<FixedVectorType>(V1->getType()) &&
16745 "castToScalarTyElem expects V1 to be FixedVectorType");
16746 V1 = castToScalarTyElem(V: V1);
16747 if (InVectors.empty()) {
16748 InVectors.push_back(Elt: V1);
16749 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
16750 return;
16751 }
16752 const auto *It = find(Range&: InVectors, Val: V1);
16753 if (It == InVectors.end()) {
16754 if (InVectors.size() == 2 ||
16755 InVectors.front()->getType() != V1->getType()) {
16756 Value *V = InVectors.front();
16757 if (InVectors.size() == 2) {
16758 V = createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
16759 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
16760 } else if (cast<FixedVectorType>(Val: V->getType())->getNumElements() !=
16761 CommonMask.size()) {
16762 V = createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
16763 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
16764 }
16765 unsigned VF = std::max(a: CommonMask.size(), b: Mask.size());
16766 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
16767 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
16768 CommonMask[Idx] = V->getType() != V1->getType()
16769 ? Idx + VF
16770 : Mask[Idx] + getVF(V: V1);
16771 if (V->getType() != V1->getType())
16772 V1 = createShuffle(V1, V2: nullptr, Mask);
16773 InVectors.front() = V;
16774 if (InVectors.size() == 2)
16775 InVectors.back() = V1;
16776 else
16777 InVectors.push_back(Elt: V1);
16778 return;
16779 }
16780 // Check if second vector is required if the used elements are already
16781 // used from the first one.
16782 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
16783 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
16784 InVectors.push_back(Elt: V1);
16785 break;
16786 }
16787 }
16788 unsigned VF = 0;
16789 for (Value *V : InVectors)
16790 VF = std::max(a: VF, b: getVF(V));
16791 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
16792 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
16793 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
16794 }
  /// Adds one more input vector and the mask for its shuffling.
16796 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
16797 SmallVector<int> NewMask;
16798 inversePermutation(Indices: Order, Mask&: NewMask);
16799 add(V1, Mask: NewMask);
16800 }
16801 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
16802 Value *Root = nullptr) {
16803 return R.gather(VL, Root, ScalarTy,
16804 CreateShuffle: [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
16805 return createShuffle(V1, V2, Mask);
16806 });
16807 }
16808 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
16809 /// Finalize emission of the shuffles.
  /// \param Action the action (if any) to be performed before the final
  /// application of the \p ExtMask mask.
16812 Value *finalize(
16813 ArrayRef<int> ExtMask,
16814 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
16815 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
16816 function_ref<void(Value *&, SmallVectorImpl<int> &,
16817 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
16818 Action = {}) {
16819 IsFinalized = true;
16820 if (Action) {
16821 Value *Vec = InVectors.front();
16822 if (InVectors.size() == 2) {
16823 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
16824 InVectors.pop_back();
16825 } else {
16826 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
16827 }
16828 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
16829 assert(VF > 0 &&
16830 "Expected vector length for the final value before action.");
16831 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
16832 if (VecVF < VF) {
16833 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16834 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
16835 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: ResizeMask);
16836 }
16837 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
16838 return createShuffle(V1, V2, Mask);
16839 });
16840 InVectors.front() = Vec;
16841 }
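    // Insert the requested subvectors into the partially built vector and
    // update the masks so later shuffles reference the combined result.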
16842 if (!SubVectors.empty()) {
16843 Value *Vec = InVectors.front();
16844 if (InVectors.size() == 2) {
16845 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
16846 InVectors.pop_back();
16847 } else {
16848 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
16849 }
16850 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
16851 auto CreateSubVectors = [&](Value *Vec,
16852 SmallVectorImpl<int> &CommonMask) {
16853 for (auto [E, Idx] : SubVectors) {
16854 Value *V = E->VectorizedValue;
16855 if (V->getType()->isIntOrIntVectorTy())
16856 V = castToScalarTyElem(V, IsSigned: any_of(Range: E->Scalars, P: [&](Value *V) {
16857 if (isa<PoisonValue>(Val: V))
16858 return false;
16859 return !isKnownNonNegative(
16860 V, SQ: SimplifyQuery(*R.DL));
16861 }));
16862 unsigned InsertionIndex = Idx * getNumElements(Ty: ScalarTy);
16863 Vec = createInsertVector(
16864 Builder, Vec, V, Index: InsertionIndex,
16865 Generator: std::bind(f: &ShuffleInstructionBuilder::createShuffle, args: this, args: _1, args: _2,
16866 args: _3));
16867 if (!CommonMask.empty()) {
16868 std::iota(first: std::next(x: CommonMask.begin(), n: Idx),
16869 last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()),
16870 value: Idx);
16871 }
16872 }
16873 return Vec;
16874 };
16875 if (SubVectorsMask.empty()) {
16876 Vec = CreateSubVectors(Vec, CommonMask);
16877 } else {
16878 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
16879 copy(Range&: SubVectorsMask, Out: SVMask.begin());
16880 for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) {
16881 if (I2 != PoisonMaskElem) {
16882 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
16883 I1 = I2 + CommonMask.size();
16884 }
16885 }
16886 Value *InsertVec =
16887 CreateSubVectors(PoisonValue::get(T: Vec->getType()), CommonMask);
16888 Vec = createShuffle(V1: InsertVec, V2: Vec, Mask: SVMask);
16889 transformMaskAfterShuffle(CommonMask, Mask: SVMask);
16890 }
16891 InVectors.front() = Vec;
16892 }
16893
16894 if (!ExtMask.empty()) {
16895 if (CommonMask.empty()) {
16896 CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
16897 } else {
16898 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
16899 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
16900 if (ExtMask[I] == PoisonMaskElem)
16901 continue;
16902 NewMask[I] = CommonMask[ExtMask[I]];
16903 }
16904 CommonMask.swap(RHS&: NewMask);
16905 }
16906 }
16907 if (CommonMask.empty()) {
16908 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
16909 return InVectors.front();
16910 }
16911 if (InVectors.size() == 2)
16912 return createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
16913 return createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
16914 }
16915
16916 ~ShuffleInstructionBuilder() {
16917 assert((IsFinalized || CommonMask.empty()) &&
16918 "Shuffle construction must be finalized.");
16919 }
16920};
16921
16922Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
16923 return vectorizeTree(E: getOperandEntry(E, Idx: NodeIdx));
16924}
16925
16926template <typename BVTy, typename ResTy, typename... Args>
16927ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
16928 Args &...Params) {
16929 assert(E->isGather() && "Expected gather node.");
16930 unsigned VF = E->getVectorFactor();
16931
16932 bool NeedFreeze = false;
16933 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
16934 // Clear values, to be replaced by insertvector instructions.
16935 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
16936 for_each(MutableArrayRef(GatheredScalars)
16937 .slice(N: Idx, M: VectorizableTree[EIdx]->getVectorFactor()),
16938 [&](Value *&V) { V = PoisonValue::get(T: V->getType()); });
16939 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
16940 E->CombinedEntriesWithIndices.size());
16941 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
16942 [&](const auto &P) {
16943 return std::make_pair(VectorizableTree[P.first].get(), P.second);
16944 });
16945 // Build a mask out of the reorder indices and reorder scalars per this
16946 // mask.
16947 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
16948 E->ReorderIndices.end());
16949 if (!ReorderMask.empty())
16950 reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
16951 SmallVector<int> SubVectorsMask;
16952 inversePermutation(Indices: E->ReorderIndices, Mask&: SubVectorsMask);
16953 // Transform non-clustered elements in the mask to poison (-1).
16954 // "Clustered" operations will be reordered using this mask later.
16955 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
16956 for (unsigned I : seq<unsigned>(Size: GatheredScalars.size()))
16957 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
16958 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
16959 } else {
16960 SubVectorsMask.clear();
16961 }
16962 SmallVector<Value *> StoredGS(GatheredScalars);
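  // Checks whether the gathered scalars are a splat (containing non-poison
  // undefs) that can be served by reusing lanes of an already available input
  // vector of width InputVF; if so, rewrites the I-th slice of Mask in place
  // and returns true so the caller can treat the input as used in the
  // expression.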
16963 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
16964 unsigned I, unsigned SliceSize,
16965 bool IsNotPoisonous) {
16966 if (!isSplat(VL: E->Scalars) || none_of(E->Scalars, [](Value *V) {
16967 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
16968 }))
16969 return false;
16970 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
16971 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
16972 if (UserTE->getNumOperands() != 2)
16973 return false;
16974 if (!IsNotPoisonous) {
16975 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(N: UserTE->Idx + 1),
16976 [=](const std::unique_ptr<TreeEntry> &TE) {
16977 return TE->UserTreeIndex.UserTE == UserTE &&
16978 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
16979 });
16980 if (It == VectorizableTree.end())
16981 return false;
16982 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
16983 if (!(*It)->ReorderIndices.empty()) {
16984 inversePermutation((*It)->ReorderIndices, ReorderMask);
16985 reorderScalars(Scalars&: GS, Mask: ReorderMask);
16986 }
16987 if (!all_of(zip(t&: GatheredScalars, u&: GS), [&](const auto &P) {
16988 Value *V0 = std::get<0>(P);
16989 Value *V1 = std::get<1>(P);
16990 return !isa<UndefValue>(Val: V0) || isa<PoisonValue>(Val: V0) ||
16991 (isa<UndefValue>(Val: V0) && !isa<PoisonValue>(Val: V0) &&
16992 is_contained(Range: E->Scalars, Element: V1));
16993 }))
16994 return false;
16995 }
16996 int Idx;
16997 if ((Mask.size() < InputVF &&
16998 ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: InputVF, Index&: Idx) &&
16999 Idx == 0) ||
17000 (Mask.size() == InputVF &&
17001 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))) {
17002 std::iota(
17003 first: std::next(x: Mask.begin(), n: I * SliceSize),
17004 last: std::next(x: Mask.begin(),
17005 n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
17006 value: 0);
17007 } else {
17008 unsigned IVal =
17009 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
17010 std::fill(
17011 first: std::next(x: Mask.begin(), n: I * SliceSize),
17012 last: std::next(x: Mask.begin(),
17013 n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
17014 value: IVal);
17015 }
17016 return true;
17017 };
17018 BVTy ShuffleBuilder(ScalarTy, Params...);
17019 ResTy Res = ResTy();
17020 SmallVector<int> Mask;
17021 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
17022 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
17023 Value *ExtractVecBase = nullptr;
17024 bool UseVecBaseAsInput = false;
17025 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
17026 SmallVector<SmallVector<const TreeEntry *>> Entries;
17027 Type *OrigScalarTy = GatheredScalars.front()->getType();
17028 auto *VecTy = getWidenedType(ScalarTy, VF: GatheredScalars.size());
17029 unsigned NumParts = ::getNumberOfParts(TTI: *TTI, VecTy, Limit: GatheredScalars.size());
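  // If the gather contains defined values, try to reuse existing
  // extractelements and/or previously vectorized tree entries instead of
  // building the vector purely from scratch.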
17030 if (!all_of(Range&: GatheredScalars, P: IsaPred<UndefValue>)) {
17031 // Check for gathered extracts.
17032 bool Resized = false;
17033 ExtractShuffles =
17034 tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
17035 if (!ExtractShuffles.empty()) {
17036 SmallVector<const TreeEntry *> ExtractEntries;
17037 for (auto [Idx, I] : enumerate(First&: ExtractMask)) {
17038 if (I == PoisonMaskElem)
17039 continue;
17040 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
17041 V: cast<ExtractElementInst>(Val: StoredGS[Idx])->getVectorOperand());
17042 !TEs.empty())
17043 ExtractEntries.append(in_start: TEs.begin(), in_end: TEs.end());
17044 }
17045 if (std::optional<ResTy> Delayed =
17046 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
17047 // Delay emission of gathers which are not ready yet.
17048 PostponedGathers.insert(X: E);
17049 // Postpone gather emission, will be emitted after the end of the
17050 // process to keep correct order.
17051 return *Delayed;
17052 }
17053 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
17054 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
17055 ExtractVecBase = VecBase;
17056 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(Val: VecBase->getType()))
17057 if (VF == VecBaseTy->getNumElements() &&
17058 GatheredScalars.size() != VF) {
17059 Resized = true;
17060 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
17061 Elt: PoisonValue::get(T: OrigScalarTy));
17062 NumParts =
17063 ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF), Limit: VF);
17064 }
17065 }
17066 }
    // Gather extracts after we check for fully matched gathers only.
17068 if (!ExtractShuffles.empty() || !E->hasState() ||
17069 E->getOpcode() != Instruction::Load ||
17070 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
17071 any_of(Range: E->Scalars, P: IsaPred<LoadInst>)) &&
17072 any_of(E->Scalars,
17073 [this](Value *V) {
17074 return isa<LoadInst>(Val: V) && isVectorized(V);
17075 })) ||
17076 (E->hasState() && E->isAltShuffle()) ||
17077 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
17078 isSplat(VL: E->Scalars) ||
17079 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
17080 GatherShuffles =
17081 isGatherShuffledEntry(TE: E, VL: GatheredScalars, Mask, Entries, NumParts);
17082 }
17083 if (!GatherShuffles.empty()) {
17084 if (std::optional<ResTy> Delayed =
17085 ShuffleBuilder.needToDelay(E, Entries)) {
17086 // Delay emission of gathers which are not ready yet.
17087 PostponedGathers.insert(X: E);
17088 // Postpone gather emission, will be emitted after the end of the
17089 // process to keep correct order.
17090 return *Delayed;
17091 }
17092 if (GatherShuffles.size() == 1 &&
17093 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
17094 Entries.front().front()->isSame(VL: E->Scalars)) {
17095 // Perfect match in the graph, will reuse the previously vectorized
17096 // node. Cost is 0.
17097 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
17098 << shortBundleName(E->Scalars, E->Idx) << ".\n");
17099 // Restore the mask for previous partially matched values.
17100 Mask.resize(N: E->Scalars.size());
17101 const TreeEntry *FrontTE = Entries.front().front();
17102 if (FrontTE->ReorderIndices.empty() &&
17103 ((FrontTE->ReuseShuffleIndices.empty() &&
17104 E->Scalars.size() == FrontTE->Scalars.size()) ||
17105 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
17106 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
17107 } else {
17108 for (auto [I, V] : enumerate(First: E->Scalars)) {
17109 if (isa<PoisonValue>(Val: V)) {
17110 Mask[I] = PoisonMaskElem;
17111 continue;
17112 }
17113 Mask[I] = FrontTE->findLaneForValue(V);
17114 }
17115 }
17116 // Reset the builder(s) to correctly handle perfect diamond matched
17117 // nodes.
17118 ShuffleBuilder.resetForSameNode();
17119 ShuffleBuilder.add(*FrontTE, Mask);
17120 // Full matched entry found, no need to insert subvectors.
17121 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
17122 return Res;
17123 }
17124 if (!Resized) {
17125 if (GatheredScalars.size() != VF &&
17126 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
17127 return any_of(TEs, [&](const TreeEntry *TE) {
17128 return TE->getVectorFactor() == VF;
17129 });
17130 }))
17131 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
17132 Elt: PoisonValue::get(T: OrigScalarTy));
17133 }
17134 // Remove shuffled elements from list of gathers.
17135 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
17136 if (Mask[I] != PoisonMaskElem)
17137 GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
17138 }
17139 }
17140 }
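  // Packs the given scalars for a single gather: deduplicates repeated values,
  // detects splats (so a broadcast can be emitted), handles undef lanes
  // (possibly requiring a trailing freeze) and fills ReuseMask so the original
  // lane order can be restored with a shuffle.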
17141 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
17142 SmallVectorImpl<int> &ReuseMask,
17143 bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
17146 bool IsSplat = IsRootPoison && isSplat(VL: Scalars) &&
17147 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
17148 Scalars.append(NumInputs: VF - Scalars.size(), Elt: PoisonValue::get(T: OrigScalarTy));
17149 SmallVector<int> UndefPos;
17150 DenseMap<Value *, unsigned> UniquePositions;
17151 // Gather unique non-const values and all constant values.
17152 // For repeated values, just shuffle them.
17153 int NumNonConsts = 0;
17154 int SinglePos = 0;
17155 for (auto [I, V] : enumerate(First&: Scalars)) {
17156 if (isa<UndefValue>(Val: V)) {
17157 if (!isa<PoisonValue>(Val: V)) {
17158 ReuseMask[I] = I;
17159 UndefPos.push_back(Elt: I);
17160 }
17161 continue;
17162 }
17163 if (isConstant(V)) {
17164 ReuseMask[I] = I;
17165 continue;
17166 }
17167 ++NumNonConsts;
17168 SinglePos = I;
17169 Value *OrigV = V;
17170 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
17171 if (IsSplat) {
17172 Scalars.front() = OrigV;
17173 ReuseMask[I] = 0;
17174 } else {
17175 const auto Res = UniquePositions.try_emplace(Key: OrigV, Args&: I);
17176 Scalars[Res.first->second] = OrigV;
17177 ReuseMask[I] = Res.first->second;
17178 }
17179 }
17180 if (NumNonConsts == 1) {
17181 // Restore single insert element.
17182 if (IsSplat) {
17183 ReuseMask.assign(NumElts: VF, Elt: PoisonMaskElem);
17184 std::swap(a&: Scalars.front(), b&: Scalars[SinglePos]);
17185 if (!UndefPos.empty() && UndefPos.front() == 0)
17186 Scalars.front() = UndefValue::get(T: OrigScalarTy);
17187 }
17188 ReuseMask[SinglePos] = SinglePos;
17189 } else if (!UndefPos.empty() && IsSplat) {
17190 // For undef values, try to replace them with the simple broadcast.
17191 // We can do it if the broadcasted value is guaranteed to be
17192 // non-poisonous, or by freezing the incoming scalar value first.
17193 auto *It = find_if(Scalars, [this, E](Value *V) {
17194 return !isa<UndefValue>(Val: V) &&
17195 (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
17196 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
17197 // Check if the value already used in the same operation in
17198 // one of the nodes already.
17199 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
17200 is_contained(Range&: E->UserTreeIndex.UserTE->Scalars,
17201 Element: U.getUser());
17202 })));
17203 });
17204 if (It != Scalars.end()) {
17205 // Replace undefs by the non-poisoned scalars and emit broadcast.
17206 int Pos = std::distance(Scalars.begin(), It);
17207 for (int I : UndefPos) {
17208 // Set the undef position to the non-poisoned scalar.
17209 ReuseMask[I] = Pos;
          // Replace the undef with poison; in the mask it is already replaced
          // by the non-poisoned scalar.
17212 if (I != Pos)
17213 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
17214 }
17215 } else {
        // Replace undefs with poison, emit the broadcast and then emit a
        // freeze.
17218 for (int I : UndefPos) {
17219 ReuseMask[I] = PoisonMaskElem;
17220 if (isa<UndefValue>(Val: Scalars[I]))
17221 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
17222 }
17223 NeedFreeze = true;
17224 }
17225 }
17226 };
17227 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
17228 bool IsNonPoisoned = true;
17229 bool IsUsedInExpr = true;
17230 Value *Vec1 = nullptr;
17231 if (!ExtractShuffles.empty()) {
      // A gather of extractelements can be represented as just a shuffle of
      // the one or two vectors the scalars are extracted from.
      // Find the input vectors.
17235 Value *Vec2 = nullptr;
17236 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
17237 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
17238 ExtractMask[I] = PoisonMaskElem;
17239 }
17240 if (UseVecBaseAsInput) {
17241 Vec1 = ExtractVecBase;
17242 } else {
17243 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
17244 if (ExtractMask[I] == PoisonMaskElem)
17245 continue;
17246 if (isa<UndefValue>(Val: StoredGS[I]))
17247 continue;
17248 auto *EI = cast<ExtractElementInst>(Val: StoredGS[I]);
17249 Value *VecOp = EI->getVectorOperand();
17250 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: VecOp);
17251 !TEs.empty() && TEs.front()->VectorizedValue)
17252 VecOp = TEs.front()->VectorizedValue;
17253 if (!Vec1) {
17254 Vec1 = VecOp;
17255 } else if (Vec1 != VecOp) {
17256 assert((!Vec2 || Vec2 == VecOp) &&
17257 "Expected only 1 or 2 vectors shuffle.");
17258 Vec2 = VecOp;
17259 }
17260 }
17261 }
17262 if (Vec2) {
17263 IsUsedInExpr = false;
17264 IsNonPoisoned &= isGuaranteedNotToBePoison(V: Vec1, AC) &&
17265 isGuaranteedNotToBePoison(V: Vec2, AC);
17266 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
17267 } else if (Vec1) {
17268 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(V: Vec1, AC);
17269 IsUsedInExpr &= FindReusedSplat(
17270 ExtractMask,
17271 cast<FixedVectorType>(Val: Vec1->getType())->getNumElements(), 0,
17272 ExtractMask.size(), IsNotPoisonedVec);
17273 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
17274 IsNonPoisoned &= IsNotPoisonedVec;
17275 } else {
17276 IsUsedInExpr = false;
17277 ShuffleBuilder.add(PoisonValue::get(T: VecTy), ExtractMask,
17278 /*ForExtracts=*/true);
17279 }
17280 }
17281 if (!GatherShuffles.empty()) {
17282 unsigned SliceSize =
17283 getPartNumElems(Size: E->Scalars.size(),
17284 NumParts: ::getNumberOfParts(TTI: *TTI, VecTy, Limit: E->Scalars.size()));
17285 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
17286 for (const auto [I, TEs] : enumerate(First&: Entries)) {
17287 if (TEs.empty()) {
17288 assert(!GatherShuffles[I] &&
17289 "No shuffles with empty entries list expected.");
17290 continue;
17291 }
17292 assert((TEs.size() == 1 || TEs.size() == 2) &&
17293 "Expected shuffle of 1 or 2 entries.");
17294 unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I);
17295 auto SubMask = ArrayRef(Mask).slice(N: I * SliceSize, M: Limit);
17296 VecMask.assign(NumElts: VecMask.size(), Elt: PoisonMaskElem);
17297 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: I * SliceSize));
17298 if (TEs.size() == 1) {
17299 bool IsNotPoisonedVec =
17300 TEs.front()->VectorizedValue
17301 ? isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC)
17302 : true;
17303 IsUsedInExpr &=
17304 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
17305 SliceSize, IsNotPoisonedVec);
17306 ShuffleBuilder.add(*TEs.front(), VecMask);
17307 IsNonPoisoned &= IsNotPoisonedVec;
17308 } else {
17309 IsUsedInExpr = false;
17310 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
17311 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
17312 IsNonPoisoned &=
17313 isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC) &&
17314 isGuaranteedNotToBePoison(V: TEs.back()->VectorizedValue, AC);
17315 }
17316 }
17317 }
    // Try to figure out the best way to combine the values: build a shuffle
    // and insert elements, or just build several shuffles.
    // Insert non-constant scalars.
17321 SmallVector<Value *> NonConstants(GatheredScalars);
17322 int EMSz = ExtractMask.size();
17323 int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if we currently
    // have a single permutation and more than 1 scalar constant.
17326 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
17327 bool IsIdentityShuffle =
17328 ((UseVecBaseAsInput ||
17329 all_of(ExtractShuffles,
17330 [](const std::optional<TTI::ShuffleKind> &SK) {
17331 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
17332 TTI::SK_PermuteSingleSrc;
17333 })) &&
17334 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
17335 ShuffleVectorInst::isIdentityMask(Mask: ExtractMask, NumSrcElts: EMSz)) ||
17336 (!GatherShuffles.empty() &&
17337 all_of(GatherShuffles,
17338 [](const std::optional<TTI::ShuffleKind> &SK) {
17339 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
17340 TTI::SK_PermuteSingleSrc;
17341 }) &&
17342 none_of(Mask, [&](int I) { return I >= MSz; }) &&
17343 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: MSz));
17344 bool EnoughConstsForShuffle =
17345 IsSingleShuffle &&
17346 (none_of(GatheredScalars,
17347 [](Value *V) {
17348 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
17349 }) ||
17350 any_of(GatheredScalars,
17351 [](Value *V) {
17352 return isa<Constant>(Val: V) && !isa<UndefValue>(Val: V);
17353 })) &&
17354 (!IsIdentityShuffle ||
17355 (GatheredScalars.size() == 2 &&
17356 any_of(GatheredScalars,
17357 [](Value *V) { return !isa<UndefValue>(Val: V); })) ||
17358 count_if(GatheredScalars, [](Value *V) {
17359 return isa<Constant>(Val: V) && !isa<PoisonValue>(Val: V);
17360 }) > 1);
    // The NonConstants array contains just the non-constant values, while
    // GatheredScalars contains only constants, used to build the final vector
    // and then shuffle.
17363 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
17364 if (EnoughConstsForShuffle && isa<Constant>(Val: GatheredScalars[I]))
17365 NonConstants[I] = PoisonValue::get(T: OrigScalarTy);
17366 else
17367 GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
17368 }
17369 // Generate constants for final shuffle and build a mask for them.
17370 if (!all_of(Range&: GatheredScalars, P: IsaPred<PoisonValue>)) {
17371 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
17372 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
17373 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
17374 ShuffleBuilder.add(BV, BVMask);
17375 }
17376 if (all_of(NonConstants, [=](Value *V) {
17377 return isa<PoisonValue>(Val: V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(Val: V));
17380 }))
17381 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
17382 SubVectorsMask);
17383 else
17384 Res = ShuffleBuilder.finalize(
17385 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
17386 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
17387 bool IsSplat = isSplat(VL: NonConstants);
17388 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
17389 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
17390 auto CheckIfSplatIsProfitable = [&]() {
17391 // Estimate the cost of splatting + shuffle and compare with
17392 // insert + shuffle.
17393 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17394 Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>);
17395 if (isa<ExtractElementInst>(Val: V) || isVectorized(V))
17396 return false;
17397 InstructionCost SplatCost = TTI->getVectorInstrCost(
17398 Opcode: Instruction::InsertElement, Val: VecTy, CostKind, /*Index=*/0,
17399 Op0: PoisonValue::get(T: VecTy), Op1: V);
17400 SmallVector<int> NewMask(Mask.begin(), Mask.end());
17401 for (auto [Idx, I] : enumerate(First&: BVMask))
17402 if (I != PoisonMaskElem)
17403 NewMask[Idx] = Mask.size();
17404 SplatCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy,
17405 Mask: NewMask, CostKind);
17406 InstructionCost BVCost = TTI->getVectorInstrCost(
17407 Instruction::InsertElement, VecTy, CostKind,
17408 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
17409 Vec, V);
17410 // Shuffle required?
17411 if (count(Range&: BVMask, Element: PoisonMaskElem) <
17412 static_cast<int>(BVMask.size() - 1)) {
17413 SmallVector<int> NewMask(Mask.begin(), Mask.end());
17414 for (auto [Idx, I] : enumerate(First&: BVMask))
17415 if (I != PoisonMaskElem)
17416 NewMask[Idx] = I;
17417 BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
17418 Tp: VecTy, Mask: NewMask, CostKind);
17419 }
17420 return SplatCost <= BVCost;
17421 };
17422 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
17423 for (auto [Idx, I] : enumerate(First&: BVMask))
17424 if (I != PoisonMaskElem)
17425 Mask[Idx] = I;
17426 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
17427 } else {
17428 Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>);
17429 SmallVector<Value *> Values(NonConstants.size(),
17430 PoisonValue::get(T: ScalarTy));
17431 Values[0] = V;
17432 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
17433 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
17434 transform(BVMask, SplatMask.begin(), [](int I) {
17435 return I == PoisonMaskElem ? PoisonMaskElem : 0;
17436 });
17437 if (!ShuffleVectorInst::isIdentityMask(Mask: SplatMask, NumSrcElts: VF))
17438 BV = CreateShuffle(BV, nullptr, SplatMask);
17439 for (auto [Idx, I] : enumerate(First&: BVMask))
17440 if (I != PoisonMaskElem)
17441 Mask[Idx] = BVMask.size() + Idx;
17442 Vec = CreateShuffle(Vec, BV, Mask);
17443 for (auto [Idx, I] : enumerate(First&: Mask))
17444 if (I != PoisonMaskElem)
17445 Mask[Idx] = Idx;
17446 }
17447 });
17448 } else if (!allConstant(VL: GatheredScalars)) {
17449 // Gather unique scalars and all constants.
17450 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
17451 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
17452 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
17453 ShuffleBuilder.add(BV, ReuseMask);
17454 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
17455 SubVectorsMask);
17456 } else {
17457 // Gather all constants.
17458 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
17459 for (auto [I, V] : enumerate(First&: GatheredScalars)) {
17460 if (!isa<PoisonValue>(Val: V))
17461 Mask[I] = I;
17462 }
17463 Value *BV = ShuffleBuilder.gather(GatheredScalars);
17464 ShuffleBuilder.add(BV, Mask);
17465 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
17466 SubVectorsMask);
17467 }
17468
17469 if (NeedFreeze)
17470 Res = ShuffleBuilder.createFreeze(Res);
17471 return Res;
17472}
17473
17474Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
17475 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
17476 (void)vectorizeTree(E: VectorizableTree[EIdx].get());
17477 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
17478 Params&: Builder, Params&: *this);
17479}
17480
/// \returns \p Inst after propagating metadata from \p VL, only for
/// instructions in \p VL.
17483static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
17484 SmallVector<Value *> Insts;
17485 for (Value *V : VL)
17486 if (isa<Instruction>(Val: V))
17487 Insts.push_back(Elt: V);
17488 return llvm::propagateMetadata(I: Inst, VL: Insts);
17489}
17490
17491static DebugLoc getDebugLocFromPHI(PHINode &PN) {
17492 if (DebugLoc DL = PN.getDebugLoc())
17493 return DL;
17494 return DebugLoc::getUnknown();
17495}
17496
17497Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
17498 IRBuilderBase::InsertPointGuard Guard(Builder);
17499
17500 Value *V = E->Scalars.front();
17501 Type *ScalarTy = V->getType();
17502 if (!isa<CmpInst>(Val: V))
17503 ScalarTy = getValueType(V);
17504 auto It = MinBWs.find(Val: E);
17505 if (It != MinBWs.end()) {
17506 auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy);
17507 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
17508 if (VecTy)
17509 ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements());
17510 }
17511 if (E->VectorizedValue)
17512 return E->VectorizedValue;
17513 auto *VecTy = getWidenedType(ScalarTy, VF: E->Scalars.size());
17514 if (E->isGather()) {
17515 // Set insert point for non-reduction initial nodes.
17516 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
17517 setInsertPointAfterBundle(E);
17518 Value *Vec = createBuildVector(E, ScalarTy);
17519 E->VectorizedValue = Vec;
17520 return Vec;
17521 }
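  // Split nodes are vectorized as two independent parts, which are then
  // recombined: either by inserting the second part into a widened copy of the
  // first one (no reordering) or by shuffling both parts with the split mask.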
17522 if (E->State == TreeEntry::SplitVectorize) {
17523 assert(E->CombinedEntriesWithIndices.size() == 2 &&
17524 "Expected exactly 2 combined entries.");
17525 setInsertPointAfterBundle(E);
17526 TreeEntry &OpTE1 =
17527 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
17528 assert(OpTE1.isSame(
17529 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
17530 "Expected same first part of scalars.");
17531 Value *Op1 = vectorizeTree(E: &OpTE1);
17532 TreeEntry &OpTE2 =
17533 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
17534 assert(
17535 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
17536 "Expected same second part of scalars.");
17537 Value *Op2 = vectorizeTree(E: &OpTE2);
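    // Returns true if the vectorized operand must be sign-extended when cast
    // to the minimized bitwidth type: use the recorded signedness if available,
    // otherwise check whether any non-poison scalar may be negative.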
17538 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
17539 bool IsSigned = false;
17540 auto It = MinBWs.find(Val: OpE);
17541 if (It != MinBWs.end())
17542 IsSigned = It->second.second;
17543 else
17544 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
        if (isa<PoisonValue>(Val: R))
17546 return false;
17547 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
17548 });
17549 return IsSigned;
17550 };
17551 if (cast<VectorType>(Val: Op1->getType())->getElementType() !=
17552 ScalarTy->getScalarType()) {
17553 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
17554 Op1 = Builder.CreateIntCast(
17555 V: Op1,
17556 DestTy: getWidenedType(
17557 ScalarTy,
17558 VF: cast<FixedVectorType>(Val: Op1->getType())->getNumElements()),
17559 isSigned: GetOperandSignedness(&OpTE1));
17560 }
17561 if (cast<VectorType>(Val: Op2->getType())->getElementType() !=
17562 ScalarTy->getScalarType()) {
17563 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
17564 Op2 = Builder.CreateIntCast(
17565 V: Op2,
17566 DestTy: getWidenedType(
17567 ScalarTy,
17568 VF: cast<FixedVectorType>(Val: Op2->getType())->getNumElements()),
17569 isSigned: GetOperandSignedness(&OpTE2));
17570 }
17571 if (E->ReorderIndices.empty()) {
17572 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
17573 std::iota(
17574 first: Mask.begin(),
17575 last: std::next(x: Mask.begin(), n: E->CombinedEntriesWithIndices.back().second),
17576 value: 0);
17577 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
17578 if (ScalarTyNumElements != 1) {
17579 assert(SLPReVec && "Only supported by REVEC.");
17580 transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask);
17581 }
17582 Value *Vec = Builder.CreateShuffleVector(V: Op1, Mask);
17583 Vec = createInsertVector(Builder, Vec, V: Op2,
17584 Index: E->CombinedEntriesWithIndices.back().second *
17585 ScalarTyNumElements);
17586 E->VectorizedValue = Vec;
17587 return Vec;
17588 }
17589 unsigned CommonVF =
17590 std::max(a: OpTE1.getVectorFactor(), b: OpTE2.getVectorFactor());
17591 if (getNumElements(Ty: Op1->getType()) != CommonVF) {
17592 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
17593 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: OpTE1.getVectorFactor()),
17594 value: 0);
17595 Op1 = Builder.CreateShuffleVector(V: Op1, Mask);
17596 }
17597 if (getNumElements(Ty: Op2->getType()) != CommonVF) {
17598 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
17599 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: OpTE2.getVectorFactor()),
17600 value: 0);
17601 Op2 = Builder.CreateShuffleVector(V: Op2, Mask);
17602 }
17603 Value *Vec = Builder.CreateShuffleVector(V1: Op1, V2: Op2, Mask: E->getSplitMask());
17604 E->VectorizedValue = Vec;
17605 return Vec;
17606 }
17607
17608 bool IsReverseOrder =
17609 !E->ReorderIndices.empty() && isReverseOrder(Order: E->ReorderIndices);
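// FinalShuffle applies the node's reordering and reuse shuffles (and any
// combined sub-vector entries) to the just-built vector before it is
// recorded as the vectorized value.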
17610 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
17611 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
17612 if (E->getOpcode() == Instruction::Store &&
17613 E->State == TreeEntry::Vectorize) {
17614 ArrayRef<int> Mask =
17615 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
17616 E->ReorderIndices.size());
17617 ShuffleBuilder.add(V1: V, Mask);
17618 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
17619 E->State == TreeEntry::CompressVectorize) {
17620 ShuffleBuilder.addOrdered(V1: V, Order: {});
17621 } else {
17622 ShuffleBuilder.addOrdered(V1: V, Order: E->ReorderIndices);
17623 }
17624 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
17625 E->CombinedEntriesWithIndices.size());
17626 transform(
17627 Range: E->CombinedEntriesWithIndices, d_first: SubVectors.begin(), F: [&](const auto &P) {
17628 return std::make_pair(VectorizableTree[P.first].get(), P.second);
17629 });
17630 assert(
17631 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
17632 "Expected either combined subnodes or reordering");
17633 return ShuffleBuilder.finalize(ExtMask: E->ReuseShuffleIndices, SubVectors, SubVectorsMask: {});
17634 };
17635
17636 assert(!E->isGather() && "Unhandled state");
17637 unsigned ShuffleOrOp =
17638 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
17639 Instruction *VL0 = E->getMainOp();
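// Determine whether an operand whose bitwidth was minimized has to be
// treated as signed when it is widened back with an int cast.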
17640 auto GetOperandSignedness = [&](unsigned Idx) {
17641 const TreeEntry *OpE = getOperandEntry(E, Idx);
17642 bool IsSigned = false;
17643 auto It = MinBWs.find(Val: OpE);
17644 if (It != MinBWs.end())
17645 IsSigned = It->second.second;
17646 else
17647 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
17648 if (isa<PoisonValue>(Val: R))
17649 return false;
17650 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
17651 });
17652 return IsSigned;
17653 };
17654 switch (ShuffleOrOp) {
17655 case Instruction::PHI: {
17656 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
17657 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
17658 "PHI reordering is free.");
17659 auto *PH = cast<PHINode>(Val: VL0);
17660 Builder.SetInsertPoint(TheBB: PH->getParent(),
17661 IP: PH->getParent()->getFirstNonPHIIt());
17662 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
17663 PHINode *NewPhi = Builder.CreatePHI(Ty: VecTy, NumReservedValues: PH->getNumIncomingValues());
17664 Value *V = NewPhi;
17665
17666 // Adjust the insertion point once all PHIs have been generated.
17667 Builder.SetInsertPoint(TheBB: PH->getParent(),
17668 IP: PH->getParent()->getFirstInsertionPt());
17669 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
17670
17671 V = FinalShuffle(V, E);
17672
17673 E->VectorizedValue = V;
17674 // If the PHI node is fully emitted, exit.
17675 if (NewPhi->getNumIncomingValues() != 0)
17676 return NewPhi;
17677
17678 // PHINodes may have multiple entries from the same block. We want to
17679 // visit every block once.
17680 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
17681
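// Vectorize the incoming values block by block; repeated predecessor
// blocks reuse the vector operand that was already emitted for them.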
17682 for (unsigned I : seq<unsigned>(Size: PH->getNumIncomingValues())) {
17683 BasicBlock *IBB = PH->getIncomingBlock(i: I);
17684
17685 // Stop emission if all incoming values are generated.
17686 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
17687 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
17688 return NewPhi;
17689 }
17690
17691 if (!VisitedBBs.insert(Ptr: IBB).second) {
17692 Value *VecOp = NewPhi->getIncomingValueForBlock(BB: IBB);
17693 NewPhi->addIncoming(V: VecOp, BB: IBB);
17694 TreeEntry *OpTE = getOperandEntry(E, Idx: I);
17695 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
17696 OpTE->VectorizedValue = VecOp;
17697 continue;
17698 }
17699
17700 Builder.SetInsertPoint(IBB->getTerminator());
17701 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
17702 Value *Vec = vectorizeOperand(E, NodeIdx: I);
17703 if (VecTy != Vec->getType()) {
17704 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
17705 MinBWs.contains(getOperandEntry(E, I))) &&
17706 "Expected item in MinBWs.");
17707 Vec = Builder.CreateIntCast(V: Vec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
17708 }
17709 NewPhi->addIncoming(V: Vec, BB: IBB);
17710 }
17711
17712 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
17713 "Invalid number of incoming values");
17714 assert(E->VectorizedValue && "Expected vectorized value.");
17715 return E->VectorizedValue;
17716 }
17717
17718 case Instruction::ExtractElement: {
17719 Value *V = E->getSingleOperand(OpIdx: 0);
17720 setInsertPointAfterBundle(E);
17721 V = FinalShuffle(V, E);
17722 E->VectorizedValue = V;
17723 return V;
17724 }
17725 case Instruction::ExtractValue: {
17726 auto *LI = cast<LoadInst>(Val: E->getSingleOperand(OpIdx: 0));
17727 Builder.SetInsertPoint(LI);
17728 Value *Ptr = LI->getPointerOperand();
17729 LoadInst *V = Builder.CreateAlignedLoad(Ty: VecTy, Ptr, Align: LI->getAlign());
17730 Value *NewV = ::propagateMetadata(Inst: V, VL: E->Scalars);
17731 NewV = FinalShuffle(NewV, E);
17732 E->VectorizedValue = NewV;
17733 return NewV;
17734 }
17735 case Instruction::InsertElement: {
17736 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
17737 Builder.SetInsertPoint(cast<Instruction>(Val: E->Scalars.back()));
17738 Value *V = vectorizeOperand(E, NodeIdx: 1);
17739 ArrayRef<Value *> Op = E->getOperand(OpIdx: 1);
17740 Type *ScalarTy = Op.front()->getType();
17741 if (cast<VectorType>(Val: V->getType())->getElementType() != ScalarTy) {
17742 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
17743 std::pair<unsigned, bool> Res = MinBWs.lookup(Val: getOperandEntry(E, Idx: 1));
17744 assert(Res.first > 0 && "Expected item in MinBWs.");
17745 V = Builder.CreateIntCast(
17746 V,
17747 DestTy: getWidenedType(
17748 ScalarTy,
17749 VF: cast<FixedVectorType>(Val: V->getType())->getNumElements()),
17750 isSigned: Res.second);
17751 }
17752
17753 // Create InsertVector shuffle if necessary
17754 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range&: E->Scalars, P: [E](Value *V) {
17755 return !is_contained(Range&: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
17756 }));
17757 const unsigned NumElts =
17758 cast<FixedVectorType>(Val: FirstInsert->getType())->getNumElements();
17759 const unsigned NumScalars = E->Scalars.size();
17760
17761 unsigned Offset = *getElementIndex(Inst: VL0);
17762 assert(Offset < NumElts && "Failed to find vector index offset");
17763
17764 // Create shuffle to resize vector
17765 SmallVector<int> Mask;
17766 if (!E->ReorderIndices.empty()) {
17767 inversePermutation(Indices: E->ReorderIndices, Mask);
17768 Mask.append(NumInputs: NumElts - NumScalars, Elt: PoisonMaskElem);
17769 } else {
17770 Mask.assign(NumElts, Elt: PoisonMaskElem);
17771 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: NumScalars), value: 0);
17772 }
17773 // Create InsertVector shuffle if necessary
17774 bool IsIdentity = true;
17775 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
17776 Mask.swap(RHS&: PrevMask);
17777 for (unsigned I = 0; I < NumScalars; ++I) {
17778 Value *Scalar = E->Scalars[PrevMask[I]];
17779 unsigned InsertIdx = *getElementIndex(Inst: Scalar);
17780 IsIdentity &= InsertIdx - Offset == I;
17781 Mask[InsertIdx - Offset] = I;
17782 }
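// A shuffle is required only if the scalars do not form an identity
// insertion or do not fill the whole destination vector.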
17783 if (!IsIdentity || NumElts != NumScalars) {
17784 Value *V2 = nullptr;
17785 bool IsVNonPoisonous =
17786 !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
17787 SmallVector<int> InsertMask(Mask);
17788 if (NumElts != NumScalars && Offset == 0) {
17789 // Follow all insert element instructions from the current buildvector
17790 // sequence.
17791 InsertElementInst *Ins = cast<InsertElementInst>(Val: VL0);
17792 do {
17793 std::optional<unsigned> InsertIdx = getElementIndex(Inst: Ins);
17794 if (!InsertIdx)
17795 break;
17796 if (InsertMask[*InsertIdx] == PoisonMaskElem)
17797 InsertMask[*InsertIdx] = *InsertIdx;
17798 if (!Ins->hasOneUse())
17799 break;
17800 Ins = dyn_cast_or_null<InsertElementInst>(
17801 Val: Ins->getUniqueUndroppableUser());
17802 } while (Ins);
17803 SmallBitVector UseMask =
17804 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
17805 SmallBitVector IsFirstPoison =
17806 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
17807 SmallBitVector IsFirstUndef =
17808 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
17809 if (!IsFirstPoison.all()) {
17810 unsigned Idx = 0;
17811 for (unsigned I = 0; I < NumElts; I++) {
17812 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I) &&
17813 IsFirstUndef.test(Idx: I)) {
17814 if (IsVNonPoisonous) {
17815 InsertMask[I] = I < NumScalars ? I : 0;
17816 continue;
17817 }
17818 if (!V2)
17819 V2 = UndefValue::get(T: V->getType());
17820 if (Idx >= NumScalars)
17821 Idx = NumScalars - 1;
17822 InsertMask[I] = NumScalars + Idx;
17823 ++Idx;
17824 } else if (InsertMask[I] != PoisonMaskElem &&
17825 Mask[I] == PoisonMaskElem) {
17826 InsertMask[I] = PoisonMaskElem;
17827 }
17828 }
17829 } else {
17830 InsertMask = Mask;
17831 }
17832 }
17833 if (!V2)
17834 V2 = PoisonValue::get(T: V->getType());
17835 V = Builder.CreateShuffleVector(V1: V, V2, Mask: InsertMask);
17836 if (auto *I = dyn_cast<Instruction>(Val: V)) {
17837 GatherShuffleExtractSeq.insert(X: I);
17838 CSEBlocks.insert(V: I->getParent());
17839 }
17840 }
17841
17842 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
17843 for (unsigned I = 0; I < NumElts; I++) {
17844 if (Mask[I] != PoisonMaskElem)
17845 InsertMask[Offset + I] = I;
17846 }
17847 SmallBitVector UseMask =
17848 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
17849 SmallBitVector IsFirstUndef =
17850 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
17851 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
17852 NumElts != NumScalars) {
17853 if (IsFirstUndef.all()) {
17854 if (!ShuffleVectorInst::isIdentityMask(Mask: InsertMask, NumSrcElts: NumElts)) {
17855 SmallBitVector IsFirstPoison =
17856 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
17857 if (!IsFirstPoison.all()) {
17858 for (unsigned I = 0; I < NumElts; I++) {
17859 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I))
17860 InsertMask[I] = I + NumElts;
17861 }
17862 }
17863 V = Builder.CreateShuffleVector(
17864 V1: V,
17865 V2: IsFirstPoison.all() ? PoisonValue::get(T: V->getType())
17866 : FirstInsert->getOperand(i: 0),
17867 Mask: InsertMask, Name: cast<Instruction>(Val: E->Scalars.back())->getName());
17868 if (auto *I = dyn_cast<Instruction>(Val: V)) {
17869 GatherShuffleExtractSeq.insert(X: I);
17870 CSEBlocks.insert(V: I->getParent());
17871 }
17872 }
17873 } else {
17874 SmallBitVector IsFirstPoison =
17875 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
17876 for (unsigned I = 0; I < NumElts; I++) {
17877 if (InsertMask[I] == PoisonMaskElem)
17878 InsertMask[I] = IsFirstPoison.test(Idx: I) ? PoisonMaskElem : I;
17879 else
17880 InsertMask[I] += NumElts;
17881 }
17882 V = Builder.CreateShuffleVector(
17883 V1: FirstInsert->getOperand(i: 0), V2: V, Mask: InsertMask,
17884 Name: cast<Instruction>(Val: E->Scalars.back())->getName());
17885 if (auto *I = dyn_cast<Instruction>(Val: V)) {
17886 GatherShuffleExtractSeq.insert(X: I);
17887 CSEBlocks.insert(V: I->getParent());
17888 }
17889 }
17890 }
17891
17892 ++NumVectorInstructions;
17893 E->VectorizedValue = V;
17894 return V;
17895 }
17896 case Instruction::ZExt:
17897 case Instruction::SExt:
17898 case Instruction::FPToUI:
17899 case Instruction::FPToSI:
17900 case Instruction::FPExt:
17901 case Instruction::PtrToInt:
17902 case Instruction::IntToPtr:
17903 case Instruction::SIToFP:
17904 case Instruction::UIToFP:
17905 case Instruction::Trunc:
17906 case Instruction::FPTrunc:
17907 case Instruction::BitCast: {
17908 setInsertPointAfterBundle(E);
17909
17910 Value *InVec = vectorizeOperand(E, NodeIdx: 0);
17911
17912 auto *CI = cast<CastInst>(Val: VL0);
17913 Instruction::CastOps VecOpcode = CI->getOpcode();
17914 Type *SrcScalarTy = cast<VectorType>(Val: InVec->getType())->getElementType();
17915 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
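// If either the source or the destination was demoted to a narrower
// integer type, the cast opcode may have to be adjusted (bitcast, trunc
// or an appropriate extension) to match the actual bitwidths.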
17916 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
17917 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
17918 SrcScalarTy != CI->getOperand(i_nocapture: 0)->getType()->getScalarType())) {
17919 // Check if the values are candidates to demote.
17920 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
17921 if (SrcIt != MinBWs.end())
17922 SrcBWSz = SrcIt->second.first;
17923 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType());
17924 if (BWSz == SrcBWSz) {
17925 VecOpcode = Instruction::BitCast;
17926 } else if (BWSz < SrcBWSz) {
17927 VecOpcode = Instruction::Trunc;
17928 } else if (It != MinBWs.end()) {
17929 assert(BWSz > SrcBWSz && "Invalid cast!");
17930 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
17931 } else if (SrcIt != MinBWs.end()) {
17932 assert(BWSz > SrcBWSz && "Invalid cast!");
17933 VecOpcode =
17934 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
17935 }
17936 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
17937 !SrcIt->second.second) {
17938 VecOpcode = Instruction::UIToFP;
17939 }
17940 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
17941 ? InVec
17942 : Builder.CreateCast(Op: VecOpcode, V: InVec, DestTy: VecTy);
17943 V = FinalShuffle(V, E);
17944
17945 E->VectorizedValue = V;
17946 ++NumVectorInstructions;
17947 return V;
17948 }
17949 case Instruction::FCmp:
17950 case Instruction::ICmp: {
17951 setInsertPointAfterBundle(E);
17952
17953 Value *L = vectorizeOperand(E, NodeIdx: 0);
17954 Value *R = vectorizeOperand(E, NodeIdx: 1);
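// If the operands were demoted to different integer widths, widen the
// narrower operand so both sides of the compare have the same type.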
17955 if (L->getType() != R->getType()) {
17956 assert((getOperandEntry(E, 0)->isGather() ||
17957 getOperandEntry(E, 1)->isGather() ||
17958 MinBWs.contains(getOperandEntry(E, 0)) ||
17959 MinBWs.contains(getOperandEntry(E, 1))) &&
17960 "Expected item in MinBWs.");
17961 if (cast<VectorType>(Val: L->getType())
17962 ->getElementType()
17963 ->getIntegerBitWidth() < cast<VectorType>(Val: R->getType())
17964 ->getElementType()
17965 ->getIntegerBitWidth()) {
17966 Type *CastTy = R->getType();
17967 L = Builder.CreateIntCast(V: L, DestTy: CastTy, isSigned: GetOperandSignedness(0));
17968 } else {
17969 Type *CastTy = L->getType();
17970 R = Builder.CreateIntCast(V: R, DestTy: CastTy, isSigned: GetOperandSignedness(1));
17971 }
17972 }
17973
17974 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
17975 Value *V = Builder.CreateCmp(Pred: P0, LHS: L, RHS: R);
17976 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
17977 if (auto *ICmp = dyn_cast<ICmpInst>(Val: V); ICmp && It == MinBWs.end())
17978 ICmp->setSameSign(/*B=*/false);
17979 // Do not cast for cmps.
17980 VecTy = cast<FixedVectorType>(Val: V->getType());
17981 V = FinalShuffle(V, E);
17982
17983 E->VectorizedValue = V;
17984 ++NumVectorInstructions;
17985 return V;
17986 }
17987 case Instruction::Select: {
17988 setInsertPointAfterBundle(E);
17989
17990 Value *Cond = vectorizeOperand(E, NodeIdx: 0);
17991 Value *True = vectorizeOperand(E, NodeIdx: 1);
17992 Value *False = vectorizeOperand(E, NodeIdx: 2);
17993 if (True->getType() != VecTy || False->getType() != VecTy) {
17994 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
17995 getOperandEntry(E, 2)->isGather() ||
17996 MinBWs.contains(getOperandEntry(E, 1)) ||
17997 MinBWs.contains(getOperandEntry(E, 2))) &&
17998 "Expected item in MinBWs.");
17999 if (True->getType() != VecTy)
18000 True = Builder.CreateIntCast(V: True, DestTy: VecTy, isSigned: GetOperandSignedness(1));
18001 if (False->getType() != VecTy)
18002 False = Builder.CreateIntCast(V: False, DestTy: VecTy, isSigned: GetOperandSignedness(2));
18003 }
18004
18005 unsigned CondNumElements = getNumElements(Ty: Cond->getType());
18006 unsigned TrueNumElements = getNumElements(Ty: True->getType());
18007 assert(TrueNumElements >= CondNumElements &&
18008 TrueNumElements % CondNumElements == 0 &&
18009 "Cannot vectorize Instruction::Select");
18010 assert(TrueNumElements == getNumElements(False->getType()) &&
18011 "Cannot vectorize Instruction::Select");
18012 if (CondNumElements != TrueNumElements) {
18013 // When the return type is i1 but the source is a fixed vector type, we
18014 // need to duplicate the condition value.
18015 Cond = Builder.CreateShuffleVector(
18016 V: Cond, Mask: createReplicatedMask(ReplicationFactor: TrueNumElements / CondNumElements,
18017 VF: CondNumElements));
18018 }
18019 assert(getNumElements(Cond->getType()) == TrueNumElements &&
18020 "Cannot vectorize Instruction::Select");
18021 Value *V = Builder.CreateSelect(C: Cond, True, False);
18022 V = FinalShuffle(V, E);
18023
18024 E->VectorizedValue = V;
18025 ++NumVectorInstructions;
18026 return V;
18027 }
18028 case Instruction::FNeg: {
18029 setInsertPointAfterBundle(E);
18030
18031 Value *Op = vectorizeOperand(E, NodeIdx: 0);
18032
18033 Value *V = Builder.CreateUnOp(
18034 Opc: static_cast<Instruction::UnaryOps>(E->getOpcode()), V: Op);
18035 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
18036 if (auto *I = dyn_cast<Instruction>(Val: V))
18037 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
18038
18039 V = FinalShuffle(V, E);
18040
18041 E->VectorizedValue = V;
18042 ++NumVectorInstructions;
18043
18044 return V;
18045 }
18046 case Instruction::Freeze: {
18047 setInsertPointAfterBundle(E);
18048
18049 Value *Op = vectorizeOperand(E, NodeIdx: 0);
18050
18051 if (Op->getType() != VecTy) {
18052 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
18053 MinBWs.contains(getOperandEntry(E, 0))) &&
18054 "Expected item in MinBWs.");
18055 Op = Builder.CreateIntCast(V: Op, DestTy: VecTy, isSigned: GetOperandSignedness(0));
18056 }
18057 Value *V = Builder.CreateFreeze(V: Op);
18058 V = FinalShuffle(V, E);
18059
18060 E->VectorizedValue = V;
18061 ++NumVectorInstructions;
18062
18063 return V;
18064 }
18065 case Instruction::Add:
18066 case Instruction::FAdd:
18067 case Instruction::Sub:
18068 case Instruction::FSub:
18069 case Instruction::Mul:
18070 case Instruction::FMul:
18071 case Instruction::UDiv:
18072 case Instruction::SDiv:
18073 case Instruction::FDiv:
18074 case Instruction::URem:
18075 case Instruction::SRem:
18076 case Instruction::FRem:
18077 case Instruction::Shl:
18078 case Instruction::LShr:
18079 case Instruction::AShr:
18080 case Instruction::And:
18081 case Instruction::Or:
18082 case Instruction::Xor: {
18083 setInsertPointAfterBundle(E);
18084
18085 Value *LHS = vectorizeOperand(E, NodeIdx: 0);
18086 Value *RHS = vectorizeOperand(E, NodeIdx: 1);
18087 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
18088 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
18089 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
18090 if (all_of(Range&: Ops, P: [&](Value *Op) {
18091 auto *CI = dyn_cast<ConstantInt>(Val: Op);
18092 return CI && CI->getValue().countr_one() >= It->second.first;
18093 })) {
18094 Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
18095 E->VectorizedValue = V;
18096 ++NumVectorInstructions;
18097 return V;
18098 }
18099 }
18100 }
18101 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
18102 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
18103 getOperandEntry(E, 1)->isGather() ||
18104 MinBWs.contains(getOperandEntry(E, 0)) ||
18105 MinBWs.contains(getOperandEntry(E, 1))) &&
18106 "Expected item in MinBWs.");
18107 if (LHS->getType() != VecTy)
18108 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: GetOperandSignedness(0));
18109 if (RHS->getType() != VecTy)
18110 RHS = Builder.CreateIntCast(V: RHS, DestTy: VecTy, isSigned: GetOperandSignedness(1));
18111 }
18112
18113 Value *V = Builder.CreateBinOp(
18114 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
18115 RHS);
18116 propagateIRFlags(I: V, VL: E->Scalars, OpValue: nullptr, IncludeWrapFlags: It == MinBWs.end());
18117 if (auto *I = dyn_cast<Instruction>(Val: V)) {
18118 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
18119 // Drop nuw flags for abs(sub(commutative), true).
18120 if (!MinBWs.contains(Val: E) && ShuffleOrOp == Instruction::Sub &&
18121 any_of(Range&: E->Scalars, P: [](Value *V) {
18122 return isa<PoisonValue>(Val: V) || isCommutative(I: cast<Instruction>(Val: V));
18123 }))
18124 I->setHasNoUnsignedWrap(/*b=*/false);
18125 }
18126
18127 V = FinalShuffle(V, E);
18128
18129 E->VectorizedValue = V;
18130 ++NumVectorInstructions;
18131
18132 return V;
18133 }
18134 case Instruction::Load: {
18135 // Loads are inserted at the head of the tree because we don't want to
18136 // sink them all the way down past store instructions.
18137 setInsertPointAfterBundle(E);
18138
18139 LoadInst *LI = cast<LoadInst>(Val: VL0);
18140 Instruction *NewLI;
18141 Value *PO = LI->getPointerOperand();
18142 if (E->State == TreeEntry::Vectorize) {
18143 NewLI = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: PO, Align: LI->getAlign());
18144 } else if (E->State == TreeEntry::CompressVectorize) {
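// Compressed loads: load one wide (possibly masked) contiguous vector and
// shuffle the needed elements into their positions via the compress mask.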
18145 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
18146 CompressEntryToData.at(Val: E);
18147 Align CommonAlignment = LI->getAlign();
18148 if (IsMasked) {
18149 unsigned VF = getNumElements(Ty: LoadVecTy);
18150 SmallVector<Constant *> MaskValues(
18151 VF / getNumElements(Ty: LI->getType()),
18152 ConstantInt::getFalse(Context&: VecTy->getContext()));
18153 for (int I : CompressMask)
18154 MaskValues[I] = ConstantInt::getTrue(Context&: VecTy->getContext());
18155 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) {
18156 assert(SLPReVec && "Only supported by REVEC.");
18157 MaskValues = replicateMask(Val: MaskValues, VF: VecTy->getNumElements());
18158 }
18159 Constant *MaskValue = ConstantVector::get(V: MaskValues);
18160 NewLI = Builder.CreateMaskedLoad(Ty: LoadVecTy, Ptr: PO, Alignment: CommonAlignment,
18161 Mask: MaskValue);
18162 } else {
18163 NewLI = Builder.CreateAlignedLoad(Ty: LoadVecTy, Ptr: PO, Align: CommonAlignment);
18164 }
18165 NewLI = ::propagateMetadata(Inst: NewLI, VL: E->Scalars);
18166 // TODO: include this cost into CommonCost.
18167 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) {
18168 assert(SLPReVec && "FixedVectorType is not expected.");
18169 transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(),
18170 Mask&: CompressMask);
18171 }
18172 NewLI =
18173 cast<Instruction>(Val: Builder.CreateShuffleVector(V: NewLI, Mask: CompressMask));
18174 } else if (E->State == TreeEntry::StridedVectorize) {
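// Strided loads are emitted as an experimental_vp_strided_load intrinsic,
// using either a constant stride computed from the pointer difference or
// a stride computed at runtime.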
18175 Value *Ptr0 = cast<LoadInst>(Val: E->Scalars.front())->getPointerOperand();
18176 Value *PtrN = cast<LoadInst>(Val: E->Scalars.back())->getPointerOperand();
18177 PO = IsReverseOrder ? PtrN : Ptr0;
18178 std::optional<int64_t> Diff = getPointersDiff(
18179 ElemTyA: VL0->getType(), PtrA: Ptr0, ElemTyB: VL0->getType(), PtrB: PtrN, DL: *DL, SE&: *SE);
18180 Type *StrideTy = DL->getIndexType(PtrTy: PO->getType());
18181 Value *StrideVal;
18182 if (Diff) {
18183 int64_t Stride =
18184 *Diff / (static_cast<int64_t>(E->Scalars.size()) - 1);
18185 StrideVal =
18186 ConstantInt::get(Ty: StrideTy, V: (IsReverseOrder ? -1 : 1) * Stride *
18187 DL->getTypeAllocSize(Ty: ScalarTy));
18188 } else {
18189 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
18190 transform(Range&: E->Scalars, d_first: PointerOps.begin(), F: [](Value *V) {
18191 return cast<LoadInst>(Val: V)->getPointerOperand();
18192 });
18193 OrdersType Order;
18194 std::optional<Value *> Stride =
18195 calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order,
18196 Inst: &*Builder.GetInsertPoint());
18197 Value *NewStride =
18198 Builder.CreateIntCast(V: *Stride, DestTy: StrideTy, /*isSigned=*/true);
18199 StrideVal = Builder.CreateMul(
18200 LHS: NewStride,
18201 RHS: ConstantInt::get(
18202 Ty: StrideTy,
18203 V: (IsReverseOrder ? -1 : 1) *
18204 static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))));
18205 }
18206 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
18207 auto *Inst = Builder.CreateIntrinsic(
18208 ID: Intrinsic::experimental_vp_strided_load,
18209 Types: {VecTy, PO->getType(), StrideTy},
18210 Args: {PO, StrideVal, Builder.getAllOnesMask(NumElts: VecTy->getElementCount()),
18211 Builder.getInt32(C: E->Scalars.size())});
18212 Inst->addParamAttr(
18213 /*ArgNo=*/0,
18214 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
18215 NewLI = Inst;
18216 } else {
18217 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
18218 Value *VecPtr = vectorizeOperand(E, NodeIdx: 0);
18219 if (isa<FixedVectorType>(Val: ScalarTy)) {
18220 assert(SLPReVec && "FixedVectorType is not expected.");
18221 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We
18222 // need to expand VecPtr if ScalarTy is a vector type.
18223 unsigned ScalarTyNumElements =
18224 cast<FixedVectorType>(Val: ScalarTy)->getNumElements();
18225 unsigned VecTyNumElements =
18226 cast<FixedVectorType>(Val: VecTy)->getNumElements();
18227 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
18228 "Cannot expand getelementptr.");
18229 unsigned VF = VecTyNumElements / ScalarTyNumElements;
18230 SmallVector<Constant *> Indices(VecTyNumElements);
18231 transform(Range: seq(Size: VecTyNumElements), d_first: Indices.begin(), F: [=](unsigned I) {
18232 return Builder.getInt64(C: I % ScalarTyNumElements);
18233 });
18234 VecPtr = Builder.CreateGEP(
18235 Ty: VecTy->getElementType(),
18236 Ptr: Builder.CreateShuffleVector(
18237 V: VecPtr, Mask: createReplicatedMask(ReplicationFactor: ScalarTyNumElements, VF)),
18238 IdxList: ConstantVector::get(V: Indices));
18239 }
18240 // Use the minimum alignment of the gathered loads.
18241 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
18242 NewLI = Builder.CreateMaskedGather(Ty: VecTy, Ptrs: VecPtr, Alignment: CommonAlignment);
18243 }
18244 Value *V = E->State == TreeEntry::CompressVectorize
18245 ? NewLI
18246 : ::propagateMetadata(Inst: NewLI, VL: E->Scalars);
18247
18248 V = FinalShuffle(V, E);
18249 E->VectorizedValue = V;
18250 ++NumVectorInstructions;
18251 return V;
18252 }
18253 case Instruction::Store: {
18254 auto *SI = cast<StoreInst>(Val: VL0);
18255
18256 setInsertPointAfterBundle(E);
18257
18258 Value *VecValue = vectorizeOperand(E, NodeIdx: 0);
18259 if (VecValue->getType() != VecTy)
18260 VecValue =
18261 Builder.CreateIntCast(V: VecValue, DestTy: VecTy, isSigned: GetOperandSignedness(0));
18262 VecValue = FinalShuffle(VecValue, E);
18263
18264 Value *Ptr = SI->getPointerOperand();
18265 Instruction *ST;
18266 if (E->State == TreeEntry::Vectorize) {
18267 ST = Builder.CreateAlignedStore(Val: VecValue, Ptr, Align: SI->getAlign());
18268 } else {
18269 assert(E->State == TreeEntry::StridedVectorize &&
18270 "Expected either strided or consecutive stores.");
18271 if (!E->ReorderIndices.empty()) {
18272 SI = cast<StoreInst>(Val: E->Scalars[E->ReorderIndices.front()]);
18273 Ptr = SI->getPointerOperand();
18274 }
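// Strided stores are emitted as an experimental_vp_strided_store intrinsic
// with a constant negative element-size stride, so the lanes are stored at
// decreasing addresses.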
18275 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E->Scalars);
18276 Type *StrideTy = DL->getIndexType(PtrTy: SI->getPointerOperandType());
18277 auto *Inst = Builder.CreateIntrinsic(
18278 ID: Intrinsic::experimental_vp_strided_store,
18279 Types: {VecTy, Ptr->getType(), StrideTy},
18280 Args: {VecValue, Ptr,
18281 ConstantInt::get(
18282 Ty: StrideTy, V: -static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))),
18283 Builder.getAllOnesMask(NumElts: VecTy->getElementCount()),
18284 Builder.getInt32(C: E->Scalars.size())});
18285 Inst->addParamAttr(
18286 /*ArgNo=*/1,
18287 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
18288 ST = Inst;
18289 }
18290
18291 Value *V = ::propagateMetadata(Inst: ST, VL: E->Scalars);
18292
18293 E->VectorizedValue = V;
18294 ++NumVectorInstructions;
18295 return V;
18296 }
18297 case Instruction::GetElementPtr: {
18298 auto *GEP0 = cast<GetElementPtrInst>(Val: VL0);
18299 setInsertPointAfterBundle(E);
18300
18301 Value *Op0 = vectorizeOperand(E, NodeIdx: 0);
18302
18303 SmallVector<Value *> OpVecs;
18304 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
18305 Value *OpVec = vectorizeOperand(E, NodeIdx: J);
18306 OpVecs.push_back(Elt: OpVec);
18307 }
18308
18309 Value *V = Builder.CreateGEP(Ty: GEP0->getSourceElementType(), Ptr: Op0, IdxList: OpVecs);
18310 if (Instruction *I = dyn_cast<GetElementPtrInst>(Val: V)) {
18311 SmallVector<Value *> GEPs;
18312 for (Value *V : E->Scalars) {
18313 if (isa<GetElementPtrInst>(Val: V))
18314 GEPs.push_back(Elt: V);
18315 }
18316 V = ::propagateMetadata(Inst: I, VL: GEPs);
18317 }
18318
18319 V = FinalShuffle(V, E);
18320
18321 E->VectorizedValue = V;
18322 ++NumVectorInstructions;
18323
18324 return V;
18325 }
18326 case Instruction::Call: {
18327 CallInst *CI = cast<CallInst>(Val: VL0);
18328 setInsertPointAfterBundle(E);
18329
18330 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
18331
18332 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
18333 CI, ID, VF: VecTy->getNumElements(),
18334 MinBW: It != MinBWs.end() ? It->second.first : 0, TTI);
18335 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
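// Prefer the vector intrinsic when it is not more expensive than the
// corresponding vector library call.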
18336 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
18337 VecCallCosts.first <= VecCallCosts.second;
18338
18339 Value *ScalarArg = nullptr;
18340 SmallVector<Value *> OpVecs;
18341 SmallVector<Type *, 2> TysForDecl;
18342 // Add return type if intrinsic is overloaded on it.
18343 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: -1, TTI))
18344 TysForDecl.push_back(Elt: VecTy);
18345 auto *CEI = cast<CallInst>(Val: VL0);
18346 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
18347 // Some intrinsics have scalar arguments. This argument should not be
18348 // vectorized.
18349 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI)) {
18350 ScalarArg = CEI->getArgOperand(i: I);
18351 // If the bitwidth of the abs intrinsic was reduced, its second argument
18352 // must be set to false (do not return poison if the value is signed min).
18353 if (ID == Intrinsic::abs && It != MinBWs.end() &&
18354 It->second.first < DL->getTypeSizeInBits(Ty: CEI->getType()))
18355 ScalarArg = Builder.getFalse();
18356 OpVecs.push_back(Elt: ScalarArg);
18357 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI))
18358 TysForDecl.push_back(Elt: ScalarArg->getType());
18359 continue;
18360 }
18361
18362 Value *OpVec = vectorizeOperand(E, NodeIdx: I);
18363 ScalarArg = CEI->getArgOperand(i: I);
18364 if (cast<VectorType>(Val: OpVec->getType())->getElementType() !=
18365 ScalarArg->getType()->getScalarType() &&
18366 It == MinBWs.end()) {
18367 auto *CastTy =
18368 getWidenedType(ScalarTy: ScalarArg->getType(), VF: VecTy->getNumElements());
18369 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: CastTy, isSigned: GetOperandSignedness(I));
18370 } else if (It != MinBWs.end()) {
18371 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
18372 }
18373 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
18374 OpVecs.push_back(Elt: OpVec);
18375 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI))
18376 TysForDecl.push_back(Elt: OpVec->getType());
18377 }
18378
18379 Function *CF;
18380 if (!UseIntrinsic) {
18381 VFShape Shape =
18382 VFShape::get(FTy: CI->getFunctionType(),
18383 EC: ElementCount::getFixed(
18384 MinVal: static_cast<unsigned>(VecTy->getNumElements())),
18385 HasGlobalPred: false /*HasGlobalPred*/);
18386 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
18387 } else {
18388 CF = Intrinsic::getOrInsertDeclaration(M: F->getParent(), id: ID, Tys: TysForDecl);
18389 }
18390
18391 SmallVector<OperandBundleDef, 1> OpBundles;
18392 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
18393 Value *V = Builder.CreateCall(Callee: CF, Args: OpVecs, OpBundles);
18394
18395 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
18396 V = FinalShuffle(V, E);
18397
18398 E->VectorizedValue = V;
18399 ++NumVectorInstructions;
18400 return V;
18401 }
18402 case Instruction::ShuffleVector: {
18403 Value *V;
18404 if (SLPReVec && !E->isAltShuffle()) {
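// REVEC: the scalars are shufflevector instructions themselves; compose
// their masks into a single shuffle of the vectorized source operand.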
18405 setInsertPointAfterBundle(E);
18406 Value *Src = vectorizeOperand(E, NodeIdx: 0);
18407 SmallVector<int> ThisMask(calculateShufflevectorMask(VL: E->Scalars));
18408 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Val: Src)) {
18409 SmallVector<int> NewMask(ThisMask.size());
18410 transform(Range&: ThisMask, d_first: NewMask.begin(), F: [&SVSrc](int Mask) {
18411 return SVSrc->getShuffleMask()[Mask];
18412 });
18413 V = Builder.CreateShuffleVector(V1: SVSrc->getOperand(i_nocapture: 0),
18414 V2: SVSrc->getOperand(i_nocapture: 1), Mask: NewMask);
18415 } else {
18416 V = Builder.CreateShuffleVector(V: Src, Mask: ThisMask);
18417 }
18418 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
18419 if (auto *I = dyn_cast<Instruction>(Val: V))
18420 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
18421 V = FinalShuffle(V, E);
18422 } else {
18423 assert(E->isAltShuffle() &&
18424 ((Instruction::isBinaryOp(E->getOpcode()) &&
18425 Instruction::isBinaryOp(E->getAltOpcode())) ||
18426 (Instruction::isCast(E->getOpcode()) &&
18427 Instruction::isCast(E->getAltOpcode())) ||
18428 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
18429 "Invalid Shuffle Vector Operand");
18430
18431 Value *LHS = nullptr, *RHS = nullptr;
18432 if (Instruction::isBinaryOp(Opcode: E->getOpcode()) || isa<CmpInst>(Val: VL0)) {
18433 setInsertPointAfterBundle(E);
18434 LHS = vectorizeOperand(E, NodeIdx: 0);
18435 RHS = vectorizeOperand(E, NodeIdx: 1);
18436 } else {
18437 setInsertPointAfterBundle(E);
18438 LHS = vectorizeOperand(E, NodeIdx: 0);
18439 }
18440 if (LHS && RHS &&
18441 ((Instruction::isBinaryOp(Opcode: E->getOpcode()) &&
18442 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
18443 (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()))) {
18444 assert((It != MinBWs.end() ||
18445 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
18446 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
18447 MinBWs.contains(getOperandEntry(E, 0)) ||
18448 MinBWs.contains(getOperandEntry(E, 1))) &&
18449 "Expected item in MinBWs.");
18450 Type *CastTy = VecTy;
18451 if (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()) {
18452 if (cast<VectorType>(Val: LHS->getType())
18453 ->getElementType()
18454 ->getIntegerBitWidth() < cast<VectorType>(Val: RHS->getType())
18455 ->getElementType()
18456 ->getIntegerBitWidth())
18457 CastTy = RHS->getType();
18458 else
18459 CastTy = LHS->getType();
18460 }
18461 if (LHS->getType() != CastTy)
18462 LHS = Builder.CreateIntCast(V: LHS, DestTy: CastTy, isSigned: GetOperandSignedness(0));
18463 if (RHS->getType() != CastTy)
18464 RHS = Builder.CreateIntCast(V: RHS, DestTy: CastTy, isSigned: GetOperandSignedness(1));
18465 }
18466
18467 Value *V0, *V1;
18468 if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
18469 V0 = Builder.CreateBinOp(
18470 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
18471 V1 = Builder.CreateBinOp(
18472 Opc: static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
18473 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
18474 V0 = Builder.CreateCmp(Pred: CI0->getPredicate(), LHS, RHS);
18475 auto *AltCI = cast<CmpInst>(Val: E->getAltOp());
18476 CmpInst::Predicate AltPred = AltCI->getPredicate();
18477 V1 = Builder.CreateCmp(Pred: AltPred, LHS, RHS);
18478 } else {
18479 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
18480 unsigned SrcBWSz = DL->getTypeSizeInBits(
18481 Ty: cast<VectorType>(Val: LHS->getType())->getElementType());
18482 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
18483 if (BWSz <= SrcBWSz) {
18484 if (BWSz < SrcBWSz)
18485 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: It->second.first);
18486 assert(LHS->getType() == VecTy &&
18487 "Expected same type as operand.");
18488 if (auto *I = dyn_cast<Instruction>(Val: LHS))
18489 LHS = ::propagateMetadata(Inst: I, VL: E->Scalars);
18490 LHS = FinalShuffle(LHS, E);
18491 E->VectorizedValue = LHS;
18492 ++NumVectorInstructions;
18493 return LHS;
18494 }
18495 }
18496 V0 = Builder.CreateCast(
18497 Op: static_cast<Instruction::CastOps>(E->getOpcode()), V: LHS, DestTy: VecTy);
18498 V1 = Builder.CreateCast(
18499 Op: static_cast<Instruction::CastOps>(E->getAltOpcode()), V: LHS, DestTy: VecTy);
18500 }
18501 // Add V0 and V1 to later analysis to try to find and remove matching
18502 // instruction, if any.
18503 for (Value *V : {V0, V1}) {
18504 if (auto *I = dyn_cast<Instruction>(Val: V)) {
18505 GatherShuffleExtractSeq.insert(X: I);
18506 CSEBlocks.insert(V: I->getParent());
18507 }
18508 }
18509
18510 // Create shuffle to take alternate operations from the vector.
18511 // Also, gather up main and alt scalar ops to propagate IR flags to
18512 // each vector operation.
18513 ValueList OpScalars, AltScalars;
18514 SmallVector<int> Mask;
18515 E->buildAltOpShuffleMask(
18516 IsAltOp: [E, this](Instruction *I) {
18517 assert(E->getMatchingMainOpOrAltOp(I) &&
18518 "Unexpected main/alternate opcode");
18519 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
18520 TLI: *TLI);
18521 },
18522 Mask, OpScalars: &OpScalars, AltScalars: &AltScalars);
18523
18524 propagateIRFlags(I: V0, VL: OpScalars, OpValue: E->getMainOp(), IncludeWrapFlags: It == MinBWs.end());
18525 propagateIRFlags(I: V1, VL: AltScalars, OpValue: E->getAltOp(), IncludeWrapFlags: It == MinBWs.end());
18526 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
18527 // Drop nuw flags for abs(sub(commutative), true).
18528 if (auto *I = dyn_cast<Instruction>(Val: Vec);
18529 I && Opcode == Instruction::Sub && !MinBWs.contains(Val: E) &&
18530 any_of(Range&: E->Scalars, P: [](Value *V) {
18531 if (isa<PoisonValue>(Val: V))
18532 return false;
18533 auto *IV = cast<Instruction>(Val: V);
18534 return IV->getOpcode() == Instruction::Sub && isCommutative(I: IV);
18535 }))
18536 I->setHasNoUnsignedWrap(/*b=*/false);
18537 };
18538 DropNuwFlag(V0, E->getOpcode());
18539 DropNuwFlag(V1, E->getAltOpcode());
18540
18541 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
18542 assert(SLPReVec && "FixedVectorType is not expected.");
18543 transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(), Mask);
18544 }
18545 V = Builder.CreateShuffleVector(V1: V0, V2: V1, Mask);
18546 if (auto *I = dyn_cast<Instruction>(Val: V)) {
18547 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
18548 GatherShuffleExtractSeq.insert(X: I);
18549 CSEBlocks.insert(V: I->getParent());
18550 }
18551 }
18552
18553 E->VectorizedValue = V;
18554 ++NumVectorInstructions;
18555
18556 return V;
18557 }
18558 default:
18559 llvm_unreachable("unknown inst");
18560 }
18561 return nullptr;
18562}
18563
18564Value *BoUpSLP::vectorizeTree() {
18565 ExtraValueToDebugLocsMap ExternallyUsedValues;
18566 return vectorizeTree(ExternallyUsedValues);
18567}
18568
18569Value *BoUpSLP::vectorizeTree(
18570 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
18571 Instruction *ReductionRoot,
18572 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
18573 // Clear the Entry-to-LastInstruction table. It can be affected by
18574 // scheduling and needs to be rebuilt.
18575 EntryToLastInstruction.clear();
18576 // All blocks must be scheduled before any instructions are inserted.
18577 for (auto &BSIter : BlocksSchedules)
18578 scheduleBlock(BS: BSIter.second.get());
18579 // Cache last instructions for the nodes to avoid side effects, which may
18580 // appear during vectorization, like extra uses, etc.
18581 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
18582 if (TE->isGather())
18583 continue;
18584 (void)getLastInstructionInBundle(E: TE.get());
18585 }
18586
18587 if (ReductionRoot)
18588 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
18589 IP: ReductionRoot->getIterator());
18590 else
18591 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
18592
18593 // Vectorize gather operands of the nodes with the external uses only.
18594 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
18595 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
18596 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
18597 TE->UserTreeIndex.UserTE->hasState() &&
18598 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
18599 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
18600 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
18601 all_of(Range&: TE->UserTreeIndex.UserTE->Scalars,
18602 P: [](Value *V) { return isUsedOutsideBlock(V); })) {
18603 Instruction &LastInst =
18604 getLastInstructionInBundle(E: TE->UserTreeIndex.UserTE);
18605 GatherEntries.emplace_back(Args: TE.get(), Args: &LastInst);
18606 }
18607 }
18608 for (auto &Entry : GatherEntries) {
18609 IRBuilderBase::InsertPointGuard Guard(Builder);
18610 Builder.SetInsertPoint(Entry.second);
18611 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
18612 (void)vectorizeTree(E: Entry.first);
18613 }
18614 // Emit gathered loads first to emit better code for the users of those
18615 // gathered loads.
18616 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
18617 if (GatheredLoadsEntriesFirst.has_value() &&
18618 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
18619 (!TE->isGather() || TE->UserTreeIndex)) {
18620 assert((TE->UserTreeIndex ||
18621 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
18622 "Expected gathered load node.");
18623 (void)vectorizeTree(E: TE.get());
18624 }
18625 }
18626 (void)vectorizeTree(E: VectorizableTree[0].get());
18627 // Run through the list of postponed gathers and emit them, replacing the temp
18628 // emitted allocas with actual vector instructions.
18629 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
18630 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
18631 for (const TreeEntry *E : PostponedNodes) {
18632 auto *TE = const_cast<TreeEntry *>(E);
18633 auto *PrevVec = cast<Instruction>(Val&: TE->VectorizedValue);
18634 TE->VectorizedValue = nullptr;
18635 auto *UserI = cast<Instruction>(Val&: TE->UserTreeIndex.UserTE->VectorizedValue);
18636 // If the user is a PHI node, its vector code has to be inserted right
18637 // before the block terminator. Since the node was delayed, there were some
18638 // unresolved dependencies at the moment the stub instruction was emitted.
18639 // If any of these dependencies turn out to be an operand of another PHI
18640 // coming from this same block, the position of the stub instruction becomes
18641 // invalid, because the source vector that is supposed to feed this gather
18642 // node was inserted at the end of the block [after the stub instruction].
18643 // So we need to adjust the insertion point again to the end of the block.
18644 if (isa<PHINode>(Val: UserI)) {
18645 // Insert before all users.
18646 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
18647 for (User *U : PrevVec->users()) {
18648 if (U == UserI)
18649 continue;
18650 auto *UI = dyn_cast<Instruction>(Val: U);
18651 if (!UI || isa<PHINode>(Val: UI) || UI->getParent() != InsertPt->getParent())
18652 continue;
18653 if (UI->comesBefore(Other: InsertPt))
18654 InsertPt = UI;
18655 }
18656 Builder.SetInsertPoint(InsertPt);
18657 } else {
18658 Builder.SetInsertPoint(PrevVec);
18659 }
18660 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
18661 Value *Vec = vectorizeTree(E: TE);
18662 if (auto *VecI = dyn_cast<Instruction>(Val: Vec);
18663 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
18664 Builder.GetInsertPoint()->comesBefore(Other: VecI))
18665 VecI->moveBeforePreserving(BB&: *Builder.GetInsertBlock(),
18666 I: Builder.GetInsertPoint());
18667 if (Vec->getType() != PrevVec->getType()) {
18668 assert(Vec->getType()->isIntOrIntVectorTy() &&
18669 PrevVec->getType()->isIntOrIntVectorTy() &&
18670 "Expected integer vector types only.");
18671 std::optional<bool> IsSigned;
18672 for (Value *V : TE->Scalars) {
18673 if (isVectorized(V)) {
18674 for (const TreeEntry *MNTE : getTreeEntries(V)) {
18675 auto It = MinBWs.find(Val: MNTE);
18676 if (It != MinBWs.end()) {
18677 IsSigned = IsSigned.value_or(u: false) || It->second.second;
18678 if (*IsSigned)
18679 break;
18680 }
18681 }
18682 if (IsSigned.value_or(u: false))
18683 break;
18684 // Scan through gather nodes.
18685 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
18686 auto It = MinBWs.find(Val: BVE);
18687 if (It != MinBWs.end()) {
18688 IsSigned = IsSigned.value_or(u: false) || It->second.second;
18689 if (*IsSigned)
18690 break;
18691 }
18692 }
18693 if (IsSigned.value_or(u: false))
18694 break;
18695 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V)) {
18696 IsSigned =
18697 IsSigned.value_or(u: false) ||
18698 !isKnownNonNegative(V: EE->getVectorOperand(), SQ: SimplifyQuery(*DL));
18699 continue;
18700 }
18701 if (IsSigned.value_or(u: false))
18702 break;
18703 }
18704 }
18705 if (IsSigned.value_or(u: false)) {
18706 // Final attempt - check user node.
18707 auto It = MinBWs.find(Val: TE->UserTreeIndex.UserTE);
18708 if (It != MinBWs.end())
18709 IsSigned = It->second.second;
18710 }
18711 assert(IsSigned &&
18712 "Expected user node or perfect diamond match in MinBWs.");
18713 Vec = Builder.CreateIntCast(V: Vec, DestTy: PrevVec->getType(), isSigned: *IsSigned);
18714 }
18715 PrevVec->replaceAllUsesWith(V: Vec);
18716 PostponedValues.try_emplace(Key: Vec).first->second.push_back(Elt: TE);
18717 // Replace the stub vector node if it was already used for one of the
18718 // buildvector nodes.
18719 auto It = PostponedValues.find(Val: PrevVec);
18720 if (It != PostponedValues.end()) {
18721 for (TreeEntry *VTE : It->getSecond())
18722 VTE->VectorizedValue = Vec;
18723 }
18724 eraseInstruction(I: PrevVec);
18725 }
18726
18727 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
18728 << " values.\n");
18729
18730 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
18731 // Maps vector instruction to original insertelement instruction
18732 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
18733 // Maps extract Scalar to the corresponding extractelement instruction in the
18734 // basic block. Only one extractelement per block should be emitted.
18735 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
18736 ScalarToEEs;
18737 SmallDenseSet<Value *, 4> UsedInserts;
18738 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
18739 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
18740 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
18741 // Extract all of the elements with the external uses.
18742 for (const auto &ExternalUse : ExternalUses) {
18743 Value *Scalar = ExternalUse.Scalar;
18744 llvm::User *User = ExternalUse.User;
18745
18746 // Skip users that we have already replaced (RAUW). This happens when one
18747 // instruction has multiple uses of the same value.
18748 if (User && !is_contained(Range: Scalar->users(), Element: User))
18749 continue;
18750 const TreeEntry *E = &ExternalUse.E;
18751 assert(E && "Invalid scalar");
18752 assert(!E->isGather() && "Extracting from a gather list");
18753 // Non-instruction pointers are not deleted, just skip them.
18754 if (E->getOpcode() == Instruction::GetElementPtr &&
18755 !isa<GetElementPtrInst>(Val: Scalar))
18756 continue;
18757
18758 Value *Vec = E->VectorizedValue;
18759 assert(Vec && "Can't find vectorizable value");
18760
18761 Value *Lane = Builder.getInt32(C: ExternalUse.Lane);
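// Emit (or reuse) an extractelement for the external use of Scalar and,
// if the value was demoted, cast it back to the original scalar type.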
18762 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
18763 if (Scalar->getType() != Vec->getType()) {
18764 Value *Ex = nullptr;
18765 Value *ExV = nullptr;
18766 auto *Inst = dyn_cast<Instruction>(Val: Scalar);
18767 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Ptr: Inst);
18768 auto It = ScalarToEEs.find(Val: Scalar);
18769 if (It != ScalarToEEs.end()) {
18770 // No need to emit many extracts; just move the single existing one into
18771 // the current block.
18772 auto EEIt = It->second.find(Val: ReplaceInst ? Inst->getParent()
18773 : Builder.GetInsertBlock());
18774 if (EEIt != It->second.end()) {
18775 Value *PrevV = EEIt->second.first;
18776 if (auto *I = dyn_cast<Instruction>(Val: PrevV);
18777 I && !ReplaceInst &&
18778 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
18779 Builder.GetInsertPoint()->comesBefore(Other: I)) {
18780 I->moveBefore(BB&: *Builder.GetInsertPoint()->getParent(),
18781 I: Builder.GetInsertPoint());
18782 if (auto *CI = dyn_cast<Instruction>(Val: EEIt->second.second))
18783 CI->moveAfter(MovePos: I);
18784 }
18785 Ex = PrevV;
18786 ExV = EEIt->second.second ? EEIt->second.second : Ex;
18787 }
18788 }
18789 if (!Ex) {
18790 // "Reuse" the existing extract to improve final codegen.
18791 if (ReplaceInst) {
18792 // Leave the instruction as is if extracting is cheaper and all
18793 // operands are scalar.
18794 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst)) {
18795 IgnoredExtracts.insert(V: EE);
18796 Ex = EE;
18797 } else {
18798 auto *CloneInst = Inst->clone();
18799 CloneInst->insertBefore(InsertPos: Inst->getIterator());
18800 if (Inst->hasName())
18801 CloneInst->takeName(V: Inst);
18802 Ex = CloneInst;
18803 }
18804 } else if (auto *ES = dyn_cast<ExtractElementInst>(Val: Scalar);
18805 ES && isa<Instruction>(Val: Vec)) {
18806 Value *V = ES->getVectorOperand();
18807 auto *IVec = cast<Instruction>(Val: Vec);
18808 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
18809 V = ETEs.front()->VectorizedValue;
18810 if (auto *IV = dyn_cast<Instruction>(Val: V);
18811 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
18812 IV->comesBefore(Other: IVec))
18813 Ex = Builder.CreateExtractElement(Vec: V, Idx: ES->getIndexOperand());
18814 else
18815 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
18816 } else if (auto *VecTy =
18817 dyn_cast<FixedVectorType>(Val: Scalar->getType())) {
18818 assert(SLPReVec && "FixedVectorType is not expected.");
18819 unsigned VecTyNumElements = VecTy->getNumElements();
18820 // When REVEC is enabled, we need to extract a vector.
18821 // Note: The element size of Scalar may be different from the
18822 // element size of Vec.
18823 Ex = createExtractVector(Builder, Vec, SubVecVF: VecTyNumElements,
18824 Index: ExternalUse.Lane * VecTyNumElements);
18825 } else {
18826 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
18827 }
18828 // If necessary, sign-extend or zero-extend ScalarRoot
18829 // to the larger type.
18830 ExV = Ex;
18831 if (Scalar->getType() != Ex->getType())
18832 ExV = Builder.CreateIntCast(
18833 V: Ex, DestTy: Scalar->getType(),
18834 isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
18835 auto *I = dyn_cast<Instruction>(Val: Ex);
18836 ScalarToEEs[Scalar].try_emplace(Key: I ? I->getParent()
18837 : &F->getEntryBlock(),
18838 Args: std::make_pair(x&: Ex, y&: ExV));
18839 }
18840 // The then-branch of the previous if may produce constants, since
18841 // operand 0 might be a constant.
18842 if (auto *ExI = dyn_cast<Instruction>(Val: Ex);
18843 ExI && !isa<PHINode>(Val: ExI) && !mayHaveNonDefUseDependency(I: *ExI)) {
18844 GatherShuffleExtractSeq.insert(X: ExI);
18845 CSEBlocks.insert(V: ExI->getParent());
18846 }
18847 return ExV;
18848 }
18849 assert(isa<FixedVectorType>(Scalar->getType()) &&
18850 isa<InsertElementInst>(Scalar) &&
18851 "In-tree scalar of vector type is not insertelement?");
18852 auto *IE = cast<InsertElementInst>(Val: Scalar);
18853 VectorToInsertElement.try_emplace(Key: Vec, Args&: IE);
18854 return Vec;
18855 };
18856 // If User == nullptr, the Scalar remains as a scalar in the vectorized
18857 // instructions or is used as an extra argument. Generate an ExtractElement
18858 // instruction and update the record for this scalar in ExternallyUsedValues.
18859 if (!User) {
18860 if (!ScalarsWithNullptrUser.insert(V: Scalar).second)
18861 continue;
18862 assert(
18863 (ExternallyUsedValues.count(Scalar) ||
18864 Scalar->hasNUsesOrMore(UsesLimit) ||
18865 ExternalUsesAsOriginalScalar.contains(Scalar) ||
18866 any_of(
18867 Scalar->users(),
18868 [&, TTI = TTI](llvm::User *U) {
18869 if (ExternalUsesAsOriginalScalar.contains(U))
18870 return true;
18871 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
18872 return !UseEntries.empty() &&
18873 (E->State == TreeEntry::Vectorize ||
18874 E->State == TreeEntry::StridedVectorize ||
18875 E->State == TreeEntry::CompressVectorize) &&
18876 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
18877 return (UseEntry->State == TreeEntry::Vectorize ||
18878 UseEntry->State ==
18879 TreeEntry::StridedVectorize ||
18880 UseEntry->State ==
18881 TreeEntry::CompressVectorize) &&
18882 doesInTreeUserNeedToExtract(
18883 Scalar, getRootEntryInstruction(*UseEntry),
18884 TLI, TTI);
18885 });
18886 })) &&
18887 "Scalar with nullptr User must be registered in "
18888 "ExternallyUsedValues map or remain as scalar in vectorized "
18889 "instructions");
18890 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
18891 if (auto *PHI = dyn_cast<PHINode>(Val: VecI)) {
18892 if (PHI->getParent()->isLandingPad())
18893 Builder.SetInsertPoint(
18894 TheBB: PHI->getParent(),
18895 IP: std::next(
18896 x: PHI->getParent()->getLandingPadInst()->getIterator()));
18897 else
18898 Builder.SetInsertPoint(TheBB: PHI->getParent(),
18899 IP: PHI->getParent()->getFirstNonPHIIt());
18900 } else {
18901 Builder.SetInsertPoint(TheBB: VecI->getParent(),
18902 IP: std::next(x: VecI->getIterator()));
18903 }
18904 } else {
18905 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
18906 }
18907 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
18908 // Required to update internally referenced instructions.
18909 if (Scalar != NewInst) {
18910 assert((!isa<ExtractElementInst>(Scalar) ||
18911 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
18912 "Extractelements should not be replaced.");
18913 Scalar->replaceAllUsesWith(V: NewInst);
18914 }
18915 continue;
18916 }
18917
18918 if (auto *VU = dyn_cast<InsertElementInst>(Val: User);
18919 VU && VU->getOperand(i_nocapture: 1) == Scalar) {
18920 // Skip if the scalar is another vector op or Vec is not an instruction.
18921 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Val: Vec)) {
18922 if (auto *FTy = dyn_cast<FixedVectorType>(Val: User->getType())) {
18923 if (!UsedInserts.insert(V: VU).second)
18924 continue;
18925 // Need to use original vector, if the root is truncated.
18926 auto BWIt = MinBWs.find(Val: E);
18927 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
18928 auto *ScalarTy = FTy->getElementType();
18929 auto Key = std::make_pair(x&: Vec, y&: ScalarTy);
18930 auto VecIt = VectorCasts.find(Val: Key);
18931 if (VecIt == VectorCasts.end()) {
18932 IRBuilderBase::InsertPointGuard Guard(Builder);
18933 if (auto *IVec = dyn_cast<PHINode>(Val: Vec)) {
18934 if (IVec->getParent()->isLandingPad())
18935 Builder.SetInsertPoint(TheBB: IVec->getParent(),
18936 IP: std::next(x: IVec->getParent()
18937 ->getLandingPadInst()
18938 ->getIterator()));
18939 else
18940 Builder.SetInsertPoint(
18941 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
18942 } else if (auto *IVec = dyn_cast<Instruction>(Val: Vec)) {
18943 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
18944 }
18945 Vec = Builder.CreateIntCast(
18946 V: Vec,
18947 DestTy: getWidenedType(
18948 ScalarTy,
18949 VF: cast<FixedVectorType>(Val: Vec->getType())->getNumElements()),
18950 isSigned: BWIt->second.second);
18951 VectorCasts.try_emplace(Key, Args&: Vec);
18952 } else {
18953 Vec = VecIt->second;
18954 }
18955 }
18956
18957 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
18958 if (InsertIdx) {
18959 auto *It = find_if(
18960 Range&: ShuffledInserts, P: [VU](const ShuffledInsertData<Value *> &Data) {
18961 // Checks if 2 insertelements are from the same buildvector.
18962 InsertElementInst *VecInsert = Data.InsertElements.front();
18963 return areTwoInsertFromSameBuildVector(
18964 VU, V: VecInsert,
18965 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); });
18966 });
18967 unsigned Idx = *InsertIdx;
18968 if (It == ShuffledInserts.end()) {
18969 (void)ShuffledInserts.emplace_back();
18970 It = std::next(x: ShuffledInserts.begin(),
18971 n: ShuffledInserts.size() - 1);
18972 }
18973 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
18974 if (Mask.empty())
18975 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
18976 Mask[Idx] = ExternalUse.Lane;
18977 It->InsertElements.push_back(Elt: cast<InsertElementInst>(Val: User));
18978 continue;
18979 }
18980 }
18981 }
18982 }
18983
18984 // Generate extracts for out-of-tree users.
18985 // Find the insertion point for the extractelement lane.
18986 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
18987 if (PHINode *PH = dyn_cast<PHINode>(Val: User)) {
18988 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
18989 if (PH->getIncomingValue(i: I) == Scalar) {
18990 Instruction *IncomingTerminator =
18991 PH->getIncomingBlock(i: I)->getTerminator();
18992 if (isa<CatchSwitchInst>(Val: IncomingTerminator)) {
18993 Builder.SetInsertPoint(TheBB: VecI->getParent(),
18994 IP: std::next(x: VecI->getIterator()));
18995 } else {
18996 Builder.SetInsertPoint(PH->getIncomingBlock(i: I)->getTerminator());
18997 }
18998 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
18999 PH->setOperand(i_nocapture: I, Val_nocapture: NewInst);
19000 }
19001 }
19002 } else {
19003 Builder.SetInsertPoint(cast<Instruction>(Val: User));
19004 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
19005 User->replaceUsesOfWith(From: Scalar, To: NewInst);
19006 }
19007 } else {
19008 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
19009 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
19010 User->replaceUsesOfWith(From: Scalar, To: NewInst);
19011 }
19012
19013 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
19014 }
19015
19016 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
19017 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
19018 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
19019 int VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
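    // Illustrative example (values not taken from any particular input): with
    // VF == 4 and Mask == {0, 5, 2, 7}, the loop below produces
    // CombinedMask1 == {0, P, 2, P} and CombinedMask2 == {P, 1, P, 3}, where
    // P stands for PoisonMaskElem, so every lane is routed to exactly one of
    // the two sources.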
19020 for (int I = 0, E = Mask.size(); I < E; ++I) {
19021 if (Mask[I] < VF)
19022 CombinedMask1[I] = Mask[I];
19023 else
19024 CombinedMask2[I] = Mask[I] - VF;
19025 }
19026 ShuffleInstructionBuilder ShuffleBuilder(
19027 cast<VectorType>(Val: V1->getType())->getElementType(), Builder, *this);
19028 ShuffleBuilder.add(V1, Mask: CombinedMask1);
19029 if (V2)
19030 ShuffleBuilder.add(V1: V2, Mask: CombinedMask2);
19031 return ShuffleBuilder.finalize(ExtMask: {}, SubVectors: {}, SubVectorsMask: {});
19032 };
19033
19034 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
19035 bool ForSingleMask) {
19036 unsigned VF = Mask.size();
19037 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
19038 if (VF != VecVF) {
19039 if (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
19040 Vec = CreateShuffle(Vec, nullptr, Mask);
19041 return std::make_pair(x&: Vec, y: true);
19042 }
19043 if (!ForSingleMask) {
19044 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
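        // Illustrative example (not from any particular input): with VF == 3
        // and Mask == {2, P, 0} (P == PoisonMaskElem), the loop below builds
        // ResizeMask == {0, P, 2}, an identity mask restricted to the used
        // lanes, so the shuffle only resizes Vec to VF elements without
        // moving the referenced elements.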
19045 for (unsigned I = 0; I < VF; ++I) {
19046 if (Mask[I] != PoisonMaskElem)
19047 ResizeMask[Mask[I]] = Mask[I];
19048 }
19049 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
19050 }
19051 }
19052
19053 return std::make_pair(x&: Vec, y: false);
19054 };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
19057 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
19058 // Find the first and the last instruction in the list of insertelements.
19059 sort(C&: ShuffledInserts[I].InsertElements, Comp: isFirstInsertElement);
19060 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
19061 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
19062 Builder.SetInsertPoint(LastInsert);
19063 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
19064 Value *NewInst = performExtractsShuffleAction<Value>(
19065 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()),
19066 Base: FirstInsert->getOperand(i_nocapture: 0),
19067 GetVF: [](Value *Vec) {
19068 return cast<VectorType>(Val: Vec->getType())
19069 ->getElementCount()
19070 .getKnownMinValue();
19071 },
19072 ResizeAction: ResizeToVF,
19073 Action: [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
19074 ArrayRef<Value *> Vals) {
19075 assert((Vals.size() == 1 || Vals.size() == 2) &&
19076 "Expected exactly 1 or 2 input values.");
19077 if (Vals.size() == 1) {
19078 // Do not create shuffle if the mask is a simple identity
19079 // non-resizing mask.
19080 if (Mask.size() != cast<FixedVectorType>(Val: Vals.front()->getType())
19081 ->getNumElements() ||
19082 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
19083 return CreateShuffle(Vals.front(), nullptr, Mask);
19084 return Vals.front();
19085 }
19086 return CreateShuffle(Vals.front() ? Vals.front()
19087 : FirstInsert->getOperand(i_nocapture: 0),
19088 Vals.back(), Mask);
19089 });
19090 auto It = ShuffledInserts[I].InsertElements.rbegin();
19091 // Rebuild buildvector chain.
19092 InsertElementInst *II = nullptr;
19093 if (It != ShuffledInserts[I].InsertElements.rend())
19094 II = *It;
19095 SmallVector<Instruction *> Inserts;
19096 while (It != ShuffledInserts[I].InsertElements.rend()) {
19097 assert(II && "Must be an insertelement instruction.");
19098 if (*It == II)
19099 ++It;
19100 else
19101 Inserts.push_back(Elt: cast<Instruction>(Val: II));
19102 II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
19103 }
19104 for (Instruction *II : reverse(C&: Inserts)) {
19105 II->replaceUsesOfWith(From: II->getOperand(i: 0), To: NewInst);
19106 if (auto *NewI = dyn_cast<Instruction>(Val: NewInst))
19107 if (II->getParent() == NewI->getParent() && II->comesBefore(Other: NewI))
19108 II->moveAfter(MovePos: NewI);
19109 NewInst = II;
19110 }
19111 LastInsert->replaceAllUsesWith(V: NewInst);
19112 for (InsertElementInst *IE : reverse(C&: ShuffledInserts[I].InsertElements)) {
19113 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 0),
19114 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 0)->getType()));
19115 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 1),
19116 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 1)->getType()));
19117 eraseInstruction(I: IE);
19118 }
19119 CSEBlocks.insert(V: LastInsert->getParent());
19120 }
19121
19122 SmallVector<Instruction *> RemovedInsts;
19123 // For each vectorized value:
19124 for (auto &TEPtr : VectorizableTree) {
19125 TreeEntry *Entry = TEPtr.get();
19126
19127 // No need to handle users of gathered values.
19128 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
19129 continue;
19130
19131 assert(Entry->VectorizedValue && "Can't find vectorizable value");
19132
19133 // For each lane:
19134 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
19135 Value *Scalar = Entry->Scalars[Lane];
19136
19137 if (Entry->getOpcode() == Instruction::GetElementPtr &&
19138 !isa<GetElementPtrInst>(Val: Scalar))
19139 continue;
19140 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Scalar);
19141 EE && IgnoredExtracts.contains(V: EE))
19142 continue;
19143 if (isa<PoisonValue>(Val: Scalar))
19144 continue;
19145#ifndef NDEBUG
19146 Type *Ty = Scalar->getType();
19147 if (!Ty->isVoidTy()) {
19148 for (User *U : Scalar->users()) {
19149 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
19150
19151 // It is legal to delete users in the ignorelist.
19152 assert((isVectorized(U) ||
19153 (UserIgnoreList && UserIgnoreList->contains(U)) ||
19154 (isa_and_nonnull<Instruction>(U) &&
19155 isDeleted(cast<Instruction>(U)))) &&
19156 "Deleting out-of-tree value");
19157 }
19158 }
19159#endif
19160 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
19161 auto *I = cast<Instruction>(Val: Scalar);
19162 RemovedInsts.push_back(Elt: I);
19163 }
19164 }
19165
19166 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
19167 // new vector instruction.
19168 if (auto *V = dyn_cast<Instruction>(Val&: VectorizableTree[0]->VectorizedValue))
19169 V->mergeDIAssignID(SourceInstructions: RemovedInsts);
19170
19171 // Clear up reduction references, if any.
19172 if (UserIgnoreList) {
19173 for (Instruction *I : RemovedInsts) {
19174 const TreeEntry *IE = getTreeEntries(V: I).front();
19175 if (IE->Idx != 0 &&
19176 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
19177 (ValueToGatherNodes.lookup(Val: I).contains(
19178 key: VectorizableTree.front().get()) ||
19179 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
19180 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
19181 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
19182 IE->UserTreeIndex &&
19183 is_contained(Range&: VectorizableTree.front()->Scalars, Element: I)) &&
19184 !(GatheredLoadsEntriesFirst.has_value() &&
19185 IE->Idx >= *GatheredLoadsEntriesFirst &&
19186 VectorizableTree.front()->isGather() &&
19187 is_contained(Range&: VectorizableTree.front()->Scalars, Element: I)))
19188 continue;
19189 SmallVector<SelectInst *> LogicalOpSelects;
19190 I->replaceUsesWithIf(New: PoisonValue::get(T: I->getType()), ShouldReplace: [&](Use &U) {
      // Do not replace the condition operand of a logical and/or that is
      // expressed in select form (select <cond>, X, Y).
19192 bool IsPoisoningLogicalOp = isa<SelectInst>(Val: U.getUser()) &&
19193 (match(V: U.getUser(), P: m_LogicalAnd()) ||
19194 match(V: U.getUser(), P: m_LogicalOr())) &&
19195 U.getOperandNo() == 0;
19196 if (IsPoisoningLogicalOp) {
19197 LogicalOpSelects.push_back(Elt: cast<SelectInst>(Val: U.getUser()));
19198 return false;
19199 }
19200 return UserIgnoreList->contains(V: U.getUser());
19201 });
19202 // Replace conditions of the poisoning logical ops with the non-poison
19203 // constant value.
19204 for (SelectInst *SI : LogicalOpSelects)
19205 SI->setCondition(Constant::getNullValue(Ty: SI->getCondition()->getType()));
19206 }
19207 }
  // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
  // cache correctness.
  // NOTE: removeInstructionsAndOperands only marks the instructions for
  // deletion - they are not actually deleted until later.
19212 removeInstructionsAndOperands(DeadVals: ArrayRef(RemovedInsts), VectorValuesAndScales);
19213
19214 Builder.ClearInsertionPoint();
19215 InstrElementSize.clear();
19216
19217 const TreeEntry &RootTE = *VectorizableTree.front();
19218 Value *Vec = RootTE.VectorizedValue;
19219 if (auto It = MinBWs.find(Val: &RootTE); ReductionBitWidth != 0 &&
19220 It != MinBWs.end() &&
19221 ReductionBitWidth != It->second.first) {
19222 IRBuilder<>::InsertPointGuard Guard(Builder);
19223 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
19224 IP: ReductionRoot->getIterator());
19225 Vec = Builder.CreateIntCast(
19226 V: Vec,
19227 DestTy: VectorType::get(ElementType: Builder.getIntNTy(N: ReductionBitWidth),
19228 EC: cast<VectorType>(Val: Vec->getType())->getElementCount()),
19229 isSigned: It->second.second);
19230 }
19231 return Vec;
19232}
19233
19234void BoUpSLP::optimizeGatherSequence() {
19235 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
19236 << " gather sequences instructions.\n");
19237 // LICM InsertElementInst sequences.
19238 for (Instruction *I : GatherShuffleExtractSeq) {
19239 if (isDeleted(I))
19240 continue;
19241
19242 // Check if this block is inside a loop.
19243 Loop *L = LI->getLoopFor(BB: I->getParent());
19244 if (!L)
19245 continue;
19246
19247 // Check if it has a preheader.
19248 BasicBlock *PreHeader = L->getLoopPreheader();
19249 if (!PreHeader)
19250 continue;
19251
19252 // If the vector or the element that we insert into it are
19253 // instructions that are defined in this basic block then we can't
19254 // hoist this instruction.
19255 if (any_of(Range: I->operands(), P: [L](Value *V) {
19256 auto *OpI = dyn_cast<Instruction>(Val: V);
19257 return OpI && L->contains(Inst: OpI);
19258 }))
19259 continue;
19260
19261 // We can hoist this instruction. Move it to the pre-header.
19262 I->moveBefore(InsertPos: PreHeader->getTerminator()->getIterator());
19263 CSEBlocks.insert(V: PreHeader);
19264 }
19265
19266 // Make a list of all reachable blocks in our CSE queue.
19267 SmallVector<const DomTreeNode *, 8> CSEWorkList;
19268 CSEWorkList.reserve(N: CSEBlocks.size());
19269 for (BasicBlock *BB : CSEBlocks)
19270 if (DomTreeNode *N = DT->getNode(BB)) {
19271 assert(DT->isReachableFromEntry(N));
19272 CSEWorkList.push_back(Elt: N);
19273 }
19274
19275 // Sort blocks by domination. This ensures we visit a block after all blocks
19276 // dominating it are visited.
19277 llvm::sort(C&: CSEWorkList, Comp: [](const DomTreeNode *A, const DomTreeNode *B) {
19278 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
19279 "Different nodes should have different DFS numbers");
19280 return A->getDFSNumIn() < B->getDFSNumIn();
19281 });
19282
  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles, one is less defined if it has the same vector
  // operands and each of its mask indices is either the same as in the other
  // shuffle or undef. E.g. shuffle %0, poison, <0, 0, 0, undef> is less
  // defined than shuffle %0, poison, <0, 0, 0, 0>.
19288 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
19289 Instruction *I2,
19290 SmallVectorImpl<int> &NewMask) {
19291 if (I1->getType() != I2->getType())
19292 return false;
19293 auto *SI1 = dyn_cast<ShuffleVectorInst>(Val: I1);
19294 auto *SI2 = dyn_cast<ShuffleVectorInst>(Val: I2);
19295 if (!SI1 || !SI2)
19296 return I1->isIdenticalTo(I: I2);
19297 if (SI1->isIdenticalTo(I: SI2))
19298 return true;
19299 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
19300 if (SI1->getOperand(i_nocapture: I) != SI2->getOperand(i_nocapture: I))
19301 return false;
19302 // Check if the second instruction is more defined than the first one.
19303 NewMask.assign(in_start: SI2->getShuffleMask().begin(), in_end: SI2->getShuffleMask().end());
19304 ArrayRef<int> SM1 = SI1->getShuffleMask();
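    // Illustrative example (masks invented for exposition): with
    // SM1 == {0, P, 2, P} and NewMask (SI2's mask) == {0, 1, P, P}, the loop
    // below accepts the pair and fills NewMask in to {0, 1, 2, P}; any
    // disagreement in a defined position would reject the pair instead.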
19305 // Count trailing undefs in the mask to check the final number of used
19306 // registers.
19307 unsigned LastUndefsCnt = 0;
19308 for (int I = 0, E = NewMask.size(); I < E; ++I) {
19309 if (SM1[I] == PoisonMaskElem)
19310 ++LastUndefsCnt;
19311 else
19312 LastUndefsCnt = 0;
19313 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
19314 NewMask[I] != SM1[I])
19315 return false;
19316 if (NewMask[I] == PoisonMaskElem)
19317 NewMask[I] = SM1[I];
19318 }
19319 // Check if the last undefs actually change the final number of used vector
19320 // registers.
19321 return SM1.size() - LastUndefsCnt > 1 &&
19322 ::getNumberOfParts(TTI: *TTI, VecTy: SI1->getType()) ==
19323 ::getNumberOfParts(
19324 TTI: *TTI, VecTy: getWidenedType(ScalarTy: SI1->getType()->getElementType(),
19325 VF: SM1.size() - LastUndefsCnt));
19326 };
19327 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
19328 // instructions. TODO: We can further optimize this scan if we split the
19329 // instructions into different buckets based on the insert lane.
19330 SmallVector<Instruction *, 16> Visited;
19331 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
19332 assert(*I &&
19333 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
19334 "Worklist not sorted properly!");
19335 BasicBlock *BB = (*I)->getBlock();
19336 // For all instructions in blocks containing gather sequences:
19337 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
19338 if (isDeleted(I: &In))
19339 continue;
19340 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(Val: &In) &&
19341 !GatherShuffleExtractSeq.contains(key: &In))
19342 continue;
19343
19344 // Check if we can replace this instruction with any of the
19345 // visited instructions.
19346 bool Replaced = false;
19347 for (Instruction *&V : Visited) {
19348 SmallVector<int> NewMask;
19349 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
19350 DT->dominates(A: V->getParent(), B: In.getParent())) {
19351 In.replaceAllUsesWith(V);
19352 eraseInstruction(I: &In);
19353 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: V))
19354 if (!NewMask.empty())
19355 SI->setShuffleMask(NewMask);
19356 Replaced = true;
19357 break;
19358 }
19359 if (isa<ShuffleVectorInst>(Val: In) && isa<ShuffleVectorInst>(Val: V) &&
19360 GatherShuffleExtractSeq.contains(key: V) &&
19361 IsIdenticalOrLessDefined(V, &In, NewMask) &&
19362 DT->dominates(A: In.getParent(), B: V->getParent())) {
19363 In.moveAfter(MovePos: V);
19364 V->replaceAllUsesWith(V: &In);
19365 eraseInstruction(I: V);
19366 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: &In))
19367 if (!NewMask.empty())
19368 SI->setShuffleMask(NewMask);
19369 V = &In;
19370 Replaced = true;
19371 break;
19372 }
19373 }
19374 if (!Replaced) {
19375 assert(!is_contained(Visited, &In));
19376 Visited.push_back(Elt: &In);
19377 }
19378 }
19379 }
19380 CSEBlocks.clear();
19381 GatherShuffleExtractSeq.clear();
19382}
19383
19384BoUpSLP::ScheduleBundle &
19385BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
19386 auto &BundlePtr =
19387 ScheduledBundlesList.emplace_back(Args: std::make_unique<ScheduleBundle>());
19388 for (Value *V : VL) {
19389 if (doesNotNeedToBeScheduled(V))
19390 continue;
19391 ScheduleData *BundleMember = getScheduleData(V);
19392 assert(BundleMember && "no ScheduleData for bundle member "
19393 "(maybe not in same basic block)");
19394 // Group the instructions to a bundle.
19395 BundlePtr->add(SD: BundleMember);
19396 ScheduledBundles.try_emplace(Key: cast<Instruction>(Val: V))
19397 .first->getSecond()
19398 .push_back(Elt: BundlePtr.get());
19399 }
19400 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
19401 return *BundlePtr;
19402}
19403
// Groups the instructions into a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
19406std::optional<BoUpSLP::ScheduleBundle *>
19407BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
19408 const InstructionsState &S) {
19409 // No need to schedule PHIs, insertelement, extractelement and extractvalue
19410 // instructions.
19411 if (isa<PHINode>(Val: S.getMainOp()) ||
19412 isVectorLikeInstWithConstOps(V: S.getMainOp()) || doesNotNeedToSchedule(VL))
19413 return nullptr;
19414
19415 // Initialize the instruction bundle.
19416 Instruction *OldScheduleEnd = ScheduleEnd;
19417 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
19418
19419 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
19420 // The scheduling region got new instructions at the lower end (or it is a
19421 // new region for the first bundle). This makes it necessary to
19422 // recalculate all dependencies.
19423 // It is seldom that this needs to be done a second time after adding the
19424 // initial bundle to the region.
19425 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
19426 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
19427 if (ScheduleData *SD = getScheduleData(I))
19428 SD->clearDependencies();
19429 }
19430 ReSchedule = true;
19431 }
19432 if (Bundle && !Bundle.getBundle().empty()) {
19433 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
19434 << BB->getName() << "\n");
19435 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP);
19436 }
19437
19438 if (ReSchedule) {
19439 resetSchedule();
19440 initialFillReadyList(ReadyList&: ReadyInsts);
19441 }
19442
    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it. Note that it is important
    // that we don't "schedule" the bundle yet.
19447 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
19448 !ReadyInsts.empty()) {
19449 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
19450 assert(Picked->isReady() && "must be ready to schedule");
19451 schedule(Data: Picked, ReadyList&: ReadyInsts);
19452 if (Picked == &Bundle)
19453 break;
19454 }
19455 };
19456
19457 // Make sure that the scheduling region contains all
19458 // instructions of the bundle.
19459 for (Value *V : VL) {
19460 if (doesNotNeedToBeScheduled(V))
19461 continue;
19462 if (!extendSchedulingRegion(V, S)) {
      // The scheduling region got new instructions at the lower end (or it
      // is a new region for the first bundle). This makes it necessary to
      // recalculate all dependencies.
      // Otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instructions in the wrong order during the
      // actual scheduling.
19469 ScheduleBundle Invalid = ScheduleBundle::invalid();
19470 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
19471 return std::nullopt;
19472 }
19473 }
19474
19475 bool ReSchedule = false;
19476 for (Value *V : VL) {
19477 if (doesNotNeedToBeScheduled(V))
19478 continue;
19479 ScheduleData *BundleMember = getScheduleData(V);
19480 assert(BundleMember &&
19481 "no ScheduleData for bundle member (maybe not in same basic block)");
19482
19483 // Make sure we don't leave the pieces of the bundle in the ready list when
19484 // whole bundle might not be ready.
19485 ReadyInsts.remove(X: BundleMember);
19486 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
19487 !Bundles.empty()) {
19488 for (ScheduleBundle *B : Bundles)
19489 ReadyInsts.remove(X: B);
19490 }
19491
19492 if (!BundleMember->isScheduled())
19493 continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
19497 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
19498 << " was already scheduled\n");
19499 ReSchedule = true;
19500 }
19501
19502 ScheduleBundle &Bundle = buildBundle(VL);
19503 TryScheduleBundleImpl(ReSchedule, Bundle);
19504 if (!Bundle.isReady()) {
19505 for (ScheduleData *BD : Bundle.getBundle()) {
19506 if (BD->isReady()) {
19507 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: BD->getInst());
19508 if (Bundles.empty()) {
19509 ReadyInsts.insert(X: BD);
19510 continue;
19511 }
19512 for (ScheduleBundle *B : Bundles)
19513 if (B->isReady())
19514 ReadyInsts.insert(X: B);
19515 }
19516 }
19517 ScheduledBundlesList.pop_back();
19518 for (Value *V : VL) {
19519 if (doesNotNeedToBeScheduled(V))
19520 continue;
19521 ScheduledBundles.find(Val: cast<Instruction>(Val: V))->getSecond().pop_back();
19522 }
19523 return std::nullopt;
19524 }
19525 return &Bundle;
19526}
19527
19528BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
19529 // Allocate a new ScheduleData for the instruction.
19530 if (ChunkPos >= ChunkSize) {
19531 ScheduleDataChunks.push_back(Elt: std::make_unique<ScheduleData[]>(num: ChunkSize));
19532 ChunkPos = 0;
19533 }
19534 return &(ScheduleDataChunks.back()[ChunkPos++]);
19535}
19536
19537bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
19538 Value *V, const InstructionsState &S) {
19539 Instruction *I = dyn_cast<Instruction>(Val: V);
19540 assert(I && "bundle member must be an instruction");
19541 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
19542 !doesNotNeedToBeScheduled(I) &&
19543 "phi nodes/insertelements/extractelements/extractvalues don't need to "
19544 "be scheduled");
19545 if (getScheduleData(I))
19546 return true;
19547 if (!ScheduleStart) {
19548 // It's the first instruction in the new region.
19549 initScheduleData(FromI: I, ToI: I->getNextNode(), PrevLoadStore: nullptr, NextLoadStore: nullptr);
19550 ScheduleStart = I;
19551 ScheduleEnd = I->getNextNode();
19552 assert(ScheduleEnd && "tried to vectorize a terminator?");
19553 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
19554 return true;
19555 }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that it is not
  // counted against the budget. Otherwise debug info could affect codegen.
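  // For example (illustrative): if the new instruction I lies above
  // ScheduleStart, either the upward scan reaches I or the downward scan hits
  // the block end first, and ScheduleStart is moved up to I; if I lies below
  // ScheduleEnd, the downward scan reaches it and ScheduleEnd is moved past I.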
19560 BasicBlock::reverse_iterator UpIter =
19561 ++ScheduleStart->getIterator().getReverse();
19562 BasicBlock::reverse_iterator UpperEnd = BB->rend();
19563 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
19564 BasicBlock::iterator LowerEnd = BB->end();
19565 auto IsAssumeLikeIntr = [](const Instruction &I) {
19566 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
19567 return II->isAssumeLikeIntrinsic();
19568 return false;
19569 };
19570 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
19571 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
19572 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
19573 &*DownIter != I) {
19574 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
19575 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
19576 return false;
19577 }
19578
19579 ++UpIter;
19580 ++DownIter;
19581
19582 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
19583 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
19584 }
19585 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
19586 assert(I->getParent() == ScheduleStart->getParent() &&
19587 "Instruction is in wrong basic block.");
19588 initScheduleData(FromI: I, ToI: ScheduleStart, PrevLoadStore: nullptr, NextLoadStore: FirstLoadStoreInRegion);
19589 ScheduleStart = I;
19590 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
19591 << "\n");
19592 return true;
19593 }
19594 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
19595 "Expected to reach top of the basic block or instruction down the "
19596 "lower end.");
19597 assert(I->getParent() == ScheduleEnd->getParent() &&
19598 "Instruction is in wrong basic block.");
19599 initScheduleData(FromI: ScheduleEnd, ToI: I->getNextNode(), PrevLoadStore: LastLoadStoreInRegion,
19600 NextLoadStore: nullptr);
19601 ScheduleEnd = I->getNextNode();
19602 assert(ScheduleEnd && "tried to vectorize a terminator?");
19603 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
19604 return true;
19605}
19606
19607void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
19608 Instruction *ToI,
19609 ScheduleData *PrevLoadStore,
19610 ScheduleData *NextLoadStore) {
19611 ScheduleData *CurrentLoadStore = PrevLoadStore;
19612 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
19613 // No need to allocate data for non-schedulable instructions.
19614 if (doesNotNeedToBeScheduled(V: I))
19615 continue;
19616 ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
19617 if (!SD) {
19618 SD = allocateScheduleDataChunks();
19619 ScheduleDataMap[I] = SD;
19620 }
19621 assert(!isInSchedulingRegion(SD) &&
19622 "new ScheduleData already in scheduling region");
19623 SD->init(BlockSchedulingRegionID: SchedulingRegionID, I);
19624
19625 if (I->mayReadOrWriteMemory() &&
19626 (!isa<IntrinsicInst>(Val: I) ||
19627 (cast<IntrinsicInst>(Val: I)->getIntrinsicID() != Intrinsic::sideeffect &&
19628 cast<IntrinsicInst>(Val: I)->getIntrinsicID() !=
19629 Intrinsic::pseudoprobe))) {
19630 // Update the linked list of memory accessing instructions.
19631 if (CurrentLoadStore) {
19632 CurrentLoadStore->setNextLoadStore(SD);
19633 } else {
19634 FirstLoadStoreInRegion = SD;
19635 }
19636 CurrentLoadStore = SD;
19637 }
19638
19639 if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
19640 match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
19641 RegionHasStackSave = true;
19642 }
19643 if (NextLoadStore) {
19644 if (CurrentLoadStore)
19645 CurrentLoadStore->setNextLoadStore(NextLoadStore);
19646 } else {
19647 LastLoadStoreInRegion = CurrentLoadStore;
19648 }
19649}
19650
19651void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
19652 bool InsertInReadyList,
19653 BoUpSLP *SLP) {
19654 SmallVector<ScheduleData *> WorkList;
19655 auto ProcessNode = [&](ScheduleData *BundleMember) {
19656 if (BundleMember->hasValidDependencies())
19657 return;
19658 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
19659 BundleMember->initDependencies();
19660 BundleMember->resetUnscheduledDeps();
19661 // Handle def-use chain dependencies.
19662 for (User *U : BundleMember->getInst()->users()) {
19663 if (ScheduleData *UseSD = getScheduleData(V: U)) {
19664 BundleMember->incDependencies();
19665 if (!UseSD->isScheduled())
19666 BundleMember->incrementUnscheduledDeps(Incr: 1);
19667 WorkList.push_back(Elt: UseSD);
19668 }
19669 }
19670
19671 auto MakeControlDependent = [&](Instruction *I) {
19672 auto *DepDest = getScheduleData(I);
19673 assert(DepDest && "must be in schedule window");
19674 DepDest->addControlDependency(Dep: BundleMember);
19675 BundleMember->incDependencies();
19676 if (!DepDest->isScheduled())
19677 BundleMember->incrementUnscheduledDeps(Incr: 1);
19678 WorkList.push_back(Elt: DepDest);
19679 };
19680
    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control dependent on any early exit or non-willreturn call
    // which precedes it.
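    // For instance (illustrative), if this bundle member is a call that may
    // not return, a later store or potentially faulting load must not be
    // hoisted above it, so a control dependency is recorded for each such
    // instruction below.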
19684 if (!isGuaranteedToTransferExecutionToSuccessor(I: BundleMember->getInst())) {
19685 for (Instruction *I = BundleMember->getInst()->getNextNode();
19686 I != ScheduleEnd; I = I->getNextNode()) {
19687 if (isSafeToSpeculativelyExecute(I, CtxI: &*BB->begin(), AC: SLP->AC))
19688 continue;
19689
19690 // Add the dependency
19691 MakeControlDependent(I);
19692
19693 if (!isGuaranteedToTransferExecutionToSuccessor(I))
19694 // Everything past here must be control dependent on I.
19695 break;
19696 }
19697 }
19698
19699 if (RegionHasStackSave) {
      // If we have an inalloca alloca instruction, it needs to be scheduled
      // after any preceding stacksave. We also need to prevent any alloca
      // from reordering above a preceding stackrestore.
19703 if (match(V: BundleMember->getInst(), P: m_Intrinsic<Intrinsic::stacksave>()) ||
19704 match(V: BundleMember->getInst(),
19705 P: m_Intrinsic<Intrinsic::stackrestore>())) {
19706 for (Instruction *I = BundleMember->getInst()->getNextNode();
19707 I != ScheduleEnd; I = I->getNextNode()) {
19708 if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
19709 match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
            // Any allocas past here must be control dependent on I, and I
            // must be memory dependent on BundleMember->Inst.
19712 break;
19713
19714 if (!isa<AllocaInst>(Val: I))
19715 continue;
19716
19717 // Add the dependency
19718 MakeControlDependent(I);
19719 }
19720 }
19721
      // In addition to the cases handled just above, we need to prevent
      // allocas and loads/stores from moving below a stacksave or a
      // stackrestore. Avoiding moving allocas below a stackrestore is
      // currently thought to be conservative; moving loads/stores below a
      // stackrestore can lead to incorrect code.
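      // For example (illustrative), sinking a store to an alloca below a
      // stackrestore could make it write into stack memory that has already
      // been deallocated, so a dependency on the first stacksave/stackrestore
      // found below is added instead.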
19727 if (isa<AllocaInst>(Val: BundleMember->getInst()) ||
19728 BundleMember->getInst()->mayReadOrWriteMemory()) {
19729 for (Instruction *I = BundleMember->getInst()->getNextNode();
19730 I != ScheduleEnd; I = I->getNextNode()) {
19731 if (!match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) &&
19732 !match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
19733 continue;
19734
19735 // Add the dependency
19736 MakeControlDependent(I);
19737 break;
19738 }
19739 }
19740 }
19741
19742 // Handle the memory dependencies (if any).
19743 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
19744 if (!NextLoadStore)
19745 return;
19746 Instruction *SrcInst = BundleMember->getInst();
19747 assert(SrcInst->mayReadOrWriteMemory() &&
19748 "NextLoadStore list for non memory effecting bundle?");
19749 MemoryLocation SrcLoc = getLocation(I: SrcInst);
19750 bool SrcMayWrite = SrcInst->mayWriteToMemory();
19751 unsigned NumAliased = 0;
19752 unsigned DistToSrc = 1;
19753 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(I: SrcInst);
19754
19755 for (ScheduleData *DepDest = NextLoadStore; DepDest;
19756 DepDest = DepDest->getNextLoadStore()) {
19757 assert(isInSchedulingRegion(DepDest) && "Expected to be in region");
19758
19759 // We have two limits to reduce the complexity:
19760 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
19761 // SLP->isAliased (which is the expensive part in this loop).
19762 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
19763 // the whole loop (even if the loop is fast, it's quadratic).
19764 // It's important for the loop break condition (see below) to
19765 // check this limit even between two read-only instructions.
19766 if (DistToSrc >= MaxMemDepDistance ||
19767 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
19768 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
19769 SLP->isAliased(Loc1: SrcLoc, Inst1: SrcInst, Inst2: DepDest->getInst())))) {
19770
19771 // We increment the counter only if the locations are aliased
19772 // (instead of counting all alias checks). This gives a better
19773 // balance between reduced runtime and accurate dependencies.
19774 NumAliased++;
19775
19776 DepDest->addMemoryDependency(Dep: BundleMember);
19777 BundleMember->incDependencies();
19778 if (!DepDest->isScheduled())
19779 BundleMember->incrementUnscheduledDeps(Incr: 1);
19780 WorkList.push_back(Elt: DepDest);
19781 }
19782
19783 // Example, explaining the loop break condition: Let's assume our
19784 // starting instruction is i0 and MaxMemDepDistance = 3.
19785 //
19786 // +--------v--v--v
19787 // i0,i1,i2,i3,i4,i5,i6,i7,i8
19788 // +--------^--^--^
19789 //
19790 // MaxMemDepDistance let us stop alias-checking at i3 and we add
19791 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
19792 // Previously we already added dependencies from i3 to i6,i7,i8
19793 // (because of MaxMemDepDistance). As we added a dependency from
19794 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
19795 // and we can abort this loop at i6.
19796 if (DistToSrc >= 2 * MaxMemDepDistance)
19797 break;
19798 DistToSrc++;
19799 }
19800 };
19801
19802 WorkList.push_back(Elt: Bundle.getBundle().front());
19803 SmallPtrSet<ScheduleBundle *, 16> Visited;
19804 while (!WorkList.empty()) {
19805 ScheduleData *SD = WorkList.pop_back_val();
19806 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: SD->getInst());
19807 if (Bundles.empty()) {
19808 ProcessNode(SD);
19809 if (InsertInReadyList && SD->isReady()) {
19810 ReadyInsts.insert(X: SD);
19811 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
19812 }
19813 continue;
19814 }
19815 for (ScheduleBundle *Bundle : Bundles) {
19816 if (!Visited.insert(Ptr: Bundle).second || Bundle->hasValidDependencies())
19817 continue;
19818 assert(isInSchedulingRegion(*Bundle) &&
19819 "ScheduleData not in scheduling region");
19820 for_each(Range: Bundle->getBundle(), F: ProcessNode);
19821 }
19822 if (InsertInReadyList && SD->isReady()) {
19823 for (ScheduleBundle *Bundle : Bundles) {
19824 assert(isInSchedulingRegion(*Bundle) &&
19825 "ScheduleData not in scheduling region");
19826 if (!Bundle->isReady())
19827 continue;
19828 ReadyInsts.insert(X: Bundle);
19829 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
19830 << "\n");
19831 }
19832 }
19833 }
19834}
19835
19836void BoUpSLP::BlockScheduling::resetSchedule() {
19837 assert(ScheduleStart &&
19838 "tried to reset schedule on block which has not been scheduled");
19839 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
19840 if (ScheduleData *SD = getScheduleData(I)) {
19841 assert(isInSchedulingRegion(SD) &&
19842 "ScheduleData not in scheduling region");
19843 SD->setScheduled(/*Scheduled=*/false);
19844 SD->resetUnscheduledDeps();
19845 }
19846 for (ScheduleBundle *Bundle : getScheduleBundles(V: I)) {
19847 assert(isInSchedulingRegion(*Bundle) &&
19848 "ScheduleBundle not in scheduling region");
19849 Bundle->setScheduled(/*Scheduled=*/false);
19850 }
19851 }
19852 ReadyInsts.clear();
19853}
19854
19855void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
19856 if (!BS->ScheduleStart)
19857 return;
19858
19859 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
19860
19861 // A key point - if we got here, pre-scheduling was able to find a valid
19862 // scheduling of the sub-graph of the scheduling window which consists
19863 // of all vector bundles and their transitive users. As such, we do not
19864 // need to reschedule anything *outside of* that subgraph.
19865
19866 BS->resetSchedule();
19867
19868 // For the real scheduling we use a more sophisticated ready-list: it is
19869 // sorted by the original instruction location. This lets the final schedule
19870 // be as close as possible to the original instruction order.
19871 // WARNING: If changing this order causes a correctness issue, that means
19872 // there is some missing dependence edge in the schedule data graph.
19873 struct ScheduleDataCompare {
19874 bool operator()(const ScheduleEntity *SD1,
19875 const ScheduleEntity *SD2) const {
19876 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
19877 }
19878 };
19879 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
19880
19881 // Ensure that all dependency data is updated (for nodes in the sub-graph)
19882 // and fill the ready-list with initial instructions.
19883 int Idx = 0;
19884 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
19885 I = I->getNextNode()) {
19886 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(V: I);
19887 if (!Bundles.empty()) {
19888 for (ScheduleBundle *Bundle : Bundles) {
19889 Bundle->setSchedulingPriority(Idx++);
19890 if (!Bundle->hasValidDependencies())
19891 BS->calculateDependencies(Bundle&: *Bundle, /*InsertInReadyList=*/false, SLP: this);
19892 }
19893 continue;
19894 }
19895 if (ScheduleData *SD = BS->getScheduleData(I)) {
19896 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(V: I);
19897 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
19898 SDTEs.front()->doesNotNeedToSchedule()) &&
19899 "scheduler and vectorizer bundle mismatch");
19900 SD->setSchedulingPriority(Idx++);
19901 continue;
19902 }
19903 }
19904 BS->initialFillReadyList(ReadyList&: ReadyInsts);
19905
19906 Instruction *LastScheduledInst = BS->ScheduleEnd;
19907
19908 // Do the "real" scheduling.
19909 SmallPtrSet<Instruction *, 16> Scheduled;
19910 while (!ReadyInsts.empty()) {
19911 auto *Picked = *ReadyInsts.begin();
19912 ReadyInsts.erase(position: ReadyInsts.begin());
19913
19914 // Move the scheduled instruction(s) to their dedicated places, if not
19915 // there yet.
19916 if (auto *Bundle = dyn_cast<ScheduleBundle>(Val: Picked)) {
19917 for (const ScheduleData *BundleMember : Bundle->getBundle()) {
19918 Instruction *PickedInst = BundleMember->getInst();
19919 if (!Scheduled.insert(Ptr: PickedInst).second)
19920 continue;
19921 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
19922 PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
19923 LastScheduledInst = PickedInst;
19924 }
19925 EntryToLastInstruction.try_emplace(Key: Bundle->getTreeEntry(),
19926 Args&: LastScheduledInst);
19927 } else {
19928 auto *SD = cast<ScheduleData>(Val: Picked);
19929 Instruction *PickedInst = SD->getInst();
19930 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
19931 PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
19932 LastScheduledInst = PickedInst;
19933 }
19934 BS->schedule(Data: Picked, ReadyList&: ReadyInsts);
19935 }
19936
19937 // Check that we didn't break any of our invariants.
19938#ifdef EXPENSIVE_CHECKS
19939 BS->verify();
19940#endif
19941
19942#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
19943 // Check that all schedulable entities got scheduled
19944 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
19945 I = I->getNextNode()) {
19946 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
19947 assert(all_of(Bundles,
19948 [](const ScheduleBundle *Bundle) {
19949 return Bundle->isScheduled();
19950 }) &&
19951 "must be scheduled at this point");
19952 }
19953#endif
19954
19955 // Avoid duplicate scheduling of the block.
19956 BS->ScheduleStart = nullptr;
19957}
19958
19959unsigned BoUpSLP::getVectorElementSize(Value *V) {
19960 // If V is a store, just return the width of the stored value (or value
19961 // truncated just before storing) without traversing the expression tree.
19962 // This is the common case.
19963 if (auto *Store = dyn_cast<StoreInst>(Val: V))
19964 return DL->getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
19965
19966 if (auto *IEI = dyn_cast<InsertElementInst>(Val: V))
19967 return getVectorElementSize(V: IEI->getOperand(i_nocapture: 1));
19968
19969 auto E = InstrElementSize.find(Val: V);
19970 if (E != InstrElementSize.end())
19971 return E->second;
19972
19973 // If V is not a store, we can traverse the expression tree to find loads
19974 // that feed it. The type of the loaded value may indicate a more suitable
19975 // width than V's type. We want to base the vector element size on the width
19976 // of memory operations where possible.
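  // Illustrative example (assuming all instructions are in the same basic
  // block): for V = add i32 (zext i8 %x to i32), %c where %x is the result of
  // a "load i8", the walk below reaches the load and the function returns 8
  // rather than the 32 bits suggested by V's own type.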
19977 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
19978 SmallPtrSet<Instruction *, 16> Visited;
19979 if (auto *I = dyn_cast<Instruction>(Val: V)) {
19980 Worklist.emplace_back(Args&: I, Args: I->getParent(), Args: 0);
19981 Visited.insert(Ptr: I);
19982 }
19983
19984 // Traverse the expression tree in bottom-up order looking for loads. If we
19985 // encounter an instruction we don't yet handle, we give up.
19986 auto Width = 0u;
19987 Value *FirstNonBool = nullptr;
19988 while (!Worklist.empty()) {
19989 auto [I, Parent, Level] = Worklist.pop_back_val();
19990
19991 // We should only be looking at scalar instructions here. If the current
19992 // instruction has a vector type, skip.
19993 auto *Ty = I->getType();
19994 if (isa<VectorType>(Val: Ty))
19995 continue;
19996 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
19997 FirstNonBool = I;
19998 if (Level > RecursionMaxDepth)
19999 continue;
20000
    // If the current instruction is a load (or an extract of a scalar from a
    // vector or aggregate), update Width to reflect the width of the loaded
    // or extracted value.
20003 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: I))
20004 Width = std::max<unsigned>(a: Width, b: DL->getTypeSizeInBits(Ty));
20005
    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and it is in the same basic block as
    // the user, or the user is a PHI node, we add it to the worklist.
20010 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
20011 BinaryOperator, UnaryOperator>(Val: I)) {
20012 for (Use &U : I->operands()) {
20013 if (auto *J = dyn_cast<Instruction>(Val: U.get()))
20014 if (Visited.insert(Ptr: J).second &&
20015 (isa<PHINode>(Val: I) || J->getParent() == Parent)) {
20016 Worklist.emplace_back(Args&: J, Args: J->getParent(), Args: Level + 1);
20017 continue;
20018 }
20019 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
20020 FirstNonBool = U.get();
20021 }
20022 } else {
20023 break;
20024 }
20025 }
20026
20027 // If we didn't encounter a memory access in the expression tree, or if we
20028 // gave up for some reason, just return the width of V. Otherwise, return the
20029 // maximum width we found.
20030 if (!Width) {
20031 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
20032 V = FirstNonBool;
20033 Width = DL->getTypeSizeInBits(Ty: V->getType());
20034 }
20035
20036 for (Instruction *I : Visited)
20037 InstrElementSize[I] = Width;
20038
20039 return Width;
20040}
20041
20042bool BoUpSLP::collectValuesToDemote(
20043 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
20044 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
20045 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
20046 bool &IsProfitableToDemote, bool IsTruncRoot) const {
20047 // We can always demote constants.
20048 if (all_of(Range: E.Scalars, P: IsaPred<Constant>))
20049 return true;
20050
20051 unsigned OrigBitWidth =
20052 DL->getTypeSizeInBits(Ty: E.Scalars.front()->getType()->getScalarType());
20053 if (OrigBitWidth == BitWidth) {
20054 MaxDepthLevel = 1;
20055 return true;
20056 }
20057
20058 // Check if the node was analyzed already and must keep its original bitwidth.
20059 if (NodesToKeepBWs.contains(V: E.Idx))
20060 return false;
20061
  // If the value is not a vectorized instruction in the expression, is not
  // used by an insertelement instruction, and is not used in multiple vector
  // nodes, it cannot be demoted.
20065 bool IsSignedNode = any_of(Range: E.Scalars, P: [&](Value *R) {
20066 if (isa<PoisonValue>(Val: R))
20067 return false;
20068 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
20069 });
20070 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
20071 if (isa<PoisonValue>(Val: V))
20072 return true;
20073 if (getTreeEntries(V).size() > 1)
20074 return false;
    // For the last shuffle of sext/zext with many uses we need to check the
    // extra bit for unsigned values, otherwise we may end up with incorrect
    // casts for the reused scalars.
20078 bool IsSignedVal = !isKnownNonNegative(V, SQ: SimplifyQuery(*DL));
20079 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
20080 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
20081 if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL)))
20082 return true;
20083 }
20084 unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT);
20085 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
20086 if (IsSignedNode)
20087 ++BitWidth1;
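    // Illustrative numbers: with OrigBitWidth == 32 and NumSignBits == 25,
    // BitWidth1 starts at 32 - 25 == 7 and is bumped to 8 when the node is
    // treated as signed, so that the sign bit survives the truncation.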
20088 if (auto *I = dyn_cast<Instruction>(Val: V)) {
20089 APInt Mask = DB->getDemandedBits(I);
20090 unsigned BitWidth2 =
20091 std::max<unsigned>(a: 1, b: Mask.getBitWidth() - Mask.countl_zero());
20092 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
20093 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth2 - 1);
20094 if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL)))
20095 break;
20096 BitWidth2 *= 2;
20097 }
20098 BitWidth1 = std::min(a: BitWidth1, b: BitWidth2);
20099 }
20100 BitWidth = std::max(a: BitWidth, b: BitWidth1);
20101 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
20102 };
20103 auto FinalAnalysis = [&, TTI = TTI]() {
20104 if (!IsProfitableToDemote)
20105 return false;
20106 bool Res = all_of(
20107 Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1, args: std::ref(t&: BitWidth)));
20108 // Demote gathers.
20109 if (Res && E.isGather()) {
20110 if (E.hasState()) {
20111 if (const TreeEntry *SameTE =
20112 getSameValuesTreeEntry(V: E.getMainOp(), VL: E.Scalars);
20113 SameTE)
20114 if (collectValuesToDemote(E: *SameTE, IsProfitableToDemoteRoot, BitWidth,
20115 ToDemote, Visited, NodesToKeepBWs,
20116 MaxDepthLevel, IsProfitableToDemote,
20117 IsTruncRoot)) {
20118 ToDemote.push_back(Elt: E.Idx);
20119 return true;
20120 }
20121 }
      // Check the bases of possible extractelement instructions and the final
      // vector length.
20124 SmallPtrSet<Value *, 4> UniqueBases;
20125 for (Value *V : E.Scalars) {
20126 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
20127 if (!EE)
20128 continue;
20129 UniqueBases.insert(Ptr: EE->getVectorOperand());
20130 }
20131 const unsigned VF = E.Scalars.size();
20132 Type *OrigScalarTy = E.Scalars.front()->getType();
20133 if (UniqueBases.size() <= 2 ||
20134 ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF)) >=
20135 ::getNumberOfParts(
20136 TTI: *TTI,
20137 VecTy: getWidenedType(
20138 ScalarTy: IntegerType::get(C&: OrigScalarTy->getContext(), NumBits: BitWidth),
20139 VF))) {
20140 ToDemote.push_back(Elt: E.Idx);
20141 return true;
20142 }
20143 }
20144 return Res;
20145 };
20146 if (E.isGather() || !Visited.insert(V: &E).second ||
20147 any_of(Range: E.Scalars, P: [&](Value *V) {
20148 return !isa<PoisonValue>(Val: V) && all_of(Range: V->users(), P: [&](User *U) {
20149 return isa<InsertElementInst>(Val: U) && !isVectorized(V: U);
20150 });
20151 }))
20152 return FinalAnalysis();
20153
20154 if (any_of(Range: E.Scalars, P: [&](Value *V) {
20155 return !isa<Constant>(Val: V) && !all_of(Range: V->users(), P: [=](User *U) {
20156 return isVectorized(V: U) ||
20157 (E.Idx == 0 && UserIgnoreList &&
20158 UserIgnoreList->contains(V: U)) ||
20159 (!isa<CmpInst>(Val: U) && U->getType()->isSized() &&
20160 !U->getType()->isScalableTy() &&
20161 DL->getTypeSizeInBits(Ty: U->getType()) <= BitWidth);
20162 }) && !IsPotentiallyTruncated(V, BitWidth);
20163 }))
20164 return false;
20165
20166 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
20167 bool &NeedToExit) {
20168 NeedToExit = false;
20169 unsigned InitLevel = MaxDepthLevel;
20170 for (const TreeEntry *Op : Operands) {
20171 unsigned Level = InitLevel;
20172 if (!collectValuesToDemote(E: *Op, IsProfitableToDemoteRoot, BitWidth,
20173 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel&: Level,
20174 IsProfitableToDemote, IsTruncRoot)) {
20175 if (!IsProfitableToDemote)
20176 return false;
20177 NeedToExit = true;
20178 if (!FinalAnalysis())
20179 return false;
20180 continue;
20181 }
20182 MaxDepthLevel = std::max(a: MaxDepthLevel, b: Level);
20183 }
20184 return true;
20185 };
20186 auto AttemptCheckBitwidth =
20187 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
20189 NeedToExit = false;
20190 unsigned BestFailBitwidth = 0;
20191 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
20192 if (Checker(BitWidth, OrigBitWidth))
20193 return true;
20194 if (BestFailBitwidth == 0 && FinalAnalysis())
20195 BestFailBitwidth = BitWidth;
20196 }
20197 if (BitWidth >= OrigBitWidth) {
20198 if (BestFailBitwidth == 0) {
20199 BitWidth = OrigBitWidth;
20200 return false;
20201 }
20202 MaxDepthLevel = 1;
20203 BitWidth = BestFailBitwidth;
20204 NeedToExit = true;
20205 return true;
20206 }
20207 return false;
20208 };
20209 auto TryProcessInstruction =
20210 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
20211 function_ref<bool(unsigned, unsigned)> Checker = {}) {
20212 if (Operands.empty()) {
20213 if (!IsTruncRoot)
20214 MaxDepthLevel = 1;
20215 for (Value *V : E.Scalars)
20216 (void)IsPotentiallyTruncated(V, BitWidth);
20217 } else {
          // Several vectorized uses? Check if we can truncate it; otherwise
          // exit.
20220 if (any_of(Range: E.Scalars, P: [&](Value *V) {
20221 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
20222 }))
20223 return false;
20224 bool NeedToExit = false;
20225 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
20226 return false;
20227 if (NeedToExit)
20228 return true;
20229 if (!ProcessOperands(Operands, NeedToExit))
20230 return false;
20231 if (NeedToExit)
20232 return true;
20233 }
20234
20235 ++MaxDepthLevel;
20236 // Record the entry that we can demote.
20237 ToDemote.push_back(Elt: E.Idx);
20238 return IsProfitableToDemote;
20239 };
20240
20241 if (E.State == TreeEntry::SplitVectorize)
20242 return TryProcessInstruction(
20243 BitWidth,
20244 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
20245 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
20246
20247 switch (E.getOpcode()) {
20248
20249 // We can always demote truncations and extensions. Since truncations can
20250 // seed additional demotion, we save the truncated value.
20251 case Instruction::Trunc:
20252 if (IsProfitableToDemoteRoot)
20253 IsProfitableToDemote = true;
20254 return TryProcessInstruction(BitWidth);
20255 case Instruction::ZExt:
20256 case Instruction::SExt:
20257 IsProfitableToDemote = true;
20258 return TryProcessInstruction(BitWidth);
20259
20260 // We can demote certain binary operations if we can demote both of their
20261 // operands.
20262 case Instruction::Add:
20263 case Instruction::Sub:
20264 case Instruction::Mul:
20265 case Instruction::And:
20266 case Instruction::Or:
20267 case Instruction::Xor: {
20268 return TryProcessInstruction(
20269 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)});
20270 }
20271 case Instruction::Freeze:
20272 return TryProcessInstruction(BitWidth, getOperandEntry(E: &E, Idx: 0));
20273 case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // in-range amount, we can always perform a SHL in a smaller type.
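    // Illustrative example: if only the low 16 bits of (shl i32 %x, 5) are
    // demanded, the shift can be performed as a 16-bit shl as long as the
    // shift amount is known to be smaller than 16, which the checker below
    // verifies via the known bits of the shift amount.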
20276 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
20277 return all_of(Range: E.Scalars, P: [&](Value *V) {
20278 if (isa<PoisonValue>(Val: V))
20279 return true;
20280 auto *I = cast<Instruction>(Val: V);
20281 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
20282 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth);
20283 });
20284 };
20285 return TryProcessInstruction(
20286 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, ShlChecker);
20287 }
20288 case Instruction::LShr: {
20289 // If this is a truncate of a logical shr, we can truncate it to a smaller
20290 // lshr iff we know that the bits we would otherwise be shifting in are
20291 // already zeros.
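    // Illustrative example: (lshr i32 %x, 8) can be narrowed to a 16-bit lshr
    // if the shift amount is known to be smaller than 16 and bits 16..31 of
    // %x are known to be zero, which is exactly what the checker below tests.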
20292 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
20293 return all_of(Range: E.Scalars, P: [&](Value *V) {
20294 if (isa<PoisonValue>(Val: V))
20295 return true;
20296 auto *I = cast<Instruction>(Val: V);
20297 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
20298 APInt ShiftedBits = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
20299 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
20300 MaskedValueIsZero(V: I->getOperand(i: 0), Mask: ShiftedBits,
20301 SQ: SimplifyQuery(*DL));
20302 });
20303 };
20304 return TryProcessInstruction(
20305 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
20306 LShrChecker);
20307 }
20308 case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits from the sign bit of the
    // original type down to the sign bit of the truncated type are identical
    // (i.e., they are all sign bits).
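    // Illustrative example: an i32 ashr can be narrowed to 16 bits if the
    // shifted value has at least 17 sign bits (so the 16 dropped bits are all
    // copies of the sign bit) and the shift amount is known to be smaller
    // than 16.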
20312 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
20313 return all_of(Range: E.Scalars, P: [&](Value *V) {
20314 if (isa<PoisonValue>(Val: V))
20315 return true;
20316 auto *I = cast<Instruction>(Val: V);
20317 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
20318 unsigned ShiftedBits = OrigBitWidth - BitWidth;
20319 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
20320 ShiftedBits <
20321 ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
20322 });
20323 };
20324 return TryProcessInstruction(
20325 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
20326 AShrChecker);
20327 }
20328 case Instruction::UDiv:
20329 case Instruction::URem: {
20330 // UDiv and URem can be truncated if all the truncated bits are zero.
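    // Illustrative example: a 32-bit udiv/urem whose operands both have their
    // upper 16 bits known to be zero computes the same low 16 bits when
    // performed as a 16-bit udiv/urem.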
20331 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
20332 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
20333 return all_of(Range: E.Scalars, P: [&](Value *V) {
20334 auto *I = cast<Instruction>(Val: V);
20335 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
20336 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL)) &&
20337 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL));
20338 });
20339 };
20340 return TryProcessInstruction(
20341 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, Checker);
20342 }
20343
20344 // We can demote selects if we can demote their true and false values.
20345 case Instruction::Select: {
20346 return TryProcessInstruction(
20347 BitWidth, {getOperandEntry(E: &E, Idx: 1), getOperandEntry(E: &E, Idx: 2)});
20348 }
20349
20350 // We can demote phis if we can demote all their incoming operands.
20351 case Instruction::PHI: {
20352 const unsigned NumOps = E.getNumOperands();
20353 SmallVector<const TreeEntry *> Ops(NumOps);
20354 transform(Range: seq<unsigned>(Begin: 0, End: NumOps), d_first: Ops.begin(),
20355 F: [&](unsigned Idx) { return getOperandEntry(E: &E, Idx); });
20356
20357 return TryProcessInstruction(BitWidth, Ops);
20358 }
20359
20360 case Instruction::Call: {
20361 auto *IC = dyn_cast<IntrinsicInst>(Val: E.getMainOp());
20362 if (!IC)
20363 break;
20364 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: IC, TLI);
20365 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
20366 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
20367 break;
20368 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(E: &E, Idx: 0));
20369 function_ref<bool(unsigned, unsigned)> CallChecker;
20370 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
20371 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
20372 return all_of(Range: E.Scalars, P: [&](Value *V) {
20373 auto *I = cast<Instruction>(Val: V);
20374 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
20375 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
20376 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
20377 SQ: SimplifyQuery(*DL)) &&
20378 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL));
20379 }
20380 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
20381 "Expected min/max intrinsics only.");
20382 unsigned SignBits = OrigBitWidth - BitWidth;
20383 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
20384 unsigned Op0SignBits =
20385 ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
20386 unsigned Op1SignBits =
20387 ComputeNumSignBits(Op: I->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT);
20388 return SignBits <= Op0SignBits &&
20389 ((SignBits != Op0SignBits &&
20390 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
20391 MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
20392 SQ: SimplifyQuery(*DL))) &&
20393 SignBits <= Op1SignBits &&
20394 ((SignBits != Op1SignBits &&
20395 !isKnownNonNegative(V: I->getOperand(i: 1), SQ: SimplifyQuery(*DL))) ||
20396 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL)));
20397 });
20398 };
20399 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
20400 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
20401 return all_of(Range: E.Scalars, P: [&](Value *V) {
20402 auto *I = cast<Instruction>(Val: V);
20403 unsigned SignBits = OrigBitWidth - BitWidth;
20404 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
20405 unsigned Op0SignBits =
20406 ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
20407 return SignBits <= Op0SignBits &&
20408 ((SignBits != Op0SignBits &&
20409 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
20410 MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL)));
20411 });
20412 };
20413 if (ID != Intrinsic::abs) {
20414 Operands.push_back(Elt: getOperandEntry(E: &E, Idx: 1));
20415 CallChecker = CompChecker;
20416 } else {
20417 CallChecker = AbsChecker;
20418 }
20419 InstructionCost BestCost =
20420 std::numeric_limits<InstructionCost::CostType>::max();
20421 unsigned BestBitWidth = BitWidth;
20422 unsigned VF = E.Scalars.size();
20423 // Choose the best bitwidth based on cost estimations.
20424 auto Checker = [&](unsigned BitWidth, unsigned) {
20425 unsigned MinBW = PowerOf2Ceil(A: BitWidth);
20426 SmallVector<Type *> ArgTys =
20427 buildIntrinsicArgTypes(CI: IC, ID, VF, MinBW, TTI);
20428 auto VecCallCosts = getVectorCallCosts(
20429 CI: IC, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: IC->getContext(), NumBits: MinBW), VF),
20430 TTI, TLI, ArgTys);
20431 InstructionCost Cost = std::min(a: VecCallCosts.first, b: VecCallCosts.second);
20432 if (Cost < BestCost) {
20433 BestCost = Cost;
20434 BestBitWidth = BitWidth;
20435 }
20436 return false;
20437 };
20438 [[maybe_unused]] bool NeedToExit;
20439 (void)AttemptCheckBitwidth(Checker, NeedToExit);
20440 BitWidth = BestBitWidth;
20441 return TryProcessInstruction(BitWidth, Operands, CallChecker);
20442 }
20443
20444 // Otherwise, conservatively give up.
20445 default:
20446 break;
20447 }
20448 MaxDepthLevel = 1;
20449 return FinalAnalysis();
20450}
20451
20452static RecurKind getRdxKind(Value *V);
20453
20454void BoUpSLP::computeMinimumValueSizes() {
20455 // We only attempt to truncate integer expressions.
20456 bool IsStoreOrInsertElt =
20457 VectorizableTree.front()->hasState() &&
20458 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
20459 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
20460 if ((IsStoreOrInsertElt || UserIgnoreList) &&
20461 ExtraBitWidthNodes.size() <= 1 &&
20462 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
20463 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
20464 return;
20465
20466 unsigned NodeIdx = 0;
20467 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
20468 NodeIdx = 1;
20469
20470 // Ensure the roots of the vectorizable tree don't form a cycle.
20471 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
20472 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
20473 "Unexpected tree is graph.");
20474
  // If the first value node for a store/insertelement is a sext/zext/trunc,
  // skip it and resize to the final type.
20477 bool IsTruncRoot = false;
20478 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
20479 SmallVector<unsigned> RootDemotes;
20480 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
20481 if (NodeIdx != 0 &&
20482 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
20483 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
20484 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
20485 IsTruncRoot = true;
20486 RootDemotes.push_back(Elt: NodeIdx);
20487 IsProfitableToDemoteRoot = true;
20488 ++NodeIdx;
20489 }
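  // E.g. (sketch): for a graph seeded by
  //   store i16 %t, ptr %p   with   %t = trunc i32 %v to i16
  // the trunc node is queued in RootDemotes and the analysis continues from
  // its operand, so the whole expression may be narrowed to i16.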
20490
  // The reduction was analyzed already and found not profitable - exit.
20492 if (AnalyzedMinBWVals.contains(V: VectorizableTree[NodeIdx]->Scalars.front()))
20493 return;
20494
20495 SmallVector<unsigned> ToDemote;
20496 auto ComputeMaxBitWidth =
20497 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
20498 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
20499 ToDemote.clear();
    // If the root is a trunc and the next node is a gather/buildvector, keep
    // the trunc in scalars, which is free in most cases.
20502 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
20503 !NodesToKeepBWs.contains(V: E.Idx) &&
20504 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
20505 all_of(Range: E.Scalars, P: [&](Value *V) {
20506 return V->hasOneUse() || isa<Constant>(Val: V) ||
20507 (!V->hasNUsesOrMore(N: UsesLimit) &&
20508 none_of(Range: V->users(), P: [&](User *U) {
20509 ArrayRef<TreeEntry *> TEs = getTreeEntries(V: U);
20510 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
20511 if (TEs.empty() || is_contained(Range&: TEs, Element: UserTE))
20512 return false;
20513 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
20514 SelectInst>(Val: U) ||
20515 isa<SIToFPInst, UIToFPInst>(Val: U) ||
20516 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
20517 SelectInst>(Val: UserTE->getMainOp()) ||
20518 isa<SIToFPInst, UIToFPInst>(Val: UserTE->getMainOp()))
20519 return true;
20520 unsigned UserTESz = DL->getTypeSizeInBits(
20521 Ty: UserTE->Scalars.front()->getType());
20522 if (all_of(Range&: TEs, P: [&](const TreeEntry *TE) {
20523 auto It = MinBWs.find(Val: TE);
20524 return It != MinBWs.end() &&
20525 It->second.first > UserTESz;
20526 }))
20527 return true;
20528 return DL->getTypeSizeInBits(Ty: U->getType()) > UserTESz;
20529 }));
20530 })) {
20531 ToDemote.push_back(Elt: E.Idx);
20532 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
20533 auto It = MinBWs.find(Val: UserTE);
20534 if (It != MinBWs.end())
20535 return It->second.first;
20536 unsigned MaxBitWidth =
20537 DL->getTypeSizeInBits(Ty: UserTE->Scalars.front()->getType());
20538 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
20539 if (MaxBitWidth < 8 && MaxBitWidth > 1)
20540 MaxBitWidth = 8;
20541 return MaxBitWidth;
20542 }
20543
20544 if (!E.hasState())
20545 return 0u;
20546
20547 unsigned VF = E.getVectorFactor();
20548 Type *ScalarTy = E.Scalars.front()->getType();
20549 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
20550 auto *TreeRootIT = dyn_cast<IntegerType>(Val: ScalarTy->getScalarType());
20551 if (!TreeRootIT)
20552 return 0u;
20553
20554 if (any_of(Range: E.Scalars,
20555 P: [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
20556 return 0u;
20557
20558 unsigned NumParts = ::getNumberOfParts(
20559 TTI: *TTI, VecTy: getWidenedType(ScalarTy: TreeRootIT, VF: VF * ScalarTyNumElements));
20560
20561 // The maximum bit width required to represent all the values that can be
20562 // demoted without loss of precision. It would be safe to truncate the roots
20563 // of the expression to this width.
20564 unsigned MaxBitWidth = 1u;
20565
20566 // True if the roots can be zero-extended back to their original type,
20567 // rather than sign-extended. We know that if the leading bits are not
20568 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
20569 // True.
20570 // Determine if the sign bit of all the roots is known to be zero. If not,
20571 // IsKnownPositive is set to False.
20572 bool IsKnownPositive = !IsSignedCmp && all_of(Range: E.Scalars, P: [&](Value *R) {
20573 if (isa<PoisonValue>(Val: R))
20574 return true;
20575 KnownBits Known = computeKnownBits(V: R, DL: *DL);
20576 return Known.isNonNegative();
20577 });
20578
20579 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
20580 E.UserTreeIndex.UserTE->hasState() &&
20581 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
20582 MaxBitWidth =
20583 std::min(a: DL->getTypeSizeInBits(
20584 Ty: E.UserTreeIndex.UserTE->Scalars.front()->getType()),
20585 b: DL->getTypeSizeInBits(Ty: ScalarTy));
20586
20587 // We first check if all the bits of the roots are demanded. If they're not,
20588 // we can truncate the roots to this narrower type.
20589 for (Value *Root : E.Scalars) {
20590 if (isa<PoisonValue>(Val: Root))
20591 continue;
20592 unsigned NumSignBits = ComputeNumSignBits(Op: Root, DL: *DL, AC, CxtI: nullptr, DT);
20593 TypeSize NumTypeBits =
20594 DL->getTypeSizeInBits(Ty: Root->getType()->getScalarType());
20595 unsigned BitWidth1 = NumTypeBits - NumSignBits;
20596 // If we can't prove that the sign bit is zero, we must add one to the
20597 // maximum bit width to account for the unknown sign bit. This preserves
20598 // the existing sign bit so we can safely sign-extend the root back to the
20599 // original type. Otherwise, if we know the sign bit is zero, we will
20600 // zero-extend the root instead.
20601 //
20602 // FIXME: This is somewhat suboptimal, as there will be cases where adding
20603 // one to the maximum bit width will yield a larger-than-necessary
20604 // type. In general, we need to add an extra bit only if we can't
20605 // prove that the upper bit of the original type is equal to the
20606 // upper bit of the proposed smaller type. If these two bits are
20607 // the same (either zero or one) we know that sign-extending from
20608 // the smaller type will result in the same value. Here, since we
20609 // can't yet prove this, we are just making the proposed smaller
20610 // type larger to ensure correctness.
20611 if (!IsKnownPositive)
20612 ++BitWidth1;
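      // For instance (illustrative numbers): an i32 root with 24 known sign
      // bits gives BitWidth1 = 32 - 24 = 8; if we cannot prove it is
      // non-negative, we use 9 bits so that sign-extending from the narrow
      // type reproduces the original value.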
20613
20614 APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: Root));
20615 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
20616 MaxBitWidth =
20617 std::max<unsigned>(a: std::min(a: BitWidth1, b: BitWidth2), b: MaxBitWidth);
20618 }
20619
20620 if (MaxBitWidth < 8 && MaxBitWidth > 1)
20621 MaxBitWidth = 8;
20622
    // If the original type is large but the reduced type does not decrease the
    // number of vector register parts needed, ignore it.
20625 if (NumParts > 1 &&
20626 NumParts ==
20627 ::getNumberOfParts(
20628 TTI: *TTI, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: F->getContext(),
20629 NumBits: bit_ceil(Value: MaxBitWidth)),
20630 VF)))
20631 return 0u;
20632
20633 unsigned Opcode = E.getOpcode();
20634 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
20635 Opcode == Instruction::SExt ||
20636 Opcode == Instruction::ZExt || NumParts > 1;
20637 // Conservatively determine if we can actually truncate the roots of the
20638 // expression. Collect the values that can be demoted in ToDemote and
20639 // additional roots that require investigating in Roots.
20640 DenseSet<const TreeEntry *> Visited;
20641 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
20642 bool NeedToDemote = IsProfitableToDemote;
20643
20644 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, BitWidth&: MaxBitWidth,
20645 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
20646 IsProfitableToDemote&: NeedToDemote, IsTruncRoot) ||
20647 (MaxDepthLevel <= Limit &&
20648 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
20649 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
20650 DL->getTypeSizeInBits(Ty: TreeRootIT) /
20651 DL->getTypeSizeInBits(
20652 Ty: E.getMainOp()->getOperand(i: 0)->getType()) >
20653 2)))))
20654 return 0u;
20655 // Round MaxBitWidth up to the next power-of-two.
20656 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
20657
20658 return MaxBitWidth;
20659 };
20660
20661 // If we can truncate the root, we must collect additional values that might
20662 // be demoted as a result. That is, those seeded by truncations we will
20663 // modify.
20664 // Add reduction ops sizes, if any.
20665 if (UserIgnoreList &&
20666 isa<IntegerType>(Val: VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
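    // E.g. (sketch): vector_reduce_add(zext <8 x i1> %m to <8 x i32>) counts
    // the set bits of %m and can therefore be computed as
    // zext(ctpop(bitcast <8 x i1> %m to i8)).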
20669 if (all_of(Range: *UserIgnoreList,
20670 P: [](Value *V) {
20671 return isa<PoisonValue>(Val: V) ||
20672 cast<Instruction>(Val: V)->getOpcode() == Instruction::Add;
20673 }) &&
20674 VectorizableTree.front()->State == TreeEntry::Vectorize &&
20675 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
20676 cast<CastInst>(Val: VectorizableTree.front()->getMainOp())->getSrcTy() ==
20677 Builder.getInt1Ty()) {
20678 ReductionBitWidth = 1;
20679 } else {
20680 for (Value *V : *UserIgnoreList) {
20681 if (isa<PoisonValue>(Val: V))
20682 continue;
20683 unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT);
20684 TypeSize NumTypeBits = DL->getTypeSizeInBits(Ty: V->getType());
20685 unsigned BitWidth1 = NumTypeBits - NumSignBits;
20686 if (!isKnownNonNegative(V, SQ: SimplifyQuery(*DL)))
20687 ++BitWidth1;
20688 unsigned BitWidth2 = BitWidth1;
20689 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: ::getRdxKind(V))) {
20690 APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: V));
20691 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
20692 }
20693 ReductionBitWidth =
20694 std::max(a: std::min(a: BitWidth1, b: BitWidth2), b: ReductionBitWidth);
20695 }
20696 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
20697 ReductionBitWidth = 8;
20698
20699 ReductionBitWidth = bit_ceil(Value: ReductionBitWidth);
20700 }
20701 }
20702 bool IsTopRoot = NodeIdx == 0;
20703 while (NodeIdx < VectorizableTree.size() &&
20704 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
20705 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
20706 RootDemotes.push_back(Elt: NodeIdx);
20707 ++NodeIdx;
20708 IsTruncRoot = true;
20709 }
20710 bool IsSignedCmp = false;
20711 if (UserIgnoreList && all_of(Range: *UserIgnoreList, P: [](Value *V) {
20712 return match(V, P: m_SMin(L: m_Value(), R: m_Value())) ||
20713 match(V, P: m_SMax(L: m_Value(), R: m_Value()));
20714 }))
20715 IsSignedCmp = true;
20716 while (NodeIdx < VectorizableTree.size()) {
20717 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
20718 unsigned Limit = 2;
20719 if (IsTopRoot &&
20720 ReductionBitWidth ==
20721 DL->getTypeSizeInBits(
20722 Ty: VectorizableTree.front()->Scalars.front()->getType()))
20723 Limit = 3;
20724 unsigned MaxBitWidth = ComputeMaxBitWidth(
20725 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
20726 IsTruncRoot, IsSignedCmp);
20727 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
20728 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
20729 ReductionBitWidth = bit_ceil(Value: MaxBitWidth);
20730 else if (MaxBitWidth == 0)
20731 ReductionBitWidth = 0;
20732 }
20733
20734 for (unsigned Idx : RootDemotes) {
20735 if (all_of(Range&: VectorizableTree[Idx]->Scalars, P: [&](Value *V) {
20736 uint32_t OrigBitWidth =
20737 DL->getTypeSizeInBits(Ty: V->getType()->getScalarType());
20738 if (OrigBitWidth > MaxBitWidth) {
20739 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: MaxBitWidth);
20740 return MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL));
20741 }
20742 return false;
20743 }))
20744 ToDemote.push_back(Elt: Idx);
20745 }
20746 RootDemotes.clear();
20747 IsTopRoot = false;
20748 IsProfitableToDemoteRoot = true;
20749
20750 if (ExtraBitWidthNodes.empty()) {
20751 NodeIdx = VectorizableTree.size();
20752 } else {
20753 unsigned NewIdx = 0;
20754 do {
20755 NewIdx = *ExtraBitWidthNodes.begin();
20756 ExtraBitWidthNodes.erase(I: ExtraBitWidthNodes.begin());
20757 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
20758 NodeIdx = NewIdx;
20759 IsTruncRoot =
20760 NodeIdx < VectorizableTree.size() &&
20761 VectorizableTree[NodeIdx]->UserTreeIndex &&
20762 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
20763 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
20764 Instruction::Trunc &&
20765 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
20766 IsSignedCmp =
20767 NodeIdx < VectorizableTree.size() &&
20768 VectorizableTree[NodeIdx]->UserTreeIndex &&
20769 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
20770 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
20771 Instruction::ICmp &&
20772 any_of(
20773 Range&: VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
20774 P: [&](Value *V) {
20775 auto *IC = dyn_cast<ICmpInst>(Val: V);
20776 return IC && (IC->isSigned() ||
20777 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 0),
20778 SQ: SimplifyQuery(*DL)) ||
20779 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 1),
20780 SQ: SimplifyQuery(*DL)));
20781 });
20782 }
20783
20784 // If the maximum bit width we compute is less than the width of the roots'
20785 // type, we can proceed with the narrowing. Otherwise, do nothing.
20786 if (MaxBitWidth == 0 ||
20787 MaxBitWidth >=
20788 cast<IntegerType>(Val: TreeRoot.front()->getType()->getScalarType())
20789 ->getBitWidth()) {
20790 if (UserIgnoreList)
20791 AnalyzedMinBWVals.insert_range(R&: TreeRoot);
20792 NodesToKeepBWs.insert_range(R&: ToDemote);
20793 continue;
20794 }
20795
    // Finally, map the values we can demote to the maximum bit width we
    // computed.
20798 for (unsigned Idx : ToDemote) {
20799 TreeEntry *TE = VectorizableTree[Idx].get();
20800 if (MinBWs.contains(Val: TE))
20801 continue;
20802 bool IsSigned = any_of(Range&: TE->Scalars, P: [&](Value *R) {
20803 if (isa<PoisonValue>(Val: R))
20804 return false;
20805 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
20806 });
20807 MinBWs.try_emplace(Key: TE, Args&: MaxBitWidth, Args&: IsSigned);
20808 }
20809 }
20810}
20811
PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
20813 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
20814 auto *TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
20815 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(IR&: F);
20816 auto *AA = &AM.getResult<AAManager>(IR&: F);
20817 auto *LI = &AM.getResult<LoopAnalysis>(IR&: F);
20818 auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
20819 auto *AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
20820 auto *DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
20821 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
20822
20823 bool Changed = runImpl(F, SE_: SE, TTI_: TTI, TLI_: TLI, AA_: AA, LI_: LI, DT_: DT, AC_: AC, DB_: DB, ORE_: ORE);
20824 if (!Changed)
20825 return PreservedAnalyses::all();
20826
20827 PreservedAnalyses PA;
20828 PA.preserveSet<CFGAnalyses>();
20829 return PA;
20830}
20831
20832bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
20833 TargetTransformInfo *TTI_,
20834 TargetLibraryInfo *TLI_, AAResults *AA_,
20835 LoopInfo *LI_, DominatorTree *DT_,
20836 AssumptionCache *AC_, DemandedBits *DB_,
20837 OptimizationRemarkEmitter *ORE_) {
20838 if (!RunSLPVectorization)
20839 return false;
20840 SE = SE_;
20841 TTI = TTI_;
20842 TLI = TLI_;
20843 AA = AA_;
20844 LI = LI_;
20845 DT = DT_;
20846 AC = AC_;
20847 DB = DB_;
20848 DL = &F.getDataLayout();
20849
20850 Stores.clear();
20851 GEPs.clear();
20852 bool Changed = false;
20853
20854 // If the target claims to have no vector registers don't attempt
20855 // vectorization.
20856 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true))) {
20857 LLVM_DEBUG(
20858 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
20859 return false;
20860 }
20861
20862 // Don't vectorize when the attribute NoImplicitFloat is used.
20863 if (F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
20864 return false;
20865
20866 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
20867
20868 // Use the bottom up slp vectorizer to construct chains that start with
20869 // store instructions.
20870 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
20871
20872 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
20873 // delete instructions.
20874
20875 // Update DFS numbers now so that we can use them for ordering.
20876 DT->updateDFSNumbers();
20877
20878 // Scan the blocks in the function in post order.
20879 for (auto *BB : post_order(G: &F.getEntryBlock())) {
20880 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator()))
20881 continue;
20882
20883 // Start new block - clear the list of reduction roots.
20884 R.clearReductionData();
20885 collectSeedInstructions(BB);
20886
20887 // Vectorize trees that end at stores.
20888 if (!Stores.empty()) {
20889 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
20890 << " underlying objects.\n");
20891 Changed |= vectorizeStoreChains(R);
20892 }
20893
20894 // Vectorize trees that end at reductions.
20895 Changed |= vectorizeChainsInBlock(BB, R);
20896
20897 // Vectorize the index computations of getelementptr instructions. This
20898 // is primarily intended to catch gather-like idioms ending at
20899 // non-consecutive loads.
20900 if (!GEPs.empty()) {
20901 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
20902 << " underlying objects.\n");
20903 Changed |= vectorizeGEPIndices(BB, R);
20904 }
20905 }
20906
20907 if (Changed) {
20908 R.optimizeGatherSequence();
20909 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
20910 }
20911 return Changed;
20912}
20913
20914std::optional<bool>
20915SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
20916 unsigned Idx, unsigned MinVF,
20917 unsigned &Size) {
20918 Size = 0;
20919 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
20920 << "\n");
20921 const unsigned Sz = R.getVectorElementSize(V: Chain[0]);
20922 unsigned VF = Chain.size();
20923
20924 if (!has_single_bit(Value: Sz) ||
20925 !hasFullVectorsOrPowerOf2(
20926 TTI: *TTI, Ty: cast<StoreInst>(Val: Chain.front())->getValueOperand()->getType(),
20927 Sz: VF) ||
20928 VF < 2 || VF < MinVF) {
20929 // Check if vectorizing with a non-power-of-2 VF should be considered. At
20930 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
20931 // all vector lanes are used.
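    // E.g. a chain of 7 stores may still be considered, because 7 + 1 = 8 is a
    // power of 2.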
20932 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
20933 return false;
20934 }
20935
20936 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
20937 << "\n");
20938
20939 SetVector<Value *> ValOps;
20940 for (Value *V : Chain)
20941 ValOps.insert(X: cast<StoreInst>(Val: V)->getValueOperand());
  // If the operands do not share the same/alternate opcode, or the number of
  // unique values is not a power of 2, exit.
20943 InstructionsState S = getSameOpcode(VL: ValOps.getArrayRef(), TLI: *TLI);
20944 if (all_of(Range&: ValOps, P: IsaPred<Instruction>) && ValOps.size() > 1) {
20945 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
20946 bool IsAllowedSize =
20947 hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ValOps.front()->getType(),
20948 Sz: ValOps.size()) ||
20949 (VectorizeNonPowerOf2 && has_single_bit(Value: ValOps.size() + 1));
20950 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
20951 (!S.getMainOp()->isSafeToRemove() ||
20952 any_of(Range: ValOps.getArrayRef(),
20953 P: [&](Value *V) {
20954 return !isa<ExtractElementInst>(Val: V) &&
20955 (V->getNumUses() > Chain.size() ||
20956 any_of(Range: V->users(), P: [&](User *U) {
20957 return !Stores.contains(V: U);
20958 }));
20959 }))) ||
20960 (ValOps.size() > Chain.size() / 2 && !S)) {
20961 Size = (!IsAllowedSize && S) ? 1 : 2;
20962 return false;
20963 }
20964 }
20965 if (R.isLoadCombineCandidate(Stores: Chain))
20966 return true;
20967 R.buildTree(Roots: Chain);
  // Check if the tree is tiny and the store itself or its value is not
  // vectorized.
20969 if (R.isTreeTinyAndNotFullyVectorizable()) {
20970 if (R.isGathered(V: Chain.front()) ||
20971 R.isNotScheduled(V: cast<StoreInst>(Val: Chain.front())->getValueOperand()))
20972 return std::nullopt;
20973 Size = R.getCanonicalGraphSize();
20974 return false;
20975 }
20976 if (R.isProfitableToReorder()) {
20977 R.reorderTopToBottom();
20978 R.reorderBottomToTop();
20979 }
20980 R.transformNodes();
20981 R.buildExternalUses();
20982
20983 R.computeMinimumValueSizes();
20984
20985 Size = R.getCanonicalGraphSize();
20986 if (S && S.getOpcode() == Instruction::Load)
20987 Size = 2; // cut off masked gather small trees
20988 InstructionCost Cost = R.getTreeCost();
20989
20990 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
20991 if (Cost < -SLPCostThreshold) {
20992 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
20993
20994 using namespace ore;
20995
20996 R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "StoresVectorized",
20997 cast<StoreInst>(Val: Chain[0]))
20998 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
20999 << " and with tree size "
21000 << NV("TreeSize", R.getTreeSize()));
21001
21002 R.vectorizeTree();
21003 return true;
21004 }
21005
21006 return false;
21007}
21008
21009/// Checks if the quadratic mean deviation is less than 90% of the mean size.
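/// For example (illustrative numbers): sizes {4, 4, 4, 4} have zero deviation
/// and pass the check, whereas {2, 2, 2, 16} are spread too widely and fail.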
21010static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
21011 bool First) {
21012 unsigned Num = 0;
21013 uint64_t Sum = std::accumulate(
21014 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
21015 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
21016 unsigned Size = First ? Val.first : Val.second;
21017 if (Size == 1)
21018 return V;
21019 ++Num;
21020 return V + Size;
21021 });
21022 if (Num == 0)
21023 return true;
21024 uint64_t Mean = Sum / Num;
21025 if (Mean == 0)
21026 return true;
21027 uint64_t Dev = std::accumulate(
21028 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
21029 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
21030 unsigned P = First ? Val.first : Val.second;
21031 if (P == 1)
21032 return V;
21033 return V + (P - Mean) * (P - Mean);
21034 }) /
21035 Num;
21036 return Dev * 96 / (Mean * Mean) == 0;
21037}
21038
21039namespace {
21040
21041/// A group of stores that we'll try to bundle together using vector ops.
21042/// They are ordered using the signed distance of their address operand to the
21043/// address of this group's BaseInstr.
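/// For instance (a sketch): with a base of  store i32 %x, ptr %p,  stores to
/// %p + 4 and %p - 8 bytes (one i32 above and two below) are recorded with
/// distances +1 and -2 respectively.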
21044class RelatedStoreInsts {
21045public:
21046 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
21047 : AllStores(AllStores) {
21048 reset(NewBaseInstr: BaseInstrIdx);
21049 }
21050
21051 void reset(unsigned NewBaseInstr) {
21052 assert(NewBaseInstr < AllStores.size() &&
21053 "Instruction index out of bounds");
21054 BaseInstrIdx = NewBaseInstr;
21055 Instrs.clear();
21056 insertOrLookup(InstrIdx: NewBaseInstr, PtrDist: 0);
21057 }
21058
21059 /// Tries to insert \p InstrIdx as the store with a pointer distance of
21060 /// \p PtrDist.
21061 /// Does nothing if there is already a store with that \p PtrDist.
21062 /// \returns The previously associated Instruction index, or std::nullopt
21063 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
21064 auto [It, Inserted] = Instrs.emplace(args&: PtrDist, args&: InstrIdx);
21065 return Inserted ? std::nullopt : std::make_optional(t&: It->second);
21066 }
21067
21068 using DistToInstMap = std::map<int64_t, unsigned>;
21069 const DistToInstMap &getStores() const { return Instrs; }
21070
  /// If \p SI is related to this group of stores, return the distance of its
  /// pointer operand to that of the group's BaseInstr.
21073 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
21074 ScalarEvolution &SE) const {
21075 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
21076 return getPointersDiff(
21077 ElemTyA: BaseStore.getValueOperand()->getType(), PtrA: BaseStore.getPointerOperand(),
21078 ElemTyB: SI.getValueOperand()->getType(), PtrB: SI.getPointerOperand(), DL, SE,
21079 /*StrictCheck=*/true);
21080 }
21081
21082 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
21083 /// Stores whose index is less than \p MinSafeIdx will be dropped.
21084 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
21085 int64_t DistFromCurBase) {
21086 DistToInstMap PrevSet = std::move(Instrs);
21087 reset(NewBaseInstr: NewBaseInstIdx);
21088
21089 // Re-insert stores that come after MinSafeIdx to try and vectorize them
21090 // again. Their distance will be "rebased" to use NewBaseInstIdx as
21091 // reference.
21092 for (auto [Dist, InstIdx] : PrevSet) {
21093 if (InstIdx >= MinSafeIdx)
21094 insertOrLookup(InstrIdx: InstIdx, PtrDist: Dist - DistFromCurBase);
21095 }
21096 }
21097
21098 /// Remove all stores that have been vectorized from this group.
21099 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
21100 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
21101 Range: reverse(C&: Instrs), P: [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
21102 return VectorizedStores.contains(Ptr: AllStores[DistAndIdx.second]);
21103 });
21104
21105 // Get a forward iterator pointing after the last vectorized store and erase
21106 // all stores before it so we don't try to vectorize them again.
21107 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
21108 Instrs.erase(first: Instrs.begin(), last: VectorizedStoresEnd);
21109 }
21110
21111private:
21112 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
21113 unsigned BaseInstrIdx;
21114
21115 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
21116 DistToInstMap Instrs;
21117
21118 /// Reference to all the stores in the BB being analyzed.
21119 ArrayRef<StoreInst *> AllStores;
21120};
21121
21122} // end anonymous namespace
21123
21124bool SLPVectorizerPass::vectorizeStores(
21125 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
21126 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
21127 &Visited) {
21128 // We may run into multiple chains that merge into a single chain. We mark the
21129 // stores that we vectorized so that we don't visit the same store twice.
21130 BoUpSLP::ValueSet VectorizedStores;
21131 bool Changed = false;
21132
21133 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
21134 int64_t PrevDist = -1;
21135 BoUpSLP::ValueList Operands;
21136 // Collect the chain into a list.
21137 for (auto [Idx, Data] : enumerate(First: StoreSeq)) {
21138 auto &[Dist, InstIdx] = Data;
21139 if (Operands.empty() || Dist - PrevDist == 1) {
21140 Operands.push_back(Elt: Stores[InstIdx]);
21141 PrevDist = Dist;
21142 if (Idx != StoreSeq.size() - 1)
21143 continue;
21144 }
21145 auto E = make_scope_exit(F: [&, &Dist = Dist, &InstIdx = InstIdx]() {
21146 Operands.clear();
21147 Operands.push_back(Elt: Stores[InstIdx]);
21148 PrevDist = Dist;
21149 });
21150
21151 if (Operands.size() <= 1 ||
21152 !Visited
21153 .insert(V: {Operands.front(),
21154 cast<StoreInst>(Val: Operands.front())->getValueOperand(),
21155 Operands.back(),
21156 cast<StoreInst>(Val: Operands.back())->getValueOperand(),
21157 Operands.size()})
21158 .second)
21159 continue;
21160
21161 unsigned MaxVecRegSize = R.getMaxVecRegSize();
21162 unsigned EltSize = R.getVectorElementSize(V: Operands[0]);
21163 unsigned MaxElts = llvm::bit_floor(Value: MaxVecRegSize / EltSize);
21164
21165 unsigned MaxVF =
21166 std::min(a: R.getMaximumVF(ElemWidth: EltSize, Opcode: Instruction::Store), b: MaxElts);
21167 auto *Store = cast<StoreInst>(Val: Operands[0]);
21168 Type *StoreTy = Store->getValueOperand()->getType();
21169 Type *ValueTy = StoreTy;
21170 if (auto *Trunc = dyn_cast<TruncInst>(Val: Store->getValueOperand()))
21171 ValueTy = Trunc->getSrcTy();
      // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType, but
      // getStoreMinimumVF only supports scalar types as arguments. As a
      // result, we need to use the element types of StoreTy and ValueTy to
      // retrieve the VF and then transform it back.
      // Remember: VF is defined as the number of values we want to vectorize,
      // not the number of elements in the final vector.
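      // E.g. (illustrative numbers, REVEC case): for stores of <4 x i16>, a
      // scalar-based minimum VF of 8 i16 elements corresponds to 8 / 4 = 2
      // stores.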
21178 Type *StoreScalarTy = StoreTy->getScalarType();
21179 unsigned MinVF = PowerOf2Ceil(A: TTI->getStoreMinimumVF(
21180 VF: R.getMinVF(Sz: DL->getTypeStoreSizeInBits(Ty: StoreScalarTy)), ScalarMemTy: StoreScalarTy,
21181 ScalarValTy: ValueTy->getScalarType()));
21182 MinVF /= getNumElements(Ty: StoreTy);
21183 MinVF = std::max<unsigned>(a: 2, b: MinVF);
21184
21185 if (MaxVF < MinVF) {
21186 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
21187 << ") < "
21188 << "MinVF (" << MinVF << ")\n");
21189 continue;
21190 }
21191
21192 unsigned NonPowerOf2VF = 0;
21193 if (VectorizeNonPowerOf2) {
21194 // First try vectorizing with a non-power-of-2 VF. At the moment, only
21195 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
21196 // lanes are used.
21197 unsigned CandVF = std::clamp<unsigned>(val: Operands.size(), lo: MinVF, hi: MaxVF);
21198 if (has_single_bit(Value: CandVF + 1)) {
21199 NonPowerOf2VF = CandVF;
21200 assert(NonPowerOf2VF != MaxVF &&
21201 "Non-power-of-2 VF should not be equal to MaxVF");
21202 }
21203 }
21204
21205 // MaxRegVF represents the number of instructions (scalar, or vector in
21206 // case of revec) that can be vectorized to naturally fit in a vector
21207 // register.
21208 unsigned MaxRegVF = MaxVF;
21209
21210 MaxVF = std::min<unsigned>(a: MaxVF, b: bit_floor(Value: Operands.size()));
21211 if (MaxVF < MinVF) {
21212 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
21213 << ") < "
21214 << "MinVF (" << MinVF << ")\n");
21215 continue;
21216 }
21217
21218 SmallVector<unsigned> CandidateVFs;
21219 for (unsigned VF = std::max(a: MaxVF, b: NonPowerOf2VF); VF >= MinVF;
21220 VF = divideCeil(Numerator: VF, Denominator: 2))
21221 CandidateVFs.push_back(Elt: VF);
21222
21223 unsigned End = Operands.size();
21224 unsigned Repeat = 0;
21225 constexpr unsigned MaxAttempts = 4;
21226 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
21227 for (std::pair<unsigned, unsigned> &P : RangeSizes)
21228 P.first = P.second = 1;
21229 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
21230 auto IsNotVectorized = [](bool First,
21231 const std::pair<unsigned, unsigned> &P) {
21232 return First ? P.first > 0 : P.second > 0;
21233 };
21234 auto IsVectorized = [](bool First,
21235 const std::pair<unsigned, unsigned> &P) {
21236 return First ? P.first == 0 : P.second == 0;
21237 };
21238 auto VFIsProfitable = [](bool First, unsigned Size,
21239 const std::pair<unsigned, unsigned> &P) {
21240 return First ? Size >= P.first : Size >= P.second;
21241 };
21242 auto FirstSizeSame = [](unsigned Size,
21243 const std::pair<unsigned, unsigned> &P) {
21244 return Size == P.first;
21245 };
21246 while (true) {
21247 ++Repeat;
21248 bool RepeatChanged = false;
21249 bool AnyProfitableGraph = false;
21250 for (unsigned VF : CandidateVFs) {
21251 AnyProfitableGraph = false;
21252 unsigned FirstUnvecStore =
21253 std::distance(first: RangeSizes.begin(),
21254 last: find_if(Range&: RangeSizes, P: std::bind(f&: IsNotVectorized,
21255 args: VF >= MaxRegVF, args: _1)));
21256
21257 // Form slices of size VF starting from FirstUnvecStore and try to
21258 // vectorize them.
21259 while (FirstUnvecStore < End) {
21260 unsigned FirstVecStore = std::distance(
21261 first: RangeSizes.begin(),
21262 last: find_if(Range: RangeSizes.drop_front(N: FirstUnvecStore),
21263 P: std::bind(f&: IsVectorized, args: VF >= MaxRegVF, args: _1)));
21264 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
21265 for (unsigned SliceStartIdx = FirstUnvecStore;
21266 SliceStartIdx + VF <= MaxSliceEnd;) {
21267 if (!checkTreeSizes(Sizes: RangeSizes.slice(N: SliceStartIdx, M: VF),
21268 First: VF >= MaxRegVF)) {
21269 ++SliceStartIdx;
21270 continue;
21271 }
21272 ArrayRef<Value *> Slice =
21273 ArrayRef(Operands).slice(N: SliceStartIdx, M: VF);
21274 assert(all_of(Slice,
21275 [&](Value *V) {
21276 return cast<StoreInst>(V)
21277 ->getValueOperand()
21278 ->getType() ==
21279 cast<StoreInst>(Slice.front())
21280 ->getValueOperand()
21281 ->getType();
21282 }) &&
21283 "Expected all operands of same type.");
21284 if (!NonSchedulable.empty()) {
21285 auto [NonSchedSizeMax, NonSchedSizeMin] =
21286 NonSchedulable.lookup(Val: Slice.front());
21287 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
21288 // VF is too ambitious. Try to vectorize another slice before
21289 // trying a smaller VF.
21290 SliceStartIdx += NonSchedSizeMax;
21291 continue;
21292 }
21293 }
21294 unsigned TreeSize;
21295 std::optional<bool> Res =
21296 vectorizeStoreChain(Chain: Slice, R, Idx: SliceStartIdx, MinVF, Size&: TreeSize);
21297 if (!Res) {
21298 // Update the range of non schedulable VFs for slices starting
21299 // at SliceStartIdx.
21300 NonSchedulable
21301 .try_emplace(Key: Slice.front(), Args: std::make_pair(x&: VF, y&: VF))
21302 .first->getSecond()
21303 .second = VF;
21304 } else if (*Res) {
21305 // Mark the vectorized stores so that we don't vectorize them
21306 // again.
21307 VectorizedStores.insert_range(R&: Slice);
21310 AnyProfitableGraph = RepeatChanged = Changed = true;
21311 // If we vectorized initial block, no need to try to vectorize
21312 // it again.
21313 for (std::pair<unsigned, unsigned> &P :
21314 RangeSizes.slice(N: SliceStartIdx, M: VF))
21315 P.first = P.second = 0;
21316 if (SliceStartIdx < FirstUnvecStore + MinVF) {
21317 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
21318 N: FirstUnvecStore, M: SliceStartIdx - FirstUnvecStore))
21319 P.first = P.second = 0;
21320 FirstUnvecStore = SliceStartIdx + VF;
21321 }
21322 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
21323 for (std::pair<unsigned, unsigned> &P :
21324 RangeSizes.slice(N: SliceStartIdx + VF,
21325 M: MaxSliceEnd - (SliceStartIdx + VF)))
21326 P.first = P.second = 0;
21327 if (MaxSliceEnd == End)
21328 End = SliceStartIdx;
21329 MaxSliceEnd = SliceStartIdx;
21330 }
21331 SliceStartIdx += VF;
21332 continue;
21333 }
21334 if (VF > 2 && Res &&
21335 !all_of(Range: RangeSizes.slice(N: SliceStartIdx, M: VF),
21336 P: std::bind(f&: VFIsProfitable, args: VF >= MaxRegVF, args&: TreeSize,
21337 args: _1))) {
21338 SliceStartIdx += VF;
21339 continue;
21340 }
            // For very big VFs, check that we are not rebuilding the same
            // trees, just with a larger number of elements.
21343 if (VF > MaxRegVF && TreeSize > 1 &&
21344 all_of(Range: RangeSizes.slice(N: SliceStartIdx, M: VF),
21345 P: std::bind(f&: FirstSizeSame, args&: TreeSize, args: _1))) {
21346 SliceStartIdx += VF;
21347 while (SliceStartIdx != MaxSliceEnd &&
21348 RangeSizes[SliceStartIdx].first == TreeSize)
21349 ++SliceStartIdx;
21350 continue;
21351 }
21352 if (TreeSize > 1) {
21353 for (std::pair<unsigned, unsigned> &P :
21354 RangeSizes.slice(N: SliceStartIdx, M: VF)) {
21355 if (VF >= MaxRegVF)
21356 P.second = std::max(a: P.second, b: TreeSize);
21357 else
21358 P.first = std::max(a: P.first, b: TreeSize);
21359 }
21360 }
21361 ++SliceStartIdx;
21362 AnyProfitableGraph = true;
21363 }
21364 if (FirstUnvecStore >= End)
21365 break;
21366 if (MaxSliceEnd - FirstUnvecStore < VF &&
21367 MaxSliceEnd - FirstUnvecStore >= MinVF)
21368 AnyProfitableGraph = true;
21369 FirstUnvecStore = std::distance(
21370 first: RangeSizes.begin(),
21371 last: find_if(Range: RangeSizes.drop_front(N: MaxSliceEnd),
21372 P: std::bind(f&: IsNotVectorized, args: VF >= MaxRegVF, args: _1)));
21373 }
21374 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(Value: VF))
21375 break;
21376 }
21377 // All values vectorized - exit.
21378 if (all_of(Range&: RangeSizes, P: [](const std::pair<unsigned, unsigned> &P) {
21379 return P.first == 0 && P.second == 0;
21380 }))
21381 break;
21382 // Check if tried all attempts or no need for the last attempts at all.
21383 if (Repeat >= MaxAttempts ||
21384 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
21385 break;
21386 constexpr unsigned StoresLimit = 64;
21387 const unsigned MaxTotalNum = std::min<unsigned>(
21388 a: Operands.size(),
21389 b: static_cast<unsigned>(
21390 End -
21391 std::distance(
21392 first: RangeSizes.begin(),
21393 last: find_if(Range&: RangeSizes, P: std::bind(f&: IsNotVectorized, args: true, args: _1))) +
21394 1));
21395 unsigned VF = bit_ceil(Value: CandidateVFs.front()) * 2;
21396 unsigned Limit =
21397 getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: StoreTy, Sz: MaxTotalNum);
21398 CandidateVFs.clear();
21399 if (bit_floor(Value: Limit) == VF)
21400 CandidateVFs.push_back(Elt: Limit);
21401 if (VF > MaxTotalNum || VF >= StoresLimit)
21402 break;
21403 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
21404 if (P.first != 0)
21405 P.first = std::max(a: P.second, b: P.first);
21406 }
21407 // Last attempt to vectorize max number of elements, if all previous
21408 // attempts were unsuccessful because of the cost issues.
21409 CandidateVFs.push_back(Elt: VF);
21410 }
21411 }
21412 };
21413
21414 /// Groups of stores to vectorize
21415 SmallVector<RelatedStoreInsts> SortedStores;
21416
  // Inserts the specified store SI with the given index Idx into the set of
  // stores. If a store with the same distance is already present, stop the
  // insertion and try to vectorize the stores collected so far. If some stores
  // from this sequence were not vectorized, try to vectorize them together
  // with the new store later. However, this logic is applied only to the
  // stores that come before the previous store with the same distance.
21423 // Example:
21424 // 1. store x, %p
21425 // 2. store y, %p+1
21426 // 3. store z, %p+2
21427 // 4. store a, %p
21428 // 5. store b, %p+3
21429 // - Scan this from the last to first store. The very first bunch of stores is
21430 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
21431 // vector).
21432 // - The next store in the list - #1 - has the same distance from store #5 as
21433 // the store #4.
21434 // - Try to vectorize sequence of stores 4,2,3,5.
21435 // - If all these stores are vectorized - just drop them.
21436 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
21437 // - Start new stores sequence.
21438 // The new bunch of stores is {1, {1, 0}}.
21439 // - Add the stores from previous sequence, that were not vectorized.
  // Here we consider the stores in reversed order, rather than in the order
  // they appear in the IR (Stores are already reversed, see
  // vectorizeStoreChains()).
  // Store #3 can be added - it comes after store #4, which has the same
  // distance as store #1.
  // Store #5 cannot be added - it comes before store #4.
  // This logic improves compile time: we assume that the stores after the
  // previous store with the same distance most likely have memory dependencies
  // and it is not worth spending compile time trying to vectorize them.
21448 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
21449 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
21450 std::optional<int64_t> PtrDist;
21451 auto *RelatedStores = find_if(
21452 Range&: SortedStores, P: [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
21453 PtrDist = StoreSeq.getPointerDiff(SI&: *SI, DL: *DL, SE&: *SE);
21454 return PtrDist.has_value();
21455 });
21456
21457 // We did not find a comparable store, start a new group.
21458 if (RelatedStores == SortedStores.end()) {
21459 SortedStores.emplace_back(Args&: Idx, Args&: Stores);
21460 return;
21461 }
21462
21463 // If there is already a store in the group with the same PtrDiff, try to
21464 // vectorize the existing instructions before adding the current store.
21465 // Otherwise, insert this store and keep collecting.
21466 if (std::optional<unsigned> PrevInst =
21467 RelatedStores->insertOrLookup(InstrIdx: Idx, PtrDist: *PtrDist)) {
21468 TryToVectorize(RelatedStores->getStores());
21469 RelatedStores->clearVectorizedStores(VectorizedStores);
21470 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
21471 /*NewBaseInstIdx=*/Idx,
21472 /*DistFromCurBase=*/*PtrDist);
21473 }
21474 };
21475 Type *PrevValTy = nullptr;
21476 for (auto [I, SI] : enumerate(First&: Stores)) {
21477 if (R.isDeleted(I: SI))
21478 continue;
21479 if (!PrevValTy)
21480 PrevValTy = SI->getValueOperand()->getType();
21481 // Check that we do not try to vectorize stores of different types.
21482 if (PrevValTy != SI->getValueOperand()->getType()) {
21483 for (RelatedStoreInsts &StoreSeq : SortedStores)
21484 TryToVectorize(StoreSeq.getStores());
21485 SortedStores.clear();
21486 PrevValTy = SI->getValueOperand()->getType();
21487 }
21488 FillStoresSet(I, SI);
21489 }
21490
21491 // Final vectorization attempt.
21492 for (RelatedStoreInsts &StoreSeq : SortedStores)
21493 TryToVectorize(StoreSeq.getStores());
21494
21495 return Changed;
21496}
21497
21498void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
21499 // Initialize the collections. We will make a single pass over the block.
21500 Stores.clear();
21501 GEPs.clear();
21502
21503 // Visit the store and getelementptr instructions in BB and organize them in
21504 // Stores and GEPs according to the underlying objects of their pointer
21505 // operands.
21506 for (Instruction &I : *BB) {
21507 // Ignore store instructions that are volatile or have a pointer operand
21508 // that doesn't point to a scalar type.
21509 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
21510 if (!SI->isSimple())
21511 continue;
21512 if (!isValidElementType(Ty: SI->getValueOperand()->getType()))
21513 continue;
21514 Stores[getUnderlyingObject(V: SI->getPointerOperand())].push_back(Elt: SI);
21515 }
21516
21517 // Ignore getelementptr instructions that have more than one index, a
21518 // constant index, or a pointer operand that doesn't point to a scalar
21519 // type.
21520 else if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) {
21521 if (GEP->getNumIndices() != 1)
21522 continue;
21523 Value *Idx = GEP->idx_begin()->get();
21524 if (isa<Constant>(Val: Idx))
21525 continue;
21526 if (!isValidElementType(Ty: Idx->getType()))
21527 continue;
21528 if (GEP->getType()->isVectorTy())
21529 continue;
21530 GEPs[GEP->getPointerOperand()].push_back(Elt: GEP);
21531 }
21532 }
21533}
21534
21535bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
21536 bool MaxVFOnly) {
21537 if (VL.size() < 2)
21538 return false;
21539
21540 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
21541 << VL.size() << ".\n");
21542
21543 // Check that all of the parts are instructions of the same type,
21544 // we permit an alternate opcode via InstructionsState.
21545 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
21546 if (!S)
21547 return false;
21548
21549 Instruction *I0 = S.getMainOp();
21550 // Make sure invalid types (including vector type) are rejected before
21551 // determining vectorization factor for scalar instructions.
21552 for (Value *V : VL) {
21553 Type *Ty = V->getType();
21554 if (!isa<InsertElementInst>(Val: V) && !isValidElementType(Ty)) {
21555 // NOTE: the following will give user internal llvm type name, which may
21556 // not be useful.
21557 R.getORE()->emit(RemarkBuilder: [&]() {
21558 std::string TypeStr;
21559 llvm::raw_string_ostream OS(TypeStr);
21560 Ty->print(O&: OS);
21561 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
21562 << "Cannot SLP vectorize list: type "
21563 << TypeStr + " is unsupported by vectorizer";
21564 });
21565 return false;
21566 }
21567 }
21568
21569 Type *ScalarTy = getValueType(V: VL[0]);
21570 unsigned Sz = R.getVectorElementSize(V: I0);
21571 unsigned MinVF = R.getMinVF(Sz);
21572 unsigned MaxVF = std::max<unsigned>(
21573 a: getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: ScalarTy, Sz: VL.size()), b: MinVF);
21574 MaxVF = std::min(a: R.getMaximumVF(ElemWidth: Sz, Opcode: S.getOpcode()), b: MaxVF);
21575 if (MaxVF < 2) {
21576 R.getORE()->emit(RemarkBuilder: [&]() {
21577 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
21578 << "Cannot SLP vectorize list: vectorization factor "
21579 << "less than 2 is not supported";
21580 });
21581 return false;
21582 }
21583
21584 bool Changed = false;
21585 bool CandidateFound = false;
21586 InstructionCost MinCost = SLPCostThreshold.getValue();
21587
21588 unsigned NextInst = 0, MaxInst = VL.size();
21589 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
21590 VF = getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: I0->getType(), Sz: VF - 1)) {
    // No actual vectorization should happen if the number of parts is the same
    // as the provided vectorization factor (i.e. the scalar type is used for
    // vector code during codegen).
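    // E.g. (sketch): on a target with 64-bit vector registers, a VF of 4 with
    // i64 elements is split into 4 parts, which is effectively scalar code, so
    // that VF is skipped.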
21594 auto *VecTy = getWidenedType(ScalarTy, VF);
21595 if (TTI->getNumberOfParts(Tp: VecTy) == VF)
21596 continue;
21597 for (unsigned I = NextInst; I < MaxInst; ++I) {
21598 unsigned ActualVF = std::min(a: MaxInst - I, b: VF);
21599
21600 if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ScalarTy, Sz: ActualVF))
21601 continue;
21602
21603 if (MaxVFOnly && ActualVF < MaxVF)
21604 break;
21605 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
21606 break;
21607
21608 SmallVector<Value *> Ops(ActualVF, nullptr);
21609 unsigned Idx = 0;
21610 for (Value *V : VL.drop_front(N: I)) {
21611 // Check that a previous iteration of this loop did not delete the
21612 // Value.
21613 if (auto *Inst = dyn_cast<Instruction>(Val: V);
21614 !Inst || !R.isDeleted(I: Inst)) {
21615 Ops[Idx] = V;
21616 ++Idx;
21617 if (Idx == ActualVF)
21618 break;
21619 }
21620 }
21621 // Not enough vectorizable instructions - exit.
21622 if (Idx != ActualVF)
21623 break;
21624
21625 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
21626 << "\n");
21627
21628 R.buildTree(Roots: Ops);
21629 if (R.isTreeTinyAndNotFullyVectorizable())
21630 continue;
21631 if (R.isProfitableToReorder()) {
21632 R.reorderTopToBottom();
21633 R.reorderBottomToTop(IgnoreReorder: !isa<InsertElementInst>(Val: Ops.front()));
21634 }
21635 R.transformNodes();
21636 R.buildExternalUses();
21637
21638 R.computeMinimumValueSizes();
21639 InstructionCost Cost = R.getTreeCost();
21640 CandidateFound = true;
21641 MinCost = std::min(a: MinCost, b: Cost);
21642
21643 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
21644 << " for VF=" << ActualVF << "\n");
21645 if (Cost < -SLPCostThreshold) {
21646 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
21647 R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "VectorizedList",
21648 cast<Instruction>(Val: Ops[0]))
21649 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
21650 << " and with tree size "
21651 << ore::NV("TreeSize", R.getTreeSize()));
21652
21653 R.vectorizeTree();
21654 // Move to the next bundle.
21655 I += VF - 1;
21656 NextInst = I + 1;
21657 Changed = true;
21658 }
21659 }
21660 }
21661
21662 if (!Changed && CandidateFound) {
21663 R.getORE()->emit(RemarkBuilder: [&]() {
21664 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
21665 << "List vectorization was possible but not beneficial with cost "
21666 << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Threshold", -SLPCostThreshold);
21668 });
21669 } else if (!Changed) {
21670 R.getORE()->emit(RemarkBuilder: [&]() {
21671 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
21672 << "Cannot SLP vectorize list: vectorization was impossible"
21673 << " with available vectorization factors";
21674 });
21675 }
21676 return Changed;
21677}
21678
21679bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
21680 if (!I)
21681 return false;
21682
21683 if (!isa<BinaryOperator, CmpInst>(Val: I) || isa<VectorType>(Val: I->getType()))
21684 return false;
21685
21686 Value *P = I->getParent();
21687
21688 // Vectorize in current basic block only.
21689 auto *Op0 = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
21690 auto *Op1 = dyn_cast<Instruction>(Val: I->getOperand(i: 1));
21691 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
21692 R.isDeleted(I: Op0) || R.isDeleted(I: Op1))
21693 return false;
21694
21695 // First collect all possible candidates
21696 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
21697 Candidates.emplace_back(Args&: Op0, Args&: Op1);
21698
21699 auto *A = dyn_cast<BinaryOperator>(Val: Op0);
21700 auto *B = dyn_cast<BinaryOperator>(Val: Op1);
21701 // Try to skip B.
21702 if (A && B && B->hasOneUse()) {
21703 auto *B0 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 0));
21704 auto *B1 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 1));
21705 if (B0 && B0->getParent() == P && !R.isDeleted(I: B0))
21706 Candidates.emplace_back(Args&: A, Args&: B0);
21707 if (B1 && B1->getParent() == P && !R.isDeleted(I: B1))
21708 Candidates.emplace_back(Args&: A, Args&: B1);
21709 }
21710 // Try to skip A.
21711 if (B && A && A->hasOneUse()) {
21712 auto *A0 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 0));
21713 auto *A1 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 1));
21714 if (A0 && A0->getParent() == P && !R.isDeleted(I: A0))
21715 Candidates.emplace_back(Args&: A0, Args&: B);
21716 if (A1 && A1->getParent() == P && !R.isDeleted(I: A1))
21717 Candidates.emplace_back(Args&: A1, Args&: B);
21718 }
21719
21720 if (Candidates.size() == 1)
21721 return tryToVectorizeList(VL: {Op0, Op1}, R);
21722
21723 // We have multiple options. Try to pick the single best.
21724 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
21725 if (!BestCandidate)
21726 return false;
21727 return tryToVectorizeList(
21728 VL: {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
21729}
21730
21731namespace {
21732
21733/// Model horizontal reductions.
21734///
21735/// A horizontal reduction is a tree of reduction instructions that has values
21736/// that can be put into a vector as its leaves. For example:
21737///
21738/// mul mul mul mul
21739/// \ / \ /
21740/// + +
21741/// \ /
21742/// +
21743/// This tree has "mul" as its leaf values and "+" as its reduction
21744/// instructions. A reduction can feed into a store or a binary operation
21745/// feeding a phi.
21746/// ...
21747/// \ /
21748/// +
21749/// |
21750/// phi +=
21751///
21752/// Or:
21753/// ...
21754/// \ /
21755/// +
21756/// |
21757/// *p =
21758///
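/// As a purely illustrative sketch (the value names are invented, not taken
/// from any particular test), the kind of IR this models is a chain such as:
///   %m0 = mul i32 %a0, %b0
///   %m1 = mul i32 %a1, %b1
///   %m2 = mul i32 %a2, %b2
///   %m3 = mul i32 %a3, %b3
///   %s0 = add i32 %m0, %m1
///   %s1 = add i32 %m2, %m3
///   %s2 = add i32 %s0, %s1
///   store i32 %s2, ptr %p
/// Here the "add" instructions are the reduction operations and the "mul"
/// results are the reduced values that may be gathered into a vector.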
21759class HorizontalReduction {
21760 using ReductionOpsType = SmallVector<Value *, 16>;
21761 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
21762 ReductionOpsListType ReductionOps;
21763 /// List of possibly reduced values.
21764 SmallVector<SmallVector<Value *>> ReducedVals;
21765 /// Maps reduced value to the corresponding reduction operation.
21766 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
21767 WeakTrackingVH ReductionRoot;
21768 /// The type of reduction operation.
21769 RecurKind RdxKind;
21770 /// Checks if the optimization of original scalar identity operations on
21771 /// matched horizontal reductions is enabled and allowed.
21772 bool IsSupportedHorRdxIdentityOp = false;
21773 /// Contains vector values for reduction including their scale factor and
21774 /// signedness.
21775 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
21776
21777 static bool isCmpSelMinMax(Instruction *I) {
21778 return match(V: I, P: m_Select(C: m_Cmp(), L: m_Value(), R: m_Value())) &&
21779 RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: getRdxKind(V: I));
21780 }
21781
21782 // And/or are potentially poison-safe logical patterns like:
21783 // select x, y, false
21784 // select x, true, y
21785 static bool isBoolLogicOp(Instruction *I) {
21786 return isa<SelectInst>(Val: I) &&
21787 (match(V: I, P: m_LogicalAnd()) || match(V: I, P: m_LogicalOr()));
21788 }
21789
21790 /// Checks if instruction is associative and can be vectorized.
21791 static bool isVectorizable(RecurKind Kind, Instruction *I) {
21792 if (Kind == RecurKind::None)
21793 return false;
21794
21795 // Integer ops that map to select instructions or intrinsics are fine.
21796 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
21797 isBoolLogicOp(I))
21798 return true;
21799
21800 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
21801 // FP min/max are associative except for NaN and -0.0. We do not
21802 // have to rule out -0.0 here because the intrinsic semantics do not
21803 // specify a fixed result for it.
21804 return I->getFastMathFlags().noNaNs();
21805 }
21806
21807 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
21808 return true;
21809
21810 return I->isAssociative();
21811 }
21812
21813 static Value *getRdxOperand(Instruction *I, unsigned Index) {
21814 // Poison-safe 'or' takes the form: select X, true, Y
21815 // To make that work with the normal operand processing, we skip the
21816 // true value operand.
21817 // TODO: Change the code and data structures to handle this without a hack.
21818 if (getRdxKind(V: I) == RecurKind::Or && isa<SelectInst>(Val: I) && Index == 1)
21819 return I->getOperand(i: 2);
21820 return I->getOperand(i: Index);
21821 }
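  // A small worked example (assumed IR, for illustration): for the
  // poison-safe 'or' form
  //   %rdx = select i1 %x, i1 true, i1 %y
  // getRdxOperand(%rdx, 0) returns %x and getRdxOperand(%rdx, 1) returns %y,
  // i.e. the same two operands a plain 'or %x, %y' would expose.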
21822
21823 /// Creates reduction operation with the current opcode.
21824 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
21825 Value *RHS, const Twine &Name, bool UseSelect) {
21826 Type *OpTy = LHS->getType();
21827 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
21828 switch (Kind) {
21829 case RecurKind::Or: {
21830 if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy))
21831 return Builder.CreateSelect(
21832 C: LHS, True: ConstantInt::getAllOnesValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)),
21833 False: RHS, Name);
21834 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
21835 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
21836 Name);
21837 }
21838 case RecurKind::And: {
21839 if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy))
21840 return Builder.CreateSelect(
21841 C: LHS, True: RHS,
21842 False: ConstantInt::getNullValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)), Name);
21843 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
21844 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
21845 Name);
21846 }
21847 case RecurKind::Add:
21848 case RecurKind::Mul:
21849 case RecurKind::Xor:
21850 case RecurKind::FAdd:
21851 case RecurKind::FMul: {
21852 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
21853 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
21854 Name);
21855 }
21856 case RecurKind::SMax:
21857 case RecurKind::SMin:
21858 case RecurKind::UMax:
21859 case RecurKind::UMin:
21860 if (UseSelect) {
21861 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(RK: Kind);
21862 Value *Cmp = Builder.CreateICmp(P: Pred, LHS, RHS, Name);
21863 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
21864 }
21865 [[fallthrough]];
21866 case RecurKind::FMax:
21867 case RecurKind::FMin:
21868 case RecurKind::FMaximum:
21869 case RecurKind::FMinimum:
21870 case RecurKind::FMaximumNum:
21871 case RecurKind::FMinimumNum: {
21872 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(RK: Kind);
21873 return Builder.CreateBinaryIntrinsic(ID: Id, LHS, RHS);
21874 }
21875 default:
21876 llvm_unreachable("Unknown reduction operation.");
21877 }
21878 }
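  // For illustration (assumed IR): with UseSelect == true and Kind ==
  // RecurKind::SMax the helper above emits the cmp + select form
  //   %cmp = icmp sgt i32 %lhs, %rhs
  //   %rdx = select i1 %cmp, i32 %lhs, i32 %rhs
  // whereas with UseSelect == false it emits the smax intrinsic instead.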
21879
21880 /// Creates reduction operation with the current opcode with the IR flags
21881 /// from \p ReductionOps, dropping nuw/nsw flags.
21882 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
21883 Value *RHS, const Twine &Name,
21884 const ReductionOpsListType &ReductionOps) {
21885 bool UseSelect = ReductionOps.size() == 2 ||
21886 // Logical or/and.
21887 (ReductionOps.size() == 1 &&
21888 any_of(Range: ReductionOps.front(), P: IsaPred<SelectInst>));
21889 assert((!UseSelect || ReductionOps.size() != 2 ||
21890 isa<SelectInst>(ReductionOps[1][0])) &&
21891 "Expected cmp + select pairs for reduction");
21892 Value *Op = createOp(Builder, Kind: RdxKind, LHS, RHS, Name, UseSelect);
21893 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: RdxKind)) {
21894 if (auto *Sel = dyn_cast<SelectInst>(Val: Op)) {
21895 propagateIRFlags(I: Sel->getCondition(), VL: ReductionOps[0], OpValue: nullptr,
21896 /*IncludeWrapFlags=*/false);
21897 propagateIRFlags(I: Op, VL: ReductionOps[1], OpValue: nullptr,
21898 /*IncludeWrapFlags=*/false);
21899 return Op;
21900 }
21901 }
21902 propagateIRFlags(I: Op, VL: ReductionOps[0], OpValue: nullptr, /*IncludeWrapFlags=*/false);
21903 return Op;
21904 }
21905
21906public:
21907 static RecurKind getRdxKind(Value *V) {
21908 auto *I = dyn_cast<Instruction>(Val: V);
21909 if (!I)
21910 return RecurKind::None;
21911 if (match(V: I, P: m_Add(L: m_Value(), R: m_Value())))
21912 return RecurKind::Add;
21913 if (match(V: I, P: m_Mul(L: m_Value(), R: m_Value())))
21914 return RecurKind::Mul;
21915 if (match(V: I, P: m_And(L: m_Value(), R: m_Value())) ||
21916 match(V: I, P: m_LogicalAnd(L: m_Value(), R: m_Value())))
21917 return RecurKind::And;
21918 if (match(V: I, P: m_Or(L: m_Value(), R: m_Value())) ||
21919 match(V: I, P: m_LogicalOr(L: m_Value(), R: m_Value())))
21920 return RecurKind::Or;
21921 if (match(V: I, P: m_Xor(L: m_Value(), R: m_Value())))
21922 return RecurKind::Xor;
21923 if (match(V: I, P: m_FAdd(L: m_Value(), R: m_Value())))
21924 return RecurKind::FAdd;
21925 if (match(V: I, P: m_FMul(L: m_Value(), R: m_Value())))
21926 return RecurKind::FMul;
21927
21928 if (match(V: I, P: m_Intrinsic<Intrinsic::maxnum>(Op0: m_Value(), Op1: m_Value())))
21929 return RecurKind::FMax;
21930 if (match(V: I, P: m_Intrinsic<Intrinsic::minnum>(Op0: m_Value(), Op1: m_Value())))
21931 return RecurKind::FMin;
21932
21933 if (match(V: I, P: m_FMaximum(Op0: m_Value(), Op1: m_Value())))
21934 return RecurKind::FMaximum;
21935 if (match(V: I, P: m_FMinimum(Op0: m_Value(), Op1: m_Value())))
21936 return RecurKind::FMinimum;
21937 // This matches either cmp+select or intrinsics. SLP is expected to handle
21938 // either form.
21939 // TODO: If we are canonicalizing to intrinsics, we can remove several
21940 // special-case paths that deal with selects.
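    // For instance, both of these (illustrative IR) map to RecurKind::SMin:
    //   %s1 = call i32 @llvm.smin.i32(i32 %a, i32 %b)
    // or
    //   %c  = icmp slt i32 %a, %b
    //   %s2 = select i1 %c, i32 %a, i32 %b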
21941 if (match(V: I, P: m_SMax(L: m_Value(), R: m_Value())))
21942 return RecurKind::SMax;
21943 if (match(V: I, P: m_SMin(L: m_Value(), R: m_Value())))
21944 return RecurKind::SMin;
21945 if (match(V: I, P: m_UMax(L: m_Value(), R: m_Value())))
21946 return RecurKind::UMax;
21947 if (match(V: I, P: m_UMin(L: m_Value(), R: m_Value())))
21948 return RecurKind::UMin;
21949
21950 if (auto *Select = dyn_cast<SelectInst>(Val: I)) {
21951 // Try harder: look for min/max pattern based on instructions producing
21952 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
21953 // During the intermediate stages of SLP, it's very common to have
21954 // pattern like this (since optimizeGatherSequence is run only once
21955 // at the end):
21956 // %1 = extractelement <2 x i32> %a, i32 0
21957 // %2 = extractelement <2 x i32> %a, i32 1
21958 // %cond = icmp sgt i32 %1, %2
21959 // %3 = extractelement <2 x i32> %a, i32 0
21960 // %4 = extractelement <2 x i32> %a, i32 1
21961 // %select = select i1 %cond, i32 %3, i32 %4
21962 CmpPredicate Pred;
21963 Instruction *L1;
21964 Instruction *L2;
21965
21966 Value *LHS = Select->getTrueValue();
21967 Value *RHS = Select->getFalseValue();
21968 Value *Cond = Select->getCondition();
21969
21970 // TODO: Support inverse predicates.
21971 if (match(V: Cond, P: m_Cmp(Pred, L: m_Specific(V: LHS), R: m_Instruction(I&: L2)))) {
21972 if (!isa<ExtractElementInst>(Val: RHS) ||
21973 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
21974 return RecurKind::None;
21975 } else if (match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Specific(V: RHS)))) {
21976 if (!isa<ExtractElementInst>(Val: LHS) ||
21977 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)))
21978 return RecurKind::None;
21979 } else {
21980 if (!isa<ExtractElementInst>(Val: LHS) || !isa<ExtractElementInst>(Val: RHS))
21981 return RecurKind::None;
21982 if (!match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Instruction(I&: L2))) ||
21983 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)) ||
21984 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
21985 return RecurKind::None;
21986 }
21987
21988 switch (Pred) {
21989 default:
21990 return RecurKind::None;
21991 case CmpInst::ICMP_SGT:
21992 case CmpInst::ICMP_SGE:
21993 return RecurKind::SMax;
21994 case CmpInst::ICMP_SLT:
21995 case CmpInst::ICMP_SLE:
21996 return RecurKind::SMin;
21997 case CmpInst::ICMP_UGT:
21998 case CmpInst::ICMP_UGE:
21999 return RecurKind::UMax;
22000 case CmpInst::ICMP_ULT:
22001 case CmpInst::ICMP_ULE:
22002 return RecurKind::UMin;
22003 }
22004 }
22005 return RecurKind::None;
22006 }
22007
22008 /// Get the index of the first operand.
22009 static unsigned getFirstOperandIndex(Instruction *I) {
22010 return isCmpSelMinMax(I) ? 1 : 0;
22011 }
22012
22013private:
22014 /// Total number of operands in the reduction operation.
22015 static unsigned getNumberOfOperands(Instruction *I) {
22016 return isCmpSelMinMax(I) ? 3 : 2;
22017 }
22018
22019 /// Checks if the instruction is in basic block \p BB.
22020 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
22021 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
22022 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
22023 auto *Sel = cast<SelectInst>(Val: I);
22024 auto *Cmp = dyn_cast<Instruction>(Val: Sel->getCondition());
22025 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
22026 }
22027 return I->getParent() == BB;
22028 }
22029
22030 /// Expected number of uses for reduction operations/reduced values.
22031 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
22032 if (IsCmpSelMinMax) {
22033       // The SelectInst must be used twice, while the condition op must have
22034       // a single use only.
22035 if (auto *Sel = dyn_cast<SelectInst>(Val: I))
22036 return Sel->hasNUses(N: 2) && Sel->getCondition()->hasOneUse();
22037 return I->hasNUses(N: 2);
22038 }
22039
22040 // Arithmetic reduction operation must be used once only.
22041 return I->hasOneUse();
22042 }
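  // Illustration of the use counts above (assumed IR): in a cmp + select
  // min/max chain such as
  //   %c1 = icmp sgt i32 %a, %b
  //   %m1 = select i1 %c1, i32 %a, i32 %b
  //   %c2 = icmp sgt i32 %m1, %x
  //   %m2 = select i1 %c2, i32 %m1, i32 %x
  // the inner select %m1 has exactly two uses (%c2 and %m2) while its
  // condition %c1 has a single use, which is what the check expects.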
22043
22044 /// Initializes the list of reduction operations.
22045 void initReductionOps(Instruction *I) {
22046 if (isCmpSelMinMax(I))
22047 ReductionOps.assign(NumElts: 2, Elt: ReductionOpsType());
22048 else
22049 ReductionOps.assign(NumElts: 1, Elt: ReductionOpsType());
22050 }
22051
22052 /// Add all reduction operations for the reduction instruction \p I.
22053 void addReductionOps(Instruction *I) {
22054 if (isCmpSelMinMax(I)) {
22055 ReductionOps[0].emplace_back(Args: cast<SelectInst>(Val: I)->getCondition());
22056 ReductionOps[1].emplace_back(Args&: I);
22057 } else {
22058 ReductionOps[0].emplace_back(Args&: I);
22059 }
22060 }
22061
22062 static bool isGoodForReduction(ArrayRef<Value *> Data) {
22063 int Sz = Data.size();
22064 auto *I = dyn_cast<Instruction>(Val: Data.front());
22065 return Sz > 1 || isConstant(V: Data.front()) ||
22066 (I && !isa<LoadInst>(Val: I) && isValidForAlternation(Opcode: I->getOpcode()));
22067 }
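  // Informal reading of the check above: a group is worth reducing if it has
  // more than one element, or its single element is a constant or a non-load
  // instruction whose opcode is valid for alternation; a lone load is not
  // considered good on its own.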
22068
22069public:
22070 HorizontalReduction() = default;
22071
22072 /// Try to find a reduction tree.
22073 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
22074 ScalarEvolution &SE, const DataLayout &DL,
22075 const TargetLibraryInfo &TLI) {
22076 RdxKind = HorizontalReduction::getRdxKind(V: Root);
22077 if (!isVectorizable(Kind: RdxKind, I: Root))
22078 return false;
22079
22080 // Analyze "regular" integer/FP types for reductions - no target-specific
22081 // types or pointers.
22082 Type *Ty = Root->getType();
22083 if (!isValidElementType(Ty) || Ty->isPointerTy())
22084 return false;
22085
22086     // Though the ultimate reduction may have multiple uses, its condition
22087     // must have only a single use.
22088 if (auto *Sel = dyn_cast<SelectInst>(Val: Root))
22089 if (!Sel->getCondition()->hasOneUse())
22090 return false;
22091
22092 ReductionRoot = Root;
22093
22094 // Iterate through all the operands of the possible reduction tree and
22095 // gather all the reduced values, sorting them by their value id.
22096 BasicBlock *BB = Root->getParent();
22097 bool IsCmpSelMinMax = isCmpSelMinMax(I: Root);
22098 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
22099 1, std::make_pair(x&: Root, y: 0));
22100 // Checks if the operands of the \p TreeN instruction are also reduction
22101 // operations or should be treated as reduced values or an extra argument,
22102 // which is not part of the reduction.
22103 auto CheckOperands = [&](Instruction *TreeN,
22104 SmallVectorImpl<Value *> &PossibleReducedVals,
22105 SmallVectorImpl<Instruction *> &ReductionOps,
22106 unsigned Level) {
22107 for (int I : reverse(C: seq<int>(Begin: getFirstOperandIndex(I: TreeN),
22108 End: getNumberOfOperands(I: TreeN)))) {
22109 Value *EdgeVal = getRdxOperand(I: TreeN, Index: I);
22110 ReducedValsToOps[EdgeVal].push_back(Elt: TreeN);
22111 auto *EdgeInst = dyn_cast<Instruction>(Val: EdgeVal);
22112         // If the edge is not an instruction, or it differs from the main
22113         // reduction opcode or has too many uses, treat it as a possible
22114         // reduced value. Also, do not try to reduce constant values if the
22115         // operation is not foldable.
22116 if (!EdgeInst || Level > RecursionMaxDepth ||
22117 getRdxKind(V: EdgeInst) != RdxKind ||
22118 IsCmpSelMinMax != isCmpSelMinMax(I: EdgeInst) ||
22119 !hasRequiredNumberOfUses(IsCmpSelMinMax, I: EdgeInst) ||
22120 !isVectorizable(Kind: RdxKind, I: EdgeInst) ||
22121 (R.isAnalyzedReductionRoot(I: EdgeInst) &&
22122 all_of(Range: EdgeInst->operands(), P: IsaPred<Constant>))) {
22123 PossibleReducedVals.push_back(Elt: EdgeVal);
22124 continue;
22125 }
22126 ReductionOps.push_back(Elt: EdgeInst);
22127 }
22128 };
22129     // Try to regroup the reduced values so that reducing them becomes more
22130     // profitable. Values are grouped by their value ids, instructions by their
22131     // opcode id and/or alternate opcode id, plus extra analysis is done for
22132     // loads (grouping them by the distance between pointers) and cmp
22133     // instructions (grouping them by the predicate).
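    // As a rough, hypothetical illustration of the grouping: for reduced
    // values
    //   %l0 = load i32, ptr %p
    //   %l1 = load i32, ptr %q
    //   %g  = getelementptr inbounds i32, ptr %p, i64 1
    //   %l2 = load i32, ptr %g
    // the loads %l0 and %l2, whose pointers differ by a known constant, are
    // expected to end up in the same group so they can later form a
    // consecutive vector load, while %l1 is keyed separately.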
22134 SmallMapVector<
22135 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
22136 8>
22137 PossibleReducedVals;
22138 initReductionOps(I: Root);
22139 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
22140 SmallSet<size_t, 2> LoadKeyUsed;
22141
22142 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
22143 Key = hash_combine(args: hash_value(ptr: LI->getParent()), args: Key);
22144 Value *Ptr =
22145 getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
22146 if (!LoadKeyUsed.insert(V: Key).second) {
22147 auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr));
22148 if (LIt != LoadsMap.end()) {
22149 for (LoadInst *RLI : LIt->second) {
22150 if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
22151 ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL, SE,
22152 /*StrictCheck=*/true))
22153 return hash_value(ptr: RLI->getPointerOperand());
22154 }
22155 for (LoadInst *RLI : LIt->second) {
22156 if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
22157 Ptr2: LI->getPointerOperand(), TLI)) {
22158 hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
22159 return SubKey;
22160 }
22161 }
22162 if (LIt->second.size() > 2) {
22163 hash_code SubKey =
22164 hash_value(ptr: LIt->second.back()->getPointerOperand());
22165 return SubKey;
22166 }
22167 }
22168 }
22169 LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr))
22170 .first->second.push_back(Elt: LI);
22171 return hash_value(ptr: LI->getPointerOperand());
22172 };
22173
22174 while (!Worklist.empty()) {
22175 auto [TreeN, Level] = Worklist.pop_back_val();
22176 SmallVector<Value *> PossibleRedVals;
22177 SmallVector<Instruction *> PossibleReductionOps;
22178 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
22179 addReductionOps(I: TreeN);
22180 // Add reduction values. The values are sorted for better vectorization
22181 // results.
22182 for (Value *V : PossibleRedVals) {
22183 size_t Key, Idx;
22184 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
22185 /*AllowAlternate=*/false);
22186 ++PossibleReducedVals[Key][Idx]
22187 .insert(KV: std::make_pair(x&: V, y: 0))
22188 .first->second;
22189 }
22190 for (Instruction *I : reverse(C&: PossibleReductionOps))
22191 Worklist.emplace_back(Args&: I, Args: I->getParent() == BB ? 0 : Level + 1);
22192 }
22193 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
22194     // Sort values by the total number of value kinds so that the reduction
22195     // starts from the longest possible sequences of reduced values.
22196 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
22197 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
22198 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
22199 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
22200 It != E; ++It) {
22201 PossibleRedValsVect.emplace_back();
22202 auto RedValsVect = It->second.takeVector();
22203 stable_sort(Range&: RedValsVect, C: llvm::less_second());
22204 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
22205 PossibleRedValsVect.back().append(NumInputs: Data.second, Elt: Data.first);
22206 }
22207 stable_sort(Range&: PossibleRedValsVect, C: [](const auto &P1, const auto &P2) {
22208 return P1.size() > P2.size();
22209 });
22210 int NewIdx = -1;
22211 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
22212 if (NewIdx < 0 ||
22213 (!isGoodForReduction(Data) &&
22214 (!isa<LoadInst>(Val: Data.front()) ||
22215 !isa<LoadInst>(Val: ReducedVals[NewIdx].front()) ||
22216 getUnderlyingObject(
22217 V: cast<LoadInst>(Val: Data.front())->getPointerOperand()) !=
22218 getUnderlyingObject(
22219 V: cast<LoadInst>(Val: ReducedVals[NewIdx].front())
22220 ->getPointerOperand())))) {
22221 NewIdx = ReducedVals.size();
22222 ReducedVals.emplace_back();
22223 }
22224 ReducedVals[NewIdx].append(in_start: Data.rbegin(), in_end: Data.rend());
22225 }
22226 }
22227     // Sort the reduced values by the number of values with the same/alternate
22228     // opcode and/or pointer operand.
22229 stable_sort(Range&: ReducedVals, C: [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
22230 return P1.size() > P2.size();
22231 });
22232 return true;
22233 }
22234
22235 /// Attempt to vectorize the tree found by matchAssociativeReduction.
22236 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
22237 const TargetLibraryInfo &TLI, AssumptionCache *AC) {
22238 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
22239 constexpr unsigned RegMaxNumber = 4;
22240 constexpr unsigned RedValsMaxNumber = 128;
22241 // If there are a sufficient number of reduction values, reduce
22242 // to a nearby power-of-2. We can safely generate oversized
22243 // vectors and rely on the backend to split them to legal sizes.
22244 if (unsigned NumReducedVals = std::accumulate(
22245 first: ReducedVals.begin(), last: ReducedVals.end(), init: 0,
22246 binary_op: [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
22247 if (!isGoodForReduction(Data: Vals))
22248 return Num;
22249 return Num + Vals.size();
22250 });
22251 NumReducedVals < ReductionLimit &&
22252 all_of(Range&: ReducedVals, P: [](ArrayRef<Value *> RedV) {
22253 return RedV.size() < 2 || !allConstant(VL: RedV) || !isSplat(VL: RedV);
22254 })) {
22255 for (ReductionOpsType &RdxOps : ReductionOps)
22256 for (Value *RdxOp : RdxOps)
22257 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
22258 return nullptr;
22259 }
22260
22261 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
22262 TargetFolder(DL));
22263 Builder.SetInsertPoint(cast<Instruction>(Val&: ReductionRoot));
22264
22265     // Track the reduced values in case they are replaced by extractelement
22266     // instructions because of the vectorization.
22267 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
22268 ReducedVals.front().size());
22269
22270 // The compare instruction of a min/max is the insertion point for new
22271 // instructions and may be replaced with a new compare instruction.
22272 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
22273 assert(isa<SelectInst>(RdxRootInst) &&
22274 "Expected min/max reduction to have select root instruction");
22275 Value *ScalarCond = cast<SelectInst>(Val: RdxRootInst)->getCondition();
22276 assert(isa<Instruction>(ScalarCond) &&
22277 "Expected min/max reduction to have compare condition");
22278 return cast<Instruction>(Val: ScalarCond);
22279 };
22280
22281 bool AnyBoolLogicOp = any_of(Range&: ReductionOps.back(), P: [](Value *V) {
22282 return isBoolLogicOp(I: cast<Instruction>(Val: V));
22283 });
22284 // Return new VectorizedTree, based on previous value.
22285 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
22286 if (VectorizedTree) {
22287 // Update the final value in the reduction.
22288 Builder.SetCurrentDebugLocation(
22289 cast<Instruction>(Val: ReductionOps.front().front())->getDebugLoc());
22290 if (AnyBoolLogicOp) {
22291 auto It = ReducedValsToOps.find(Val: VectorizedTree);
22292 auto It1 = ReducedValsToOps.find(Val: Res);
22293 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
22294 isGuaranteedNotToBePoison(V: VectorizedTree, AC) ||
22295 (It != ReducedValsToOps.end() &&
22296 any_of(Range&: It->getSecond(), P: [&](Instruction *I) {
22297 return isBoolLogicOp(I) &&
22298 getRdxOperand(I, Index: 0) == VectorizedTree;
22299 }))) {
22300 ;
22301 } else if (isGuaranteedNotToBePoison(V: Res, AC) ||
22302 (It1 != ReducedValsToOps.end() &&
22303 any_of(Range&: It1->getSecond(), P: [&](Instruction *I) {
22304 return isBoolLogicOp(I) && getRdxOperand(I, Index: 0) == Res;
22305 }))) {
22306 std::swap(a&: VectorizedTree, b&: Res);
22307 } else {
22308 VectorizedTree = Builder.CreateFreeze(V: VectorizedTree);
22309 }
22310 }
22311
22312 return createOp(Builder, RdxKind, LHS: VectorizedTree, RHS: Res, Name: "op.rdx",
22313 ReductionOps);
22314 }
22315 // Initialize the final value in the reduction.
22316 return Res;
22317 };
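    // Sketch of what the lambda above may emit for a boolean logic reduction
    // (assumed IR, illustrative only): when neither the running value nor the
    // new result is known to be non-poison, the running value is frozen
    // before being combined, e.g.
    //   %fr     = freeze i1 %vectorized.tree
    //   %op.rdx = select i1 %fr, i1 %res, i1 false   ; logical and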
22318 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
22319 ReductionOps.front().size());
22320 for (ReductionOpsType &RdxOps : ReductionOps)
22321 for (Value *RdxOp : RdxOps) {
22322 if (!RdxOp)
22323 continue;
22324 IgnoreList.insert(V: RdxOp);
22325 }
22326 // Intersect the fast-math-flags from all reduction operations.
22327 FastMathFlags RdxFMF;
22328 RdxFMF.set();
22329 for (Value *U : IgnoreList)
22330 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: U))
22331 RdxFMF &= FPMO->getFastMathFlags();
22332 bool IsCmpSelMinMax = isCmpSelMinMax(I: cast<Instruction>(Val&: ReductionRoot));
22333
22334 // Need to track reduced vals, they may be changed during vectorization of
22335 // subvectors.
22336 for (ArrayRef<Value *> Candidates : ReducedVals)
22337 for (Value *V : Candidates)
22338 TrackedVals.try_emplace(Key: V, Args&: V);
22339
22340 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
22341 Value *V) -> unsigned & {
22342 auto *It = MV.find(Key: V);
22343 assert(It != MV.end() && "Unable to find given key.");
22344 return It->second;
22345 };
22346
22347 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
22348     // List of the values that were reduced in other trees as part of gather
22349     // nodes and thus require an extract if fully vectorized in other trees.
22350 SmallPtrSet<Value *, 4> RequiredExtract;
22351 WeakTrackingVH VectorizedTree = nullptr;
22352 bool CheckForReusedReductionOps = false;
22353 // Try to vectorize elements based on their type.
22354 SmallVector<InstructionsState> States;
22355 for (ArrayRef<Value *> RV : ReducedVals)
22356 States.push_back(Elt: getSameOpcode(VL: RV, TLI));
22357 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
22358 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
22359 InstructionsState S = States[I];
22360 SmallVector<Value *> Candidates;
22361 Candidates.reserve(N: 2 * OrigReducedVals.size());
22362 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
22363 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
22364 Value *RdxVal = TrackedVals.at(Val: OrigReducedVals[Cnt]);
22365         // Check whether the reduction value was overridden by an extractelement
22366         // instruction because of the vectorization, and exclude it if it is not
22367         // compatible with the other values.
22368         // Also check if the instruction was folded to a constant/other value.
22369 auto *Inst = dyn_cast<Instruction>(Val: RdxVal);
22370 if ((Inst && isVectorLikeInstWithConstOps(V: Inst) &&
22371 (!S || !S.getMatchingMainOpOrAltOp(I: Inst))) ||
22372 (S && !Inst))
22373 continue;
22374 Candidates.push_back(Elt: RdxVal);
22375 TrackedToOrig.try_emplace(Key: RdxVal, Args: OrigReducedVals[Cnt]);
22376 }
22377 bool ShuffledExtracts = false;
22378 // Try to handle shuffled extractelements.
22379 if (S && S.getOpcode() == Instruction::ExtractElement &&
22380 !S.isAltShuffle() && I + 1 < E) {
22381 SmallVector<Value *> CommonCandidates(Candidates);
22382 for (Value *RV : ReducedVals[I + 1]) {
22383 Value *RdxVal = TrackedVals.at(Val: RV);
22384           // Check whether the reduction value was overridden by the
22385           // extractelement instruction because of the vectorization, and
22386           // exclude it if it is not compatible with other values.
22387 auto *Inst = dyn_cast<ExtractElementInst>(Val: RdxVal);
22388 if (!Inst)
22389 continue;
22390 CommonCandidates.push_back(Elt: RdxVal);
22391 TrackedToOrig.try_emplace(Key: RdxVal, Args&: RV);
22392 }
22393 SmallVector<int> Mask;
22394 if (isFixedVectorShuffle(VL: CommonCandidates, Mask, AC)) {
22395 ++I;
22396 Candidates.swap(RHS&: CommonCandidates);
22397 ShuffledExtracts = true;
22398 }
22399 }
22400
22401 // Emit code for constant values.
22402 if (Candidates.size() > 1 && allConstant(VL: Candidates)) {
22403 Value *Res = Candidates.front();
22404 Value *OrigV = TrackedToOrig.at(Val: Candidates.front());
22405 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
22406 for (Value *VC : ArrayRef(Candidates).drop_front()) {
22407 Res = createOp(Builder, RdxKind, LHS: Res, RHS: VC, Name: "const.rdx", ReductionOps);
22408 Value *OrigV = TrackedToOrig.at(Val: VC);
22409 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
22410 if (auto *ResI = dyn_cast<Instruction>(Val: Res))
22411 V.analyzedReductionRoot(I: ResI);
22412 }
22413 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
22414 continue;
22415 }
22416
22417 unsigned NumReducedVals = Candidates.size();
22418 if (NumReducedVals < ReductionLimit &&
22419 (NumReducedVals < 2 || !isSplat(VL: Candidates)))
22420 continue;
22421
22422 // Check if we support repeated scalar values processing (optimization of
22423 // original scalar identity operations on matched horizontal reductions).
22424 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
22425 RdxKind != RecurKind::FMul &&
22426 RdxKind != RecurKind::FMulAdd;
22427 // Gather same values.
22428 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
22429 if (IsSupportedHorRdxIdentityOp)
22430 for (Value *V : Candidates) {
22431 Value *OrigV = TrackedToOrig.at(Val: V);
22432 ++SameValuesCounter.try_emplace(Key: OrigV).first->second;
22433 }
22434       // Used to check whether the reduced values are used the same number of
22435       // times. In that case the compiler may produce better code. E.g. if the
22436       // reduced values are aabbccdd (8 values), then the first node of the tree
22437       // will be a node for 4 x abcd plus a shuffle <4 x abcd>,
22438       // <0, 0, 1, 1, 2, 2, 3, 3>, and the final reduction will be performed on
22439       // <8 x aabbccdd>. Instead, the compiler may build the <4 x abcd> tree
22440       // immediately and emit reduction(4 x abcd) * 2.
22441       // Currently this only handles add/fadd/xor; and/or/min/max do not require
22442       // this analysis, while other operations may require an extra estimation
22443       // of the profitability.
22444 bool SameScaleFactor = false;
22445 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
22446 SameValuesCounter.size() != Candidates.size();
22447 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
22448 if (OptReusedScalars) {
22449 SameScaleFactor =
22450 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
22451 RdxKind == RecurKind::Xor) &&
22452 all_of(Range: drop_begin(RangeOrContainer&: SameValuesCounter),
22453 P: [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
22454 return P.second == SameValuesCounter.front().second;
22455 });
22456 Candidates.resize(N: SameValuesCounter.size());
22457 transform(Range&: SameValuesCounter, d_first: Candidates.begin(),
22458 F: [&](const auto &P) { return TrackedVals.at(Val: P.first); });
22459 NumReducedVals = Candidates.size();
22460 // Have a reduction of the same element.
22461 if (NumReducedVals == 1) {
22462 Value *OrigV = TrackedToOrig.at(Val: Candidates.front());
22463 unsigned Cnt = At(SameValuesCounter, OrigV);
22464 Value *RedVal =
22465 emitScaleForReusedOps(VectorizedValue: Candidates.front(), Builder, Cnt);
22466 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
22467 VectorizedVals.try_emplace(Key: OrigV, Args&: Cnt);
22468 ExternallyUsedValues.insert(V: OrigV);
22469 continue;
22470 }
22471 }
22472
22473 unsigned MaxVecRegSize = V.getMaxVecRegSize();
22474 unsigned EltSize = V.getVectorElementSize(V: Candidates[0]);
22475 const unsigned MaxElts = std::clamp<unsigned>(
22476 val: llvm::bit_floor(Value: MaxVecRegSize / EltSize), lo: RedValsMaxNumber,
22477 hi: RegMaxNumber * RedValsMaxNumber);
22478
22479 unsigned ReduxWidth = NumReducedVals;
22480 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
22481 unsigned NumParts, NumRegs;
22482 Type *ScalarTy = Candidates.front()->getType();
22483 ReduxWidth =
22484 getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: ReduxWidth);
22485 VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth);
22486 NumParts = ::getNumberOfParts(TTI, VecTy: Tp);
22487 NumRegs =
22488 TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp));
22489 while (NumParts > NumRegs) {
22490 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
22491 ReduxWidth = bit_floor(Value: ReduxWidth - 1);
22492 VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth);
22493 NumParts = ::getNumberOfParts(TTI, VecTy: Tp);
22494 NumRegs =
22495 TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp));
22496 }
22497 if (NumParts > NumRegs / 2)
22498 ReduxWidth = bit_floor(Value: ReduxWidth);
22499 return ReduxWidth;
22500 };
22501 if (!VectorizeNonPowerOf2 || !has_single_bit(Value: ReduxWidth + 1))
22502 ReduxWidth = GetVectorFactor(ReduxWidth);
22503 ReduxWidth = std::min(a: ReduxWidth, b: MaxElts);
22504
22505 unsigned Start = 0;
22506 unsigned Pos = Start;
22507 // Restarts vectorization attempt with lower vector factor.
22508 unsigned PrevReduxWidth = ReduxWidth;
22509 bool CheckForReusedReductionOpsLocal = false;
22510 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
22511 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(Vals: IgnoreList);
22512 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
22513           // Check if any of the reduction ops are gathered. If so, it is worth
22514           // trying again with a smaller number of reduction ops.
22515 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
22516 }
22517 ++Pos;
22518 if (Pos < NumReducedVals - ReduxWidth + 1)
22519 return IsAnyRedOpGathered;
22520 Pos = Start;
22521 --ReduxWidth;
22522 if (ReduxWidth > 1)
22523 ReduxWidth = GetVectorFactor(ReduxWidth);
22524 return IsAnyRedOpGathered;
22525 };
22526 bool AnyVectorized = false;
22527 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
22528 while (Pos < NumReducedVals - ReduxWidth + 1 &&
22529 ReduxWidth >= ReductionLimit) {
22530 // Dependency in tree of the reduction ops - drop this attempt, try
22531 // later.
22532 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
22533 Start == 0) {
22534 CheckForReusedReductionOps = true;
22535 break;
22536 }
22537 PrevReduxWidth = ReduxWidth;
22538 ArrayRef<Value *> VL(std::next(x: Candidates.begin(), n: Pos), ReduxWidth);
22539 // Been analyzed already - skip.
22540 if (IgnoredCandidates.contains(V: std::make_pair(x&: Pos, y&: ReduxWidth)) ||
22541 (!has_single_bit(Value: ReduxWidth) &&
22542 (IgnoredCandidates.contains(
22543 V: std::make_pair(x&: Pos, y: bit_floor(Value: ReduxWidth))) ||
22544 IgnoredCandidates.contains(
22545 V: std::make_pair(x: Pos + (ReduxWidth - bit_floor(Value: ReduxWidth)),
22546 y: bit_floor(Value: ReduxWidth))))) ||
22547 V.areAnalyzedReductionVals(VL)) {
22548 (void)AdjustReducedVals(/*IgnoreVL=*/true);
22549 continue;
22550 }
22551 // Early exit if any of the reduction values were deleted during
22552 // previous vectorization attempts.
22553 if (any_of(Range&: VL, P: [&V](Value *RedVal) {
22554 auto *RedValI = dyn_cast<Instruction>(Val: RedVal);
22555 if (!RedValI)
22556 return false;
22557 return V.isDeleted(I: RedValI);
22558 }))
22559 break;
22560 V.buildTree(Roots: VL, UserIgnoreLst: IgnoreList);
22561 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
22562 if (!AdjustReducedVals())
22563 V.analyzedReductionVals(VL);
22564 continue;
22565 }
22566 if (V.isLoadCombineReductionCandidate(RdxKind)) {
22567 if (!AdjustReducedVals())
22568 V.analyzedReductionVals(VL);
22569 continue;
22570 }
22571 V.reorderTopToBottom();
22572 // No need to reorder the root node at all.
22573 V.reorderBottomToTop(/*IgnoreReorder=*/true);
22574         // Keep other extracted reduction values, if they are used in the
22575         // vectorization trees.
22576 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
22577 ExternallyUsedValues);
22578 // The reduction root is used as the insertion point for new
22579 // instructions, so set it as externally used to prevent it from being
22580 // deleted.
22581 LocalExternallyUsedValues.insert(V: ReductionRoot);
22582 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
22583 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
22584 continue;
22585 for (Value *V : ReducedVals[Cnt])
22586 if (isa<Instruction>(Val: V))
22587 LocalExternallyUsedValues.insert(V: TrackedVals[V]);
22588 }
22589 if (!IsSupportedHorRdxIdentityOp) {
22590 // Number of uses of the candidates in the vector of values.
22591 assert(SameValuesCounter.empty() &&
22592 "Reused values counter map is not empty");
22593 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
22594 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
22595 continue;
22596 Value *V = Candidates[Cnt];
22597 Value *OrigV = TrackedToOrig.at(Val: V);
22598 ++SameValuesCounter.try_emplace(Key: OrigV).first->second;
22599 }
22600 }
22601 V.transformNodes();
22602 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
22603 // Gather externally used values.
22604 SmallPtrSet<Value *, 4> Visited;
22605 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
22606 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
22607 continue;
22608 Value *RdxVal = Candidates[Cnt];
22609 if (auto It = TrackedVals.find(Val: RdxVal); It != TrackedVals.end())
22610 RdxVal = It->second;
22611 if (!Visited.insert(Ptr: RdxVal).second)
22612 continue;
22613 // Check if the scalar was vectorized as part of the vectorization
22614 // tree but not the top node.
22615 if (!VLScalars.contains(Ptr: RdxVal) && V.isVectorized(V: RdxVal)) {
22616 LocalExternallyUsedValues.insert(V: RdxVal);
22617 continue;
22618 }
22619 Value *OrigV = TrackedToOrig.at(Val: RdxVal);
22620 unsigned NumOps =
22621 VectorizedVals.lookup(Val: OrigV) + At(SameValuesCounter, OrigV);
22622 if (NumOps != ReducedValsToOps.at(Val: OrigV).size())
22623 LocalExternallyUsedValues.insert(V: RdxVal);
22624 }
22625 // Do not need the list of reused scalars in regular mode anymore.
22626 if (!IsSupportedHorRdxIdentityOp)
22627 SameValuesCounter.clear();
22628 for (Value *RdxVal : VL)
22629 if (RequiredExtract.contains(Ptr: RdxVal))
22630 LocalExternallyUsedValues.insert(V: RdxVal);
22631 V.buildExternalUses(ExternallyUsedValues: LocalExternallyUsedValues);
22632
22633 V.computeMinimumValueSizes();
22634
22635 // Estimate cost.
22636 InstructionCost ReductionCost =
22637 getReductionCost(TTI, ReducedVals: VL, IsCmpSelMinMax, FMF: RdxFMF, R: V);
22638 InstructionCost Cost = V.getTreeCost(VectorizedVals: VL, ReductionCost);
22639 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
22640 << " for reduction\n");
22641 if (!Cost.isValid())
22642 break;
22643 if (Cost >= -SLPCostThreshold) {
22644 V.getORE()->emit(RemarkBuilder: [&]() {
22645 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
22646 ReducedValsToOps.at(Val: VL[0]).front())
22647 << "Vectorizing horizontal reduction is possible "
22648 << "but not beneficial with cost " << ore::NV("Cost", Cost)
22649 << " and threshold "
22650 << ore::NV("Threshold", -SLPCostThreshold);
22651 });
22652 if (!AdjustReducedVals()) {
22653 V.analyzedReductionVals(VL);
22654 unsigned Offset = Pos == Start ? Pos : Pos - 1;
22655 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
22656 // Add subvectors of VL to the list of the analyzed values.
22657 for (unsigned VF = getFloorFullVectorNumberOfElements(
22658 TTI: *TTI, Ty: VL.front()->getType(), Sz: ReduxWidth - 1);
22659 VF >= ReductionLimit;
22660 VF = getFloorFullVectorNumberOfElements(
22661 TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) {
22662 if (has_single_bit(Value: VF) &&
22663 V.getCanonicalGraphSize() != V.getTreeSize())
22664 continue;
22665 for (unsigned Idx : seq<unsigned>(Size: ReduxWidth - VF))
22666 IgnoredCandidates.insert(V: std::make_pair(x: Offset + Idx, y&: VF));
22667 }
22668 }
22669 }
22670 continue;
22671 }
22672
22673 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
22674 << Cost << ". (HorRdx)\n");
22675 V.getORE()->emit(RemarkBuilder: [&]() {
22676 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
22677 ReducedValsToOps.at(Val: VL[0]).front())
22678 << "Vectorized horizontal reduction with cost "
22679 << ore::NV("Cost", Cost) << " and with tree size "
22680 << ore::NV("TreeSize", V.getTreeSize());
22681 });
22682
22683 Builder.setFastMathFlags(RdxFMF);
22684
22685 // Emit a reduction. If the root is a select (min/max idiom), the insert
22686 // point is the compare condition of that select.
22687 Instruction *RdxRootInst = cast<Instruction>(Val&: ReductionRoot);
22688 Instruction *InsertPt = RdxRootInst;
22689 if (IsCmpSelMinMax)
22690 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
22691
22692 // Vectorize a tree.
22693 Value *VectorizedRoot = V.vectorizeTree(
22694 ExternallyUsedValues: LocalExternallyUsedValues, ReductionRoot: InsertPt, VectorValuesAndScales);
22695 // Update TrackedToOrig mapping, since the tracked values might be
22696 // updated.
22697 for (Value *RdxVal : Candidates) {
22698 Value *OrigVal = TrackedToOrig.at(Val: RdxVal);
22699 Value *TransformedRdxVal = TrackedVals.at(Val: OrigVal);
22700 if (TransformedRdxVal != RdxVal)
22701 TrackedToOrig.try_emplace(Key: TransformedRdxVal, Args&: OrigVal);
22702 }
22703
22704 Builder.SetInsertPoint(InsertPt);
22705
22706 // To prevent poison from leaking across what used to be sequential,
22707 // safe, scalar boolean logic operations, the reduction operand must be
22708 // frozen.
22709 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(V: VectorizedRoot, AC))
22710 VectorizedRoot = Builder.CreateFreeze(V: VectorizedRoot);
22711
22712 // Emit code to correctly handle reused reduced values, if required.
22713 if (OptReusedScalars && !SameScaleFactor) {
22714 VectorizedRoot = emitReusedOps(VectorizedValue: VectorizedRoot, Builder, R&: V,
22715 SameValuesCounter, TrackedToOrig);
22716 }
22717
22718 Type *ScalarTy = VL.front()->getType();
22719 Type *VecTy = VectorizedRoot->getType();
22720 Type *RedScalarTy = VecTy->getScalarType();
22721 VectorValuesAndScales.emplace_back(
22722 Args&: VectorizedRoot,
22723 Args: OptReusedScalars && SameScaleFactor
22724 ? SameValuesCounter.front().second
22725 : 1,
22726 Args: RedScalarTy != ScalarTy->getScalarType()
22727 ? V.isSignedMinBitwidthRootNode()
22728 : true);
22729
22730 // Count vectorized reduced values to exclude them from final reduction.
22731 for (Value *RdxVal : VL) {
22732 Value *OrigV = TrackedToOrig.at(Val: RdxVal);
22733 if (IsSupportedHorRdxIdentityOp) {
22734 VectorizedVals.try_emplace(Key: OrigV, Args&: At(SameValuesCounter, OrigV));
22735 continue;
22736 }
22737 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
22738 if (!V.isVectorized(V: RdxVal))
22739 RequiredExtract.insert(Ptr: RdxVal);
22740 }
22741 Pos += ReduxWidth;
22742 Start = Pos;
22743 ReduxWidth = NumReducedVals - Pos;
22744 if (ReduxWidth > 1)
22745 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
22746 AnyVectorized = true;
22747 }
22748 if (OptReusedScalars && !AnyVectorized) {
22749 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
22750 Value *RdxVal = TrackedVals.at(Val: P.first);
22751 Value *RedVal = emitScaleForReusedOps(VectorizedValue: RdxVal, Builder, Cnt: P.second);
22752 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
22753 VectorizedVals.try_emplace(Key: P.first, Args: P.second);
22754 }
22755 continue;
22756 }
22757 }
22758 if (!VectorValuesAndScales.empty())
22759 VectorizedTree = GetNewVectorizedTree(
22760 VectorizedTree,
22761 emitReduction(Builder, TTI: *TTI, DestTy: ReductionRoot->getType()));
22762 if (VectorizedTree) {
22763 // Reorder operands of bool logical op in the natural order to avoid
22764 // possible problem with poison propagation. If not possible to reorder
22765 // (both operands are originally RHS), emit an extra freeze instruction
22766 // for the LHS operand.
22767 // I.e., if we have original code like this:
22768 // RedOp1 = select i1 ?, i1 LHS, i1 false
22769 // RedOp2 = select i1 RHS, i1 ?, i1 false
22770
22771 // Then, we swap LHS/RHS to create a new op that matches the poison
22772 // semantics of the original code.
22773
22774 // If we have original code like this and both values could be poison:
22775 // RedOp1 = select i1 ?, i1 LHS, i1 false
22776 // RedOp2 = select i1 ?, i1 RHS, i1 false
22777
22778 // Then, we must freeze LHS in the new op.
22779 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
22780 Instruction *RedOp1,
22781 Instruction *RedOp2,
22782 bool InitStep) {
22783 if (!AnyBoolLogicOp)
22784 return;
22785 if (isBoolLogicOp(I: RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
22786 getRdxOperand(I: RedOp1, Index: 0) == LHS ||
22787 isGuaranteedNotToBePoison(V: LHS, AC)))
22788 return;
22789 if (isBoolLogicOp(I: RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
22790 getRdxOperand(I: RedOp2, Index: 0) == RHS ||
22791 isGuaranteedNotToBePoison(V: RHS, AC))) {
22792 std::swap(a&: LHS, b&: RHS);
22793 return;
22794 }
22795 if (LHS != VectorizedTree)
22796 LHS = Builder.CreateFreeze(V: LHS);
22797 };
22798 // Finish the reduction.
22799 // Need to add extra arguments and not vectorized possible reduction
22800 // values.
22801 // Try to avoid dependencies between the scalar remainders after
22802 // reductions.
22803 auto FinalGen =
22804 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
22805 bool InitStep) {
22806 unsigned Sz = InstVals.size();
22807 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
22808 Sz % 2);
22809 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
22810 Instruction *RedOp = InstVals[I + 1].first;
22811 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
22812 Value *RdxVal1 = InstVals[I].second;
22813 Value *StableRdxVal1 = RdxVal1;
22814 auto It1 = TrackedVals.find(Val: RdxVal1);
22815 if (It1 != TrackedVals.end())
22816 StableRdxVal1 = It1->second;
22817 Value *RdxVal2 = InstVals[I + 1].second;
22818 Value *StableRdxVal2 = RdxVal2;
22819 auto It2 = TrackedVals.find(Val: RdxVal2);
22820 if (It2 != TrackedVals.end())
22821 StableRdxVal2 = It2->second;
22822 // To prevent poison from leaking across what used to be
22823 // sequential, safe, scalar boolean logic operations, the
22824 // reduction operand must be frozen.
22825 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
22826 RedOp, InitStep);
22827 Value *ExtraRed = createOp(Builder, RdxKind, LHS: StableRdxVal1,
22828 RHS: StableRdxVal2, Name: "op.rdx", ReductionOps);
22829 ExtraReds[I / 2] = std::make_pair(x: InstVals[I].first, y&: ExtraRed);
22830 }
22831 if (Sz % 2 == 1)
22832 ExtraReds[Sz / 2] = InstVals.back();
22833 return ExtraReds;
22834 };
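      // For example (purely illustrative): with five remaining (instruction,
      // value) pairs [P0, P1, P2, P3, P4] a single FinalGen pass produces
      // three entries: op.rdx(P0, P1), op.rdx(P2, P3), and P4 carried over
      // unchanged; the loop below repeats this until a single value is left.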
22835 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
22836 ExtraReductions.emplace_back(Args: cast<Instruction>(Val&: ReductionRoot),
22837 Args&: VectorizedTree);
22838 SmallPtrSet<Value *, 8> Visited;
22839 for (ArrayRef<Value *> Candidates : ReducedVals) {
22840 for (Value *RdxVal : Candidates) {
22841 if (!Visited.insert(Ptr: RdxVal).second)
22842 continue;
22843 unsigned NumOps = VectorizedVals.lookup(Val: RdxVal);
22844 for (Instruction *RedOp :
22845 ArrayRef(ReducedValsToOps.at(Val: RdxVal)).drop_back(N: NumOps))
22846 ExtraReductions.emplace_back(Args&: RedOp, Args&: RdxVal);
22847 }
22848 }
22849 // Iterate through all not-vectorized reduction values/extra arguments.
22850 bool InitStep = true;
22851 while (ExtraReductions.size() > 1) {
22852 SmallVector<std::pair<Instruction *, Value *>> NewReds =
22853 FinalGen(ExtraReductions, InitStep);
22854 ExtraReductions.swap(RHS&: NewReds);
22855 InitStep = false;
22856 }
22857 VectorizedTree = ExtraReductions.front().second;
22858
22859 ReductionRoot->replaceAllUsesWith(V: VectorizedTree);
22860
22861       // The original scalar reduction is expected to have no remaining
22862       // uses outside the reduction tree itself. Assert that we got this
22863       // correct, replace internal uses with poison, and mark for eventual
22864       // deletion.
22865#ifndef NDEBUG
22866 SmallSet<Value *, 4> IgnoreSet;
22867 for (ArrayRef<Value *> RdxOps : ReductionOps)
22868 IgnoreSet.insert_range(RdxOps);
22869#endif
22870 for (ArrayRef<Value *> RdxOps : ReductionOps) {
22871 for (Value *Ignore : RdxOps) {
22872 if (!Ignore)
22873 continue;
22874#ifndef NDEBUG
22875 for (auto *U : Ignore->users()) {
22876 assert(IgnoreSet.count(U) &&
22877                    "All users must be in the reduction ops list.");
22878 }
22879#endif
22880 if (!Ignore->use_empty()) {
22881 Value *P = PoisonValue::get(T: Ignore->getType());
22882 Ignore->replaceAllUsesWith(V: P);
22883 }
22884 }
22885 V.removeInstructionsAndOperands(DeadVals: RdxOps, VectorValuesAndScales);
22886 }
22887 } else if (!CheckForReusedReductionOps) {
22888 for (ReductionOpsType &RdxOps : ReductionOps)
22889 for (Value *RdxOp : RdxOps)
22890 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
22891 }
22892 return VectorizedTree;
22893 }
22894
22895private:
22896 /// Creates the reduction from the given \p Vec vector value with the given
22897 /// scale \p Scale and signedness \p IsSigned.
22898 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
22899 Value *Vec, unsigned Scale, bool IsSigned,
22900 Type *DestTy) {
22901 Value *Rdx;
22902 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DestTy)) {
22903 unsigned DestTyNumElements = getNumElements(Ty: VecTy);
22904 unsigned VF = getNumElements(Ty: Vec->getType()) / DestTyNumElements;
22905 Rdx = PoisonValue::get(
22906 T: getWidenedType(ScalarTy: Vec->getType()->getScalarType(), VF: DestTyNumElements));
22907 for (unsigned I : seq<unsigned>(Size: DestTyNumElements)) {
22908 // Do reduction for each lane.
22909 // e.g., do reduce add for
22910 // VL[0] = <4 x Ty> <a, b, c, d>
22911 // VL[1] = <4 x Ty> <e, f, g, h>
22912 // Lane[0] = <2 x Ty> <a, e>
22913 // Lane[1] = <2 x Ty> <b, f>
22914 // Lane[2] = <2 x Ty> <c, g>
22915 // Lane[3] = <2 x Ty> <d, h>
22916 // result[0] = reduce add Lane[0]
22917 // result[1] = reduce add Lane[1]
22918 // result[2] = reduce add Lane[2]
22919 // result[3] = reduce add Lane[3]
22920 SmallVector<int, 16> Mask = createStrideMask(Start: I, Stride: DestTyNumElements, VF);
22921 Value *Lane = Builder.CreateShuffleVector(V: Vec, Mask);
22922 Rdx = Builder.CreateInsertElement(
22923 Vec: Rdx, NewElt: emitReduction(VectorizedValue: Lane, Builder, TTI: &TTI, DestTy), Idx: I);
22924 }
22925 } else {
22926 Rdx = emitReduction(VectorizedValue: Vec, Builder, TTI: &TTI, DestTy);
22927 }
22928 if (Rdx->getType() != DestTy)
22929 Rdx = Builder.CreateIntCast(V: Rdx, DestTy, isSigned: IsSigned);
22930 // Improved analysis for add/fadd/xor reductions with same scale
22931 // factor for all operands of reductions. We can emit scalar ops for
22932 // them instead.
22933 if (Scale > 1)
22934 Rdx = emitScaleForReusedOps(VectorizedValue: Rdx, Builder, Cnt: Scale);
22935 return Rdx;
22936 }
22937
22938 /// Calculate the cost of a reduction.
22939 InstructionCost getReductionCost(TargetTransformInfo *TTI,
22940 ArrayRef<Value *> ReducedVals,
22941 bool IsCmpSelMinMax, FastMathFlags FMF,
22942 const BoUpSLP &R) {
22943 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
22944 Type *ScalarTy = ReducedVals.front()->getType();
22945 unsigned ReduxWidth = ReducedVals.size();
22946 FixedVectorType *VectorTy = R.getReductionType();
22947 InstructionCost VectorCost = 0, ScalarCost;
22948     // If all of the reduced values are constant, the vector cost is 0, since
22949     // the reduction value can be calculated at compile time.
22950 bool AllConsts = allConstant(VL: ReducedVals);
22951 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
22952 InstructionCost Cost = 0;
22953 // Scalar cost is repeated for N-1 elements.
22954 int Cnt = ReducedVals.size();
22955 for (Value *RdxVal : ReducedVals) {
22956 if (Cnt == 1)
22957 break;
22958 --Cnt;
22959 if (RdxVal->hasNUsesOrMore(N: IsCmpSelMinMax ? 3 : 2)) {
22960 Cost += GenCostFn();
22961 continue;
22962 }
22963 InstructionCost ScalarCost = 0;
22964 for (User *U : RdxVal->users()) {
22965 auto *RdxOp = cast<Instruction>(Val: U);
22966 if (hasRequiredNumberOfUses(IsCmpSelMinMax, I: RdxOp)) {
22967 ScalarCost += TTI->getInstructionCost(U: RdxOp, CostKind);
22968 continue;
22969 }
22970 ScalarCost = InstructionCost::getInvalid();
22971 break;
22972 }
22973 if (ScalarCost.isValid())
22974 Cost += ScalarCost;
22975 else
22976 Cost += GenCostFn();
22977 }
22978 return Cost;
22979 };
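    // Worked example with made-up costs: for four reduced values, each with a
    // single reduction-op use and a scalar op cost of 1, the lambda above
    // counts only N - 1 = 3 scalar ops (the last value is skipped), so
    // ScalarCost = 3; if the vector reduction cost were 2, this function
    // would return 2 - 3 = -1, i.e. the vector form looks profitable.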
22980     // Require the reduction cost if:
22981     // 1. This type is not a full register type and there is no other vector
22982     // with the same type in the storage (first vector with a small type).
22983     // 2. The storage does not have any vector with full vector use (first
22984     // vector with full register use).
22985 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
22986 switch (RdxKind) {
22987 case RecurKind::Add:
22988 case RecurKind::Mul:
22989 case RecurKind::Or:
22990 case RecurKind::And:
22991 case RecurKind::Xor:
22992 case RecurKind::FAdd:
22993 case RecurKind::FMul: {
22994 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
22995 if (!AllConsts) {
22996 if (DoesRequireReductionOp) {
22997 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
22998 assert(SLPReVec && "FixedVectorType is not expected.");
22999 unsigned ScalarTyNumElements = VecTy->getNumElements();
23000 for (unsigned I : seq<unsigned>(Size: ReducedVals.size())) {
23001 VectorCost += TTI->getShuffleCost(
23002 Kind: TTI::SK_PermuteSingleSrc,
23003 DstTy: FixedVectorType::get(ElementType: VecTy->getScalarType(),
23004 NumElts: ReducedVals.size()),
23005 SrcTy: VectorTy,
23006 Mask: createStrideMask(Start: I, Stride: ScalarTyNumElements, VF: ReducedVals.size()));
23007 VectorCost += TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VecTy,
23008 FMF, CostKind);
23009 }
23010 VectorCost += TTI->getScalarizationOverhead(
23011 Ty: VecTy, DemandedElts: APInt::getAllOnes(numBits: ScalarTyNumElements), /*Insert*/ true,
23012 /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
23013 } else {
23014 Type *RedTy = VectorTy->getElementType();
23015 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
23016 u: std::make_pair(x&: RedTy, y: true));
23017 if (RType == RedTy) {
23018 VectorCost = TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VectorTy,
23019 FMF, CostKind);
23020 } else {
23021 VectorCost = TTI->getExtendedReductionCost(
23022 Opcode: RdxOpcode, IsUnsigned: !IsSigned, ResTy: RedTy,
23023 Ty: getWidenedType(ScalarTy: RType, VF: ReduxWidth), FMF, CostKind);
23024 }
23025 }
23026 } else {
23027 Type *RedTy = VectorTy->getElementType();
23028 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
23029 u: std::make_pair(x&: RedTy, y: true));
23030 VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth);
23031 VectorCost +=
23032 TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: RVecTy, CostKind);
23033 if (RType != RedTy) {
23034 unsigned Opcode = Instruction::Trunc;
23035 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
23036 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
23037 VectorCost += TTI->getCastInstrCost(
23038 Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind);
23039 }
23040 }
23041 }
23042 ScalarCost = EvaluateScalarCost([&]() {
23043 return TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: ScalarTy, CostKind);
23044 });
23045 break;
23046 }
23047 case RecurKind::FMax:
23048 case RecurKind::FMin:
23049 case RecurKind::FMaximum:
23050 case RecurKind::FMinimum:
23051 case RecurKind::SMax:
23052 case RecurKind::SMin:
23053 case RecurKind::UMax:
23054 case RecurKind::UMin: {
23055 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
23056 if (!AllConsts) {
23057 if (DoesRequireReductionOp) {
23058 VectorCost = TTI->getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF, CostKind);
23059 } else {
23060         // Check if a previous reduction already exists and account for it as a
23061         // series of operations plus a single reduction.
23062 Type *RedTy = VectorTy->getElementType();
23063 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
23064 u: std::make_pair(x&: RedTy, y: true));
23065 VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth);
23066 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
23067 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
23068 if (RType != RedTy) {
23069 unsigned Opcode = Instruction::Trunc;
23070 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
23071 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
23072 VectorCost += TTI->getCastInstrCost(
23073 Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind);
23074 }
23075 }
23076 }
23077 ScalarCost = EvaluateScalarCost([&]() {
23078 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
23079 return TTI->getIntrinsicInstrCost(ICA, CostKind);
23080 });
23081 break;
23082 }
23083 default:
23084 llvm_unreachable("Expected arithmetic or min/max reduction operation");
23085 }
23086
23087 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
23088 << " for reduction of " << shortBundleName(ReducedVals)
23089 << " (It is a splitting reduction)\n");
23090 return VectorCost - ScalarCost;
23091 }
23092
23093 /// Splits the values, stored in VectorValuesAndScales, into registers/free
23094 /// sub-registers, combines them with the given reduction operation as a
23095 /// vector operation and then performs a single (small enough) reduction.
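/// For illustration only (a rough sketch of the resulting pattern, assuming an
/// integer add reduction whose values were split into two <4 x i32> parts):
///   %sum = add <4 x i32> %part0, %part1
///   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sum)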
23096 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
23097 Type *DestTy) {
23098 Value *ReducedSubTree = nullptr;
23099 // Creates reduction and combines with the previous reduction.
23100 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
23101 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
23102 if (ReducedSubTree)
23103 ReducedSubTree = createOp(Builder, RdxKind, LHS: ReducedSubTree, RHS: Rdx,
23104 Name: "op.rdx", ReductionOps);
23105 else
23106 ReducedSubTree = Rdx;
23107 };
23108 if (VectorValuesAndScales.size() == 1) {
23109 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
23110 CreateSingleOp(Vec, Scale, IsSigned);
23111 return ReducedSubTree;
23112 }
23113 // Scales Vec by the given Cnt scale factor and then combines it with the
23114 // previous value of VecRes.
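// For illustration (assuming RdxKind == RecurKind::Add and Cnt == 3), a part
// <a, b, c, d> is scaled to <3*a, 3*b, 3*c, 3*d> with a single vector multiply
// before being combined.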
23115 Value *VecRes = nullptr;
23116 bool VecResSignedness = false;
23117 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
23118 Type *ScalarTy = Vec->getType()->getScalarType();
23119 // Scale Vec using given Cnt scale factor.
23120 if (Cnt > 1) {
23121 ElementCount EC = cast<VectorType>(Val: Vec->getType())->getElementCount();
23122 switch (RdxKind) {
23123 case RecurKind::Add: {
23124 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
23125 unsigned VF = getNumElements(Ty: Vec->getType());
23126 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << " of " << Vec
23127 << ". (HorRdx)\n");
23128 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
23129 for (unsigned I : seq<unsigned>(Size: Cnt))
23130 std::iota(first: std::next(x: Mask.begin(), n: VF * I),
23131 last: std::next(x: Mask.begin(), n: VF * (I + 1)), value: 0);
23132 ++NumVectorInstructions;
23133 Vec = Builder.CreateShuffleVector(V: Vec, Mask);
23134 break;
23135 }
23136 // res = mul vv, n
23137 if (ScalarTy != DestTy->getScalarType())
23138 Vec = Builder.CreateIntCast(
23139 V: Vec, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: Vec->getType())),
23140 isSigned: IsSigned);
23141 Value *Scale = ConstantVector::getSplat(
23142 EC, Elt: ConstantInt::get(Ty: DestTy->getScalarType(), V: Cnt));
23143 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of " << Vec
23144 << ". (HorRdx)\n");
23145 ++NumVectorInstructions;
23146 Vec = Builder.CreateMul(LHS: Vec, RHS: Scale);
23147 break;
23148 }
23149 case RecurKind::Xor: {
23150 // res = n % 2 ? 0 : vv
23151 LLVM_DEBUG(dbgs()
23152 << "SLP: Xor " << Cnt << " of " << Vec << ". (HorRdx)\n");
23153 if (Cnt % 2 == 0)
23154 Vec = Constant::getNullValue(Ty: Vec->getType());
23155 break;
23156 }
23157 case RecurKind::FAdd: {
23158 // res = fmul v, n
23159 Value *Scale =
23160 ConstantVector::getSplat(EC, Elt: ConstantFP::get(Ty: ScalarTy, V: Cnt));
23161 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of " << Vec
23162 << ". (HorRdx)\n");
23163 ++NumVectorInstructions;
23164 Vec = Builder.CreateFMul(L: Vec, R: Scale);
23165 break;
23166 }
23167 case RecurKind::And:
23168 case RecurKind::Or:
23169 case RecurKind::SMax:
23170 case RecurKind::SMin:
23171 case RecurKind::UMax:
23172 case RecurKind::UMin:
23173 case RecurKind::FMax:
23174 case RecurKind::FMin:
23175 case RecurKind::FMaximum:
23176 case RecurKind::FMinimum:
23177 // res = vv
23178 break;
23179 case RecurKind::Mul:
23180 case RecurKind::FMul:
23181 case RecurKind::FMulAdd:
23182 case RecurKind::AnyOf:
23183 case RecurKind::FindFirstIVSMin:
23184 case RecurKind::FindLastIVSMax:
23185 case RecurKind::FindLastIVUMax:
23186 case RecurKind::FMaximumNum:
23187 case RecurKind::FMinimumNum:
23188 case RecurKind::None:
23189 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
23190 }
23191 }
23192 // Combine Vec with the previously accumulated VecRes.
23193 if (!VecRes) {
23194 VecRes = Vec;
23195 VecResSignedness = IsSigned;
23196 } else {
23197 ++NumVectorInstructions;
23198 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
23199 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
23200 // Handle ctpop.
23201 unsigned VecResVF = getNumElements(Ty: VecRes->getType());
23202 unsigned VecVF = getNumElements(Ty: Vec->getType());
23203 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
23204 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
23205 // Ensure that VecRes is always larger than Vec
23206 if (VecResVF < VecVF) {
23207 std::swap(a&: VecRes, b&: Vec);
23208 std::swap(a&: VecResVF, b&: VecVF);
23209 }
23210 if (VecResVF != VecVF) {
23211 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
23212 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
23213 Vec = Builder.CreateShuffleVector(V: Vec, Mask: ResizeMask);
23214 }
23215 VecRes = Builder.CreateShuffleVector(V1: VecRes, V2: Vec, Mask, Name: "rdx.op");
23216 return;
23217 }
23218 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
23219 VecRes = Builder.CreateIntCast(
23220 V: VecRes, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: VecRes->getType())),
23221 isSigned: VecResSignedness);
23222 if (ScalarTy != DestTy->getScalarType())
23223 Vec = Builder.CreateIntCast(
23224 V: Vec, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: Vec->getType())),
23225 isSigned: IsSigned);
23226 unsigned VecResVF = getNumElements(Ty: VecRes->getType());
23227 unsigned VecVF = getNumElements(Ty: Vec->getType());
23228 // Ensure that VecRes is always larger than Vec
23229 if (VecResVF < VecVF) {
23230 std::swap(a&: VecRes, b&: Vec);
23231 std::swap(a&: VecResVF, b&: VecVF);
23232 }
23233 // extract + op + insert
23234 Value *Op = VecRes;
23235 if (VecResVF != VecVF)
23236 Op = createExtractVector(Builder, Vec: VecRes, SubVecVF: VecVF, /*Index=*/0);
23237 Op = createOp(Builder, RdxKind, LHS: Op, RHS: Vec, Name: "rdx.op", ReductionOps);
23238 if (VecResVF != VecVF)
23239 Op = createInsertVector(Builder, Vec: VecRes, V: Op, /*Index=*/0);
23240 VecRes = Op;
23241 }
23242 };
23243 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
23244 CreateVecOp(Vec, Scale, IsSigned);
23245 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
23246
23247 return ReducedSubTree;
23248 }
23249
23250 /// Emit a horizontal reduction of the vectorized value.
23251 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
23252 const TargetTransformInfo *TTI, Type *DestTy) {
23253 assert(VectorizedValue && "Need to have a vectorized tree node");
23254 assert(RdxKind != RecurKind::FMulAdd &&
23255 "A call to the llvm.fmuladd intrinsic is not handled yet");
23256
23257 auto *FTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
23258 if (FTy->getScalarType() == Builder.getInt1Ty() &&
23259 RdxKind == RecurKind::Add &&
23260 DestTy->getScalarType() != FTy->getScalarType()) {
23261 // Convert vector_reduce_add(ZExt(<n x i1>)) to
23262 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
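// For example (illustrative values only), for an <8 x i1> input and an i32
// destination type, the overall pattern is:
//   %int = bitcast <8 x i1> %v to i8
//   %pop = call i8 @llvm.ctpop.i8(i8 %int)
//   %res = zext i8 %pop to i32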
23263 Value *V = Builder.CreateBitCast(
23264 V: VectorizedValue, DestTy: Builder.getIntNTy(N: FTy->getNumElements()));
23265 ++NumVectorInstructions;
23266 return Builder.CreateUnaryIntrinsic(ID: Intrinsic::ctpop, V);
23267 }
23268 ++NumVectorInstructions;
23269 return createSimpleReduction(B&: Builder, Src: VectorizedValue, RdxKind);
23270 }
23271
23272 /// Emits optimized code for a unique scalar value reused \p Cnt times.
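/// For example (illustrative), for an add reduction with \p Cnt == 3 this emits
/// a multiply of the value by 3; for a xor reduction with an even \p Cnt it
/// returns zero, since x xor x == 0.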
23273 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
23274 unsigned Cnt) {
23275 assert(IsSupportedHorRdxIdentityOp &&
23276 "The optimization of matched scalar identity horizontal reductions "
23277 "must be supported.");
23278 if (Cnt == 1)
23279 return VectorizedValue;
23280 switch (RdxKind) {
23281 case RecurKind::Add: {
23282 // res = mul vv, n
23283 Value *Scale = ConstantInt::get(Ty: VectorizedValue->getType(), V: Cnt);
23284 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
23285 << VectorizedValue << ". (HorRdx)\n");
23286 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
23287 }
23288 case RecurKind::Xor: {
23289 // res = n % 2 ? 0 : vv
23290 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
23291 << ". (HorRdx)\n");
23292 if (Cnt % 2 == 0)
23293 return Constant::getNullValue(Ty: VectorizedValue->getType());
23294 return VectorizedValue;
23295 }
23296 case RecurKind::FAdd: {
23297 // res = fmul v, n
23298 Value *Scale = ConstantFP::get(Ty: VectorizedValue->getType(), V: Cnt);
23299 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
23300 << VectorizedValue << ". (HorRdx)\n");
23301 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
23302 }
23303 case RecurKind::And:
23304 case RecurKind::Or:
23305 case RecurKind::SMax:
23306 case RecurKind::SMin:
23307 case RecurKind::UMax:
23308 case RecurKind::UMin:
23309 case RecurKind::FMax:
23310 case RecurKind::FMin:
23311 case RecurKind::FMaximum:
23312 case RecurKind::FMinimum:
23313 // res = vv
23314 return VectorizedValue;
23315 case RecurKind::Mul:
23316 case RecurKind::FMul:
23317 case RecurKind::FMulAdd:
23318 case RecurKind::AnyOf:
23319 case RecurKind::FindFirstIVSMin:
23320 case RecurKind::FindLastIVSMax:
23321 case RecurKind::FindLastIVUMax:
23322 case RecurKind::FMaximumNum:
23323 case RecurKind::FMinimumNum:
23324 case RecurKind::None:
23325 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
23326 }
23327 return nullptr;
23328 }
23329
23330 /// Emits the actual operation for the scalar identity values, found during
23331 /// horizontal reduction analysis.
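/// For example (illustrative), for an add reduction over the scalars {a, a, b}
/// the vectorized root <a, b> is multiplied by the constant vector <2, 1>.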
23332 Value *
23333 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
23334 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
23335 const DenseMap<Value *, Value *> &TrackedToOrig) {
23336 assert(IsSupportedHorRdxIdentityOp &&
23337 "The optimization of matched scalar identity horizontal reductions "
23338 "must be supported.");
23339 ArrayRef<Value *> VL = R.getRootNodeScalars();
23340 auto *VTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
23341 if (VTy->getElementType() != VL.front()->getType()) {
23342 VectorizedValue = Builder.CreateIntCast(
23343 V: VectorizedValue,
23344 DestTy: getWidenedType(ScalarTy: VL.front()->getType(), VF: VTy->getNumElements()),
23345 isSigned: R.isSignedMinBitwidthRootNode());
23346 }
23347 switch (RdxKind) {
23348 case RecurKind::Add: {
23349 // root = mul prev_root, <1, 1, n, 1>
23350 SmallVector<Constant *> Vals;
23351 for (Value *V : VL) {
23352 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
23353 Vals.push_back(Elt: ConstantInt::get(Ty: V->getType(), V: Cnt, /*IsSigned=*/false));
23354 }
23355 auto *Scale = ConstantVector::get(V: Vals);
23356 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
23357 << VectorizedValue << ". (HorRdx)\n");
23358 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
23359 }
23360 case RecurKind::And:
23361 case RecurKind::Or:
23362 // No need for multiple or/and(s).
23363 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
23364 << ". (HorRdx)\n");
23365 return VectorizedValue;
23366 case RecurKind::SMax:
23367 case RecurKind::SMin:
23368 case RecurKind::UMax:
23369 case RecurKind::UMin:
23370 case RecurKind::FMax:
23371 case RecurKind::FMin:
23372 case RecurKind::FMaximum:
23373 case RecurKind::FMinimum:
23374 // No need for multiple min/max(s) of the same value.
23375 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
23376 << ". (HorRdx)\n");
23377 return VectorizedValue;
23378 case RecurKind::Xor: {
23379 // Replace values that repeat an even number of times with 0, since
23380 // x xor x = 0.
23381 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
23382 // 7>, if the 4th and 6th elements have an even number of repeats.
23383 SmallVector<int> Mask(
23384 cast<FixedVectorType>(Val: VectorizedValue->getType())->getNumElements(),
23385 PoisonMaskElem);
23386 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
23387 bool NeedShuffle = false;
23388 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
23389 Value *V = VL[I];
23390 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
23391 if (Cnt % 2 == 0) {
23392 Mask[I] = VF;
23393 NeedShuffle = true;
23394 }
23395 }
23396 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
23397 : Mask) dbgs()
23398 << I << " ";
23399 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
23400 if (NeedShuffle)
23401 VectorizedValue = Builder.CreateShuffleVector(
23402 V1: VectorizedValue,
23403 V2: ConstantVector::getNullValue(Ty: VectorizedValue->getType()), Mask);
23404 return VectorizedValue;
23405 }
23406 case RecurKind::FAdd: {
23407 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
23408 SmallVector<Constant *> Vals;
23409 for (Value *V : VL) {
23410 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
23411 Vals.push_back(Elt: ConstantFP::get(Ty: V->getType(), V: Cnt));
23412 }
23413 auto *Scale = ConstantVector::get(V: Vals);
23414 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
23415 }
23416 case RecurKind::Mul:
23417 case RecurKind::FMul:
23418 case RecurKind::FMulAdd:
23419 case RecurKind::AnyOf:
23420 case RecurKind::FindFirstIVSMin:
23421 case RecurKind::FindLastIVSMax:
23422 case RecurKind::FindLastIVUMax:
23423 case RecurKind::FMaximumNum:
23424 case RecurKind::FMinimumNum:
23425 case RecurKind::None:
23426 llvm_unreachable("Unexpected reduction kind for reused scalars.");
23427 }
23428 return nullptr;
23429 }
23430};
23431} // end anonymous namespace
23432
23433/// Gets recurrence kind from the specified value.
23434static RecurKind getRdxKind(Value *V) {
23435 return HorizontalReduction::getRdxKind(V);
23436}
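/// \returns the total number of scalar elements in the aggregate built by
/// \p InsertInst (e.g. 4 for {<2 x float>, <2 x float>}), or std::nullopt if
/// the aggregate is not homogeneous.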
23437static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
23438 if (auto *IE = dyn_cast<InsertElementInst>(Val: InsertInst))
23439 return cast<FixedVectorType>(Val: IE->getType())->getNumElements();
23440
23441 unsigned AggregateSize = 1;
23442 auto *IV = cast<InsertValueInst>(Val: InsertInst);
23443 Type *CurrentType = IV->getType();
23444 do {
23445 if (auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
23446 for (auto *Elt : ST->elements())
23447 if (Elt != ST->getElementType(N: 0)) // check homogeneity
23448 return std::nullopt;
23449 AggregateSize *= ST->getNumElements();
23450 CurrentType = ST->getElementType(N: 0);
23451 } else if (auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
23452 AggregateSize *= AT->getNumElements();
23453 CurrentType = AT->getElementType();
23454 } else if (auto *VT = dyn_cast<FixedVectorType>(Val: CurrentType)) {
23455 AggregateSize *= VT->getNumElements();
23456 return AggregateSize;
23457 } else if (CurrentType->isSingleValueType()) {
23458 return AggregateSize;
23459 } else {
23460 return std::nullopt;
23461 }
23462 } while (true);
23463}
23464
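/// Recursively walks a chain of insertelement/insertvalue instructions ending
/// at \p LastInsertInst and records, at each flattened aggregate index, the
/// inserted scalar operand in \p BuildVectorOpds and the corresponding insert
/// instruction in \p InsertElts.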
23465static void findBuildAggregateRec(Instruction *LastInsertInst,
23466 TargetTransformInfo *TTI,
23467 SmallVectorImpl<Value *> &BuildVectorOpds,
23468 SmallVectorImpl<Value *> &InsertElts,
23469 unsigned OperandOffset, const BoUpSLP &R) {
23470 do {
23471 Value *InsertedOperand = LastInsertInst->getOperand(i: 1);
23472 std::optional<unsigned> OperandIndex =
23473 getElementIndex(Inst: LastInsertInst, Offset: OperandOffset);
23474 if (!OperandIndex || R.isDeleted(I: LastInsertInst))
23475 return;
23476 if (isa<InsertElementInst, InsertValueInst>(Val: InsertedOperand)) {
23477 findBuildAggregateRec(LastInsertInst: cast<Instruction>(Val: InsertedOperand), TTI,
23478 BuildVectorOpds, InsertElts, OperandOffset: *OperandIndex, R);
23479
23480 } else {
23481 BuildVectorOpds[*OperandIndex] = InsertedOperand;
23482 InsertElts[*OperandIndex] = LastInsertInst;
23483 }
23484 LastInsertInst = dyn_cast<Instruction>(Val: LastInsertInst->getOperand(i: 0));
23485 } while (LastInsertInst != nullptr &&
23486 isa<InsertValueInst, InsertElementInst>(Val: LastInsertInst) &&
23487 LastInsertInst->hasOneUse());
23488}
23489
23490/// Recognize construction of vectors like
23491/// %ra = insertelement <4 x float> poison, float %s0, i32 0
23492/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
23493/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
23494/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
23495/// starting from the last insertelement or insertvalue instruction.
23496///
23497/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
23498/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
23499/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
23500///
23501/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
23502///
23503/// \return true if it matches.
23504static bool findBuildAggregate(Instruction *LastInsertInst,
23505 TargetTransformInfo *TTI,
23506 SmallVectorImpl<Value *> &BuildVectorOpds,
23507 SmallVectorImpl<Value *> &InsertElts,
23508 const BoUpSLP &R) {
23509
23510 assert((isa<InsertElementInst>(LastInsertInst) ||
23511 isa<InsertValueInst>(LastInsertInst)) &&
23512 "Expected insertelement or insertvalue instruction!");
23513
23514 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
23515 "Expected empty result vectors!");
23516
23517 std::optional<unsigned> AggregateSize = getAggregateSize(InsertInst: LastInsertInst);
23518 if (!AggregateSize)
23519 return false;
23520 BuildVectorOpds.resize(N: *AggregateSize);
23521 InsertElts.resize(N: *AggregateSize);
23522
23523 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, OperandOffset: 0, R);
23524 llvm::erase(C&: BuildVectorOpds, V: nullptr);
23525 llvm::erase(C&: InsertElts, V: nullptr);
23526 if (BuildVectorOpds.size() >= 2)
23527 return true;
23528
23529 return false;
23530}
23531
23532/// Try and get a reduction instruction from a phi node.
23533///
23534/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
23535/// if they come from either \p ParentBB or a containing loop latch.
23536///
23537/// \returns A candidate reduction value if possible, or \code nullptr \endcode
23538/// if not possible.
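/// For example (illustrative), for a reduction phi in a single-block loop
///   %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
///   %sum.next = add i32 %sum, %x
/// the returned candidate is the %sum.next instruction.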
23539static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
23540 BasicBlock *ParentBB, LoopInfo *LI) {
23541 // There are situations where the reduction value is not dominated by the
23542 // reduction phi. Vectorizing such cases has been reported to cause
23543 // miscompiles. See PR25787.
23544 auto DominatedReduxValue = [&](Value *R) {
23545 return isa<Instruction>(Val: R) &&
23546 DT->dominates(A: P->getParent(), B: cast<Instruction>(Val: R)->getParent());
23547 };
23548
23549 Instruction *Rdx = nullptr;
23550
23551 // Return the incoming value if it comes from the same BB as the phi node.
23552 if (P->getIncomingBlock(i: 0) == ParentBB) {
23553 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
23554 } else if (P->getIncomingBlock(i: 1) == ParentBB) {
23555 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
23556 }
23557
23558 if (Rdx && DominatedReduxValue(Rdx))
23559 return Rdx;
23560
23561 // Otherwise, check whether we have a loop latch to look at.
23562 Loop *BBL = LI->getLoopFor(BB: ParentBB);
23563 if (!BBL)
23564 return nullptr;
23565 BasicBlock *BBLatch = BBL->getLoopLatch();
23566 if (!BBLatch)
23567 return nullptr;
23568
23569 // There is a loop latch, return the incoming value if it comes from
23570 // that. This reduction pattern occasionally turns up.
23571 if (P->getIncomingBlock(i: 0) == BBLatch) {
23572 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
23573 } else if (P->getIncomingBlock(i: 1) == BBLatch) {
23574 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
23575 }
23576
23577 if (Rdx && DominatedReduxValue(Rdx))
23578 return Rdx;
23579
23580 return nullptr;
23581}
23582
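/// Matches a reduction operation rooted at \p I (a binary operator or one of
/// the supported min/max intrinsic calls) and, on success, binds its two
/// operands to \p V0 and \p V1.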
23583static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
23584 if (match(V: I, P: m_BinOp(L: m_Value(V&: V0), R: m_Value(V&: V1))))
23585 return true;
23586 if (match(V: I, P: m_FMaxNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23587 return true;
23588 if (match(V: I, P: m_FMinNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23589 return true;
23590 if (match(V: I, P: m_FMaximum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23591 return true;
23592 if (match(V: I, P: m_FMinimum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23593 return true;
23594 if (match(V: I, P: m_Intrinsic<Intrinsic::smax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23595 return true;
23596 if (match(V: I, P: m_Intrinsic<Intrinsic::smin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23597 return true;
23598 if (match(V: I, P: m_Intrinsic<Intrinsic::umax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23599 return true;
23600 if (match(V: I, P: m_Intrinsic<Intrinsic::umin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23601 return true;
23602 return false;
23603}
23604
23605/// We could have an initial reduction that is not an add.
23606/// r *= v1 + v2 + v3 + v4
23607/// In such a case start looking for a tree rooted in the first '+'.
23608/// \returns the new root if found, which may be nullptr if not an instruction.
23609static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
23610 Instruction *Root) {
23611 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
23612 isa<IntrinsicInst>(Root)) &&
23613 "Expected binop, select, or intrinsic for reduction matching");
23614 Value *LHS =
23615 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root));
23616 Value *RHS =
23617 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root) + 1);
23618 if (LHS == Phi)
23619 return dyn_cast<Instruction>(Val: RHS);
23620 if (RHS == Phi)
23621 return dyn_cast<Instruction>(Val: LHS);
23622 return nullptr;
23623}
23624
23625/// \returns the first operand of \p I that does not match \p Phi. If the
23626/// operand is not an instruction, it returns nullptr.
23627static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
23628 Value *Op0 = nullptr;
23629 Value *Op1 = nullptr;
23630 if (!matchRdxBop(I, V0&: Op0, V1&: Op1))
23631 return nullptr;
23632 return dyn_cast<Instruction>(Val: Op0 == Phi ? Op1 : Op0);
23633}
23634
23635/// \returns true if \p I is a candidate instruction for reduction vectorization.
23636static bool isReductionCandidate(Instruction *I) {
23637 bool IsSelect = match(V: I, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value()));
23638 Value *B0 = nullptr, *B1 = nullptr;
23639 bool IsBinop = matchRdxBop(I, V0&: B0, V1&: B1);
23640 return IsBinop || IsSelect;
23641}
23642
23643bool SLPVectorizerPass::vectorizeHorReduction(
23644 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
23645 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
23646 if (!ShouldVectorizeHor)
23647 return false;
23648 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Val: Root);
23649
23650 if (Root->getParent() != BB || isa<PHINode>(Val: Root))
23651 return false;
23652
23653 // If we can find a secondary reduction root, use that instead.
23654 auto SelectRoot = [&]() {
23655 if (TryOperandsAsNewSeeds && isReductionCandidate(I: Root) &&
23656 HorizontalReduction::getRdxKind(V: Root) != RecurKind::None)
23657 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(Phi: P, Root))
23658 return NewRoot;
23659 return Root;
23660 };
23661
23662 // Start the analysis from the Root instruction. If a horizontal reduction is
23663 // found, try to vectorize it. If it is not a horizontal reduction, or
23664 // vectorization is not possible or not effective, and the currently analyzed
23665 // instruction is a binary operation, try to vectorize the operands, using
23666 // pre-order DFS traversal order. If the operands were not vectorized, repeat
23667 // the same procedure considering each operand as a possible root of the
23668 // horizontal reduction.
23669 // Interrupt the process if the Root instruction itself was vectorized or all
23670 // sub-trees no higher than RecursionMaxDepth were analyzed/vectorized.
23671 // If a horizontal reduction was not matched or vectorized, we collect
23672 // instructions for possible later attempts at vectorization.
23673 std::queue<std::pair<Instruction *, unsigned>> Stack;
23674 Stack.emplace(args: SelectRoot(), args: 0);
23675 SmallPtrSet<Value *, 8> VisitedInstrs;
23676 bool Res = false;
23677 auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
23678 if (R.isAnalyzedReductionRoot(I: Inst))
23679 return nullptr;
23680 if (!isReductionCandidate(I: Inst))
23681 return nullptr;
23682 HorizontalReduction HorRdx;
23683 if (!HorRdx.matchAssociativeReduction(R, Root: Inst, SE&: *SE, DL: *DL, TLI: *TLI))
23684 return nullptr;
23685 return HorRdx.tryToReduce(V&: R, DL: *DL, TTI, TLI: *TLI, AC);
23686 };
23687 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
23688 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
23689 FutureSeed = getNonPhiOperand(I: Root, Phi: P);
23690 if (!FutureSeed)
23691 return false;
23692 }
23693 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
23694 // analysis is done separately.
23695 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Val: FutureSeed))
23696 PostponedInsts.push_back(Elt: FutureSeed);
23697 return true;
23698 };
23699
23700 while (!Stack.empty()) {
23701 Instruction *Inst;
23702 unsigned Level;
23703 std::tie(args&: Inst, args&: Level) = Stack.front();
23704 Stack.pop();
23705 // Do not try to analyze an instruction that has already been vectorized.
23706 // This may happen when we vectorize instruction operands on a previous
23707 // iteration, while the stack was populated before that happened.
23708 if (R.isDeleted(I: Inst))
23709 continue;
23710 if (Value *VectorizedV = TryToReduce(Inst)) {
23711 Res = true;
23712 if (auto *I = dyn_cast<Instruction>(Val: VectorizedV)) {
23713 // Try to find another reduction.
23714 Stack.emplace(args&: I, args&: Level);
23715 continue;
23716 }
23717 if (R.isDeleted(I: Inst))
23718 continue;
23719 } else {
23720 // We could not vectorize `Inst` so try to use it as a future seed.
23721 if (!TryAppendToPostponedInsts(Inst)) {
23722 assert(Stack.empty() && "Expected empty stack");
23723 break;
23724 }
23725 }
23726
23727 // Try to vectorize operands.
23728 // Continue analysis for the instruction from the same basic block only to
23729 // save compile time.
23730 if (++Level < RecursionMaxDepth)
23731 for (auto *Op : Inst->operand_values())
23732 if (VisitedInstrs.insert(Ptr: Op).second)
23733 if (auto *I = dyn_cast<Instruction>(Val: Op))
23734 // Do not try to vectorize CmpInst operands, this is done
23735 // separately.
23736 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(Val: I) &&
23737 !R.isDeleted(I) && I->getParent() == BB)
23738 Stack.emplace(args&: I, args&: Level);
23739 }
23740 return Res;
23741}
23742
23743bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
23744 BasicBlock *BB, BoUpSLP &R) {
23745 SmallVector<WeakTrackingVH> PostponedInsts;
23746 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
23747 Res |= tryToVectorize(Insts: PostponedInsts, R);
23748 return Res;
23749}
23750
23751bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
23752 BoUpSLP &R) {
23753 bool Res = false;
23754 for (Value *V : Insts)
23755 if (auto *Inst = dyn_cast<Instruction>(Val: V); Inst && !R.isDeleted(I: Inst))
23756 Res |= tryToVectorize(I: Inst, R);
23757 return Res;
23758}
23759
23760bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
23761 BasicBlock *BB, BoUpSLP &R,
23762 bool MaxVFOnly) {
23763 if (!R.canMapToVector(T: IVI->getType()))
23764 return false;
23765
23766 SmallVector<Value *, 16> BuildVectorOpds;
23767 SmallVector<Value *, 16> BuildVectorInsts;
23768 if (!findBuildAggregate(LastInsertInst: IVI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts, R))
23769 return false;
23770
23771 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
23772 R.getORE()->emit(RemarkBuilder: [&]() {
23773 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
23774 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
23775 "trying reduction first.";
23776 });
23777 return false;
23778 }
23779 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
23780 // Aggregate value is unlikely to be processed in vector register.
23781 return tryToVectorizeList(VL: BuildVectorOpds, R, MaxVFOnly);
23782}
23783
23784bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
23785 BasicBlock *BB, BoUpSLP &R,
23786 bool MaxVFOnly) {
23787 SmallVector<Value *, 16> BuildVectorInsts;
23788 SmallVector<Value *, 16> BuildVectorOpds;
23789 SmallVector<int> Mask;
23790 if (!findBuildAggregate(LastInsertInst: IEI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts, R) ||
23791 (all_of(Range&: BuildVectorOpds, P: IsaPred<ExtractElementInst, UndefValue>) &&
23792 isFixedVectorShuffle(VL: BuildVectorOpds, Mask, AC)))
23793 return false;
23794
23795 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
23796 R.getORE()->emit(RemarkBuilder: [&]() {
23797 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
23798 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
23799 "trying reduction first.";
23800 });
23801 return false;
23802 }
23803 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
23804 return tryToVectorizeList(VL: BuildVectorInsts, R, MaxVFOnly);
23805}
23806
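/// Sorts \p Incoming with \p Comparator, walks runs of values that
/// \p AreCompatible considers equivalent, and hands each run to
/// \p TryToVectorizeHelper; leftover candidates of the same type are retried in
/// a final attempt, optionally allowing smaller vector factors.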
23807template <typename T>
23808static bool tryToVectorizeSequence(
23809 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
23810 function_ref<bool(T *, T *)> AreCompatible,
23811 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
23812 bool MaxVFOnly, BoUpSLP &R) {
23813 bool Changed = false;
23814 // Sort by type, parent, operands.
23815 stable_sort(Incoming, Comparator);
23816
23817 // Try to vectorize elements based on their type.
23818 SmallVector<T *> Candidates;
23819 SmallVector<T *> VL;
23820 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
23821 VL.clear()) {
23822 // Look for the next elements with the same type, parent and operand
23823 // kinds.
23824 auto *I = dyn_cast<Instruction>(*IncIt);
23825 if (!I || R.isDeleted(I)) {
23826 ++IncIt;
23827 continue;
23828 }
23829 auto *SameTypeIt = IncIt;
23830 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
23831 R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
23832 AreCompatible(*SameTypeIt, *IncIt))) {
23833 auto *I = dyn_cast<Instruction>(*SameTypeIt);
23834 ++SameTypeIt;
23835 if (I && !R.isDeleted(I))
23836 VL.push_back(cast<T>(I));
23837 }
23838
23839 // Try to vectorize them.
23840 unsigned NumElts = VL.size();
23841 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
23842 << NumElts << ")\n");
23843 // The vectorization is a 3-step attempt:
23844 // 1. Try to vectorize instructions with the same/alternate opcodes with the
23845 // size of the maximal register at first.
23846 // 2. Try to vectorize remaining instructions with the same type, if
23847 // possible. This may give better vectorization results than trying to
23848 // vectorize only instructions with the same/alternate opcodes.
23849 // 3. Make a final attempt to vectorize all instructions with the
23850 // same/alternate ops only; this may result in some extra final
23851 // vectorization.
23852 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
23853 // Success, start over because instructions might have been changed.
23854 Changed = true;
23855 VL.swap(Candidates);
23856 Candidates.clear();
23857 for (T *V : VL) {
23858 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
23859 Candidates.push_back(V);
23860 }
23861 } else {
23862 /// \returns the minimum number of elements that we will attempt to
23863 /// vectorize.
23864 auto GetMinNumElements = [&R](Value *V) {
23865 unsigned EltSize = R.getVectorElementSize(V);
23866 return std::max(a: 2U, b: R.getMaxVecRegSize() / EltSize);
23867 };
23868 if (NumElts < GetMinNumElements(*IncIt) &&
23869 (Candidates.empty() ||
23870 Candidates.front()->getType() == (*IncIt)->getType())) {
23871 for (T *V : VL) {
23872 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
23873 Candidates.push_back(V);
23874 }
23875 }
23876 }
23877 // Final attempt to vectorize instructions with the same types.
23878 if (Candidates.size() > 1 &&
23879 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
23880 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
23881 // Success, start over because instructions might have been changed.
23882 Changed = true;
23883 } else if (MaxVFOnly) {
23884 // Try to vectorize using small vectors.
23885 SmallVector<T *> VL;
23886 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
23887 VL.clear()) {
23888 auto *I = dyn_cast<Instruction>(*It);
23889 if (!I || R.isDeleted(I)) {
23890 ++It;
23891 continue;
23892 }
23893 auto *SameTypeIt = It;
23894 while (SameTypeIt != End &&
23895 (!isa<Instruction>(*SameTypeIt) ||
23896 R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
23897 AreCompatible(*SameTypeIt, *It))) {
23898 auto *I = dyn_cast<Instruction>(*SameTypeIt);
23899 ++SameTypeIt;
23900 if (I && !R.isDeleted(I))
23901 VL.push_back(cast<T>(I));
23902 }
23903 unsigned NumElts = VL.size();
23904 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
23905 /*MaxVFOnly=*/false))
23906 Changed = true;
23907 It = SameTypeIt;
23908 }
23909 }
23910 Candidates.clear();
23911 }
23912
23913 // Start over at the next instruction of a different type (or the end).
23914 IncIt = SameTypeIt;
23915 }
23916 return Changed;
23917}
23918
23919/// Compare two cmp instructions. If IsCompatibility is true, the function
23920/// returns true if the 2 cmps have the same/swapped predicates and the most
23921/// compatible corresponding operands. If IsCompatibility is false, the function
23922/// implements a strict weak ordering relation between two cmp instructions,
23923/// returning true if the first instruction is "less" than the second, i.e. its
23924/// predicate is less than the predicate of the second or the operand IDs are
23925/// less than the operand IDs of the second cmp instruction.
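/// For example (illustrative), "icmp slt %a, %b" and "icmp sgt %b, %a" have
/// swapped predicates and matching swapped operands, so they are treated as
/// compatible when IsCompatibility is true.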
23926template <bool IsCompatibility>
23927static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
23928 const DominatorTree &DT) {
23929 assert(isValidElementType(V->getType()) &&
23930 isValidElementType(V2->getType()) &&
23931 "Expected valid element types only.");
23932 if (V == V2)
23933 return IsCompatibility;
23934 auto *CI1 = cast<CmpInst>(Val: V);
23935 auto *CI2 = cast<CmpInst>(Val: V2);
23936 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() <
23937 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
23938 return !IsCompatibility;
23939 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() >
23940 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
23941 return false;
23942 if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() <
23943 CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits())
23944 return !IsCompatibility;
23945 if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() >
23946 CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits())
23947 return false;
23948 CmpInst::Predicate Pred1 = CI1->getPredicate();
23949 CmpInst::Predicate Pred2 = CI2->getPredicate();
23950 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(pred: Pred1);
23951 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(pred: Pred2);
23952 CmpInst::Predicate BasePred1 = std::min(a: Pred1, b: SwapPred1);
23953 CmpInst::Predicate BasePred2 = std::min(a: Pred2, b: SwapPred2);
23954 if (BasePred1 < BasePred2)
23955 return !IsCompatibility;
23956 if (BasePred1 > BasePred2)
23957 return false;
23958 // Compare operands.
23959 bool CI1Preds = Pred1 == BasePred1;
23960 bool CI2Preds = Pred2 == BasePred1;
23961 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
23962 auto *Op1 = CI1->getOperand(i_nocapture: CI1Preds ? I : E - I - 1);
23963 auto *Op2 = CI2->getOperand(i_nocapture: CI2Preds ? I : E - I - 1);
23964 if (Op1 == Op2)
23965 continue;
23966 if (Op1->getValueID() < Op2->getValueID())
23967 return !IsCompatibility;
23968 if (Op1->getValueID() > Op2->getValueID())
23969 return false;
23970 if (auto *I1 = dyn_cast<Instruction>(Val: Op1))
23971 if (auto *I2 = dyn_cast<Instruction>(Val: Op2)) {
23972 if (IsCompatibility) {
23973 if (I1->getParent() != I2->getParent())
23974 return false;
23975 } else {
23976 // Try to compare nodes with same parent.
23977 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(BB: I1->getParent());
23978 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(BB: I2->getParent());
23979 if (!NodeI1)
23980 return NodeI2 != nullptr;
23981 if (!NodeI2)
23982 return false;
23983 assert((NodeI1 == NodeI2) ==
23984 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
23985 "Different nodes should have different DFS numbers");
23986 if (NodeI1 != NodeI2)
23987 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
23988 }
23989 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI);
23990 if (S && (IsCompatibility || !S.isAltShuffle()))
23991 continue;
23992 if (IsCompatibility)
23993 return false;
23994 if (I1->getOpcode() != I2->getOpcode())
23995 return I1->getOpcode() < I2->getOpcode();
23996 }
23997 }
23998 return IsCompatibility;
23999}
24000
24001template <typename ItT>
24002bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
24003 BasicBlock *BB, BoUpSLP &R) {
24004 bool Changed = false;
24005 // Try to find reductions first.
24006 for (CmpInst *I : CmpInsts) {
24007 if (R.isDeleted(I))
24008 continue;
24009 for (Value *Op : I->operands())
24010 if (auto *RootOp = dyn_cast<Instruction>(Val: Op)) {
24011 Changed |= vectorizeRootInstruction(P: nullptr, Root: RootOp, BB, R);
24012 if (R.isDeleted(I))
24013 break;
24014 }
24015 }
24016 // Try to vectorize operands as vector bundles.
24017 for (CmpInst *I : CmpInsts) {
24018 if (R.isDeleted(I))
24019 continue;
24020 Changed |= tryToVectorize(I, R);
24021 }
24022 // Try to vectorize list of compares.
24023 // Sort by type, compare predicate, etc.
24024 auto CompareSorter = [&](Value *V, Value *V2) {
24025 if (V == V2)
24026 return false;
24027 return compareCmp<false>(V, V2, TLI&: *TLI, DT: *DT);
24028 };
24029
24030 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
24031 if (V1 == V2)
24032 return true;
24033 return compareCmp<true>(V: V1, V2, TLI&: *TLI, DT: *DT);
24034 };
24035
24036 SmallVector<Value *> Vals;
24037 for (Instruction *V : CmpInsts)
24038 if (!R.isDeleted(I: V) && isValidElementType(Ty: getValueType(V)))
24039 Vals.push_back(Elt: V);
24040 if (Vals.size() <= 1)
24041 return Changed;
24042 Changed |= tryToVectorizeSequence<Value>(
24043 Vals, CompareSorter, AreCompatibleCompares,
24044 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
24045 // Exclude possible reductions from other blocks.
24046 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
24047 return any_of(V->users(), [V](User *U) {
24048 auto *Select = dyn_cast<SelectInst>(Val: U);
24049 return Select &&
24050 Select->getParent() != cast<Instruction>(Val: V)->getParent();
24051 });
24052 });
24053 if (ArePossiblyReducedInOtherBlock)
24054 return false;
24055 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
24056 },
24057 /*MaxVFOnly=*/true, R);
24058 return Changed;
24059}
24060
24061bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
24062 BasicBlock *BB, BoUpSLP &R) {
24063 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
24064 "This function only accepts Insert instructions");
24065 bool OpsChanged = false;
24066 SmallVector<WeakTrackingVH> PostponedInsts;
24067 for (auto *I : reverse(C&: Instructions)) {
24068 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
24069 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
24070 continue;
24071 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
24072 OpsChanged |=
24073 vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/true);
24074 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
24075 OpsChanged |=
24076 vectorizeInsertElementInst(IEI: LastInsertElem, BB, R, /*MaxVFOnly=*/true);
24077 }
24078 // pass2 - try to vectorize reductions only
24079 if (R.isDeleted(I))
24080 continue;
24081 OpsChanged |= vectorizeHorReduction(P: nullptr, Root: I, BB, R, PostponedInsts);
24082 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
24083 continue;
24084 // pass3 - try to match and vectorize a buildvector sequence.
24085 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
24086 OpsChanged |=
24087 vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/false);
24088 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
24089 OpsChanged |= vectorizeInsertElementInst(IEI: LastInsertElem, BB, R,
24090 /*MaxVFOnly=*/false);
24091 }
24092 }
24093 // Now try to vectorize postponed instructions.
24094 OpsChanged |= tryToVectorize(Insts: PostponedInsts, R);
24095
24096 Instructions.clear();
24097 return OpsChanged;
24098}
24099
24100bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
24101 bool Changed = false;
24102 SmallVector<Value *, 4> Incoming;
24103 SmallPtrSet<Value *, 16> VisitedInstrs;
24104 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
24105 // node. This makes it easier to identify the chains that can be
24106 // vectorized in a better way.
24107 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
24108 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
24109 assert(isValidElementType(V1->getType()) &&
24110 isValidElementType(V2->getType()) &&
24111 "Expected vectorizable types only.");
24112 if (V1 == V2)
24113 return false;
24114 // It is fine to compare type IDs here, since we expect only vectorizable
24115 // types, like ints, floats and pointers; we don't care about other types.
24116 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
24117 return true;
24118 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
24119 return false;
24120 if (V1->getType()->getScalarSizeInBits() <
24121 V2->getType()->getScalarSizeInBits())
24122 return true;
24123 if (V1->getType()->getScalarSizeInBits() >
24124 V2->getType()->getScalarSizeInBits())
24125 return false;
24126 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
24127 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
24128 if (Opcodes1.size() < Opcodes2.size())
24129 return true;
24130 if (Opcodes1.size() > Opcodes2.size())
24131 return false;
24132 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
24133 {
24134 // Instructions come first.
24135 auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]);
24136 auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I]);
24137 if (I1 && I2) {
24138 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
24139 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
24140 if (!NodeI1)
24141 return NodeI2 != nullptr;
24142 if (!NodeI2)
24143 return false;
24144 assert((NodeI1 == NodeI2) ==
24145 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
24146 "Different nodes should have different DFS numbers");
24147 if (NodeI1 != NodeI2)
24148 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
24149 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
24150 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
24151 const auto *E1 = dyn_cast<ExtractElementInst>(Val: I1);
24152 const auto *E2 = dyn_cast<ExtractElementInst>(Val: I2);
24153 if (!E1 || !E2)
24154 continue;
24155
24156 // Sort on ExtractElementInsts primarily by vector operands. Prefer
24157 // program order of the vector operands.
24158 const auto *V1 = dyn_cast<Instruction>(Val: E1->getVectorOperand());
24159 const auto *V2 = dyn_cast<Instruction>(Val: E2->getVectorOperand());
24160 if (V1 != V2) {
24161 if (V1 && !V2)
24162 return true;
24163 if (!V1 && V2)
24164 return false;
24165 DomTreeNodeBase<BasicBlock> *NodeI1 =
24166 DT->getNode(BB: V1->getParent());
24167 DomTreeNodeBase<BasicBlock> *NodeI2 =
24168 DT->getNode(BB: V2->getParent());
24169 if (!NodeI1)
24170 return NodeI2 != nullptr;
24171 if (!NodeI2)
24172 return false;
24173 assert((NodeI1 == NodeI2) ==
24174 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
24175 "Different nodes should have different DFS numbers");
24176 if (NodeI1 != NodeI2)
24177 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
24178 return V1->comesBefore(Other: V2);
24179 }
24180 // If we have the same vector operand, try to sort by constant
24181 // index.
24182 std::optional<unsigned> Id1 = getExtractIndex(E: E1);
24183 std::optional<unsigned> Id2 = getExtractIndex(E: E2);
24184 // Bring constants to the top
24185 if (Id1 && !Id2)
24186 return true;
24187 if (!Id1 && Id2)
24188 return false;
24189 // First elements come first.
24190 if (Id1 && Id2)
24191 return *Id1 < *Id2;
24192
24193 continue;
24194 }
24195 if (I1->getOpcode() == I2->getOpcode())
24196 continue;
24197 return I1->getOpcode() < I2->getOpcode();
24198 }
24199 if (I1)
24200 return true;
24201 if (I2)
24202 return false;
24203 }
24204 {
24205 // Non-undef constants come next.
24206 bool C1 = isa<Constant>(Val: Opcodes1[I]) && !isa<UndefValue>(Val: Opcodes1[I]);
24207 bool C2 = isa<Constant>(Val: Opcodes2[I]) && !isa<UndefValue>(Val: Opcodes2[I]);
24208 if (C1 && C2)
24209 continue;
24210 if (C1)
24211 return true;
24212 if (C2)
24213 return false;
24214 }
24215 bool U1 = isa<UndefValue>(Val: Opcodes1[I]);
24216 bool U2 = isa<UndefValue>(Val: Opcodes2[I]);
24217 {
24218 // Non-constant non-instructions come next.
24219 if (!U1 && !U2) {
24220 auto ValID1 = Opcodes1[I]->getValueID();
24221 auto ValID2 = Opcodes2[I]->getValueID();
24222 if (ValID1 == ValID2)
24223 continue;
24224 if (ValID1 < ValID2)
24225 return true;
24226 if (ValID1 > ValID2)
24227 return false;
24228 }
24229 if (!U1)
24230 return true;
24231 if (!U2)
24232 return false;
24233 }
24234 // Undefs come last.
24235 assert(U1 && U2 && "The only thing left should be undef & undef.");
24236 }
24237 return false;
24238 };
24239 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
24240 if (V1 == V2)
24241 return true;
24242 if (V1->getType() != V2->getType())
24243 return false;
24244 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
24245 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
24246 if (Opcodes1.size() != Opcodes2.size())
24247 return false;
24248 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
24249 // Undefs are compatible with any other value.
24250 if (isa<UndefValue>(Val: Opcodes1[I]) || isa<UndefValue>(Val: Opcodes2[I]))
24251 continue;
24252 if (auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]))
24253 if (auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I])) {
24254 if (R.isDeleted(I: I1) || R.isDeleted(I: I2))
24255 return false;
24256 if (I1->getParent() != I2->getParent())
24257 return false;
24258 if (getSameOpcode(VL: {I1, I2}, TLI: *TLI))
24259 continue;
24260 return false;
24261 }
24262 if (isa<Constant>(Val: Opcodes1[I]) && isa<Constant>(Val: Opcodes2[I]))
24263 continue;
24264 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
24265 return false;
24266 }
24267 return true;
24268 };
24269
24270 bool HaveVectorizedPhiNodes = false;
24271 do {
24272 // Collect the incoming values from the PHIs.
24273 Incoming.clear();
24274 for (Instruction &I : *BB) {
24275 auto *P = dyn_cast<PHINode>(Val: &I);
24276 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
24277 break;
24278
24279 // No need to analyze deleted, vectorized and non-vectorizable
24280 // instructions.
24281 if (!VisitedInstrs.count(Ptr: P) && !R.isDeleted(I: P) &&
24282 isValidElementType(Ty: P->getType()))
24283 Incoming.push_back(Elt: P);
24284 }
24285
24286 if (Incoming.size() <= 1)
24287 break;
24288
24289 // Find the corresponding non-phi nodes for better matching when trying to
24290 // build the tree.
24291 for (Value *V : Incoming) {
24292 SmallVectorImpl<Value *> &Opcodes =
24293 PHIToOpcodes.try_emplace(Key: V).first->getSecond();
24294 if (!Opcodes.empty())
24295 continue;
24296 SmallVector<Value *, 4> Nodes(1, V);
24297 SmallPtrSet<Value *, 4> Visited;
24298 while (!Nodes.empty()) {
24299 auto *PHI = cast<PHINode>(Val: Nodes.pop_back_val());
24300 if (!Visited.insert(Ptr: PHI).second)
24301 continue;
24302 for (Value *V : PHI->incoming_values()) {
24303 if (auto *PHI1 = dyn_cast<PHINode>(Val: (V))) {
24304 Nodes.push_back(Elt: PHI1);
24305 continue;
24306 }
24307 Opcodes.emplace_back(Args&: V);
24308 }
24309 }
24310 }
24311
24312 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
24313 Incoming, Comparator: PHICompare, AreCompatible: AreCompatiblePHIs,
24314 TryToVectorizeHelper: [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
24315 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
24316 },
24317 /*MaxVFOnly=*/true, R);
24318 Changed |= HaveVectorizedPhiNodes;
24319 if (HaveVectorizedPhiNodes && any_of(Range&: PHIToOpcodes, P: [&](const auto &P) {
24320 auto *PHI = dyn_cast<PHINode>(P.first);
24321 return !PHI || R.isDeleted(I: PHI);
24322 }))
24323 PHIToOpcodes.clear();
24324 VisitedInstrs.insert_range(R&: Incoming);
24325 } while (HaveVectorizedPhiNodes);
24326
24327 VisitedInstrs.clear();
24328
24329 InstSetVector PostProcessInserts;
24330 SmallSetVector<CmpInst *, 8> PostProcessCmps;
24331 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
24332 // also vectorizes `PostProcessCmps`.
24333 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
24334 bool Changed = vectorizeInserts(Instructions&: PostProcessInserts, BB, R);
24335 if (VectorizeCmps) {
24336 Changed |= vectorizeCmpInsts(CmpInsts: reverse(C&: PostProcessCmps), BB, R);
24337 PostProcessCmps.clear();
24338 }
24339 PostProcessInserts.clear();
24340 return Changed;
24341 };
24342 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
24343 auto IsInPostProcessInstrs = [&](Instruction *I) {
24344 if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
24345 return PostProcessCmps.contains(key: Cmp);
24346 return isa<InsertElementInst, InsertValueInst>(Val: I) &&
24347 PostProcessInserts.contains(key: I);
24348 };
24349 // Returns true if `I` is an instruction without users, like a terminator, a
24350 // store, or a function call with an ignored return value. Unused instructions
24351 // are detected based on the instruction type, except for CallInst and InvokeInst.
24352 auto HasNoUsers = [](Instruction *I) {
24353 return I->use_empty() &&
24354 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(Val: I));
24355 };
24356 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
24357 // Skip instructions with scalable type. The num of elements is unknown at
24358 // compile-time for scalable type.
24359 if (isa<ScalableVectorType>(Val: It->getType()))
24360 continue;
24361
24362 // Skip instructions marked for deletion.
24363 if (R.isDeleted(I: &*It))
24364 continue;
24365 // We may go through BB multiple times so skip the one we have checked.
24366 if (!VisitedInstrs.insert(Ptr: &*It).second) {
24367 if (HasNoUsers(&*It) &&
24368 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
24369 // We would like to start over since some instructions are deleted
24370 // and the iterator may become invalid.
24371 Changed = true;
24372 It = BB->begin();
24373 E = BB->end();
24374 }
24375 continue;
24376 }
24377
24378 // Try to vectorize reductions that use PHINodes.
24379 if (PHINode *P = dyn_cast<PHINode>(Val&: It)) {
24380 // Check that the PHI is a reduction PHI.
24381 if (P->getNumIncomingValues() == 2) {
24382 // Try to match and vectorize a horizontal reduction.
24383 Instruction *Root = getReductionInstr(DT, P, ParentBB: BB, LI);
24384 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
24385 Changed = true;
24386 It = BB->begin();
24387 E = BB->end();
24388 continue;
24389 }
24390 }
24391 // Try to vectorize the incoming values of the PHI, to catch reductions
24392 // that feed into PHIs.
24393 for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
24394 // Skip if the incoming block is the current BB for now. Also, bypass
24395 // unreachable IR for efficiency and to avoid crashing.
24396 // TODO: Collect the skipped incoming values and try to vectorize them
24397 // after processing BB.
24398 if (BB == P->getIncomingBlock(i: I) ||
24399 !DT->isReachableFromEntry(A: P->getIncomingBlock(i: I)))
24400 continue;
24401
24402 // Postponed instructions should not be vectorized here, delay their
24403 // vectorization.
24404 if (auto *PI = dyn_cast<Instruction>(Val: P->getIncomingValue(i: I));
24405 PI && !IsInPostProcessInstrs(PI)) {
24406 bool Res =
24407 vectorizeRootInstruction(P: nullptr, Root: PI, BB: P->getIncomingBlock(i: I), R);
24408 Changed |= Res;
24409 if (Res && R.isDeleted(I: P)) {
24410 It = BB->begin();
24411 E = BB->end();
24412 break;
24413 }
24414 }
24415 }
24416 continue;
24417 }
24418
24419 if (HasNoUsers(&*It)) {
24420 bool OpsChanged = false;
24421 auto *SI = dyn_cast<StoreInst>(Val&: It);
24422 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
24423 if (SI) {
24424 auto *I = Stores.find(Key: getUnderlyingObject(V: SI->getPointerOperand()));
24425 // Try to vectorize chain in store, if this is the only store to the
24426 // address in the block.
24427 // TODO: This is just a temporary solution to save compile time. Need
24428 // to investigate if we can safely turn on slp-vectorize-hor-store
24429 // instead to allow lookup for reduction chains in all non-vectorized
24430 // stores (need to check side effects and compile time).
24431 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
24432 SI->getValueOperand()->hasOneUse();
24433 }
24434 if (TryToVectorizeRoot) {
24435 for (auto *V : It->operand_values()) {
24436 // Postponed instructions should not be vectorized here, delay their
24437 // vectorization.
24438 if (auto *VI = dyn_cast<Instruction>(Val: V);
24439 VI && !IsInPostProcessInstrs(VI))
24440 // Try to match and vectorize a horizontal reduction.
24441 OpsChanged |= vectorizeRootInstruction(P: nullptr, Root: VI, BB, R);
24442 }
24443 }
24444 // Start vectorization of the post-process list of instructions from the
24445 // top-tree instructions, to try to vectorize as many instructions as
24446 // possible.
24447 OpsChanged |=
24448 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
24449 if (OpsChanged) {
24450 // We would like to start over since some instructions are deleted
24451 // and the iterator may become invalid.
24452 Changed = true;
24453 It = BB->begin();
24454 E = BB->end();
24455 continue;
24456 }
24457 }
24458
24459 if (isa<InsertElementInst, InsertValueInst>(Val: It))
24460 PostProcessInserts.insert(X: &*It);
24461 else if (isa<CmpInst>(Val: It))
24462 PostProcessCmps.insert(X: cast<CmpInst>(Val: &*It));
24463 }
24464
24465 return Changed;
24466}
24467
24468bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
24469 auto Changed = false;
24470 for (auto &Entry : GEPs) {
24471 // If the getelementptr list has fewer than two elements, there's nothing
24472 // to do.
24473 if (Entry.second.size() < 2)
24474 continue;
24475
24476 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
24477 << Entry.second.size() << ".\n");
24478
24479 // Process the GEP list in chunks suitable for the target's supported
24480 // vector size. If a vector register can't hold 1 element, we are done. We
24481 // are trying to vectorize the index computations, so the maximum number of
24482 // elements is based on the size of the index expression, rather than the
24483 // size of the GEP itself (the target's pointer size).
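// For example (illustrative), with a 256-bit maximal vector register and
// i64 GEP indices, each chunk holds at most 256 / 64 == 4 index computations.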
24484 auto *It = find_if(Range&: Entry.second, P: [&](GetElementPtrInst *GEP) {
24485 return !R.isDeleted(I: GEP);
24486 });
24487 if (It == Entry.second.end())
24488 continue;
24489 unsigned MaxVecRegSize = R.getMaxVecRegSize();
24490 unsigned EltSize = R.getVectorElementSize(V: *(*It)->idx_begin());
24491 if (MaxVecRegSize < EltSize)
24492 continue;
24493
24494 unsigned MaxElts = MaxVecRegSize / EltSize;
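    // For example (assuming nothing about any particular target), a 128-bit
    // maximum vector register and 64-bit index expressions would give
    // MaxElts = 2, so the list below is processed in chunks of two GEPs.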
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(llvm::from_range, GEPList);

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index was optimized to a constant
      // value. If so, they are marked as deleted, so remove them from the set
      // of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }
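      // A hypothetical example of what the pruning above removes:
      //   %g0 = getelementptr inbounds i32, ptr %base, i64 %i
      //   %g1 = getelementptr inbounds i32, ptr %base, i64 %j  ; %j = %i + 1
      // SCEV folds the difference of %g0 and %g1 to a constant, so both are
      // dropped: one index is trivially derived from the other, and
      // vectorizing the pair would not save any scalar work. GEPs sharing the
      // exact same index value are likewise deduplicated.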

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      // We ensured the indices met these constraints when we originally
      // collected the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointer and value operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };
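  // Note: StoreSorter only needs to provide an ordering that clusters
  // likely-compatible stores next to each other; for instance, i32 stores
  // whose value operands are instructions from the same block end up
  // adjacent, while stores of a different element type sort into a separate
  // cluster. The actual compatibility test is AreCompatibleStores below.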

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        return getSameOpcode({I1, I2}, *TLI).valid();
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };
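  // For example, two stores whose value operands are add i32 instructions
  // defined in the same basic block are treated as compatible (getSameOpcode
  // also accepts SLP "alternate" opcode pairs), while instruction operands
  // from different blocks are rejected outright, and everything else falls
  // back to the ValueID comparison above.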

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse the stores to do bottom-to-top analysis. This is important when
    // there are several stores to the same address: in that case we need to
    // follow the store order (reversed, to meet the memory dependencies).
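    // E.g. given
    //   store i32 %x, ptr %p
    //   store i32 %y, ptr %p
    // the reversed list visits the %y store first, matching the bottom-up
    // direction of the vectorizer and respecting the write-after-write
    // dependency on %p.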
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
