1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
19#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
22#include "llvm/ADT/PriorityQueue.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
25#include "llvm/ADT/SetOperations.h"
26#include "llvm/ADT/SetVector.h"
27#include "llvm/ADT/SmallBitVector.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallString.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
33#include "llvm/ADT/iterator_range.h"
34#include "llvm/Analysis/AliasAnalysis.h"
35#include "llvm/Analysis/AssumptionCache.h"
36#include "llvm/Analysis/CodeMetrics.h"
37#include "llvm/Analysis/ConstantFolding.h"
38#include "llvm/Analysis/DemandedBits.h"
39#include "llvm/Analysis/GlobalsModRef.h"
40#include "llvm/Analysis/IVDescriptors.h"
41#include "llvm/Analysis/Loads.h"
42#include "llvm/Analysis/LoopAccessAnalysis.h"
43#include "llvm/Analysis/LoopInfo.h"
44#include "llvm/Analysis/MemoryLocation.h"
45#include "llvm/Analysis/OptimizationRemarkEmitter.h"
46#include "llvm/Analysis/ScalarEvolution.h"
47#include "llvm/Analysis/ScalarEvolutionExpressions.h"
48#include "llvm/Analysis/TargetLibraryInfo.h"
49#include "llvm/Analysis/TargetTransformInfo.h"
50#include "llvm/Analysis/ValueTracking.h"
51#include "llvm/Analysis/VectorUtils.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/DerivedTypes.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Instructions.h"
64#include "llvm/IR/IntrinsicInst.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
68#include "llvm/IR/PatternMatch.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
78#include "llvm/Support/Casting.h"
79#include "llvm/Support/CommandLine.h"
80#include "llvm/Support/Compiler.h"
81#include "llvm/Support/DOTGraphTraits.h"
82#include "llvm/Support/Debug.h"
83#include "llvm/Support/DebugCounter.h"
84#include "llvm/Support/ErrorHandling.h"
85#include "llvm/Support/GraphWriter.h"
86#include "llvm/Support/InstructionCost.h"
87#include "llvm/Support/KnownBits.h"
88#include "llvm/Support/MathExtras.h"
89#include "llvm/Support/raw_ostream.h"
90#include "llvm/Transforms/Utils/InjectTLIMappings.h"
91#include "llvm/Transforms/Utils/Local.h"
92#include "llvm/Transforms/Utils/LoopUtils.h"
93#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <map>
99#include <memory>
100#include <optional>
101#include <set>
102#include <string>
103#include <tuple>
104#include <utility>
105
using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
using namespace std::placeholders;

// Pass name used for registration/remarks.
#define SV_NAME "slp-vectorizer"
// Debug type used by the LLVM_DEBUG/STATISTIC machinery in this file.
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

// Debug counter that allows bisecting which SLP graphs get vectorized.
DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
              "Controls which SLP graphs should be vectorized.");
/// Master switch for running the SLP vectorizer.
static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(Val: true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

/// REVEC mode: treat short vectors as "scalars" and combine them into wider
/// vectors.
static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(Val: false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

/// Cost threshold: vectorize only when the estimated gain exceeds this value.
static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(Val: 0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(Val: true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(Val: false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<bool> SplitAlternateInstructions(
    "slp-split-alternate-instructions", cl::init(Val: true), cl::Hidden,
    cl::desc("Improve the code quality by splitting alternate instructions"));

/// Upper bound (in bits) on the vector register size used for vectorization.
static cl::opt<int> MaxVectorRegSizeOption(
    "slp-max-reg-size", cl::init(Val: 128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(Val: 0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoid long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int> ScheduleRegionSizeBudget(
    "slp-schedule-budget", cl::init(Val: 100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

/// Lower bound (in bits) on the vector register size used for vectorization.
static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(Val: 128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(Val: 12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(Val: 3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(Val: 2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when it probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead but unlike
// similar limit for operands ordering this is less frequently used, hence
// impact of higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(Val: 2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(Val: 2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(Val: 8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    DisableTreeReorder("slp-disable-tree-reorder", cl::init(Val: false),
                       cl::Hidden,
                       cl::desc("Disable tree reordering even if it is "
                                "profitable. Used for testing only."));

static cl::opt<bool>
    ForceStridedLoads("slp-force-strided-loads", cl::init(Val: false),
                      cl::Hidden,
                      cl::desc("Generate strided loads even if they are not "
                               "profitable. Used for testing only."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(Val: false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

/// Enables vectorization of copyable elements.
static cl::opt<bool> VectorizeCopyableElements(
    "slp-copyable-elements", cl::init(Val: true), cl::Hidden,
    cl::desc("Try to replace values with the idempotent instructions for "
             "better vectorization."));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explode.
static constexpr int UsesLimit = 64;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;
240
241/// Predicate for the element types that the SLP vectorizer supports.
242///
243/// The most important thing to filter here are types which are invalid in LLVM
244/// vectors. We also filter target specific types which have absolutely no
245/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
246/// avoids spending time checking the cost model and realizing that they will
247/// be inevitably scalarized.
248static bool isValidElementType(Type *Ty) {
249 // TODO: Support ScalableVectorType.
250 if (SLPReVec && isa<FixedVectorType>(Val: Ty))
251 Ty = Ty->getScalarType();
252 return VectorType::isValidElementType(ElemTy: Ty) && !Ty->isX86_FP80Ty() &&
253 !Ty->isPPC_FP128Ty();
254}
255
256/// Returns the type of the given value/instruction \p V. If it is store,
257/// returns the type of its value operand, for Cmp - the types of the compare
258/// operands and for insertelement - the type os the inserted operand.
259/// Otherwise, just the type of the value is returned.
260static Type *getValueType(Value *V) {
261 if (auto *SI = dyn_cast<StoreInst>(Val: V))
262 return SI->getValueOperand()->getType();
263 if (auto *CI = dyn_cast<CmpInst>(Val: V))
264 return CI->getOperand(i_nocapture: 0)->getType();
265 if (!SLPReVec)
266 if (auto *IE = dyn_cast<InsertElementInst>(Val: V))
267 return IE->getOperand(i_nocapture: 1)->getType();
268 return V->getType();
269}
270
271/// \returns the number of elements for Ty.
272static unsigned getNumElements(Type *Ty) {
273 assert(!isa<ScalableVectorType>(Ty) &&
274 "ScalableVectorType is not supported.");
275 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Ty))
276 return VecTy->getNumElements();
277 return 1;
278}
279
280/// \returns the vector type of ScalarTy based on vectorization factor.
281static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
282 return FixedVectorType::get(ElementType: ScalarTy->getScalarType(),
283 NumElts: VF * getNumElements(Ty: ScalarTy));
284}
285
286/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
287/// which forms type, which splits by \p TTI into whole vector types during
288/// legalization.
289static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
290 Type *Ty, unsigned Sz) {
291 if (!isValidElementType(Ty))
292 return bit_ceil(Value: Sz);
293 // Find the number of elements, which forms full vectors.
294 const unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
295 if (NumParts == 0 || NumParts >= Sz)
296 return bit_ceil(Value: Sz);
297 return bit_ceil(Value: divideCeil(Numerator: Sz, Denominator: NumParts)) * NumParts;
298}
299
300/// Returns the number of elements of the given type \p Ty, not greater than \p
301/// Sz, which forms type, which splits by \p TTI into whole vector types during
302/// legalization.
303static unsigned
304getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
305 unsigned Sz) {
306 if (!isValidElementType(Ty))
307 return bit_floor(Value: Sz);
308 // Find the number of elements, which forms full vectors.
309 unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
310 if (NumParts == 0 || NumParts >= Sz)
311 return bit_floor(Value: Sz);
312 unsigned RegVF = bit_ceil(Value: divideCeil(Numerator: Sz, Denominator: NumParts));
313 if (RegVF > Sz)
314 return bit_floor(Value: Sz);
315 return (Sz / RegVF) * RegVF;
316}
317
318static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
319 SmallVectorImpl<int> &Mask) {
320 // The ShuffleBuilder implementation use shufflevector to splat an "element".
321 // But the element have different meaning for SLP (scalar) and REVEC
322 // (vector). We need to expand Mask into masks which shufflevector can use
323 // directly.
324 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
325 for (unsigned I : seq<unsigned>(Size: Mask.size()))
326 for (auto [J, MaskV] : enumerate(First: MutableArrayRef(NewMask).slice(
327 N: I * VecTyNumElements, M: VecTyNumElements)))
328 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
329 : Mask[I] * VecTyNumElements + J;
330 Mask.swap(RHS&: NewMask);
331}
332
/// \returns the number of groups of shufflevector
/// A group has the following features
/// 1. All of value in a group are shufflevector.
/// 2. The mask of all shufflevector is isExtractSubvectorMask.
/// 3. The mask of all shufflevector uses all of the elements of the source.
/// e.g., it is 1 group (%0)
/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
/// it is 2 groups (%3 and %4)
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// it is 0 group
/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (VL.empty())
    return 0;
  // Feature 1: every value must be a shufflevector.
  if (!all_of(Range&: VL, P: IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
  // A group consists of SVNumElements / ShuffleMaskSize consecutive values of
  // VL, together extracting every slice of one source vector.
  unsigned SVNumElements =
      cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(Val: VL[I]);
    Value *Src = SV->getOperand(i_nocapture: 0);
    ArrayRef<Value *> Group = VL.slice(N: I, M: GroupSize);
    // Bit per sub-vector slice of Src; all slices must be extracted exactly
    // once for feature 3 to hold.
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Range&: Group, P: [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(Val: V);
          // From the same source.
          if (SV->getOperand(i_nocapture: 0) != Src)
            return false;
          int Index;
          // Feature 2: the mask must be an extract-subvector mask.
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}
396
397/// \returns a shufflevector mask which is used to vectorize shufflevectors
398/// e.g.,
399/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
400/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
401/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
402/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
403/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
404/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
405/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
406/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
407/// the result is
408/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
409static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
410 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
411 auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
412 unsigned SVNumElements =
413 cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())->getNumElements();
414 SmallVector<int> Mask;
415 unsigned AccumulateLength = 0;
416 for (Value *V : VL) {
417 auto *SV = cast<ShuffleVectorInst>(Val: V);
418 for (int M : SV->getShuffleMask())
419 Mask.push_back(Elt: M == PoisonMaskElem ? PoisonMaskElem
420 : AccumulateLength + M);
421 AccumulateLength += SVNumElements;
422 }
423 return Mask;
424}
425
426/// \returns True if the value is a constant (but not globals/constant
427/// expressions).
428static bool isConstant(Value *V) {
429 return isa<Constant>(Val: V) && !isa<ConstantExpr, GlobalValue>(Val: V);
430}
431
432/// Checks if \p V is one of vector-like instructions, i.e. undef,
433/// insertelement/extractelement with constant indices for fixed vector type or
434/// extractvalue instruction.
435static bool isVectorLikeInstWithConstOps(Value *V) {
436 if (!isa<InsertElementInst, ExtractElementInst>(Val: V) &&
437 !isa<ExtractValueInst, UndefValue>(Val: V))
438 return false;
439 auto *I = dyn_cast<Instruction>(Val: V);
440 if (!I || isa<ExtractValueInst>(Val: I))
441 return true;
442 if (!isa<FixedVectorType>(Val: I->getOperand(i: 0)->getType()))
443 return false;
444 if (isa<ExtractElementInst>(Val: I))
445 return isConstant(V: I->getOperand(i: 1));
446 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
447 return isConstant(V: I->getOperand(i: 2));
448}
449
450/// Returns power-of-2 number of elements in a single register (part), given the
451/// total number of elements \p Size and number of registers (parts) \p
452/// NumParts.
453static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
454 return std::min<unsigned>(a: Size, b: bit_ceil(Value: divideCeil(Numerator: Size, Denominator: NumParts)));
455}
456
/// Returns correct remaining number of elements, considering total amount \p
/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
/// and current register (part) \p Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  // The last part may be only partially filled.
  const unsigned Remaining = Size - Part * PartNumElems;
  return Remaining < PartNumElems ? Remaining : PartNumElems;
}
464
465#if !defined(NDEBUG)
466/// Print a short descriptor of the instruction bundle suitable for debug output.
467static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
468 std::string Result;
469 raw_string_ostream OS(Result);
470 if (Idx >= 0)
471 OS << "Idx: " << Idx << ", ";
472 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
473 return Result;
474}
475#endif
476
477/// \returns true if all of the instructions in \p VL are in the same block or
478/// false otherwise.
479static bool allSameBlock(ArrayRef<Value *> VL) {
480 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
481 if (It == VL.end())
482 return false;
483 Instruction *I0 = cast<Instruction>(Val: *It);
484 if (all_of(Range&: VL, P: isVectorLikeInstWithConstOps))
485 return true;
486
487 BasicBlock *BB = I0->getParent();
488 for (Value *V : iterator_range(It, VL.end())) {
489 if (isa<PoisonValue>(Val: V))
490 continue;
491 auto *II = dyn_cast<Instruction>(Val: V);
492 if (!II)
493 return false;
494
495 if (BB != II->getParent())
496 return false;
497 }
498 return true;
499}
500
501/// \returns True if all of the values in \p VL are constants (but not
502/// globals/constant expressions).
503static bool allConstant(ArrayRef<Value *> VL) {
504 // Constant expressions and globals can't be vectorized like normal integer/FP
505 // constants.
506 return all_of(Range&: VL, P: isConstant);
507}
508
509/// \returns True if all of the values in \p VL are identical or some of them
510/// are UndefValue.
511static bool isSplat(ArrayRef<Value *> VL) {
512 Value *FirstNonUndef = nullptr;
513 for (Value *V : VL) {
514 if (isa<UndefValue>(Val: V))
515 continue;
516 if (!FirstNonUndef) {
517 FirstNonUndef = V;
518 continue;
519 }
520 if (V != FirstNonUndef)
521 return false;
522 }
523 return FirstNonUndef != nullptr;
524}
525
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
/// For BinaryOperator, it also checks if \p InstWithUses is used in specific
/// patterns that make it effectively commutative (like equality comparisons
/// with zero).
/// In most cases, users should not call this function directly (since \p I and
/// \p InstWithUses are the same). However, when analyzing interchangeable
/// instructions, we need to use the converted opcode along with the original
/// uses.
/// \param I The instruction to check for commutativity
/// \param ValWithUses The value whose uses are analyzed for special
/// patterns
/// \param IsCopyable True when \p I models a copyable element; affects the
/// abs(sub ...) special case below.
static bool isCommutative(Instruction *I, Value *ValWithUses,
                          bool IsCopyable = false) {
  if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(Val: I))
    // (f)sub is treated as commutative when swapping its operands only negates
    // the result and every user (limited to UsesLimit uses) is invariant under
    // negation: icmp eq/ne with 0, or abs with the right poison flag.
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            ValWithUses->hasUseList() &&
            !ValWithUses->hasNUsesOrMore(N: UsesLimit) &&
            all_of(
                Range: ValWithUses->uses(),
                P: [&](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpPredicate Pred;
                  if (match(V: U.getUser(),
                            P: m_ICmp(Pred, L: m_Specific(V: U.get()), R: m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  auto *I = dyn_cast<BinaryOperator>(Val: U.get());
                  return match(V: U.getUser(),
                               P: m_Intrinsic<Intrinsic::abs>(
                                   Op0: m_Specific(V: U.get()), Op1: m_ConstantInt(CI&: Flag))) &&
                         ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            ValWithUses->hasUseList() &&
            !ValWithUses->hasNUsesOrMore(N: UsesLimit) &&
            // fabs(fsub x, y) == fabs(fsub y, x).
            all_of(Range: ValWithUses->uses(), P: [](const Use &U) {
              return match(V: U.getUser(),
                           P: m_Intrinsic<Intrinsic::fabs>(Op0: m_Specific(V: U.get())));
            }));
  return I->isCommutative();
}
573
/// Checks if the operand is commutative. In commutative operations, not all
/// operands might commutable, e.g. for fmuladd only 2 first operands are
/// commutable.
/// \param I The instruction whose operand \p Op is queried.
/// \param ValWithUses The value whose uses were analyzed to prove
/// commutativity (see the two-parameter isCommutative above).
/// \param Op The operand number being checked.
/// \param IsCopyable True when \p I models a copyable element.
static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
                                bool IsCopyable = false) {
  assert(::isCommutative(I, ValWithUses, IsCopyable) &&
         "The instruction is not commutative.");
  if (isa<CmpInst>(Val: I))
    return true;
  if (auto *BO = dyn_cast<BinaryOperator>(Val: I)) {
    switch (BO->getOpcode()) {
    // Sub/FSub only reach here via the negation-invariant-users special case
    // in isCommutative, where both operands may be swapped.
    case Instruction::Sub:
    case Instruction::FSub:
      return true;
    default:
      break;
    }
  }
  return I->isCommutableOperand(Op);
}
594
595/// This is a helper function to check whether \p I is commutative.
596/// This is a convenience wrapper that calls the two-parameter version of
597/// isCommutative with the same instruction for both parameters. This is
598/// the common case where the instruction being checked for commutativity
599/// is the same as the instruction whose uses are analyzed for special
600/// patterns (see the two-parameter version above for details).
601/// \param I The instruction to check for commutativity
602/// \returns true if the instruction is commutative, false otherwise
603static bool isCommutative(Instruction *I) { return isCommutative(I, ValWithUses: I); }
604
605/// \returns number of operands of \p I, considering commutativity. Returns 2
606/// for commutative intrinsics.
607/// \param I The instruction to check for commutativity
608static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I) {
609 if (isa<IntrinsicInst>(Val: I) && isCommutative(I)) {
610 // IntrinsicInst::isCommutative returns true if swapping the first "two"
611 // arguments to the intrinsic produces the same result.
612 constexpr unsigned IntrinsicNumOperands = 2;
613 return IntrinsicNumOperands;
614 }
615 return I->getNumOperands();
616}
617
/// Extracts the constant lane index of an insertelement/extractelement
/// instruction, biased by \p Offset whole vector widths.
/// \tparam T Either InsertElementInst or ExtractElementInst.
/// \returns Offset * NumElements + lane when \p Inst is a T with an in-range
/// constant index on a fixed vector type; std::nullopt otherwise.
/// NOTE(review): for T = ExtractElementInst, getType() is the scalar result
/// type, so the FixedVectorType check below appears to always bail out for
/// that instantiation -- confirm this is the intended behavior.
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    // The index must be a compile-time constant.
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    // Out-of-range indices produce poison; treat them as unknown.
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}
640
641/// \returns inserting or extracting index of InsertElement, ExtractElement or
642/// InsertValue instruction, using Offset as base offset for index.
643/// \returns std::nullopt if the index is not an immediate.
644static std::optional<unsigned> getElementIndex(const Value *Inst,
645 unsigned Offset = 0) {
646 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
647 return Index;
648 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
649 return Index;
650
651 int Index = Offset;
652
653 const auto *IV = dyn_cast<InsertValueInst>(Val: Inst);
654 if (!IV)
655 return std::nullopt;
656
657 Type *CurrentType = IV->getType();
658 for (unsigned I : IV->indices()) {
659 if (const auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
660 Index *= ST->getNumElements();
661 CurrentType = ST->getElementType(N: I);
662 } else if (const auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
663 Index *= AT->getNumElements();
664 CurrentType = AT->getElementType();
665 } else {
666 return std::nullopt;
667 }
668 Index += I;
669 }
670 return Index;
671}
672
673/// \returns true if all of the values in \p VL use the same opcode.
674/// For comparison instructions, also checks if predicates match.
675/// PoisonValues are considered matching.
676/// Interchangeable instructions are not considered.
677static bool allSameOpcode(ArrayRef<Value *> VL) {
678 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
679 if (It == VL.end())
680 return true;
681 Instruction *MainOp = cast<Instruction>(Val: *It);
682 unsigned Opcode = MainOp->getOpcode();
683 bool IsCmpOp = isa<CmpInst>(Val: MainOp);
684 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(Val: MainOp)->getPredicate()
685 : CmpInst::BAD_ICMP_PREDICATE;
686 return std::all_of(first: It, last: VL.end(), pred: [&](Value *V) {
687 if (auto *CI = dyn_cast<CmpInst>(Val: V))
688 return BasePred == CI->getPredicate();
689 if (auto *I = dyn_cast<Instruction>(Val: V))
690 return I->getOpcode() == Opcode;
691 return isa<PoisonValue>(Val: V);
692 });
693}
694
namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask. Used by buildUseMask below.
enum class UseMask {
  FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
            ///< check for the mask elements for the first argument (mask
            ///< indices are in range [0:VF)).
  SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
             ///< for the mask elements for the second argument (mask indices
             ///< are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused since
               ///< they're already marked as used in the mask.
};
} // namespace
711
712/// Prepares a use bitset for the given mask either for the first argument or
713/// for the second.
714static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
715 UseMask MaskArg) {
716 SmallBitVector UseMask(VF, true);
717 for (auto [Idx, Value] : enumerate(First&: Mask)) {
718 if (Value == PoisonMaskElem) {
719 if (MaskArg == UseMask::UndefsAsMask)
720 UseMask.reset(Idx);
721 continue;
722 }
723 if (MaskArg == UseMask::FirstArg && Value < VF)
724 UseMask.reset(Idx: Value);
725 else if (MaskArg == UseMask::SecondArg && Value >= VF)
726 UseMask.reset(Idx: Value - VF);
727 }
728 return UseMask;
729}
730
/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
/// \tparam IsPoisonOnly When true, only poison (not undef) elements qualify.
/// \returns a bit vector (sized like \p UseMask, or a single bit when it is
/// empty) in which a set bit means the corresponding element is undef/poison.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(Val: V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(Val: V);
  if (!C) {
    if (!UseMask.empty()) {
      // Non-constant value: walk an insertelement chain down to its base
      // vector, clearing bits for lanes overwritten with non-undef values
      // (unless the lane is masked out by UseMask).
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Val: Base)) {
        Base = II->getOperand(i_nocapture: 0);
        if (isa<T>(II->getOperand(i_nocapture: 1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(Inst: II);
        if (!Idx) {
          // Unknown insertion lane: conservatively report nothing as undef.
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(Idx: *Idx))
          Res.reset(Idx: *Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        // Recurse into the base vector with an all-false mask, so only truly
        // undef base lanes survive the intersection.
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  // Constant vector: inspect every aggregate element directly.
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(Elt: I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(Idx: I))))
        Res.reset(Idx: I);
  }
  return Res;
}
780
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
/// i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
                     AssumptionCache *AC) {
  // Bail out early if there is nothing to extract from.
  const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  // Size is the widest element count among all fixed source vectors; indices
  // >= Size are UB for extractelement and are skipped below.
  unsigned Size =
      std::accumulate(first: VL.begin(), last: VL.end(), init: 0u, binary_op: [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(Val: V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(a: S, b: VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  // True if at least one source vector is known not to be poison.
  bool HasNonUndefVec = any_of(Range&: VL, P: [&](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(Val: V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Val: Vec))
      return false;
    return isGuaranteedNotToBePoison(V: Vec, AC);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(Val: VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(Val: VL[I]);
    if (isa<ScalableVectorType>(Val: EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(V: Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Val: Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(Val: EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(Val: EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(RHS: Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    // An undef source does not constrain Vec1/Vec2 when some other source is
    // known non-poison.
    if (isUndefVector(V: Vec).all() && HasNonUndefVec)
      continue;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      // Second-source lanes are encoded with indices offset by Size.
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector, otherwise
  // we have permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
890
891/// \returns True if Extract{Value,Element} instruction extracts element Idx.
892static std::optional<unsigned> getExtractIndex(const Instruction *E) {
893 unsigned Opcode = E->getOpcode();
894 assert((Opcode == Instruction::ExtractElement ||
895 Opcode == Instruction::ExtractValue) &&
896 "Expected extractelement or extractvalue instruction.");
897 if (Opcode == Instruction::ExtractElement) {
898 auto *CI = dyn_cast<ConstantInt>(Val: E->getOperand(i: 1));
899 if (!CI)
900 return std::nullopt;
901 // Check if the index is out of bound - we can get the source vector from
902 // operand 0
903 unsigned Idx = CI->getZExtValue();
904 auto *EE = cast<ExtractElementInst>(Val: E);
905 const unsigned VF = ::getNumElements(Ty: EE->getVectorOperandType());
906 if (Idx >= VF)
907 return std::nullopt;
908 return Idx;
909 }
910 auto *EI = cast<ExtractValueInst>(Val: E);
911 if (EI->getNumIndices() != 1)
912 return std::nullopt;
913 return *EI->idx_begin();
914}
915
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all operands are either not instructions
/// or phi nodes or instructions from different blocks.
static bool areAllOperandsNonInsts(Value *V);
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all users are phi nodes or instructions
/// from the different blocks.
static bool isUsedOutsideBlock(Value *V);
/// Checks if the specified value does not require scheduling. It does not
/// require scheduling if all operands and all users do not need to be scheduled
/// in the current basic block (i.e. both of the predicates above hold).
static bool doesNotNeedToBeScheduled(Value *V);
930
931/// \returns true if \p Opcode is allowed as part of the main/alternate
932/// instruction for SLP vectorization.
933///
934/// Example of unsupported opcode is SDIV that can potentially cause UB if the
935/// "shuffled out" lane would result in division by zero.
936static bool isValidForAlternation(unsigned Opcode) {
937 return !Instruction::isIntDivRem(Opcode);
938}
939
940namespace {
941
/// Helper class that determines VL can use the same opcode.
/// Alternate instruction is supported. In addition, it supports interchangeable
/// instruction. An interchangeable instruction is an instruction that can be
/// converted to another instruction with same semantics. For example, x << 1 is
/// equal to x * 2. x * 1 is equal to x | 0.
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  /// Sort SupportedOp because it is used by binary_search.
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
  static_assert(llvm::is_sorted_constexpr(Range: SupportedOp) &&
                "SupportedOp is not sorted.");
  // One bit per supported opcode, plus MainOpBIT standing for "exactly the
  // original MainOp opcode" (preferred when still possible).
  enum : MaskType {
    ShlBIT = 0b1,
    AShrBIT = 0b10,
    MulBIT = 0b100,
    AddBIT = 0b1000,
    SubBIT = 0b10000,
    AndBIT = 0b100000,
    OrBIT = 0b1000000,
    XorBIT = 0b10000000,
    MainOpBIT = 0b100000000,
    LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT)
  };
  /// Return a non-nullptr if either operand of I is a ConstantInt.
  /// The second return value represents the operand position. We check the
  /// right-hand side first (1). If the right hand side is not a ConstantInt and
  /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
  /// side (0).
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
    (void)SupportedOp;
    auto *BinOp = cast<BinaryOperator>(Val: I);
    if (auto *CI = dyn_cast<ConstantInt>(Val: BinOp->getOperand(i_nocapture: 1)))
      return {CI, 1};
    // Sub/Shl/AShr are non-commutative: a constant LHS cannot be used.
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
      return {nullptr, 0};
    if (auto *CI = dyn_cast<ConstantInt>(Val: BinOp->getOperand(i_nocapture: 0)))
      return {CI, 0};
    return {nullptr, 0};
  }
  /// Tracks, for one reference instruction, the set of opcodes it may still be
  /// converted to as more instructions are added.
  struct InterchangeableInfo {
    const Instruction *I = nullptr;
    /// The bit it sets represents whether MainOp can be converted to.
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    /// We cannot create an interchangeable instruction that does not exist in
    /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
    /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
    /// 1]. SeenBefore is used to know what operations have been seen before.
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    /// Return false allows BinOpSameOpcodeHelper to find an alternate
    /// instruction. Directly setting the mask will destroy the mask state,
    /// preventing us from determining which instruction it should convert to.
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        // Narrow the candidate set to the intersection.
        Mask &= InterchangeableMask;
        return true;
      }
      return false;
    }
    // Accepts \p Opcode only if it matches I's own opcode exactly.
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(OpcodeInMaskForm: MainOpBIT, InterchangeableMask: MainOpBIT);
    }
    // Picks the final opcode: prefer MainOp's own opcode, otherwise the first
    // candidate in the fixed priority order Shl, AShr, Mul, Add, Sub, And, Or,
    // Xor that was actually seen in VL.
    unsigned getOpcode() const {
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
      llvm_unreachable("Cannot find interchangeable instruction.");
    }

    /// Return true if the instruction can be converted to \p Opcode.
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      switch (Opcode) {
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      // Opcodes that are binary operators but never interchangeable here.
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
        return false;
      default:
        break;
      }
      llvm_unreachable("Cannot find interchangeable instruction.");
    }

    // Builds the operand list I would have if rewritten with opcode \p To,
    // rewriting the constant operand accordingly (e.g. `x << 1` -> `x * 2`).
    SmallVector<Value *> getOperand(const Instruction *To) const {
      unsigned ToOpcode = To->getOpcode();
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
        return SmallVector<Value *>(I->operands());
      assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      // NOTE(review): CI is assumed non-null here (conversion is only
      // requested when a constant operand made I interchangeable) - verify at
      // call sites; a null CI would crash on the next line.
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      Type *RHSType = I->getOperand(i: Pos)->getType();
      Constant *RHS;
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
          // x << c  ==  x * (1 << c)
          RHS = ConstantInt::get(
              Ty: RHSType, V: APInt::getOneBitSet(numBits: FromCIValueBitWidth,
                                          BitNo: FromCIValue.getZExtValue()));
        } else {
          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
          RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                               /*AllowRHSConstant=*/true);
        }
        break;
      case Instruction::Mul:
        assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
        if (ToOpcode == Instruction::Shl) {
          // x * (2^c)  ==  x << c
          RHS = ConstantInt::get(
              Ty: RHSType, V: APInt(FromCIValueBitWidth, FromCIValue.logBase2()));
        } else {
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                               /*AllowRHSConstant=*/true);
        }
        break;
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
          RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                               /*AllowRHSConstant=*/true);
        } else {
          assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
                 "Cannot convert the instruction.");
          // x + c  ==  x - (-c)  and vice versa.
          APInt NegatedVal = APInt(FromCIValue);
          NegatedVal.negate();
          RHS = ConstantInt::get(Ty: RHSType, V: NegatedVal);
        }
        break;
      case Instruction::And:
        assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
        RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                             /*AllowRHSConstant=*/true);
        break;
      default:
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                             /*AllowRHSConstant=*/true);
        break;
      }
      Value *LHS = I->getOperand(i: 1 - Pos);
      // If the target opcode is non-commutative (e.g., shl, sub),
      // force the variable to the left and the constant to the right.
      if (Pos == 1 || !Instruction::isCommutative(Opcode: ToOpcode))
        return SmallVector<Value *>({LHS, RHS});

      return SmallVector<Value *>({RHS, LHS});
    }
  };
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  // Both the current main opcode and the candidate must be alternation-safe.
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(Opcode: MainOp.I->getOpcode()) &&
           ::isValidForAlternation(Opcode: I->getOpcode());
  }
  // Lazily adopts \p I as the alternate reference instruction.
  bool initializeAltOp(const Instruction *I) {
    if (AltOp.I)
      return true;
    if (!isValidForAlternation(I))
      return false;
    AltOp.I = I;
    return true;
  }

public:
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {}
  /// Folds \p I into the tracked main/alt state. Returns false if \p I is
  /// compatible with neither the main nor the alternate opcode set.
  bool add(const Instruction *I) {
    assert(isa<BinaryOperator>(I) &&
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
    switch (Opcode) {
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
      break;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
      break;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
      break;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
      break;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
      break;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
      break;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
      break;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      break;
    default:
      // Unsupported binop: only an exact opcode match can succeed.
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    }
    // Widen the interchangeable set based on the constant operand, if any
    // (e.g. `x + 0` can become any of the supported opcodes).
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
    if (CI) {
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      switch (Opcode) {
      case Instruction::Shl:
        if (CIValue.ult(RHS: CIValue.getBitWidth()))
          InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
        break;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
          break;
        }
        if (CIValue.isPowerOf2())
          InterchangeableMask = MulBIT | ShlBIT;
        break;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
        break;
      case Instruction::And:
        if (CIValue.isAllOnes())
          InterchangeableMask = CanBeAll;
        break;
      case Instruction::Xor:
        if (CIValue.isZero())
          InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
        break;
      default:
        if (CIValue.isZero())
          InterchangeableMask = CanBeAll;
        break;
      }
    }
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  }
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  /// Checks if the list of potential opcodes includes \p Opcode.
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  }
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
  }
  SmallVector<Value *> getOperand(const Instruction *I) const {
    return MainOp.getOperand(To: I);
  }
};
1247
/// Main data required for vectorization of instructions.
class InstructionsState {
  /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
  /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
  /// (i.e., AltOp is not equal to MainOp; this can be checked using
  /// isAltShuffle).
  /// A rare exception is TrySplitNode, where the InstructionsState is derived
  /// from getMainAltOpsNoStateVL.
  /// For those InstructionsState that use alternate instructions, the resulting
  /// vectorized output ultimately comes from a shufflevector. For example,
  /// given a vector list (VL):
  /// VL[0] = add i32 a, e
  /// VL[1] = sub i32 b, f
  /// VL[2] = add i32 c, g
  /// VL[3] = sub i32 d, h
  /// The vectorized result would be:
  /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// result = shufflevector <4 x i32> intermediated_0,
  ///                        <4 x i32> intermediated_1,
  ///                        <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  /// Since shufflevector is used in the final result, when calculating the cost
  /// (getEntryCost), we must account for the usage of shufflevector in
  /// GetVectorCost.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;
  /// Whether the instruction state represents copyable instructions.
  bool HasCopyables = false;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  /// Checks if the instruction matches either the main or alternate opcode.
  /// \returns
  /// - MainOp if \param I matches MainOp's opcode directly or can be converted
  /// to it
  /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
  /// it
  /// - nullptr if \param I cannot be matched or converted to either opcode
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
      return MainOp;
    // Special case: a zext feeding a non-alternate select state is treated as
    // matching the main opcode.
    if (MainOp->getOpcode() == Instruction::Select &&
        I->getOpcode() == Instruction::ZExt && !isAltShuffle())
      return MainOp;
    // Prefer AltOp instead of interchangeable instruction of MainOp.
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
      return AltOp;
    // Only binary operators can be converted to other opcodes.
    if (!I->isBinaryOp())
      return nullptr;
    BinOpSameOpcodeHelper Converter(MainOp);
    if (!Converter.add(I) || !Converter.add(I: MainOp))
      return nullptr;
    if (isAltShuffle() && !Converter.hasCandidateOpcode(Opcode: MainOp->getOpcode())) {
      // I is not convertible to MainOp's opcode; try the alternate side.
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(I: AltOp) &&
          AltConverter.hasCandidateOpcode(Opcode: AltOp->getOpcode()))
        return AltOp;
    }
    if (Converter.hasAltOp() && !isAltShuffle())
      return nullptr;
    return Converter.hasAltOp() ? AltOp : MainOp;
  }

  /// Checks if main/alt instructions are shift operations.
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
  }

  /// Checks if main/alt instructions are bitwise logic operations.
  bool isBitwiseLogicOp() const {
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  }

  /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
    return is_contained(Range: MulDiv, Element: getOpcode()) &&
           is_contained(Range: MulDiv, Element: getAltOpcode());
  }

  /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
        Instruction::FSub};
    return is_contained(Range: AddSub, Element: getOpcode()) &&
           is_contained(Range: AddSub, Element: getAltOpcode());
  }

  /// Checks if main/alt instructions are cmp operations.
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
            getOpcode() == Instruction::FCmp) &&
           getAltOpcode() == getOpcode();
  }

  /// Checks if the current state is valid, i.e. has non-null MainOp
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }

  /// Checks if the value is a copyable element.
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (!HasCopyables)
      return false;
    // Copyables are not modeled for alt shuffles or GEP states.
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
      return false;
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I)
      return !isa<PoisonValue>(Val: V);
    if (I->getParent() != MainOp->getParent() &&
        (!isVectorLikeInstWithConstOps(V: I) ||
         !isVectorLikeInstWithConstOps(V: MainOp)))
      return true;
    if (I->getOpcode() == MainOp->getOpcode())
      return false;
    if (!I->isBinaryOp())
      return true;
    // A binary op that is convertible to the main opcode is not a copyable.
    BinOpSameOpcodeHelper Converter(MainOp);
    return !Converter.add(I) || !Converter.add(I: MainOp) ||
           Converter.hasAltOp() || !Converter.hasCandidateOpcode(Opcode: getOpcode());
  }

  /// Checks if the value is non-schedulable.
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!HasCopyables)
      return !I || isa<PHINode>(Val: I) || isVectorLikeInstWithConstOps(V: I) ||
             doesNotNeedToBeScheduled(V);
    // MainOp for copyables always schedulable to correctly identify
    // non-schedulable copyables.
    if (getMainOp() == V)
      return false;
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        auto *I = dyn_cast<Instruction>(Val: V);
        return !I || isa<PHINode>(Val: I) || I->getParent() != MainOp->getParent() ||
               (doesNotNeedToBeScheduled(V: I) &&
                // If the copyable instruction comes after MainOp
                // (non-schedulable, but used in the block) - cannot vectorize
                // it, will possibly generate use before def.
                !MainOp->comesBefore(Other: I));
      };

      return IsNonSchedulableCopyableElement(V);
    }
    return !I || isa<PHINode>(Val: I) || isVectorLikeInstWithConstOps(V: I) ||
           doesNotNeedToBeScheduled(V);
  }

  /// Checks if the state represents copyable instructions.
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
  }
};
1434
1435std::pair<Instruction *, SmallVector<Value *>>
1436convertTo(Instruction *I, const InstructionsState &S) {
1437 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1438 assert(SelectedOp && "Cannot convert the instruction.");
1439 if (I->isBinaryOp()) {
1440 BinOpSameOpcodeHelper Converter(I);
1441 return std::make_pair(x&: SelectedOp, y: Converter.getOperand(I: SelectedOp));
1442 }
1443 return std::make_pair(x&: SelectedOp, y: SmallVector<Value *>(I->operands()));
1444}
1445
1446} // end anonymous namespace
1447
1448static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1449 const TargetLibraryInfo &TLI);
1450
1451/// Find an instruction with a specific opcode in VL.
1452/// \param VL Array of values to search through. Must contain only Instructions
1453/// and PoisonValues.
1454/// \param Opcode The instruction opcode to search for
1455/// \returns
1456/// - The first instruction found with matching opcode
1457/// - nullptr if no matching instruction is found
1458static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1459 unsigned Opcode) {
1460 for (Value *V : VL) {
1461 if (isa<PoisonValue>(Val: V))
1462 continue;
1463 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1464 auto *Inst = cast<Instruction>(Val: V);
1465 if (Inst->getOpcode() == Opcode)
1466 return Inst;
1467 }
1468 return nullptr;
1469}
1470
1471/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1472/// compatible instructions or constants, or just some other regular values.
1473static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1474 Value *Op1, const TargetLibraryInfo &TLI) {
1475 return (isConstant(V: BaseOp0) && isConstant(V: Op0)) ||
1476 (isConstant(V: BaseOp1) && isConstant(V: Op1)) ||
1477 (!isa<Instruction>(Val: BaseOp0) && !isa<Instruction>(Val: Op0) &&
1478 !isa<Instruction>(Val: BaseOp1) && !isa<Instruction>(Val: Op1)) ||
1479 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1480 getSameOpcode(VL: {BaseOp0, Op0}, TLI) ||
1481 getSameOpcode(VL: {BaseOp1, Op1}, TLI);
1482}
1483
1484/// \returns true if a compare instruction \p CI has similar "look" and
1485/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1486/// swapped, false otherwise.
1487static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1488 const TargetLibraryInfo &TLI) {
1489 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1490 "Assessing comparisons of different types?");
1491 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1492 CmpInst::Predicate Pred = CI->getPredicate();
1493 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(pred: Pred);
1494
1495 Value *BaseOp0 = BaseCI->getOperand(i_nocapture: 0);
1496 Value *BaseOp1 = BaseCI->getOperand(i_nocapture: 1);
1497 Value *Op0 = CI->getOperand(i_nocapture: 0);
1498 Value *Op1 = CI->getOperand(i_nocapture: 1);
1499
1500 return (BasePred == Pred &&
1501 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1502 (BasePred == SwappedPred &&
1503 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0: Op1, Op1: Op0, TLI));
1504}
1505
1506/// \returns analysis of the Instructions in \p VL described in
1507/// InstructionsState, the Opcode that we suppose the whole list
1508/// could be vectorized even if its structure is diverse.
1509static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1510 const TargetLibraryInfo &TLI) {
1511 // Make sure these are all Instructions.
1512 if (!all_of(Range&: VL, P: IsaPred<Instruction, PoisonValue>))
1513 return InstructionsState::invalid();
1514
1515 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
1516 if (It == VL.end())
1517 return InstructionsState::invalid();
1518
1519 Instruction *MainOp = cast<Instruction>(Val: *It);
1520 unsigned InstCnt = std::count_if(first: It, last: VL.end(), pred: IsaPred<Instruction>);
1521 if ((VL.size() > 2 && !isa<PHINode>(Val: MainOp) && InstCnt < VL.size() / 2) ||
1522 (VL.size() == 2 && InstCnt < 2))
1523 return InstructionsState::invalid();
1524
1525 bool IsCastOp = isa<CastInst>(Val: MainOp);
1526 bool IsBinOp = isa<BinaryOperator>(Val: MainOp);
1527 bool IsCmpOp = isa<CmpInst>(Val: MainOp);
1528 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(Val: MainOp)->getPredicate()
1529 : CmpInst::BAD_ICMP_PREDICATE;
1530 Instruction *AltOp = MainOp;
1531 unsigned Opcode = MainOp->getOpcode();
1532 unsigned AltOpcode = Opcode;
1533
1534 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1535 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1536 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1537 UniquePreds.insert(X: BasePred);
1538 UniqueNonSwappedPreds.insert(X: BasePred);
1539 for (Value *V : VL) {
1540 auto *I = dyn_cast<CmpInst>(Val: V);
1541 if (!I)
1542 return false;
1543 CmpInst::Predicate CurrentPred = I->getPredicate();
1544 CmpInst::Predicate SwappedCurrentPred =
1545 CmpInst::getSwappedPredicate(pred: CurrentPred);
1546 UniqueNonSwappedPreds.insert(X: CurrentPred);
1547 if (!UniquePreds.contains(key: CurrentPred) &&
1548 !UniquePreds.contains(key: SwappedCurrentPred))
1549 UniquePreds.insert(X: CurrentPred);
1550 }
1551 // Total number of predicates > 2, but if consider swapped predicates
1552 // compatible only 2, consider swappable predicates as compatible opcodes,
1553 // not alternate.
1554 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1555 }();
1556 // Check for one alternate opcode from another BinaryOperator.
1557 // TODO - generalize to support all operators (types, calls etc.).
1558 Intrinsic::ID BaseID = 0;
1559 SmallVector<VFInfo> BaseMappings;
1560 if (auto *CallBase = dyn_cast<CallInst>(Val: MainOp)) {
1561 BaseID = getVectorIntrinsicIDForCall(CI: CallBase, TLI: &TLI);
1562 BaseMappings = VFDatabase(*CallBase).getMappings(CI: *CallBase);
1563 if (!isTriviallyVectorizable(ID: BaseID) && BaseMappings.empty())
1564 return InstructionsState::invalid();
1565 }
1566 bool AnyPoison = InstCnt != VL.size();
1567 // Check MainOp too to be sure that it matches the requirements for the
1568 // instructions.
1569 for (Value *V : iterator_range(It, VL.end())) {
1570 auto *I = dyn_cast<Instruction>(Val: V);
1571 if (!I)
1572 continue;
1573
1574 // Cannot combine poison and divisions.
1575 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1576 // intrinsics/functions only.
1577 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(Val: I)))
1578 return InstructionsState::invalid();
1579 unsigned InstOpcode = I->getOpcode();
1580 if (IsBinOp && isa<BinaryOperator>(Val: I)) {
1581 if (BinOpHelper.add(I))
1582 continue;
1583 } else if (IsCastOp && isa<CastInst>(Val: I)) {
1584 Value *Op0 = MainOp->getOperand(i: 0);
1585 Type *Ty0 = Op0->getType();
1586 Value *Op1 = I->getOperand(i: 0);
1587 Type *Ty1 = Op1->getType();
1588 if (Ty0 == Ty1) {
1589 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1590 continue;
1591 if (Opcode == AltOpcode) {
1592 assert(isValidForAlternation(Opcode) &&
1593 isValidForAlternation(InstOpcode) &&
1594 "Cast isn't safe for alternation, logic needs to be updated!");
1595 AltOpcode = InstOpcode;
1596 AltOp = I;
1597 continue;
1598 }
1599 }
1600 } else if (auto *Inst = dyn_cast<CmpInst>(Val: I); Inst && IsCmpOp) {
1601 auto *BaseInst = cast<CmpInst>(Val: MainOp);
1602 Type *Ty0 = BaseInst->getOperand(i_nocapture: 0)->getType();
1603 Type *Ty1 = Inst->getOperand(i_nocapture: 0)->getType();
1604 if (Ty0 == Ty1) {
1605 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1606 assert(InstOpcode == AltOpcode &&
1607 "Alternate instructions are only supported by BinaryOperator "
1608 "and CastInst.");
1609 // Check for compatible operands. If the corresponding operands are not
1610 // compatible - need to perform alternate vectorization.
1611 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1612 CmpInst::Predicate SwappedCurrentPred =
1613 CmpInst::getSwappedPredicate(pred: CurrentPred);
1614
1615 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1616 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1617 continue;
1618
1619 if (isCmpSameOrSwapped(BaseCI: BaseInst, CI: Inst, TLI))
1620 continue;
1621 auto *AltInst = cast<CmpInst>(Val: AltOp);
1622 if (MainOp != AltOp) {
1623 if (isCmpSameOrSwapped(BaseCI: AltInst, CI: Inst, TLI))
1624 continue;
1625 } else if (BasePred != CurrentPred) {
1626 assert(
1627 isValidForAlternation(InstOpcode) &&
1628 "CmpInst isn't safe for alternation, logic needs to be updated!");
1629 AltOp = I;
1630 continue;
1631 }
1632 CmpInst::Predicate AltPred = AltInst->getPredicate();
1633 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1634 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1635 continue;
1636 }
1637 } else if (InstOpcode == Opcode) {
1638 assert(InstOpcode == AltOpcode &&
1639 "Alternate instructions are only supported by BinaryOperator and "
1640 "CastInst.");
1641 if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
1642 if (Gep->getNumOperands() != 2 ||
1643 Gep->getOperand(i_nocapture: 0)->getType() != MainOp->getOperand(i: 0)->getType())
1644 return InstructionsState::invalid();
1645 } else if (auto *EI = dyn_cast<ExtractElementInst>(Val: I)) {
1646 if (!isVectorLikeInstWithConstOps(V: EI))
1647 return InstructionsState::invalid();
1648 } else if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
1649 auto *BaseLI = cast<LoadInst>(Val: MainOp);
1650 if (!LI->isSimple() || !BaseLI->isSimple())
1651 return InstructionsState::invalid();
1652 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
1653 auto *CallBase = cast<CallInst>(Val: MainOp);
1654 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1655 return InstructionsState::invalid();
1656 if (Call->hasOperandBundles() &&
1657 (!CallBase->hasOperandBundles() ||
1658 !std::equal(first1: Call->op_begin() + Call->getBundleOperandsStartIndex(),
1659 last1: Call->op_begin() + Call->getBundleOperandsEndIndex(),
1660 first2: CallBase->op_begin() +
1661 CallBase->getBundleOperandsStartIndex())))
1662 return InstructionsState::invalid();
1663 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI: &TLI);
1664 if (ID != BaseID)
1665 return InstructionsState::invalid();
1666 if (!ID) {
1667 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(CI: *Call);
1668 if (Mappings.size() != BaseMappings.size() ||
1669 Mappings.front().ISA != BaseMappings.front().ISA ||
1670 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1671 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1672 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1673 Mappings.front().Shape.Parameters !=
1674 BaseMappings.front().Shape.Parameters)
1675 return InstructionsState::invalid();
1676 }
1677 }
1678 continue;
1679 }
1680 return InstructionsState::invalid();
1681 }
1682
1683 if (IsBinOp) {
1684 MainOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getMainOpcode());
1685 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1686 AltOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getAltOpcode());
1687 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1688 }
1689 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1690 "Incorrect implementation of allSameOpcode.");
1691 InstructionsState S(MainOp, AltOp);
1692 assert(all_of(VL,
1693 [&](Value *V) {
1694 return isa<PoisonValue>(V) ||
1695 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1696 }) &&
1697 "Invalid InstructionsState.");
1698 return S;
1699}
1700
1701/// \returns true if all of the values in \p VL have the same type or false
1702/// otherwise.
1703static bool allSameType(ArrayRef<Value *> VL) {
1704 Type *Ty = VL.consume_front()->getType();
1705 return all_of(Range&: VL, P: [&](Value *V) { return V->getType() == Ty; });
1706}
1707
1708/// \returns True if in-tree use also needs extract. This refers to
1709/// possible scalar operand in vectorized instruction.
1710static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1711 TargetLibraryInfo *TLI,
1712 const TargetTransformInfo *TTI) {
1713 if (!UserInst)
1714 return false;
1715 unsigned Opcode = UserInst->getOpcode();
1716 switch (Opcode) {
1717 case Instruction::Load: {
1718 LoadInst *LI = cast<LoadInst>(Val: UserInst);
1719 return (LI->getPointerOperand() == Scalar);
1720 }
1721 case Instruction::Store: {
1722 StoreInst *SI = cast<StoreInst>(Val: UserInst);
1723 return (SI->getPointerOperand() == Scalar);
1724 }
1725 case Instruction::Call: {
1726 CallInst *CI = cast<CallInst>(Val: UserInst);
1727 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1728 return any_of(Range: enumerate(First: CI->args()), P: [&](auto &&Arg) {
1729 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1730 Arg.value().get() == Scalar;
1731 });
1732 }
1733 default:
1734 return false;
1735 }
1736}
1737
1738/// \returns the AA location that is being access by the instruction.
1739static MemoryLocation getLocation(Instruction *I) {
1740 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
1741 return MemoryLocation::get(SI);
1742 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
1743 return MemoryLocation::get(LI);
1744 return MemoryLocation();
1745}
1746
1747/// \returns True if the instruction is not a volatile or atomic load/store.
1748static bool isSimple(Instruction *I) {
1749 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
1750 return LI->isSimple();
1751 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
1752 return SI->isSimple();
1753 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: I))
1754 return !MI->isVolatile();
1755 return true;
1756}
1757
1758/// Shuffles \p Mask in accordance with the given \p SubMask.
1759/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1760/// one but two input vectors.
1761static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1762 bool ExtendingManyInputs = false) {
1763 if (SubMask.empty())
1764 return;
1765 assert(
1766 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1767 // Check if input scalars were extended to match the size of other node.
1768 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1769 "SubMask with many inputs support must be larger than the mask.");
1770 if (Mask.empty()) {
1771 Mask.append(in_start: SubMask.begin(), in_end: SubMask.end());
1772 return;
1773 }
1774 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1775 int TermValue = std::min(a: Mask.size(), b: SubMask.size());
1776 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1777 if (SubMask[I] == PoisonMaskElem ||
1778 (!ExtendingManyInputs &&
1779 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1780 continue;
1781 NewMask[I] = Mask[SubMask[I]];
1782 }
1783 Mask.swap(RHS&: NewMask);
1784}
1785
1786/// Order may have elements assigned special value (size) which is out of
1787/// bounds. Such indices only appear on places which correspond to undef values
1788/// (see canReuseExtract for details) and used in order to avoid undef values
1789/// have effect on operands ordering.
1790/// The first loop below simply finds all unused indices and then the next loop
1791/// nest assigns these indices for undef values positions.
1792/// As an example below Order has two undef positions and they have assigned
1793/// values 3 and 7 respectively:
1794/// before: 6 9 5 4 9 2 1 0
1795/// after: 6 3 5 4 7 2 1 0
1796static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1797 const size_t Sz = Order.size();
1798 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1799 SmallBitVector MaskedIndices(Sz);
1800 for (unsigned I = 0; I < Sz; ++I) {
1801 if (Order[I] < Sz)
1802 UnusedIndices.reset(Idx: Order[I]);
1803 else
1804 MaskedIndices.set(I);
1805 }
1806 if (MaskedIndices.none())
1807 return;
1808 assert(UnusedIndices.count() == MaskedIndices.count() &&
1809 "Non-synced masked/available indices.");
1810 int Idx = UnusedIndices.find_first();
1811 int MIdx = MaskedIndices.find_first();
1812 while (MIdx >= 0) {
1813 assert(Idx >= 0 && "Indices must be synced.");
1814 Order[MIdx] = Idx;
1815 Idx = UnusedIndices.find_next(Prev: Idx);
1816 MIdx = MaskedIndices.find_next(Prev: MIdx);
1817 }
1818}
1819
1820/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1821/// Opcode1.
1822static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1823 unsigned Opcode0, unsigned Opcode1) {
1824 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
1825 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1826 for (unsigned Lane : seq<unsigned>(Size: VL.size())) {
1827 if (isa<PoisonValue>(Val: VL[Lane]))
1828 continue;
1829 if (cast<Instruction>(Val: VL[Lane])->getOpcode() == Opcode1)
1830 OpcodeMask.set(I: Lane * ScalarTyNumElements,
1831 E: Lane * ScalarTyNumElements + ScalarTyNumElements);
1832 }
1833 return OpcodeMask;
1834}
1835
1836/// Replicates the given \p Val \p VF times.
1837static SmallVector<Constant *> replicateMask(ArrayRef<Constant *> Val,
1838 unsigned VF) {
1839 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1840 "Expected scalar constants.");
1841 SmallVector<Constant *> NewVal(Val.size() * VF);
1842 for (auto [I, V] : enumerate(First&: Val))
1843 std::fill_n(first: NewVal.begin() + I * VF, n: VF, value: V);
1844 return NewVal;
1845}
1846
1847static void inversePermutation(ArrayRef<unsigned> Indices,
1848 SmallVectorImpl<int> &Mask) {
1849 Mask.clear();
1850 const unsigned E = Indices.size();
1851 Mask.resize(N: E, NV: PoisonMaskElem);
1852 for (unsigned I = 0; I < E; ++I)
1853 Mask[Indices[I]] = I;
1854}
1855
1856/// Reorders the list of scalars in accordance with the given \p Mask.
1857static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1858 ArrayRef<int> Mask) {
1859 assert(!Mask.empty() && "Expected non-empty mask.");
1860 SmallVector<Value *> Prev(Scalars.size(),
1861 PoisonValue::get(T: Scalars.front()->getType()));
1862 Prev.swap(RHS&: Scalars);
1863 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1864 if (Mask[I] != PoisonMaskElem)
1865 Scalars[Mask[I]] = Prev[I];
1866}
1867
1868/// Checks if the provided value does not require scheduling. It does not
1869/// require scheduling if this is not an instruction or it is an instruction
1870/// that does not read/write memory and all operands are either not instructions
1871/// or phi nodes or instructions from different blocks.
1872static bool areAllOperandsNonInsts(Value *V) {
1873 auto *I = dyn_cast<Instruction>(Val: V);
1874 if (!I)
1875 return true;
1876 return !mayHaveNonDefUseDependency(I: *I) &&
1877 all_of(Range: I->operands(), P: [I](Value *V) {
1878 auto *IO = dyn_cast<Instruction>(Val: V);
1879 if (!IO)
1880 return true;
1881 return isa<PHINode>(Val: IO) || IO->getParent() != I->getParent();
1882 });
1883}
1884
1885/// Checks if the provided value does not require scheduling. It does not
1886/// require scheduling if this is not an instruction or it is an instruction
1887/// that does not read/write memory and all users are phi nodes or instructions
1888/// from the different blocks.
1889static bool isUsedOutsideBlock(Value *V) {
1890 auto *I = dyn_cast<Instruction>(Val: V);
1891 if (!I)
1892 return true;
1893 // Limits the number of uses to save compile time.
1894 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(N: UsesLimit) &&
1895 all_of(Range: I->users(), P: [I](User *U) {
1896 auto *IU = dyn_cast<Instruction>(Val: U);
1897 if (!IU)
1898 return true;
1899 return IU->getParent() != I->getParent() || isa<PHINode>(Val: IU);
1900 });
1901}
1902
1903/// Checks if the specified value does not require scheduling. It does not
1904/// require scheduling if all operands and all users do not need to be scheduled
1905/// in the current basic block.
1906static bool doesNotNeedToBeScheduled(Value *V) {
1907 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1908}
1909
/// Checks if the specified array of instructions does not require scheduling.
/// It is so if either all instructions have operands that do not require
/// scheduling, or all their users do not require scheduling since they are
/// phis or in other basic blocks.
1914static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1915 return !VL.empty() &&
1916 (all_of(Range&: VL, P: isUsedOutsideBlock) || all_of(Range&: VL, P: areAllOperandsNonInsts));
1917}
1918
1919/// Returns true if widened type of \p Ty elements with size \p Sz represents
1920/// full vector type, i.e. adding extra element results in extra parts upon type
1921/// legalization.
1922static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1923 unsigned Sz) {
1924 if (Sz <= 1)
1925 return false;
1926 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Val: Ty))
1927 return false;
1928 if (has_single_bit(Value: Sz))
1929 return true;
1930 const unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
1931 return NumParts > 0 && NumParts < Sz && has_single_bit(Value: Sz / NumParts) &&
1932 Sz % NumParts == 0;
1933}
1934
/// Returns the number of parts the type \p VecTy will be split into at the
/// codegen phase. If the type is going to be scalarized or does not use whole
/// registers, returns 1.
1938static unsigned
1939getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1940 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1941 unsigned NumParts = TTI.getNumberOfParts(Tp: VecTy);
1942 if (NumParts == 0 || NumParts >= Limit)
1943 return 1;
1944 unsigned Sz = getNumElements(Ty: VecTy);
1945 if (NumParts >= Sz || Sz % NumParts != 0 ||
1946 !hasFullVectorsOrPowerOf2(TTI, Ty: VecTy->getElementType(), Sz: Sz / NumParts))
1947 return 1;
1948 return NumParts;
1949}
1950
1951/// Bottom Up SLP Vectorizer.
1952class slpvectorizer::BoUpSLP {
1953 class TreeEntry;
1954 class ScheduleEntity;
1955 class ScheduleData;
1956 class ScheduleCopyableData;
1957 class ScheduleBundle;
1958 class ShuffleCostEstimator;
1959 class ShuffleInstructionBuilder;
1960
  /// If we decide to generate strided load / store, this struct contains all
  /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
  /// and analyzeConstantStrideCandidate. Note that Stride can be given either
  /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
  /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
  /// the element size of the FixedVectorType.
  struct StridedPtrInfo {
    // Stride as an already-existing IR value; null if only known as a SCEV.
    Value *StrideVal = nullptr;
    // Stride as a SCEV expression; used when no IR value is available yet.
    const SCEV *StrideSCEV = nullptr;
    // Vector type of the strided load to be generated.
    FixedVectorType *Ty = nullptr;
  };
1972 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1973
1974public:
1975 /// Tracks the state we can represent the loads in the given sequence.
1976 enum class LoadsState {
1977 Gather,
1978 Vectorize,
1979 ScatterVectorize,
1980 StridedVectorize,
1981 CompressVectorize
1982 };
1983
1984 using ValueList = SmallVector<Value *, 8>;
1985 using InstrList = SmallVector<Instruction *, 16>;
1986 using ValueSet = SmallPtrSet<Value *, 16>;
1987 using StoreList = SmallVector<StoreInst *, 8>;
1988 using ExtraValueToDebugLocsMap = SmallDenseSet<Value *, 4>;
1989 using OrdersType = SmallVector<unsigned, 4>;
1990
  /// Constructs the vectorizer over function \p Func, caching all analyses
  /// and deriving the min/max vector register sizes from the target (or from
  /// command-line overrides).
  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        Builder(Se->getContext(), TargetFolder(*DL)) {
    // Ephemeral values (only feeding assumes) must not be vectorized.
    CodeMetrics::collectEphemeralValues(L: F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    // data type rather than just register size. For example, x86 AVX has
    // 256-bit registers, but it does not support integer operations
    // at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize =
          TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();

    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }
2017
2018 /// Vectorize the tree that starts with the elements in \p VL.
2019 /// Returns the vectorized root.
2020 Value *vectorizeTree();
2021
  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this set can be replaced by the
  /// generated extractvalue instructions.
2025 Value *
2026 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2027 Instruction *ReductionRoot = nullptr,
2028 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
2029 VectorValuesAndScales = {});
2030
2031 /// \returns the cost incurred by unwanted spills and fills, caused by
2032 /// holding live values over call sites.
2033 InstructionCost getSpillCost();
2034
2035 /// Calculates the cost of the subtrees, trims non-profitable ones and returns
2036 /// final cost.
2037 InstructionCost
2038 calculateTreeCostAndTrimNonProfitable(ArrayRef<Value *> VectorizedVals = {});
2039
2040 /// \returns the vectorization cost of the subtree that starts at \p VL.
2041 /// A negative number means that this is profitable.
2042 InstructionCost getTreeCost(InstructionCost TreeCost,
2043 ArrayRef<Value *> VectorizedVals = {},
2044 InstructionCost ReductionCost = TTI::TCC_Free);
2045
2046 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2047 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2048 void buildTree(ArrayRef<Value *> Roots,
2049 const SmallDenseSet<Value *> &UserIgnoreLst);
2050
2051 /// Construct a vectorizable tree that starts at \p Roots.
2052 void buildTree(ArrayRef<Value *> Roots);
2053
2054 /// Return the scalars of the root node.
2055 ArrayRef<Value *> getRootNodeScalars() const {
2056 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2057 return VectorizableTree.front()->Scalars;
2058 }
2059
2060 /// Returns the type/is-signed info for the root node in the graph without
2061 /// casting.
2062 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2063 const TreeEntry &Root = *VectorizableTree.front();
2064 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2065 !Root.Scalars.front()->getType()->isIntegerTy())
2066 return std::nullopt;
2067 auto It = MinBWs.find(Val: &Root);
2068 if (It != MinBWs.end())
2069 return std::make_pair(x: IntegerType::get(C&: Root.Scalars.front()->getContext(),
2070 NumBits: It->second.first),
2071 y: It->second.second);
2072 if (Root.getOpcode() == Instruction::ZExt ||
2073 Root.getOpcode() == Instruction::SExt)
2074 return std::make_pair(x: cast<CastInst>(Val: Root.getMainOp())->getSrcTy(),
2075 y: Root.getOpcode() == Instruction::SExt);
2076 return std::nullopt;
2077 }
2078
2079 /// Checks if the root graph node can be emitted with narrower bitwidth at
2080 /// codegen and returns it signedness, if so.
2081 bool isSignedMinBitwidthRootNode() const {
2082 return MinBWs.at(Val: VectorizableTree.front().get()).second;
2083 }
2084
2085 /// Returns reduction type after minbitdth analysis.
2086 FixedVectorType *getReductionType() const {
2087 if (ReductionBitWidth == 0 ||
2088 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2089 ReductionBitWidth >=
2090 DL->getTypeSizeInBits(
2091 Ty: VectorizableTree.front()->Scalars.front()->getType()))
2092 return getWidenedType(
2093 ScalarTy: VectorizableTree.front()->Scalars.front()->getType(),
2094 VF: VectorizableTree.front()->getVectorFactor());
2095 return getWidenedType(
2096 ScalarTy: IntegerType::get(
2097 C&: VectorizableTree.front()->Scalars.front()->getContext(),
2098 NumBits: ReductionBitWidth),
2099 VF: VectorizableTree.front()->getVectorFactor());
2100 }
2101
2102 /// Returns true if the tree results in one of the reduced bitcasts variants.
2103 bool isReducedBitcastRoot() const {
2104 return VectorizableTree.front()->hasState() &&
2105 (VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
2106 VectorizableTree.front()->CombinedOp ==
2107 TreeEntry::ReducedBitcastBSwap ||
2108 VectorizableTree.front()->CombinedOp ==
2109 TreeEntry::ReducedBitcastLoads ||
2110 VectorizableTree.front()->CombinedOp ==
2111 TreeEntry::ReducedBitcastBSwapLoads) &&
2112 VectorizableTree.front()->State == TreeEntry::Vectorize;
2113 }
2114
2115 /// Returns true if the tree results in the reduced cmp bitcast root.
2116 bool isReducedCmpBitcastRoot() const {
2117 return VectorizableTree.front()->hasState() &&
2118 VectorizableTree.front()->CombinedOp ==
2119 TreeEntry::ReducedCmpBitcast &&
2120 VectorizableTree.front()->State == TreeEntry::Vectorize;
2121 }
2122
2123 /// Builds external uses of the vectorized scalars, i.e. the list of
2124 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2125 /// ExternallyUsedValues contains additional list of external uses to handle
2126 /// vectorization of reductions.
2127 void
2128 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2129
2130 /// Transforms graph nodes to target specific representations, if profitable.
2131 void transformNodes();
2132
2133 /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    // Tree structure and scalar-to-entry maps.
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    DeletedNodes.clear();
    TransformedToGatherNodes.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    // Cached codegen positions.
    EntryToLastInstruction.clear();
    LastInstructionToPos.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    // External-use bookkeeping.
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    // Per-block scheduling regions are reset, not destroyed, so they can be
    // reused by the next buildTree invocation.
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    // Minimum-bitwidth analysis results.
    MinBWs.clear();
    ReductionBitWidth = 0;
    BaseGraphSize = 1;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();
  }
2167
  /// \returns the number of entries in the vectorizable tree.
  unsigned getTreeSize() const { return VectorizableTree.size(); }

  /// Returns the base graph size, before any transformations.
  unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2172
2173 /// Perform LICM and CSE on the newly generated gather sequences.
2174 void optimizeGatherSequence();
2175
2176 /// Does this non-empty order represent an identity order? Identity
2177 /// should be represented as an empty order, so this is used to
2178 /// decide if we can canonicalize a computed order. Undef elements
2179 /// (represented as size) are ignored.
2180 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2181 assert(!Order.empty() && "expected non-empty order");
2182 const unsigned Sz = Order.size();
2183 return all_of(Range: enumerate(First&: Order), P: [&](const auto &P) {
2184 return P.value() == P.index() || P.value() == Sz;
2185 });
2186 }
2187
2188 /// Checks if the specified gather tree entry \p TE can be represented as a
2189 /// shuffled vector entry + (possibly) permutation with other gathers. It
2190 /// implements the checks only for possibly ordered scalars (Loads,
2191 /// ExtractElement, ExtractValue), which can be part of the graph.
2192 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2193 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2194 /// node might be ignored.
2195 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2196 bool TopToBottom,
2197 bool IgnoreReorder);
2198
2199 /// Sort loads into increasing pointers offsets to allow greater clustering.
2200 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2201
2202 /// Gets reordering data for the given tree entry. If the entry is vectorized
2203 /// - just return ReorderIndices, otherwise check if the scalars can be
2204 /// reordered and return the most optimal order.
2205 /// \return std::nullopt if ordering is not important, empty order, if
2206 /// identity order is important, or the actual order.
2207 /// \param TopToBottom If true, include the order of vectorized stores and
2208 /// insertelement nodes, otherwise skip them.
2209 /// \param IgnoreReorder true, if the root node order can be ignored.
2210 std::optional<OrdersType>
2211 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2212
2213 /// Checks if it is profitable to reorder the current tree.
2214 /// If the tree does not contain many profitable reordable nodes, better to
2215 /// skip it to save compile time.
2216 bool isProfitableToReorder() const;
2217
2218 /// Reorders the current graph to the most profitable order starting from the
2219 /// root node to the leaf nodes. The best order is chosen only from the nodes
2220 /// of the same size (vectorization factor). Smaller nodes are considered
2221 /// parts of subgraph with smaller VF and they are reordered independently. We
2222 /// can make it because we still need to extend smaller nodes to the wider VF
2223 /// and we can merge reordering shuffles with the widening shuffles.
2224 void reorderTopToBottom();
2225
2226 /// Reorders the current graph to the most profitable order starting from
2227 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2228 /// number of reshuffles if the leaf nodes use the same order. In this case we
2229 /// can merge the orders and just shuffle user node instead of shuffling its
2230 /// operands. Plus, even the leaf nodes have different orders, it allows to
2231 /// sink reordering in the graph closer to the root node and merge it later
2232 /// during analysis.
2233 void reorderBottomToTop(bool IgnoreReorder = false);
2234
2235 /// \return The vector element size in bits to use when vectorizing the
2236 /// expression tree ending at \p V. If V is a store, the size is the width of
2237 /// the stored value. Otherwise, the size is the width of the largest loaded
2238 /// value reaching V. This method is used by the vectorizer to calculate
2239 /// vectorization factors.
2240 unsigned getVectorElementSize(Value *V);
2241
2242 /// Compute the minimum type sizes required to represent the entries in a
2243 /// vectorizable tree.
2244 void computeMinimumValueSizes();
2245
  // \returns the maximum vector register size, as set by TTI or overridden by
  // the MaxVectorRegSizeOption command-line flag (see the constructor).
  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  // \returns the minimum vector register size, as set by TTI or overridden by
  // the MinVectorRegSizeOption command-line flag (see the constructor).
  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }
2255
2256 unsigned getMinVF(unsigned Sz) const {
2257 return std::max(a: 2U, b: getMinVecRegSize() / Sz);
2258 }
2259
2260 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2261 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2262 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2263 return MaxVF ? MaxVF : UINT_MAX;
2264 }
2265
2266 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2267 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2268 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2269 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2270 ///
2271 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2272 unsigned canMapToVector(Type *T) const;
2273
2274 /// \returns True if the VectorizableTree is both tiny and not fully
2275 /// vectorizable. We do not vectorize such trees.
2276 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2277
2278 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2279 /// It may happen, if all gather nodes are loads and they cannot be
2280 /// "clusterized". In this case even subgraphs cannot be vectorized more
2281 /// effectively than the base graph.
2282 bool isTreeNotExtendable() const;
2283
2284 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2285 Align Alignment, const int64_t Diff,
2286 const size_t Sz) const;
2287
2288 /// Return true if an array of scalar loads can be replaced with a strided
2289 /// load (with constant stride).
2290 ///
2291 /// It is possible that the load gets "widened". Suppose that originally each
2292 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2293 /// constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2
2294 /// ...
2295 /// %b + 0 * %s + (w - 1)
2296 ///
2297 /// %b + 1 * %s + 0
2298 /// %b + 1 * %s + 1
2299 /// %b + 1 * %s + 2
2300 /// ...
2301 /// %b + 1 * %s + (w - 1)
2302 /// ...
2303 ///
2304 /// %b + (n - 1) * %s + 0
2305 /// %b + (n - 1) * %s + 1
2306 /// %b + (n - 1) * %s + 2
2307 /// ...
2308 /// %b + (n - 1) * %s + (w - 1)
2309 ///
2310 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2311 ///
2312 /// \param PointerOps list of pointer arguments of loads.
2313 /// \param ElemTy original scalar type of loads.
2314 /// \param Alignment alignment of the first load.
2315 /// \param SortedIndices is the order of PointerOps as returned by
2316 /// `sortPtrAccesses`
  /// \param Diff Pointer difference between the lowest and the highest pointer
  /// in `PointerOps` as returned by `getPointersDiff`.
2319 /// \param Ptr0 first pointer in `PointersOps`.
2320 /// \param PtrN last pointer in `PointersOps`.
2321 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2322 /// of `SPtrInfo` necessary to generate the strided load later.
2323 bool analyzeConstantStrideCandidate(
2324 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2325 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2326 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
2327
2328 /// Return true if an array of scalar loads can be replaced with a strided
2329 /// load (with run-time stride).
  /// \param PointerOps list of pointer arguments of loads.
  /// \param ScalarTy type of loads.
  /// \param CommonAlignment common alignment of loads as computed by
  /// `computeCommonAlignment<LoadInst>`.
  /// \param SortedIndices is a list of indices computed by this function such
  /// that the sequence `PointerOps[SortedIndices[0]],
  /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
  /// ordered by the coefficient of the stride. For example, if PointerOps is
  /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
  /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` would be
  /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
  /// \param SPtrInfo If the function returns `true`, it also sets all the
  /// fields of `SPtrInfo` necessary to generate the strided load later.
2343 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2344 Align CommonAlignment,
2345 SmallVectorImpl<unsigned> &SortedIndices,
2346 StridedPtrInfo &SPtrInfo) const;
2347
  /// Checks if the given array of loads can be represented as a vectorized,
  /// scatter or just simple gather.
  /// \param VL list of loads.
  /// \param VL0 main load value.
  /// \param Order returned order of load instructions.
  /// \param PointerOps returned list of pointer operands.
  /// \param SPtrInfo used to pass the strided-load parameters discovered
  /// during the check (see `analyzeRtStrideCandidate` /
  /// `analyzeConstantStrideCandidate`).
  /// \param BestVF return best vector factor, if recursive check found better
  /// vectorization sequences rather than masked gather.
  /// \param TryRecursiveCheck used to check if long masked gather can be
  /// represented as a series of loads/insert subvector, if profitable.
  LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                               SmallVectorImpl<unsigned> &Order,
                               SmallVectorImpl<Value *> &PointerOps,
                               StridedPtrInfo &SPtrInfo,
                               unsigned *BestVF = nullptr,
                               bool TryRecursiveCheck = true) const;
2364
  /// Registers the sequence of loads \p VL as known non-vectorizable, so that
  /// later attempts on the same sequence can bail out early. The sequence is
  /// remembered by its hash only.
  template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
  }
2369
  /// Checks if the given loads sequence was previously registered as not
  /// vectorizable (compared by the hash of the sequence; see
  /// `registerNonVectorizableLoads`).
  template <typename T>
  bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
    return ListOfKnonwnNonVectorizableLoads.contains(V: hash_value(VL));
  }
2375
2376 OptimizationRemarkEmitter *getORE() { return ORE; }
2377
  /// This structure holds any data we need about the edges being traversed
  /// during buildTreeRec(). We keep track of:
  /// (i) the user TreeEntry index, and
  /// (ii) the index of the edge.
  struct EdgeInfo {
    EdgeInfo() = default;
    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
    friend inline raw_ostream &operator<<(raw_ostream &OS,
                                          const BoUpSLP::EdgeInfo &EI) {
      EI.dump(OS);
      return OS;
    }
    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
    /// Two edges are equal iff they denote the same operand of the same user
    /// tree entry.
    bool operator == (const EdgeInfo &Other) const {
      return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
    }

    /// \returns true if this edge points at an actual user tree entry.
    operator bool() const { return UserTE != nullptr; }
  };
  /// Allow DenseMapInfo to hash/compare EdgeInfo keys.
  friend struct DenseMapInfo<EdgeInfo>;
2410
  /// A helper class used for scoring candidates for two consecutive lanes.
  class LookAheadHeuristics {
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    int NumLanes; // Total number of lanes (aka vectorization factor).
    int MaxLevel; // The maximum recursion depth for accumulating score.

  public:
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

    // The hard-coded scores listed here are not very important, though it shall
    // be higher for better matches to improve the resulting cost. When
    // computing the scores of matching one sub-tree with another, we are
    // basically counting the number of values that are matching. So even if all
    // scores are set to 1, we would still get a decent matching result.
    // However, sometimes we have to break ties. For example we may have to
    // choose between matching loads vs matching opcodes. This is what these
    // scores are helping us with: they provide the order of preference. Also,
    // this is important if the scalar is externally used or used in another
    // tree entry node in the different lane.

    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
    static const int ScoreConsecutiveLoads = 4;
    /// The same load multiple times. This should have a better score than
    /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
    /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
    /// a vector load and 1.0 for a broadcast.
    static const int ScoreSplatLoads = 3;
    /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
    static const int ScoreReversedLoads = 3;
    /// A load candidate for masked gather.
    static const int ScoreMaskedGatherCandidate = 1;
    /// ExtractElementInst from same vector and consecutive indexes.
    static const int ScoreConsecutiveExtracts = 4;
    /// ExtractElementInst from same vector and reversed indices.
    static const int ScoreReversedExtracts = 3;
    /// Constants.
    static const int ScoreConstants = 2;
    /// Instructions with the same opcode.
    static const int ScoreSameOpcode = 2;
    /// Instructions with alt opcodes (e.g, add + sub).
    static const int ScoreAltOpcodes = 1;
    /// Identical instructions (a.k.a. splat or broadcast).
    static const int ScoreSplat = 1;
    /// Matching with an undef is preferable to failing.
    static const int ScoreUndef = 1;
    /// Score for failing to find a decent match.
    static const int ScoreFail = 0;
    /// Score if all users are vectorized.
    static const int ScoreAllUserVectorized = 1;

    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    /// \p U1 and \p U2 are the users of \p V1 and \p V2.
    /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
    /// MainAltOps.
    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
                        ArrayRef<Value *> MainAltOps) const {
      if (!isValidElementType(Ty: V1->getType()) ||
          !isValidElementType(Ty: V2->getType()))
        return LookAheadHeuristics::ScoreFail;

      if (V1 == V2) {
        if (isa<LoadInst>(Val: V1)) {
          // Returns true if the users of V1 and V2 won't need to be extracted.
          auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
            // Bail out if we have too many uses to save compilation time.
            if (V1->hasNUsesOrMore(N: UsesLimit) || V2->hasNUsesOrMore(N: UsesLimit))
              return false;

            auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return llvm::all_of(Range: V->users(), P: [U1, U2, this](Value *U) {
                return U == U1 || U == U2 || R.isVectorized(V: U);
              });
            };
            return AllUsersVectorized(V1) && AllUsersVectorized(V2);
          };
          // A broadcast of a load can be cheaper on some targets.
          if (R.TTI->isLegalBroadcastLoad(ElementTy: V1->getType(),
                                          NumElements: ElementCount::getFixed(MinVal: NumLanes)) &&
              ((int)V1->getNumUses() == NumLanes ||
               AllUsersAreInternal(V1, V2)))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreSplat;
      }

      // If V1 and V2 already belong to a common tree entry, placing them in
      // consecutive lanes is as good as a splat of loads; otherwise fail.
      auto CheckSameEntryOrFail = [&]() {
        if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V: V1); !TEs1.empty()) {
          SmallPtrSet<TreeEntry *, 4> Set(llvm::from_range, TEs1);
          if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V: V2);
              !TEs2.empty() &&
              any_of(Range&: TEs2, P: [&](TreeEntry *E) { return Set.contains(Ptr: E); }))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreFail;
      };

      auto *LI1 = dyn_cast<LoadInst>(Val: V1);
      auto *LI2 = dyn_cast<LoadInst>(Val: V2);
      if (LI1 && LI2) {
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            !LI2->isSimple())
          return CheckSameEntryOrFail();

        std::optional<int64_t> Dist = getPointersDiff(
            ElemTyA: LI1->getType(), PtrA: LI1->getPointerOperand(), ElemTyB: LI2->getType(),
            PtrB: LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {
          if (getUnderlyingObject(V: LI1->getPointerOperand()) ==
                  getUnderlyingObject(V: LI2->getPointerOperand()) &&
              R.TTI->isLegalMaskedGather(
                  DataType: getWidenedType(ScalarTy: LI1->getType(), VF: NumLanes), Alignment: LI1->getAlign()))
            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
          return CheckSameEntryOrFail();
        }
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(i: *Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        // This still will detect consecutive loads, but we might have "holes"
        // in some cases. It is ok for non-power-2 vectorization and may produce
        // better results. It should not affect current vectorization.
        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
                           : LookAheadHeuristics::ScoreReversedLoads;
      }

      auto *C1 = dyn_cast<Constant>(Val: V1);
      auto *C2 = dyn_cast<Constant>(Val: V2);
      if (C1 && C2)
        return LookAheadHeuristics::ScoreConstants;

      // Consider constants and buildvector compatible.
      if ((C1 && isa<InsertElementInst>(Val: V2)) ||
          (C2 && isa<InsertElementInst>(Val: V1)))
        return LookAheadHeuristics::ScoreConstants;

      // Extracts from consecutive indexes of the same vector better score as
      // the extracts could be optimized away.
      Value *EV1;
      ConstantInt *Ex1Idx;
      if (match(V: V1, P: m_ExtractElt(Val: m_Value(V&: EV1), Idx: m_ConstantInt(CI&: Ex1Idx)))) {
        // Undefs are always profitable for extractelements.
        // Compiler can easily combine poison and extractelement <non-poison> or
        // undef and extractelement <poison>. But combining undef +
        // extractelement <non-poison-but-may-produce-poison> requires some
        // extra operations.
        if (isa<UndefValue>(Val: V2))
          return (isa<PoisonValue>(Val: V2) || isUndefVector(V: EV1).all())
                     ? LookAheadHeuristics::ScoreConsecutiveExtracts
                     : LookAheadHeuristics::ScoreSameOpcode;
        Value *EV2 = nullptr;
        ConstantInt *Ex2Idx = nullptr;
        if (match(V: V2,
                  P: m_ExtractElt(Val: m_Value(V&: EV2), Idx: m_CombineOr(L: m_ConstantInt(CI&: Ex2Idx),
                                                         R: m_Undef())))) {
          // Undefs are always profitable for extractelements.
          if (!Ex2Idx)
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (isUndefVector(V: EV2).all() && EV2->getType() == EV1->getType())
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (EV2 == EV1) {
            int Idx1 = Ex1Idx->getZExtValue();
            int Idx2 = Ex2Idx->getZExtValue();
            int Dist = Idx2 - Idx1;
            // The distance is too large - still may be profitable to use
            // shuffles.
            if (std::abs(x: Dist) == 0)
              return LookAheadHeuristics::ScoreSplat;
            if (std::abs(x: Dist) > NumLanes / 2)
              return LookAheadHeuristics::ScoreSameOpcode;
            return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
                              : LookAheadHeuristics::ScoreReversedExtracts;
          }
          return LookAheadHeuristics::ScoreAltOpcodes;
        }
        return CheckSameEntryOrFail();
      }

      auto *I1 = dyn_cast<Instruction>(Val: V1);
      auto *I2 = dyn_cast<Instruction>(Val: V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
        SmallVector<Value *, 4> Ops(MainAltOps);
        Ops.push_back(Elt: I1);
        Ops.push_back(Elt: I2);
        InstructionsState S = getSameOpcode(VL: Ops, TLI);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S &&
            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Range&: Ops, P: [&S](Value *V) {
              return isa<PoisonValue>(Val: V) ||
                     cast<Instruction>(Val: V)->getNumOperands() ==
                         S.getMainOp()->getNumOperands();
            }))
          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                  : LookAheadHeuristics::ScoreSameOpcode;
      }

      if (I1 && isa<PoisonValue>(Val: V2))
        return LookAheadHeuristics::ScoreSameOpcode;

      if (isa<UndefValue>(Val: V2))
        return LookAheadHeuristics::ScoreUndef;

      return CheckSameEntryOrFail();
    }

    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
    /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
    /// of \p U1 and \p U2), except at the beginning of the recursion where
    /// these are set to nullptr.
    ///
    /// For example:
    /// \verbatim
    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
    ///     \ /         \ /         \ /        \ /
    ///      +           +           +          +
    ///     G1          G2          G3         G4
    /// \endverbatim
    /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
    /// each level recursively, accumulating the score. It starts from matching
    /// the additions at level 0, then moves on to the loads (level 1). The
    /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
    /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
    /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
    /// Please note that the order of the operands does not matter, as we
    /// evaluate the score of all profitable combinations of operands. In
    /// other words the score of G1 and G4 is the same as G1 and G2. This
    /// heuristic is based on ideas described in:
    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
    ///   Luís F. W. Góes
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef<Value *> MainAltOps) const {

      // Get the shallow score of V1 and V2.
      int ShallowScoreAtThisLevel =
          getShallowScore(V1: LHS, V2: RHS, U1, U2, MainAltOps);

      // If reached MaxLevel,
      // or if V1 and V2 are not instructions,
      // or if they are SPLAT,
      // or if they are not consecutive,
      // or if profitable to vectorize loads or extractelements, early return
      // the current cost.
      auto *I1 = dyn_cast<Instruction>(Val: LHS);
      auto *I2 = dyn_cast<Instruction>(Val: RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(Val: I1) && isa<LoadInst>(Val: I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(Val: I1) && isa<ExtractElementInst>(Val: I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all possible
      // operand pairs, and keeping track of the best score.
      if (I1->getNumOperands() != I2->getNumOperands())
        return LookAheadHeuristics::ScoreSameOpcode;
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair op1I with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I: I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I: I2)
                             ? I2->getNumOperands()
                             : std::min(a: I2->getNumOperands(), b: OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(V: OpIdx2))
            continue;
          // Recursively calculate the cost at each level
          int TmpScore =
              getScoreAtLevelRec(LHS: I1->getOperand(i: OpIdx1), RHS: I2->getOperand(i: OpIdx2),
                                 U1: I1, U2: I2, CurrLevel: CurrLevel + 1, MainAltOps: {});
          // Look for the best score.
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(V: MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
  };
  /// A helper data structure to hold the operands of a vector of instructions.
  /// This supports a fixed vector length for all operand vectors.
  class VLOperands {
    /// For each operand we need (i) the value, and (ii) the opcode that it
    /// would be attached to if the expression was in a left-linearized form.
    /// This is required to avoid illegal operand reordering.
    /// For example:
    /// \verbatim
    ///                         0 Op1
    ///                         |/
    /// Op1 Op2   Linearized    + Op2
    ///   \ /     ---------->   |/
    ///    -                    -
    ///
    /// Op1 - Op2            (0 + Op1) - Op2
    /// \endverbatim
    ///
    /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
    ///
    /// Another way to think of this is to track all the operations across the
    /// path from the operand all the way to the root of the tree and to
    /// calculate the operation that corresponds to this path. For example, the
    /// path from Op2 to the root crosses the RHS of the '-', therefore the
    /// corresponding operation is a '-' (which matches the one in the
    /// linearized tree, as shown above).
    ///
    /// For lack of a better term, we refer to this operation as Accumulated
    /// Path Operation (APO).
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
      /// APO. It is set to 'true' if 'V' is attached to an inverse operation
      /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
      /// (e.g., Add/Mul)
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };
2765
    /// During operand reordering, we are trying to select the operand at lane
    /// that matches best with the operand at the neighboring lane. Our
    /// selection is based on the type of value we are looking for. For example,
    /// if the neighboring lane has a load, we need to look for a load that is
    /// accessing a consecutive address. These strategies are summarized in the
    /// 'ReorderingMode' enumerator.
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    /// One row of operands: an OperandData per lane.
    using OperandDataVec = SmallVector<OperandData, 2>;

    /// A vector of operand vectors.
    SmallVector<OperandDataVec, 4> OpsVec;
    /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
    /// is not IntrinsicInst, ArgSize is User::getNumOperands.
    unsigned ArgSize = 0;

    // Analyses and context used while scoring candidate reorderings.
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    /// Enclosing loop, if any; loop-invariant values are treated like
    /// constants when matching in Constant mode (see getBestOperand).
    const Loop *L = nullptr;
2793
    /// \returns the operand data at \p OpIdx and \p Lane
    /// (i.e. OpsVec[OpIdx][Lane]).
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }
2803
2804 /// Clears the used flag for all entries.
2805 void clearUsed() {
2806 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2807 OpIdx != NumOperands; ++OpIdx)
2808 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2809 ++Lane)
2810 OpsVec[OpIdx][Lane].IsUsed = false;
2811 }
2812
2813 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2814 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2815 std::swap(a&: OpsVec[OpIdx1][Lane], b&: OpsVec[OpIdx2][Lane]);
2816 }
2817
    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane we're looking the best
    /// candidate for.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score due to possible broadcasting of the
    /// elements in the lane. It is more profitable to have power-of-2 unique
    /// elements in the lane, it will be vectorized with higher probability
    /// after removing duplicates. Currently the SLP vectorizer supports only
    /// vectorization of the power-of-2 number of unique scalars.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                      const SmallBitVector &UsedLanes) const {
      Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
      if (!isa<Instruction>(Val: IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(Val: IdxLaneV))
        return 0;
      // Collect the values currently at OpIdx in all other lanes, remembering
      // the first lane each unique value appears in.
      SmallDenseMap<Value *, unsigned, 4> Uniques;
      for (unsigned Ln : seq<unsigned>(Size: getNumLanes())) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Lane: Ln).V;
        if (!isa<Instruction>(Val: OpIdxLnV))
          return 0;
        Uniques.try_emplace(Key: OpIdxLnV, Args&: Ln);
      }
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(Val: IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(Val: OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      // Score by distance of each resulting unique-value count from the
      // nearest power-of-2 boundary (bit_floor/bit_ceil), so that candidates
      // keeping the number of unique scalars closer to a power of 2 win.
      return std::min(a: bit_ceil(Value: UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      b: UniquesCntWithOpIdxLaneV -
                          bit_floor(Value: UniquesCntWithOpIdxLaneV)) -
             ((IdxIt != Uniques.end() && UsedLanes.test(Idx: IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(Value: UniquesCntWithIdxLaneV)
                  : bit_ceil(Value: UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }
2860
    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane we're looking the best
    /// candidate for.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score for the scalar whose users are all
    /// vectorized.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Do not care about number of uses for vector-like instructions
      // (extractelement/extractvalue with constant indices), they are extracts
      // themselves and already externally used. Vectorization of such
      // instructions does not add extra extractelement instruction, just may
      // remove it.
      if (isVectorLikeInstWithConstOps(V: IdxLaneV) &&
          isVectorLikeInstWithConstOps(V: OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(Val: IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(Val: OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(I: IdxLaneI)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }
2885
    /// Score scaling factor for fully compatible instructions but with
    /// different number of external uses. Allows better selection of the
    /// instructions with fewer external uses.
    static const int ScoreScaleFactor = 10;
2890
    /// \Returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match, the more they match the higher the
    /// score. This helps break ties in an informed way when we cannot decide on
    /// the order of the operands by just considering the immediate
    /// predecessors. Sets \p IsUsed when a non-zero score is produced so the
    /// candidate gets marked as consumed by the caller.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx,
                          bool &IsUsed, const SmallBitVector &UsedLanes) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      // Keep track of the instruction stack as we recurse into the operands
      // during the look-ahead score exploration.
      int Score =
          LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
                                       /*CurrLevel=*/1, MainAltOps);
      if (Score) {
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          // Failed score.
          Score = 0;
        } else {
          Score += SplatScore;
          // Scale score to see the difference between different operands
          // and similar operands but all vectorized/not all vectorized
          // uses. It does not affect actual selection of the best
          // compatible operand in general, just allows to select the
          // operand with all vectorized uses.
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
          IsUsed = true;
        }
      }
      return Score;
    }
2925
    /// Best defined scores per lanes between the passes. Used to choose the
    /// best operand (with the highest score) between the passes.
    /// The key - {Operand Index, Lane}.
    /// The value - the best score between the passes for the lane and the
    /// operand.
    SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
        BestScoresPerLanes;
2933
    // Search all operands in Ops[*][Lane] for the one that matches best
    // Ops[OpIdx][LastLane] and return its operand index.
    // If no good match can be found, return std::nullopt.
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   ArrayRef<ReorderingMode> ReorderingModes,
                   ArrayRef<Value *> MainAltOps,
                   const SmallBitVector &UsedLanes) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, Lane: LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score.
      // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
      // are using the score to differentiate between the two.
      struct BestOpData {
        std::optional<unsigned> Idx;
        unsigned Score = 0;
      } BestOp;
      // Seed the score with the best score recorded for this slot on a
      // previous pass, so we only accept strictly better candidates.
      BestOp.Score =
          BestScoresPerLanes.try_emplace(Key: std::make_pair(x&: OpIdx, y&: Lane), Args: 0)
              .first->second;

      // Track if the operand must be marked as used. If the operand is set to
      // Score 1 explicitly (because of non power-of-2 unique scalars, we may
      // want to reestimate the operands again on the following iterations).
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(OpIdx: Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(LHS: OpLeft, RHS: OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          // On a tie, prefer keeping the operand at its original position
          // (Idx == OpIdx).
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
               Idx == OpIdx)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Constant:
          if (isa<Constant>(Val: Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(V: Op))) {
            BestOp.Idx = Idx;
            if (isa<Constant>(Val: Op)) {
              BestOp.Score = LookAheadHeuristics::ScoreConstants;
              BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
                  LookAheadHeuristics::ScoreConstants;
            }
            if (isa<UndefValue>(Val: Op) || !isa<Constant>(Val: Op))
              IsUsed = false;
          }
          break;
        case ReorderingMode::Splat:
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Val: Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              BestOp.Score = LookAheadHeuristics::ScoreSplat;
              BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
                  LookAheadHeuristics::ScoreSplat;
            }
            BestOp.Idx = Idx;
          }
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(OpIdx: *BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match return std::nullopt.
      return std::nullopt;
    }
3043
    /// Helper for reorderOperandVecs.
    /// \returns the lane that we should start reordering from. This is the one
    /// which has the least number of operands that can freely move about, or
    /// is least profitable because it already has the most optimal set of
    /// operands.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> is used to implement a simple voting
      // algorithm and choose the lane with the least number of operands that
      // can freely move about or less profitable because it already has the
      // most optimal set of operands. The first unsigned is a counter for
      // voting, the second unsigned is the counter of lanes with instructions
      // with same/alternate opcodes and same parent basic block.
      MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
      // Try to be closer to the original results, if we have multiple lanes
      // with same cost. If 2 lanes have the same cost, use the one with the
      // highest index.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          // Same cost as the current best: cast a vote for this ordering hash.
          auto [It, Inserted] =
              HashMap.try_emplace(Key: NumFreeOpsHash.Hash, Args: 1, Args&: Lane);
          if (!Inserted)
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(C&: HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }
3097
/// Data structure that helps to reorder operands.
/// A default-constructed instance (NumOfAPOs == UINT_MAX, zero counters) is
/// returned by getMaxNumOperandsThatCanBeReordered for an all-undef lane.
struct OperandsOrderData {
  /// The best number of operands with the same APOs, which can be
  /// reordered.
  unsigned NumOfAPOs = UINT_MAX;
  /// Number of operands with the same/alternate instruction opcode and
  /// parent.
  unsigned NumOpsWithSameOpcodeParent = 0;
  /// Hash for the actual operands ordering.
  /// Used to count operands, actually their position id and opcode
  /// value. It is used in the voting mechanism to find the lane with the
  /// least number of operands that can freely move about or less profitable
  /// because it already has the most optimal set of operands. Can be
  /// replaced with SmallVector<unsigned> instead but hash code is faster
  /// and requires less memory.
  unsigned Hash = 0;
};
3115 /// \returns the maximum number of operands that are allowed to be reordered
3116 /// for \p Lane and the number of compatible instructions(with the same
3117 /// parent/opcode). This is used as a heuristic for selecting the first lane
3118 /// to start operand reordering.
3119 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3120 unsigned CntTrue = 0;
3121 unsigned NumOperands = getNumOperands();
3122 // Operands with the same APO can be reordered. We therefore need to count
3123 // how many of them we have for each APO, like this: Cnt[APO] = x.
3124 // Since we only have two APOs, namely true and false, we can avoid using
3125 // a map. Instead we can simply count the number of operands that
3126 // correspond to one of them (in this case the 'true' APO), and calculate
3127 // the other by subtracting it from the total number of operands.
3128 // Operands with the same instruction opcode and parent are more
3129 // profitable since we don't need to move them in many cases, with a high
3130 // probability such lane already can be vectorized effectively.
3131 bool AllUndefs = true;
3132 unsigned NumOpsWithSameOpcodeParent = 0;
3133 Instruction *OpcodeI = nullptr;
3134 BasicBlock *Parent = nullptr;
3135 unsigned Hash = 0;
3136 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3137 const OperandData &OpData = getData(OpIdx, Lane);
3138 if (OpData.APO)
3139 ++CntTrue;
3140 // Use Boyer-Moore majority voting for finding the majority opcode and
3141 // the number of times it occurs.
3142 if (auto *I = dyn_cast<Instruction>(Val: OpData.V)) {
3143 if (!OpcodeI || !getSameOpcode(VL: {OpcodeI, I}, TLI) ||
3144 I->getParent() != Parent) {
3145 if (NumOpsWithSameOpcodeParent == 0) {
3146 NumOpsWithSameOpcodeParent = 1;
3147 OpcodeI = I;
3148 Parent = I->getParent();
3149 } else {
3150 --NumOpsWithSameOpcodeParent;
3151 }
3152 } else {
3153 ++NumOpsWithSameOpcodeParent;
3154 }
3155 }
3156 Hash = hash_combine(
3157 args: Hash, args: hash_value(value: (OpIdx + 1) * (OpData.V->getValueID() + 1)));
3158 AllUndefs = AllUndefs && isa<UndefValue>(Val: OpData.V);
3159 }
3160 if (AllUndefs)
3161 return {};
3162 OperandsOrderData Data;
3163 Data.NumOfAPOs = std::max(a: CntTrue, b: NumOperands - CntTrue);
3164 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3165 Data.Hash = Hash;
3166 return Data;
3167 }
3168
3169 /// Go through the instructions in VL and append their operands.
3170 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3171 const InstructionsState &S) {
3172 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3173 assert((empty() || all_of(Operands,
3174 [this](const ValueList &VL) {
3175 return VL.size() == getNumLanes();
3176 })) &&
3177 "Expected same number of lanes");
3178 assert(S.valid() && "InstructionsState is invalid.");
3179 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3180 // arguments to the intrinsic produces the same result.
3181 Instruction *MainOp = S.getMainOp();
3182 unsigned NumOperands = MainOp->getNumOperands();
3183 ArgSize = ::getNumberOfPotentiallyCommutativeOps(I: MainOp);
3184 OpsVec.resize(N: ArgSize);
3185 unsigned NumLanes = VL.size();
3186 for (OperandDataVec &Ops : OpsVec)
3187 Ops.resize(N: NumLanes);
3188 for (unsigned Lane : seq<unsigned>(Size: NumLanes)) {
3189 // Our tree has just 3 nodes: the root and two operands.
3190 // It is therefore trivial to get the APO. We only need to check the
3191 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3192 // operand. The LHS operand of both add and sub is never attached to an
3193 // inversese operation in the linearized form, therefore its APO is
3194 // false. The RHS is true only if V is an inverse operation.
3195
3196 // Since operand reordering is performed on groups of commutative
3197 // operations or alternating sequences (e.g., +, -), we can safely tell
3198 // the inverse operations by checking commutativity.
3199 auto *I = dyn_cast<Instruction>(Val: VL[Lane]);
3200 if (!I && isa<PoisonValue>(Val: VL[Lane])) {
3201 for (unsigned OpIdx : seq<unsigned>(Size: NumOperands))
3202 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3203 continue;
3204 }
3205 bool IsInverseOperation = false;
3206 if (S.isCopyableElement(V: VL[Lane])) {
3207 // The value is a copyable element.
3208 IsInverseOperation =
3209 !isCommutative(I: MainOp, ValWithUses: VL[Lane], /*IsCopyable=*/true);
3210 } else {
3211 assert(I && "Expected instruction");
3212 auto [SelectedOp, Ops] = convertTo(I, S);
3213 // We cannot check commutativity by the converted instruction
3214 // (SelectedOp) because isCommutative also examines def-use
3215 // relationships.
3216 IsInverseOperation = !isCommutative(I: SelectedOp, ValWithUses: I);
3217 }
3218 for (unsigned OpIdx : seq<unsigned>(Size: ArgSize)) {
3219 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3220 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3221 }
3222 }
3223 }
3224
/// \returns the number of operands (the potentially commutative leading
/// operands tracked in OpsVec).
unsigned getNumOperands() const { return ArgSize; }

/// \returns the number of lanes.
/// NOTE(review): reads OpsVec[0], so OpsVec must be non-empty -- call only
/// after appendOperands has populated the operand vectors.
unsigned getNumLanes() const { return OpsVec[0].size(); }

/// \returns the operand value at \p OpIdx and \p Lane.
Value *getValue(unsigned OpIdx, unsigned Lane) const {
  return getData(OpIdx, Lane).V;
}

/// \returns true if the data structure is empty.
bool empty() const { return OpsVec.empty(); }

/// Clears the data.
void clear() { OpsVec.clear(); }
3241
/// \Returns true if there are enough operands identical to \p Op to fill
/// the whole vector (it is mixed with constants or loop invariant values).
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
  assert(Op == getValue(OpIdx, Lane) &&
         "Op is expected to be getValue(OpIdx, Lane).");
  // Small number of loads - try load matching.
  if (isa<LoadInst>(Val: Op) && getNumLanes() == 2 && getNumOperands() == 2)
    return false;
  bool OpAPO = getData(OpIdx, Lane).APO;
  bool IsInvariant = L && L->isLoopInvariant(V: Op);
  // Number of other lanes where Op itself (not a stand-in) was found.
  unsigned Cnt = 0;
  for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
    if (Ln == Lane)
      continue;
    // This is set to true if we found a candidate for broadcast at Lane.
    bool FoundCandidate = false;
    for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
      OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
      // Only operands with matching APO that are still available qualify.
      if (Data.APO != OpAPO || Data.IsUsed)
        continue;
      Value *OpILane = getValue(OpIdx: OpI, Lane);
      bool IsConstantOp = isa<Constant>(Val: OpILane);
      // Consider the broadcast candidate if:
      // 1. Same value is found in one of the operands.
      if (Data.V == Op ||
          // 2. The operand in the given lane is not constant but there is a
          // constant operand in another lane (which can be moved to the
          // given lane). In this case we can represent it as a simple
          // permutation of constant and broadcast.
          (!IsConstantOp &&
           ((Lns > 2 && isa<Constant>(Val: Data.V)) ||
            // 2.1. If we have only 2 lanes, need to check that value in the
            // next lane does not build same opcode sequence.
            (Lns == 2 &&
             !getSameOpcode(VL: {Op, getValue(OpIdx: (OpI + 1) % OpE, Lane: Ln)}, TLI) &&
             isa<Constant>(Val: Data.V)))) ||
          // 3. The operand in the current lane is loop invariant (can be
          // hoisted out) and another operand is also a loop invariant
          // (though not a constant). In this case the whole vector can be
          // hoisted out.
          // FIXME: need to teach the cost model about this case for better
          // estimation.
          (IsInvariant && !isa<Constant>(Val: Data.V) &&
           !getSameOpcode(VL: {Op, Data.V}, TLI) &&
           L->isLoopInvariant(V: Data.V))) {
        FoundCandidate = true;
        // Consume the slot only when it is Op itself; constant/invariant
        // stand-ins remain available for other lanes.
        Data.IsUsed = Data.V == Op;
        if (Data.V == Op)
          ++Cnt;
        break;
      }
    }
    if (!FoundCandidate)
      return false;
  }
  // With 2 lanes a single match suffices; otherwise Op must appear in more
  // than one other lane to be worth broadcasting.
  return getNumLanes() == 2 || Cnt > 1;
}
3300
/// Checks if there is at least single compatible operand in lanes other
/// than \p Lane, compatible with the operand \p Op.
bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
  assert(Op == getValue(OpIdx, Lane) &&
         "Op is expected to be getValue(OpIdx, Lane).");
  bool OpAPO = getData(OpIdx, Lane).APO;
  // Every other lane must contribute at least one "compatible" operand.
  for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
    if (Ln == Lane)
      continue;
    if (any_of(Range: seq<unsigned>(Size: getNumOperands()), P: [&](unsigned OpI) {
          const OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
          // NOTE(review): an APO mismatch or already-used operand makes
          // this lane count as compatible immediately (any_of succeeds) --
          // confirm this is intended rather than `return false` to merely
          // skip the operand.
          if (Data.APO != OpAPO || Data.IsUsed)
            return true;
          Value *OpILn = getValue(OpIdx: OpI, Lane: Ln);
          // Compatible if loop-invariant, or same opcode in the same block.
          return (L && L->isLoopInvariant(V: OpILn)) ||
                 (getSameOpcode(VL: {Op, OpILn}, TLI) &&
                  allSameBlock(VL: {Op, OpILn}));
        }))
      return true;
  }
  return false;
}
3323
3324 public:
/// Initialize with all the operands of the instruction vector \p RootVL.
/// Also caches the loop enclosing the main instruction (if any) so the
/// reordering heuristics can query loop invariance.
VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
           const InstructionsState &S, const BoUpSLP &R)
    : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
      L(R.LI->getLoopFor(BB: S.getMainOp()->getParent())) {
  // Append all the operands of RootVL.
  appendOperands(VL: RootVL, Operands, S);
}
3333
3334 /// \Returns a value vector with the operands across all lanes for the
3335 /// opearnd at \p OpIdx.
3336 ValueList getVL(unsigned OpIdx) const {
3337 ValueList OpVL(OpsVec[OpIdx].size());
3338 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3339 "Expected same num of lanes across all operands");
3340 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3341 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3342 return OpVL;
3343 }
3344
// Performs operand reordering for 2 or more operands.
// The original operands are in OrigOps[OpIdx][Lane].
// The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
void reorder() {
  unsigned NumOperands = getNumOperands();
  unsigned NumLanes = getNumLanes();
  // Each operand has its own mode. We are using this mode to help us select
  // the instructions for each lane, so that they match best with the ones
  // we have selected so far.
  SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

  // This is a greedy single-pass algorithm. We are going over each lane
  // once and deciding on the best order right away with no back-tracking.
  // However, in order to increase its effectiveness, we start with the lane
  // that has operands that can move the least. For example, given the
  // following lanes:
  //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
  //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
  //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
  //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
  // we will start at Lane 1, since the operands of the subtraction cannot
  // be reordered. Then we will visit the rest of the lanes in a circular
  // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.

  // Find the first lane that we will start our search from.
  unsigned FirstLane = getBestLaneToStartReordering();

  // Initialize the modes based on what the first lane's operands look like.
  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
    Value *OpLane0 = getValue(OpIdx, Lane: FirstLane);
    // Keep track if we have instructions with all the same opcode on one
    // side.
    if (auto *OpILane0 = dyn_cast<Instruction>(Val: OpLane0)) {
      // Check if OpLane0 should be broadcast.
      if (shouldBroadcast(Op: OpLane0, OpIdx, Lane: FirstLane) ||
          !canBeVectorized(Op: OpILane0, OpIdx, Lane: FirstLane))
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
      else if (isa<LoadInst>(Val: OpILane0))
        ReorderingModes[OpIdx] = ReorderingMode::Load;
      else
        ReorderingModes[OpIdx] = ReorderingMode::Opcode;
    } else if (isa<Constant>(Val: OpLane0)) {
      ReorderingModes[OpIdx] = ReorderingMode::Constant;
    } else if (isa<Argument>(Val: OpLane0)) {
      // Our best hope is a Splat. It may save some cost in some cases.
      ReorderingModes[OpIdx] = ReorderingMode::Splat;
    } else {
      llvm_unreachable("Unexpected value kind.");
    }
  }

  // Check that we don't have same operands. No need to reorder if operands
  // are just perfect diamond or shuffled diamond match. Do not do it only
  // for possible broadcasts or non-power of 2 number of scalars (just for
  // now).
  auto &&SkipReordering = [this]() {
    SmallPtrSet<Value *, 4> UniqueValues;
    ArrayRef<OperandData> Op0 = OpsVec.front();
    for (const OperandData &Data : Op0)
      UniqueValues.insert(Ptr: Data.V);
    // Every other operand list must be a (possibly shuffled) copy of the
    // first one's value set for reordering to be skippable.
    for (ArrayRef<OperandData> Op :
         ArrayRef(OpsVec).slice(N: 1, M: getNumOperands() - 1)) {
      if (any_of(Range&: Op, P: [&UniqueValues](const OperandData &Data) {
            return !UniqueValues.contains(Ptr: Data.V);
          }))
        return false;
    }
    // TODO: Check if we can remove a check for non-power-2 number of
    // scalars after full support of non-power-2 vectorization.
    return UniqueValues.size() != 2 &&
           hasFullVectorsOrPowerOf2(TTI: *R.TTI, Ty: Op0.front().V->getType(),
                                    Sz: UniqueValues.size());
  };

  // If the initial strategy fails for any of the operand indexes, then we
  // perform reordering again in a second pass. This helps avoid assigning
  // high priority to the failed strategy, and should improve reordering for
  // the non-failed operand indexes.
  for (int Pass = 0; Pass != 2; ++Pass) {
    // Check if no need to reorder operands since they are perfect or
    // shuffled diamond match.
    // Need to do it to avoid extra external use cost counting for
    // shuffled matches, which may cause regressions.
    if (SkipReordering())
      break;
    // Skip the second pass if the first pass did not fail.
    bool StrategyFailed = false;
    // Mark all operand data as free to use.
    clearUsed();
    // We keep the original operand order for the FirstLane, so reorder the
    // rest of the lanes. We are visiting the nodes in a circular fashion,
    // using FirstLane as the center point and increasing the radius
    // distance.
    SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(Elt: getData(OpIdx: I, Lane: FirstLane).V);

    SmallBitVector UsedLanes(NumLanes);
    UsedLanes.set(FirstLane);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
      // Visit the lane on the right and then the lane on the left.
      for (int Direction : {+1, -1}) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= (int)NumLanes)
          continue;
        UsedLanes.set(Lane);
        // The previously visited lane in this direction is the reference
        // we try to match against.
        int LastLane = Lane - Direction;
        assert(LastLane >= 0 && LastLane < (int)NumLanes &&
               "Out of bounds");
        // Look for a good match for each operand.
        for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
          // Search for the operand that matches SortedOps[OpIdx][Lane-1].
          std::optional<unsigned> BestIdx =
              getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                             MainAltOps: MainAltOps[OpIdx], UsedLanes);
          // By not selecting a value, we allow the operands that follow to
          // select a better matching value. We will get a non-null value in
          // the next run of getBestOperand().
          if (BestIdx) {
            // Swap the current operand with the one returned by
            // getBestOperand().
            swap(OpIdx1: OpIdx, OpIdx2: *BestIdx, Lane);
          } else {
            // Enable the second pass.
            StrategyFailed = true;
          }
          // Try to get the alternate opcode and follow it during analysis.
          if (MainAltOps[OpIdx].size() != 2) {
            OperandData &AltOp = getData(OpIdx, Lane);
            InstructionsState OpS =
                getSameOpcode(VL: {MainAltOps[OpIdx].front(), AltOp.V}, TLI);
            if (OpS && OpS.isAltShuffle())
              MainAltOps[OpIdx].push_back(Elt: AltOp.V);
          }
        }
      }
    }
    // Skip second pass if the strategy did not fail.
    if (!StrategyFailed)
      break;
  }
}
3487
3488#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3489 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3490 switch (RMode) {
3491 case ReorderingMode::Load:
3492 return "Load";
3493 case ReorderingMode::Opcode:
3494 return "Opcode";
3495 case ReorderingMode::Constant:
3496 return "Constant";
3497 case ReorderingMode::Splat:
3498 return "Splat";
3499 case ReorderingMode::Failed:
3500 return "Failed";
3501 }
3502 llvm_unreachable("Unimplemented Reordering Type");
3503 }
3504
/// Prints the textual name of \p RMode to \p OS; returns \p OS for chaining.
LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
                                               raw_ostream &OS) {
  return OS << getModeStr(RMode);
}

/// Debug print to dbgs().
LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
  printMode(RMode, dbgs());
}

/// Stream operator so a ReorderingMode can be printed directly.
friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
  return printMode(RMode, OS);
}
3518
3519 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3520 const unsigned Indent = 2;
3521 unsigned Cnt = 0;
3522 for (const OperandDataVec &OpDataVec : OpsVec) {
3523 OS << "Operand " << Cnt++ << "\n";
3524 for (const OperandData &OpData : OpDataVec) {
3525 OS.indent(Indent) << "{";
3526 if (Value *V = OpData.V)
3527 OS << *V;
3528 else
3529 OS << "null";
3530 OS << ", APO:" << OpData.APO << "}\n";
3531 }
3532 OS << "\n";
3533 }
3534 return OS;
3535 }
3536
/// Debug print to dbgs().
LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3540 };
3541
3542 /// Evaluate each pair in \p Candidates and return index into \p Candidates
3543 /// for a pair which have highest score deemed to have best chance to form
3544 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
3545 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
3546 /// of the cost, considered to be good enough score.
3547 std::optional<int>
3548 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3549 int Limit = LookAheadHeuristics::ScoreFail) const {
3550 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3551 RootLookAheadMaxDepth);
3552 int BestScore = Limit;
3553 std::optional<int> Index;
3554 for (int I : seq<int>(Begin: 0, End: Candidates.size())) {
3555 int Score = LookAhead.getScoreAtLevelRec(LHS: Candidates[I].first,
3556 RHS: Candidates[I].second,
3557 /*U1=*/nullptr, /*U2=*/nullptr,
3558 /*CurrLevel=*/1, MainAltOps: {});
3559 if (Score > BestScore) {
3560 BestScore = Score;
3561 Index = I;
3562 }
3563 }
3564 return Index;
3565 }
3566
/// Checks if the instruction is marked for deletion.
bool isDeleted(Instruction *I) const { return DeletedInstructions.count(V: I); }

/// Removes an instruction from its block and eventually deletes it.
/// It's like Instruction::eraseFromParent() except that the actual deletion
/// is delayed until BoUpSLP is destructed.
void eraseInstruction(Instruction *I) {
  DeletedInstructions.insert(V: I);
}
3576
/// Remove instructions from the parent function and clear the operands of \p
/// DeadVals instructions, marking for deletion trivially dead operands.
template <typename T>
void removeInstructionsAndOperands(
    ArrayRef<T *> DeadVals,
    ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
        VectorValuesAndScales) {
  SmallVector<WeakTrackingVH> DeadInsts;
  // Step 1: mark every dead value as deleted up front so the checks below
  // treat the whole set consistently.
  for (T *V : DeadVals) {
    auto *I = cast<Instruction>(V);
    eraseInstruction(I);
  }
  // Step 2: queue trivially dead single-user operands for later cleanup and
  // drop the dead values' references.
  DenseSet<Value *> Processed;
  for (T *V : DeadVals) {
    if (!V || !Processed.insert(V).second)
      continue;
    auto *I = cast<Instruction>(V);
    salvageDebugInfo(*I);
    ArrayRef<TreeEntry *> Entries = getTreeEntries(V: I);
    for (Use &U : I->operands()) {
      // Skip operands that serve as a tree entry's vectorized value --
      // those must stay alive.
      if (auto *OpI = dyn_cast_if_present<Instruction>(Val: U.get());
          OpI && !DeletedInstructions.contains(V: OpI) && OpI->hasOneUser() &&
          wouldInstructionBeTriviallyDead(I: OpI, TLI) &&
          (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
            return Entry->VectorizedValue == OpI;
          })))
        DeadInsts.push_back(Elt: OpI);
    }
    I->dropAllReferences();
  }
  // Step 3: detach the dead values from their basic blocks (actual memory
  // deletion is deferred, see eraseInstruction).
  for (T *V : DeadVals) {
    auto *I = cast<Instruction>(V);
    if (!I->getParent())
      continue;
    assert((I->use_empty() || all_of(I->uses(),
                                     [&](Use &U) {
                                       return isDeleted(
                                           cast<Instruction>(U.getUser()));
                                     })) &&
           "trying to erase instruction with users.");
    I->removeFromParent();
    SE->forgetValue(V: I);
  }
  // Process the dead instruction list until empty.
  while (!DeadInsts.empty()) {
    Value *V = DeadInsts.pop_back_val();
    Instruction *VI = cast_or_null<Instruction>(Val: V);
    if (!VI || !VI->getParent())
      continue;
    assert(isInstructionTriviallyDead(VI, TLI) &&
           "Live instruction found in dead worklist!");
    assert(VI->use_empty() && "Instructions with uses are not dead.");

    // Don't lose the debug info while deleting the instructions.
    salvageDebugInfo(I&: *VI);

    // Null out all of the instruction's operands to see if any operand
    // becomes dead as we go.
    for (Use &OpU : VI->operands()) {
      Value *OpV = OpU.get();
      if (!OpV)
        continue;
      OpU.set(nullptr);

      if (!OpV->use_empty())
        continue;

      // If the operand is an instruction that became dead as we nulled out
      // the operand, and if it is 'trivially' dead, delete it in a future
      // loop iteration. Vector values referenced by VectorValuesAndScales
      // are kept alive.
      if (auto *OpI = dyn_cast<Instruction>(Val: OpV))
        if (!DeletedInstructions.contains(V: OpI) &&
            (!OpI->getType()->isVectorTy() ||
             none_of(
                 VectorValuesAndScales,
                 [&](const std::tuple<WeakTrackingVH, unsigned, bool, bool>
                         &V) { return std::get<0>(t: V) == OpI; })) &&
            isInstructionTriviallyDead(I: OpI, TLI))
          DeadInsts.push_back(Elt: OpI);
    }

    VI->removeFromParent();
    eraseInstruction(I: VI);
    SE->forgetValue(V: VI);
  }
}
3663
/// Checks if the instruction was already analyzed for being possible
/// reduction root.
bool isAnalyzedReductionRoot(Instruction *I) const {
  return AnalyzedReductionsRoots.count(Ptr: I);
}
/// Register given instruction as already analyzed for being possible
/// reduction root.
void analyzedReductionRoot(Instruction *I) {
  AnalyzedReductionsRoots.insert(Ptr: I);
}
/// Checks if the provided list of reduced values was checked already for
/// vectorization.
/// Keyed by the hash of the value list, so order matters and (rare) hash
/// collisions may yield false positives -- acceptable for a cache.
bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
  return AnalyzedReductionVals.contains(V: hash_value(S: VL));
}
/// Adds the list of reduced values to list of already checked values for the
/// vectorization.
void analyzedReductionVals(ArrayRef<Value *> VL) {
  AnalyzedReductionVals.insert(V: hash_value(S: VL));
}
/// Clear the list of the analyzed reduction root instructions.
void clearReductionData() {
  AnalyzedReductionsRoots.clear();
  AnalyzedReductionVals.clear();
  AnalyzedMinBWVals.clear();
}
/// Checks if any of the given values is gathered in one of the nodes.
bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
  return any_of(Range: MustGather, P: [&](Value *V) { return Vals.contains(V); });
}
/// Checks if the given value is gathered in one of the nodes.
bool isGathered(const Value *V) const {
  return MustGather.contains(Ptr: V);
}
/// Checks if the specified value was not scheduled.
bool isNotScheduled(const Value *V) const {
  return NonScheduledFirst.contains(Ptr: V);
}

/// Check if the value is vectorized in the tree.
/// A value counts as vectorized only if at least one containing tree entry
/// is still alive (not deleted and not transformed back into a gather).
bool isVectorized(const Value *V) const {
  assert(V && "V cannot be nullptr.");
  ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
  return any_of(Range&: Entries, P: [&](const TreeEntry *E) {
    return !DeletedNodes.contains(Ptr: E) && !TransformedToGatherNodes.contains(Val: E);
  });
}
3711
3712 ~BoUpSLP();
3713
3714private:
3715 /// Determine if a node \p E in can be demoted to a smaller type with a
3716 /// truncation. We collect the entries that will be demoted in ToDemote.
3717 /// \param E Node for analysis
3718 /// \param ToDemote indices of the nodes to be demoted.
3719 bool collectValuesToDemote(
3720 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3721 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3722 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3723 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3724
3725 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3726 /// UserTE, which allow reordering (i.e. the operands can be reordered because
/// they have only one user and are reorderable).
3728 /// \param ReorderableGathers List of all gather nodes that require reordering
3729 /// (e.g., gather of extractlements or partially vectorizable loads).
3730 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3731 /// reordering, subset of \p NonVectorized.
3732 void buildReorderableOperands(
3733 TreeEntry *UserTE,
3734 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3735 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3736 SmallVectorImpl<TreeEntry *> &GatherOps);
3737
3738 /// Checks if the given \p TE is a gather node with clustered reused scalars
3739 /// and reorders it per given \p Mask.
3740 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3741
3742 /// Checks if all users of \p I are the part of the vectorization tree.
3743 bool areAllUsersVectorized(
3744 Instruction *I,
3745 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3746
3747 /// Return information about the vector formed for the specified index
3748 /// of a vector of (the same) instruction.
3749 TargetTransformInfo::OperandValueInfo
3750 getOperandInfo(ArrayRef<Value *> Ops) const;
3751
3752 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3753 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3754 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3755 return const_cast<TreeEntry *>(
3756 getOperandEntry(E: const_cast<const TreeEntry *>(E), Idx));
3757 }
3758
3759 /// Gets the root instruction for the given node. If the node is a strided
3760 /// load/store node with the reverse order, the root instruction is the last
3761 /// one.
3762 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3763
3764 /// \returns Cast context for the given graph node.
3765 TargetTransformInfo::CastContextHint
3766 getCastContextHint(const TreeEntry &TE) const;
3767
3768 /// \returns the cost of the vectorizable entry.
3769 InstructionCost getEntryCost(const TreeEntry *E,
3770 ArrayRef<Value *> VectorizedVals,
3771 SmallPtrSetImpl<Value *> &CheckedExtracts);
3772
3773 /// Checks if it is legal and profitable to build SplitVectorize node for the
3774 /// given \p VL.
3775 /// \param Op1 first homogeneous scalars.
3776 /// \param Op2 second homogeneous scalars.
3777 /// \param ReorderIndices indices to reorder the scalars.
3778 /// \returns true if the node was successfully built.
3779 bool canBuildSplitNode(ArrayRef<Value *> VL,
3780 const InstructionsState &LocalState,
3781 SmallVectorImpl<Value *> &Op1,
3782 SmallVectorImpl<Value *> &Op2,
3783 OrdersType &ReorderIndices) const;
3784
3785 /// This is the recursive part of buildTree.
3786 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3787 unsigned InterleaveFactor = 0);
3788
3789 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3790 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3791 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3792 /// returns false, setting \p CurrentOrder to either an empty vector or a
3793 /// non-identity permutation that allows to reuse extract instructions.
3794 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3795 /// extract order.
3796 bool canReuseExtract(ArrayRef<Value *> VL,
3797 SmallVectorImpl<unsigned> &CurrentOrder,
3798 bool ResizeAllowed = false) const;
3799
3800 /// Vectorize a single entry in the tree.
3801 Value *vectorizeTree(TreeEntry *E);
3802
3803 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3804 /// \p E.
3805 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3806
3807 /// Create a new vector from a list of scalar values. Produces a sequence
3808 /// which exploits values reused across lanes, and arranges the inserts
3809 /// for ease of later optimization.
3810 template <typename BVTy, typename ResTy, typename... Args>
3811 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3812
3813 /// Create a new vector from a list of scalar values. Produces a sequence
3814 /// which exploits values reused across lanes, and arranges the inserts
3815 /// for ease of later optimization.
3816 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3817
3818 /// Returns the instruction in the bundle, which can be used as a base point
3819 /// for scheduling. Usually it is the last instruction in the bundle, except
3820 /// for the case when all operands are external (in this case, it is the first
3821 /// instruction in the list).
3822 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3823
3824 /// Tries to find extractelement instructions with constant indices from fixed
3825 /// vector type and gather such instructions into a bunch, which highly likely
3826 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3827 /// was successful, the matched scalars are replaced by poison values in \p VL
3828 /// for future analysis.
3829 std::optional<TargetTransformInfo::ShuffleKind>
3830 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3831 SmallVectorImpl<int> &Mask) const;
3832
3833 /// Tries to find extractelement instructions with constant indices from fixed
3834 /// vector type and gather such instructions into a bunch, which highly likely
3835 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3836 /// was successful, the matched scalars are replaced by poison values in \p VL
3837 /// for future analysis.
3838 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3839 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3840 SmallVectorImpl<int> &Mask,
3841 unsigned NumParts) const;
3842
3843 /// Checks if the gathered \p VL can be represented as a single register
3844 /// shuffle(s) of previous tree entries.
3845 /// \param TE Tree entry checked for permutation.
3846 /// \param VL List of scalars (a subset of the TE scalar), checked for
3847 /// permutations. Must form single-register vector.
3848 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3849 /// commands to build the mask using the original vector value, without
3850 /// relying on the potential reordering.
3851 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3852 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3853 std::optional<TargetTransformInfo::ShuffleKind>
3854 isGatherShuffledSingleRegisterEntry(
3855 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3856 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3857 bool ForOrder);
3858
3859 /// Checks if the gathered \p VL can be represented as multi-register
3860 /// shuffle(s) of previous tree entries.
3861 /// \param TE Tree entry checked for permutation.
3862 /// \param VL List of scalars (a subset of the TE scalar), checked for
3863 /// permutations.
3864 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3865 /// commands to build the mask using the original vector value, without
3866 /// relying on the potential reordering.
3867 /// \returns per-register series of ShuffleKind, if gathered values can be
3868 /// represented as shuffles of previous tree entries. \p Mask is filled with
3869 /// the shuffle mask (also on per-register base).
3870 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3871 isGatherShuffledEntry(
3872 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3873 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3874 unsigned NumParts, bool ForOrder = false);
3875
3876 /// \returns the cost of gathering (inserting) the values in \p VL into a
3877 /// vector.
3878 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3879 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3880 Type *ScalarTy) const;
3881
3882 /// Set the Builder insert point to one after the last instruction in
3883 /// the bundle
3884 void setInsertPointAfterBundle(const TreeEntry *E);
3885
3886 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3887 /// specified, the starting vector value is poison.
3888 Value *
3889 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3890 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3891
3892 /// \returns whether the VectorizableTree is fully vectorizable and will
3893 /// be beneficial even the tree height is tiny.
3894 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3895
3896 /// Run through the list of all gathered loads in the graph and try to find
3897 /// vector loads/masked gathers instead of regular gathers. Later these loads
3898 /// are reshufled to build final gathered nodes.
3899 void tryToVectorizeGatheredLoads(
3900 const SmallMapVector<
3901 std::tuple<BasicBlock *, Value *, Type *>,
3902 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3903 &GatheredLoads);
3904
3905 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3906 /// users of \p TE and collects the stores. It returns the map from the store
3907 /// pointers to the collected stores.
3908 SmallVector<SmallVector<StoreInst *>>
3909 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3910
3911 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3912 /// stores in \p StoresVec can form a vector instruction. If so it returns
3913 /// true and populates \p ReorderIndices with the shuffle indices of the
3914 /// stores when compared to the sorted vector.
3915 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3916 OrdersType &ReorderIndices) const;
3917
3918 /// Iterates through the users of \p TE, looking for scalar stores that can be
3919 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3920 /// their order and builds an order index vector for each store bundle. It
3921 /// returns all these order vectors found.
3922 /// We run this after the tree has formed, otherwise we may come across user
3923 /// instructions that are not yet in the tree.
3924 SmallVector<OrdersType, 1>
3925 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3926
3927 /// Tries to reorder the gathering node for better vectorization
3928 /// opportunities.
3929 void reorderGatherNode(TreeEntry &TE);
3930
3931 /// Checks if the tree represents disjoint or reduction of shl(zext, (0, 8,
3932 /// .., 56))-like pattern.
3933 /// If the int shifts unique, also strided, but not ordered, sets \p Order.
3934 /// If the node can be represented as a bitcast + bswap, sets \p IsBSwap.
3935 /// If the root nodes are loads, sets \p ForLoads to true.
3936 bool matchesShlZExt(const TreeEntry &TE, OrdersType &Order, bool &IsBSwap,
3937 bool &ForLoads) const;
3938
3939 /// Checks if the \p SelectTE matches zext+selects, which can be inversed for
3940 /// better codegen in case like zext (icmp ne), select (icmp eq), ....
3941 bool matchesInversedZExtSelect(
3942 const TreeEntry &SelectTE,
3943 SmallVectorImpl<unsigned> &InversedCmpsIndices) const;
3944
3945 /// Checks if the tree is reduction or of bit selects, like select %cmp, <1,
3946 /// 2, 4, 8, ..>, zeroinitializer, which can be reduced just to a bitcast %cmp
3947 /// to in.
3948 bool matchesSelectOfBits(const TreeEntry &SelectTE) const;
3949
  /// Represents a single node of the vectorizable tree: the bundle of
  /// scalars to be vectorized together, its vectorization state, its
  /// operands, and the reorder/reuse masks that describe how the scalars
  /// map onto the final vector value.
  class TreeEntry {
  public:
    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns Common mask for reorder indices and reused scalars.
    SmallVector<int> getCommonMask() const {
      // Split nodes do not carry a combined reorder/reuse mask.
      if (State == TreeEntry::SplitVectorize)
        return {};
      SmallVector<int> Mask;
      inversePermutation(Indices: ReorderIndices, Mask);
      ::addMask(Mask, SubMask: ReuseShuffleIndices);
      return Mask;
    }

    /// \returns The mask for split nodes.
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      // The common VF is the larger of the two sub-node sizes; the offset of
      // the second sub-node within the combined vector is recorded in
      // CombinedEntriesWithIndices.back().second.
      unsigned CommonVF = std::max<unsigned>(
          a: CombinedEntriesWithIndices.back().second,
          b: Scalars.size() - CombinedEntriesWithIndices.back().second);
      // Scale accounts for scalars that are themselves vectors (number of
      // elements per scalar).
      const unsigned Scale = getNumElements(Ty: Scalars.front()->getType());
      CommonVF *= Scale;
      SmallVector<int> Mask(getVectorFactor() * Scale, PoisonMaskElem);
      for (auto [Idx, I] : enumerate(First: ReorderIndices)) {
        for (unsigned K : seq<unsigned>(Size: Scale)) {
          // Lanes belonging to the second sub-node are shifted past the end
          // of the first sub-node's lanes within the combined vector.
          Mask[Scale * I + K] =
              Scale * Idx + K +
              (Idx >= CombinedEntriesWithIndices.back().second
                   ? CommonVF - CombinedEntriesWithIndices.back().second * Scale
                   : 0);
        }
      }
      return Mask;
    }

    /// Updates (reorders) SplitVectorize node according to the given mask \p
    /// Mask and order \p MaskOrder.
    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      // Compares VL against Scalars, optionally remapped through Mask.
      // Undef values in VL are allowed to match poison mask lanes.
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(first1: VL.begin(), last1: VL.end(), first2: Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(first1: VL.begin(), last1: VL.end(), first2: Mask.begin(),
                          binary_pred: [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(Val: V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching if the nodes are just reordered, still can
        // treat the vector as the same if the list of scalars matches VL
        // directly, without reordering.
        SmallVector<int> Mask;
        inversePermutation(Indices: ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, SubMask: ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }

    /// \returns true if current entry has same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      // Greedy matching of operands in any order: each operand of TE must be
      // matched by a distinct, not-yet-used operand of this entry.
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(Idx: K))
            continue;
          if (getOperand(OpIdx: K) == TE.getOperand(OpIdx: I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }

    /// \return Final vectorization factor for the node. Defined by the total
    /// number of vectorized scalars, including those, used several times in the
    /// entry and counted in the \a ReuseShuffleIndices, if any.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    };

    /// Checks if the current node is a gather node.
    bool isGather() const { return State == NeedToGather; }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    WeakTrackingVH VectorizedValue = nullptr;

    /// Do we need to gather this sequence or vectorize it
    /// (either with vector instruction or with scatter/gather
    /// intrinsics for store/load)?
    enum EntryState {
      Vectorize,         ///< The node is regularly vectorized.
      ScatterVectorize,  ///< Masked scatter/gather node.
      StridedVectorize,  ///< Strided loads (and stores)
      CompressVectorize, ///< (Masked) load with compress.
      NeedToGather,      ///< Gather/buildvector node.
      CombinedVectorize, ///< Vectorized node, combined with its user into more
                         ///< complex node like select/cmp to minmax, mul/add to
                         ///< fma, etc. Must be used for the following nodes in
                         ///< the pattern, not the very first one.
      SplitVectorize,    ///< Splits the node into 2 subnodes, vectorizes them
                         ///< independently and then combines back.
    };
    EntryState State;

    /// List of combined opcodes supported by the vectorizer.
    enum CombinedOpcode {
      NotCombinedOp = -1,
      MinMax = Instruction::OtherOpsEnd + 1,
      FMulAdd,
      ReducedBitcast,
      ReducedBitcastBSwap,
      ReducedBitcastLoads,
      ReducedBitcastBSwapLoads,
      ReducedCmpBitcast,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    /// Does this sequence require some shuffling?
    SmallVector<int, 4> ReuseShuffleIndices;

    /// Does this entry require reordering?
    SmallVector<unsigned, 4> ReorderIndices;

    /// Points back to the VectorizableTree.
    ///
    /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
    /// to be a pointer and needs to be able to initialize the child iterator.
    /// Thus we need a reference back to the container to translate the indices
    /// to entries.
    VecTreeTy &Container;

    /// The TreeEntry index containing the user of this entry.
    EdgeInfo UserTreeIndex;

    /// The index of this treeEntry in VectorizableTree.
    unsigned Idx = 0;

    /// For gather/buildvector/alt opcode nodes, which are combined from
    /// other nodes as a series of insertvector instructions.
    SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;

  private:
    /// The operands of each instruction in each lane Operands[op_index][lane].
    /// Note: This helps avoid the replication of the code that performs the
    /// reordering of operands during buildTreeRec() and vectorizeTree().
    SmallVector<ValueList, 2> Operands;

    /// Copyable elements of the entry node.
    SmallPtrSet<const Value *, 4> CopyableElements;

    /// MainOp and AltOp are recorded inside. S should be obtained from
    /// newTreeEntry.
    InstructionsState S = InstructionsState::invalid();

    /// Interleaving factor for interleaved loads Vectorize nodes.
    unsigned InterleaveFactor = 0;

    /// True if the node does not require scheduling.
    bool DoesNotNeedToSchedule = false;

    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(N: OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(N: OpVL.size());
      copy(Range&: OpVL, Out: Operands[OpIdx].begin());
    }

  public:
    /// Returns interleave factor for interleave nodes.
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    /// Sets interleaving factor for the interleaving nodes.
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

    /// Marks the node as one that does not require scheduling.
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    /// Returns true if the node is marked as one that does not require
    /// scheduling.
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }

    /// Set this bundle's operands from \p Operands.
    void setOperands(ArrayRef<ValueList> Operands) {
      for (unsigned I : seq<unsigned>(Size: Operands.size()))
        setOperand(OpIdx: I, OpVL: Operands[I]);
    }

    /// Reorders operands of the node to the given mask \p Mask.
    void reorderOperands(ArrayRef<int> Mask) {
      for (ValueList &Operand : Operands)
        reorderScalars(Scalars&: Operand, Mask);
    }

    /// \returns the \p OpIdx operand of this TreeEntry.
    ValueList &getOperand(unsigned OpIdx) {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the \p OpIdx operand of this TreeEntry.
    ArrayRef<Value *> getOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \return the single \p OpIdx operand.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }

    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return S.isAltShuffle(); }

    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
    }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
    /// \p OpValue.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Val: Op);
      if (I && getMatchingMainOpOrAltOp(I))
        return Op;
      return S.getMainOp();
    }

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
      this->S = S;
    }

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    bool hasState() const { return S.valid(); }

    /// Add \p V to the list of copyable elements.
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(Ptr: V);
    }

    /// Returns true if \p V is a copyable element.
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(Ptr: V);
    }

    /// Returns true if any scalar in the list is a copyable element.
    bool hasCopyableElements() const { return !CopyableElements.empty(); }

    /// Returns the state of the operations.
    const InstructionsState &getOperations() const { return S; }

    /// When ReuseReorderShuffleIndices is empty it just returns position of \p
    /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      // Scan all occurrences of V in Scalars: the first one that survives
      // remapping through ReorderIndices/ReuseShuffleIndices wins.
      for (auto *It = find(Range: Scalars, Val: V), *End = Scalars.end(); It != End;
           std::advance(i&: It, n: 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(first: Scalars.begin(), last: It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        if (auto *RIt = find(Range: ReuseShuffleIndices, Val: FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(first: ReuseShuffleIndices.begin(), last: RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }

    /// Build a shuffle mask for graph entry which represents a merge of main
    /// and alternate operations.
    void
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;

    /// Return true if this is a non-power-of-2 node.
    bool isNonPowOf2Vec() const {
      bool IsNonPowerOf2 = !has_single_bit(Value: Scalars.size());
      return IsNonPowerOf2;
    }

    /// Return true if this is a node, which tries to vectorize number of
    /// elements, forming whole vectors.
    bool
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
          TTI, Ty: getValueType(V: Scalars.front()), Sz: Scalars.size());
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }

    /// \returns the scalar at position \p Idx after applying the reorder
    /// permutation (or the raw scalar when no reordering is recorded).
    Value *getOrdered(unsigned Idx) const {
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      inversePermutation(Indices: ReorderIndices, Mask);
      return Scalars[Mask[Idx]];
    }

#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
      switch (State) {
      case Vectorize:
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
                 << "\n";
        } else {
          dbgs() << "Vectorize\n";
        }
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        break;
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        break;
      }
      if (S) {
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
      } else {
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      }
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndex: ";
      if (UserTreeIndex)
        dbgs() << UserTreeIndex;
      else
        dbgs() << "<invalid>";
      dbgs() << "\n";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
        interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
        });
        dbgs() << "\n";
      }
    }
#endif
  };
4383
4384#ifndef NDEBUG
4385 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4386 InstructionCost VecCost, InstructionCost ScalarCost,
4387 StringRef Banner) const {
4388 dbgs() << "SLP: " << Banner << ":\n";
4389 E->dump();
4390 dbgs() << "SLP: Costs:\n";
4391 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4392 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4393 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4394 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4395 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4396 }
4397#endif
4398
4399 /// Create a new gather TreeEntry
4400 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4401 const InstructionsState &S,
4402 const EdgeInfo &UserTreeIdx,
4403 ArrayRef<int> ReuseShuffleIndices = {}) {
4404 auto Invalid = ScheduleBundle::invalid();
4405 return newTreeEntry(VL, Bundle&: Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4406 }
4407
4408 /// Create a new VectorizableTree entry.
4409 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4410 const InstructionsState &S,
4411 const EdgeInfo &UserTreeIdx,
4412 ArrayRef<int> ReuseShuffleIndices = {},
4413 ArrayRef<unsigned> ReorderIndices = {},
4414 unsigned InterleaveFactor = 0) {
4415 TreeEntry::EntryState EntryState =
4416 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4417 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4418 ReuseShuffleIndices, ReorderIndices);
4419 if (E && InterleaveFactor > 0)
4420 E->setInterleave(InterleaveFactor);
4421 return E;
4422 }
4423
  /// Create a new VectorizableTree entry with the explicit \p EntryState and
  /// register it in all the side tables (scalar-to-entry maps, operand map,
  /// gather maps, scheduler bundle). Returns nullptr when a node must not be
  /// created (re-gathered gathered loads).
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    // Only gather and split nodes may come without a schedule bundle.
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    // Gathered loads still gathered? Do not create entry, use the original one.
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
      return nullptr;
    VectorizableTree.push_back(Elt: std::make_unique<TreeEntry>(args&: VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    // Remember which (user node, operand index) pair this entry implements.
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          Key: std::make_pair(x: UserTreeIdx.UserTE, y: UserTreeIdx.EdgeIdx), Args&: Last);
    // FIXME: Remove once support for ReuseShuffleIndices has been implemented
    // for non-power-of-two vectors.
    assert(
        (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
         ReuseShuffleIndices.empty()) &&
        "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(in_start: ReuseShuffleIndices.begin(),
                                     in_end: ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(in_start: VL.begin(), in_end: VL.end());
      if (S)
        Last->setOperations(S);
    } else {
      // Reorder scalars and build final mask.
      Last->Scalars.assign(NumElts: VL.size(), Elt: nullptr);
      transform(Range&: ReorderIndices, d_first: Last->Scalars.begin(),
                F: [VL](unsigned Idx) -> Value * {
                  // Out-of-range reorder indices become undef padding lanes.
                  if (Idx >= VL.size())
                    return UndefValue::get(T: VL.front()->getType());
                  return VL[Idx];
                });
      // Recompute the instructions state on the reordered scalars.
      InstructionsState S = getSameOpcode(VL: Last->Scalars, TLI: *TLI);
      if (S)
        Last->setOperations(S);
      Last->ReorderIndices.append(in_start: ReorderIndices.begin(), in_end: ReorderIndices.end());
    }
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      // Associate every (unique) scalar instruction with this split node.
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto *I = dyn_cast<Instruction>(Val: V);
        if (!I)
          continue;
        auto It = ScalarsInSplitNodes.find(Val: V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(Key: V).first->getSecond().push_back(Elt: Last);
          (void)Processed.insert(Ptr: V);
        } else if (Processed.insert(Ptr: V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Elt: Last);
        }
      }
    } else if (!Last->isGather()) {
      // Vectorized node: decide whether it needs scheduling at all.
      if (isa<PHINode>(Val: S.getMainOp()) ||
          isVectorLikeInstWithConstOps(V: S.getMainOp()) ||
          (!S.areInstructionsWithCopyableElements() &&
           doesNotNeedToSchedule(VL)) ||
          all_of(Range&: VL, P: [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      // Associate every (unique, non-poison, non-copyable) scalar with this
      // vectorized node.
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (isa<PoisonValue>(Val: V))
          continue;
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
          continue;
        }
        auto It = ScalarToTreeEntries.find(Val: V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(Key: V).first->getSecond().push_back(Elt: Last);
          (void)Processed.insert(Ptr: V);
        } else if (Processed.insert(Ptr: V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Elt: Last);
        }
      }
      // Update the scheduler bundle to point to this TreeEntry.
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        // Verify the bundle has exactly one member per unique schedulable
        // scalar.
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
            continue;
          ++BundleMember;
        }
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
#endif
        Bundle.setTreeEntry(Last);
      }
    } else {
      // Build a map for gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(Val: V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(Key: V).first->getSecond().insert(X: Last);
        }
      }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(x: std::numeric_limits<unsigned>::max(), y: 1);
      MustGather.insert_range(R&: VL);
    }

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
    return Last;
  }
4559
4560 /// -- Vectorization State --
4561 /// Holds all of the tree entries.
4562 TreeEntry::VecTreeTy VectorizableTree;
4563
4564#ifndef NDEBUG
4565 /// Debug printer.
4566 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4567 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4568 VectorizableTree[Id]->dump();
4569 if (TransformedToGatherNodes.contains(VectorizableTree[Id].get()))
4570 dbgs() << "[[TRANSFORMED TO GATHER]]";
4571 else if (DeletedNodes.contains(VectorizableTree[Id].get()))
4572 dbgs() << "[[DELETED NODE]]";
4573 dbgs() << "\n";
4574 }
4575 }
4576#endif
4577
4578 /// Get list of vector entries, associated with the value \p V.
4579 ArrayRef<TreeEntry *> getTreeEntries(const Value *V) const {
4580 assert(V && "V cannot be nullptr.");
4581 auto It = ScalarToTreeEntries.find(Val: V);
4582 if (It == ScalarToTreeEntries.end())
4583 return {};
4584 return It->getSecond();
4585 }
4586
4587 /// Get list of split vector entries, associated with the value \p V.
4588 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4589 assert(V && "V cannot be nullptr.");
4590 auto It = ScalarsInSplitNodes.find(Val: V);
4591 if (It == ScalarsInSplitNodes.end())
4592 return {};
4593 return It->getSecond();
4594 }
4595
4596 /// Returns first vector node for value \p V, matching values \p VL.
4597 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4598 bool SameVF = false) const {
4599 assert(V && "V cannot be nullptr.");
4600 for (TreeEntry *TE : ScalarToTreeEntries.lookup(Val: V))
4601 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4602 return TE;
4603 return nullptr;
4604 }
4605
4606 /// Contains all the outputs of legality analysis for a list of values to
4607 /// vectorize.
4608 class ScalarsVectorizationLegality {
4609 InstructionsState S;
4610 bool IsLegal;
4611 bool TryToFindDuplicates;
4612 bool TrySplitVectorize;
4613
4614 public:
4615 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4616 bool TryToFindDuplicates = true,
4617 bool TrySplitVectorize = false)
4618 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4619 TrySplitVectorize(TrySplitVectorize) {
4620 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4621 "Inconsistent state");
4622 }
4623 const InstructionsState &getInstructionsState() const { return S; };
4624 bool isLegal() const { return IsLegal; }
4625 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4626 bool trySplitVectorize() const { return TrySplitVectorize; }
4627 };
4628
  /// Checks if the specified list of the instructions/values can be vectorized
  /// in general.
  ScalarsVectorizationLegality
  getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
                                  const EdgeInfo &UserTreeIdx,
                                  bool TryCopyableElementsVectorization) const;

  /// Checks if the specified list of the instructions/values can be vectorized
  /// and fills required data before actual scheduling of the instructions.
  TreeEntry::EntryState getScalarsVectorizationState(
      const InstructionsState &S, ArrayRef<Value *> VL,
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);

  /// Maps a specific scalar to its tree entry(ies).
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;

  /// List of deleted non-profitable nodes.
  SmallPtrSet<const TreeEntry *, 8> DeletedNodes;

  /// List of nodes, transformed to gathered, with their conservative
  /// gather/buildvector cost estimation.
  SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;

  /// Maps the (user entry, operand index) pair to the corresponding operand
  /// tree entry.
  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;

  /// Scalars, used in split vectorize nodes.
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;

  /// Maps a value to the proposed vectorizable size.
  SmallDenseMap<Value *, unsigned> InstrElementSize;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;

  /// A set of first non-schedulable values.
  ValueSet NonScheduledFirst;

  /// A map between the vectorized entries and the last instructions in the
  /// bundles. The bundles are built in use order, not in the def order of the
  /// instructions. So, we cannot rely directly on the last instruction in the
  /// bundle being the last instruction in the program order during
  /// vectorization process since the basic blocks are affected, need to
  /// pre-gather them before.
  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;

  /// Keeps the mapping between the last instructions and their insertion
  /// points, which is an instruction-after-the-last-instruction.
  SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;

  /// List of gather nodes, depending on other gather/vector nodes, which should
  /// be emitted after the vector instruction emission process to correctly
  /// handle order of the vector instructions and shuffles.
  SetVector<const TreeEntry *> PostponedGathers;

  /// Maps a gathered value to the gather nodes it participates in.
  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  /// A list of the load entries (node indices), which can be vectorized using
  /// strided or masked gather approach, but attempted to be represented as
  /// contiguous loads.
  SetVector<unsigned> LoadEntriesToVectorize;

  /// true if graph nodes transforming mode is on.
  bool IsGraphTransformMode = false;

  /// The index of the first gathered load entry in the VectorizeTree.
  std::optional<unsigned> GatheredLoadsEntriesFirst;

  /// Maps compress entries to their mask data for the final codegen.
  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;
4705
4706 /// This POD struct describes one external user in the vectorized tree.
4707 struct ExternalUser {
4708 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4709 : Scalar(S), User(U), E(E), Lane(L) {}
4710
4711 /// Which scalar in our function.
4712 Value *Scalar = nullptr;
4713
4714 /// Which user that uses the scalar.
4715 llvm::User *User = nullptr;
4716
4717 /// Vector node, the value is part of.
4718 const TreeEntry &E;
4719
4720 /// Which lane does the scalar belong to.
4721 unsigned Lane;
4722 };
4723 using UserList = SmallVector<ExternalUser, 16>;
4724
4725 /// Checks if two instructions may access the same memory.
4726 ///
4727 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4728 /// is invariant in the calling loop.
4729 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4730 Instruction *Inst2) {
4731 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4732 // First check if the result is already in the cache.
4733 AliasCacheKey Key = std::make_pair(x&: Inst1, y&: Inst2);
4734 auto Res = AliasCache.try_emplace(Key);
4735 if (!Res.second)
4736 return Res.first->second;
4737 bool Aliased = isModOrRefSet(MRI: BatchAA.getModRefInfo(I: Inst2, OptLoc: Loc1));
4738 // Store the result in the cache.
4739 Res.first->getSecond() = Aliased;
4740 return Aliased;
4741 }
4742
  /// Key type for the alias-query cache: the pair of instructions queried.
  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  SmallDenseMap<AliasCacheKey, bool> AliasCache;

  // Cache for pointerMayBeCaptured calls inside AA. This is preserved
  // globally through SLP because we don't perform any action which
  // invalidates capture results.
  BatchAAResults BatchAA;

  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed. The deferral is required to
  /// ensure that there are no incorrect collisions in the AliasCache, which
  /// can happen if a new instruction is allocated at the same address as a
  /// previously deleted instruction.
  DenseSet<Instruction *> DeletedInstructions;

  /// Set of the instruction, being analyzed already for reductions.
  SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;

  /// Set of hashes for the list of reduction values already being analyzed.
  DenseSet<size_t> AnalyzedReductionVals;

  /// Values, already been analyzed for minimal bitwidth and found to be
  /// non-profitable.
  DenseSet<Value *> AnalyzedMinBWVals;

  /// A list of values that need to extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User). External User
  /// can be nullptr, it means that this Internal Scalar will be used later,
  /// after vectorization.
  UserList ExternalUses;

  /// A list of GEPs which can be replaced by scalar GEPs instead of
  /// extractelement instructions.
  SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;

  /// A list of scalars to be extracted without a specific user because of too
  /// many uses.
  SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered, shuffle instructions and
  /// extractelements.
  SetVector<Instruction *> GatherShuffleExtractSeq;

  /// A list of blocks that we are going to CSE.
  DenseSet<BasicBlock *> CSEBlocks;

  /// List of hashes of vector of loads, which are known to be non vectorizable.
  DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4797
  /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
  /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
  /// single instruction, while ScheduleBundle represents a batch of
  /// instructions, going to be grouped together. ScheduleCopyableData models
  /// an extra user for "copyable" instructions.
  class ScheduleEntity {
    friend class ScheduleBundle;
    friend class ScheduleData;
    friend class ScheduleCopyableData;

  protected:
    /// Discriminator for LLVM-style RTTI (isa/dyn_cast over ScheduleEntity).
    enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
    Kind getKind() const { return K; }
    ScheduleEntity(Kind K) : K(K) {}

  private:
    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;
    /// True if this instruction (or bundle) is scheduled (or considered as
    /// scheduled in the dry-run).
    bool IsScheduled = false;
    /// The kind of the ScheduleEntity.
    const Kind K = Kind::ScheduleData;

  public:
    ScheduleEntity() = delete;
    /// Gets/sets the scheduling priority.
    void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
    int getSchedulingPriority() const { return SchedulingPriority; }
    /// Returns true if the entity is ready for scheduling. Dispatches to the
    /// concrete subclass implementation.
    bool isReady() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->isReady();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: this))
        return CD->isReady();
      return cast<ScheduleBundle>(Val: this)->isReady();
    }
    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->hasValidDependencies();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: this))
        return CD->hasValidDependencies();
      return cast<ScheduleBundle>(Val: this)->hasValidDependencies();
    }
    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->getUnscheduledDeps();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: this))
        return CD->getUnscheduledDeps();
      return cast<ScheduleBundle>(Val: this)->unscheduledDepsInBundle();
    }
    /// Increments the number of unscheduled dependencies.
    /// Note: bundles are not supported here, only the per-instruction kinds.
    int incrementUnscheduledDeps(int Incr) {
      if (auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->incrementUnscheduledDeps(Incr);
      return cast<ScheduleCopyableData>(Val: this)->incrementUnscheduledDeps(Incr);
    }
    /// Gets the number of dependencies.
    int getDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->getDependencies();
      return cast<ScheduleCopyableData>(Val: this)->getDependencies();
    }
    /// Gets the instruction.
    Instruction *getInst() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->getInst();
      return cast<ScheduleCopyableData>(Val: this)->getInst();
    }

    /// Gets/sets if the bundle is scheduled.
    bool isScheduled() const { return IsScheduled; }
    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }

    /// Every subclass is a ScheduleEntity; subclasses refine classof.
    static bool classof(const ScheduleEntity *) { return true; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    /// Prints the entity via the concrete subclass printer.
    void dump(raw_ostream &OS) const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->dump(OS);
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->dump(OS);
      return cast<ScheduleBundle>(this)->dump(OS);
    }

    LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
      dbgs() << '\n';
    }
#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  };
4892
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Streams a ScheduleEntity by delegating to its dump method.
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleEntity &SE) {
    SE.dump(OS);
    return OS;
  }
#endif
4900
  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  class ScheduleData final : public ScheduleEntity {
  public:
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleData;
    }

    /// (Re)initializes this node for the instruction \p I within the
    /// scheduling region \p BlockSchedulingRegionID; drops any previously
    /// calculated dependencies.
    void init(int BlockSchedulingRegionID, Instruction *I) {
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      Inst = I;
    }

    /// Verify basic self consistency properties
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        // Before calculation, both counters hold InvalidDeps.
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 &&
             "Expected valid number of unscheduled deps");
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Clears all dependency information.
    void clearDependencies() {
      clearDirectDependencies();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    /// Clears all direct dependencies only, except for control and memory
    /// dependencies.
    /// Required for copyable elements to correctly handle control/memory deps
    /// and avoid extra recalculation of such deps.
    void clearDirectDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      IsScheduled = false;
    }

    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const { return UnscheduledDeps; }
    /// Gets the number of dependencies.
    int getDependencies() const { return Dependencies; }
    /// Initializes the number of dependencies.
    void initDependencies() { Dependencies = 0; }
    /// Increments the number of dependencies.
    void incDependencies() { Dependencies++; }

    /// Gets scheduling region ID.
    int getSchedulingRegionID() const { return SchedulingRegionID; }

    /// Gets the instruction.
    Instruction *getInst() const { return Inst; }

    /// Gets the list of memory dependencies.
    ArrayRef<ScheduleData *> getMemoryDependencies() const {
      return MemoryDependencies;
    }
    /// Adds a memory dependency.
    void addMemoryDependency(ScheduleData *Dep) {
      MemoryDependencies.push_back(Elt: Dep);
    }
    /// Gets the list of control dependencies.
    ArrayRef<ScheduleData *> getControlDependencies() const {
      return ControlDependencies;
    }
    /// Adds a control dependency.
    void addControlDependency(ScheduleData *Dep) {
      ControlDependencies.push_back(Elt: Dep);
    }
    /// Gets/sets the next load/store instruction in the block.
    ScheduleData *getNextLoadStore() const { return NextLoadStore; }
    void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }

    /// Prints the associated instruction.
    void dump(raw_ostream &OS) const { OS << *Inst; }

    LLVM_DUMP_METHOD void dump() const {
      dump(OS&: dbgs());
      dbgs() << '\n';
    }

  private:
    /// The instruction this scheduling data is attached to.
    Instruction *Inst = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *> MemoryDependencies;

    /// List of instructions which this instruction could be control dependent
    /// on. Allowing such nodes to be scheduled below this one could introduce
    /// a runtime fault which didn't exist in the original program.
    /// ex: this is a load or udiv following a readonly call which inf loops
    SmallVector<ScheduleData *> ControlDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// The number of dependencies. Constitutes of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps = InvalidDeps;
  };
5055
#ifndef NDEBUG
  /// Streams a ScheduleData by delegating to its dump method.
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(OS);
    return OS;
  }
#endif
5063
  /// A batch of scheduling entities (instructions and/or copyable data) that
  /// is scheduled as a single unit and corresponds to one tree entry.
  class ScheduleBundle final : public ScheduleEntity {
    /// The schedule data for the instructions in the bundle.
    SmallVector<ScheduleEntity *> Bundle;
    /// True if this bundle is valid.
    bool IsValid = true;
    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;
    /// Private: invalid bundles are created only via invalid().
    ScheduleBundle(bool IsValid)
        : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}

  public:
    ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleBundle;
    }

    /// Verify basic self consistency properties
    void verify() const {
      for (const ScheduleEntity *SD : Bundle) {
        if (SD->hasValidDependencies()) {
          assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
                 "invariant");
        } else {
          assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
                 "invariant");
        }

        if (isScheduled()) {
          assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
                 "unexpected scheduled state");
        }
      }
    }

    /// Returns the number of unscheduled dependencies in the bundle.
    /// Returns InvalidDeps as soon as any member has uncalculated deps.
    int unscheduledDepsInBundle() const {
      assert(*this && "bundle must not be empty");
      int Sum = 0;
      for (const ScheduleEntity *BundleMember : Bundle) {
        if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
          return ScheduleData::InvalidDeps;
        Sum += BundleMember->getUnscheduledDeps();
      }
      return Sum;
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const {
      return all_of(Range: Bundle, P: [](const ScheduleEntity *SD) {
        return SD->hasValidDependencies();
      });
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(*this && "bundle must not be empty");
      return unscheduledDepsInBundle() == 0 && !isScheduled();
    }

    /// Returns the bundle of scheduling data, associated with the current
    /// instruction.
    ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
    ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
    /// Adds an instruction to the bundle.
    void add(ScheduleEntity *SD) { Bundle.push_back(Elt: SD); }

    /// Gets/sets the associated tree entry.
    void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
    TreeEntry *getTreeEntry() const { return TE; }

    /// Creates a marker bundle for "bundle could not be formed".
    static ScheduleBundle invalid() { return {false}; }

    /// True if this bundle is valid (not the invalid() marker).
    operator bool() const { return IsValid; }

#ifndef NDEBUG
    void dump(raw_ostream &OS) const {
      if (!*this) {
        OS << "[]";
        return;
      }
      OS << '[';
      interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
        if (isa<ScheduleCopyableData>(SD))
          OS << "<Copyable>";
        OS << *SD->getInst();
      });
      OS << ']';
    }

    LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
      dbgs() << '\n';
    }
#endif // NDEBUG
  };
5162
#ifndef NDEBUG
  /// Streams a ScheduleBundle by delegating to its dump method.
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleBundle &Bundle) {
    Bundle.dump(OS);
    return OS;
  }
#endif
5170
  /// Contains all scheduling relevant data for the copyable instruction.
  /// It models the virtual instructions, supposed to replace the original
  /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
  /// %1], where %1 = add, then the ScheduleCopyableData models virtual
  /// instruction %virt = add %0, 0.
  class ScheduleCopyableData final : public ScheduleEntity {
    /// The source schedule data for the instruction.
    Instruction *Inst = nullptr;
    /// The edge information for the instruction.
    const EdgeInfo EI;
    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;
    /// Bundle, this data is part of.
    ScheduleBundle &Bundle;

  public:
    ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
                         const EdgeInfo &EI, ScheduleBundle &Bundle)
        : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
          SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleCopyableData;
    }

    /// Verify basic self consistency properties
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        // Before calculation, both counters hold InvalidDeps.
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const {
      return Dependencies != ScheduleData::InvalidDeps;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 && "invariant");
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const { return UnscheduledDeps; }
    /// Gets the number of dependencies.
    int getDependencies() const { return Dependencies; }
    /// Initializes the number of dependencies.
    void initDependencies() { Dependencies = 0; }
    /// Increments the number of dependencies.
    void incDependencies() { Dependencies++; }

    /// Gets scheduling region ID.
    int getSchedulingRegionID() const { return SchedulingRegionID; }

    /// Gets the instruction.
    Instruction *getInst() const { return Inst; }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = ScheduleData::InvalidDeps;
      UnscheduledDeps = ScheduleData::InvalidDeps;
      IsScheduled = false;
    }

    /// Gets the edge information.
    const EdgeInfo &getEdgeInfo() const { return EI; }

    /// Gets the bundle.
    ScheduleBundle &getBundle() { return Bundle; }
    const ScheduleBundle &getBundle() const { return Bundle; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }

    LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
      dbgs() << '\n';
    }
#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

  private:
    /// The number of dependencies; InvalidDeps while not yet calculated.
    /// These nodes always have only a single dependency.
    int Dependencies = ScheduleData::InvalidDeps;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps = ScheduleData::InvalidDeps;
  };
5285
#ifndef NDEBUG
  /// Streams a ScheduleCopyableData by delegating to its dump method.
  friend inline raw_ostream &
  operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
    SD.dump(OS);
    return OS;
  }
#endif
5293
5294 friend struct GraphTraits<BoUpSLP *>;
5295 friend struct DOTGraphTraits<BoUpSLP *>;
5296
5297 /// Contains all scheduling data for a basic block.
  /// It does not schedule instructions that are not memory read/write
  /// instructions and whose operands are either constants, arguments, phis,
  /// or instructions from other blocks, or whose users are phis or belong to
  /// other blocks. The resulting vector instructions can be placed at the
  /// beginning of the basic block without scheduling (if the operands do not
  /// need to be scheduled) or at the end of the block (if the users are
  /// outside of the block). This saves some compile time and memory used by
  /// the compiler.
5306 /// ScheduleData is assigned for each instruction in between the boundaries of
5307 /// the tree entry, even for those, which are not part of the graph. It is
5308 /// required to correctly follow the dependencies between the instructions and
5309 /// their correct scheduling. The ScheduleData is not allocated for the
5310 /// instructions, which do not require scheduling, like phis, nodes with
5311 /// extractelements/insertelements only or nodes with instructions, with
5312 /// uses/operands outside of the block.
5313 struct BlockScheduling {
5314 BlockScheduling(BasicBlock *BB)
5315 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5316
5317 void clear() {
5318 ScheduledBundles.clear();
5319 ScheduledBundlesList.clear();
5320 ScheduleCopyableDataMap.clear();
5321 ScheduleCopyableDataMapByInst.clear();
5322 ScheduleCopyableDataMapByInstUser.clear();
5323 ScheduleCopyableDataMapByUsers.clear();
5324 ReadyInsts.clear();
5325 ScheduleStart = nullptr;
5326 ScheduleEnd = nullptr;
5327 FirstLoadStoreInRegion = nullptr;
5328 LastLoadStoreInRegion = nullptr;
5329 RegionHasStackSave = false;
5330
5331 // Reduce the maximum schedule region size by the size of the
5332 // previous scheduling run.
5333 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5334 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5335 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5336 ScheduleRegionSize = 0;
5337
5338 // Make a new scheduling region, i.e. all existing ScheduleData is not
5339 // in the new region yet.
5340 ++SchedulingRegionID;
5341 }
5342
5343 ScheduleData *getScheduleData(Instruction *I) {
5344 if (!I)
5345 return nullptr;
5346 if (BB != I->getParent())
5347 // Avoid lookup if can't possibly be in map.
5348 return nullptr;
5349 ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
5350 if (SD && isInSchedulingRegion(SD: *SD))
5351 return SD;
5352 return nullptr;
5353 }
5354
5355 ScheduleData *getScheduleData(Value *V) {
5356 return getScheduleData(I: dyn_cast<Instruction>(Val: V));
5357 }
5358
5359 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5360 /// operand number) and value.
5361 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5362 const Value *V) const {
5363 if (ScheduleCopyableDataMap.empty())
5364 return nullptr;
5365 auto It = ScheduleCopyableDataMap.find(Val: std::make_pair(x: EI, y&: V));
5366 if (It == ScheduleCopyableDataMap.end())
5367 return nullptr;
5368 ScheduleCopyableData *SD = It->getSecond().get();
5369 if (!isInSchedulingRegion(SD: *SD))
5370 return nullptr;
5371 return SD;
5372 }
5373
5374 /// Returns the ScheduleCopyableData for the given user \p User, operand
5375 /// number and operand \p V.
5376 SmallVector<ScheduleCopyableData *>
5377 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5378 const Value *V) {
5379 if (ScheduleCopyableDataMapByInstUser.empty())
5380 return {};
5381 const auto It = ScheduleCopyableDataMapByInstUser.find(
5382 Val: std::make_pair(x: std::make_pair(x&: User, y&: OperandIdx), y&: V));
5383 if (It == ScheduleCopyableDataMapByInstUser.end())
5384 return {};
5385 SmallVector<ScheduleCopyableData *> Res;
5386 for (ScheduleCopyableData *SD : It->getSecond()) {
5387 if (isInSchedulingRegion(SD: *SD))
5388 Res.push_back(Elt: SD);
5389 }
5390 return Res;
5391 }
5392
    /// Returns true if all operands of the given instruction \p User are
    /// replaced by copyable data.
    /// \param User The user instruction.
    /// \param Op The operand, which might be replaced by the copyable data.
    /// \param SLP The SLP tree.
    /// \param NumOps The number of operands used. If the instruction uses the
    /// same operand several times, check for the first use, then the second,
    /// etc.
    bool areAllOperandsReplacedByCopyableData(Instruction *User,
                                              Instruction *Op, BoUpSLP &SLP,
                                              unsigned NumOps) const {
      assert(NumOps > 0 && "No operands");
      // Without any copyable data modeled there is nothing to replace.
      if (ScheduleCopyableDataMap.empty())
        return false;
      // Commutative/cmp users are counted here first and checked after the
      // main loop, because their operands may have been reordered.
      SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
      ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(V: User);
      if (Entries.empty())
        return false;
      // Tracks which use of \p Op (first, second, ...) is being examined.
      unsigned CurNumOps = 0;
      for (const Use &U : User->operands()) {
        if (U.get() != Op)
          continue;
        ++CurNumOps;
        // Check all tree entries, if they have operands replaced by copyable
        // data.
        for (TreeEntry *TE : Entries) {
          unsigned Inc = 0;
          bool IsNonSchedulableWithParentPhiNode =
              TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->hasState() &&
              TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
              TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
          // Count the number of unique phi nodes, which are the parent for
          // parent entry, and exit, if all the unique phis are processed.
          if (IsNonSchedulableWithParentPhiNode) {
            SmallPtrSet<Value *, 4> ParentsUniqueUsers;
            const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
            for (Value *V : ParentTE->Scalars) {
              auto *PHI = dyn_cast<PHINode>(Val: V);
              if (!PHI)
                continue;
              if (ParentsUniqueUsers.insert(Ptr: PHI).second &&
                  is_contained(Range: PHI->incoming_values(), Element: User))
                ++Inc;
            }
          } else {
            // Otherwise every occurrence of User in this entry counts.
            Inc = count(Range&: TE->Scalars, Element: User);
          }

          // Check if the user is commutative.
          // The commutatives are handled later, as their operands can be
          // reordered.
          // Same applies even for non-commutative cmps, because we can invert
          // their predicate potentially and, thus, reorder the operands.
          bool IsCommutativeUser =
              ::isCommutative(I: User) &&
              ::isCommutableOperand(I: User, ValWithUses: User, Op: U.getOperandNo());
          if (!IsCommutativeUser) {
            // Also consider the matching main/alternate opcode of the entry:
            // the entry's main op may be commutative even if User is not.
            Instruction *MainOp = TE->getMatchingMainOpOrAltOp(I: User);
            IsCommutativeUser =
                ::isCommutative(I: MainOp, ValWithUses: User) &&
                ::isCommutableOperand(I: MainOp, ValWithUses: User, Op: U.getOperandNo());
          }
          // The commutative user with the same operands can be safely
          // considered as non-commutative, operands reordering does not change
          // the semantics.
          assert(
              (!IsCommutativeUser ||
               (((::isCommutative(User) &&
                  ::isCommutableOperand(User, User, 0) &&
                  ::isCommutableOperand(User, User, 1)) ||
                 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
                                        User, 0) &&
                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
                                        User, 1))))) &&
              "Expected commutative user with 2 first commutable operands");
          bool IsCommutativeWithSameOps =
              IsCommutativeUser && User->getOperand(i: 0) == User->getOperand(i: 1);
          if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
              !isa<CmpInst>(Val: User)) {
            // Non-reorderable use: this exact edge must be modeled by
            // copyable data, otherwise the operand is not fully replaced.
            EdgeInfo EI(TE, U.getOperandNo());
            if (CurNumOps != NumOps || getScheduleCopyableData(EI, V: Op))
              continue;
            return false;
          }
          PotentiallyReorderedEntriesCount.try_emplace(Key: TE, Args: 0)
              .first->getSecond() += Inc;
        }
      }
      if (PotentiallyReorderedEntriesCount.empty())
        return true;
      // Check the commutative/cmp entries.
      for (auto &P : PotentiallyReorderedEntriesCount) {
        SmallPtrSet<Value *, 4> ParentsUniqueUsers;
        bool IsNonSchedulableWithParentPhiNode =
            P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
            P.first->UserTreeIndex.UserTE->hasState() &&
            P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
            P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
        auto *It = find(Range&: P.first->Scalars, Val: User);
        do {
          assert(It != P.first->Scalars.end() &&
                 "User is not in the tree entry");
          int Lane = std::distance(first: P.first->Scalars.begin(), last: It);
          assert(Lane >= 0 && "Lane is not found");
          // Stores keep the reordering in ReorderIndices, remap the lane.
          if (isa<StoreInst>(Val: User) && !P.first->ReorderIndices.empty())
            Lane = P.first->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
                 "Couldn't find extract lane");
          // Count the number of unique phi nodes, which are the parent for
          // parent entry, and exit, if all the unique phis are processed.
          if (IsNonSchedulableWithParentPhiNode) {
            const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
            // NOTE(review): this local 'User' shadows the 'User' parameter,
            // so the find() below searches the scalars for the parent PHI
            // value, not the original user instruction (compare with the
            // analogous code in schedule(), which searches for 'In' here).
            // Confirm the shadowing is intentional.
            Value *User = ParentTE->Scalars[Lane];
            if (!ParentsUniqueUsers.insert(Ptr: User).second) {
              It =
                  find(Range: make_range(x: std::next(x: It), y: P.first->Scalars.end()), Val: User);
              continue;
            }
          }
          // Subtract each commutable-operand position that is modeled by
          // copyable data for this lane.
          for (unsigned OpIdx :
               seq<unsigned>(Size: ::getNumberOfPotentiallyCommutativeOps(
                   I: P.first->getMainOp()))) {
            if (P.first->getOperand(OpIdx)[Lane] == Op &&
                getScheduleCopyableData(EI: EdgeInfo(P.first, OpIdx), V: Op))
              --P.getSecond();
          }
          // If parent node is schedulable, it will be handled correctly.
          It = find(Range: make_range(x: std::next(x: It), y: P.first->Scalars.end()), Val: User);
        } while (It != P.first->Scalars.end());
      }
      return all_of(Range&: PotentiallyReorderedEntriesCount,
                    P: [&](const std::pair<const TreeEntry *, unsigned> &P) {
                      return P.second == NumOps - 1;
                    });
    }
5530
5531 SmallVector<ScheduleCopyableData *>
5532 getScheduleCopyableData(const Instruction *I) const {
5533 if (ScheduleCopyableDataMapByInst.empty())
5534 return {};
5535 const auto It = ScheduleCopyableDataMapByInst.find(Val: I);
5536 if (It == ScheduleCopyableDataMapByInst.end())
5537 return {};
5538 SmallVector<ScheduleCopyableData *> Res;
5539 for (ScheduleCopyableData *SD : It->getSecond()) {
5540 if (isInSchedulingRegion(SD: *SD))
5541 Res.push_back(Elt: SD);
5542 }
5543 return Res;
5544 }
5545
5546 SmallVector<ScheduleCopyableData *>
5547 getScheduleCopyableDataUsers(const Instruction *User) const {
5548 if (ScheduleCopyableDataMapByUsers.empty())
5549 return {};
5550 const auto It = ScheduleCopyableDataMapByUsers.find(Val: User);
5551 if (It == ScheduleCopyableDataMapByUsers.end())
5552 return {};
5553 SmallVector<ScheduleCopyableData *> Res;
5554 for (ScheduleCopyableData *SD : It->getSecond()) {
5555 if (isInSchedulingRegion(SD: *SD))
5556 Res.push_back(Elt: SD);
5557 }
5558 return Res;
5559 }
5560
    /// Registers a new ScheduleCopyableData for instruction \p I, modeled as
    /// a copyable element at edge \p EI, and records it in all the lookup
    /// maps (by (edge, instruction), by instruction, by user, and by
    /// (user, operand index, instruction)).
    /// \returns the created (or cached) copyable data.
    ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
                                                  Instruction *I,
                                                  int SchedulingRegionID,
                                                  ScheduleBundle &Bundle) {
      assert(!getScheduleCopyableData(EI, I) && "already in the map");
      ScheduleCopyableData *CD =
          ScheduleCopyableDataMap
              .try_emplace(Key: std::make_pair(x: EI, y&: I),
                           Args: std::make_unique<ScheduleCopyableData>(
                               args&: SchedulingRegionID, args&: I, args: EI, args&: Bundle))
              .first->getSecond()
              .get();
      ScheduleCopyableDataMapByInst[I].push_back(Elt: CD);
      if (EI.UserTE) {
        // Walk all lanes of the user entry where the operand equals \p I and
        // register CD for each distinct user instruction.
        ArrayRef<Value *> Op = EI.UserTE->getOperand(OpIdx: EI.EdgeIdx);
        const auto *It = find(Range&: Op, Val: I);
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(first: Op.begin(), last: It);
          assert(Lane >= 0 && "Lane not set");
          // Stores keep the reordering in ReorderIndices, remap the lane.
          if (isa<StoreInst>(Val: EI.UserTE->Scalars[Lane]) &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(Val: EI.UserTE->Scalars[Lane]);
          // Each user instruction is registered only once.
          if (!Visited.insert(Ptr: In).second) {
            It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
            continue;
          }
          ScheduleCopyableDataMapByInstUser
              .try_emplace(Key: std::make_pair(x: std::make_pair(x&: In, y: EI.EdgeIdx), y&: I))
              .first->getSecond()
              .push_back(Elt: CD);
          ScheduleCopyableDataMapByUsers.try_emplace(Key: I)
              .first->getSecond()
              .insert(X: CD);
          // Remove extra deps for users, becoming non-immediate users of the
          // instruction. It may happen, if the chain of same copyable elements
          // appears in the tree.
          if (In == I) {
            EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
            if (ScheduleCopyableData *UserCD =
                    getScheduleCopyableData(EI: UserEI, V: In))
              ScheduleCopyableDataMapByUsers[I].remove(X: UserCD);
          }
          It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
        } while (It != Op.end());
      } else {
        // No user entry: the copyable data is its own (only) user.
        ScheduleCopyableDataMapByUsers.try_emplace(Key: I).first->getSecond().insert(
            X: CD);
      }
      return *CD;
    }
5616
5617 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5618 auto *I = dyn_cast<Instruction>(Val: V);
5619 if (!I)
5620 return {};
5621 auto It = ScheduledBundles.find(Val: I);
5622 if (It == ScheduledBundles.end())
5623 return {};
5624 return It->getSecond();
5625 }
5626
5627 /// Returns true if the entity is in the scheduling region.
5628 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5629 if (const auto *Data = dyn_cast<ScheduleData>(Val: &SD))
5630 return Data->getSchedulingRegionID() == SchedulingRegionID;
5631 if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: &SD))
5632 return CD->getSchedulingRegionID() == SchedulingRegionID;
5633 return all_of(Range: cast<ScheduleBundle>(Val: SD).getBundle(),
5634 P: [&](const ScheduleEntity *BundleMember) {
5635 return isInSchedulingRegion(SD: *BundleMember);
5636 });
5637 }
5638
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    /// \param R The SLP tree, used to look up tree entries for instructions.
    /// \param S The state of the instructions (unused directly here, kept for
    /// the interface).
    /// \param EI The edge in the SLP graph or the user node/operand number.
    /// \param Data The entity (single ScheduleData or whole ScheduleBundle)
    /// that has just been scheduled.
    /// \param ReadyList Receives entities that become ready as a result.
    template <typename ReadyListType>
    void schedule(const BoUpSLP &R, const InstructionsState &S,
                  const EdgeInfo &EI, ScheduleEntity *Data,
                  ReadyListType &ReadyList) {
      auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
                                     ArrayRef<ScheduleBundle *> Bundles) {
        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if ready.
        auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
          if ((IsControl || Data->hasValidDependencies()) &&
              Data->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after
            // decrementing, so we can put the dependent instruction
            // into the ready list.
            SmallVector<ScheduleBundle *, 1> CopyableBundle;
            ArrayRef<ScheduleBundle *> Bundles;
            if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
              // Copyable data has a single associated bundle.
              CopyableBundle.push_back(Elt: &CD->getBundle());
              Bundles = CopyableBundle;
            } else {
              Bundles = getScheduleBundles(V: Data->getInst());
            }
            if (!Bundles.empty()) {
              for (ScheduleBundle *Bundle : Bundles) {
                if (Bundle->unscheduledDepsInBundle() == 0) {
                  assert(!Bundle->isScheduled() &&
                         "already scheduled bundle gets ready");
                  ReadyList.insert(Bundle);
                  LLVM_DEBUG(dbgs()
                             << "SLP: gets ready: " << *Bundle << "\n");
                }
              }
              return;
            }
            assert(!Data->isScheduled() &&
                   "already scheduled bundle gets ready");
            assert(!isa<ScheduleCopyableData>(Data) &&
                   "Expected non-copyable data");
            ReadyList.insert(Data);
            LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
          }
        };

        // Decrements the deps of operand \p I of \p User at operand index
        // \p OpIdx. Copyable data modeling this use takes precedence over
        // the plain ScheduleData of \p I.
        auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
                                      Instruction *I) {
          if (!ScheduleCopyableDataMap.empty()) {
            SmallVector<ScheduleCopyableData *> CopyableData =
                getScheduleCopyableData(User, OperandIdx: OpIdx, V: I);
            for (ScheduleCopyableData *CD : CopyableData)
              DecrUnsched(CD, /*IsControl=*/false);
            if (!CopyableData.empty())
              return;
          }
          if (ScheduleData *OpSD = getScheduleData(I))
            DecrUnsched(OpSD, /*IsControl=*/false);
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(). We therefore need to get its operands
        // through the TreeEntry.
        if (!Bundles.empty()) {
          auto *In = BundleMember->getInst();
          // Count uses of each instruction operand.
          SmallDenseMap<const Instruction *, unsigned> OperandsUses;
          unsigned TotalOpCount = 0;
          if (isa<ScheduleCopyableData>(Val: BundleMember)) {
            // Copyable data is used only once (uses itself).
            TotalOpCount = OperandsUses[In] = 1;
          } else {
            for (const Use &U : In->operands()) {
              if (auto *I = dyn_cast<Instruction>(Val: U.get())) {
                auto Res = OperandsUses.try_emplace(Key: I, Args: 0);
                ++Res.first->getSecond();
                ++TotalOpCount;
              }
            }
          }
          // Decrement the unscheduled counter and insert to ready list if
          // ready.
          auto DecrUnschedForInst =
              [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
                  SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
                      &Checked) {
                if (!ScheduleCopyableDataMap.empty()) {
                  const EdgeInfo EI = {UserTE, OpIdx};
                  if (ScheduleCopyableData *CD =
                          getScheduleCopyableData(EI, V: I)) {
                    // Each (entity, operand index) pair is decremented at
                    // most once.
                    if (!Checked.insert(V: std::make_pair(x&: CD, y&: OpIdx)).second)
                      return;
                    DecrUnsched(CD, /*IsControl=*/false);
                    return;
                  }
                }
                auto It = OperandsUses.find(Val: I);
                assert(It != OperandsUses.end() && "Operand not found");
                if (It->second > 0) {
                  if (ScheduleData *OpSD = getScheduleData(I)) {
                    if (!Checked.insert(V: std::make_pair(x&: OpSD, y&: OpIdx)).second)
                      return;
                    --It->getSecond();
                    assert(TotalOpCount > 0 && "No more operands to decrement");
                    --TotalOpCount;
                    DecrUnsched(OpSD, /*IsControl=*/false);
                  } else {
                    // No schedule data, still consume the counted use.
                    --It->getSecond();
                    assert(TotalOpCount > 0 && "No more operands to decrement");
                    --TotalOpCount;
                  }
                }
              };

          SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
          for (ScheduleBundle *Bundle : Bundles) {
            // All counted uses are consumed - nothing left to decrement.
            if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
              break;
            SmallPtrSet<Value *, 4> ParentsUniqueUsers;
            // Need to search for the lane since the tree entry can be
            // reordered.
            auto *It = find(Range&: Bundle->getTreeEntry()->Scalars, Val: In);
            bool IsNonSchedulableWithParentPhiNode =
                Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
                Bundle->getTreeEntry()->UserTreeIndex &&
                Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
                Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
                    TreeEntry::SplitVectorize &&
                Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
                    Instruction::PHI;
            do {
              int Lane =
                  std::distance(first: Bundle->getTreeEntry()->Scalars.begin(), last: It);
              assert(Lane >= 0 && "Lane not set");
              // Stores keep the reordering in ReorderIndices, remap the lane.
              if (isa<StoreInst>(Val: In) &&
                  !Bundle->getTreeEntry()->ReorderIndices.empty())
                Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
              assert(Lane < static_cast<int>(
                                Bundle->getTreeEntry()->Scalars.size()) &&
                     "Couldn't find extract lane");

              // Since vectorization tree is being built recursively this
              // assertion ensures that the tree entry has all operands set
              // before reaching this code. Couple of exceptions known at the
              // moment are extracts where their second (immediate) operand is
              // not added. Since immediates do not affect scheduler behavior
              // this is considered okay.
              assert(
                  In &&
                  (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
                   In->getNumOperands() ==
                       Bundle->getTreeEntry()->getNumOperands() ||
                   (isa<ZExtInst>(In) && Bundle->getTreeEntry()->getOpcode() ==
                                             Instruction::Select) ||
                   Bundle->getTreeEntry()->isCopyableElement(In)) &&
                  "Missed TreeEntry operands?");

              // Count the number of unique phi nodes, which are the parent for
              // parent entry, and exit, if all the unique phis are processed.
              if (IsNonSchedulableWithParentPhiNode) {
                const TreeEntry *ParentTE =
                    Bundle->getTreeEntry()->UserTreeIndex.UserTE;
                Value *User = ParentTE->Scalars[Lane];
                if (!ParentsUniqueUsers.insert(Ptr: User).second) {
                  It = std::find(first: std::next(x: It),
                                 last: Bundle->getTreeEntry()->Scalars.end(), val: In);
                  continue;
                }
              }

              // Decrement deps for every instruction operand of this lane.
              for (unsigned OpIdx :
                   seq<unsigned>(Size: Bundle->getTreeEntry()->getNumOperands()))
                if (auto *I = dyn_cast<Instruction>(
                        Val: Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
                  LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
                                    << *I << "\n");
                  DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
                }
              // If parent node is schedulable, it will be handled correctly.
              if (Bundle->getTreeEntry()->isCopyableElement(V: In))
                break;
              It = std::find(first: std::next(x: It),
                             last: Bundle->getTreeEntry()->Scalars.end(), val: In);
            } while (It != Bundle->getTreeEntry()->Scalars.end());
          }
        } else {
          // If BundleMember is a stand-alone instruction, no operand reordering
          // has taken place, so we directly access its operands.
          for (Use &U : BundleMember->getInst()->operands()) {
            if (auto *I = dyn_cast<Instruction>(Val: U.get())) {
              LLVM_DEBUG(dbgs()
                         << "SLP: check for readiness (def): " << *I << "\n");
              DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
            }
          }
        }
        // Handle the memory dependencies.
        auto *SD = dyn_cast<ScheduleData>(Val: BundleMember);
        if (!SD)
          return;
        SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
        for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
          if (!VisitedMemory.insert(Ptr: MemoryDep).second)
            continue;
          // There are no more unscheduled dependencies after decrementing,
          // so we can put the dependent instruction into the ready list.
          LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
                            << *MemoryDep << "\n");
          DecrUnsched(MemoryDep);
        }
        // Handle the control dependencies.
        SmallPtrSet<const ScheduleData *, 4> VisitedControl;
        for (ScheduleData *Dep : SD->getControlDependencies()) {
          if (!VisitedControl.insert(Ptr: Dep).second)
            continue;
          // There are no more unscheduled dependencies after decrementing,
          // so we can put the dependent instruction into the ready list.
          LLVM_DEBUG(dbgs()
                     << "SLP: check for readiness (ctrl): " << *Dep << "\n");
          DecrUnsched(Dep, /*IsControl=*/true);
        }
      };
      // Stand-alone ScheduleData: wrap it into pseudo bundles, one per
      // matching tree entry, so operand lookup goes through the entry.
      if (auto *SD = dyn_cast<ScheduleData>(Val: Data)) {
        SD->setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
        SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
        SmallVector<ScheduleBundle *> Bundles;
        Instruction *In = SD->getInst();
        ArrayRef<TreeEntry *> Entries = R.getTreeEntries(V: In);
        if (!Entries.empty()) {
          for (TreeEntry *TE : Entries) {
            if (!isa<ExtractValueInst, ExtractElementInst, CallBase>(Val: In) &&
                In->getNumOperands() != TE->getNumOperands())
              continue;
            auto &BundlePtr =
                PseudoBundles.emplace_back(Args: std::make_unique<ScheduleBundle>());
            BundlePtr->setTreeEntry(TE);
            BundlePtr->add(SD);
            Bundles.push_back(Elt: BundlePtr.get());
          }
        }
        ProcessBundleMember(SD, Bundles);
      } else {
        // A real bundle: mark it scheduled, then process each member whose
        // every containing bundle is already scheduled.
        ScheduleBundle &Bundle = *cast<ScheduleBundle>(Val: Data);
        Bundle.setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
        auto AreAllBundlesScheduled =
            [&](const ScheduleEntity *SD,
                ArrayRef<ScheduleBundle *> SDBundles) {
              if (isa<ScheduleCopyableData>(Val: SD))
                return true;
              return !SDBundles.empty() &&
                     all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
                       return SDBundle->isScheduled();
                     });
            };
        for (ScheduleEntity *SD : Bundle.getBundle()) {
          ArrayRef<ScheduleBundle *> SDBundles;
          if (!isa<ScheduleCopyableData>(Val: SD))
            SDBundles = getScheduleBundles(V: SD->getInst());
          if (AreAllBundlesScheduled(SD, SDBundles)) {
            SD->setScheduled(/*Scheduled=*/true);
            ProcessBundleMember(SD, isa<ScheduleCopyableData>(Val: SD) ? &Bundle
                                                                   : SDBundles);
          }
        }
      }
    }
5907
5908 /// Verify basic self consistency properties of the data structure.
5909 void verify() {
5910 if (!ScheduleStart)
5911 return;
5912
5913 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5914 ScheduleStart->comesBefore(ScheduleEnd) &&
5915 "Not a valid scheduling region?");
5916
5917 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5918 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: I);
5919 if (!Bundles.empty()) {
5920 for (ScheduleBundle *Bundle : Bundles) {
5921 assert(isInSchedulingRegion(*Bundle) &&
5922 "primary schedule data not in window?");
5923 Bundle->verify();
5924 }
5925 continue;
5926 }
5927 auto *SD = getScheduleData(I);
5928 if (!SD)
5929 continue;
5930 assert(isInSchedulingRegion(*SD) &&
5931 "primary schedule data not in window?");
5932 SD->verify();
5933 }
5934
5935 assert(all_of(ReadyInsts,
5936 [](const ScheduleEntity *Bundle) {
5937 return Bundle->isReady();
5938 }) &&
5939 "item in ready list not ready?");
5940 }
5941
5942 /// Put all instructions into the ReadyList which are ready for scheduling.
5943 template <typename ReadyListType>
5944 void initialFillReadyList(ReadyListType &ReadyList) {
5945 SmallPtrSet<ScheduleBundle *, 16> Visited;
5946 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5947 ScheduleData *SD = getScheduleData(I);
5948 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5949 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: I);
5950 !Bundles.empty()) {
5951 for (ScheduleBundle *Bundle : Bundles) {
5952 if (!Visited.insert(Ptr: Bundle).second)
5953 continue;
5954 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5955 ReadyList.insert(Bundle);
5956 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5957 << *Bundle << "\n");
5958 }
5959 }
5960 continue;
5961 }
5962 ReadyList.insert(SD);
5963 LLVM_DEBUG(dbgs()
5964 << "SLP: initially in ready list: " << *SD << "\n");
5965 }
5966 }
5967 }
5968
    /// Build a bundle from the ScheduleData nodes corresponding to the
    /// scalar instruction for each lane.
    /// \param VL The list of scalar instructions.
    /// \param S The state of the instructions.
    /// \param EI The edge in the SLP graph or the user node/operand number.
    /// \returns a reference to the created bundle.
    ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
                                const InstructionsState &S, const EdgeInfo &EI);

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \param VL The scalars to bundle.
    /// \param SLP The SLP tree, used for dependency calculation.
    /// \param S The state of the instructions.
    /// \param EI The edge in the SLP graph or the user node/operand number.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleBundle *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S, const EdgeInfo &EI);

    /// Allocates schedule data chunk.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    /// \param InsertInReadyList If true, entities becoming ready are inserted
    /// into the ready list.
    /// \param ControlDeps Extra control dependencies to take into account.
    void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
                               BoUpSLP *SLP,
                               ArrayRef<ScheduleData *> ControlDeps = {});

    /// Sets all instruction in the scheduling region to un-scheduled.
    void resetSchedule();
6007
    /// The basic block this scheduling data is associated with.
    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
    /// number) and the operand instruction, represented as copyable element.
    /// Owns the ScheduleCopyableData objects; the other maps below hold
    /// non-owning pointers into this map.
    SmallDenseMap<std::pair<EdgeInfo, const Value *>,
                  std::unique_ptr<ScheduleCopyableData>>
        ScheduleCopyableDataMap;

    /// Represents mapping between instruction and all related
    /// ScheduleCopyableData (for all uses in the tree, represented as copyable
    /// element). The SLP tree may contain several representations of the same
    /// instruction.
    SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInst;

    /// Represents mapping between user value and operand number, the operand
    /// value and all related ScheduleCopyableData. The relation is 1:n, because
    /// the same user may reference the same operand in different tree entries
    /// and the operand may be modelled by the different copyable data element.
    SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
                  SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInstUser;

    /// Represents mapping between instruction and all related
    /// ScheduleCopyableData. It represents the mapping between the actual
    /// instruction and the last copyable data element in the chain. E.g., if
    /// the graph models the following instructions:
    /// %0 = non-add instruction ...
    /// ...
    /// %4 = add %3, 1
    /// %5 = add %4, 1
    /// %6 = insertelement poison, %0, 0
    /// %7 = insertelement %6, %5, 1
    /// And the graph is modeled as:
    /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
    ///    -> [1, 0] -> [%1, 0]
    ///
    /// this map will map %0 only to the copyable element <1>, which is the last
    /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
    /// keep the map to <0>, not the %0.
    SmallDenseMap<const Instruction *,
                  SmallSetVector<ScheduleCopyableData *, 4>>
        ScheduleCopyableDataMapByUsers;

    /// Attaches ScheduleBundle to Instruction.
    SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
        ScheduledBundles;
    /// The list of ScheduleBundles. Owns the bundles referenced by
    /// ScheduledBundles.
    SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;

    /// The ready-list for scheduling (only used for the dry-run).
    SetVector<ScheduleEntity *> ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart = nullptr;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd = nullptr;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region?  Used to optimize the dependence calculation for the
    /// common case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    /// Make sure that the initial SchedulingRegionID is greater than the
    /// initial SchedulingRegionID in ScheduleData (which is 0).
    int SchedulingRegionID = 1;
  };
6106 };
6107
  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  /// \param R The SLP tree.
  /// \param BS The scheduling state of the block being scheduled.
  void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6117
6118 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6119 /// sorted SmallVectors of unsigned.
6120 struct OrdersTypeDenseMapInfo {
6121 static OrdersType getEmptyKey() {
6122 OrdersType V;
6123 V.push_back(Elt: ~1U);
6124 return V;
6125 }
6126
6127 static OrdersType getTombstoneKey() {
6128 OrdersType V;
6129 V.push_back(Elt: ~2U);
6130 return V;
6131 }
6132
6133 static unsigned getHashValue(const OrdersType &V) {
6134 return static_cast<unsigned>(hash_combine_range(R: V));
6135 }
6136
6137 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6138 return LHS == RHS;
6139 }
6140 };
6141
  // Analysis and block reference.
  Function *F;                 // Function being vectorized.
  ScalarEvolution *SE;         // Scalar evolution analysis.
  TargetTransformInfo *TTI;    // Target cost/capability queries.
  TargetLibraryInfo *TLI;      // Target library function info.
  LoopInfo *LI;                // Loop structure info.
  DominatorTree *DT;           // Dominator tree.
  AssumptionCache *AC;         // Cached llvm.assume intrinsics.
  DemandedBits *DB;            // Demanded-bits analysis.
  const DataLayout *DL;        // Module data layout.
  OptimizationRemarkEmitter *ORE; // Remark emitter for diagnostics.

  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).

  /// Instruction builder to construct the vectorized tree.
  IRBuilder<TargetFolder> Builder;

  /// A map of scalar integer values to the smallest bit width with which they
  /// can legally be represented. The values map to (width, signed) pairs,
  /// where "width" indicates the minimum bit width and "signed" is True if the
  /// value must be signed-extended, rather than zero-extended, back to its
  /// original width.
  DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;

  /// Final size of the reduced vector, if the current graph represents the
  /// input for the reduction and it was possible to narrow the size of the
  /// reduction.
  unsigned ReductionBitWidth = 0;

  /// Canonical graph size before the transformations.
  unsigned BaseGraphSize = 1;

  /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
  /// type sizes, used in the tree.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  /// Indices of the vectorized nodes, which supposed to be the roots of the new
  /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
  DenseSet<unsigned> ExtraBitWidthNodes;
6182};
6183
6184template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
6185 using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
6186 using SecondInfo = DenseMapInfo<unsigned>;
6187 static BoUpSLP::EdgeInfo getEmptyKey() {
6188 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6189 SecondInfo::getEmptyKey());
6190 }
6191
6192 static BoUpSLP::EdgeInfo getTombstoneKey() {
6193 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6194 SecondInfo::getTombstoneKey());
6195 }
6196
6197 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6198 return detail::combineHashValue(a: FirstInfo::getHashValue(PtrVal: Val.UserTE),
6199 b: SecondInfo::getHashValue(Val: Val.EdgeIdx));
6200 }
6201
6202 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6203 const BoUpSLP::EdgeInfo &RHS) {
6204 return LHS == RHS;
6205 }
6206};
6207
/// GraphTraits adaptor that lets generic graph algorithms (e.g. GraphWriter)
/// traverse the SLP vectorization tree via each entry's UserTreeIndex edge.
template <> struct llvm::GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;

  /// NodeRef has to be a pointer per the GraphWriter.
  using NodeRef = TreeEntry *;

  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Add the VectorizableTree to the index iterator to be able to return
  /// TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    // Kept to satisfy the iterator interface; children are resolved through
    // the EdgeInfo's UserTE pointer.
    ContainerTy &VectorizableTree;

    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return I->UserTE; }
  };

  /// The entry node is the root of the vectorizable tree.
  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  // Each node has a single user edge (UserTreeIndex), so the child range is
  // the one-element span [&UserTreeIndex, &UserTreeIndex + 1).
  static ChildIteratorType child_begin(NodeRef N) {
    return {&N->UserTreeIndex, N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {&N->UserTreeIndex + 1, N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    using ItTy = ContainerTy::iterator;
    ItTy It;

  public:
    nodes_iterator(const ItTy &It2) : It(It2) {}
    NodeRef operator*() { return It->get(); }
    nodes_iterator operator++() {
      ++It;
      return *this;
    }
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
6268
6269template <>
6270struct llvm::DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6271 using TreeEntry = BoUpSLP::TreeEntry;
6272
6273 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6274
6275 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6276 std::string Str;
6277 raw_string_ostream OS(Str);
6278 OS << Entry->Idx << ".\n";
6279 if (isSplat(VL: Entry->Scalars))
6280 OS << "<splat> ";
6281 for (auto *V : Entry->Scalars) {
6282 OS << *V;
6283 if (llvm::any_of(Range: R->ExternalUses, P: [&](const BoUpSLP::ExternalUser &EU) {
6284 return EU.Scalar == V;
6285 }))
6286 OS << " <extract>";
6287 OS << "\n";
6288 }
6289 return Str;
6290 }
6291
6292 static std::string getNodeAttributes(const TreeEntry *Entry,
6293 const BoUpSLP *) {
6294 if (Entry->isGather())
6295 return "color=red";
6296 if (Entry->State == TreeEntry::ScatterVectorize ||
6297 Entry->State == TreeEntry::StridedVectorize ||
6298 Entry->State == TreeEntry::CompressVectorize)
6299 return "color=blue";
6300 return "";
6301 }
6302};
6303
/// Destructor: physically erases all instructions queued in
/// DeletedInstructions and recursively removes scalar code that became dead
/// because of vectorization.
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instruction back to erase them from parent and
      // memory later.
      if (isa<PHINode>(Val: I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(BB&: F->getEntryBlock(),
                        InsertPos: F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(InsertPos: F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    // Collect operands that become trivially dead once this instruction is
    // gone; they are cleaned up after the erase loop below.
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(Val: U.get());
      if (Op && !DeletedInstructions.count(V: Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(I: Op, TLI))
        DeadInsts.emplace_back(Args&: Op);
    }
    // Drop this instruction's uses first so mutual references between
    // deleted instructions do not trip the use_empty() assertion below.
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}
6341
6342/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6343/// contains original mask for the scalars reused in the node. Procedure
6344/// transform this mask in accordance with the given \p Mask.
6345static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6346 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6347 "Expected non-empty mask.");
6348 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6349 Prev.swap(RHS&: Reuses);
6350 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6351 if (Mask[I] != PoisonMaskElem)
6352 Reuses[Mask[I]] = Prev[I];
6353}
6354
/// Reorders the given \p Order according to the given \p Mask. \p Order - is
/// the original order of the scalars. Procedure transforms the provided order
/// in accordance with the given \p Mask. If the resulting \p Order is just an
/// identity order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      // An empty order means identity.
      PrevOrder.resize(N: Sz);
      std::iota(first: PrevOrder.begin(), last: PrevOrder.end(), value: 0);
    } else {
      PrevOrder.swap(RHS&: Order);
    }
    // Sz is used as the "undefined" marker for masked-out positions.
    Order.assign(NumElts: Sz, Elt: Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    // Identity (modulo undefined entries) is canonically an empty order.
    if (all_of(Range: enumerate(First&: Order), P: [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  // Top order: work on the inverse permutation, apply the mask, then
  // invert the result back into an order.
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(N: Sz);
    std::iota(first: MaskOrder.begin(), last: MaskOrder.end(), value: 0);
  } else {
    inversePermutation(Indices: Order, Mask&: MaskOrder);
  }
  reorderReuses(Reuses&: MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(Mask: MaskOrder, NumSrcElts: Sz)) {
    Order.clear();
    return;
  }
  Order.assign(NumElts: Sz, Elt: Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
6402
/// Tries to derive an order for gather node \p TE from already vectorized
/// entries / extractelement sources that (partially) match its scalars, so
/// the gather can reuse existing vectors through a cheap permutation.
/// Returns std::nullopt if no profitable single-source order exists.
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
                                  bool TopToBottom, bool IgnoreReorder) {
  assert(TE.isGather() && "Expected gather node only.");
  // Try to find subvector extract/insert patterns and reorder only such
  // patterns.
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  size_t NumScalars = GatheredScalars.size();
  if (!isValidElementType(Ty: ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, VF: NumScalars);
  unsigned NumParts = ::getNumberOfParts(TTI: *TTI, VecTy, Limit: NumScalars);
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(TE: &TE, VL: GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  // NumScalars serves as the "undefined" marker in CurrentOrder.
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(VL: TE.Scalars)) {
    // If the full matched node in whole tree rotation - no need to consider the
    // matching order, rotating the whole tree.
    if (TopToBottom)
      return std::nullopt;
    // No need to keep the order for the same user node.
    if (Entries.front().front()->UserTreeIndex.UserTE ==
        TE.UserTreeIndex.UserTE)
      return std::nullopt;
    // No need to keep the order for the matched root node, if it can be freely
    // reordered.
    if (!IgnoreReorder && Entries.front().front()->Idx == 0)
      return std::nullopt;
    // If shuffling 2 elements only and the matching node has reverse reuses -
    // no need to count order, both work fine.
    if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
        TE.getVectorFactor() == 2 && Mask.size() == 2 &&
        any_of(Range: enumerate(First: Entries.front().front()->ReuseShuffleIndices),
               P: [](const auto &P) {
                 return P.value() % 2 != static_cast<int>(P.index()) % 2;
               }))
      return std::nullopt;

    // Perfect match in the graph, will reuse the previously vectorized
    // node. Cost is 0.
    std::iota(first: CurrentOrder.begin(), last: CurrentOrder.end(), value: 0);
    return CurrentOrder;
  }
  // True if every defined element of Mask refers to the same source lane.
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Range&: Mask, P: [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  // Turns the per-part shuffle Mask into order entries in CurrentOrder;
  // parts that need more than one source vector are marked in
  // ShuffledSubMasks and their slice of the order is invalidated.
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(Begin: 0, End: NumParts)) {
      if (ShuffledSubMasks.test(Idx: I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      unsigned Limit = getNumElems(Size: CurrentOrder.size(), PartNumElems: PartSz, Part: I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(N: I * PartSz, M: Limit);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Range&: Slice, P: not_equal_to(Arg&: NumScalars))) {
        llvm::fill(Range&: Slice, Value&: NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as much elements from the mask as possible.
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(Size: Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(Val: V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      // Round the smallest used index down to a part boundary.
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        llvm::fill(Range&: Slice, Value&: NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Size: Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        llvm::fill(Range&: Slice, Value&: NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(Size: NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          // VF of a part is the widest source vector among the
          // extractelements feeding that part.
          unsigned VF = 0;
          unsigned Sz = getNumElems(Size: TE.getVectorFactor(), PartNumElems: PartSz, Part: I);
          for (unsigned Idx : seq<unsigned>(Size: Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(first: TE.ReorderIndices.begin(),
                                last: find(Range: TE.ReorderIndices, Val: K));
            auto *EI = dyn_cast<ExtractElementInst>(Val: TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(a: VF, b: cast<VectorType>(Val: EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(a: Entries[I].front()->getVectorFactor(),
                      b: Entries[I].back()->getVectorFactor());
    });
  // Give up if too many positions stayed undefined or every part needed
  // multiple source vectors.
  unsigned NumUndefs = count(Range&: CurrentOrder, Element: NumScalars);
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
6588
6589static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6590 const TargetLibraryInfo &TLI,
6591 bool CompareOpcodes = true) {
6592 if (getUnderlyingObject(V: Ptr1, MaxLookup: RecursionMaxDepth) !=
6593 getUnderlyingObject(V: Ptr2, MaxLookup: RecursionMaxDepth))
6594 return false;
6595 auto *GEP1 = dyn_cast<GetElementPtrInst>(Val: Ptr1);
6596 auto *GEP2 = dyn_cast<GetElementPtrInst>(Val: Ptr2);
6597 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6598 (!GEP2 || GEP2->getNumOperands() == 2) &&
6599 (((!GEP1 || isConstant(V: GEP1->getOperand(i_nocapture: 1))) &&
6600 (!GEP2 || isConstant(V: GEP2->getOperand(i_nocapture: 1)))) ||
6601 !CompareOpcodes ||
6602 (GEP1 && GEP2 &&
6603 getSameOpcode(VL: {GEP1->getOperand(i_nocapture: 1), GEP2->getOperand(i_nocapture: 1)}, TLI)));
6604}
6605
6606/// Calculates minimal alignment as a common alignment.
6607template <typename T>
6608static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6609 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6610 for (Value *V : VL)
6611 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6612 return CommonAlignment;
6613}
6614
6615/// Check if \p Order represents reverse order.
6616static bool isReverseOrder(ArrayRef<unsigned> Order) {
6617 assert(!Order.empty() &&
6618 "Order is empty. Please check it before using isReverseOrder.");
6619 unsigned Sz = Order.size();
6620 return all_of(Range: enumerate(First&: Order), P: [&](const auto &Pair) {
6621 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6622 });
6623}
6624
/// Checks if the provided list of pointers \p Pointers represents the strided
/// pointers for type ElemTy. If they are not, nullptr is returned.
/// Otherwise, SCEV* of the stride value is returned.
/// If `PointerOps` can be rearanged into the following sequence:
/// ```
/// %x + c_0 * stride,
/// %x + c_1 * stride,
/// %x + c_2 * stride
/// ...
/// ```
/// where each `c_i` is constant. The `Coeffs` will contain `c_0, c_1, c_2, ..`
/// and the SCEV of the `stride` will be returned.
/// \p SortedIndices is filled with the permutation that sorts the pointers by
/// offset, and left empty if they are already consecutive.
static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                     const DataLayout &DL, ScalarEvolution &SE,
                                     SmallVectorImpl<unsigned> &SortedIndices,
                                     SmallVectorImpl<int64_t> &Coeffs) {
  assert(Coeffs.size() == PointerOps.size() &&
         "Coeffs vector needs to be of correct size");
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
  // addresses).
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
    if (!PtrSCEV)
      return nullptr;
    SCEVs.push_back(Elt: PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Val: Diff))
      return nullptr;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Val: Diff1))
      return nullptr;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest;
  const SCEV *Dist = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Val: Dist))
    return nullptr;
  int Size = DL.getTypeStoreSize(Ty: ElemTy);
  // Extracts the stride from Dist assuming Dist == stride * Multiplier;
  // returns nullptr when no exact factorization exists.
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Val: Dist)) {
      if (M->getOperand(i: 0) == Multiplier)
        return M->getOperand(i: 1);
      if (M->getOperand(i: 1) == Multiplier)
        return M->getOperand(i: 0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Ty: Dist->getType(), V: 1);
    return SE.getUDivExactExpr(LHS: Dist, RHS: Multiplier);
  };
  // Stride_in_elements = Dist / element_size * (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Ty: Dist->getType(), V: Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return nullptr;
  }
  // Only non-constant runtime strides are interesting here; constant strides
  // are handled by other analyses.
  if (!Stride || isa<SCEVConstant>(Val: Stride))
    return nullptr;
  // Iterate through all pointers and check if all distances are
  // unique multiple of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const auto [Idx, PtrSCEV] : enumerate(First&: SCEVs)) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return nullptr;
      const auto *SC = dyn_cast<SCEVConstant>(Val: Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(Val: SC))
        return nullptr;
      Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
      // Verify the factorization exactly reconstructs the pointer:
      // PtrSCEV == PtrSCEVLowest + Stride * SC.
      if (!SE.getMinusSCEV(LHS: PtrSCEV, RHS: SE.getAddExpr(LHS: PtrSCEVLowest,
                                              RHS: SE.getMulExpr(LHS: Stride, RHS: SC)))
               ->isZero())
        return nullptr;
      Dist = SC->getAPInt().getZExtValue();
    } else {
      // The lowest pointer is the base: coefficient 0.
      Coeffs[Idx] = 0;
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return nullptr;
    auto Res = Offsets.emplace(args&: Dist, args&: Cnt);
    if (!Res.second)
      return nullptr;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(x: Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return nullptr;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(N: PointerOps.size());
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  return Stride;
}
6750
6751static std::pair<InstructionCost, InstructionCost>
6752getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6753 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6754 Type *ScalarTy, VectorType *VecTy);
6755
/// Returns the cost of the shuffle instructions with the given \p Kind, vector
/// type \p Tp and optional \p Mask. Adds SLP-specifc cost estimation for insert
/// subvector pattern.
static InstructionCost
getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
               VectorType *Tp, ArrayRef<int> Mask = {},
               TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
               int Index = 0, VectorType *SubTp = nullptr,
               ArrayRef<const Value *> Args = {}) {
  // With an explicit mask, the destination width is defined by the mask.
  VectorType *DstTy = Tp;
  if (!Mask.empty())
    DstTy = FixedVectorType::get(ElementType: Tp->getScalarType(), NumElts: Mask.size());

  if (Kind != TTI::SK_PermuteTwoSrc)
    return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp,
                              Args);
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  // A two-source permute that really inserts one source into the other is
  // costed as SK_InsertSubvector, which targets typically handle cheaper.
  if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                             Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
      return TTI.getShuffleCost(Kind: TTI::SK_InsertSubvector, DstTy, SrcTy: Tp, Mask,
                                CostKind: TTI::TCK_RecipThroughput, Index, SubTp: Tp);
  }
  return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp,
                            Args);
}
6784
/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
/// instead of a scalar.
static InstructionCost
getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
                         VectorType *Ty, const APInt &DemandedElts, bool Insert,
                         bool Extract, TTI::TargetCostKind CostKind,
                         bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
             getNumElements(Ty) &&
         "Incorrect usage.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
    assert(SLPReVec && "Only supported by REVEC.");
    // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
    // of CreateInsertElement.
    unsigned ScalarTyNumElements = VecTy->getNumElements();
    InstructionCost Cost = 0;
    // Sum a subvector insert/extract per demanded "scalar" (which is itself
    // a vector of ScalarTyNumElements lanes).
    for (unsigned I : seq(Size: DemandedElts.getBitWidth())) {
      if (!DemandedElts[I])
        continue;
      if (Insert)
        Cost += getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: Ty, Mask: {}, CostKind,
                               Index: I * ScalarTyNumElements, SubTp: VecTy);
      if (Extract)
        Cost += getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: Ty, Mask: {}, CostKind,
                               Index: I * ScalarTyNumElements, SubTp: VecTy);
    }
    return Cost;
  }
  // Plain scalar elements: delegate directly to TTI.
  return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                      CostKind, ForPoisonSrc, VL);
}
6819
6820/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6821/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6822static InstructionCost getVectorInstrCost(
6823 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6824 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6825 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6826 if (Opcode == Instruction::ExtractElement) {
6827 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
6828 assert(SLPReVec && "Only supported by REVEC.");
6829 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6830 return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector,
6831 Tp: cast<VectorType>(Val), Mask: {}, CostKind,
6832 Index: Index * VecTy->getNumElements(), SubTp: VecTy);
6833 }
6834 }
6835 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6836 ScalarUserAndIdx);
6837}
6838
6839/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6840/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6841static InstructionCost getExtractWithExtendCost(
6842 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6843 VectorType *VecTy, unsigned Index,
6844 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
6845 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Val: Dst)) {
6846 assert(SLPReVec && "Only supported by REVEC.");
6847 auto *SubTp =
6848 getWidenedType(ScalarTy: VecTy->getElementType(), VF: ScalarTy->getNumElements());
6849 return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: VecTy, Mask: {}, CostKind,
6850 Index: Index * ScalarTy->getNumElements(), SubTp) +
6851 TTI.getCastInstrCost(Opcode, Dst, Src: SubTp, CCH: TTI::CastContextHint::None,
6852 CostKind);
6853 }
6854 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6855}
6856
/// Creates subvector insert. Generates shuffle using \p Generator or
/// using default shuffle.
static Value *createInsertVector(
    IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
  // Inserting poison into poison is a no-op.
  if (isa<PoisonValue>(Val: Vec) && isa<PoisonValue>(Val: V))
    return Vec;
  const unsigned SubVecVF = getNumElements(Ty: V->getType());
  // Create shuffle, insertvector requires that index is multiple of
  // the subvector length.
  const unsigned VecVF = getNumElements(Ty: Vec->getType());
  SmallVector<int> Mask(VecVF, PoisonMaskElem);
  if (isa<PoisonValue>(Val: Vec)) {
    // Destination is poison: a single widening shuffle of V placed at Index
    // (lanes <Index..Index+SubVecVF) is enough.
    auto *Begin = std::next(x: Mask.begin(), n: Index);
    std::iota(first: Begin, last: std::next(x: Begin, n: SubVecVF), value: 0);
    Vec = Builder.CreateShuffleVector(V, Mask);
    return Vec;
  }
  // Two-source mask: identity for Vec, with lanes [Index, Index + SubVecVF)
  // taken from V (indices >= VecVF refer to the second shuffle operand).
  std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
  std::iota(first: std::next(x: Mask.begin(), n: Index),
            last: std::next(x: Mask.begin(), n: Index + SubVecVF), value: VecVF);
  if (Generator)
    return Generator(Vec, V, Mask);
  // 1. Resize V to the size of Vec.
  SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
  std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: SubVecVF), value: 0);
  V = Builder.CreateShuffleVector(V, Mask: ResizeMask);
  // 2. Insert V into Vec.
  return Builder.CreateShuffleVector(V1: Vec, V2: V, Mask);
}
6887
6888/// Generates subvector extract using \p Generator or using default shuffle.
6889static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
6890 unsigned SubVecVF, unsigned Index) {
6891 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6892 std::iota(first: Mask.begin(), last: Mask.end(), value: Index);
6893 return Builder.CreateShuffleVector(V: Vec, Mask);
6894}
6895
/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
/// with \p Order.
/// \return true if the mask represents strided access, false - otherwise.
static bool buildCompressMask(ArrayRef<Value *> PointerOps,
                              ArrayRef<unsigned> Order, Type *ScalarTy,
                              const DataLayout &DL, ScalarEvolution &SE,
                              SmallVectorImpl<int> &CompressMask) {
  const unsigned Sz = PointerOps.size();
  CompressMask.assign(NumElts: Sz, Elt: PoisonMaskElem);
  // The first element always set.
  CompressMask[0] = 0;
  // Check if the mask represents strided access.
  // Tri-state: engaged-with-0 = "stride not yet known", engaged-nonzero =
  // candidate stride, disengaged = proven non-strided.
  std::optional<unsigned> Stride = 0;
  Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
  for (unsigned I : seq<unsigned>(Begin: 1, End: Sz)) {
    Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
    // Element distance from the first pointer; expected to be non-negative
    // for ordered pointers (negative diffs would wrap on the cast below —
    // presumably Order guarantees Ptr0 is lowest; TODO confirm).
    std::optional<int64_t> OptPos =
        getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL, SE);
    if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
      return false;
    unsigned Pos = static_cast<unsigned>(*OptPos);
    CompressMask[I] = Pos;
    if (!Stride)
      continue;
    if (*Stride == 0) {
      // First non-zero distance defines the candidate stride.
      *Stride = Pos;
      continue;
    }
    if (Pos != *Stride * I)
      Stride.reset();
  }
  return Stride.has_value();
}
6929
/// Checks if the \p VL can be transformed to a (masked)load + compress or
/// (masked) interleaved load.
/// \param IsMasked set to true if a masked load is required for safety.
/// \param InterleaveFactor set to the interleave factor when an interleaved
/// load is chosen, 0 otherwise.
/// \param CompressMask filled with the shuffle mask to apply after the load.
/// \param LoadVecTy set to the actual (wider) vector type to load.
static bool isMaskedLoadCompress(
    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
    const DominatorTree &DT, const TargetLibraryInfo &TLI,
    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
    VectorType *&LoadVecTy) {
  InterleaveFactor = 0;
  Type *ScalarTy = VL.front()->getType();
  const size_t Sz = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  SmallVector<int> Mask;
  if (!Order.empty())
    inversePermutation(Indices: Order, Mask);
  // Check external uses.
  // If extracting a scalar back out of the vector is costlier than keeping
  // the scalar load, the transformation is not profitable.
  for (const auto [I, V] : enumerate(First&: VL)) {
    if (AreAllUsersVectorized(V))
      continue;
    InstructionCost ExtractCost =
        TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, CostKind,
                               Index: Mask.empty() ? I : Mask[I]);
    InstructionCost ScalarCost =
        TTI.getInstructionCost(U: cast<Instruction>(Val: V), CostKind);
    if (ExtractCost <= ScalarCost)
      return false;
  }
  Value *Ptr0;
  Value *PtrN;
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int64_t> Diff =
      getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL, SE);
  if (!Diff)
    return false;
  const size_t MaxRegSize =
      TTI.getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
          .getFixedValue();
  // Check for very large distances between elements.
  if (*Diff / Sz >= MaxRegSize / 8)
    return false;
  // The wide load covers the whole [Ptr0, PtrN] range.
  LoadVecTy = getWidenedType(ScalarTy, VF: *Diff + 1);
  auto *LI = cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()]);
  Align CommonAlignment = LI->getAlign();
  // A masked load is required when the whole wide range cannot be proven
  // dereferenceable.
  IsMasked = !isSafeToLoadUnconditionally(
      V: Ptr0, Ty: LoadVecTy, Alignment: CommonAlignment, DL,
      ScanFrom: cast<LoadInst>(Val: Order.empty() ? VL.back() : VL[Order.back()]), AC: &AC, DT: &DT,
      TLI: &TLI);
  if (IsMasked && !TTI.isLegalMaskedLoad(DataType: LoadVecTy, Alignment: CommonAlignment,
                                         AddressSpace: LI->getPointerAddressSpace()))
    return false;
  // TODO: perform the analysis of each scalar load for better
  // safe-load-unconditionally analysis.
  bool IsStrided =
      buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
  assert(CompressMask.size() >= 2 && "At least two elements are required");
  SmallVector<Value *> OrderedPointerOps(PointerOps);
  if (!Order.empty())
    reorderScalars(Scalars&: OrderedPointerOps, Mask);
  auto [ScalarGEPCost, VectorGEPCost] =
      getGEPCosts(TTI, Ptrs: OrderedPointerOps, BasePtr: OrderedPointerOps.front(),
                  Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy: LoadVecTy);
  // The cost of scalar loads.
  InstructionCost ScalarLoadsCost =
      std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(),
                      binary_op: [&](InstructionCost C, Value *V) {
                        return C + TTI.getInstructionCost(U: cast<Instruction>(Val: V),
                                                          CostKind);
                      }) +
      ScalarGEPCost;
  // Baseline: gather from scalar loads (inserts + the scalar loads).
  APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
  InstructionCost GatherCost =
      getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
                               /*Insert=*/true,
                               /*Extract=*/false, CostKind) +
      ScalarLoadsCost;
  InstructionCost LoadCost = 0;
  if (IsMasked) {
    LoadCost = TTI.getMemIntrinsicInstrCost(
        MICA: MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
                                   CommonAlignment,
                                   LI->getPointerAddressSpace()),
        CostKind);
  } else {
    LoadCost =
        TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
                            AddressSpace: LI->getPointerAddressSpace(), CostKind);
  }
  if (IsStrided && !IsMasked && Order.empty()) {
    // Check for potential segmented(interleaved) loads.
    // Try a padded-to-full-register type first; fall back to the exact type
    // when the padded load would not be safe.
    VectorType *AlignedLoadVecTy = getWidenedType(
        ScalarTy, VF: getFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: *Diff + 1));
    if (!isSafeToLoadUnconditionally(V: Ptr0, Ty: AlignedLoadVecTy, Alignment: CommonAlignment,
                                     DL, ScanFrom: cast<LoadInst>(Val: VL.back()), AC: &AC, DT: &DT,
                                     TLI: &TLI))
      AlignedLoadVecTy = LoadVecTy;
    if (TTI.isLegalInterleavedAccessType(VTy: AlignedLoadVecTy, Factor: CompressMask[1],
                                         Alignment: CommonAlignment,
                                         AddrSpace: LI->getPointerAddressSpace())) {
      InstructionCost InterleavedCost =
          VectorGEPCost + TTI.getInterleavedMemoryOpCost(
                              Opcode: Instruction::Load, VecTy: AlignedLoadVecTy,
                              Factor: CompressMask[1], Indices: {}, Alignment: CommonAlignment,
                              AddressSpace: LI->getPointerAddressSpace(), CostKind, UseMaskForCond: IsMasked);
      if (InterleavedCost < GatherCost) {
        InterleaveFactor = CompressMask[1];
        LoadVecTy = AlignedLoadVecTy;
        return true;
      }
    }
  }
  InstructionCost CompressCost = ::getShuffleCost(
      TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: LoadVecTy, Mask: CompressMask, CostKind);
  // Compose the reordering into the compress mask so a single shuffle
  // both compresses and reorders.
  if (!Order.empty()) {
    SmallVector<int> NewMask(Sz, PoisonMaskElem);
    for (unsigned I : seq<unsigned>(Size: Sz)) {
      NewMask[I] = CompressMask[Mask[I]];
    }
    CompressMask.swap(RHS&: NewMask);
  }
  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
  return TotalVecCost < GatherCost;
}
7061
7062/// Checks if the \p VL can be transformed to a (masked)load + compress or
7063/// (masked) interleaved load.
7064static bool
7065isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
7066 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
7067 const DataLayout &DL, ScalarEvolution &SE,
7068 AssumptionCache &AC, const DominatorTree &DT,
7069 const TargetLibraryInfo &TLI,
7070 const function_ref<bool(Value *)> AreAllUsersVectorized) {
7071 bool IsMasked;
7072 unsigned InterleaveFactor;
7073 SmallVector<int> CompressMask;
7074 VectorType *LoadVecTy;
7075 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
7076 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7077 CompressMask, LoadVecTy);
7078}
7079
7080/// Checks if strided loads can be generated out of \p VL loads with pointers \p
7081/// PointerOps:
7082/// 1. Target with strided load support is detected.
7083/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
7084/// potential stride <= MaxProfitableLoadStride and the potential stride is
7085/// power-of-2 (to avoid perf regressions for the very small number of loads)
7086/// and max distance > number of loads, or potential stride is -1.
7087/// 3. The loads are ordered, or number of unordered loads <=
7088/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
7089/// to avoid extra costs for very expensive shuffles).
7090/// 4. Any pointer operand is an instruction with the users outside of the
7091/// current graph (for masked gathers extra extractelement instructions
7092/// might be required).
7093bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
7094 Align Alignment, const int64_t Diff,
7095 const size_t Sz) const {
7096 if (Diff % (Sz - 1) != 0)
7097 return false;
7098
7099 // Try to generate strided load node.
7100 auto IsAnyPointerUsedOutGraph = any_of(Range&: PointerOps, P: [&](Value *V) {
7101 return isa<Instruction>(Val: V) && any_of(Range: V->users(), P: [&](User *U) {
7102 return !isVectorized(V: U) && !MustGather.contains(Ptr: U);
7103 });
7104 });
7105
7106 const uint64_t AbsoluteDiff = std::abs(i: Diff);
7107 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
7108 if (IsAnyPointerUsedOutGraph ||
7109 (AbsoluteDiff > Sz &&
7110 (Sz > MinProfitableStridedLoads ||
7111 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
7112 AbsoluteDiff % Sz == 0 && has_single_bit(Value: AbsoluteDiff / Sz)))) ||
7113 Diff == -(static_cast<int64_t>(Sz) - 1)) {
7114 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7115 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7116 return false;
7117 if (!TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment))
7118 return false;
7119 return true;
7120 }
7121 return false;
7122}
7123
/// Checks whether \p PointerOps can be lowered to a single strided load with
/// a compile-time-constant stride, possibly after merging groups of
/// consecutive elements into a wider integer scalar type. On success, the
/// constant stride and the widened vector type are recorded in \p SPtrInfo.
/// \p Diff is the distance between the first and the last pointer (in
/// \p ScalarTy elements); \p SortedIndices is empty if the pointers are
/// already sorted.
bool BoUpSLP::analyzeConstantStrideCandidate(
    const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
    const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
    Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
  const size_t Sz = PointerOps.size();
  SmallVector<int64_t> SortedOffsetsFromBase(Sz);
  // Go through `PointerOps` in sorted order and record offsets from
  // PointerOps[0]. We use PointerOps[0] rather than Ptr0 because
  // sortPtrAccesses only validates getPointersDiff for pairs relative to
  // PointerOps[0]. This is safe since only offset differences are used below.
  for (unsigned I : seq<unsigned>(Size: Sz)) {
    Value *Ptr =
        SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
    std::optional<int64_t> Offset =
        getPointersDiff(ElemTyA: ScalarTy, PtrA: PointerOps[0], ElemTyB: ScalarTy, PtrB: Ptr, DL: *DL, SE&: *SE);
    assert(Offset && "sortPtrAccesses should have validated this pointer");
    SortedOffsetsFromBase[I] = *Offset;
  }

  // The code below checks that `SortedOffsetsFromBase` looks as follows:
  // ```
  // [
  //   (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
  //   (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
  //   ...
  //   (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
  //   GroupSize - 1}), // last group
  // ]
  // ```
  // The distance between consecutive elements within each group should all be
  // the same `StrideWithinGroup`. The distance between the first elements of
  // consecutive groups should all be the same `StrideBetweenGroups`.

  int64_t StrideWithinGroup =
      SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
  // Determine size of the first group. Later we will check that all other
  // groups have the same size.
  auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
    return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
           StrideWithinGroup;
  };
  auto Indices = seq<unsigned>(Begin: 1, End: Sz);
  auto FoundIt = llvm::find_if(Range&: Indices, P: IsEndOfGroupIndex);
  unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;

  unsigned VecSz = Sz;
  Type *NewScalarTy = ScalarTy;

  // Quick detour: at this point we can say what the type of strided load would
  // be if all the checks pass. Check if this type is legal for the target.
  bool NeedsWidening = Sz != GroupSize;
  if (NeedsWidening) {
    // Groups must tile the whole sequence and be internally consecutive
    // (unit stride) to be merged into a single wider integer element.
    if (Sz % GroupSize != 0)
      return false;

    if (StrideWithinGroup != 1)
      return false;
    VecSz = Sz / GroupSize;
    NewScalarTy = Type::getIntNTy(
        C&: SE->getContext(),
        N: DL->getTypeSizeInBits(Ty: ScalarTy).getFixedValue() * GroupSize);
  }

  if (!isStridedLoad(PointerOps, ScalarTy: NewScalarTy, Alignment, Diff, Sz: VecSz))
    return false;

  int64_t StrideIntVal = StrideWithinGroup;
  if (NeedsWidening) {
    // Continue with checking the "shape" of `SortedOffsetsFromBase`.
    // Check that the strides between groups are all the same.
    unsigned CurrentGroupStartIdx = GroupSize;
    int64_t StrideBetweenGroups =
        SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
    StrideIntVal = StrideBetweenGroups;
    for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
      if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
              SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
          StrideBetweenGroups)
        return false;
    }

    // Check that every group has exactly `GroupSize` elements (the first
    // group's size was used as the reference above).
    auto CheckGroup = [=](const unsigned StartIdx) -> bool {
      auto Indices = seq<unsigned>(Begin: StartIdx + 1, End: Sz);
      auto FoundIt = llvm::find_if(Range&: Indices, P: IsEndOfGroupIndex);
      unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
      return GroupEndIdx - StartIdx == GroupSize;
    };
    for (unsigned I = 0; I < Sz; I += GroupSize) {
      if (!CheckGroup(I))
        return false;
    }
  }

  Type *StrideTy = DL->getIndexType(PtrTy: Ptr0->getType());
  SPtrInfo.StrideVal = ConstantInt::getSigned(Ty: StrideTy, V: StrideIntVal);
  SPtrInfo.Ty = getWidenedType(ScalarTy: NewScalarTy, VF: VecSz);
  return true;
}
7222
/// Checks whether \p PointerOps, whose relative offsets are not all
/// compile-time constants, can still be lowered to a single run-time strided
/// load. Pointers are partitioned into groups by their constant SCEV offset;
/// every group must share the same run-time stride and the same set of
/// coefficients (as computed by calculateRtStride). On success, fills
/// \p SortedIndices with the interleaved sorted order and records the
/// run-time stride SCEV and the widened vector type in \p SPtrInfo.
bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ScalarTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
                                       StridedPtrInfo &SPtrInfo) const {
  // If each value in `PointerOps` is of the form `%x + Offset` where `Offset`
  // is constant, we partition `PointerOps` sequence into subsequences of
  // pointers with the same offset. For each offset we record values from
  // `PointerOps` and their indices in `PointerOps`.
  SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
      OffsetToPointerOpIdxMap;
  for (auto [Idx, Ptr] : enumerate(First&: PointerOps)) {
    const SCEV *PtrSCEV = SE->getSCEV(V: Ptr);
    if (!PtrSCEV)
      return false;

    const auto *Add = dyn_cast<SCEVAddExpr>(Val: PtrSCEV);
    int64_t Offset = 0;
    if (Add) {
      // `Offset` is non-zero.
      for (int I : seq<int>(Size: Add->getNumOperands())) {
        const auto *SC = dyn_cast<SCEVConstant>(Val: Add->getOperand(i: I));
        if (!SC)
          continue;
        Offset = SC->getAPInt().getSExtValue();
        // Guard against near-overflow offsets; treat them as zero-offset.
        if (Offset >= std::numeric_limits<int64_t>::max() - 1) {
          Offset = 0;
          continue;
        }
        break;
      }
    }
    OffsetToPointerOpIdxMap[Offset].first.push_back(Elt: Ptr);
    OffsetToPointerOpIdxMap[Offset].second.push_back(Elt: Idx);
  }
  unsigned NumOffsets = OffsetToPointerOpIdxMap.size();

  // Quick detour: at this point we can say what the type of strided load would
  // be if all the checks pass. Check if this type is legal for the target.
  const unsigned Sz = PointerOps.size();
  unsigned VecSz = Sz;
  Type *NewScalarTy = ScalarTy;
  if (NumOffsets > 1) {
    if (Sz % NumOffsets != 0)
      return false;
    VecSz = Sz / NumOffsets;
    NewScalarTy = Type::getIntNTy(
        C&: SE->getContext(),
        N: DL->getTypeSizeInBits(Ty: ScalarTy).getFixedValue() * NumOffsets);
  }
  FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy: NewScalarTy, VF: VecSz);
  if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(Ty: StridedLoadTy) ||
      !TTI->isLegalStridedLoadStore(DataType: StridedLoadTy, Alignment: CommonAlignment))
    return false;

  // Check if the offsets are contiguous and that each group has the required
  // size.
  SmallVector<int64_t> SortedOffsetsV(NumOffsets);
  for (auto [Idx, MapPair] : enumerate(First&: OffsetToPointerOpIdxMap)) {
    if (MapPair.second.first.size() != VecSz)
      return false;
    SortedOffsetsV[Idx] = MapPair.first;
  }
  sort(C&: SortedOffsetsV);

  if (NumOffsets > 1) {
    for (int I : seq<int>(Begin: 1, End: SortedOffsetsV.size())) {
      if (SortedOffsetsV[I] - SortedOffsetsV[I - 1] != 1)
        return false;
    }
  }

  // Introduce some notation for the explanations below. Let `PointerOps_j`
  // denote the subsequence of `PointerOps` with offsets equal to
  // `SortedOffsetsV[j]`. Let `SortedIndices_j` be a such that the sequence
  // ```
  // PointerOps_j[SortedIndices_j[0]],
  // PointerOps_j[SortedIndices_j[1]],
  // PointerOps_j[SortedIndices_j[2]],
  // ...
  // ```
  // is sorted. Also, let `IndicesInAllPointerOps_j` be the vector
  // of indices of the subsequence `PointerOps_j` in all of `PointerOps`,
  // i.e `PointerOps_j[i] = PointerOps[IndicesInAllPointerOps_j[i]]`.
  // The entire sorted `PointerOps` looks like this:
  // ```
  // PointerOps_0[SortedIndices_0[0]] = PointerOps[IndicesInAllPointerOps_0[0]],
  // PointerOps_1[SortedIndices_1[0]] = PointerOps[IndicesInAllPointerOps_1[0]],
  // PointerOps_2[SortedIndices_2[0]] = PointerOps[IndicesInAllPointerOps_2[0]],
  // ...
  // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[0]] =
  //   PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[0]],
  //
  // PointerOps_0[SortedIndices_0[1]] = PointerOps[IndicesInAllPointerOps_0[1]],
  // PointerOps_1[SortedIndices_1[1]] = PointerOps[IndicesInAllPointerOps_1[1]],
  // PointerOps_2[SortedIndices_2[1]] = PointerOps[IndicesInAllPointerOps_2[1]],
  // ...
  // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[1]] =
  //   PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[1]],
  //
  // PointerOps_0[SortedIndices_0[2]] = PointerOps[IndicesInAllPointerOps_0[2]],
  // PointerOps_1[SortedIndices_1[2]] = PointerOps[IndicesInAllPointerOps_1[2]],
  // PointerOps_2[SortedIndices_2[2]] = PointerOps[IndicesInAllPointerOps_2[2]],
  // ...
  // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[2]] =
  //   PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[2]],
  // ...
  // ...
  // ...
  // PointerOps_0[SortedIndices_0[VecSz - 1]] =
  //   PointerOps[IndicesInAllPointerOps_0[VecSz - 1]],
  // PointerOps_1[SortedIndices_1[VecSz - 1]] =
  //   PointerOps[IndicesInAllPointerOps_1[VecSz - 1]],
  // PointerOps_2[SortedIndices_2[VecSz - 1]] =
  //   PointerOps[IndicesInAllPointerOps_2[VecSz - 1]],
  // ...
  // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[VecSz - 1]] =
  //   PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[VecSz - 1]],
  // ```
  // In order to be able to generate a strided load, we need the following
  // checks to pass:
  //
  // (1) for each `PointerOps_j` check that the distance
  // between adjacent pointers are all equal to the same value (stride).
  // (2) for each `PointerOps_j` check that coefficients calculated by
  // `calculateRtStride` are all the same.
  //
  // As we do that, also calculate SortedIndices. Since we should not modify
  // `SortedIndices` unless we know that all the checks succeed, record the
  // indices into `SortedIndicesDraft`.
  SmallVector<unsigned> SortedIndicesDraft(Sz);

  // Given sorted indices for a particular offset (as calculated by
  // calculateRtStride), update the `SortedIndicesDraft` for all of PointerOps.
  // Let `Offset` be `SortedOffsetsV[OffsetNum]`.
  // \param `OffsetNum` the index of `Offset` in `SortedOffsetsV`.
  // \param `IndicesInAllPointerOps` vector of indices of the
  // subsequence `PointerOps_OffsetNum` in `PointerOps`, i.e. using the above
  // notation `IndicesInAllPointerOps = IndicesInAllPointerOps_OffsetNum`.
  // \param `SortedIndicesForOffset = SortedIndices_OffsetNum`
  auto UpdateSortedIndices =
      [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
          ArrayRef<unsigned> IndicesInAllPointerOps, const int64_t OffsetNum) {
        if (SortedIndicesForOffset.empty()) {
          SortedIndicesForOffset.resize(N: IndicesInAllPointerOps.size());
          std::iota(first: SortedIndicesForOffset.begin(),
                    last: SortedIndicesForOffset.end(), value: 0);
        }
        for (const auto [Num, Idx] : enumerate(First&: SortedIndicesForOffset)) {
          SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
              IndicesInAllPointerOps[Idx];
        }
      };

  int64_t LowestOffset = SortedOffsetsV[0];
  ArrayRef<Value *> PointerOps0 = OffsetToPointerOpIdxMap[LowestOffset].first;

  // The first group establishes the reference stride and coefficients that
  // all remaining groups must match.
  SmallVector<int64_t> Coeffs0(VecSz);
  SmallVector<unsigned> SortedIndicesForOffset0;
  const SCEV *Stride0 = calculateRtStride(PointerOps: PointerOps0, ElemTy: ScalarTy, DL: *DL, SE&: *SE,
                                          SortedIndices&: SortedIndicesForOffset0, Coeffs&: Coeffs0);
  if (!Stride0)
    return false;
  unsigned NumCoeffs0 = Coeffs0.size();
  if (NumCoeffs0 * NumOffsets != Sz)
    return false;
  sort(C&: Coeffs0);

  ArrayRef<unsigned> IndicesInAllPointerOps0 =
      OffsetToPointerOpIdxMap[LowestOffset].second;
  UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);

  // Now that we know what the common stride and coefficients has to be check
  // the remaining `PointerOps_j`.
  SmallVector<int64_t> Coeffs;
  SmallVector<unsigned> SortedIndicesForOffset;
  for (int J : seq<int>(Begin: 1, End: NumOffsets)) {
    Coeffs.clear();
    Coeffs.resize(N: VecSz);
    SortedIndicesForOffset.clear();

    int64_t Offset = SortedOffsetsV[J];
    ArrayRef<Value *> PointerOpsForOffset =
        OffsetToPointerOpIdxMap[Offset].first;
    ArrayRef<unsigned> IndicesInAllPointerOps =
        OffsetToPointerOpIdxMap[Offset].second;
    const SCEV *StrideWithinGroup =
        calculateRtStride(PointerOps: PointerOpsForOffset, ElemTy: ScalarTy, DL: *DL, SE&: *SE,
                          SortedIndices&: SortedIndicesForOffset, Coeffs);

    if (!StrideWithinGroup || StrideWithinGroup != Stride0)
      return false;
    if (Coeffs.size() != NumCoeffs0)
      return false;
    sort(C&: Coeffs);
    if (Coeffs != Coeffs0)
      return false;

    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
  }

  // All checks passed: commit the draft order and the strided-load info.
  SortedIndices.clear();
  SortedIndices = std::move(SortedIndicesDraft);
  SPtrInfo.StrideSCEV = Stride0;
  SPtrInfo.Ty = StridedLoadTy;
  return true;
}
7429
/// Decides how the loads in \p VL can be vectorized: as a plain consecutive
/// vector load, a (masked) load + compress, a strided load, a masked gather
/// (scatter-vectorize), or not at all (gather of scalars). \p Order and
/// \p PointerOps receive the sorted order and the pointer operands; when
/// \p BestVF is non-null it receives the best profitable sub-VF found by the
/// recursive shuffled-loads check.
BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
    ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
    unsigned *BestVF, bool TryRecursiveCheck) const {
  // Check that a vectorized load would load the same memory as a scalar
  // load. For example, we don't want to vectorize loads that are smaller
  // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
  // treats loading/storing it as an i8 struct. If we vectorize loads/stores
  // from such a struct, we read/write packed bits disagreeing with the
  // unvectorized version.
  if (BestVF)
    *BestVF = 0;
  if (areKnownNonVectorizableLoads(VL))
    return LoadsState::Gather;
  Type *ScalarTy = VL0->getType();

  if (DL->getTypeSizeInBits(Ty: ScalarTy) != DL->getTypeAllocSizeInBits(Ty: ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  PointerOps.clear();
  const size_t Sz = VL.size();
  PointerOps.resize(N: Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(Val: V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  // Check the order of pointer operands or that all pointers are the same.
  bool IsSorted = sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order);

  auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  if (!IsSorted) {
    // Pointers cannot be sorted by constant offset: try a run-time strided
    // load first, then fall through to masked-gather checks.
    if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, SortedIndices&: Order,
                                 SPtrInfo))
      return LoadsState::StridedVectorize;

    if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) ||
        TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment))
      return LoadsState::Gather;

    if (!all_of(Range&: PointerOps, P: [&](Value *P) {
          return arePointersCompatible(Ptr1: P, Ptr2: PointerOps.front(), TLI: *TLI);
        }))
      return LoadsState::Gather;

  } else {
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    // sortPtrAccesses validates getPointersDiff for all pointers relative to
    // PointerOps[0], so compute the span using PointerOps[0] as intermediate:
    // Diff = offset(PtrN) - offset(Ptr0) relative to PointerOps[0]
    std::optional<int64_t> Diff0 =
        getPointersDiff(ElemTyA: ScalarTy, PtrA: PointerOps[0], ElemTyB: ScalarTy, PtrB: Ptr0, DL: *DL, SE&: *SE);
    std::optional<int64_t> DiffN =
        getPointersDiff(ElemTyA: ScalarTy, PtrA: PointerOps[0], ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
    assert(Diff0 && DiffN &&
           "sortPtrAccesses should have validated these pointers");
    int64_t Diff = *DiffN - *Diff0;
    // Check that the sorted loads are consecutive.
    if (static_cast<uint64_t>(Diff) == Sz - 1)
      return LoadsState::Vectorize;
    if (isMaskedLoadCompress(VL, PointerOps, Order, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT,
                             TLI: *TLI, AreAllUsersVectorized: [&](Value *V) {
                               return areAllUsersVectorized(
                                   I: cast<Instruction>(Val: V), VectorizedVals: UserIgnoreList);
                             }))
      return LoadsState::CompressVectorize;
    Align Alignment =
        cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()])
            ->getAlign();
    if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, SortedIndices: Order,
                                       Diff, Ptr0, PtrN, SPtrInfo))
      return LoadsState::StridedVectorize;
  }
  if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) ||
      TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment))
    return LoadsState::Gather;
  // Compare the cost of loads + shuffles against strided/masked gather
  // loads. Returns true if the vectorized + shuffles representation is
  // better than just gather.
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    if (BestVF)
      *BestVF = 0;
    // Compare masked gather cost and loads + insert subvector costs.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: PointerOps.front(),
                    Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    // Estimate the cost of masked gather GEP. If not a splat, roughly
    // estimate as a buildvector, otherwise estimate as splat.
    APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
    Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
    VectorType *PtrVecTy = getWidenedType(ScalarTy: PtrScalarTy, VF: Sz);
    if (static_cast<unsigned>(count_if(
            Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
        any_of(Range&: PointerOps, P: [&](Value *V) {
          return getUnderlyingObject(V) !=
                 getUnderlyingObject(V: PointerOps.front());
        }))
      VectorGEPCost += getScalarizationOverhead(TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy,
                                                DemandedElts, /*Insert=*/true,
                                                /*Extract=*/false, CostKind);
    else
      VectorGEPCost +=
          getScalarizationOverhead(
              TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy, DemandedElts: APInt::getOneBitSet(numBits: Sz, BitNo: 0),
              /*Insert=*/true, /*Extract=*/false, CostKind) +
          ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: PtrVecTy, Mask: {}, CostKind);
    // The cost of scalar loads.
    InstructionCost ScalarLoadsCost =
        std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(),
                        binary_op: [&](InstructionCost C, Value *V) {
                          return C + TTI.getInstructionCost(
                                         U: cast<Instruction>(Val: V), CostKind);
                        }) +
        ScalarGEPCost;
    // The cost of masked gather.
    InstructionCost MaskedGatherCost =
        TTI.getMemIntrinsicInstrCost(
            MICA: MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
                                        cast<LoadInst>(Val: VL0)->getPointerOperand(),
                                        /*VariableMask=*/false, CommonAlignment),
            CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    InstructionCost GatherCost =
        getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
                                 /*Insert=*/true,
                                 /*Extract=*/false, CostKind) +
        ScalarLoadsCost;
    // The list of loads is small or perform partial check already - directly
    // compare masked gather cost and gather cost.
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return MaskedGatherCost - GatherCost >= -SLPCostThreshold;

    // FIXME: The following code has not been updated for non-power-of-2
    // vectors (and not whole registers). The splitting logic here does not
    // cover the original vector if the vector factor is not a power of two.
    if (!hasFullVectorsOrPowerOf2(TTI, Ty: ScalarTy, Sz: VL.size()))
      return false;

    // NOTE: shadows the outer Sz; here Sz is the scalar width in bits.
    unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
    unsigned MinVF = getMinVF(Sz: 2 * Sz);
    DemandedElts.clearAllBits();
    // Iterate through possible vectorization factors and check if vectorized +
    // shuffles is better than just gather.
    for (unsigned VF =
             getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VL.size() - 1);
         VF >= MinVF;
         VF = getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VF - 1)) {
      SmallVector<LoadsState> States;
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
        ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
        SmallVector<unsigned> Order;
        SmallVector<Value *> PointerOps;
        LoadsState LS = canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order,
                                          PointerOps, SPtrInfo, BestVF,
                                          /*TryRecursiveCheck=*/false);
        // Check that the sorted loads are consecutive.
        if (LS == LoadsState::Gather) {
          if (BestVF) {
            DemandedElts.setAllBits();
            break;
          }
          DemandedElts.setBits(loBit: Cnt, hiBit: Cnt + VF);
          continue;
        }
        // If need the reorder - consider as high-cost masked gather for now.
        if ((LS == LoadsState::Vectorize ||
             LS == LoadsState::StridedVectorize ||
             LS == LoadsState::CompressVectorize) &&
            !Order.empty() && !isReverseOrder(Order))
          LS = LoadsState::ScatterVectorize;
        States.push_back(Elt: LS);
      }
      if (DemandedElts.isAllOnes())
        // All loads gathered - try smaller VF.
        continue;
      // Can be vectorized later as a series of loads/insertelements.
      InstructionCost VecLdCost = 0;
      if (!DemandedElts.isZero()) {
        VecLdCost = getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
                                             /*Insert=*/true,
                                             /*Extract=*/false, CostKind) +
                    ScalarGEPCost;
        for (unsigned Idx : seq<unsigned>(Size: VL.size()))
          if (DemandedElts[Idx])
            VecLdCost +=
                TTI.getInstructionCost(U: cast<Instruction>(Val: VL[Idx]), CostKind);
      }
      auto *SubVecTy = getWidenedType(ScalarTy, VF);
      for (auto [I, LS] : enumerate(First&: States)) {
        auto *LI0 = cast<LoadInst>(Val: VL[I * VF]);
        InstructionCost VectorGEPCost =
            (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
                ? 0
                : getGEPCosts(TTI, Ptrs: ArrayRef(PointerOps).slice(N: I * VF, M: VF),
                              BasePtr: LI0->getPointerOperand(),
                              Opcode: Instruction::GetElementPtr, CostKind, ScalarTy,
                              VecTy: SubVecTy)
                      .second;
        if (LS == LoadsState::ScatterVectorize) {
          if (static_cast<unsigned>(
                  count_if(Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) <
                  PointerOps.size() - 1 ||
              any_of(Range&: PointerOps, P: [&](Value *V) {
                return getUnderlyingObject(V) !=
                       getUnderlyingObject(V: PointerOps.front());
              }))
            VectorGEPCost += getScalarizationOverhead(
                TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getAllOnes(numBits: VF),
                /*Insert=*/true, /*Extract=*/false, CostKind);
          else
            VectorGEPCost +=
                getScalarizationOverhead(
                    TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getOneBitSet(numBits: VF, BitNo: 0),
                    /*Insert=*/true, /*Extract=*/false, CostKind) +
                ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: SubVecTy, Mask: {},
                                 CostKind);
        }
        switch (LS) {
        case LoadsState::Vectorize:
          VecLdCost +=
              TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: SubVecTy, Alignment: LI0->getAlign(),
                                  AddressSpace: LI0->getPointerAddressSpace(), CostKind,
                                  OpdInfo: TTI::OperandValueInfo()) +
              VectorGEPCost;
          break;
        case LoadsState::StridedVectorize:
          VecLdCost += TTI.getMemIntrinsicInstrCost(
                           MICA: MemIntrinsicCostAttributes(
                               Intrinsic::experimental_vp_strided_load,
                               SubVecTy, LI0->getPointerOperand(),
                               /*VariableMask=*/false, CommonAlignment),
                           CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::CompressVectorize:
          VecLdCost += TTI.getMemIntrinsicInstrCost(
                           MICA: MemIntrinsicCostAttributes(
                               Intrinsic::masked_load, SubVecTy,
                               CommonAlignment, LI0->getPointerAddressSpace()),
                           CostKind) +
                       ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: SubVecTy,
                                        Mask: {}, CostKind);
          break;
        case LoadsState::ScatterVectorize:
          VecLdCost += TTI.getMemIntrinsicInstrCost(
                           MICA: MemIntrinsicCostAttributes(
                               Intrinsic::masked_gather, SubVecTy,
                               LI0->getPointerOperand(),
                               /*VariableMask=*/false, CommonAlignment),
                           CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::Gather:
          // Gathers are already calculated - ignore.
          continue;
        }
        SmallVector<int> ShuffleMask(VL.size());
        for (int Idx : seq<int>(Begin: 0, End: VL.size()))
          ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
        if (I > 0)
          VecLdCost +=
              ::getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: ShuffleMask,
                               CostKind, Index: I * VF, SubTp: SubVecTy);
      }
      // If masked gather cost is higher - better to vectorize, so
      // consider it as a gather node. It will be better estimated
      // later.
      if (MaskedGatherCost >= VecLdCost &&
          VecLdCost - GatherCost < -SLPCostThreshold) {
        if (BestVF)
          *BestVF = VF;
        return true;
      }
    }
    return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
  };
  // TODO: need to improve analysis of the pointers, if not all of them are
  // GEPs or have > 2 operands, we end up with a gather node, which just
  // increases the cost.
  Loop *L = LI->getLoopFor(BB: cast<LoadInst>(Val: VL0)->getParent());
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(Range&: PointerOps, P: [L](Value *V) {
                       return L->isLoopInvariant(V);
                     })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(Range&: PointerOps, P: [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(Val: P);
        return (!GEP && doesNotNeedToBeScheduled(V: P)) ||
               (GEP && GEP->getNumOperands() == 2 &&
                isa<Constant, Instruction>(Val: GEP->getOperand(i_nocapture: 1)));
      })) {
    // Check if potential masked gather can be represented as series
    // of loads + insertsubvectors.
    // If masked gather cost is higher - better to vectorize, so
    // consider it as a gather node. It will be better estimated
    // later.
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::ScatterVectorize;
  }

  return LoadsState::Gather;
}
7752
/// Tries to cluster the pointer operands \p VL (with their matching parent
/// blocks \p BBs) by (block, underlying base object) and constant offset, and
/// fills \p SortedIndices with an order that places related pointers next to
/// one another. Returns false when no profitable clustering was found.
static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
                                   ArrayRef<BasicBlock *> BBs, Type *ElemTy,
                                   const DataLayout &DL, ScalarEvolution &SE,
                                   SmallVectorImpl<unsigned> &SortedIndices) {
  assert(
      all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
      "Expected list of pointer operands.");
  // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
  // Ptr into, sort and return the sorted indices with values next to one
  // another.
  SmallMapVector<
      std::pair<BasicBlock *, Value *>,
      SmallVector<SmallVector<std::tuple<Value *, int64_t, unsigned>>>, 8>
      Bases;
  // Seed the map with the first pointer as its own cluster head.
  Bases
      .try_emplace(Key: std::make_pair(
          x: BBs.front(), y: getUnderlyingObject(V: VL.front(), MaxLookup: RecursionMaxDepth)))
      .first->second.emplace_back().emplace_back(Args: VL.front(), Args: 0U, Args: 0U);

  SortedIndices.clear();
  for (auto [Cnt, Ptr] : enumerate(First: VL.drop_front())) {
    auto Key = std::make_pair(x: BBs[Cnt + 1],
                              y: getUnderlyingObject(V: Ptr, MaxLookup: RecursionMaxDepth));
    // Try to join an existing cluster for this (block, base) key; this
    // succeeds when the pointer is at a constant distance from the cluster's
    // head.
    bool Found = any_of(Range&: Bases.try_emplace(Key).first->second,
                        P: [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
                          std::optional<int64_t> Diff =
                              getPointersDiff(ElemTy, std::get<0>(Base.front()),
                                              ElemTy, Ptr, DL, SE,
                                              /*StrictCheck=*/true);
                          if (!Diff)
                            return false;

                          Base.emplace_back(Ptr, *Diff, Cnt + 1);
                          return true;
                        });

    if (!Found) {
      // If we haven't found enough to usefully cluster, return early.
      if (Bases.size() > VL.size() / 2 - 1)
        return false;

      // Not found already - add a new Base
      Bases.find(Key)->second.emplace_back().emplace_back(Args: Ptr, Args: 0, Args: Cnt + 1);
    }
  }

  // Every pointer in its own cluster - nothing to gain from sorting.
  if (Bases.size() == VL.size())
    return false;

  if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
                            Bases.front().second.size() == VL.size()))
    return false;

  // For each of the bases sort the pointers by Offset and check if any of the
  // base become consecutively allocated.
  auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
    SmallPtrSet<Value *, 13> FirstPointers;
    SmallPtrSet<Value *, 13> SecondPointers;
    Value *P1 = Ptr1;
    Value *P2 = Ptr2;
    unsigned Depth = 0;
    // Walk both underlying-object chains in lock-step until one chain reaches
    // a value already visited by the other.
    while (!FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1)) {
      if (P1 == P2 || Depth > RecursionMaxDepth)
        return false;
      FirstPointers.insert(Ptr: P1);
      SecondPointers.insert(Ptr: P2);
      P1 = getUnderlyingObject(V: P1, /*MaxLookup=*/1);
      P2 = getUnderlyingObject(V: P2, /*MaxLookup=*/1);
      ++Depth;
    }
    assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
           "Unable to find matching root.");
    return FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1);
  };
  for (auto &Base : Bases) {
    for (auto &Vec : Base.second) {
      if (Vec.size() > 1) {
        // Sort the cluster by offset and require the offsets to form one
        // contiguous run starting at the smallest offset.
        stable_sort(Range&: Vec, C: llvm::less_second());
        int64_t InitialOffset = std::get<1>(t&: Vec[0]);
        bool AnyConsecutive =
            all_of(Range: enumerate(First&: Vec), P: [InitialOffset](const auto &P) {
              return std::get<1>(P.value()) ==
                     int64_t(P.index()) + InitialOffset;
            });
        // Fill SortedIndices array only if it looks worth-while to sort the
        // ptrs.
        if (!AnyConsecutive)
          return false;
      }
    }
    stable_sort(Range&: Base.second, C: [&](const auto &V1, const auto &V2) {
      return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
    });
  }

  // Emit the final order: clusters back-to-back, original indices inside.
  for (auto &T : Bases)
    for (const auto &Vec : T.second)
      for (const auto &P : Vec)
        SortedIndices.push_back(Elt: std::get<2>(t: P));

  assert(SortedIndices.size() == VL.size() &&
         "Expected SortedIndices to be the size of VL");
  return true;
}
7857
7858std::optional<BoUpSLP::OrdersType>
7859BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7860 assert(TE.isGather() && "Expected gather node only.");
7861 Type *ScalarTy = TE.Scalars[0]->getType();
7862
7863 SmallVector<Value *> Ptrs;
7864 Ptrs.reserve(N: TE.Scalars.size());
7865 SmallVector<BasicBlock *> BBs;
7866 BBs.reserve(N: TE.Scalars.size());
7867 for (Value *V : TE.Scalars) {
7868 auto *L = dyn_cast<LoadInst>(Val: V);
7869 if (!L || !L->isSimple())
7870 return std::nullopt;
7871 Ptrs.push_back(Elt: L->getPointerOperand());
7872 BBs.push_back(Elt: L->getParent());
7873 }
7874
7875 BoUpSLP::OrdersType Order;
7876 if (!LoadEntriesToVectorize.contains(key: TE.Idx) &&
7877 clusterSortPtrAccesses(VL: Ptrs, BBs, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
7878 return std::move(Order);
7879 return std::nullopt;
7880}
7881
7882/// Check if two insertelement instructions are from the same buildvector.
/// Check if two insertelement instructions are from the same buildvector.
/// \param VU First insertelement instruction.
/// \param V Second insertelement instruction.
/// \param GetBaseOperand Returns the "vector" operand used to walk up an
/// insertelement chain (lets callers look through intermediate values).
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Instructions must be from the same basic blocks.
  if (VU->getParent() != V->getParent())
    return false;
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getElementIndex(Inst: IE1);
  std::optional<unsigned> Idx2 = getElementIndex(Inst: IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to find
  // either VU as the original vector for IE2 or V as the original vector for
  // IE1.
  // ReusedIdx tracks which element indices were already seen along the walk;
  // hitting the same index twice aborts the search.
  SmallBitVector ReusedIdx(
      cast<VectorType>(Val: VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    // V's chain of vector operands reached VU (and VU's own chain has
    // stopped): both belong to one buildvector, provided VU is single-used.
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      // NOTE: this Idx1 intentionally shadows the outer Idx1; missing indices
      // fall back to the other insert's index.
      unsigned Idx1 = getElementIndex(Inst: IE1).value_or(u&: *Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx: Idx1);
      ReusedIdx.set(Idx1);
      // Stop walking this chain on a multi-use insert or a repeated index.
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      // Same shadowing pattern as above, for the second chain.
      unsigned Idx2 = getElementIndex(Inst: IE2).value_or(u&: *Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx: Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
7933
7934/// Checks if the specified instruction \p I is an alternate operation for
7935/// the given \p MainOp and \p AltOp instructions.
7936static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7937 Instruction *AltOp,
7938 const TargetLibraryInfo &TLI);
7939
/// Computes the preferred scalar order for the tree entry \p TE, if any.
/// \p TopToBottom is true during the top-down reordering stage;
/// \p IgnoreReorder indicates the root node order itself may be adjusted.
/// Returns std::nullopt when no (profitable) order was found.
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
                           bool IgnoreReorder) {
  // No need to reorder if need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");

    if (isSplat(VL: TE.Scalars))
      return std::nullopt;
    // Check if reuse shuffle indices can be improved by reordering.
    // For this, check that reuse mask is "clustered", i.e. each scalar values
    // is used once in each submask of size <number_of_scalars>.
    // Example: 4 scalar values.
    // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
    //                           0, 1, 2, 3, 3, 3, 1, 0 - not clustered,
    //                           because element 3 is used twice in the second
    //                           submask.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(Order: *CurrentOrder);
        inversePermutation(Indices: *CurrentOrder, Mask);
        ::addMask(Mask, SubMask: TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        // Combine the found order with the reuse mask, submask by submask.
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(First: ArrayRef(Mask).slice(N: K * Sz, M: Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: TE.Scalars.front()->getType(),
                                                VF: 2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (TE.ReuseShuffleIndices.size() % Sz != 0)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
                                                     VF: Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
      else
        inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
      ::addMask(Mask&: ReorderMask, SubMask: TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(Numerator: VF, Denominator: Sz);
      SmallBitVector UsedVals(NumParts);
      // Require every submask of size Sz to reference a single distinct value
      // (modulo poison) and build the per-part order from those values.
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(a: Sz, b: VF - I);
        if (any_of(Range: ArrayRef(ReorderMask).slice(N: I, M: Limit),
                   P: [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Idx: Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF && I + K < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    // Try build correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
        all_of(Range: TE.Scalars, P: [Sz](Value *V) {
          if (isa<PoisonValue>(Val: V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(E: cast<Instruction>(Val: V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
      else
        inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
      // Remap each reused lane to the position of its extract index.
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(E: cast<Instruction>(Val: V));
        Idx = std::distance(first: ReorderMask.begin(), last: find(Range&: ReorderMask, Val: *EI));
      }
    }
    // Build the order of the VF size, need to reorder reuses shuffles, they are
    // always of VF size.
    OrdersType ResOrder(VF);
    std::iota(first: ResOrder.begin(), last: ResOrder.end(), value: 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(N: K, M: Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(first: SubMask.begin(), last: SubMask.end(), value: 0);
      reorderOrder(Order&: CurrentOrder, Mask: SubMask);
      transform(Range&: CurrentOrder, d_first: It, F: [K](unsigned Pos) { return Pos + K; });
      std::advance(i&: It, n: Sz);
    }
    if (TE.isGather() && all_of(Range: enumerate(First&: ResOrder), P: [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  // Bottom-up stage: keep the (possibly reversed) order of strided loads as-is
  // unless the user node is a binary operation.
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
       !Instruction::isBinaryOp(Opcode: TE.UserTreeIndex.UserTE->getOpcode())) &&
      (TE.ReorderIndices.empty() || isReverseOrder(Order: TE.ReorderIndices)))
    return std::nullopt;
  if (TE.State == TreeEntry::SplitVectorize ||
      ((TE.State == TreeEntry::Vectorize ||
        TE.State == TreeEntry::StridedVectorize ||
        TE.State == TreeEntry::CompressVectorize) &&
       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: TE.getMainOp()) ||
        (TopToBottom && isa<StoreInst, InsertElementInst>(Val: TE.getMainOp()))))) {
    assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
           "Alternate instructions are only supported by "
           "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  // For an alternate-opcode root, derive the order from the alt-op shuffle
  // mask.
  if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
      TE.isAltShuffle()) {
    assert(TE.ReuseShuffleIndices.empty() &&
           "ReuseShuffleIndices should be "
           "empty for alternate instructions.");
    SmallVector<int> Mask;
    TE.buildAltOpShuffleMask(
        IsAltOp: [&](Instruction *I) {
          assert(TE.getMatchingMainOpOrAltOp(I) &&
                 "Unexpected main/alternate opcode");
          return isAlternateInstruction(I, MainOp: TE.getMainOp(), AltOp: TE.getAltOp(), TLI: *TLI);
        },
        Mask);
    const int VF = TE.getVectorFactor();
    OrdersType ResOrder(VF, VF);
    for (unsigned I : seq<unsigned>(Size: VF)) {
      if (Mask[I] == PoisonMaskElem)
        continue;
      ResOrder[Mask[I] % VF] = I;
    }
    return std::move(ResOrder);
  }
  if (!TE.ReorderIndices.empty())
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;

    // For each PHI find the head of its buildvector chain: the first
    // insertelement of a single-use chain within one block, if any.
    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(t&: UserBVHead, u: TE.Scalars)) {
      if (isa<Constant>(Val: V) || !V->hasNUsesOrMore(N: 1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(Val: *V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
      }
      I = BVHead;
    }

    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      if (!DT->isReachableFromEntry(A: BB1))
        return false;
      if (!DT->isReachableFromEntry(A: BB2))
        return true;
      auto *NodeA = DT->getNode(BB: BB1);
      auto *NodeB = DT->getNode(BB: BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    // Order PHIs by their first users: poison first, then by number of uses,
    // then by the kind/position of the first user.
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
        return false;
      if (isa<PoisonValue>(Val: V1))
        return true;
      if (isa<PoisonValue>(Val: V2))
        return false;
      if (V1->getNumUses() < V2->getNumUses())
        return true;
      if (V1->getNumUses() > V2->getNumUses())
        return false;
      auto *FirstUserOfPhi1 = cast<Instruction>(Val: *V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(Val: *V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      auto *IE1 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi1);
      auto *IE2 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi2);
      auto *EE1 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi1);
      auto *EE2 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi2);
      if (IE1 && !IE2)
        return true;
      if (!IE1 && IE2)
        return false;
      if (IE1 && IE2) {
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          return getElementIndex(Inst: IE1) < getElementIndex(Inst: IE2);
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(Other: UserBVHead[I2]);
      }
      if (EE1 && !EE2)
        return true;
      if (!EE1 && EE2)
        return false;
      if (EE1 && EE2) {
        auto *Inst1 = dyn_cast<Instruction>(Val: EE1->getOperand(i_nocapture: 0));
        auto *Inst2 = dyn_cast<Instruction>(Val: EE2->getOperand(i_nocapture: 0));
        auto *P1 = dyn_cast<Argument>(Val: EE1->getOperand(i_nocapture: 0));
        auto *P2 = dyn_cast<Argument>(Val: EE2->getOperand(i_nocapture: 0));
        if (!Inst2 && !P2)
          return Inst1 || P1;
        if (EE1->getOperand(i_nocapture: 0) == EE2->getOperand(i_nocapture: 0))
          return getElementIndex(Inst: EE1) < getElementIndex(Inst: EE2);
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          if (Inst1->getParent() != Inst2->getParent())
            return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
          return Inst1->comesBefore(Other: Inst2);
        }
        if (!P1 && P2)
          return false;
        assert(P1 && P2 &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(first: Phis.begin(), last: Phis.end(), value: 0);
    stable_sort(Range&: Phis, C: PHICompare);
    if (isIdentityOrder(Order: Phis))
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
  if (TE.isGather() &&
      (!TE.hasState() || !TE.isAltShuffle() ||
       ScalarsInSplitNodes.contains(Val: TE.getMainOp())) &&
      allSameType(VL: TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         (all_of(Range: TE.Scalars, P: IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(Range: TE.Scalars, P: IsaPred<ExtractElementInst>))) &&
        all_of(Range: TE.Scalars, P: [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(Val: V);
          return !EE || isa<FixedVectorType>(Val: EE->getVectorOperandType());
        })) {
      // Check that gather of extractelements can be represented as
      // just a shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse =
          canReuseExtract(VL: TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    // insertelement poison, v, 0 [+ permute]
    // is cheaper than
    // insertelement poison, v, n - try to reorder.
    // If rotating the whole graph, exclude the permute cost, the whole graph
    // might be transformed.
    int Sz = TE.Scalars.size();
    if (isSplat(VL: TE.Scalars) && !allConstant(VL: TE.Scalars) &&
        count_if(Range: TE.Scalars, P: IsaPred<UndefValue>) == Sz - 1) {
      const auto *It = find_if_not(Range: TE.Scalars, P: isConstant);
      if (It == TE.Scalars.begin())
        return OrdersType();
      auto *Ty = getWidenedType(ScalarTy: TE.Scalars.front()->getType(), VF: Sz);
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(first: TE.Scalars.begin(), last: It);
        Order[Idx] = 0;
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Indices: Order, Mask);
        InstructionCost PermuteCost =
            TopToBottom
                ? 0
                : ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: Ty, Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: 0,
            Op0: PoisonValue::get(T: Ty), Op1: *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: Idx,
            Op0: PoisonValue::get(T: Ty), Op1: *It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(VL: TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if can include the order of vectorized loads. For masked gathers do
    // extra analysis later, so include such nodes into a special list.
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      StridedPtrInfo SPtrInfo;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(VL: TE.Scalars, VL0: TE.Scalars.front(),
                                         Order&: CurrentOrder, PointerOps, SPtrInfo);
      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
          Res == LoadsState::CompressVectorize)
        return std::move(CurrentOrder);
    }
    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been auditted for correctness with non-power-of-two vectors.
    if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI))
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
        return CurrentOrder;
  }
  return std::nullopt;
}
8298
8299/// Checks if the given mask is a "clustered" mask with the same clusters of
8300/// size \p Sz, which are not identity submasks.
8301static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
8302 unsigned Sz) {
8303 ArrayRef<int> FirstCluster = Mask.slice(N: 0, M: Sz);
8304 if (ShuffleVectorInst::isIdentityMask(Mask: FirstCluster, NumSrcElts: Sz))
8305 return false;
8306 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
8307 ArrayRef<int> Cluster = Mask.slice(N: I, M: Sz);
8308 if (Cluster != FirstCluster)
8309 return false;
8310 }
8311 return true;
8312}
8313
/// Reorders the reuse mask of \p TE with \p Mask and, for gather nodes whose
/// resulting reuse mask is a repeated non-identity cluster, folds that
/// permutation into the scalars themselves so the reuse mask becomes a
/// repetition of identity submasks.
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder reuses mask.
  reorderReuses(Reuses&: TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reused no need to do anything else.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
                                                   VF: Sz) ||
      !isRepeatedNonIdentityClusteredMask(Mask: TE.ReuseShuffleIndices, Sz))
    return;
  // Compose the node's reorder with the reuse mask.
  SmallVector<int> NewMask;
  inversePermutation(Indices: TE.ReorderIndices, Mask&: NewMask);
  addMask(Mask&: NewMask, SubMask: TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(N: 0, M: Sz);
  SmallVector<unsigned> NewOrder(Slice);
  inversePermutation(Indices: NewOrder, Mask&: NewMask);
  reorderScalars(Scalars&: TE.Scalars, Mask: NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(i&: It, n: Sz))
    std::iota(first: It, last: std::next(x: It, n: Sz), value: 0);
}
8340
8341static void combineOrders(MutableArrayRef<unsigned> Order,
8342 ArrayRef<unsigned> SecondaryOrder) {
8343 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8344 "Expected same size of orders");
8345 size_t Sz = Order.size();
8346 SmallBitVector UsedIndices(Sz);
8347 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz)) {
8348 if (Order[Idx] != Sz)
8349 UsedIndices.set(Order[Idx]);
8350 }
8351 if (SecondaryOrder.empty()) {
8352 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
8353 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8354 Order[Idx] = Idx;
8355 } else {
8356 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
8357 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8358 !UsedIndices.test(Idx: SecondaryOrder[Idx]))
8359 Order[Idx] = SecondaryOrder[Idx];
8360 }
8361}
8362
/// Heuristically decides whether running the (compile-time expensive) graph
/// reordering is likely to pay off for the current vectorizable tree.
bool BoUpSLP::isProfitableToReorder() const {
  if (DisableTreeReorder)
    return false;

  constexpr unsigned TinyVF = 2;
  constexpr unsigned TinyTree = 10;
  constexpr unsigned PhiOpsLimit = 12;
  constexpr unsigned GatherLoadsLimit = 2;
  // Small trees are always cheap enough to (attempt to) reorder.
  if (VectorizableTree.size() <= TinyTree)
    return true;
  if (VectorizableTree.front()->hasState() &&
      !VectorizableTree.front()->isGather() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::PHI ||
       (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
        (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
         VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
      VectorizableTree.front()->ReorderIndices.empty()) {
    // Check if the tree has only single store and single (unordered) load node,
    // other nodes are phis or geps/binops, combined with phis, and/or single
    // gather load node
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::PHI &&
        VectorizableTree.front()->Scalars.size() == TinyVF &&
        VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
      return false;
    // Single node, which require reorder - skip.
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Store &&
        VectorizableTree.front()->ReorderIndices.empty()) {
      // Count split nodes whose reorder could be folded into a commutative
      // vectorized user.
      const unsigned ReorderedSplitsCnt =
          count_if(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->State == TreeEntry::SplitVectorize &&
                   !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
                   TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
                   ::isCommutative(I: TE->UserTreeIndex.UserTE->getMainOp());
          });
      // If (almost) all nodes carry no useful order, reordering cannot help.
      if (ReorderedSplitsCnt <= 1 &&
          static_cast<unsigned>(count_if(
              Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
                return ((!TE->isGather() &&
                         (TE->ReorderIndices.empty() ||
                          (TE->UserTreeIndex.UserTE &&
                           TE->UserTreeIndex.UserTE->State ==
                               TreeEntry::Vectorize &&
                           !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
                                .empty()))) ||
                        (TE->isGather() && TE->ReorderIndices.empty() &&
                         (!TE->hasState() || TE->isAltShuffle() ||
                          TE->getOpcode() == Instruction::Load ||
                          TE->getOpcode() == Instruction::ZExt ||
                          TE->getOpcode() == Instruction::SExt))) &&
                       (VectorizableTree.front()->getVectorFactor() > TinyVF ||
                        !TE->isGather() || none_of(Range&: TE->Scalars, P: [&](Value *V) {
                          return !isConstant(V) && isVectorized(V);
                        }));
              })) >= VectorizableTree.size() - ReorderedSplitsCnt)
        return false;
    }
    bool HasPhis = false;
    bool HasLoad = true;
    unsigned GatherLoads = 0;
    for (const std::unique_ptr<TreeEntry> &TE :
         ArrayRef(VectorizableTree).drop_front()) {
      if (TE->State == TreeEntry::SplitVectorize)
        continue;
      if (!TE->hasState()) {
        // State-less nodes made of constants/phis (or binops+phis) do not
        // affect the decision.
        if (all_of(Range&: TE->Scalars, P: IsaPred<Constant, PHINode>) ||
            all_of(Range&: TE->Scalars, P: IsaPred<BinaryOperator, PHINode>))
          continue;
        if (VectorizableTree.front()->Scalars.size() == TinyVF &&
            any_of(Range&: TE->Scalars, P: IsaPred<PHINode, GEPOperator>))
          continue;
        return true;
      }
      if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
        if (!TE->isGather()) {
          HasLoad = false;
          continue;
        }
        // A second unordered load node (or enough gathered loads) makes the
        // reordering attempt worthwhile.
        if (HasLoad)
          return true;
        ++GatherLoads;
        if (GatherLoads >= GatherLoadsLimit)
          return true;
      }
      if (TE->getOpcode() == Instruction::GetElementPtr ||
          Instruction::isBinaryOp(Opcode: TE->getOpcode()))
        continue;
      if (TE->getOpcode() != Instruction::PHI &&
          (!TE->hasCopyableElements() ||
           static_cast<unsigned>(count_if(Range&: TE->Scalars, P: IsaPred<PHINode>)) <
               TE->Scalars.size() / 2))
        return true;
      if (VectorizableTree.front()->Scalars.size() == TinyVF &&
          TE->getNumOperands() > PhiOpsLimit)
        return false;
      HasPhis = true;
    }
    return !HasPhis;
  }
  return true;
}
8466
8467void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8468 ArrayRef<int> MaskOrder) {
8469 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8470 SmallVector<int> NewMask(getVectorFactor());
8471 SmallVector<int> NewMaskOrder(getVectorFactor());
8472 std::iota(first: NewMask.begin(), last: NewMask.end(), value: 0);
8473 std::iota(first: NewMaskOrder.begin(), last: NewMaskOrder.end(), value: 0);
8474 if (Idx == 0) {
8475 copy(Range&: Mask, Out: NewMask.begin());
8476 copy(Range&: MaskOrder, Out: NewMaskOrder.begin());
8477 } else {
8478 assert(Idx == 1 && "Expected either 0 or 1 index.");
8479 unsigned Offset = CombinedEntriesWithIndices.back().second;
8480 for (unsigned I : seq<unsigned>(Size: Mask.size())) {
8481 NewMask[I + Offset] = Mask[I] + Offset;
8482 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8483 }
8484 }
8485 reorderScalars(Scalars, Mask: NewMask);
8486 reorderOrder(Order&: ReorderIndices, Mask: NewMaskOrder, /*BottomOrder=*/true);
8487 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(Order: ReorderIndices))
8488 ReorderIndices.clear();
8489}
8490
/// Reorders the SLP graph top-to-bottom. For every vectorization factor it
/// collects the orders preferred by the nodes (orders requested by external
/// store users, alternate-shuffle nodes, PHI nodes and gathers), elects the
/// most frequently used order for that VF, and applies it: scalars, operands,
/// reuse masks and user split-vectorize nodes are all updated consistently.
void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
  // ExtractElement gather nodes which can be vectorized and need to handle
  // their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;

  // Phi nodes can have preferred ordering based on their result users.
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;

  // AltShuffles can also have a preferred ordering that leads to fewer
  // instructions, e.g., the addsub instruction in x86.
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;

  // Maps a TreeEntry to the reorder indices of external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores,loads,extracts + some gathering of
  // extracts.
  for_each(Range&: VectorizableTree, F: [&, &TTIRef = *TTI](
                                          const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE: TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
      ExternalUserReorderMap.try_emplace(Key: TE.get(),
                                         Args: std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction in
    // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
    // to take into account their order when looking for the most used order.
    if (TE->hasState() && TE->isAltShuffle() &&
        TE->State != TreeEntry::SplitVectorize) {
      Type *ScalarTy = TE->Scalars[0]->getType();
      VectorType *VecTy = getWidenedType(ScalarTy, VF: TE->Scalars.size());
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(VL: TE->Scalars, ScalarTy, Opcode0, Opcode1));
      // If this pattern is supported by the target then we consider the order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
        AltShufflesToOrders.try_emplace(Key: TE.get(), Args: OrdersType());
      }
      // TODO: Check the reverse order too.
    }

    bool IgnoreReorder =
        !UserIgnoreList && VectorizableTree.front()->hasState() &&
        (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
         VectorizableTree.front()->getOpcode() == Instruction::Store);
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(TE: *TE, /*TopToBottom=*/true, IgnoreReorder)) {
      // Do not include ordering for nodes used in the alt opcode vectorization,
      // better to reorder them during bottom-to-top stage. If follow the order
      // here, it causes reordering of the whole graph though actually it is
      // profitable just to reorder the subgraph that starts from the alternate
      // opcode vectorization node. Such nodes already end-up with the shuffle
      // instruction and it is just enough to change this shuffle rather than
      // rotate the scalars for the whole graph.
      unsigned Cnt = 0;
      const TreeEntry *UserTE = TE.get();
      // Walk up the user chain (bounded by RecursionMaxDepth) looking for an
      // alternate-opcode vectorize ancestor.
      while (UserTE && Cnt < RecursionMaxDepth) {
        if (!UserTE->UserTreeIndex)
          break;
        if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
            UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
            UserTE->UserTreeIndex.UserTE->Idx != 0)
          // Exits only the enclosing for_each lambda: skip recording any
          // ordering for this entry.
          return;
        UserTE = UserTE->UserTreeIndex.UserTE;
        ++Cnt;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  // Walk VFs downward from the root's factor: an even VF steps down by 2, an
  // odd VF first drops by 1 to the next even value.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(Val: VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order. We just are looking for the most
    // used order and reorder scalar elements in the nodes according to this
    // mostly used order.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    // Delete VF entry upon exit.
    llvm::scope_exit Cleanup([&]() { VFToOrderedEntries.erase(I: It); });

    // All operands are reordered and used only in this node - propagate the
    // most used order to the user node.
    // Maps each candidate order to its use count.
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder this nodes, still need to extend and to use shuffle,
      // just need to merge reordering shuffle and the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(Val: OpTE) &&
          OpTE->State != TreeEntry::SplitVectorize)
        continue;
      // Count number of orders uses. The order is looked up in the per-kind
      // maps populated above; ReorderIndices is the fallback.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(Val: OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(Val: OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(Val: OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(Val: OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars - use natural order,
        // it is an attempt to reorder node with reused scalars but with
        // external uses.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.try_emplace(Key: OrdersType(), Args: 0).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.try_emplace(Key: ExtOrder, Args: 0).first->second;
        }
        // No other useful reorder data in this entry.
        if (Order.empty())
          continue;
      }
      // Stores actually store the mask, not the order, need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Indices: Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(Order: CurrentOrder);
        ++OrdersUses.try_emplace(Key: CurrentOrder, Args: 0).first->second;
      } else {
        ++OrdersUses.try_emplace(Key: Order, Args: 0).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order. Empty orders and explicit identity orders
    // both count toward the identity candidate; FilledIdentityCnt tracks only
    // the non-empty ones for tie-breaking below.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But, if filled identity found (non-empty order)
      // with same number of uses, as the new candidate order, we can choose
      // this candidate order.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(Order: BestOrder))) {
        combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
      }
    }
    // Set order of the user node.
    if (isIdentityOrder(Order: BestOrder))
      continue;
    fixupOrderingIndices(Order: BestOrder);
    // Mask is the inverse permutation of BestOrder; MaskOrder is BestOrder
    // itself expressed as a shuffle mask (poison for out-of-range lanes).
    SmallVector<int> Mask;
    inversePermutation(Indices: BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(TE->State != TreeEntry::SplitVectorize &&
                 "Split vectorized not expected.");
          // Need to reorder the reuses masks of the operands with smaller VF to
          // be able to find the match between the graph nodes and scalar
          // operands of the given node during vectorization/cost estimation.
          assert(
              (!TE->UserTreeIndex ||
               TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
               TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
               TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
              "All users must be of VF size.");
          if (SLPReVec) {
            assert(SLPReVec && "Only supported by REVEC.");
            // ShuffleVectorInst does not do reorderOperands (and it should not
            // because ShuffleVectorInst supports only a limited set of
            // patterns). Only do reorderNodeWithReuses if the user is not
            // ShuffleVectorInst.
            if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
                isa<ShuffleVectorInst>(Val: TE->UserTreeIndex.UserTE->getMainOp()))
              continue;
          }
          // Update ordering of the operands with the smaller VF than the given
          // one.
          reorderNodeWithReuses(TE&: *TE, Mask);
          // Update orders in user split vectorize nodes.
          if (TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
            TE->UserTreeIndex.UserTE->reorderSplitNode(
                Idx: TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
        }
        continue;
      }
      if ((TE->State == TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty()) ||
          ((TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize) &&
           (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
                InsertElementInst>(Val: TE->getMainOp()) ||
            (SLPReVec && isa<ShuffleVectorInst>(Val: TE->getMainOp()))))) {
        assert(
            (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
                                     TE->ReuseShuffleIndices.empty())) &&
            "Alternate instructions are only supported by BinaryOperator "
            "and CastInst.");
        // Build correct orders for extract{element,value}, loads,
        // stores and alternate (split) nodes.
        reorderOrder(Order&: TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(Val: TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(Scalars&: TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply reversed order to keep the original ordering of the reused
        // elements to avoid extra reorder indices shuffling.
        OrdersType CurrentOrder;
        reorderOrder(Order&: CurrentOrder, Mask: MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(Indices: CurrentOrder, Mask&: NewReuses);
        addMask(Mask&: NewReuses, SubMask: TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(RHS&: NewReuses);
      } else if (TE->UserTreeIndex &&
                 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
        // Update orders in user split vectorize nodes.
        TE->UserTreeIndex.UserTE->reorderSplitNode(Idx: TE->UserTreeIndex.EdgeIdx,
                                                   Mask, MaskOrder);
    }
  }
}
8782
8783void BoUpSLP::buildReorderableOperands(
8784 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8785 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8786 SmallVectorImpl<TreeEntry *> &GatherOps) {
8787 for (unsigned I : seq<unsigned>(Size: UserTE->getNumOperands())) {
8788 if (any_of(Range&: Edges, P: [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8789 return OpData.first == I &&
8790 (OpData.second->State == TreeEntry::Vectorize ||
8791 OpData.second->State == TreeEntry::StridedVectorize ||
8792 OpData.second->State == TreeEntry::CompressVectorize ||
8793 OpData.second->State == TreeEntry::SplitVectorize);
8794 }))
8795 continue;
8796 // Do not request operands, if they do not exist.
8797 if (UserTE->hasState()) {
8798 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8799 UserTE->getOpcode() == Instruction::ExtractValue)
8800 continue;
8801 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8802 continue;
8803 if (UserTE->getOpcode() == Instruction::Store &&
8804 UserTE->State == TreeEntry::Vectorize && I == 1)
8805 continue;
8806 if (UserTE->getOpcode() == Instruction::Load &&
8807 (UserTE->State == TreeEntry::Vectorize ||
8808 UserTE->State == TreeEntry::StridedVectorize ||
8809 UserTE->State == TreeEntry::CompressVectorize))
8810 continue;
8811 }
8812 TreeEntry *TE = getOperandEntry(E: UserTE, Idx: I);
8813 assert(TE && "Expected operand entry.");
8814 if (!TE->isGather()) {
8815 // Add the node to the list of the ordered nodes with the identity
8816 // order.
8817 Edges.emplace_back(Args&: I, Args&: TE);
8818 // Add ScatterVectorize nodes to the list of operands, where just
8819 // reordering of the scalars is required. Similar to the gathers, so
8820 // simply add to the list of gathered ops.
8821 // If there are reused scalars, process this node as a regular vectorize
8822 // node, just reorder reuses mask.
8823 if (TE->State == TreeEntry::ScatterVectorize &&
8824 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8825 GatherOps.push_back(Elt: TE);
8826 continue;
8827 }
8828 if (ReorderableGathers.contains(Ptr: TE))
8829 GatherOps.push_back(Elt: TE);
8830 }
8831}
8832
/// Reorders the SLP graph bottom-to-top: starting from nodes that carry
/// reordering data, the preferred order is propagated to their user nodes
/// (one user at a time, popped from a priority queue keyed by the user's
/// index), reordering operands, scalars and reuse masks as required. When
/// \p IgnoreReorder is set, a remaining reorder on the root node is dropped
/// at the end as unnecessary.
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  // Orders entries primarily by the index of their user node, so that all
  // operands of the same user are popped from the queue consecutively.
  struct TreeEntryCompare {
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
    }
  };
  PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
  DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads,extracts without alternate operands +
  // some gathering of extracts.
  SmallPtrSet<const TreeEntry *, 4> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(Ptr: TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(TE: *TE, /*TopToBottom=*/false, IgnoreReorder)) {
      Queue.push(x: TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(V: TE.get());
    }
  }

  // 1. Propagate order to the graph nodes, which use only reordered nodes.
  // I.e., if the node has operands, that are reordered, try to make at least
  // one operand order in the natural order and reorder others + reorder the
  // user node itself.
  SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
  while (!Queue.empty()) {
    // 1. Filter out only reordered nodes.
    // Users.first is the common user node; Users.second collects the
    // (operand index, operand entry) pairs that passed the filter below.
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    Queue.pop();
    SmallVector<TreeEntry *> OrderedOps(1, TE);
    // Drain all queued entries that share the same user node.
    while (!Queue.empty()) {
      TE = Queue.top();
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
        break;
      Queue.pop();
      OrderedOps.push_back(Elt: TE);
    }
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(V: TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(Ptr: TE).second)
        continue;
      // Build a map between user nodes and their operands order to speedup
      // search. The graph currently does not provide this dependency directly.
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(Args&: TE->UserTreeIndex.EdgeIdx, Args&: TE);
    }
    if (Users.first) {
      auto &Data = Users;
      if (Data.first->State == TreeEntry::SplitVectorize) {
        assert(
            Data.second.size() <= 2 &&
            "Expected not greater than 2 operands for split vectorize node.");
        if (any_of(Range&: Data.second,
                   P: [](const auto &Op) { return !Op.second->UserTreeIndex; }))
          continue;
        // Update orders in user split vectorize nodes.
        assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
               "Expected exactly 2 entries.");
        for (const auto &P : Data.first->CombinedEntriesWithIndices) {
          TreeEntry &OpTE = *VectorizableTree[P.first];
          OrdersType Order = OpTE.ReorderIndices;
          if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
            if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
              continue;
            const auto BestOrder =
                getReorderingData(TE: OpTE, /*TopToBottom=*/false, IgnoreReorder);
            if (!BestOrder || BestOrder->empty() || isIdentityOrder(Order: *BestOrder))
              continue;
            Order = *BestOrder;
          }
          fixupOrderingIndices(Order);
          SmallVector<int> Mask;
          inversePermutation(Indices: Order, Mask);
          const unsigned E = Order.size();
          SmallVector<int> MaskOrder(E, PoisonMaskElem);
          transform(Range&: Order, d_first: MaskOrder.begin(), F: [E](unsigned I) {
            return I < E ? static_cast<int>(I) : PoisonMaskElem;
          });
          Data.first->reorderSplitNode(Idx: P.second ? 1 : 0, Mask, MaskOrder);
          // Clear ordering of the operand.
          if (!OpTE.ReorderIndices.empty()) {
            OpTE.ReorderIndices.clear();
          } else if (!OpTE.ReuseShuffleIndices.empty()) {
            reorderReuses(Reuses&: OpTE.ReuseShuffleIndices, Mask);
          } else {
            assert(OpTE.isGather() && "Expected only gather/buildvector node.");
            reorderScalars(Scalars&: OpTE.Scalars, Mask);
          }
        }
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty()) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          Queue.push(x: Data.first);
        }
        continue;
      }
      // Check that operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      buildReorderableOperands(UserTE: Data.first, Edges&: Data.second, ReorderableGathers: NonVectorized,
                               GatherOps);
      // All operands are reordered and used only in this node - propagate the
      // most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though might be not
      // profitable.
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(Ptr: OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(V: OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(TE: *OpTE, /*TopToBottom=*/false,
                                     IgnoreReorder)
                .value_or(u: OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // The order is partially ordered, skip it in favor of fully non-ordered
        // orders.
        if (Order.size() == 1)
          continue;

        // Check that the reordering does not increase number of shuffles, i.e.
        // same-values-nodes has same parents or their parents has same parents.
        if (!Order.empty() && !isIdentityOrder(Order)) {
          Value *Root = OpTE->hasState()
                            ? OpTE->getMainOp()
                            : *find_if_not(Range&: OpTE->Scalars, P: isConstant);
          // Collects the user nodes of all tree entries that contain the same
          // scalars as OpTE (same VF, same size, same values modulo order).
          auto GetSameNodesUsers = [&](Value *Root) {
            SmallSetVector<TreeEntry *, 4> Res;
            for (const TreeEntry *TE : ValueToGatherNodes.lookup(Val: Root)) {
              if (TE != OpTE && TE->UserTreeIndex &&
                  TE->getVectorFactor() == OpTE->getVectorFactor() &&
                  TE->Scalars.size() == OpTE->Scalars.size() &&
                  ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) ||
                   (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars))))
                Res.insert(X: TE->UserTreeIndex.UserTE);
            }
            for (const TreeEntry *TE : getTreeEntries(V: Root)) {
              if (TE != OpTE && TE->UserTreeIndex &&
                  TE->getVectorFactor() == OpTE->getVectorFactor() &&
                  TE->Scalars.size() == OpTE->Scalars.size() &&
                  ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) ||
                   (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars))))
                Res.insert(X: TE->UserTreeIndex.UserTE);
            }
            return Res.takeVector();
          };
          auto GetNumOperands = [](const TreeEntry *TE) {
            if (TE->State == TreeEntry::SplitVectorize)
              return TE->getNumOperands();
            // For calls count only the call arguments, not the callee/bundle
            // operands.
            if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI)
              return CI->arg_size();
            return TE->getNumOperands();
          };
          // True if every (non-scalar-intrinsic-arg) operand of TE already
          // carries reorder or reuse data, i.e. reordering TE is consistent
          // with its operands.
          auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                       const TreeEntry *TE) {
            Intrinsic::ID ID = Intrinsic::not_intrinsic;
            if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI)
              ID = getVectorIntrinsicIDForCall(CI, TLI);
            for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(TE))) {
              if (ID != Intrinsic::not_intrinsic &&
                  isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI))
                continue;
              const TreeEntry *Op = getOperandEntry(E: TE, Idx);
              if (Op->isGather() && Op->hasState()) {
                const TreeEntry *VecOp =
                    getSameValuesTreeEntry(V: Op->getMainOp(), VL: Op->Scalars);
                if (VecOp)
                  Op = VecOp;
              }
              if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
                return false;
            }
            return true;
          };
          SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
          if (!Users.empty() && !all_of(Range&: Users, P: [&](TreeEntry *UTE) {
                if (!RevisitedOps.insert(Ptr: UTE).second)
                  return false;
                return UTE == Data.first || !UTE->ReorderIndices.empty() ||
                       !UTE->ReuseShuffleIndices.empty() ||
                       (UTE->UserTreeIndex &&
                        UTE->UserTreeIndex.UserTE == Data.first) ||
                       (Data.first->UserTreeIndex &&
                        Data.first->UserTreeIndex.UserTE == UTE) ||
                       (IgnoreReorder && UTE->UserTreeIndex &&
                        UTE->UserTreeIndex.UserTE->Idx == 0) ||
                       NodeShouldBeReorderedWithOperands(UTE);
              }))
            continue;
          // Re-queue the operands of the sibling users so they are analyzed
          // again together with this reordering.
          for (TreeEntry *UTE : Users) {
            Intrinsic::ID ID = Intrinsic::not_intrinsic;
            if (auto *CI = dyn_cast<CallInst>(Val: UTE->getMainOp()); CI)
              ID = getVectorIntrinsicIDForCall(CI, TLI);
            for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(UTE))) {
              if (ID != Intrinsic::not_intrinsic &&
                  isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI))
                continue;
              const TreeEntry *Op = getOperandEntry(E: UTE, Idx);
              Visited.erase(Ptr: Op);
              Queue.push(x: const_cast<TreeEntry *>(Op));
            }
          }
        }
        // Weight the order by the number of edges that reference this entry.
        unsigned NumOps = count_if(
            Range&: Data.second, P: [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          assert(!OpTE->isAltShuffle() &&
                 "Alternate instructions are only supported by BinaryOperator "
                 "and CastInst.");
          SmallVector<int> Mask;
          inversePermutation(Indices: Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(Order: CurrentOrder);
          OrdersUses.try_emplace(Key: CurrentOrder, Args: 0).first->second += NumOps;
        } else {
          OrdersUses.try_emplace(Key: Order, Args: 0).first->second += NumOps;
        }
        auto Res = OrdersUses.try_emplace(Key: OrdersType(), Args: 0);
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            if (GathersToOrders.contains(V: TE))
              return !getReorderingData(TE: *TE, /*TopToBottom=*/false,
                                        IgnoreReorder)
                          .value_or(u: OrdersType(1))
                          .empty();
            return true;
          }
          return false;
        };
        if (OpTE->UserTreeIndex) {
          TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
          if (!VisitedUsers.insert(Ptr: UserTE).second)
            continue;
          // May reorder user node if it requires reordering, has reused
          // scalars, is an alternate op vectorize node or its op nodes require
          // reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if users allow reordering.
          // Currently look up just 1 level of operands to avoid increase of
          // the compile time.
          // Profitable to reorder if definitely more operands allow
          // reordering rather than those with natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users.second;
          if (static_cast<unsigned>(count_if(
                  Range&: Ops, P: [UserTE, &AllowsReordering](
                            const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           Op.second->UserTreeIndex.UserTE == UserTE;
                  })) <= Ops.size() / 2)
            // Vote for keeping the natural (empty) order.
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        Visited.insert_range(R: llvm::make_second_range(c&: Data.second));
        continue;
      }
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        // Prefer identity order. But, if filled identity found (non-empty
        // order) with same number of uses, as the new candidate order, we can
        // choose this candidate order.
        if (Cnt < Pair.second) {
          combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
        }
      }
      // Set order of the user node.
      if (isIdentityOrder(Order: BestOrder)) {
        Visited.insert_range(R: llvm::make_second_range(c&: Data.second));
        continue;
      }
      fixupOrderingIndices(Order: BestOrder);
      // Erase operands from OrderedEntries list and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(Indices: BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        if (!VisitedOps.insert(Ptr: TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(TE&: *TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            TE->State != TreeEntry::CompressVectorize &&
            TE->State != TreeEntry::SplitVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(Order&: TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder its scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder reuses indices.
          reorderReuses(Reuses&: Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Scalars&: Gather->Scalars, Mask);
        Visited.insert(Ptr: Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
        return TE.isAltShuffle() &&
               (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
                TE.ReorderIndices.empty());
      };
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Val: Data.first->getMainOp()) ||
          IsNotProfitableAltCodeNode(*Data.first))
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Val: Data.first->getMainOp()) ||
          IsNotProfitableAltCodeNode(*Data.first) ||
          Data.first->State == TreeEntry::StridedVectorize ||
          Data.first->State == TreeEntry::CompressVectorize) {
        reorderScalars(Scalars&: Data.first->Scalars, Mask);
        reorderOrder(Order&: Data.first->ReorderIndices, Mask: MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !IsNotProfitableAltCodeNode(*Data.first)) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          Queue.push(x: Data.first);
        }
      } else {
        reorderOrder(Order&: Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
9240
9241Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
9242 if (Entry.hasState() &&
9243 (Entry.getOpcode() == Instruction::Store ||
9244 Entry.getOpcode() == Instruction::Load) &&
9245 Entry.State == TreeEntry::StridedVectorize &&
9246 !Entry.ReorderIndices.empty() && isReverseOrder(Order: Entry.ReorderIndices))
9247 return dyn_cast<Instruction>(Val: Entry.Scalars[Entry.ReorderIndices.front()]);
9248 return dyn_cast<Instruction>(Val: Entry.Scalars.front());
9249}
9250
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // A scalar can belong to at most ScalarToTreeEntries.size() tree entries,
  // so a scalar with this many uses necessarily has at least one user
  // outside of the vectorized tree.
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  // Maps a scalar to the index of its record in ExternalUses, so the same
  // scalar is not processed twice once fully handled.
  DenseMap<Value *, unsigned> ScalarToExtUses;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
        DeletedNodes.contains(Ptr: Entry) ||
        TransformedToGatherNodes.contains(Val: Entry))
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      // Non-instruction scalars and copyable elements never need extraction.
      if (!isa<Instruction>(Val: Scalar) || Entry->isCopyableElement(V: Scalar))
        continue;

      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Val: Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // Too many uses to enumerate individually - conservatively record an
      // external use with an unknown (null) user, covering all of them.
      if (Scalar->hasNUsesOrMore(N: NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(V: Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << "for many users.\n");
        It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
        ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: *Entry, Args&: FoundLane);
        ExternalUsesWithNonUsers.insert(Ptr: Scalar);
        continue;
      }

      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(V: Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(V: Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size());
        ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: *Entry, Args&: FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(Val: U);
        if (!UserInst || isDeleted(I: UserInst))
          continue;

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(V: UserInst))
          continue;

        // Skip in-tree scalars that become vectors
        if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(V: U);
            any_of(Range&: UseEntries, P: [this](const TreeEntry *UseEntry) {
              return !DeletedNodes.contains(Ptr: UseEntry) &&
                     !TransformedToGatherNodes.contains(Val: UseEntry);
            })) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          // Pointer operands of vectorized loads/stores and call arguments
          // may still be needed in scalar form even though the user itself
          // is vectorized; everything else is removed with its user.
          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
                 isa<LoadInst, StoreInst>(Val: UserInst)) ||
                isa<CallInst>(Val: UserInst)) ||
              all_of(Range&: UseEntries, P: [&](TreeEntry *UseEntry) {
                if (DeletedNodes.contains(Ptr: UseEntry) ||
                    TransformedToGatherNodes.contains(Val: UseEntry))
                  return true;
                return UseEntry->State == TreeEntry::ScatterVectorize ||
                       !doesInTreeUserNeedToExtract(
                           Scalar, UserInst: getRootEntryInstruction(Entry: *UseEntry), TLI,
                           TTI);
              })) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(none_of(UseEntries,
                           [](TreeEntry *UseEntry) {
                             return UseEntry->isGather();
                           }) &&
                   "Bad state");
            continue;
          }
          // The in-tree user still needs the scalar: record the use with a
          // null user (meaning "any user") below.
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        // Give up tracking individual users once there are too many; a null
        // user covers all remaining uses.
        if (U && Scalar->hasNUsesOrMore(N: UsesLimit))
          U = nullptr;
        unsigned FoundLane = Entry->findLaneForValue(V: Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
        ExternalUses.emplace_back(Args&: Scalar, Args&: U, Args&: *Entry, Args&: FoundLane);
        ExternalUsesWithNonUsers.insert(Ptr: Scalar);
        // A null user already covers every use of this scalar - stop
        // scanning its use list.
        if (!U)
          break;
      }
    }
  }
}
9359
// Collects the user stores of the scalars in \p TE, grouped by
// {parent block, stored value type, underlying pointer object}. Each group is
// a candidate for forming a vector store (see canFormVector()).
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  // Key: {parent block, stored value type, underlying pointer object}.
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>, 8>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(Begin: 0, End: TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(Val: V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    if (V->hasNUsesOrMore(N: UsesLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(Val: U);
      // Test whether we can handle the store. V might be a global, which could
      // be used in a different function.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(Ty: SI->getValueOperand()->getType()))
        continue;
      // Skip entry if already vectorized.
      if (isVectorized(V: U))
        continue;

      Value *Ptr =
          getUnderlyingObject(V: SI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(), Ptr}];
      // For now just keep one store per pointer object per lane.
      // TODO: Extend this to support multiple stores per pointer per lane
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        // Only admit stores whose distance to the group's first store is
        // computable; this lets canFormVector() dereference the pointer
        // diff unconditionally later on.
        std::optional<int64_t> Diff = getPointersDiff(
            ElemTyA: SI->getValueOperand()->getType(), PtrA: SI->getPointerOperand(),
            ElemTyB: SI->getValueOperand()->getType(),
            PtrB: StoresVec.front()->getPointerOperand(), DL: *DL, SE&: *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(Elt: SI);
    }
  }
  // Move the collected groups out of the map into a plain vector.
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(RHS&: P.second);
    ++I;
  }
  return Res;
}
9415
9416bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9417 OrdersType &ReorderIndices) const {
9418 // We check whether the stores in StoreVec can form a vector by sorting them
9419 // and checking whether they are consecutive.
9420
9421 // To avoid calling getPointersDiff() while sorting we create a vector of
9422 // pairs {store, offset from first} and sort this instead.
9423 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
9424 StoreInst *S0 = StoresVec[0];
9425 StoreOffsetVec.emplace_back(Args: 0, Args: 0);
9426 Type *S0Ty = S0->getValueOperand()->getType();
9427 Value *S0Ptr = S0->getPointerOperand();
9428 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoresVec.size())) {
9429 StoreInst *SI = StoresVec[Idx];
9430 std::optional<int64_t> Diff =
9431 getPointersDiff(ElemTyA: S0Ty, PtrA: S0Ptr, ElemTyB: SI->getValueOperand()->getType(),
9432 PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
9433 /*StrictCheck=*/true);
9434 StoreOffsetVec.emplace_back(Args&: *Diff, Args&: Idx);
9435 }
9436
9437 // Check if the stores are consecutive by checking if their difference is 1.
9438 if (StoreOffsetVec.size() != StoresVec.size())
9439 return false;
9440 sort(C&: StoreOffsetVec, Comp: llvm::less_first());
9441 unsigned Idx = 0;
9442 int64_t PrevDist = 0;
9443 for (const auto &P : StoreOffsetVec) {
9444 if (Idx > 0 && P.first != PrevDist + 1)
9445 return false;
9446 PrevDist = P.first;
9447 ++Idx;
9448 }
9449
9450 // Calculate the shuffle indices according to their offset against the sorted
9451 // StoreOffsetVec.
9452 ReorderIndices.assign(NumElts: StoresVec.size(), Elt: 0);
9453 bool IsIdentity = true;
9454 for (auto [I, P] : enumerate(First&: StoreOffsetVec)) {
9455 ReorderIndices[P.second] = I;
9456 IsIdentity &= P.second == I;
9457 }
9458 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9459 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9460 // same convention here.
9461 if (IsIdentity)
9462 ReorderIndices.clear();
9463
9464 return true;
9465}
9466
9467#ifndef NDEBUG
9468LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
9469 for (unsigned Idx : Order)
9470 dbgs() << Idx << ", ";
9471 dbgs() << "\n";
9472}
9473#endif
9474
9475SmallVector<BoUpSLP::OrdersType, 1>
9476BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9477 unsigned NumLanes = TE->Scalars.size();
9478
9479 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9480
9481 // Holds the reorder indices for each candidate store vector that is a user of
9482 // the current TreeEntry.
9483 SmallVector<OrdersType, 1> ExternalReorderIndices;
9484
9485 // Now inspect the stores collected per pointer and look for vectorization
9486 // candidates. For each candidate calculate the reorder index vector and push
9487 // it into `ExternalReorderIndices`
9488 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9489 // If we have fewer than NumLanes stores, then we can't form a vector.
9490 if (StoresVec.size() != NumLanes)
9491 continue;
9492
9493 // If the stores are not consecutive then abandon this StoresVec.
9494 OrdersType ReorderIndices;
9495 if (!canFormVector(StoresVec, ReorderIndices))
9496 continue;
9497
9498 // We now know that the scalars in StoresVec can form a vector instruction,
9499 // so set the reorder indices.
9500 ExternalReorderIndices.push_back(Elt: ReorderIndices);
9501 }
9502 return ExternalReorderIndices;
9503}
9504
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  // Drop any state left over from a previously built tree first.
  deleteTree();
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  // Remember the values whose users must be ignored while building the tree
  // (e.g. reduction roots handled by the caller).
  UserIgnoreList = &UserIgnoreLst;
  // Roots of mixed types cannot form a single vector - bail out early.
  if (!allSameType(VL: Roots))
    return;
  buildTreeRec(Roots, Depth: 0, EI: EdgeInfo());
}
9515
// Overload without a user-ignore list: UserIgnoreList keeps whatever
// deleteTree() leaves it as.
void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  // Drop any state left over from a previously built tree first.
  deleteTree();
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  // Roots of mixed types cannot form a single vector - bail out early.
  if (!allSameType(VL: Roots))
    return;
  buildTreeRec(Roots, Depth: 0, EI: EdgeInfo());
}
9524
9525/// Tries to find subvector of loads and builds new vector of only loads if can
9526/// be profitable.
9527static void gatherPossiblyVectorizableLoads(
9528 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9529 ScalarEvolution &SE, const TargetTransformInfo &TTI,
9530 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9531 bool AddNew = true) {
9532 if (VL.empty())
9533 return;
9534 Type *ScalarTy = getValueType(V: VL.front());
9535 if (!isValidElementType(Ty: ScalarTy))
9536 return;
9537 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
9538 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9539 for (Value *V : VL) {
9540 auto *LI = dyn_cast<LoadInst>(Val: V);
9541 if (!LI)
9542 continue;
9543 if (R.isDeleted(I: LI) || R.isVectorized(V: LI) || !LI->isSimple())
9544 continue;
9545 bool IsFound = false;
9546 for (auto [Map, Data] : zip(t&: ClusteredDistToLoad, u&: ClusteredLoads)) {
9547 assert(LI->getParent() == Data.front().first->getParent() &&
9548 LI->getType() == Data.front().first->getType() &&
9549 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9550 getUnderlyingObject(Data.front().first->getPointerOperand(),
9551 RecursionMaxDepth) &&
9552 "Expected loads with the same type, same parent and same "
9553 "underlying pointer.");
9554 std::optional<int64_t> Dist = getPointersDiff(
9555 ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(), ElemTyB: Data.front().first->getType(),
9556 PtrB: Data.front().first->getPointerOperand(), DL, SE,
9557 /*StrictCheck=*/true);
9558 if (!Dist)
9559 continue;
9560 auto It = Map.find(Val: *Dist);
9561 if (It != Map.end() && It->second != LI)
9562 continue;
9563 if (It == Map.end()) {
9564 Data.emplace_back(Args&: LI, Args&: *Dist);
9565 Map.try_emplace(Key: *Dist, Args&: LI);
9566 }
9567 IsFound = true;
9568 break;
9569 }
9570 if (!IsFound) {
9571 ClusteredLoads.emplace_back().emplace_back(Args&: LI, Args: 0);
9572 ClusteredDistToLoad.emplace_back().try_emplace(Key: 0, Args&: LI);
9573 }
9574 }
9575 auto FindMatchingLoads =
9576 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
9577 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
9578 &GatheredLoads,
9579 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9580 int64_t &Offset, unsigned &Start) {
9581 if (Loads.empty())
9582 return GatheredLoads.end();
9583 LoadInst *LI = Loads.front().first;
9584 for (auto [Idx, Data] : enumerate(First&: GatheredLoads)) {
9585 if (Idx < Start)
9586 continue;
9587 ToAdd.clear();
9588 if (LI->getParent() != Data.front().first->getParent() ||
9589 LI->getType() != Data.front().first->getType())
9590 continue;
9591 std::optional<int64_t> Dist =
9592 getPointersDiff(ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(),
9593 ElemTyB: Data.front().first->getType(),
9594 PtrB: Data.front().first->getPointerOperand(), DL, SE,
9595 /*StrictCheck=*/true);
9596 if (!Dist)
9597 continue;
9598 SmallSet<int64_t, 4> DataDists;
9599 SmallPtrSet<LoadInst *, 4> DataLoads;
9600 for (std::pair<LoadInst *, int64_t> P : Data) {
9601 DataDists.insert(V: P.second);
9602 DataLoads.insert(Ptr: P.first);
9603 }
9604 // Found matching gathered loads - check if all loads are unique or
9605 // can be effectively vectorized.
9606 unsigned NumUniques = 0;
9607 for (auto [Cnt, Pair] : enumerate(First&: Loads)) {
9608 bool Used = DataLoads.contains(Ptr: Pair.first);
9609 if (!Used && !DataDists.contains(V: *Dist + Pair.second)) {
9610 ++NumUniques;
9611 ToAdd.insert(X: Cnt);
9612 } else if (Used) {
9613 Repeated.insert(X: Cnt);
9614 }
9615 }
9616 if (NumUniques > 0 &&
9617 (Loads.size() == NumUniques ||
9618 (Loads.size() - NumUniques >= 2 &&
9619 Loads.size() - NumUniques >= Loads.size() / 2 &&
9620 (has_single_bit(Value: Data.size() + NumUniques) ||
9621 bit_ceil(Value: Data.size()) <
9622 bit_ceil(Value: Data.size() + NumUniques))))) {
9623 Offset = *Dist;
9624 Start = Idx + 1;
9625 return std::next(x: GatheredLoads.begin(), n: Idx);
9626 }
9627 }
9628 ToAdd.clear();
9629 return GatheredLoads.end();
9630 };
9631 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9632 unsigned Start = 0;
9633 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9634 int64_t Offset = 0;
9635 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9636 Offset, Start);
9637 while (It != GatheredLoads.end()) {
9638 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9639 for (unsigned Idx : LocalToAdd)
9640 It->emplace_back(Args: Data[Idx].first, Args: Data[Idx].second + Offset);
9641 ToAdd.insert_range(R&: LocalToAdd);
9642 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9643 Start);
9644 }
9645 if (any_of(Range: seq<unsigned>(Size: Data.size()), P: [&](unsigned Idx) {
9646 return !ToAdd.contains(key: Idx) && !Repeated.contains(key: Idx);
9647 })) {
9648 auto AddNewLoads =
9649 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9650 for (unsigned Idx : seq<unsigned>(Size: Data.size())) {
9651 if (ToAdd.contains(key: Idx) || Repeated.contains(key: Idx))
9652 continue;
9653 Loads.push_back(Elt: Data[Idx]);
9654 }
9655 };
9656 if (!AddNew) {
9657 LoadInst *LI = Data.front().first;
9658 It = find_if(
9659 Range&: GatheredLoads, P: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9660 return PD.front().first->getParent() == LI->getParent() &&
9661 PD.front().first->getType() == LI->getType();
9662 });
9663 while (It != GatheredLoads.end()) {
9664 AddNewLoads(*It);
9665 It = std::find_if(
9666 first: std::next(x: It), last: GatheredLoads.end(),
9667 pred: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9668 return PD.front().first->getParent() == LI->getParent() &&
9669 PD.front().first->getType() == LI->getType();
9670 });
9671 }
9672 }
9673 GatheredLoads.emplace_back().append(in_start: Data.begin(), in_end: Data.end());
9674 AddNewLoads(GatheredLoads.emplace_back());
9675 }
9676 }
9677}
9678
9679void BoUpSLP::tryToVectorizeGatheredLoads(
9680 const SmallMapVector<
9681 std::tuple<BasicBlock *, Value *, Type *>,
9682 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9683 &GatheredLoads) {
9684 GatheredLoadsEntriesFirst = VectorizableTree.size();
9685
9686 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9687 LoadEntriesToVectorize.size());
9688 for (auto [Idx, Set] : zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize))
9689 Set.insert_range(R&: VectorizableTree[Idx]->Scalars);
9690
9691 // Sort loads by distance.
9692 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9693 const std::pair<LoadInst *, int64_t> &L2) {
9694 return L1.second > L2.second;
9695 };
9696
9697 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9698 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9699 Loads.size());
9700 Align Alignment = computeCommonAlignment<LoadInst>(VL: Values);
9701 auto *Ty = getWidenedType(ScalarTy: Loads.front()->getType(), VF: Loads.size());
9702 return TTI->isLegalMaskedGather(DataType: Ty, Alignment) &&
9703 !TTI->forceScalarizeMaskedGather(Type: Ty, Alignment);
9704 };
9705
9706 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9707 BoUpSLP::ValueSet &VectorizedLoads,
9708 SmallVectorImpl<LoadInst *> &NonVectorized,
9709 bool Final, unsigned MaxVF) {
9710 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9711 unsigned StartIdx = 0;
9712 SmallVector<int> CandidateVFs;
9713 if (VectorizeNonPowerOf2 && has_single_bit(Value: MaxVF + 1))
9714 CandidateVFs.push_back(Elt: MaxVF);
9715 for (int NumElts = getFloorFullVectorNumberOfElements(
9716 TTI: *TTI, Ty: Loads.front()->getType(), Sz: MaxVF);
9717 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9718 TTI: *TTI, Ty: Loads.front()->getType(), Sz: NumElts - 1)) {
9719 CandidateVFs.push_back(Elt: NumElts);
9720 if (VectorizeNonPowerOf2 && NumElts > 2)
9721 CandidateVFs.push_back(Elt: NumElts - 1);
9722 }
9723
9724 if (Final && CandidateVFs.empty())
9725 return Results;
9726
9727 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9728 for (unsigned NumElts : CandidateVFs) {
9729 if (Final && NumElts > BestVF)
9730 continue;
9731 SmallVector<unsigned> MaskedGatherVectorized;
9732 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9733 ++Cnt) {
9734 ArrayRef<LoadInst *> Slice =
9735 ArrayRef(Loads).slice(N: Cnt, M: std::min(a: NumElts, b: E - Cnt));
9736 if (VectorizedLoads.count(Ptr: Slice.front()) ||
9737 VectorizedLoads.count(Ptr: Slice.back()) ||
9738 areKnownNonVectorizableLoads(VL: Slice))
9739 continue;
9740 // Check if it is profitable to try vectorizing gathered loads. It is
9741 // profitable if we have more than 3 consecutive loads or if we have
9742 // less but all users are vectorized or deleted.
9743 bool AllowToVectorize = false;
9744 // Check if it is profitable to vectorize 2-elements loads.
9745 if (NumElts == 2) {
9746 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9747 ElementTy: Slice.front()->getType(), NumElements: ElementCount::getFixed(MinVal: NumElts));
9748 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9749 for (LoadInst *LI : Slice) {
9750 // If single use/user - allow to vectorize.
9751 if (LI->hasOneUse())
9752 continue;
9753 // 1. Check if number of uses equals number of users.
9754 // 2. All users are deleted.
9755 // 3. The load broadcasts are not allowed or the load is not
9756 // broadcasted.
9757 if (static_cast<unsigned int>(std::distance(
9758 first: LI->user_begin(), last: LI->user_end())) != LI->getNumUses())
9759 return false;
9760 if (!IsLegalBroadcastLoad)
9761 continue;
9762 if (LI->hasNUsesOrMore(N: UsesLimit))
9763 return false;
9764 for (User *U : LI->users()) {
9765 if (auto *UI = dyn_cast<Instruction>(Val: U); UI && isDeleted(I: UI))
9766 continue;
9767 for (const TreeEntry *UTE : getTreeEntries(V: U)) {
9768 for (int I : seq<int>(Size: UTE->getNumOperands())) {
9769 if (all_of(Range: UTE->getOperand(OpIdx: I), P: [LI](Value *V) {
9770 return V == LI || isa<PoisonValue>(Val: V);
9771 }))
9772 // Found legal broadcast - do not vectorize.
9773 return false;
9774 }
9775 }
9776 }
9777 }
9778 return true;
9779 };
9780 AllowToVectorize = CheckIfAllowed(Slice);
9781 } else {
9782 AllowToVectorize =
9783 (NumElts >= 3 ||
9784 any_of(Range&: ValueToGatherNodes.at(Val: Slice.front()),
9785 P: [=](const TreeEntry *TE) {
9786 return TE->Scalars.size() == 2 &&
9787 ((TE->Scalars.front() == Slice.front() &&
9788 TE->Scalars.back() == Slice.back()) ||
9789 (TE->Scalars.front() == Slice.back() &&
9790 TE->Scalars.back() == Slice.front()));
9791 })) &&
9792 hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(),
9793 Sz: Slice.size());
9794 }
9795 if (AllowToVectorize) {
9796 SmallVector<Value *> PointerOps;
9797 OrdersType CurrentOrder;
9798 // Try to build vector load.
9799 ArrayRef<Value *> Values(
9800 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9801 StridedPtrInfo SPtrInfo;
9802 LoadsState LS = canVectorizeLoads(VL: Values, VL0: Slice.front(), Order&: CurrentOrder,
9803 PointerOps, SPtrInfo, BestVF: &BestVF);
9804 if (LS != LoadsState::Gather ||
9805 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9806 if (LS == LoadsState::ScatterVectorize) {
9807 if (MaskedGatherVectorized.empty() ||
9808 Cnt >= MaskedGatherVectorized.back() + NumElts)
9809 MaskedGatherVectorized.push_back(Elt: Cnt);
9810 continue;
9811 }
9812 if (LS != LoadsState::Gather) {
9813 Results.emplace_back(Args&: Values, Args&: LS);
9814 VectorizedLoads.insert_range(R&: Slice);
9815 // If we vectorized initial block, no need to try to vectorize it
9816 // again.
9817 if (Cnt == StartIdx)
9818 StartIdx += NumElts;
9819 }
9820 // Check if the whole array was vectorized already - exit.
9821 if (StartIdx >= Loads.size())
9822 break;
9823 // Erase last masked gather candidate, if another candidate within
9824 // the range is found to be better.
9825 if (!MaskedGatherVectorized.empty() &&
9826 Cnt < MaskedGatherVectorized.back() + NumElts)
9827 MaskedGatherVectorized.pop_back();
9828 Cnt += NumElts - 1;
9829 continue;
9830 }
9831 }
9832 if (!AllowToVectorize || BestVF == 0)
9833 registerNonVectorizableLoads(VL: Slice);
9834 }
9835 // Mark masked gathers candidates as vectorized, if any.
9836 for (unsigned Cnt : MaskedGatherVectorized) {
9837 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9838 N: Cnt, M: std::min<unsigned>(a: NumElts, b: Loads.size() - Cnt));
9839 ArrayRef<Value *> Values(
9840 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9841 Results.emplace_back(Args&: Values, Args: LoadsState::ScatterVectorize);
9842 VectorizedLoads.insert_range(R&: Slice);
9843 // If we vectorized initial block, no need to try to vectorize it again.
9844 if (Cnt == StartIdx)
9845 StartIdx += NumElts;
9846 }
9847 }
9848 for (LoadInst *LI : Loads) {
9849 if (!VectorizedLoads.contains(Ptr: LI))
9850 NonVectorized.push_back(Elt: LI);
9851 }
9852 return Results;
9853 };
9854 auto ProcessGatheredLoads =
9855 [&, &TTI = *TTI](
9856 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9857 bool Final = false) {
9858 SmallVector<LoadInst *> NonVectorized;
9859 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9860 GatheredLoads) {
9861 if (LoadsDists.size() <= 1) {
9862 NonVectorized.push_back(Elt: LoadsDists.back().first);
9863 continue;
9864 }
9865 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9866 LoadsDists);
9867 SmallVector<LoadInst *> OriginalLoads(make_first_range(c&: LoadsDists));
9868 stable_sort(Range&: LocalLoadsDists, C: LoadSorter);
9869 SmallVector<LoadInst *> Loads;
9870 unsigned MaxConsecutiveDistance = 0;
9871 unsigned CurrentConsecutiveDist = 1;
9872 int64_t LastDist = LocalLoadsDists.front().second;
9873 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9874 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9875 if (isVectorized(V: L.first))
9876 continue;
9877 assert(LastDist >= L.second &&
9878 "Expected first distance always not less than second");
9879 if (static_cast<uint64_t>(LastDist - L.second) ==
9880 CurrentConsecutiveDist) {
9881 ++CurrentConsecutiveDist;
9882 MaxConsecutiveDistance =
9883 std::max(a: MaxConsecutiveDistance, b: CurrentConsecutiveDist);
9884 Loads.push_back(Elt: L.first);
9885 continue;
9886 }
9887 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9888 !Loads.empty())
9889 Loads.pop_back();
9890 CurrentConsecutiveDist = 1;
9891 LastDist = L.second;
9892 Loads.push_back(Elt: L.first);
9893 }
9894 if (Loads.size() <= 1)
9895 continue;
9896 if (AllowMaskedGather)
9897 MaxConsecutiveDistance = Loads.size();
9898 else if (MaxConsecutiveDistance < 2)
9899 continue;
9900 BoUpSLP::ValueSet VectorizedLoads;
9901 SmallVector<LoadInst *> SortedNonVectorized;
9902 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9903 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9904 Final, MaxConsecutiveDistance);
9905 if (!Results.empty() && !SortedNonVectorized.empty() &&
9906 OriginalLoads.size() == Loads.size() &&
9907 MaxConsecutiveDistance == Loads.size() &&
9908 all_of(Range&: Results,
9909 P: [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9910 return P.second == LoadsState::ScatterVectorize;
9911 })) {
9912 VectorizedLoads.clear();
9913 SmallVector<LoadInst *> UnsortedNonVectorized;
9914 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9915 UnsortedResults =
9916 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9917 UnsortedNonVectorized, Final,
9918 OriginalLoads.size());
9919 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9920 SortedNonVectorized.swap(RHS&: UnsortedNonVectorized);
9921 Results.swap(RHS&: UnsortedResults);
9922 }
9923 }
9924 for (auto [Slice, _] : Results) {
9925 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9926 << Slice.size() << ")\n");
9927 if (any_of(Range&: Slice, P: [&](Value *V) { return isVectorized(V); })) {
9928 for (Value *L : Slice)
9929 if (!isVectorized(V: L))
9930 SortedNonVectorized.push_back(Elt: cast<LoadInst>(Val: L));
9931 continue;
9932 }
9933
9934 // Select maximum VF as a maximum of user gathered nodes and
9935 // distance between scalar loads in these nodes.
9936 unsigned MaxVF = Slice.size();
9937 unsigned UserMaxVF = 0;
9938 unsigned InterleaveFactor = 0;
9939 if (MaxVF == 2) {
9940 UserMaxVF = MaxVF;
9941 } else {
9942 // Found distance between segments of the interleaved loads.
9943 std::optional<unsigned> InterleavedLoadsDistance = 0;
9944 unsigned Order = 0;
9945 std::optional<unsigned> CommonVF = 0;
9946 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9947 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9948 for (auto [Idx, V] : enumerate(First&: Slice)) {
9949 for (const TreeEntry *E : ValueToGatherNodes.at(Val: V)) {
9950 UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: E->Scalars.size());
9951 unsigned Pos =
9952 EntryToPosition.try_emplace(Key: E, Args&: Idx).first->second;
9953 UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: Idx - Pos + 1);
9954 if (CommonVF) {
9955 if (*CommonVF == 0) {
9956 CommonVF = E->Scalars.size();
9957 continue;
9958 }
9959 if (*CommonVF != E->Scalars.size())
9960 CommonVF.reset();
9961 }
9962 // Check if the load is the part of the interleaved load.
9963 if (Pos != Idx && InterleavedLoadsDistance) {
9964 if (!DeinterleavedNodes.contains(Ptr: E) &&
9965 any_of(Range: E->Scalars, P: [&, Slice = Slice](Value *V) {
9966 if (isa<Constant>(Val: V))
9967 return false;
9968 if (isVectorized(V))
9969 return true;
9970 const auto &Nodes = ValueToGatherNodes.at(Val: V);
9971 return (Nodes.size() != 1 || !Nodes.contains(key: E)) &&
9972 !is_contained(Range: Slice, Element: V);
9973 })) {
9974 InterleavedLoadsDistance.reset();
9975 continue;
9976 }
9977 DeinterleavedNodes.insert(Ptr: E);
9978 if (*InterleavedLoadsDistance == 0) {
9979 InterleavedLoadsDistance = Idx - Pos;
9980 continue;
9981 }
9982 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9983 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9984 InterleavedLoadsDistance.reset();
9985 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(u: 1);
9986 }
9987 }
9988 }
9989 DeinterleavedNodes.clear();
9990 // Check if the large load represents interleaved load operation.
9991 if (InterleavedLoadsDistance.value_or(u: 0) > 1 &&
9992 CommonVF.value_or(u: 0) != 0) {
9993 InterleaveFactor = bit_ceil(Value: *InterleavedLoadsDistance);
9994 unsigned VF = *CommonVF;
9995 OrdersType Order;
9996 SmallVector<Value *> PointerOps;
9997 StridedPtrInfo SPtrInfo;
9998 // Segmented load detected - vectorize at maximum vector factor.
9999 if (InterleaveFactor <= Slice.size() &&
10000 TTI.isLegalInterleavedAccessType(
10001 VTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF),
10002 Factor: InterleaveFactor,
10003 Alignment: cast<LoadInst>(Val: Slice.front())->getAlign(),
10004 AddrSpace: cast<LoadInst>(Val: Slice.front())
10005 ->getPointerAddressSpace()) &&
10006 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps,
10007 SPtrInfo) == LoadsState::Vectorize) {
10008 UserMaxVF = InterleaveFactor * VF;
10009 } else {
10010 InterleaveFactor = 0;
10011 }
10012 }
10013 // Cannot represent the loads as consecutive vectorizable nodes -
10014 // just exit.
10015 unsigned ConsecutiveNodesSize = 0;
10016 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
10017 any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize),
10018 P: [&, Slice = Slice](const auto &P) {
10019 const auto *It = find_if(Slice, [&](Value *V) {
10020 return std::get<1>(P).contains(V);
10021 });
10022 if (It == Slice.end())
10023 return false;
10024 const TreeEntry &TE =
10025 *VectorizableTree[std::get<0>(P)];
10026 ArrayRef<Value *> VL = TE.Scalars;
10027 OrdersType Order;
10028 SmallVector<Value *> PointerOps;
10029 StridedPtrInfo SPtrInfo;
10030 LoadsState State = canVectorizeLoads(
10031 VL, VL0: VL.front(), Order, PointerOps, SPtrInfo);
10032 if (State == LoadsState::ScatterVectorize ||
10033 State == LoadsState::CompressVectorize)
10034 return false;
10035 ConsecutiveNodesSize += VL.size();
10036 size_t Start = std::distance(Slice.begin(), It);
10037 size_t Sz = Slice.size() - Start;
10038 return Sz < VL.size() ||
10039 Slice.slice(N: Start, M: VL.size()) != VL;
10040 }))
10041 continue;
10042 // Try to build long masked gather loads.
10043 UserMaxVF = bit_ceil(Value: UserMaxVF);
10044 if (InterleaveFactor == 0 &&
10045 any_of(Range: seq<unsigned>(Size: Slice.size() / UserMaxVF),
10046 P: [&, Slice = Slice](unsigned Idx) {
10047 OrdersType Order;
10048 SmallVector<Value *> PointerOps;
10049 StridedPtrInfo SPtrInfo;
10050 return canVectorizeLoads(
10051 VL: Slice.slice(N: Idx * UserMaxVF, M: UserMaxVF),
10052 VL0: Slice[Idx * UserMaxVF], Order, PointerOps,
10053 SPtrInfo) == LoadsState::ScatterVectorize;
10054 }))
10055 UserMaxVF = MaxVF;
10056 if (Slice.size() != ConsecutiveNodesSize)
10057 MaxVF = std::min<unsigned>(a: MaxVF, b: UserMaxVF);
10058 }
10059 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
10060 bool IsVectorized = true;
10061 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
10062 ArrayRef<Value *> SubSlice =
10063 Slice.slice(N: I, M: std::min(a: VF, b: E - I));
10064 if (isVectorized(V: SubSlice.front()))
10065 continue;
10066 // Check if the subslice is to be-vectorized entry, which is not
10067 // equal to entry.
10068 if (any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize),
10069 P: [&](const auto &P) {
10070 return !SubSlice.equals(
10071 RHS: VectorizableTree[std::get<0>(P)]
10072 ->Scalars) &&
10073 set_is_subset(SubSlice, std::get<1>(P));
10074 }))
10075 continue;
10076 unsigned Sz = VectorizableTree.size();
10077 buildTreeRec(Roots: SubSlice, Depth: 0, EI: EdgeInfo(), InterleaveFactor);
10078 if (Sz == VectorizableTree.size()) {
10079 IsVectorized = false;
10080 // Try non-interleaved vectorization with smaller vector
10081 // factor.
10082 if (InterleaveFactor > 0) {
10083 VF = 2 * (MaxVF / InterleaveFactor);
10084 InterleaveFactor = 0;
10085 }
10086 continue;
10087 }
10088 }
10089 if (IsVectorized)
10090 break;
10091 }
10092 }
10093 NonVectorized.append(RHS: SortedNonVectorized);
10094 }
10095 return NonVectorized;
10096 };
10097 for (const auto &GLs : GatheredLoads) {
10098 const auto &Ref = GLs.second;
10099 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
10100 if (!Ref.empty() && !NonVectorized.empty() &&
10101 std::accumulate(
10102 first: Ref.begin(), last: Ref.end(), init: 0u,
10103 binary_op: [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10104 -> unsigned { return S + LoadsDists.size(); }) !=
10105 NonVectorized.size() &&
10106 IsMaskedGatherSupported(NonVectorized)) {
10107 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
10108 FinalGatheredLoads;
10109 for (LoadInst *LI : NonVectorized) {
10110 // Reinsert non-vectorized loads to other list of loads with the same
10111 // base pointers.
10112 gatherPossiblyVectorizableLoads(R: *this, VL: LI, DL: *DL, SE&: *SE, TTI: *TTI,
10113 GatheredLoads&: FinalGatheredLoads,
10114 /*AddNew=*/false);
10115 }
10116 // Final attempt to vectorize non-vectorized loads.
10117 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
10118 }
10119 }
10120 // Try to vectorize postponed load entries, previously marked as gathered.
10121 for (unsigned Idx : LoadEntriesToVectorize) {
10122 const TreeEntry &E = *VectorizableTree[Idx];
10123 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
10124 // Avoid reordering, if possible.
10125 if (!E.ReorderIndices.empty()) {
10126 // Build a mask out of the reorder indices and reorder scalars per this
10127 // mask.
10128 SmallVector<int> ReorderMask;
10129 inversePermutation(Indices: E.ReorderIndices, Mask&: ReorderMask);
10130 reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
10131 }
10132 buildTreeRec(Roots: GatheredScalars, Depth: 0, EI: EdgeInfo());
10133 }
10134 // If no new entries created, consider it as no gathered loads entries must be
10135 // handled.
10136 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10137 VectorizableTree.size())
10138 GatheredLoadsEntriesFirst.reset();
10139}
10140
10141/// Generates key/subkey pair for the given value to provide effective sorting
10142/// of the values and better detection of the vectorizable values sequences. The
10143/// keys/subkeys can be used for better sorting of the values themselves (keys)
10144/// and in values subgroups (subkeys).
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  // Base key derived from the value kind; refined per instruction class below.
  hash_code Key = hash_value(value: V->getValueID() + 2);
  hash_code SubKey = hash_value(value: 0);
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(Val: V)) {
    Key = hash_combine(args: LI->getType(), args: hash_value(value: Instruction::Load), args: Key);
    if (LI->isSimple())
      SubKey = hash_value(code: LoadsSubkeyGenerator(Key, LI));
    else
      // Non-simple (volatile/atomic) loads get a per-instruction key so they
      // are never grouped with other loads.
      Key = SubKey = hash_value(ptr: LI);
  } else if (isVectorLikeInstWithConstOps(V)) {
    // Sort extracts by the vector operands.
    if (isa<ExtractElementInst, UndefValue>(Val: V))
      Key = hash_value(value: Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(Val: V)) {
      if (!isUndefVector(V: EI->getVectorOperand()).all() &&
          !isa<UndefValue>(Val: EI->getIndexOperand()))
        SubKey = hash_value(ptr: EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(Val: V)) {
    // Sort other instructions just by the opcodes except for CMPInst.
    // For CMP also sort by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(Val: I)) &&
        isValidForAlternation(Opcode: I->getOpcode())) {
      if (AllowAlternate)
        // Collapse the key so all binary ops (resp. all casts) can be grouped
        // into alternate-opcode sequences.
        Key = hash_value(value: isa<BinaryOperator>(Val: I) ? 1 : 0);
      else
        Key = hash_combine(args: hash_value(value: I->getOpcode()), args: Key);
      SubKey = hash_combine(
          args: hash_value(value: I->getOpcode()), args: hash_value(ptr: I->getType()),
          args: hash_value(ptr: isa<BinaryOperator>(Val: I)
                          ? I->getType()
                          : cast<CastInst>(Val: I)->getOperand(i_nocapture: 0)->getType()));
      // For casts, look through the only operand to improve compile time.
      if (isa<CastInst>(Val: I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(V: I->getOperand(i: 0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(args: OpVals.first, args: Key);
        SubKey = hash_combine(args: OpVals.first, args: SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(Val: I)) {
      // Canonicalize the predicate (commutative cmps fold with their inverse)
      // and include the swapped form so swapped-operand cmps hash together.
      CmpInst::Predicate Pred = CI->getPredicate();
      if (CI->isCommutative())
        Pred = std::min(a: Pred, b: CmpInst::getInversePredicate(pred: Pred));
      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(pred: Pred);
      SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: Pred),
                            args: hash_value(value: SwapPred),
                            args: hash_value(ptr: CI->getOperand(i_nocapture: 0)->getType()));
    } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI);
      if (isTriviallyVectorizable(ID)) {
        SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: ID));
      } else if (!VFDatabase(*Call).getMappings(CI: *Call).empty()) {
        // Calls with a vector-library mapping are grouped by callee.
        SubKey = hash_combine(args: hash_value(value: I->getOpcode()),
                              args: hash_value(ptr: Call->getCalledFunction()));
      } else {
        // No vector form known: keep the call unique.
        Key = hash_combine(args: hash_value(ptr: Call), args: Key);
        SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(ptr: Call));
      }
      for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
        SubKey = hash_combine(args: hash_value(value: Op.Begin), args: hash_value(value: Op.End),
                              args: hash_value(ptr: Op.Tag), args: SubKey);
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
      // Simple GEPs with a constant index are grouped by base pointer.
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Val: Gep->getOperand(i_nocapture: 1)))
        SubKey = hash_value(ptr: Gep->getPointerOperand());
      else
        SubKey = hash_value(ptr: Gep);
    } else if (BinaryOperator::isIntDivRem(Opcode: I->getOpcode()) &&
               !isa<ConstantInt>(Val: I->getOperand(i: 1))) {
      // Do not try to vectorize instructions with potentially high cost.
      SubKey = hash_value(ptr: I);
    } else {
      SubKey = hash_value(value: I->getOpcode());
    }
    // Instructions from different blocks never share a key.
    Key = hash_combine(args: hash_value(ptr: I->getParent()), args: Key);
  }
  return std::make_pair(x&: Key, y&: SubKey);
}
10227
/// Checks if the specified instruction \p I is a main operation for the given
/// \p MainOp and \p AltOp instructions.
10230static bool isMainInstruction(Instruction *I, Instruction *MainOp,
10231 Instruction *AltOp, const TargetLibraryInfo &TLI);
10232
10233/// Builds the arguments types vector for the given call instruction with the
10234/// given \p ID for the specified vector factor.
10235static SmallVector<Type *>
10236buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
10237 const unsigned VF, unsigned MinBW,
10238 const TargetTransformInfo *TTI) {
10239 SmallVector<Type *> ArgTys;
10240 for (auto [Idx, Arg] : enumerate(First: CI->args())) {
10241 if (ID != Intrinsic::not_intrinsic) {
10242 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI)) {
10243 ArgTys.push_back(Elt: Arg->getType());
10244 continue;
10245 }
10246 if (MinBW > 0) {
10247 ArgTys.push_back(
10248 Elt: getWidenedType(ScalarTy: IntegerType::get(C&: CI->getContext(), NumBits: MinBW), VF));
10249 continue;
10250 }
10251 }
10252 ArgTys.push_back(Elt: getWidenedType(ScalarTy: Arg->getType(), VF));
10253 }
10254 return ArgTys;
10255}
10256
10257/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
10258/// function (if possible) calls. Returns invalid cost for the corresponding
10259/// calls, if they cannot be vectorized/will be scalarized.
10260static std::pair<InstructionCost, InstructionCost>
10261getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
10262 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10263 ArrayRef<Type *> ArgTys) {
10264 auto Shape = VFShape::get(FTy: CI->getFunctionType(),
10265 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
10266 HasGlobalPred: false /*HasGlobalPred*/);
10267 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10268 auto LibCost = InstructionCost::getInvalid();
10269 if (!CI->isNoBuiltin() && VecFunc) {
10270 // Calculate the cost of the vector library call.
10271 // If the corresponding vector call is cheaper, return its cost.
10272 LibCost =
10273 TTI->getCallInstrCost(F: nullptr, RetTy: VecTy, Tys: ArgTys, CostKind: TTI::TCK_RecipThroughput);
10274 }
10275 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10276
10277 // Calculate the cost of the vector intrinsic call.
10278 FastMathFlags FMF;
10279 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: CI))
10280 FMF = FPCI->getFastMathFlags();
10281 const InstructionCost ScalarLimit = 10000;
10282 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10283 LibCost.isValid() ? LibCost : ScalarLimit);
10284 auto IntrinsicCost =
10285 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind: TTI::TCK_RecipThroughput);
10286 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10287 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10288 IntrinsicCost = InstructionCost::getInvalid();
10289
10290 return {IntrinsicCost, LibCost};
10291}
10292
/// Classifies the bundle \p VL (whose common/alternate opcode info is \p S)
/// into the TreeEntry state used when building the SLP graph: a concrete
/// vectorization strategy, or NeedToGather when the bundle cannot be
/// vectorized as-is. May also populate \p CurrentOrder, \p PointerOps and
/// \p SPtrInfo for memory operations, and record delayed load entries.
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    const InstructionsState &S, ArrayRef<Value *> VL,
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
  assert(S.getMainOp() &&
         "Expected instructions with same/alternate opcodes only.");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  Instruction *VL0 = S.getMainOp();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
    if (VL0->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(Val: V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Val: Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractElement:
    // Gather if any source vector is already vectorized (or the element is
    // not an extractelement at all).
    if (any_of(Range&: VL, P: [&](Value *V) {
          auto *EI = dyn_cast<ExtractElementInst>(Val: V);
          if (!EI)
            return true;
          return isVectorized(V: EI->getOperand(i_nocapture: 0));
        }))
      return TreeEntry::NeedToGather;
    [[fallthrough]];
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
    // non-full registers).
    if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: VL0->getType(), Sz: VL.size()))
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      if (isa<PoisonValue>(Val: V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
      }
      SourceVectors.insert(Ptr: cast<Instruction>(Val: V)->getOperand(i: 0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }

    if (count_if(Range&: VL, P: [&SourceVectors](Value *V) {
          return !SourceVectors.contains(Ptr: V);
        }) >= 2) {
      // Found 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }

    if (any_of(Range&: VL, P: [&SourceVectors](Value *V) {
          // The last InsertElement can have multiple uses.
          return SourceVectors.contains(Ptr: V) && !V->hasOneUse();
        })) {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    // True when every load in the bundle already belongs to one of the
    // "gathered loads" tree entries created earlier.
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
        return false;
      return all_of(Range&: VL, P: [&](Value *V) {
        if (isa<PoisonValue>(Val: V))
          return true;
        return any_of(Range: getTreeEntries(V), P: [&](const TreeEntry *TE) {
          return TE->Idx >= *GatheredLoadsEntriesFirst;
        });
      });
    };
    switch (canVectorizeLoads(VL, VL0, Order&: CurrentOrder, PointerOps, SPtrInfo)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::CompressVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(X: VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
    case LoadsState::ScatterVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(X: VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(X: VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
    case LoadsState::Gather:
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL, [](Value *V) {
                 auto *LI = dyn_cast<LoadInst>(V);
                 return !LI || !LI->isSimple();
               }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      registerNonVectorizableLoads(VL);
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // All casts in the bundle must share the same (valid) source type.
    Type *SrcTy = VL0->getOperand(i: 0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(Val: V))
        continue;
      Type *Ty = cast<Instruction>(Val: V)->getOperand(i: 0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    // The swapped predicate is also acceptable (operands can be reordered).
    CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(pred: P0);
    Type *ComparedTy = VL0->getOperand(i: 0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(Val: V))
        continue;
      auto *Cmp = cast<CmpInst>(Val: V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(i_nocapture: 0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
    if (SLPReVec) {
      // Under REVEC, all selects must agree on the condition type.
      SmallPtrSet<Type *, 4> CondTypes;
      for (Value *V : VL) {
        Value *Cond;
        if (!match(V, P: m_Select(C: m_Value(V&: Cond), L: m_Value(), R: m_Value())) &&
            !match(V, P: m_ZExt(Op: m_Value(V&: Cond))))
          continue;
        CondTypes.insert(Ptr: Cond->getType());
      }
      if (CondTypes.size() > 1) {
        LLVM_DEBUG(
            dbgs()
            << "SLP: Gathering select with different condition types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    [[fallthrough]];
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    // On targets where FP vectorization may change semantics, require fast
    // flags on all FP binary ops in the bundle.
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) {
          auto *I = dyn_cast<Instruction>(Val: V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(Val: V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(Val: VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(Val: V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We don't combine GEPs with non-constant indexes.
    Type *Ty1 = VL0->getOperand(i: 1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(Val: V);
      if (!I)
        continue;
      auto *Op = I->getOperand(i_nocapture: 1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    AS: V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(Val: VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(Ty: ScalarTy) !=
        DL->getTypeAllocSizeInBits(Ty: ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(Val: V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(Elt: SI->getPointerOperand());
    }

    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int64_t> Dist =
          getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
      // Check that the sorted pointer operands are consecutive.
      // NOTE(review): Dist is dereferenced without a has_value() check;
      // sortPtrAccesses succeeding presumably guarantees a computable
      // pointer difference here -- confirm.
      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }

    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) {
          auto *I = dyn_cast<Instruction>(Val: V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(Val: VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    VFShape Shape = VFShape::get(
        FTy: CI->getFunctionType(),
        EC: ElementCount::getFixed(MinVal: static_cast<unsigned int>(VL.size())),
        HasGlobalPred: false /*HasGlobalPred*/);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    // Remember the scalar arguments of the main call; all calls must match.
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI))
        ScalarArgs[J] = CI->getArgOperand(i: J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(Val: V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI: CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(Other: *CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and should be same in order for
      // them to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI)) {
          Value *A1J = CI2->getArgOperand(i: J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(first1: CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      last1: CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      first2: CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    // Finally, require a valid vector cost for either the intrinsic or the
    // library-call form.
    SmallVector<Type *> ArgTys =
        buildIntrinsicArgTypes(CI, ID, VF: VL.size(), MinBW: 0, TTI);
    auto *VecTy = getWidenedType(ScalarTy: S.getMainOp()->getType(), VF: VL.size());
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;

    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non alternate shuffle.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcode like add-sub
      // then do not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
10710
namespace {
/// Allows to correctly handle operands of the phi nodes based on the \p Main
/// PHINode order of incoming basic blocks/values.
class PHIHandler {
  // Used to detect (and poison-fill) unreachable incoming blocks.
  DominatorTree &DT;
  // Reference phi whose incoming-block order defines the operand layout.
  PHINode *Main = nullptr;
  // The bundle of phis (or poison values) being handled.
  SmallVector<Value *> Phis;
  // Operands[I][Idx] = incoming value of Phis[Idx] for Main's I-th block.
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  /// Fills Operands with the incoming values of every phi in the bundle,
  /// aligned to Main's incoming-block order. Uses a direct per-block lookup
  /// for small phis and a grouped-by-block scheme for larger ones.
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(Begin: 0, End: Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(i: I);
        if (!DT.isReachableFromEntry(A: InBB)) {
          // Unreachable incoming block: all values are poison.
          Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
          continue;
        }
        // Prepare the operand vector.
        for (auto [Idx, V] : enumerate(First&: Phis)) {
          auto *P = dyn_cast<PHINode>(Val: V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          // Fast path when this phi lists the blocks in the same order as
          // Main; otherwise look the value up by block.
          if (P->getIncomingBlock(i: I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(i: I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(BB: InBB);
        }
      }
      return;
    }
    // Slow path: group Main's incoming indices by (reachable) block, so each
    // phi's values can be placed by block without quadratic lookups.
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
        Blocks;
    for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(i: I);
      if (!DT.isReachableFromEntry(A: InBB)) {
        Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
        continue;
      }
      Blocks.try_emplace(Key: InBB).first->second.push_back(Elt: I);
    }
    for (auto [Idx, V] : enumerate(First&: Phis)) {
      if (isa<PoisonValue>(Val: V)) {
        for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues()))
          Operands[I][Idx] = V;
        continue;
      }
      auto *P = cast<PHINode>(Val: V);
      for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(i: I);
        if (InBB == Main->getIncomingBlock(i: I)) {
          // Do not overwrite the poison fill of an unreachable block.
          if (isa_and_nonnull<PoisonValue>(Val: Operands[I][Idx]))
            continue;
          Operands[I][Idx] = P->getIncomingValue(i: I);
          continue;
        }
        auto *It = Blocks.find(Key: InBB);
        if (It == Blocks.end())
          continue;
        // Store under the first index for this block; duplicated below.
        Operands[It->second.front()][Idx] = P->getIncomingValue(i: I);
      }
    }
    // Replicate the representative operand row to every other index that
    // shares the same incoming block.
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
        continue;
      unsigned BasicI = IncomingValues.consume_front();
      for (unsigned I : IncomingValues) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  /// Returns the bundle's operand row for Main's \p I-th incoming block.
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
} // namespace
10802
/// Returns main/alternate instructions for the given \p VL. Unlike
/// getSameOpcode, this supports non-compatible instructions for better
/// SplitVectorize node support.
/// \returns the first main/alt instructions, if the list contains only poisons
/// and instructions with only 2 opcodes. Returns a pair of nullptrs otherwise.
10808static std::pair<Instruction *, Instruction *>
10809getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10810 Instruction *MainOp = nullptr;
10811 Instruction *AltOp = nullptr;
10812 for (Value *V : VL) {
10813 if (isa<PoisonValue>(Val: V))
10814 continue;
10815 auto *I = dyn_cast<Instruction>(Val: V);
10816 if (!I)
10817 return {};
10818 if (!MainOp) {
10819 MainOp = I;
10820 continue;
10821 }
10822 if (MainOp->getOpcode() == I->getOpcode()) {
10823 if (I->getParent() != MainOp->getParent())
10824 return {};
10825 continue;
10826 }
10827 if (!AltOp) {
10828 AltOp = I;
10829 continue;
10830 }
10831 if (AltOp->getOpcode() == I->getOpcode()) {
10832 if (I->getParent() != AltOp->getParent())
10833 return {};
10834 continue;
10835 }
10836 return {};
10837 }
10838 if (!AltOp)
10839 return {};
10840 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10841 "Expected different main and alt instructions.");
10842 return std::make_pair(x&: MainOp, y&: AltOp);
10843}
10844
10845/// Checks that every instruction appears once in the list and if not, packs
10846/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10847/// unique scalars is extended by poison values to the whole register size.
10848///
10849/// \returns false if \p VL could not be uniquified, in which case \p VL is
10850/// unchanged and \p ReuseShuffleIndices is empty.
static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
                                SmallVectorImpl<int> &ReuseShuffleIndices,
                                const TargetTransformInfo &TTI,
                                const TargetLibraryInfo &TLI,
                                const InstructionsState &S,
                                const BoUpSLP::EdgeInfo &UserTreeIdx,
                                bool TryPad = false) {
  // Check that every instruction appears once in this bundle.
  SmallVector<Value *> UniqueValues;
  SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
  for (Value *V : VL) {
    if (isConstant(V)) {
      // Constants are always considered distinct, even if the same constant
      // appears multiple times in VL.
      ReuseShuffleIndices.emplace_back(
          Args: isa<PoisonValue>(Val: V) ? PoisonMaskElem : UniqueValues.size());
      UniqueValues.emplace_back(Args&: V);
      continue;
    }
    // Non-constants are deduplicated: repeated values map to the index of
    // their first occurrence via the reuse-shuffle mask.
    auto Res = UniquePositions.try_emplace(Key: V, Args: UniqueValues.size());
    ReuseShuffleIndices.emplace_back(Args&: Res.first->second);
    if (Res.second)
      UniqueValues.emplace_back(Args&: V);
  }

  // Easy case: VL has unique values and a "natural" size
  size_t NumUniqueScalarValues = UniqueValues.size();
  bool IsFullVectors = hasFullVectorsOrPowerOf2(
      TTI, Ty: getValueType(V: UniqueValues.front()), Sz: NumUniqueScalarValues);
  if (NumUniqueScalarValues == VL.size() &&
      (VectorizeNonPowerOf2 || IsFullVectors)) {
    ReuseShuffleIndices.clear();
    return true;
  }

  // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
  if ((UserTreeIdx.UserTE &&
       UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
      !hasFullVectorsOrPowerOf2(TTI, Ty: getValueType(V: VL.front()), Sz: VL.size())) {
    LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                         "for nodes with padding.\n");
    ReuseShuffleIndices.clear();
    return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
  // Reject degenerate cases: a single unique scalar, a non-full vector of
  // uniques, or a splat of a single non-constant value.
  if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
      (UniquePositions.size() == 1 && all_of(Range&: UniqueValues, P: [](Value *V) {
         return isa<UndefValue>(Val: V) || !isConstant(V);
       }))) {
    if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
        S.getMainOp()->isSafeToRemove() &&
        (S.areInstructionsWithCopyableElements() ||
         all_of(Range&: UniqueValues, P: IsaPred<Instruction, PoisonValue>))) {
      // Find the number of elements, which forms full vectors.
      unsigned PWSz = getFullVectorNumberOfElements(
          TTI, Ty: UniqueValues.front()->getType(), Sz: UniqueValues.size());
      PWSz = std::min<unsigned>(a: PWSz, b: VL.size());
      if (PWSz == VL.size()) {
        // We ended up with the same size after removing duplicates and
        // upgrading the resulting vector size to a "nice size". Just keep
        // the initial VL then.
        ReuseShuffleIndices.clear();
      } else {
        // Pad unique values with poison to grow the vector to a "nice" size
        SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
                                                UniqueValues.end());
        PaddedUniqueValues.append(
            NumInputs: PWSz - UniqueValues.size(),
            Elt: PoisonValue::get(T: UniqueValues.front()->getType()));
        // Check that extended with poisons/copyable operations are still valid
        // for vectorization (div/rem are not allowed).
        if ((!S.areInstructionsWithCopyableElements() &&
             !getSameOpcode(VL: PaddedUniqueValues, TLI).valid()) ||
            (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
             (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
              isa<CallInst>(Val: S.getMainOp())))) {
          LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
          ReuseShuffleIndices.clear();
          return false;
        }
        VL = std::move(PaddedUniqueValues);
      }
      return true;
    }
    LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
    ReuseShuffleIndices.clear();
    return false;
  }
  // General case: vectorize the deduplicated scalars and reuse-shuffle the
  // result back into the original lane positions.
  VL = std::move(UniqueValues);
  return true;
}
10943
/// Checks whether \p VL can be vectorized as a "split" node, i.e. two
/// independent vector operations (main opcode scalars in \p Op1, alternate
/// opcode scalars in \p Op2) combined via subvector insertion/shuffle, rather
/// than a single alternate-opcode node. On success fills \p Op1/\p Op2 and
/// \p ReorderIndices with the permutation restoring the original lane order
/// (left empty if it is the identity).
bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
                                const InstructionsState &LocalState,
                                SmallVectorImpl<Value *> &Op1,
                                SmallVectorImpl<Value *> &Op2,
                                OrdersType &ReorderIndices) const {
  // Splitting very small bundles is never profitable.
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
      !SplitAlternateInstructions)
    return false;

  // Check if this is a duplicate of another split entry.
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
                    << ".\n");
  for (TreeEntry *E : getSplitTreeEntries(V: LocalState.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
                        << *LocalState.getMainOp() << ".\n");
      return false;
    }
    // A full overlap with an existing entry means the scalars are already
    // covered; building another split node would be redundant.
    SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
    if (all_of(Range&: VL, P: [&](Value *V) {
          return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V);
        })) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
      return false;
    }
  }

  // Partition the lanes: main-opcode values (and non-instructions) go to Op1,
  // alternate-opcode values to Op2. Op1Indices remembers which lanes went to
  // Op1 so the restoring permutation can be built below.
  ReorderIndices.assign(NumElts: VL.size(), Elt: VL.size());
  SmallBitVector Op1Indices(VL.size());
  for (auto [Idx, V] : enumerate(First&: VL)) {
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I) {
      Op1.push_back(Elt: V);
      Op1Indices.set(Idx);
      continue;
    }
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
         isMainInstruction(I, MainOp: LocalState.getMainOp(), AltOp: LocalState.getAltOp(),
                           TLI: *TLI)) ||
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
         !isAlternateInstruction(I, MainOp: LocalState.getMainOp(),
                                 AltOp: LocalState.getAltOp(), TLI: *TLI))) {
      Op1.push_back(Elt: V);
      Op1Indices.set(Idx);
      continue;
    }
    Op2.push_back(Elt: V);
  }
  Type *ScalarTy = getValueType(V: VL.front());
  VectorType *VecTy = getWidenedType(ScalarTy, VF: VL.size());
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // Enable split node, only if all nodes do not form legal alternate
  // instruction (like X86 addsub).
  SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
  SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
      !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op1.front()->getType(), Sz: Op1.size()) ||
      !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op2.front()->getType(), Sz: Op2.size()))
    return false;
  // Enable split node, only if all nodes are power-of-2/full registers.
  // Build ReorderIndices so that all Op1 lanes come first, then Op2 lanes.
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
  for (unsigned Idx : seq<unsigned>(Size: VL.size())) {
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ++Op1Cnt;
    } else {
      ReorderIndices[Op2Cnt] = Idx;
      ++Op2Cnt;
    }
  }
  if (isIdentityOrder(Order: ReorderIndices))
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
    inversePermutation(Indices: ReorderIndices, Mask);
  unsigned NumParts = TTI->getNumberOfParts(Tp: VecTy);
  VectorType *Op1VecTy = getWidenedType(ScalarTy, VF: Op1.size());
  VectorType *Op2VecTy = getWidenedType(ScalarTy, VF: Op2.size());
  // Check non-profitable single register ops, which better to be represented
  // as alternate ops.
  if (NumParts >= VL.size())
    return false;
  constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
  InstructionCost InsertCost = ::getShuffleCost(
      TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind: Kind, Index: Op1.size(), SubTp: Op2VecTy);
  FixedVectorType *SubVecTy =
      getWidenedType(ScalarTy, VF: std::max(a: Op1.size(), b: Op2.size()));
  InstructionCost NewShuffleCost =
      ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: SubVecTy, Mask, CostKind: Kind);
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
    return false;
  // For binary/cast/unary alternates, compare the cost of the split form
  // (two narrow ops + insert/shuffle) against the original alternate form
  // (two wide ops + a two-source blend).
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
    InstructionCost OriginalVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: VecTy, CostKind: Kind) +
        TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: VecTy, CostKind: Kind);
    SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
    for (unsigned Idx : seq<unsigned>(Size: VL.size())) {
      if (isa<PoisonValue>(Val: VL[Idx]))
        continue;
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
    }
    InstructionCost OriginalCost =
        OriginalVecOpsCost + ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc,
                                              Tp: VecTy, Mask: OriginalMask, CostKind: Kind);
    InstructionCost NewVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: Op1VecTy, CostKind: Kind) +
        TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: Op2VecTy, CostKind: Kind);
    InstructionCost NewCost =
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Store
             ? NewShuffleCost
             : 0);
    // If not profitable to split - exit.
    if (NewCost >= OriginalCost)
      return false;
  }
  return true;
}
11074
11075namespace {
/// Class accepts an incoming list of values, checks if it is able to model
/// "copyable" values as compatible operations, and generates the list of
/// values for scheduling and the list of operands for the new nodes.
11079class InstructionsCompatibilityAnalysis {
11080 DominatorTree &DT;
11081 const DataLayout &DL;
11082 const TargetTransformInfo &TTI;
11083 const TargetLibraryInfo &TLI;
11084 unsigned MainOpcode = 0;
11085 Instruction *MainOp = nullptr;
11086
11087 /// Checks if the opcode is supported as the main opcode for copyable
11088 /// elements.
11089 static bool isSupportedOpcode(const unsigned Opcode) {
11090 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11091 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11092 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11093 Opcode == Instruction::And || Opcode == Instruction::Or ||
11094 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11095 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11096 Opcode == Instruction::FDiv;
11097 }
11098
11099 /// Identifies the best candidate value, which represents main opcode
11100 /// operation.
11101 /// Currently the best candidate is the Add instruction with the parent
11102 /// block with the highest DFS incoming number (block, that dominates other).
11103 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
11104 BasicBlock *Parent = nullptr;
11105 // Checks if the instruction has supported opcode.
11106 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
11107 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(Val: I)))
11108 return false;
11109 return I && isSupportedOpcode(Opcode: I->getOpcode()) &&
11110 (!doesNotNeedToBeScheduled(V: I) || !R.isVectorized(V: I));
11111 };
11112 // Exclude operands instructions immediately to improve compile time, it
11113 // will be unable to schedule anyway.
11114 SmallDenseSet<Value *, 8> Operands;
11115 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11116 bool AnyUndef = false;
11117 for (Value *V : VL) {
11118 auto *I = dyn_cast<Instruction>(Val: V);
11119 if (!I) {
11120 AnyUndef |= isa<UndefValue>(Val: V);
11121 continue;
11122 }
11123 if (!DT.isReachableFromEntry(A: I->getParent()))
11124 continue;
11125 if (Candidates.empty()) {
11126 Candidates.try_emplace(Key: I->getOpcode()).first->second.push_back(Elt: I);
11127 Parent = I->getParent();
11128 Operands.insert(I: I->op_begin(), E: I->op_end());
11129 continue;
11130 }
11131 if (Parent == I->getParent()) {
11132 Candidates.try_emplace(Key: I->getOpcode()).first->second.push_back(Elt: I);
11133 Operands.insert(I: I->op_begin(), E: I->op_end());
11134 continue;
11135 }
11136 auto *NodeA = DT.getNode(BB: Parent);
11137 auto *NodeB = DT.getNode(BB: I->getParent());
11138 assert(NodeA && "Should only process reachable instructions");
11139 assert(NodeB && "Should only process reachable instructions");
11140 assert((NodeA == NodeB) ==
11141 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11142 "Different nodes should have different DFS numbers");
11143 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11144 Candidates.clear();
11145 Candidates.try_emplace(Key: I->getOpcode()).first->second.push_back(Elt: I);
11146 Parent = I->getParent();
11147 Operands.clear();
11148 Operands.insert(I: I->op_begin(), E: I->op_end());
11149 }
11150 }
11151 unsigned BestOpcodeNum = 0;
11152 MainOp = nullptr;
11153 bool UsedOutside = false;
11154 for (const auto &P : Candidates) {
11155 bool PUsedOutside = all_of(Range: P.second, P: isUsedOutsideBlock);
11156 if (UsedOutside && !PUsedOutside)
11157 continue;
11158 if (!UsedOutside && PUsedOutside)
11159 BestOpcodeNum = 0;
11160 if (P.second.size() < BestOpcodeNum)
11161 continue;
11162 // If have inner dependencies - skip.
11163 if (!PUsedOutside && any_of(Range: P.second, P: [&](Instruction *I) {
11164 return Operands.contains(V: I);
11165 }))
11166 continue;
11167 UsedOutside = PUsedOutside;
11168 for (Instruction *I : P.second) {
11169 if (IsSupportedInstruction(I, AnyUndef)) {
11170 MainOp = I;
11171 BestOpcodeNum = P.second.size();
11172 break;
11173 }
11174 }
11175 }
11176 if (MainOp) {
11177 // Do not match, if any copyable is a terminator from the same block as
11178 // the main operation.
11179 if (any_of(Range&: VL, P: [&](Value *V) {
11180 auto *I = dyn_cast<Instruction>(Val: V);
11181 return I && I->getParent() == MainOp->getParent() &&
11182 I->isTerminator();
11183 })) {
11184 MainOp = nullptr;
11185 return;
11186 }
11187 MainOpcode = MainOp->getOpcode();
11188 }
11189 }
11190
11191 /// Returns the idempotent value for the \p MainOp with the detected \p
11192 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
11193 /// the operand itself, since V or V == V.
11194 Value *selectBestIdempotentValue() const {
11195 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11196 return ConstantExpr::getBinOpIdentity(Opcode: MainOpcode, Ty: MainOp->getType(),
11197 AllowRHSConstant: !MainOp->isCommutative());
11198 }
11199
11200 /// Returns the value and operands for the \p V, considering if it is original
11201 /// instruction and its actual operands should be returned, or it is a
11202 /// copyable element and its should be represented as idempotent instruction.
11203 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
11204 if (isa<PoisonValue>(Val: V))
11205 return {V, V};
11206 if (!S.isCopyableElement(V))
11207 return convertTo(I: cast<Instruction>(Val: V), S).second;
11208 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11209 return {V, selectBestIdempotentValue()};
11210 }
11211
11212 /// Builds operands for the original instructions.
11213 void
11214 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
11215 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
11216
11217 unsigned ShuffleOrOp =
11218 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11219 Instruction *VL0 = S.getMainOp();
11220
11221 switch (ShuffleOrOp) {
11222 case Instruction::PHI: {
11223 auto *PH = cast<PHINode>(Val: VL0);
11224
11225 // Keeps the reordered operands to avoid code duplication.
11226 PHIHandler Handler(DT, PH, VL);
11227 Handler.buildOperands();
11228 Operands.assign(NumElts: PH->getNumOperands(), Elt: {});
11229 for (unsigned I : seq<unsigned>(Size: PH->getNumOperands()))
11230 Operands[I].assign(in_start: Handler.getOperands(I).begin(),
11231 in_end: Handler.getOperands(I).end());
11232 return;
11233 }
11234 case Instruction::ExtractValue:
11235 case Instruction::ExtractElement:
11236 // This is a special case, as it does not gather, but at the same time
11237 // we are not extending buildTree_rec() towards the operands.
11238 Operands.assign(NumElts: 1, Elt: {VL.size(), VL0->getOperand(i: 0)});
11239 return;
11240 case Instruction::InsertElement:
11241 Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr});
11242 for (auto [Idx, V] : enumerate(First&: VL)) {
11243 auto *IE = cast<InsertElementInst>(Val: V);
11244 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
11245 Ops[Idx] = IE->getOperand(i_nocapture: OpIdx);
11246 }
11247 return;
11248 case Instruction::Load:
11249 Operands.assign(
11250 NumElts: 1, Elt: {VL.size(),
11251 PoisonValue::get(T: cast<LoadInst>(Val: VL0)->getPointerOperandType())});
11252 for (auto [V, Op] : zip(t&: VL, u&: Operands.back())) {
11253 auto *LI = dyn_cast<LoadInst>(Val: V);
11254 if (!LI)
11255 continue;
11256 Op = LI->getPointerOperand();
11257 }
11258 return;
11259 case Instruction::ZExt:
11260 case Instruction::SExt:
11261 case Instruction::FPToUI:
11262 case Instruction::FPToSI:
11263 case Instruction::FPExt:
11264 case Instruction::PtrToInt:
11265 case Instruction::IntToPtr:
11266 case Instruction::SIToFP:
11267 case Instruction::UIToFP:
11268 case Instruction::Trunc:
11269 case Instruction::FPTrunc:
11270 case Instruction::BitCast:
11271 case Instruction::ICmp:
11272 case Instruction::FCmp:
11273 case Instruction::FNeg:
11274 case Instruction::Add:
11275 case Instruction::FAdd:
11276 case Instruction::Sub:
11277 case Instruction::FSub:
11278 case Instruction::Mul:
11279 case Instruction::FMul:
11280 case Instruction::UDiv:
11281 case Instruction::SDiv:
11282 case Instruction::FDiv:
11283 case Instruction::URem:
11284 case Instruction::SRem:
11285 case Instruction::FRem:
11286 case Instruction::Shl:
11287 case Instruction::LShr:
11288 case Instruction::AShr:
11289 case Instruction::And:
11290 case Instruction::Or:
11291 case Instruction::Xor:
11292 case Instruction::Freeze:
11293 case Instruction::Store:
11294 case Instruction::ShuffleVector:
11295 Operands.assign(NumElts: VL0->getNumOperands(), Elt: {VL.size(), nullptr});
11296 for (auto [Idx, V] : enumerate(First&: VL)) {
11297 auto *I = dyn_cast<Instruction>(Val: V);
11298 if (!I) {
11299 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
11300 Ops[Idx] = PoisonValue::get(T: VL0->getOperand(i: OpIdx)->getType());
11301 continue;
11302 }
11303 auto [Op, ConvertedOps] = convertTo(I, S);
11304 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
11305 Ops[Idx] = ConvertedOps[OpIdx];
11306 }
11307 return;
11308 case Instruction::Select:
11309 Operands.assign(NumElts: VL0->getNumOperands(), Elt: {VL.size(), nullptr});
11310 for (auto [Idx, V] : enumerate(First&: VL)) {
11311 auto *I = dyn_cast<Instruction>(Val: V);
11312 if (!I) {
11313 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
11314 Ops[Idx] = PoisonValue::get(T: VL0->getOperand(i: OpIdx)->getType());
11315 continue;
11316 }
11317 if (isa<ZExtInst>(Val: I)) {
11318 // Special case for select + zext i1 to avoid explosion of different
11319 // types. We want to keep the condition as i1 to be able to match
11320 // different selects together and reuse the vectorized condition
11321 // rather than trying to gather it.
11322 Operands[0][Idx] = I->getOperand(i: 0);
11323 Operands[1][Idx] = ConstantInt::get(Ty: I->getType(), V: 1);
11324 Operands[2][Idx] = ConstantInt::getNullValue(Ty: I->getType());
11325 continue;
11326 }
11327 auto [Op, ConvertedOps] = convertTo(I, S);
11328 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
11329 Ops[Idx] = ConvertedOps[OpIdx];
11330 }
11331 return;
11332 case Instruction::GetElementPtr: {
11333 Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr});
11334 // Need to cast all indices to the same type before vectorization to
11335 // avoid crash.
11336 // Required to be able to find correct matches between different gather
11337 // nodes and reuse the vectorized values rather than trying to gather them
11338 // again.
11339 const unsigned IndexIdx = 1;
11340 Type *VL0Ty = VL0->getOperand(i: IndexIdx)->getType();
11341 Type *Ty =
11342 all_of(Range&: VL,
11343 P: [&](Value *V) {
11344 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
11345 return !GEP || VL0Ty == GEP->getOperand(i_nocapture: IndexIdx)->getType();
11346 })
11347 ? VL0Ty
11348 : DL.getIndexType(PtrTy: cast<GetElementPtrInst>(Val: VL0)
11349 ->getPointerOperandType()
11350 ->getScalarType());
11351 for (auto [Idx, V] : enumerate(First&: VL)) {
11352 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
11353 if (!GEP) {
11354 Operands[0][Idx] = V;
11355 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11356 continue;
11357 }
11358 Operands[0][Idx] = GEP->getPointerOperand();
11359 auto *Op = GEP->getOperand(i_nocapture: IndexIdx);
11360 auto *CI = dyn_cast<ConstantInt>(Val: Op);
11361 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
11362 C: CI, DestTy: Ty, IsSigned: CI->getValue().isSignBitSet(), DL)
11363 : Op;
11364 }
11365 return;
11366 }
11367 case Instruction::Call: {
11368 auto *CI = cast<CallInst>(Val: VL0);
11369 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI: &TLI);
11370 for (unsigned Idx : seq<unsigned>(Size: CI->arg_size())) {
11371 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI: &TTI))
11372 continue;
11373 auto &Ops = Operands.emplace_back();
11374 for (Value *V : VL) {
11375 auto *I = dyn_cast<Instruction>(Val: V);
11376 Ops.push_back(Elt: I ? I->getOperand(i: Idx)
11377 : PoisonValue::get(T: VL0->getOperand(i: Idx)->getType()));
11378 }
11379 }
11380 return;
11381 }
11382 default:
11383 break;
11384 }
11385 llvm_unreachable("Unexpected vectorization of the instructions.");
11386 }
11387
11388public:
11389 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11390 const TargetTransformInfo &TTI,
11391 const TargetLibraryInfo &TLI)
11392 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11393
11394 InstructionsState
11395 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
11396 bool TryCopyableElementsVectorization,
11397 bool WithProfitabilityCheck = false,
11398 bool SkipSameCodeCheck = false) {
11399 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11400 ? InstructionsState::invalid()
11401 : getSameOpcode(VL, TLI);
11402 if (S)
11403 return S;
11404 // Check if series of selects + zext i1 %x to in can be combined into
11405 // selects + select %x, i32 1, i32 0.
11406 Instruction *SelectOp = nullptr;
11407 if (allSameBlock(VL) && all_of(Range&: VL, P: [&](Value *V) {
11408 if (match(V, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value()))) {
11409 if (!SelectOp)
11410 SelectOp = cast<Instruction>(Val: V);
11411 return true;
11412 }
11413 auto *ZExt = dyn_cast<ZExtInst>(Val: V);
11414 return (ZExt && ZExt->getSrcTy()->isIntegerTy(Bitwidth: 1)) ||
11415 isa<PoisonValue>(Val: V);
11416 })) {
11417 if (SelectOp)
11418 return InstructionsState(SelectOp, SelectOp);
11419 }
11420 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
11421 return S;
11422 findAndSetMainInstruction(VL, R);
11423 if (!MainOp)
11424 return InstructionsState::invalid();
11425 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11426 if (!WithProfitabilityCheck)
11427 return S;
11428 // Check if it is profitable to vectorize the instruction.
11429 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11430 auto BuildCandidates =
11431 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11432 Value *V2) {
11433 if (V1 != V2 && isa<PHINode>(Val: V1))
11434 return;
11435 auto *I1 = dyn_cast<Instruction>(Val: V1);
11436 auto *I2 = dyn_cast<Instruction>(Val: V2);
11437 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11438 I1->getParent() != I2->getParent())
11439 return;
11440 Candidates.emplace_back(Args&: V1, Args&: (I1 || I2) ? V2 : V1);
11441 };
11442 if (VL.size() == 2) {
11443 // Check if the operands allow better vectorization.
11444 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11445 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11446 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11447 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11448 R.findBestRootPair(Candidates: Candidates1) &&
11449 R.findBestRootPair(Candidates: Candidates2);
11450 if (!Res && isCommutative(I: MainOp)) {
11451 Candidates1.clear();
11452 Candidates2.clear();
11453 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11454 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11455 Res = !Candidates1.empty() && !Candidates2.empty() &&
11456 R.findBestRootPair(Candidates: Candidates1) &&
11457 R.findBestRootPair(Candidates: Candidates2);
11458 }
11459 if (!Res)
11460 return InstructionsState::invalid();
11461 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11462 InstructionCost ScalarCost = TTI.getInstructionCost(U: S.getMainOp(), CostKind: Kind);
11463 InstructionCost VectorCost;
11464 FixedVectorType *VecTy =
11465 getWidenedType(ScalarTy: S.getMainOp()->getType(), VF: VL.size());
11466 switch (MainOpcode) {
11467 case Instruction::Add:
11468 case Instruction::Sub:
11469 case Instruction::LShr:
11470 case Instruction::Shl:
11471 case Instruction::SDiv:
11472 case Instruction::UDiv:
11473 case Instruction::And:
11474 case Instruction::Or:
11475 case Instruction::Xor:
11476 case Instruction::FAdd:
11477 case Instruction::FMul:
11478 case Instruction::FSub:
11479 case Instruction::FDiv:
11480 VectorCost = TTI.getArithmeticInstrCost(Opcode: MainOpcode, Ty: VecTy, CostKind: Kind);
11481 break;
11482 default:
11483 llvm_unreachable("Unexpected instruction.");
11484 }
11485 if (VectorCost > ScalarCost)
11486 return InstructionsState::invalid();
11487 return S;
11488 }
11489 assert(Operands.size() == 2 && "Unexpected number of operands!");
11490 unsigned CopyableNum =
11491 count_if(Range&: VL, P: [&](Value *V) { return S.isCopyableElement(V); });
11492 if (CopyableNum < VL.size() / 2)
11493 return S;
11494 // Too many phi copyables - exit.
11495 const unsigned Limit = VL.size() / 24;
11496 if ((CopyableNum >= VL.size() - Limit ||
11497 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11498 CopyableNum >= MaxPHINumOperands) &&
11499 all_of(Range&: VL, P: [&](Value *V) {
11500 return isa<PHINode>(Val: V) || !S.isCopyableElement(V);
11501 }))
11502 return InstructionsState::invalid();
11503 // Check profitability if number of copyables > VL.size() / 2.
11504 // 1. Reorder operands for better matching.
11505 if (isCommutative(I: MainOp)) {
11506 for (auto [OpL, OpR] : zip(t&: Operands.front(), u&: Operands.back())) {
11507 // Make instructions the first operands.
11508 if (!isa<Instruction>(Val: OpL) && isa<Instruction>(Val: OpR)) {
11509 std::swap(a&: OpL, b&: OpR);
11510 continue;
11511 }
11512 // Make constants the second operands.
11513 if ((isa<Constant>(Val: OpL) && !match(V: OpR, P: m_Zero())) ||
11514 match(V: OpL, P: m_Zero())) {
11515 std::swap(a&: OpL, b&: OpR);
11516 continue;
11517 }
11518 }
11519 }
11520 // 2. Check, if operands can be vectorized.
11521 if (count_if(Range&: Operands.back(), P: IsaPred<Instruction>) > 1)
11522 return InstructionsState::invalid();
11523 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11524 if (allConstant(VL: Ops) || isSplat(VL: Ops))
11525 return true;
11526 // Check if it is "almost" splat, i.e. has >= 4 elements and only single
11527 // one is different.
11528 constexpr unsigned Limit = 4;
11529 if (Operands.front().size() >= Limit) {
11530 SmallDenseMap<const Value *, unsigned> Counters;
11531 for (Value *V : Ops) {
11532 if (isa<UndefValue>(Val: V))
11533 continue;
11534 ++Counters[V];
11535 }
11536 if (Counters.size() == 2 &&
11537 any_of(Range&: Counters, P: [&](const std::pair<const Value *, unsigned> &C) {
11538 return C.second == 1;
11539 }))
11540 return true;
11541 }
11542 // First operand not a constant or splat? Last attempt - check for
11543 // potential vectorization.
11544 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11545 InstructionsState OpS = Analysis.buildInstructionsState(
11546 VL: Ops, R, /*TryCopyableElementsVectorization=*/true);
11547 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(VL: Ops)))
11548 return false;
11549 unsigned CopyableNum =
11550 count_if(Range&: Ops, P: [&](Value *V) { return OpS.isCopyableElement(V); });
11551 return CopyableNum <= VL.size() / 2;
11552 };
11553 if (!CheckOperand(Operands.front()))
11554 return InstructionsState::invalid();
11555
11556 return S;
11557 }
11558
11559 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11560 ArrayRef<Value *> VL) {
11561 assert(S && "Invalid state!");
11562 SmallVector<BoUpSLP::ValueList> Operands;
11563 if (S.areInstructionsWithCopyableElements()) {
11564 MainOp = S.getMainOp();
11565 MainOpcode = S.getOpcode();
11566 Operands.assign(NumElts: MainOp->getNumOperands(),
11567 Elt: BoUpSLP::ValueList(VL.size(), nullptr));
11568 for (auto [Idx, V] : enumerate(First&: VL)) {
11569 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11570 for (auto [OperandIdx, Operand] : enumerate(First&: OperandsForValue))
11571 Operands[OperandIdx][Idx] = Operand;
11572 }
11573 } else {
11574 buildOriginalOperands(S, VL, Operands);
11575 }
11576 return Operands;
11577 }
11578};
11579} // namespace
11580
11581BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11582 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11583 bool TryCopyableElementsVectorization) const {
11584 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11585
11586 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11587 InstructionsState S = Analysis.buildInstructionsState(
11588 VL, R: *this, TryCopyableElementsVectorization,
11589 /*WithProfitabilityCheck=*/true, SkipSameCodeCheck: TryCopyableElementsVectorization);
11590
11591 bool AreScatterAllGEPSameBlock = false;
11592 if (!S) {
11593 SmallVector<unsigned> SortedIndices;
11594 BasicBlock *BB = nullptr;
11595 bool IsScatterVectorizeUserTE =
11596 UserTreeIdx.UserTE &&
11597 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11598 AreScatterAllGEPSameBlock =
11599 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11600 VL.size() > 2 &&
11601 all_of(Range&: VL,
11602 P: [&BB](Value *V) {
11603 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
11604 if (!I)
11605 return doesNotNeedToBeScheduled(V);
11606 if (!BB)
11607 BB = I->getParent();
11608 return BB == I->getParent() && I->getNumOperands() == 2;
11609 }) &&
11610 BB &&
11611 sortPtrAccesses(VL, ElemTy: UserTreeIdx.UserTE->getMainOp()->getType(), DL: *DL,
11612 SE&: *SE, SortedIndices));
11613 if (!AreScatterAllGEPSameBlock) {
11614 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11615 "C,S,B,O, small shuffle. \n";
11616 dbgs() << "[";
11617 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11618 dbgs() << "]\n");
11619 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11620 /*TryToFindDuplicates=*/true,
11621 /*TrySplitVectorize=*/true);
11622 }
11623 // Reset S to make it GetElementPtr kind of node.
11624 const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
11625 assert(It != VL.end() && "Expected at least one GEP.");
11626 S = getSameOpcode(VL: *It, TLI: *TLI);
11627 }
11628 assert(S && "Must be valid.");
11629
11630 // Don't handle vectors.
11631 if (!SLPReVec && getValueType(V: VL.front())->isVectorTy()) {
11632 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11633 // Do not try to pack to avoid extra instructions here.
11634 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11635 /*TryToFindDuplicates=*/false);
11636 }
11637
11638 // Check that all of the users of the scalars that we want to vectorize are
11639 // schedulable.
11640 BasicBlock *BB = S.getMainOp()->getParent();
11641
11642 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator()) ||
11643 !DT->isReachableFromEntry(A: BB)) {
11644 // Don't go into unreachable blocks. They may contain instructions with
11645 // dependency cycles which confuse the final scheduling.
11646 // Do not vectorize EH and non-returning blocks, not profitable in most
11647 // cases.
11648 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11649 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11650 }
11651
11652 // Don't go into catchswitch blocks, which can happen with PHIs.
11653 // Such blocks can only have PHIs and the catchswitch. There is no
11654 // place to insert a shuffle if we need to, so just avoid that issue.
11655 if (isa<CatchSwitchInst>(Val: BB->getTerminator())) {
11656 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11657 // Do not try to pack to avoid extra instructions here.
11658 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11659 /*TryToFindDuplicates=*/false);
11660 }
11661
11662 // Don't handle scalable vectors
11663 if (S.getOpcode() == Instruction::ExtractElement &&
11664 isa<ScalableVectorType>(
11665 Val: cast<ExtractElementInst>(Val: S.getMainOp())->getVectorOperandType())) {
11666 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11667 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11668 }
11669
11670 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11671 // a load), in which case peek through to include it in the tree, without
11672 // ballooning over-budget.
11673 if (Depth >= RecursionMaxDepth &&
11674 (S.isAltShuffle() || VL.size() < 4 ||
11675 !(match(V: S.getMainOp(), P: m_Load(Op: m_Value())) ||
11676 all_of(Range&: VL, P: [&S](const Value *I) {
11677 return match(V: I,
11678 P: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_OneUse(SubPattern: m_Load(Op: m_Value()))))) &&
11679 cast<Instruction>(Val: I)->getOpcode() == S.getOpcode();
11680 })))) {
11681 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11682 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11683 }
11684
11685 // Check if this is a duplicate of another entry.
11686 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11687 for (TreeEntry *E : getTreeEntries(V: S.getMainOp())) {
11688 if (E->isSame(VL)) {
11689 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11690 << ".\n");
11691 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11692 }
11693 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11694 if (all_of(Range&: VL, P: [&](Value *V) {
11695 return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V) ||
11696 (S.getOpcode() == Instruction::PHI && isa<PHINode>(Val: V) &&
11697 LI->getLoopFor(BB: S.getMainOp()->getParent()) &&
11698 isVectorized(V));
11699 })) {
11700 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11701 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11702 }
11703 }
11704
11705 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
11706 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11707 if (!AreAllSameInsts || isSplat(VL) ||
11708 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11709 Val: S.getMainOp()) &&
11710 !all_of(Range&: VL, P: isVectorLikeInstWithConstOps))) {
11711 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O conditions. \n";
11712 dbgs() << "[";
11713 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11714 dbgs() << "]\n");
11715 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11716 }
11717
11718 // Don't vectorize ephemeral values.
11719 if (!EphValues.empty()) {
11720 for (Value *V : VL) {
11721 if (EphValues.count(Ptr: V)) {
11722 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11723 << ") is ephemeral.\n");
11724 // Do not try to pack to avoid extra instructions here.
11725 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11726 /*TryToFindDuplicates=*/false);
11727 }
11728 }
11729 }
11730
11731 // We now know that this is a vector of instructions of the same type from
11732 // the same block.
11733
11734 // Check that none of the instructions in the bundle are already in the tree
11735 // and the node may be not profitable for the vectorization as the small
11736 // alternate node.
11737 if (S.isAltShuffle()) {
11738 auto GetNumVectorizedExtracted = [&]() {
11739 APInt Extracted = APInt::getZero(numBits: VL.size());
11740 APInt Vectorized = APInt::getAllOnes(numBits: VL.size());
11741 for (auto [Idx, V] : enumerate(First&: VL)) {
11742 auto *I = dyn_cast<Instruction>(Val: V);
11743 if (!I || doesNotNeedToBeScheduled(V: I) ||
11744 all_of(Range: I->operands(), P: [&](const Use &U) {
11745 return isa<ExtractElementInst>(Val: U.get());
11746 }))
11747 continue;
11748 if (isVectorized(V: I))
11749 Vectorized.clearBit(BitPosition: Idx);
11750 else if (!I->hasOneUser() && !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList))
11751 Extracted.setBit(Idx);
11752 }
11753 return std::make_pair(x&: Vectorized, y&: Extracted);
11754 };
11755 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11756 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11757 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11758 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11759 // Rough cost estimation, if the vector code (+ potential extracts) is
11760 // more profitable than the scalar + buildvector.
11761 Type *ScalarTy = VL.front()->getType();
11762 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
11763 InstructionCost VectorizeCostEstimate =
11764 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: {}, CostKind: Kind) +
11765 ::getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Extracted,
11766 /*Insert=*/false, /*Extract=*/true, CostKind: Kind);
11767 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11768 TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Vectorized,
11769 /*Insert=*/true, /*Extract=*/false, CostKind: Kind, /*ForPoisonSrc=*/false);
11770 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11771 }
11772 if (PreferScalarize) {
11773 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11774 "node is not profitable.\n");
11775 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11776 }
11777 }
11778
11779 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11780 if (UserIgnoreList && !UserIgnoreList->empty()) {
11781 for (Value *V : VL) {
11782 if (UserIgnoreList->contains(V)) {
11783 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11784 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11785 }
11786 }
11787 }
11788
11789 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11790}
11791
// Recursively builds the SLP vectorization tree for the bundle \p VLRef.
// Checks legality of the bundle, tries split/alternate nodes where the plain
// bundle cannot be vectorized, schedules the bundle, and finally creates a
// per-opcode TreeEntry, recursing into the operands. \p Depth bounds the
// recursion; \p UserTreeIdx identifies the consuming tree node/operand;
// \p InterleaveFactor is forwarded to interleaved load entries.
void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
                           const EdgeInfo &UserTreeIdx,
                           unsigned InterleaveFactor) {
  assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndices;
  // Local copy: VL may be reordered/updated below without touching VLRef.
  SmallVector<Value *> VL(VLRef);

  // Tries to build split node.
  // Splits the bundle into two sub-bundles (Op1/Op2) per LocalState and emits
  // a SplitVectorize entry with one child node per half. Returns false if no
  // profitable split exists.
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    SmallVector<Value *> Op1, Op2;
    OrdersType ReorderIndices;
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
      return false;

    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, EntryState: TreeEntry::SplitVectorize, Bundle&: Invalid, S: LocalState,
                            UserTreeIdx, ReuseShuffleIndices: {}, ReorderIndices);
    LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
    // Creates the child node for one half; Idx selects operand slot 0 or 1.
    auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
      InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI);
      if (S && (isa<LoadInst>(Val: S.getMainOp()) ||
                getSameValuesTreeEntry(V: S.getMainOp(), VL: Op, /*SameVF=*/true))) {
        // Build gather node for loads, they will be gathered later.
        TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(),
                                                    Args: Idx == 0 ? 0 : Op1.size());
        (void)newTreeEntry(VL: Op, EntryState: TreeEntry::NeedToGather, Bundle&: Invalid, S, UserTreeIdx: {TE, Idx});
      } else {
        TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(),
                                                    Args: Idx == 0 ? 0 : Op1.size());
        buildTreeRec(VLRef: Op, Depth, UserTreeIdx: {TE, Idx});
      }
    };
    AddNode(Op1, 0);
    AddNode(Op2, 1);
    return true;
  };

  // True iff VL contains at least one constant and otherwise only PHIs and
  // poison — such bundles are gathered immediately below.
  auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
    bool AreConsts = false;
    for (Value *V : VL) {
      if (isa<PoisonValue>(Val: V))
        continue;
      if (isa<Constant>(Val: V)) {
        AreConsts = true;
        continue;
      }
      if (!isa<PHINode>(Val: V))
        return false;
    }
    return AreConsts;
  };
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, S: InstructionsState::invalid(), UserTreeIdx);
    return;
  }

  // First legality attempt without copyable-element vectorization; on failure,
  // optionally try a split node, then retry legality with copyable elements.
  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
      // Last chance to try to vectorize alternate node.
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
        return;
    }
    if (!S)
      Legality = getScalarsVectorizationLegality(
          VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates())
        tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S,
                            UserTreeIdx);

      // Bundle stays scalar: record a gather entry and stop recursing.
      newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
      return;
    }
    S = Legality.getInstructionsState();
  }

  // FIXME: investigate if there are profitable cases for VL.size() <= 4.
  if (S.isAltShuffle() && TrySplitNode(S))
    return;

  // Check that every instruction appears once in this bundle.
  if (!tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S, UserTreeIdx,
                           /*TryPad=*/true)) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  // Perform specific checks for each particular instruction kind.
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  // Lazily create the per-block scheduler for the bundle's parent block.
  Instruction *VL0 = S.getMainOp();
  BasicBlock *BB = VL0->getParent();
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(args&: BB);

  BlockScheduling &BS = *BSRef;

  // Schedule only the unique scalars; duplicates are handled via
  // ReuseShuffleIndices.
  SetVector<Value *> UniqueValues(llvm::from_range, VL);
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(VL: UniqueValues.getArrayRef(), SLP: this, S, EI: UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants
  BS.verify();
#endif
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // Last chance to try to vectorize alternate node.
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
      return;
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    // Remember the failure so later attempts can bail out earlier.
    NonScheduledFirst.insert(Ptr: VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL: ArrayRef(VL));
    return;
  }
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  // Recurses into all operand bundles of TE, visiting non-PHI operands first.
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI nodes creation
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTreeRec(VLRef: Op, Depth: Depth + 1, UserTreeIdx: {TE, I});
      else
        PHIOps.push_back(Elt: I);
    }
    for (unsigned I : PHIOps)
      buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
  };
  // Create the TreeEntry and recurse into operands per instruction kind.
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
               TE->dump());

    TE->setOperands(Operands);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
      fixupOrderingIndices(Order: CurrentOrder);
    }
    // Insert new order with initial value 0, if it does not exist,
    // otherwise return the iterator to the existing one.
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    // This is a special case, as it does not gather, but at the same time
    // we are not extending buildTreeRec() towards the operands.
    TE->setOperands(Operands);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    // Sort inserts by their element index (min-heap on the index) to detect
    // whether the bundle is an identity insertion sequence.
    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(Inst: VL[I]);
      Indices.emplace(args&: Idx, args&: I);
    }
    // NOTE: shadows the outer CurrentOrder; inserts compute their own order.
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices: {}, ReorderIndices: CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());

    TE->setOperands(Operands);
    // Only the inserted value (operand 1) is vectorized further.
    buildTreeRec(VLRef: TE->getOperand(OpIdx: 1), Depth: Depth + 1, UserTreeIdx: {TE, 1});
    return;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    TreeEntry *TE = nullptr;
    fixupOrderingIndices(Order: CurrentOrder);
    // State (computed by getScalarsVectorizationState) selects the load
    // vectorization strategy.
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndices, ReorderIndices: CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::CompressVectorize:
      // Vectorizing non-consecutive loads with (masked)load + compress.
      TE = newTreeEntry(VL, EntryState: TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
      LLVM_DEBUG(
          dbgs()
              << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
          TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      TE = newTreeEntry(VL, EntryState: TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      TE = newTreeEntry(VL, EntryState: TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs()
              << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      // Reorder the pointer operand to match the jumbled load order.
      SmallVector<int> Mask;
      inversePermutation(Indices: CurrentOrder, Mask);
      reorderScalars(Scalars&: Operands.front(), Mask);
    }
    TE->setOperands(Operands);
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(VLRef: PointerOps, Depth: Depth + 1, UserTreeIdx: {TE, 0});
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // Track the max/min bit widths seen across ext/trunc casts; used later
    // for bit-width minimization of the tree.
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        u: std::make_pair(x: std::numeric_limits<unsigned>::min(),
                        y: std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          x: std::max<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
                              b: PrevMaxBW),
          y: std::min<unsigned>(
              a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
              b: PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          x: std::max<unsigned>(
              a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
              b: PrevMaxBW),
          y: std::min<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
                              b: PrevMinBW));
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());

    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
      buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth, UserTreeIdx: {TE, I});
    // Record operand nodes that may be candidates for bit-width reduction.
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(Val: VL0->getOperand(i: 0))) {
        APInt Mask = DB->getDemandedBits(I: OpI);
        NumSignBits = std::max(a: NumSignBits, b: Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
        ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
               TE->dump());

    VLOperands Ops(VL, Operands, S, *this);
    if (cast<CmpInst>(Val: VL0)->isCommutative()) {
      // Commutative predicate - collect + sort operands of the instructions
      // so that each side is more likely to have the same opcode.
      assert(P0 == CmpInst::getSwappedPredicate(P0) &&
             "Commutative Predicate mismatch");
      Ops.reorder();
      Operands.front() = Ops.getVL(OpIdx: 0);
      Operands.back() = Ops.getVL(OpIdx: 1);
    } else {
      // Collect operands - commute if it uses the swapped predicate.
      for (auto [Idx, V] : enumerate(First&: VL)) {
        if (isa<PoisonValue>(Val: V))
          continue;
        auto *Cmp = cast<CmpInst>(Val: V);
        if (Cmp->getPredicate() != P0)
          std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
      }
    }
    TE->setOperands(Operands);
    buildTreeRec(VLRef: Operands.front(), Depth, UserTreeIdx: {TE, 0});
    buildTreeRec(VLRef: Operands.back(), Depth, UserTreeIdx: {TE, 1});
    // For integer compares, narrow operands may allow bit-width reduction.
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
        ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(Op: VL0->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 1)->getType()))
        ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 1)->Idx);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        TE->dump());

    // For commutative binops, reorder operands to maximize same-opcode sides.
    if (isa<BinaryOperator>(Val: VL0) && isCommutative(I: VL0)) {
      VLOperands Ops(VL, Operands, S, *this);
      Ops.reorder();
      Operands[0] = Ops.getVL(OpIdx: 0);
      Operands[1] = Ops.getVL(OpIdx: 1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
      buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    TE->setOperands(Operands);

    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
      buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
    return;
  }
  case Instruction::Store: {
    // An empty order means the stores are already consecutive.
    bool Consecutive = CurrentOrder.empty();
    if (!Consecutive)
      fixupOrderingIndices(Order: CurrentOrder);
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
          TE->dump());
    TE->setOperands(Operands);
    // Only the stored value (operand 0) is vectorized further.
    buildTreeRec(VLRef: TE->getOperand(OpIdx: 0), Depth: Depth + 1, UserTreeIdx: {TE, 0});
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(Val: VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    if (isCommutative(I: VL0)) {
      VLOperands Ops(VL, Operands, S, *this);
      Ops.reorder();
      Operands[0] = Ops.getVL(OpIdx: 0);
      Operands[1] = Ops.getVL(OpIdx: 1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(Size: CI->arg_size())) {
      // For scalar operands no need to create an entry since no need to
      // vectorize it.
      if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI))
        continue;
      buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
    }
    return;
  }
  case Instruction::ShuffleVector: {
    // Either an alternate-opcode node (mixed main/alt instructions) or, under
    // REVEC, a bundle of actual shufflevector instructions.
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
          TE->dump());
    }

    // Reorder operands if reordering would enable vectorization.
    auto *CI = dyn_cast<CmpInst>(Val: VL0);
    if (CI && any_of(Range&: VL, P: [](Value *V) {
          return !isa<PoisonValue>(Val: V) && !cast<CmpInst>(Val: V)->isCommutative();
        })) {
      auto *MainCI = cast<CmpInst>(Val: S.getMainOp());
      auto *AltCI = cast<CmpInst>(Val: S.getAltOp());
      CmpInst::Predicate MainP = MainCI->getPredicate();
      CmpInst::Predicate AltP = AltCI->getPredicate();
      assert(MainP != AltP &&
             "Expected different main/alternate predicates.");
      // Collect operands - commute if it uses the swapped predicate or
      // alternate operation.
      for (auto [Idx, V] : enumerate(First&: VL)) {
        if (isa<PoisonValue>(Val: V))
          continue;
        auto *Cmp = cast<CmpInst>(Val: V);

        if (isAlternateInstruction(I: Cmp, MainOp: MainCI, AltOp: AltCI, TLI: *TLI)) {
          if (AltP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
            std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
        } else {
          if (MainP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
            std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
        }
      }
      TE->setOperands(Operands);
      buildTreeRec(VLRef: Operands.front(), Depth: Depth + 1, UserTreeIdx: {TE, 0});
      buildTreeRec(VLRef: Operands.back(), Depth: Depth + 1, UserTreeIdx: {TE, 1});
      return;
    }

    if (isa<BinaryOperator>(Val: VL0) || CI) {
      VLOperands Ops(VL, Operands, S, *this);
      Ops.reorder();
      Operands[0] = Ops.getVL(OpIdx: 0);
      Operands[1] = Ops.getVL(OpIdx: 1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
      buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
    return;
  }
  default:
    break;
  }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}
12336
12337unsigned BoUpSLP::canMapToVector(Type *T) const {
12338 unsigned N = 1;
12339 Type *EltTy = T;
12340
12341 while (isa<StructType, ArrayType, FixedVectorType>(Val: EltTy)) {
12342 if (EltTy->isEmptyTy())
12343 return 0;
12344 if (auto *ST = dyn_cast<StructType>(Val: EltTy)) {
12345 // Check that struct is homogeneous.
12346 for (const auto *Ty : ST->elements())
12347 if (Ty != *ST->element_begin())
12348 return 0;
12349 N *= ST->getNumElements();
12350 EltTy = *ST->element_begin();
12351 } else if (auto *AT = dyn_cast<ArrayType>(Val: EltTy)) {
12352 N *= AT->getNumElements();
12353 EltTy = AT->getElementType();
12354 } else {
12355 auto *VT = cast<FixedVectorType>(Val: EltTy);
12356 N *= VT->getNumElements();
12357 EltTy = VT->getElementType();
12358 }
12359 }
12360
12361 if (!isValidElementType(Ty: EltTy))
12362 return 0;
12363 size_t VTSize = DL->getTypeStoreSizeInBits(Ty: getWidenedType(ScalarTy: EltTy, VF: N));
12364 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12365 VTSize != DL->getTypeStoreSizeInBits(Ty: T))
12366 return 0;
12367 return N;
12368}
12369
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  // Find the first real extract in the bundle; undefs may appear alongside
  // extracts, so the front element is not necessarily an instruction.
  const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(Val: *It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(i: 0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    // For extractvalue the source must be a homogeneous aggregate that can be
    // reinterpreted as a vector; canMapToVector returns 0 if it cannot.
    NElts = canMapToVector(T: Vec->getType());
    if (!NElts)
      return false;
    // Check if load can be rewritten as load of vector.
    LoadInst *LI = dyn_cast<LoadInst>(Val: Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(N: VL.size()))
      return false;
  } else {
    NElts = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  // Per-lane extract index; PoisonMaskElem marks lanes that do not constrain
  // the order (non-instructions, undef indices, out-of-range indices).
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(First&: VL)) {
    auto *Inst = dyn_cast<Instruction>(Val: V);
    if (!Inst)
      continue;
    // Every extract must read from the very same source vector.
    if (Inst->getOperand(i: 0) != Vec)
      return false;
    if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst))
      if (isa<UndefValue>(Val: EE->getIndexOperand()))
        continue;
    std::optional<unsigned> Idx = getExtractIndex(E: Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  // All used indices must fit into a window of at most E elements.
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  // If the whole window already fits at the front of the vector, anchor it at
  // offset 0 instead of MinIdx.
  if (MaxIdx + 1 <= E)
    return false, void(), true ? (MinIdx = 0, true) : false;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E + 1 so we can check if the extract
  // instruction index was used already.
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E + 1.
  CurrentOrder.assign(NumElts: E, Elt: E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    // Duplicate extract index - no valid order exists.
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  // An identity order carries no extra information - drop it.
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
12454
12455bool BoUpSLP::areAllUsersVectorized(
12456 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12457 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(V: I))) ||
12458 all_of(Range: I->users(), P: [this](User *U) {
12459 return isVectorized(V: U) || isVectorLikeInstWithConstOps(V: U) ||
12460 (isa<ExtractElementInst>(Val: U) && MustGather.contains(Ptr: U));
12461 });
12462}
12463
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  // Builds the shuffle mask that blends the "main" and the "alternate"
  // vectorized values of an alternate-shuffle node: lane I selects element Idx
  // from the main vector or Sz + Idx from the alternate one, honoring the
  // node's reorder and reuse indices.
  unsigned Sz = Scalars.size();
  Mask.assign(NumElts: Sz, Elt: PoisonMaskElem);
  SmallVector<int> OrderMask;
  // Invert the reorder so we can map a mask lane back to its scalar slot.
  if (!ReorderIndices.empty())
    inversePermutation(Indices: ReorderIndices, Mask&: OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    // Poison scalars stay PoisonMaskElem in the mask.
    if (isa<PoisonValue>(Val: Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Val: Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      // Alternate operation: select from the second (alt) source vector.
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(Elt: OpInst);
    } else {
      // Main operation: select from the first source vector.
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(Elt: OpInst);
    }
  }
  // Finally apply the reuse shuffle on top of the computed mask.
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(Range: ReuseShuffleIndices, d_first: NewMask.begin(), F: [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(RHS&: NewMask);
  }
}
12498
12499static bool isMainInstruction(Instruction *I, Instruction *MainOp,
12500 Instruction *AltOp,
12501 const TargetLibraryInfo &TLI) {
12502 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12503}
12504
/// Checks whether \p I belongs to the alternate (rather than the main)
/// operation of the (MainOp, AltOp) pair. Compares get special treatment
/// because a predicate may match either directly or in swapped form.
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
                                   Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(Val: MainOp)) {
    auto *AltCI = cast<CmpInst>(Val: AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(Val: I);
    // Matching the main compare (directly or swapped) means "not alternate".
    if (isCmpSameOrSwapped(BaseCI: MainCI, CI, TLI))
      return false;
    if (isCmpSameOrSwapped(BaseCI: AltCI, CI, TLI))
      return true;
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(pred: P);

    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    // Anything that does not match the main predicate (in either orientation)
    // is treated as the alternate operation.
    return MainP != P && MainP != SwappedP;
  }
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
}
12528
12529TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) const {
12530 assert(!Ops.empty());
12531 const auto *Op0 = Ops.front();
12532
12533 const bool IsConstant = all_of(Range&: Ops, P: [](Value *V) {
12534 // TODO: We should allow undef elements here
12535 return isConstant(V) && !isa<UndefValue>(Val: V);
12536 });
12537 const bool IsUniform = all_of(Range&: Ops, P: [=](Value *V) {
12538 // TODO: We should allow undef elements here
12539 return V == Op0;
12540 });
12541 const bool IsPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
12542 // TODO: We should allow undef elements here
12543 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
12544 return CI->getValue().isPowerOf2();
12545 return false;
12546 });
12547 const bool IsNegatedPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
12548 // TODO: We should allow undef elements here
12549 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
12550 return CI->getValue().isNegatedPowerOf2();
12551 return false;
12552 });
12553
12554 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12555 if (IsConstant && IsUniform)
12556 VK = TTI::OK_UniformConstantValue;
12557 else if (IsConstant)
12558 VK = TTI::OK_NonUniformConstantValue;
12559 else if (IsUniform)
12560 VK = TTI::OK_UniformValue;
12561
12562 TTI::OperandValueProperties VP = TTI::OP_None;
12563 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12564 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12565
12566 return {.Kind: VK, .Properties: VP};
12567}
12568
namespace {
/// The base class for shuffle instruction emission and shuffle cost estimation.
class BaseShuffleAnalysis {
protected:
  /// Scalar element type of the shuffled values. With REVEC this may itself
  /// be a FixedVectorType (see getVF below).
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value.
  /// When REVEC is disabled, there is no difference between VF and
  /// VNumElements.
  /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
  /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
  /// of 8.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(Val: V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

  /// Checks if the mask is an identity mask.
  /// \param IsStrict if is true the function returns false if mask size does
  /// not match vector size.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    int Index = -1;
    // Exact-size identity mask.
    if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Limit))
      return true;
    if (!IsStrict) {
      // Consider extract subvector starting from index 0.
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
          Index == 0)
        return true;
      // All VF-size submasks are identity (e.g.
      // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
      if (Limit % VF == 0 && all_of(Range: seq<int>(Begin: 0, End: Limit / VF), P: [=](int Idx) {
            ArrayRef<int> Slice = Mask.slice(N: Idx * VF, M: VF);
            return all_of(Range&: Slice, P: equal_to(Arg: PoisonMaskElem)) ||
                   ShuffleVectorInst::isIdentityMask(Mask: Slice, NumSrcElts: VF);
          }))
        return true;
    }
    return false;
  }

  /// Tries to combine 2 different masks into single one.
  /// \param LocalVF Vector length of the permuted input vector. \p Mask may
  /// change the size of the vector, \p LocalVF is the original size of the
  /// shuffled vector.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    // Compose the masks: element I of the result selects whatever Mask
    // selects at the position ExtMask[I]. Indices are folded modulo the
    // mask/vector size to map second-operand references into local indices.
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(RHS&: NewMask);
  }

  /// Looks through shuffles trying to reduce final number of shuffles in the
  /// code. The function looks through the previously emitted shuffle
  /// instructions and properly mark indices in mask as undef.
  /// For example, given the code
  /// \code
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
  /// \endcode
  /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
  /// look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// If 2 operands are of different size, the smallest one will be resized and
  /// the mask recalculated properly.
  /// For example, given the code
  /// \code
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
  /// \endcode
  /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
  /// look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// So, it tries to transform permutations to simple vector merge, if
  /// possible.
  /// \param V The input vector which must be shuffled using the given \p Mask.
  /// If the better candidate is found, \p V is set to this best candidate
  /// vector.
  /// \param Mask The input mask for the shuffle. If the best candidate is found
  /// during looking-through-shuffles attempt, it is updated accordingly.
  /// \param SinglePermute true if the shuffle operation is originally a
  /// single-value-permutation. In this case the look-through-shuffles procedure
  /// may look for resizing shuffles as the best candidates.
  /// \return true if the shuffle results in the non-resizing identity shuffle
  /// (and thus can be ignored), false - otherwise.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    // Walk down the chain of shufflevector definitions, composing masks as we
    // go, until the operand is not a shuffle or both operands are live.
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Op)) {
      // Exit if not a fixed vector type or changing size shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(Val: SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(Mask: IdentityMask,
                                                    NumSrcElts: IdentityMask.size()))) {
          IdentityOp = SV;
          // Store current mask in the IdentityMask so later we do not lose
          // this info if IdentityOp is selected as the best candidate for the
          // permutation.
          IdentityMask.assign(RHS: Mask);
        }
      }
      // Remember the broadcast mask. If no better candidates are found, this Op
      // and Mask will be used in the final shuffle.
      // Zero splat can be used as identity too, since it might be used with
      // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
      // E.g. if need to shuffle the vector with the mask <3, 1, 2, 0>, which is
      // expensive, the analysis finds out, that the source vector is just a
      // broadcast, this original mask can be transformed to identity mask <0,
      // 1, 2, 3>.
      // \code
      // %0 = shuffle %v, poison, zeroinitalizer
      // %res = shuffle %0, poison, <3, 1, 2, 0>
      // \endcode
      // may be transformed to
      // \code
      // %0 = shuffle %v, poison, zeroinitalizer
      // %res = shuffle %0, poison, <0, 1, 2, 3>
      // \endcode
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(RHS: Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      // Compose the current mask with SV's own mask to see which of SV's
      // operands are actually referenced.
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(First&: Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(Elt: I);
      }
      bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
                            V: SV->getOperand(i_nocapture: 0),
                            UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::FirstArg))
                            .all();
      bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
                            V: SV->getOperand(i_nocapture: 1),
                            UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::SecondArg))
                            .all();
      // Both operands are live - cannot look through this shuffle; only mark
      // lanes that resolve to poison and stop.
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elems.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(Elt: I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      // Exactly one operand is live - fold SV's mask into ours and descend
      // into that operand.
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, Mask&: ShuffleMask, ExtMask: Mask);
      Mask.swap(RHS&: ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(i_nocapture: 0);
      else
        Op = SV->getOperand(i_nocapture: 1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Val: Op->getType());
        !OpTy || !isIdentityMask(Mask, VecTy: OpTy, IsStrict: SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())) {
      // The final mask is not an identity - fall back to the best remembered
      // identity/broadcast candidate, if any.
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // Clear known poison elements.
        for (auto [I, Idx] : enumerate(First&: Mask))
          if (Idx == PoisonMaskElem)
            IdentityMask[I] = PoisonMaskElem;
        Mask.swap(RHS&: IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: V);
        return SinglePermute &&
               (isIdentityMask(Mask, VecTy: cast<FixedVectorType>(Val: V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size()) &&
                 all_of(Range: enumerate(First&: Mask), P: [&](const auto &P) {
                   return P.value() == PoisonMaskElem ||
                          Shuffle->getShuffleMask()[P.index()] == 0;
                 })));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }

  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
    SmallVector<int> NewMask(Mask);
    // With REVEC, scale scalar mask indices to vector-element granularity.
    if (ScalarTyNumElements != 1) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask&: NewMask);
      Mask = NewMask;
    }
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
      VF = FTy->getNumElements();
    // Two-source shuffle: only taken when V2 is actually referenced by the
    // mask (otherwise the single-source path below applies).
    if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
                  V: V2, UseMask: buildUseMask(VF, Mask, MaskArg: UseMask::SecondArg))
                  .all()) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      int VF =
          cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
      // Split the combined mask into per-operand masks.
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      // Iterate to a fixed point: each round may peel one more layer of
      // shuffles off either operand.
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(V&: Op1, Mask&: CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(V&: Op2, Mask&: CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through operands
        // again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Val: Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Val: Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(First&: CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(Elt: I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                VF: cast<FixedVectorType>(Val: SV1->getOperand(i_nocapture: 1)->getType())
                    ->getNumElements(),
                Mask: ExtMask1, MaskArg: UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(First&: CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(Elt: I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                VF: cast<FixedVectorType>(Val: SV2->getOperand(i_nocapture: 1)->getType())
                    ->getNumElements(),
                Mask: ExtMask2, MaskArg: UseMask::SecondArg);
            // Both are single-source resizing shuffles of same-typed sources:
            // fold their masks and descend into the sources.
            if (SV1->getOperand(i_nocapture: 0)->getType() ==
                    SV2->getOperand(i_nocapture: 0)->getType() &&
                SV1->getOperand(i_nocapture: 0)->getType() != SV1->getType() &&
                isUndefVector(V: SV1->getOperand(i_nocapture: 1), UseMask: UseMask1).all() &&
                isUndefVector(V: SV2->getOperand(i_nocapture: 1), UseMask: UseMask2).all()) {
              Op1 = SV1->getOperand(i_nocapture: 0);
              Op2 = SV2->getOperand(i_nocapture: 0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, Mask&: ShuffleMask1, ExtMask: CombinedMask1);
              CombinedMask1.swap(RHS&: ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, Mask&: ShuffleMask2, ExtMask: CombinedMask2);
              CombinedMask2.swap(RHS&: ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(a: cast<VectorType>(Val: Op1->getType())
                           ->getElementCount()
                           .getKnownMinValue(),
                    b: cast<VectorType>(Val: Op2->getType())
                           ->getElementCount()
                           .getKnownMinValue());
      // Merge the two per-operand masks back into a single two-source mask.
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      // If both sides resolved to the same vector and the mask degenerated to
      // an identity (or matching splat), no shuffle is needed at all.
      if (Op1 == Op2 &&
          (ShuffleVectorInst::isIdentityMask(Mask: CombinedMask1, NumSrcElts: VF) ||
           (ShuffleVectorInst::isZeroEltSplatMask(Mask: CombinedMask1, NumSrcElts: VF) &&
            isa<ShuffleVectorInst>(Val: Op1) &&
            cast<ShuffleVectorInst>(Val: Op1)->getShuffleMask() ==
                ArrayRef(CombinedMask1))))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(T: Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(Val: V1))
      return Builder.createPoison(
          cast<VectorType>(Val: V1->getType())->getElementType(), Mask.size());
    // Single-source permutation.
    bool IsIdentity = peekThroughShuffles(V&: V1, Mask&: NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    // After a shuffle is emitted, every live lane holds exactly the value at
    // its own position, so the common mask becomes the identity on those lanes.
    for (unsigned I : seq<unsigned>(Size: CommonMask.size()))
      if (Mask[I] != PoisonMaskElem)
        CommonMask[I] = I;
  }
};
} // namespace
12925
12926/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
/// \returns a pair (scalar pointer-chain cost, vector pointer-chain cost) for
/// the pointers \p Ptrs with base \p BasePtr, used by \p Opcode instructions.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  // Here we differentiate two cases: (1) when Ptrs represent a regular
  // vectorization tree node (as they are pointer arguments of scattered
  // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as plane wide unit-stride load/store since all the
  // loads/stores are known to be from/to adjacent locations.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: estimate costs for pointer related costs when vectorizing to
    // a wide load/store.
    // Scalar cost is estimated as a set of pointers with known relationship
    // between them.
    // For vector code we will use BasePtr as argument for the wide load/store
    // but we also need to account all the instructions which are going to
    // stay in vectorized code due to uses outside of these scalar
    // loads/stores.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, Base: BasePtr, Info: TTI::PointersChainInfo::getUnitStride(), AccessTy: ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      // The base pointer itself is always needed by the wide load/store.
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(Elt: V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
      // For simplicity assume Ptr to stay in vectorized code if it's not a
      // GEP instruction. We don't care since it's cost considered free.
      // TODO: We should check for any uses outside of vectorizable tree
      // rather than just single use.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(Elt: V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then we don't have
      // any savings on that.
      return std::make_pair(x: TTI::TCC_Free, y: TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(Ptrs: PtrsRetainedInVecCode, Base: BasePtr,
                                       Info: TTI::PointersChainInfo::getKnownStride(),
                                       AccessTy: VecTy, CostKind);
  } else {
    // Case 1: Ptrs are the arguments of loads that we are going to transform
    // into masked gather load intrinsic.
    // All the scalar GEPs will be removed as a result of vectorization.
    // For any external uses of some lanes extract element instructions will
    // be generated (which cost is estimated separately).
    TTI::PointersChainInfo PtrsInfo =
        all_of(Range&: Ptrs,
               P: [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, Base: BasePtr, Info: PtrsInfo, AccessTy: ScalarTy, CostKind);
    // Find some GEP to estimate the vector-side address computation; fall
    // back to any GEP among Ptrs if BasePtr itself is not one.
    auto *BaseGEP = dyn_cast<GEPOperator>(Val: BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Range&: Ptrs, P: IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(Val: *It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(PointeeType: BaseGEP->getSourceElementType(),
                               Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: VecTy,
                               CostKind);
    }
  }

  return std::make_pair(x&: ScalarCost, y&: VecCost);
}
13007
// Tries to reorder the scalars of gather node \p TE so that equal values (and
// potentially vectorizable clusters) become adjacent, then keeps the
// reordering only if the estimated cost beats plain buildvector emission.
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes if it small (just 2 elements), all-constant or all
  // instructions have same opcode already.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      all_of(Range&: TE.Scalars, P: isConstant))
    return;

  // Skip if an earlier tree entry already covers the very same scalars.
  if (any_of(Range: seq<unsigned>(Size: TE.Idx), P: [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(VL: TE.Scalars);
      }))
    return;

  // Subkey generator for loads: tries to give related (consecutive or
  // compatible-pointer) loads the same subkey so they cluster together.
  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    Key = hash_combine(args: hash_value(ptr: LI->getParent()), args: Key);
    Value *Ptr =
        getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
    if (LoadKeyUsed.contains(V: Key)) {
      auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr));
      if (LIt != LoadsMap.end()) {
        // Prefer matching a load at a known constant distance.
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
                              ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL: *DL, SE&: *SE,
                              /*StrictCheck=*/true))
            return hash_value(ptr: RLI->getPointerOperand());
        }
        // Next best: loads whose pointers are at least compatible.
        for (LoadInst *RLI : LIt->second) {
          if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
                                    Ptr2: LI->getPointerOperand(), TLI: *TLI)) {
            hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
            return SubKey;
          }
        }
        // Enough loads in this bucket already - just join it.
        if (LIt->second.size() > 2) {
          hash_code SubKey =
              hash_value(ptr: LIt->second.back()->getPointerOperand());
          return SubKey;
        }
      }
    }
    LoadKeyUsed.insert(V: Key);
    LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr)).first->second.push_back(Elt: LI);
    return hash_value(ptr: LI->getPointerOperand());
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" scalar instructions, to be able to build extra vectorized
  // nodes.
  for (auto [I, V] : enumerate(First&: TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (auto *Inst = dyn_cast<Instruction>(Val: V);
        Inst && !isa<ExtractElementInst, LoadInst, CastInst>(Val: V) &&
        !isDeleted(I: Inst) && !isVectorized(V)) {
      std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
                                        /*AllowAlternate=*/false);
      ++NumInstructions;
    }
    auto &Container = SortedValues[Key];
    // Detect whether the scalars already appear in clustered order; if any
    // value lands away from the tail of its cluster, a reorder may help.
    if (IsOrdered && !KeyToIndex.contains(Val: V) &&
        !(isa<Constant, ExtractElementInst>(Val: V) ||
          isVectorLikeInstWithConstOps(V)) &&
        ((Container.contains(Key: Idx) &&
          KeyToIndex.at(Val: Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Key: Idx) &&
          KeyToIndex.at(Val: Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(Elt: V);
    KTI.push_back(Elt: I);
  }
  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
  APInt DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size());
  if (!IsOrdered && NumInstructions > 1) {
    // Materialize the clustered order: rewrite Scalars/ReorderIndices so that
    // values with equal (key, subkey) are contiguous.
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(N: TE.Scalars.size(), NV: TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(Val: V);
          for (auto [K, Idx] : enumerate(First&: Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(Val: P.second.front())) {
          // A multi-element instruction cluster may become a vectorized
          // subvector insert; its lanes no longer need scalar insertion.
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              TTI: *TTI, Ty: TE.Scalars.front()->getType(), Sz);
          SubVectors.emplace_back(Args: Cnt - Sz, Args: SubVF);
          for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt - Sz + SubVF))
            DemandedElts.clearBit(BitPosition: I);
        } else if (!P.second.empty() && isConstant(V: P.second.front())) {
          // Constant lanes do not need insertion either.
          for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt))
            DemandedElts.clearBit(BitPosition: I);
        }
      }
    }
  }
  // Reuses always require shuffles, so consider it as profitable.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Do simple cost estimation.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, VF: TE.Scalars.size());
  // Cost of the reordered variant: subvector inserts + scalar inserts for the
  // remaining demanded lanes + the final permute back into original order.
  for (auto [Idx, Sz] : SubVectors) {
    Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind,
                             Index: Idx, SubTp: getWidenedType(ScalarTy, VF: Sz));
  }
  Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
                                   /*Insert=*/true,
                                   /*Extract=*/false, CostKind);
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Size: Sz)) {
    Value *V = TE.getOrdered(Idx: I);
    if (isa<PoisonValue>(Val: V)) {
      ReorderMask[I] = PoisonMaskElem;
    } else if (isConstant(V) || DemandedElts[I]) {
      ReorderMask[I] = I + TE.ReorderIndices.size();
    }
  }
  Cost += ::getShuffleCost(TTI: *TTI,
                           Kind: any_of(Range&: ReorderMask, P: [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           Tp: VecTy, Mask: ReorderMask);
  // Cost of the straightforward buildvector alternative (no reordering):
  // insert all non-constant lanes, blend constants in with a shuffle.
  DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size());
  ReorderMask.assign(NumElts: Sz, Elt: PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Size: Sz)) {
    Value *V = TE.getOrdered(Idx: I);
    if (isConstant(V)) {
      DemandedElts.clearBit(BitPosition: I);
      if (!isa<PoisonValue>(Val: V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  InstructionCost BVCost =
      getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
                               /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: ReorderMask);
  // Keep the reordering only if it is strictly cheaper; otherwise restore the
  // original scalar order.
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(Scalars&: TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
13169
/// Check if we can convert fadd/fsub sequence to FMAD.
/// The conversion is legal only when both the fadd/fsub instructions and the
/// fmul instructions feeding them carry the "contract" fast-math flag; it is
/// considered profitable when the estimated cost of the fused llvm.fmuladd
/// calls is below the cost of the separate fmul + fadd/fsub instructions.
/// \param VL the scalar fadd/fsub-like values being analyzed.
/// \param S instructions state describing main/alternate opcodes of \p VL.
/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
                                       const InstructionsState &S,
                                       DominatorTree &DT, const DataLayout &DL,
                                       TargetTransformInfo &TTI,
                                       const TargetLibraryInfo &TLI) {
  assert(all_of(VL,
                [](Value *V) {
                  return V->getType()->getScalarType()->isFloatingPointTy();
                }) &&
         "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");

  // Returns true if the intersection of the fast-math flags of all relevant
  // instructions in the list (skipping non-instructions, copyable elements
  // and instructions not matching the main/alt opcode) allows contraction.
  auto CheckForContractable = [&](ArrayRef<Value *> VL) {
    FastMathFlags FMF;
    FMF.set();
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(Val: V);
      if (!I)
        continue;
      // Copyable elements are modeled operations, not real ones - skip.
      if (S.isCopyableElement(V: I))
        continue;
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
        continue;
      if (auto *FPCI = dyn_cast<FPMathOperator>(Val: I))
        FMF &= FPCI->getFastMathFlags();
    }
    return FMF.allowContract();
  };
  if (!CheckForContractable(VL))
    return InstructionCost::getInvalid();
  // fmul also should be contractable
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);

  // The first operand list must be a uniform (non-alternate) FMul sequence.
  InstructionsState OpS = getSameOpcode(VL: Operands.front(), TLI);
  if (!OpS.valid())
    return InstructionCost::getInvalid();

  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
    return InstructionCost::getInvalid();
  if (!CheckForContractable(Operands.front()))
    return InstructionCost::getInvalid();
  // Compare the costs.
  // Cost of the separate fmul + fadd/fsub scalar instructions.
  InstructionCost FMulPlusFAddCost = 0;
  // Cost of the fused form (fmuladd calls plus any leftover instructions).
  InstructionCost FMACost = 0;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  FastMathFlags FMF;
  FMF.set();
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I)
      continue;
    if (!S.isCopyableElement(V: I))
      if (auto *FPCI = dyn_cast<FPMathOperator>(Val: I))
        FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(U: I, CostKind);
  }
  // Count the lanes that can actually be fused; lanes whose fmul has extra
  // uses (or is copyable/non-instruction) keep their original instructions.
  unsigned NumOps = 0;
  for (auto [V, Op] : zip(t&: VL, u&: Operands.front())) {
    if (S.isCopyableElement(V))
      continue;
    auto *I = dyn_cast<Instruction>(Val: Op);
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(V: I)) {
      // Not fusable: both the add/sub and (if present) the mul stay as-is.
      if (auto *OpI = dyn_cast<Instruction>(Val: V))
        FMACost += TTI.getInstructionCost(U: OpI, CostKind);
      if (I)
        FMACost += TTI.getInstructionCost(U: I, CostKind);
      continue;
    }
    ++NumOps;
    if (auto *FPCI = dyn_cast<FPMathOperator>(Val: I))
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(U: I, CostKind);
  }
  Type *Ty = VL.front()->getType();
  // One llvm.fmuladd call per fusable lane, with the intersected FMF.
  IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
  FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
  return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
}
13252
/// Checks whether the given Shl node matches the "pack narrow elements into
/// one wide integer" pattern, i.e. a Shl of (zext i<Stride> to i<Sz>) by the
/// gathered constants (0, Stride, 2 * Stride, ..., N - Stride), possibly
/// permuted, which (combined with the or-reduction costed below) is
/// equivalent to bitcasting the narrow source vector to a single wide
/// integer.
/// \param TE the Shl tree entry to analyze.
/// \param Order [out] the permutation required on the narrow source elements
/// before the bitcast; left empty when the shift amounts are already
/// consecutive or when the reversal is folded into a bswap.
/// \param IsBSwap [out] set when the reversed byte order is modeled as
/// bitcast + llvm.bswap.
/// \param ForLoads [out] set when the zext source is a plain vectorized load
/// node and the pattern is modeled as a single wide scalar load instead.
/// \returns true if the bitcast-based lowering is estimated cheaper than the
/// vectorized shl + zext + or-reduction.
bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
                             bool &IsBSwap, bool &ForLoads) const {
  assert(TE.hasState() && TE.getOpcode() == Instruction::Shl &&
         "Expected Shl node.");
  IsBSwap = false;
  ForLoads = false;
  // Only plain vectorized Shl nodes without reordering/reuses/bitwidth
  // minimization, whose scalars all have a single use, are supported.
  if (TE.State != TreeEntry::Vectorize || !TE.ReorderIndices.empty() ||
      !TE.ReuseShuffleIndices.empty() || MinBWs.contains(Val: &TE) ||
      any_of(Range: TE.Scalars, P: [](Value *V) { return !V->hasOneUse(); }))
    return false;
  Type *ScalarTy = TE.getMainOp()->getType();
  // TODO: Check if same can be done for the vector types.
  if (!ScalarTy->isIntegerTy())
    return false;
  if (ScalarTy->isVectorTy())
    return false;
  const unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
  const TreeEntry *LhsTE = getOperandEntry(E: &TE, /*Idx=*/0);
  const TreeEntry *RhsTE = getOperandEntry(E: &TE, /*Idx=*/1);
  // Lhs should be zext i<stride> to I<sz>.
  if (!(LhsTE->State == TreeEntry::Vectorize &&
        LhsTE->getOpcode() == Instruction::ZExt &&
        LhsTE->ReorderIndices.empty() && LhsTE->ReuseShuffleIndices.empty() &&
        !MinBWs.contains(Val: LhsTE) &&
        all_of(Range: LhsTE->Scalars, P: [](Value *V) { return V->hasOneUse(); })))
    return false;
  Type *SrcScalarTy = cast<ZExtInst>(Val: LhsTE->getMainOp())->getSrcTy();
  unsigned Stride = DL->getTypeSizeInBits(Ty: SrcScalarTy);
  // The narrow elements must tile the wide type exactly.
  if (!isPowerOf2_64(Value: Stride) || Stride >= Sz || Sz % Stride != 0 ||
      !isPowerOf2_64(Value: LhsTE->getVectorFactor()))
    return false;
  // Rhs (the shift amounts) must be a plain gather of constants.
  if (!(RhsTE->isGather() && RhsTE->ReorderIndices.empty() &&
        RhsTE->ReuseShuffleIndices.empty() && !MinBWs.contains(Val: RhsTE)))
    return false;
  Order.clear();
  unsigned CurrentValue = 0;
  // Rhs should be (0, Stride, 2 * Stride, ..., N-Stride), where N <= Sz.
  if (all_of(Range: RhsTE->Scalars,
             P: [&](Value *V) {
               CurrentValue += Stride;
               if (isa<UndefValue>(Val: V))
                 return true;
               auto *C = dyn_cast<Constant>(Val: V);
               if (!C)
                 return false;
               return C->getUniqueInteger() == CurrentValue - Stride;
             }) &&
      CurrentValue <= Sz) {
    // Shift amounts already consecutive - no reordering required.
    Order.clear();
  } else {
    const unsigned VF = RhsTE->getVectorFactor();
    Order.assign(NumElts: VF, Elt: VF);
    // Track which logical positions we've seen; reject duplicate shift amounts.
    SmallBitVector SeenPositions(VF);
    // Check if need to reorder Rhs to make it in form (0, Stride, 2 * Stride,
    // ..., N-Stride), where N <= Sz.
    if (VF * Stride > Sz)
      return false;
    for (const auto [Idx, V] : enumerate(First: RhsTE->Scalars)) {
      if (isa<UndefValue>(Val: V))
        continue;
      auto *C = dyn_cast<Constant>(Val: V);
      if (!C)
        return false;
      const APInt &Val = C->getUniqueInteger();
      // Shift amount must be a non-negative, in-range multiple of Stride.
      if (Val.isNegative() || Val.uge(RHS: Sz) || Val.getZExtValue() % Stride != 0)
        return false;
      unsigned Pos = Val.getZExtValue() / Stride;
      // TODO: Support Pos >= VF, in this case need to shift the final value.
      if (Order[Idx] != VF || Pos >= VF)
        return false;
      if (SeenPositions.test(Idx: Pos))
        return false;
      SeenPositions.set(Pos);
      Order[Idx] = Pos;
    }
    // One of the indices not set - exit.
    if (is_contained(Range&: Order, Element: VF))
      return false;
  }
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  // The wide integer type produced by bitcasting the narrow source vector.
  auto *SrcType = IntegerType::getIntNTy(C&: ScalarTy->getContext(),
                                         N: Stride * LhsTE->getVectorFactor());
  FastMathFlags FMF;
  SmallPtrSet<Value *, 4> CheckedExtracts;
  auto *VecTy = getWidenedType(ScalarTy, VF: TE.getVectorFactor());
  auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: LhsTE->getVectorFactor());
  TTI::CastContextHint CastCtx =
      getCastContextHint(TE: *getOperandEntry(E: LhsTE, /*Idx=*/0));
  // Cost of the vectorized form: or-reduction + vector shl + vector zext.
  InstructionCost VecCost =
      TTI->getArithmeticReductionCost(Opcode: Instruction::Or, Ty: VecTy, FMF, CostKind) +
      TTI->getArithmeticInstrCost(Opcode: Instruction::Shl, Ty: VecTy, CostKind,
                                  Opd1Info: getOperandInfo(Ops: LhsTE->Scalars)) +
      TTI->getCastInstrCost(
          Opcode: Instruction::ZExt, Dst: VecTy,
          Src: getWidenedType(ScalarTy: SrcScalarTy, VF: LhsTE->getVectorFactor()), CCH: CastCtx,
          CostKind);
  InstructionCost BitcastCost = TTI->getCastInstrCost(
      Opcode: Instruction::BitCast, Dst: SrcType, Src: SrcVecTy, CCH: CastCtx, CostKind);
  if (!Order.empty()) {
    fixupOrderingIndices(Order);
    SmallVector<int> Mask;
    inversePermutation(Indices: Order, Mask);
    // Account for the shuffle needed to put the elements in order first.
    BitcastCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: SrcVecTy,
                                    Mask, CostKind);
  }
  // Check if the combination can be modeled as a bitcast+byteswap operation.
  constexpr unsigned ByteSize = 8;
  if (!Order.empty() && isReverseOrder(Order) &&
      DL->getTypeSizeInBits(Ty: SrcScalarTy) == ByteSize) {
    IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
    InstructionCost BSwapCost =
        TTI->getCastInstrCost(Opcode: Instruction::BitCast, Dst: SrcType, Src: SrcVecTy, CCH: CastCtx,
                              CostKind) +
        TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
    if (BSwapCost <= BitcastCost) {
      // bitcast + bswap subsumes the reversing shuffle.
      BitcastCost = BSwapCost;
      IsBSwap = true;
      Order.clear();
      // Check for loads in the ZExt node.
      const TreeEntry *SrcTE = getOperandEntry(E: LhsTE, /*Idx=*/0);
      if (SrcTE->State == TreeEntry::Vectorize &&
          SrcTE->ReorderIndices.empty() && SrcTE->ReuseShuffleIndices.empty() &&
          SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
          all_of(Range: SrcTE->Scalars, P: [](Value *V) { return V->hasOneUse(); })) {
        auto *LI = cast<LoadInst>(Val: SrcTE->getMainOp());
        IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
        // Model as a single wide scalar load + bswap, if even cheaper.
        InstructionCost BSwapCost =
            TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: SrcType, Alignment: LI->getAlign(),
                                 AddressSpace: LI->getPointerAddressSpace(), CostKind) +
            TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
        if (BSwapCost <= BitcastCost) {
          // The vector form keeps its vector load - add it for a fair compare.
          VecCost +=
              TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: SrcVecTy, Alignment: LI->getAlign(),
                                   AddressSpace: LI->getPointerAddressSpace(), CostKind);
          BitcastCost = BSwapCost;
          ForLoads = true;
        }
      }
    }
  } else if (Order.empty() && DL->getTypeSizeInBits(Ty: SrcScalarTy) == ByteSize) {
    // Check for loads in the ZExt node.
    const TreeEntry *SrcTE = getOperandEntry(E: LhsTE, /*Idx=*/0);
    if (SrcTE->State == TreeEntry::Vectorize && SrcTE->ReorderIndices.empty() &&
        SrcTE->ReuseShuffleIndices.empty() &&
        SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
        all_of(Range: SrcTE->Scalars, P: [](Value *V) { return V->hasOneUse(); })) {
      auto *LI = cast<LoadInst>(Val: SrcTE->getMainOp());
      // Bytes already in order: model as a single wide scalar load directly.
      BitcastCost =
          TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: SrcType, Alignment: LI->getAlign(),
                               AddressSpace: LI->getPointerAddressSpace(), CostKind);
      VecCost +=
          TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: SrcVecTy, Alignment: LI->getAlign(),
                               AddressSpace: LI->getPointerAddressSpace(), CostKind);
      ForLoads = true;
    }
  }
  // The combined value may be narrower than the shl result type - account for
  // the final extension.
  if (SrcType != ScalarTy) {
    BitcastCost += TTI->getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: SrcType,
                                         CCH: TTI::CastContextHint::None, CostKind);
  }
  return BitcastCost < VecCost;
}
13416
/// Checks whether the given select node (containing zext scalars) sits on top
/// of an alternate-vectorized compare node where part of the compares use the
/// inversed predicate of the main one, so the whole group can be emitted as a
/// single vector compare (inverting the affected lanes) instead of a
/// buildvector of scalar compares.
/// \param SelectTE the select tree entry to analyze.
/// \param InversedCmpsIndices [out] indices of the compare lanes that use the
/// inversed predicate and therefore need their result flipped.
/// \returns true if the single vector compare is estimated cheaper than
/// building the compare results from scalars.
bool BoUpSLP::matchesInversedZExtSelect(
    const TreeEntry &SelectTE,
    SmallVectorImpl<unsigned> &InversedCmpsIndices) const {
  assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
         "Expected select node.");
  // Collect the zext scalars of the select node; bail out if there are none.
  SmallVector<std::pair<Instruction *, unsigned>> ZExts;
  for (auto [Idx, V] : enumerate(First: SelectTE.Scalars)) {
    auto *Inst = dyn_cast<Instruction>(Val: V);
    if (!Inst || Inst->getOpcode() != Instruction::ZExt)
      continue;
    ZExts.emplace_back(Args&: Inst, Args&: Idx);
  }
  if (ZExts.empty())
    return false;
  const auto *CmpTE = getOperandEntry(E: &SelectTE, Idx: 0);
  const auto *Op1TE = getOperandEntry(E: &SelectTE, Idx: 1);
  const auto *Op2TE = getOperandEntry(E: &SelectTE, Idx: 2);
  // Compares must be alternate vectorized, and other operands must be gathers
  // or copyables.
  // TODO: investigate opportunity for reordered/reused nodes.
  if (CmpTE->State != TreeEntry::Vectorize || !CmpTE->isAltShuffle() ||
      (CmpTE->getOpcode() != Instruction::ICmp &&
       CmpTE->getOpcode() != Instruction::FCmp) ||
      !CmpTE->ReorderIndices.empty() || !CmpTE->ReuseShuffleIndices.empty() ||
      !Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
      !Op2TE->ReorderIndices.empty() || !Op2TE->ReuseShuffleIndices.empty())
    return false;
  // The operands must be buildvectors/copyables.
  if (!Op1TE->isGather() || !Op2TE->isGather())
    return false;
  // TODO: investigate opportunity for the vector nodes with copyables.
  auto *Cmp = CmpTE->getMainOp();
  CmpPredicate Pred;
  auto MatchCmp = m_Cmp(Pred, L: m_Value(), R: m_Value());
  if (!match(V: Cmp, P: MatchCmp))
    return false;
  CmpPredicate MainPred = Pred;
  CmpPredicate InversedPred(CmpInst::getInversePredicate(pred: Pred),
                            Pred.hasSameSign());
  // Every compare lane must use either the main predicate or its exact
  // inverse; record the inversed lanes (which must be single-use so the
  // flipped result is not observed elsewhere).
  for (const auto [Idx, V] : enumerate(First: CmpTE->Scalars)) {
    if (!match(V, P: MatchCmp))
      continue;
    if (CmpPredicate::getMatching(A: MainPred, B: Pred))
      continue;
    if (!CmpPredicate::getMatching(A: InversedPred, B: Pred))
      return false;
    if (!V->hasOneUse())
      return false;
    InversedCmpsIndices.push_back(Elt: Idx);
  }

  if (InversedCmpsIndices.empty())
    return false;
  VectorType *VecTy =
      getWidenedType(ScalarTy: Cmp->getOperand(i: 0)->getType(), VF: CmpTE->getVectorFactor());
  Type *CmpTy = CmpInst::makeCmpResultType(opnd_type: VecTy);

  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  // Cost of the single vector compare with the main predicate.
  InstructionCost VecCost =
      TTI->getCmpSelInstrCost(Opcode: CmpTE->getOpcode(), ValTy: VecTy, CondTy: CmpTy, VecPred: MainPred,
                              CostKind, Op1Info: getOperandInfo(Ops: CmpTE->getOperand(OpIdx: 0)),
                              Op2Info: getOperandInfo(Ops: CmpTE->getOperand(OpIdx: 1)));
  // Cost of the alternative: scalar compares inserted into a vector.
  InstructionCost BVCost =
      ::getScalarizationOverhead(TTI: *TTI, ScalarTy: Cmp->getType(), Ty: cast<VectorType>(Val: CmpTy),
                                 DemandedElts: APInt::getAllOnes(numBits: CmpTE->getVectorFactor()),
                                 /*Insert=*/true, /*Extract=*/false, CostKind);
  for (Value *V : CmpTE->Scalars) {
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I)
      continue;
    BVCost += TTI->getInstructionCost(U: I, CostKind);
  }
  return VecCost < BVCost;
}
13491
/// Checks whether the given select node represents an or-reduction of
/// select(cond_i, 1 << i, 0) values, which on a little-endian target is
/// equivalent to bitcasting the vector of conditions to a single integer.
/// Requires an active reduction (UserIgnoreList) whose operations are all
/// 'or' instructions, the true-operands to be the powers of two
/// 1, 2, 4, ... (matching their lane index) and the false-operands to be
/// all zeroes.
/// \param SelectTE the select tree entry to analyze.
/// \returns true if the bitcast-based lowering is estimated no more
/// expensive than the vector select + or-reduction.
bool BoUpSLP::matchesSelectOfBits(const TreeEntry &SelectTE) const {
  assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
         "Expected select node.");
  // The bit-packing trick relies on lane i mapping to bit i - little-endian
  // only.
  if (DL->isBigEndian())
    return false;
  if (!SelectTE.ReorderIndices.empty() || !SelectTE.ReuseShuffleIndices.empty())
    return false;
  // Only applies when vectorizing a reduction.
  if (!UserIgnoreList)
    return false;
  if (any_of(Range: SelectTE.Scalars, P: [](Value *V) { return !V->hasOneUse(); }))
    return false;
  // Check that all reduction operands are or instructions.
  if (any_of(Range: *UserIgnoreList,
             P: [](Value *V) { return !match(V, P: m_Or(L: m_Value(), R: m_Value())); }))
    return false;
  const TreeEntry *Op1TE = getOperandEntry(E: &SelectTE, Idx: 1);
  const TreeEntry *Op2TE = getOperandEntry(E: &SelectTE, Idx: 2);
  if (!Op1TE->isGather() || !Op2TE->isGather())
    return false;
  // No need to check for zeroes reordering.
  if (!Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
      !Op2TE->ReuseShuffleIndices.empty())
    return false;
  Type *ScalarTy = Op1TE->Scalars.front()->getType();
  if (!ScalarTy->isIntegerTy())
    return false;
  // Check that second operand is all zeroes.
  if (any_of(Range: Op2TE->Scalars, P: [](Value *V) { return !match(V, P: m_ZeroInt()); }))
    return false;
  // Check that first operand is 1,2,4,...
  if (any_of(Range: enumerate(First: Op1TE->Scalars), P: [](const auto &P) {
        uint64_t V;
        return !(match(P.value(), m_ConstantInt(V)) && isPowerOf2_64(Value: V) &&
                 Log2_64(Value: V) == P.index());
      }))
    return false;
  // Check if bitcast is cheaper than select.
  // DstTy is an integer with one bit per select lane.
  auto *DstTy = IntegerType::getIntNTy(C&: ScalarTy->getContext(),
                                       N: SelectTE.getVectorFactor());
  VectorType *OpTy = getWidenedType(ScalarTy: DstTy, VF: SelectTE.getVectorFactor());
  Type *CmpTy = CmpInst::makeCmpResultType(opnd_type: OpTy);
  VectorType *VecTy = getWidenedType(ScalarTy, VF: SelectTE.getVectorFactor());
  // Use the minimized bitwidth for the select cost, if one was computed.
  auto It = MinBWs.find(Val: &SelectTE);
  if (It != MinBWs.end()) {
    auto *EffectiveScalarTy =
        IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
    VecTy = getWidenedType(ScalarTy: EffectiveScalarTy, VF: SelectTE.getVectorFactor());
  }
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost BitcastCost = TTI->getCastInstrCost(
      Opcode: Instruction::BitCast, Dst: DstTy, Src: CmpTy, CCH: TTI::CastContextHint::None, CostKind);
  // Extend the packed bits back to the original scalar width, if narrower.
  if (DstTy != ScalarTy) {
    BitcastCost += TTI->getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: DstTy,
                                         CCH: TTI::CastContextHint::None, CostKind);
  }
  FastMathFlags FMF;
  // Cost of the alternative: vector select + or-reduction.
  InstructionCost SelectCost =
      TTI->getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy, CondTy: CmpTy,
                              VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind,
                              Op1Info: getOperandInfo(Ops: Op1TE->Scalars),
                              Op2Info: getOperandInfo(Ops: Op2TE->Scalars)) +
      TTI->getArithmeticReductionCost(Opcode: Instruction::Or, Ty: VecTy, FMF, CostKind);
  return BitcastCost <= SelectCost;
}
13556
13557void BoUpSLP::transformNodes() {
13558 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13559 BaseGraphSize = VectorizableTree.size();
13560 // Turn graph transforming mode on and off, when done.
13561 class GraphTransformModeRAAI {
13562 bool &SavedIsGraphTransformMode;
13563
13564 public:
13565 GraphTransformModeRAAI(bool &IsGraphTransformMode)
13566 : SavedIsGraphTransformMode(IsGraphTransformMode) {
13567 IsGraphTransformMode = true;
13568 }
13569 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
13570 } TransformContext(IsGraphTransformMode);
13571 // Operands are profitable if they are:
13572 // 1. At least one constant
13573 // or
13574 // 2. Splats
13575 // or
13576 // 3. Results in good vectorization opportunity, i.e. may generate vector
13577 // nodes and reduce cost of the graph.
13578 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
13579 const InstructionsState &S) {
13580 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
13581 for (unsigned Op : seq<unsigned>(Size: S.getMainOp()->getNumOperands()))
13582 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
13583 Args: I2->getOperand(i: Op));
13584 return all_of(
13585 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
13586 return all_of(Range&: Cand,
13587 P: [](const std::pair<Value *, Value *> &P) {
13588 return isa<Constant>(Val: P.first) ||
13589 isa<Constant>(Val: P.second) || P.first == P.second;
13590 }) ||
13591 findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplatLoads);
13592 });
13593 };
13594
13595 // Try to reorder gather nodes for better vectorization opportunities.
13596 for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) {
13597 TreeEntry &E = *VectorizableTree[Idx];
13598 if (E.isGather())
13599 reorderGatherNode(TE&: E);
13600 }
13601
13602 // Better to use full gathered loads analysis, if there are only 2 loads
13603 // gathered nodes each having less than 16 elements.
13604 constexpr unsigned VFLimit = 16;
13605 bool ForceLoadGather =
13606 count_if(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
13607 return TE->isGather() && TE->hasState() &&
13608 TE->getOpcode() == Instruction::Load &&
13609 TE->getVectorFactor() < VFLimit;
13610 }) == 2;
13611
13612 // Checks if the scalars are used in other node.
13613 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
13614 function_ref<bool(Value *)> CheckContainer) {
13615 return TE->isSame(VL) || all_of(Range&: VL, P: [&](Value *V) {
13616 if (isa<PoisonValue>(Val: V))
13617 return true;
13618 auto *I = dyn_cast<Instruction>(Val: V);
13619 if (!I)
13620 return false;
13621 return is_contained(Range: TE->Scalars, Element: I) || CheckContainer(I);
13622 });
13623 };
13624 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
13625 if (E.hasState()) {
13626 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: E.getMainOp());
13627 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
13628 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13629 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
13630 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
13631 return is_contained(Range&: TEs, Element: TE);
13632 });
13633 });
13634 }))
13635 return true;
13636 ;
13637 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: E.getMainOp());
13638 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
13639 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13640 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13641 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
13642 return is_contained(Range&: TEs, Element: TE);
13643 });
13644 });
13645 }))
13646 return true;
13647 } else {
13648 // Check if the gather node full copy of split node.
13649 auto *It = find_if(Range: E.Scalars, P: IsaPred<Instruction>);
13650 if (It != E.Scalars.end()) {
13651 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: *It);
13652 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
13653 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13654 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13655 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
13656 return is_contained(Range&: TEs, Element: TE);
13657 });
13658 });
13659 }))
13660 return true;
13661 }
13662 }
13663 return false;
13664 };
13665 // The tree may grow here, so iterate over nodes, built before.
13666 for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) {
13667 TreeEntry &E = *VectorizableTree[Idx];
13668 if (E.isGather()) {
13669 ArrayRef<Value *> VL = E.Scalars;
13670 const unsigned Sz = getVectorElementSize(V: VL.front());
13671 unsigned MinVF = getMinVF(Sz: 2 * Sz);
13672 // Do not try partial vectorization for small nodes (<= 2), nodes with the
13673 // same opcode and same parent block or all constants.
13674 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(key: Idx) ||
13675 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
13676 // We use allSameOpcode instead of isAltShuffle because we don't
13677 // want to use interchangeable instruction here.
13678 !allSameOpcode(VL) || !allSameBlock(VL)) ||
13679 allConstant(VL) || isSplat(VL))
13680 continue;
13681 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
13682 continue;
13683 // Check if the node is a copy of other vector nodes.
13684 if (CheckForSameVectorNodes(E))
13685 continue;
13686 // Try to find vectorizable sequences and transform them into a series of
13687 // insertvector instructions.
13688 unsigned StartIdx = 0;
13689 unsigned End = VL.size();
13690 SmallBitVector Processed(End);
13691 for (unsigned VF = getFloorFullVectorNumberOfElements(
13692 TTI: *TTI, Ty: VL.front()->getType(), Sz: VL.size() - 1);
13693 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
13694 TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) {
13695 if (StartIdx + VF > End)
13696 continue;
13697 SmallVector<std::pair<unsigned, unsigned>> Slices;
13698 bool AllStrided = true;
13699 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13700 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
13701 // If any instruction is vectorized already - do not try again.
13702 // Reuse the existing node, if it fully matches the slice.
13703 if ((Processed.test(Idx: Cnt) || isVectorized(V: Slice.front())) &&
13704 !getSameValuesTreeEntry(V: Slice.front(), VL: Slice, /*SameVF=*/true))
13705 continue;
13706 // Constant already handled effectively - skip.
13707 if (allConstant(VL: Slice))
13708 continue;
13709 // Do not try to vectorize small splats (less than vector register and
13710 // only with the single non-undef element).
13711 bool IsSplat = isSplat(VL: Slice);
13712 bool IsTwoRegisterSplat = true;
13713 if (IsSplat && VF == 2) {
13714 unsigned NumRegs2VF = ::getNumberOfParts(
13715 TTI: *TTI, VecTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF: 2 * VF));
13716 IsTwoRegisterSplat = NumRegs2VF == 2;
13717 }
13718 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
13719 count(Range&: Slice, Element: Slice.front()) ==
13720 static_cast<long>(isa<UndefValue>(Val: Slice.front()) ? VF - 1
13721 : 1)) {
13722 if (IsSplat)
13723 continue;
13724 InstructionsState S = getSameOpcode(VL: Slice, TLI: *TLI);
13725 if (!S || !allSameOpcode(VL: Slice) || !allSameBlock(VL: Slice) ||
13726 (S.getOpcode() == Instruction::Load &&
13727 areKnownNonVectorizableLoads(VL: Slice)) ||
13728 (S.getOpcode() != Instruction::Load &&
13729 !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(), Sz: VF)))
13730 continue;
13731 if (VF == 2) {
13732 // Try to vectorize reduced values or if all users are vectorized.
13733 // For expensive instructions extra extracts might be profitable.
13734 if ((!UserIgnoreList || E.Idx != 0) &&
13735 TTI->getInstructionCost(U: S.getMainOp(), CostKind) <
13736 TTI::TCC_Expensive &&
13737 !all_of(Range&: Slice, P: [&](Value *V) {
13738 if (isa<PoisonValue>(Val: V))
13739 return true;
13740 return areAllUsersVectorized(I: cast<Instruction>(Val: V),
13741 VectorizedVals: UserIgnoreList);
13742 }))
13743 continue;
13744 if (S.getOpcode() == Instruction::Load) {
13745 OrdersType Order;
13746 SmallVector<Value *> PointerOps;
13747 StridedPtrInfo SPtrInfo;
13748 LoadsState Res = canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order,
13749 PointerOps, SPtrInfo);
13750 AllStrided &= Res == LoadsState::StridedVectorize ||
13751 Res == LoadsState::ScatterVectorize ||
13752 Res == LoadsState::Gather;
13753 // Do not vectorize gathers.
13754 if (Res == LoadsState::ScatterVectorize ||
13755 Res == LoadsState::Gather) {
13756 if (Res == LoadsState::Gather) {
13757 registerNonVectorizableLoads(VL: Slice);
13758 // If reductions and the scalars from the root node are
13759 // analyzed - mark as non-vectorizable reduction.
13760 if (UserIgnoreList && E.Idx == 0)
13761 analyzedReductionVals(VL: Slice);
13762 }
13763 continue;
13764 }
13765 } else if (S.getOpcode() == Instruction::ExtractElement ||
13766 (TTI->getInstructionCost(U: S.getMainOp(), CostKind) <
13767 TTI::TCC_Expensive &&
13768 !CheckOperandsProfitability(
13769 S.getMainOp(),
13770 cast<Instruction>(Val: *find_if(Range: reverse(C&: Slice),
13771 P: IsaPred<Instruction>)),
13772 S))) {
              // Do not vectorize extractelements (handled effectively
              // already). Do not vectorize non-profitable instructions
              // (with low cost and non-vectorizable operands).
13776 continue;
13777 }
13778 }
13779 }
13780 Slices.emplace_back(Args&: Cnt, Args: Slice.size());
13781 }
13782 // Do not try to vectorize if all slides are strided or gathered with
13783 // vector factor 2 and there are more than 2 slices. Better to handle
13784 // them in gathered loads analysis, may result in better vectorization.
13785 if (VF == 2 && AllStrided && Slices.size() > 2)
13786 continue;
13787 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13788 E.CombinedEntriesWithIndices.emplace_back(Args&: Idx, Args&: Cnt);
13789 Processed.set(I: Cnt, E: Cnt + Sz);
13790 if (StartIdx == Cnt)
13791 StartIdx = Cnt + Sz;
13792 if (End == Cnt + Sz)
13793 End = Cnt;
13794 };
13795 for (auto [Cnt, Sz] : Slices) {
13796 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: Sz);
13797 const TreeEntry *SameTE = nullptr;
13798 if (const auto *It = find_if(Range&: Slice, P: IsaPred<Instruction>);
13799 It != Slice.end()) {
13800 // If any instruction is vectorized already - do not try again.
13801 SameTE = getSameValuesTreeEntry(V: *It, VL: Slice);
13802 }
13803 unsigned PrevSize = VectorizableTree.size();
13804 [[maybe_unused]] unsigned PrevEntriesSize =
13805 LoadEntriesToVectorize.size();
13806 buildTreeRec(VLRef: Slice, Depth: 0, UserTreeIdx: EdgeInfo(&E, UINT_MAX));
13807 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13808 VectorizableTree[PrevSize]->isGather() &&
13809 VectorizableTree[PrevSize]->hasState() &&
13810 VectorizableTree[PrevSize]->getOpcode() !=
13811 Instruction::ExtractElement &&
13812 !isSplat(VL: Slice)) {
13813 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13814 analyzedReductionVals(VL: Slice);
13815 VectorizableTree.pop_back();
13816 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13817 "LoadEntriesToVectorize expected to remain the same");
13818 continue;
13819 }
13820 AddCombinedNode(PrevSize, Cnt, Sz);
13821 }
13822 }
13823 // Restore ordering, if no extra vectorization happened.
13824 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13825 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13826 reorderScalars(Scalars&: E.Scalars, Mask);
13827 E.ReorderIndices.clear();
13828 }
13829 }
13830 if (!E.hasState())
13831 continue;
13832 switch (E.getOpcode()) {
13833 case Instruction::Load: {
13834 // No need to reorder masked gather loads, just reorder the scalar
13835 // operands.
13836 if (E.State != TreeEntry::Vectorize)
13837 break;
13838 Type *ScalarTy = E.getMainOp()->getType();
13839 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
13840 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E.Scalars);
13841 // Check if profitable to represent consecutive load + reverse as strided
13842 // load with stride -1.
13843 if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) &&
13844 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
13845 SmallVector<int> Mask;
13846 inversePermutation(Indices: E.ReorderIndices, Mask);
13847 auto *BaseLI = cast<LoadInst>(Val: E.Scalars.back());
13848 InstructionCost OriginalVecCost =
13849 TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: BaseLI->getAlign(),
13850 AddressSpace: BaseLI->getPointerAddressSpace(), CostKind,
13851 OpdInfo: TTI::OperandValueInfo()) +
13852 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
13853 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13854 MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
13855 VecTy, BaseLI->getPointerOperand(),
13856 /*VariableMask=*/false, CommonAlignment,
13857 BaseLI),
13858 CostKind);
13859 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13860 // Strided load is more profitable than consecutive load + reverse -
13861 // transform the node to strided load.
13862 Type *StrideTy = DL->getIndexType(PtrTy: cast<LoadInst>(Val: E.Scalars.front())
13863 ->getPointerOperand()
13864 ->getType());
13865 StridedPtrInfo SPtrInfo;
13866 SPtrInfo.StrideVal = ConstantInt::get(Ty: StrideTy, V: 1);
13867 SPtrInfo.Ty = VecTy;
13868 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13869 E.State = TreeEntry::StridedVectorize;
13870 }
13871 }
13872 break;
13873 }
13874 case Instruction::Store: {
13875 Type *ScalarTy =
13876 cast<StoreInst>(Val: E.getMainOp())->getValueOperand()->getType();
13877 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
13878 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E.Scalars);
      // Check if profitable to represent reverse + consecutive store as
      // strided store with stride -1.
13881 if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) &&
13882 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
13883 SmallVector<int> Mask;
13884 inversePermutation(Indices: E.ReorderIndices, Mask);
13885 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.back());
13886 InstructionCost OriginalVecCost =
13887 TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
13888 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind,
13889 OpdInfo: TTI::OperandValueInfo()) +
13890 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
13891 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13892 MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
13893 VecTy, BaseSI->getPointerOperand(),
13894 /*VariableMask=*/false, CommonAlignment,
13895 BaseSI),
13896 CostKind);
13897 if (StridedCost < OriginalVecCost)
13898 // Strided store is more profitable than reverse + consecutive store -
13899 // transform the node to strided store.
13900 E.State = TreeEntry::StridedVectorize;
13901 } else if (!E.ReorderIndices.empty()) {
13902 // Check for interleaved stores.
13903 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13904 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.front());
13905 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13906 if (Mask.size() < 4)
13907 return 0u;
13908 for (unsigned Factor : seq<unsigned>(Begin: 2, End: Mask.size() / 2 + 1)) {
13909 if (ShuffleVectorInst::isInterleaveMask(
13910 Mask, Factor, NumInputElts: VecTy->getElementCount().getFixedValue()) &&
13911 TTI.isLegalInterleavedAccessType(
13912 VTy: VecTy, Factor, Alignment: BaseSI->getAlign(),
13913 AddrSpace: BaseSI->getPointerAddressSpace()))
13914 return Factor;
13915 }
13916
13917 return 0u;
13918 };
13919 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13920 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13921 if (InterleaveFactor != 0)
13922 E.setInterleave(InterleaveFactor);
13923 }
13924 break;
13925 }
13926 case Instruction::Select: {
13927 if (E.State != TreeEntry::Vectorize)
13928 break;
13929 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: E.Scalars);
13930 if (MinMaxID != Intrinsic::not_intrinsic) {
13931 // This node is a minmax node.
13932 E.CombinedOp = TreeEntry::MinMax;
13933 TreeEntry *CondEntry = getOperandEntry(E: &E, Idx: 0);
13934 if (SelectOnly && CondEntry->UserTreeIndex &&
13935 CondEntry->State == TreeEntry::Vectorize) {
13936 // The condition node is part of the combined minmax node.
13937 CondEntry->State = TreeEntry::CombinedVectorize;
13938 }
13939 break;
13940 }
13941 // Check for zext + selects, which can be reordered.
13942 SmallVector<unsigned> InversedCmpsIndices;
13943 if (matchesInversedZExtSelect(SelectTE: E, InversedCmpsIndices)) {
13944 auto *CmpTE = getOperandEntry(E: &E, Idx: 0);
13945 auto *Op1TE = getOperandEntry(E: &E, Idx: 1);
13946 auto *Op2TE = getOperandEntry(E: &E, Idx: 2);
13947 // State now is uniform, not alternate opcode.
13948 CmpTE->setOperations(
13949 InstructionsState(CmpTE->getMainOp(), CmpTE->getMainOp()));
13950 // Update mapping between the swapped values and their internal matching
13951 // nodes.
13952 auto UpdateGatherEntry = [&](TreeEntry *OldTE, TreeEntry *NewTE,
13953 Value *V) {
13954 if (isConstant(V))
13955 return;
13956 auto It = ValueToGatherNodes.find(Val: V);
13957 assert(It != ValueToGatherNodes.end() &&
13958 "Expected to find the value in the map.");
13959 auto &C = It->getSecond();
13960 if (!is_contained(Range&: OldTE->Scalars, Element: V))
13961 C.remove(X: OldTE);
13962 C.insert(X: NewTE);
13963 };
13964 ValueList &Op1 = E.getOperand(OpIdx: 1);
13965 ValueList &Op2 = E.getOperand(OpIdx: 2);
13966 for (const unsigned Idx : InversedCmpsIndices) {
13967 Value *V1 = Op1TE->Scalars[Idx];
13968 Value *V2 = Op2TE->Scalars[Idx];
13969 std::swap(a&: Op1TE->Scalars[Idx], b&: Op2TE->Scalars[Idx]);
13970 std::swap(a&: Op1[Idx], b&: Op2[Idx]);
13971 UpdateGatherEntry(Op1TE, Op2TE, V1);
13972 UpdateGatherEntry(Op2TE, Op1TE, V2);
13973 }
13974 OperandsToTreeEntry.emplace_or_assign(Key: std::make_pair(x: &E, y: 1), Args&: Op1TE);
13975 OperandsToTreeEntry.emplace_or_assign(Key: std::make_pair(x: &E, y: 2), Args&: Op2TE);
13976 // NB: Fallback to check if select can be converted to cmp bitcast.
13977 }
13978 if (matchesSelectOfBits(SelectTE: E)) {
13979 // This node is a (reduced or) cmp bitcast node.
13980 const TreeEntry::CombinedOpcode Code = TreeEntry::ReducedCmpBitcast;
13981 E.CombinedOp = Code;
13982 auto *Op1TE = getOperandEntry(E: &E, Idx: 1);
13983 auto *Op2TE = getOperandEntry(E: &E, Idx: 2);
13984 Op1TE->State = TreeEntry::CombinedVectorize;
13985 Op1TE->CombinedOp = Code;
13986 Op2TE->State = TreeEntry::CombinedVectorize;
13987 Op2TE->CombinedOp = Code;
13988 break;
13989 }
13990 break;
13991 }
13992 case Instruction::FSub:
13993 case Instruction::FAdd: {
13994 // Check if possible to convert (a*b)+c to fma.
13995 if (E.State != TreeEntry::Vectorize ||
13996 !E.getOperations().isAddSubLikeOp())
13997 break;
13998 if (!canConvertToFMA(VL: E.Scalars, S: E.getOperations(), DT&: *DT, DL: *DL, TTI&: *TTI, TLI: *TLI)
13999 .isValid())
14000 break;
14001 // This node is a fmuladd node.
14002 E.CombinedOp = TreeEntry::FMulAdd;
14003 TreeEntry *FMulEntry = getOperandEntry(E: &E, Idx: 0);
14004 if (FMulEntry->UserTreeIndex &&
14005 FMulEntry->State == TreeEntry::Vectorize) {
14006 // The FMul node is part of the combined fmuladd node.
14007 FMulEntry->State = TreeEntry::CombinedVectorize;
14008 }
14009 break;
14010 }
14011 case Instruction::Shl: {
14012 if (E.Idx != 0 || DL->isBigEndian())
14013 break;
14014 if (!UserIgnoreList)
14015 break;
14016 // Check that all reduction operands are disjoint or instructions.
14017 if (any_of(Range: *UserIgnoreList, P: [](Value *V) {
14018 return !match(V, P: m_DisjointOr(L: m_Value(), R: m_Value()));
14019 }))
14020 break;
14021 OrdersType Order;
14022 bool IsBSwap;
14023 bool ForLoads;
14024 if (!matchesShlZExt(TE: E, Order, IsBSwap, ForLoads))
14025 break;
14026 // This node is a (reduced disjoint or) bitcast node.
14027 TreeEntry::CombinedOpcode Code =
14028 IsBSwap ? (ForLoads ? TreeEntry::ReducedBitcastBSwapLoads
14029 : TreeEntry::ReducedBitcastBSwap)
14030 : (ForLoads ? TreeEntry::ReducedBitcastLoads
14031 : TreeEntry::ReducedBitcast);
14032 E.CombinedOp = Code;
14033 E.ReorderIndices = std::move(Order);
14034 TreeEntry *ZExtEntry = getOperandEntry(E: &E, Idx: 0);
14035 assert(ZExtEntry->UserTreeIndex &&
14036 ZExtEntry->State == TreeEntry::Vectorize &&
14037 ZExtEntry->getOpcode() == Instruction::ZExt &&
14038 "Expected ZExt node.");
14039 // The ZExt node is part of the combined node.
14040 ZExtEntry->State = TreeEntry::CombinedVectorize;
14041 ZExtEntry->CombinedOp = Code;
14042 if (ForLoads) {
14043 TreeEntry *LoadsEntry = getOperandEntry(E: ZExtEntry, Idx: 0);
14044 assert(LoadsEntry->UserTreeIndex &&
14045 LoadsEntry->State == TreeEntry::Vectorize &&
14046 LoadsEntry->getOpcode() == Instruction::Load &&
14047 "Expected Load node.");
14048 // The Load node is part of the combined node.
14049 LoadsEntry->State = TreeEntry::CombinedVectorize;
14050 LoadsEntry->CombinedOp = Code;
14051 }
14052 TreeEntry *ConstEntry = getOperandEntry(E: &E, Idx: 1);
14053 assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
14054 "Expected ZExt node.");
14055 // The ConstNode node is part of the combined node.
14056 ConstEntry->State = TreeEntry::CombinedVectorize;
14057 ConstEntry->CombinedOp = Code;
14058 break;
14059 }
14060 default:
14061 break;
14062 }
14063 }
14064
14065 if (LoadEntriesToVectorize.empty()) {
14066 // Single load node - exit.
14067 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
14068 VectorizableTree.front()->getOpcode() == Instruction::Load)
14069 return;
14070 // Small graph with small VF - exit.
14071 constexpr unsigned SmallTree = 3;
14072 constexpr unsigned SmallVF = 2;
14073 if ((VectorizableTree.size() <= SmallTree &&
14074 VectorizableTree.front()->Scalars.size() == SmallVF) ||
14075 (VectorizableTree.size() <= 2 && UserIgnoreList))
14076 return;
14077
14078 if (VectorizableTree.front()->isNonPowOf2Vec() &&
14079 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
14080 getCanonicalGraphSize() <= SmallTree &&
14081 count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()),
14082 P: [](const std::unique_ptr<TreeEntry> &TE) {
14083 return TE->isGather() && TE->hasState() &&
14084 TE->getOpcode() == Instruction::Load &&
14085 !allSameBlock(VL: TE->Scalars);
14086 }) == 1)
14087 return;
14088 }
14089
14090 // A list of loads to be gathered during the vectorization process. We can
14091 // try to vectorize them at the end, if profitable.
14092 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
14093 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
14094 GatheredLoads;
14095
14096 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
14097 TreeEntry &E = *TE;
14098 if (E.isGather() &&
14099 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
14100 (!E.hasState() && any_of(Range&: E.Scalars,
14101 P: [&](Value *V) {
14102 return isa<LoadInst>(Val: V) &&
14103 !isVectorized(V) &&
14104 !isDeleted(I: cast<Instruction>(Val: V));
14105 }))) &&
14106 !isSplat(VL: E.Scalars)) {
14107 for (Value *V : E.Scalars) {
14108 auto *LI = dyn_cast<LoadInst>(Val: V);
14109 if (!LI)
14110 continue;
14111 if (isDeleted(I: LI) || isVectorized(V: LI) || !LI->isSimple())
14112 continue;
14113 gatherPossiblyVectorizableLoads(
14114 R: *this, VL: V, DL: *DL, SE&: *SE, TTI: *TTI,
14115 GatheredLoads&: GatheredLoads[std::make_tuple(
14116 args: LI->getParent(),
14117 args: getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth),
14118 args: LI->getType())]);
14119 }
14120 }
14121 }
14122 // Try to vectorize gathered loads if this is not just a gather of loads.
14123 if (!GatheredLoads.empty())
14124 tryToVectorizeGatheredLoads(GatheredLoads);
14125}
14126
/// Estimates the cost of merging shuffle masks and emitting the final shuffle
/// instruction, if required. It supports shuffling of 2 input vectors. It
/// implements lazy shuffle-cost accounting: the cost of a (virtual) shuffle is
/// added only when the shuffle is actually required. Otherwise, the cost
/// estimation is delayed till the end of the process, to reduce the number of
/// accounted shuffles and enable further analysis/transformations.
14133class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
14134 bool IsFinalized = false;
14135 SmallVector<int> CommonMask;
14136 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
14137 const TargetTransformInfo &TTI;
14138 InstructionCost Cost = 0;
14139 SmallDenseSet<Value *> VectorizedVals;
14140 BoUpSLP &R;
14141 SmallPtrSetImpl<Value *> &CheckedExtracts;
14142 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14143 /// While set, still trying to estimate the cost for the same nodes and we
14144 /// can delay actual cost estimation (virtual shuffle instruction emission).
14145 /// May help better estimate the cost if same nodes must be permuted + allows
14146 /// to move most of the long shuffles cost estimation to TTI.
14147 bool SameNodesEstimated = true;
14148
14149 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
14150 if (Ty->getScalarType()->isPointerTy()) {
14151 Constant *Res = ConstantExpr::getIntToPtr(
14152 C: ConstantInt::getAllOnesValue(
14153 Ty: IntegerType::get(C&: Ty->getContext(),
14154 NumBits: DL.getTypeStoreSizeInBits(Ty: Ty->getScalarType()))),
14155 Ty: Ty->getScalarType());
14156 if (auto *VTy = dyn_cast<VectorType>(Val: Ty))
14157 Res = ConstantVector::getSplat(EC: VTy->getElementCount(), Elt: Res);
14158 return Res;
14159 }
14160 return Constant::getAllOnesValue(Ty);
14161 }
14162
14163 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
14164 if ((!Root && allConstant(VL)) || all_of(Range&: VL, P: IsaPred<UndefValue>))
14165 return TTI::TCC_Free;
14166 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
14167 InstructionCost GatherCost = 0;
14168 SmallVector<Value *> Gathers(VL);
14169 if (!Root && isSplat(VL)) {
14170 // Found the broadcasting of the single scalar, calculate the cost as
14171 // the broadcast.
14172 const auto *It = find_if_not(Range&: VL, P: IsaPred<UndefValue>);
14173 assert(It != VL.end() && "Expected at least one non-undef value.");
14174 // Add broadcast for non-identity shuffle only.
14175 bool NeedShuffle =
14176 count(Range&: VL, Element: *It) > 1 &&
14177 (VL.front() != *It || !all_of(Range: VL.drop_front(), P: IsaPred<UndefValue>));
14178 if (!NeedShuffle) {
14179 if (isa<FixedVectorType>(Val: ScalarTy)) {
14180 assert(SLPReVec && "FixedVectorType is not expected.");
14181 return TTI.getShuffleCost(
14182 Kind: TTI::SK_InsertSubvector, DstTy: VecTy, SrcTy: VecTy, Mask: {}, CostKind,
14183 Index: std::distance(first: VL.begin(), last: It) * getNumElements(Ty: ScalarTy),
14184 SubTp: cast<FixedVectorType>(Val: ScalarTy));
14185 }
14186 return TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy,
14187 CostKind, Index: std::distance(first: VL.begin(), last: It),
14188 Op0: PoisonValue::get(T: VecTy), Op1: *It);
14189 }
14190
14191 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
14192 transform(Range&: VL, d_first: ShuffleMask.begin(), F: [](Value *V) {
14193 return isa<PoisonValue>(Val: V) ? PoisonMaskElem : 0;
14194 });
14195 InstructionCost InsertCost =
14196 TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: 0,
14197 Op0: PoisonValue::get(T: VecTy), Op1: *It);
14198 return InsertCost + ::getShuffleCost(TTI,
14199 Kind: TargetTransformInfo::SK_Broadcast,
14200 Tp: VecTy, Mask: ShuffleMask, CostKind,
14201 /*Index=*/0, /*SubTp=*/nullptr,
14202 /*Args=*/*It);
14203 }
14204 return GatherCost +
14205 (all_of(Range&: Gathers, P: IsaPred<UndefValue>)
14206 ? TTI::TCC_Free
14207 : R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root && VL.equals(RHS: Gathers),
14208 ScalarTy));
14209 };
14210
  /// Compute the cost of creating a vector containing the extracted values from
  /// \p VL.
  /// \p Mask indexes into the (possibly multiple) source vectors of the
  /// extractelements in \p VL; \p ShuffleKinds holds one shuffle kind per
  /// register-sized part (parts with std::nullopt are skipped, i.e. free);
  /// \p NumParts is the number of vector registers the mask is split into.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    // NumElts is the widest source-vector width among the extractelement
    // operands in VL; values that are not extracts of fixed vectors do not
    // contribute.
    unsigned NumElts =
        std::accumulate(first: VL.begin(), last: VL.end(), init: 0, binary_op: [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(Val: V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(Val: EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(a: Sz, b: VecTy->getNumElements());
        });
    // FIXME: this must be moved to TTI for better estimation.
    unsigned EltsPerVector = getPartNumElems(Size: VL.size(), NumParts);
    // Checks whether one register-sized sub-mask reads from at most two
    // source registers. On success, rewrites Mask in place to register-local
    // indices, records the base offset(s) in Indices (plus the subvector
    // sizes in SubVecSizes) and returns the per-register shuffle kind.
    // Returns std::nullopt when the whole source fits in a single register or
    // more than two registers are touched.
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices,
                                        SmallVectorImpl<unsigned> &SubVecSizes)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      // Base offset of the first source register: the minimal used mask
      // index, rounded down to a register boundary.
      int OffsetReg0 =
          alignDown(Value: std::accumulate(first: Mask.begin(), last: Mask.end(), INT_MAX,
                                    binary_op: [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(a: S, b: I);
                                    }),
                    Align: EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      DenseSet<int> RegIndices;
      // Check that if trying to permute same single/2 input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(NumElts: 1, Elt: OffsetReg0);
      for (auto [Pos, I] : enumerate(First&: Mask)) {
        if (I == PoisonMaskElem)
          continue;
        // Identify which physical register this element comes from.
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(V: RegId);
        // More than two distinct source registers - give up on a
        // per-register shuffle for this part.
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            // First time a second register shows up: compute its base offset
            // from the minimal mask index not belonging to the first register.
            OffsetReg1 = alignDown(
                Value: std::accumulate(
                    first: std::next(x: Mask.begin(), n: Pos), last: Mask.end(), INT_MAX,
                    binary_op: [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(a: S, b: I);
                    }),
                Align: EltsPerVector);
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Elt: Index);
            SubVecSizes.push_back(Elt: std::min(a: NumElts - Index, b: EltsPerVector));
          }
          Idx = I - OffsetReg1;
        }
        // Rewrite the element to a register-local index; lanes taken from the
        // second register are offset by EltsPerVector.
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source vector
    // operand can be re-used directly. If not, add the cost of creating a
    // shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(Size: NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          N: Part * EltsPerVector, M: getNumElems(Size: Mask.size(), PartNumElems: EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(Range&: MaskSlice, Out: SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      SmallVector<unsigned, 2> SubVecSizes;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        // Could not map this part onto one or two registers: charge a
        // full-width shuffle unless the slice is already an identity.
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                Mask: MaskSlice, NumSrcElts: std::max<unsigned>(a: NumElts, b: MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, Kind: *ShuffleKinds[Part],
                               Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: MaskSlice);
        continue;
      }
      // Per-register shuffle (skipped when it is a register-local identity).
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(Mask: SubMask, NumSrcElts: EltsPerVector)) {
        Cost +=
            ::getShuffleCost(TTI, Kind: *RegShuffleKind,
                             Tp: getWidenedType(ScalarTy, VF: EltsPerVector), Mask: SubMask);
      }
      // Add the cost of extracting each used subvector out of the full-width
      // source.
      const unsigned BaseVF = getFullVectorNumberOfElements(
          TTI: *R.TTI, Ty: VL.front()->getType(), Sz: alignTo(Value: NumElts, Align: EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(t&: Indices, u&: SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector,
                                 Tp: getWidenedType(ScalarTy, VF: BaseVF), Mask: {}, CostKind,
                                 Index: Idx, SubTp: getWidenedType(ScalarTy, VF: SubVecSize));
      }
      // Second attempt to check, if just a permute is better estimated than
      // subvector extract.
      SubMask.assign(NumElts, Elt: PoisonMaskElem);
      copy(Range&: MaskSlice, Out: SubMask.begin());
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, Kind: *ShuffleKinds[Part], Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
  /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
  /// mask \p Mask, register number \p Part, that includes \p SliceSize
  /// elements.
  /// The cost is accumulated lazily: while the same node pair keeps being
  /// reshuffled, only CommonMask is extended; the (virtual) shuffle cost is
  /// charged once a different pair of nodes shows up.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling.
      // If we already requested the cost of reshuffling of E1 and E2 before, no
      // need to estimate another cost with the sub-Mask, instead include this
      // sub-Mask into the CommonMask to estimate it later and avoid double cost
      // estimation.
      if ((InVectors.size() == 2 &&
           cast<const TreeEntry *>(Val&: InVectors.front()) == &E1 &&
           cast<const TreeEntry *>(Val&: InVectors.back()) == E2) ||
          (!E2 && cast<const TreeEntry *>(Val&: InVectors.front()) == &E1)) {
        unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        // Same nodes as before: merge this part's sub-mask into CommonMask
        // and defer the actual cost estimation.
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(N: Part * SliceSize, M: Limit);
        copy(Range&: SubMask, Out: std::next(x: CommonMask.begin(), n: SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // and transform mask.
      Cost += createShuffle(P1: InVectors.front(),
                            P2: InVectors.size() == 1 ? nullptr : InVectors.back(),
                            Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    } else if (InVectors.size() == 2) {
      // Two inputs already accumulated - flush their shuffle cost before
      // switching to the new node pair.
      Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      // Shuffle the single accumulated input with E1: E1's lanes are appended
      // after the larger of the two vector factors.
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = dyn_cast<Value *>(Val&: InVectors.front())) {
        VF = std::max(a: VF, b: getVF(V: V1));
      } else {
        const auto *E = cast<const TreeEntry *>(Val&: InVectors.front());
        VF = std::max(a: VF, b: E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(P1: InVectors.front(), P2: &E1, Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    } else {
      // Shuffle E1 with E2 first, then combine the result with the previously
      // accumulated input (saved in P before the mask is rewritten).
      auto P = InVectors.front();
      Cost += createShuffle(P1: &E1, P2: E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = dyn_cast<Value *>(Val&: P)) {
        VF = std::max(a: VF,
                      b: getNumElements(Ty: V1->getType()));
      } else {
        const auto *E = cast<const TreeEntry *>(Val&: P);
        VF = std::max(a: VF, b: E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P1: P, P2: InVectors.front(), Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    }
  }
14405
14406 class ShuffleCostBuilder {
14407 const TargetTransformInfo &TTI;
14408
14409 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
14410 int Index = -1;
14411 return Mask.empty() ||
14412 (VF == Mask.size() &&
14413 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) ||
14414 (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
14415 Index == 0);
14416 }
14417
14418 public:
14419 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
14420 ~ShuffleCostBuilder() = default;
14421 InstructionCost createShuffleVector(Value *V1, Value *,
14422 ArrayRef<int> Mask) const {
14423 // Empty mask or identity mask are free.
14424 unsigned VF =
14425 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
14426 if (isEmptyOrIdentity(Mask, VF))
14427 return TTI::TCC_Free;
14428 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
14429 Tp: cast<VectorType>(Val: V1->getType()), Mask);
14430 }
14431 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
14432 // Empty mask or identity mask are free.
14433 unsigned VF =
14434 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
14435 if (isEmptyOrIdentity(Mask, VF))
14436 return TTI::TCC_Free;
14437 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc,
14438 Tp: cast<VectorType>(Val: V1->getType()), Mask);
14439 }
14440 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
14441 InstructionCost createPoison(Type *Ty, unsigned VF) const {
14442 return TTI::TCC_Free;
14443 }
14444 void resizeToMatch(Value *&, Value *&) const {}
14445 };
14446
14447 /// Smart shuffle instruction emission, walks through shuffles trees and
14448 /// tries to find the best matching vector for the actual shuffle
14449 /// instruction.
14450 InstructionCost
14451 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
14452 const PointerUnion<Value *, const TreeEntry *> &P2,
14453 ArrayRef<int> Mask) {
14454 ShuffleCostBuilder Builder(TTI);
14455 SmallVector<int> CommonMask(Mask);
14456 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
14457 unsigned CommonVF = Mask.size();
14458 InstructionCost ExtraCost = 0;
14459 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
14460 unsigned VF) -> InstructionCost {
14461 if (E.isGather() && allConstant(VL: E.Scalars))
14462 return TTI::TCC_Free;
14463 Type *EScalarTy = E.Scalars.front()->getType();
14464 bool IsSigned = true;
14465 if (auto It = R.MinBWs.find(Val: &E); It != R.MinBWs.end()) {
14466 EScalarTy = IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
14467 IsSigned = It->second.second;
14468 }
14469 if (EScalarTy != ScalarTy) {
14470 unsigned CastOpcode = Instruction::Trunc;
14471 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
14472 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
14473 if (DstSz > SrcSz)
14474 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14475 return TTI.getCastInstrCost(Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF),
14476 Src: getWidenedType(ScalarTy: EScalarTy, VF),
14477 CCH: TTI::CastContextHint::None, CostKind);
14478 }
14479 return TTI::TCC_Free;
14480 };
14481 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
14482 if (isa<Constant>(Val: V))
14483 return TTI::TCC_Free;
14484 auto *VecTy = cast<VectorType>(Val: V->getType());
14485 Type *EScalarTy = VecTy->getElementType();
14486 if (EScalarTy != ScalarTy) {
14487 bool IsSigned = !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL));
14488 unsigned CastOpcode = Instruction::Trunc;
14489 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
14490 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
14491 if (DstSz > SrcSz)
14492 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14493 return TTI.getCastInstrCost(
14494 Opcode: CastOpcode, Dst: VectorType::get(ElementType: ScalarTy, EC: VecTy->getElementCount()),
14495 Src: VecTy, CCH: TTI::CastContextHint::None, CostKind);
14496 }
14497 return TTI::TCC_Free;
14498 };
14499 if (!V1 && !V2 && !P2.isNull()) {
14500 // Shuffle 2 entry nodes.
14501 const TreeEntry *E = cast<const TreeEntry *>(Val: P1);
14502 unsigned VF = E->getVectorFactor();
14503 const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2);
14504 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
14505 assert(all_of(Mask,
14506 [=](int Idx) {
14507 return Idx < 2 * static_cast<int>(CommonVF);
14508 }) &&
14509 "All elements in mask must be less than 2 * CommonVF.");
14510 if (E->Scalars.size() == E2->Scalars.size()) {
14511 SmallVector<int> EMask = E->getCommonMask();
14512 SmallVector<int> E2Mask = E2->getCommonMask();
14513 if (!EMask.empty() || !E2Mask.empty()) {
14514 for (int &Idx : CommonMask) {
14515 if (Idx == PoisonMaskElem)
14516 continue;
14517 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
14518 Idx = EMask[Idx];
14519 else if (Idx >= static_cast<int>(CommonVF))
14520 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
14521 E->Scalars.size();
14522 }
14523 }
14524 CommonVF = E->Scalars.size();
14525 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
14526 GetNodeMinBWAffectedCost(*E2, CommonVF);
14527 } else {
14528 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
14529 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
14530 }
14531 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14532 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14533 } else if (!V1 && P2.isNull()) {
14534 // Shuffle single entry node.
14535 const TreeEntry *E = cast<const TreeEntry *>(Val: P1);
14536 unsigned VF = E->getVectorFactor();
14537 CommonVF = VF;
14538 assert(
14539 all_of(Mask,
14540 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14541 "All elements in mask must be less than CommonVF.");
14542 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
14543 SmallVector<int> EMask = E->getCommonMask();
14544 assert(!EMask.empty() && "Expected non-empty common mask.");
14545 for (int &Idx : CommonMask) {
14546 if (Idx != PoisonMaskElem)
14547 Idx = EMask[Idx];
14548 }
14549 CommonVF = E->Scalars.size();
14550 } else if (unsigned Factor = E->getInterleaveFactor();
14551 Factor > 0 && E->Scalars.size() != Mask.size() &&
14552 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask: CommonMask,
14553 Factor)) {
14554 // Deinterleaved nodes are free.
14555 std::iota(first: CommonMask.begin(), last: CommonMask.end(), value: 0);
14556 }
14557 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
14558 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14559 // Not identity/broadcast? Try to see if the original vector is better.
14560 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
14561 CommonVF == CommonMask.size() &&
14562 any_of(Range: enumerate(First&: CommonMask),
14563 P: [](const auto &&P) {
14564 return P.value() != PoisonMaskElem &&
14565 static_cast<unsigned>(P.value()) != P.index();
14566 }) &&
14567 any_of(Range&: CommonMask,
14568 P: [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
14569 SmallVector<int> ReorderMask;
14570 inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
14571 ::addMask(Mask&: CommonMask, SubMask: ReorderMask);
14572 }
14573 } else if (V1 && P2.isNull()) {
14574 // Shuffle single vector.
14575 ExtraCost += GetValueMinBWAffectedCost(V1);
14576 CommonVF = getVF(V: V1);
14577 assert(
14578 all_of(Mask,
14579 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14580 "All elements in mask must be less than CommonVF.");
14581 } else if (V1 && !V2) {
14582 // Shuffle vector and tree node.
14583 unsigned VF = getVF(V: V1);
14584 const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2);
14585 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
14586 assert(all_of(Mask,
14587 [=](int Idx) {
14588 return Idx < 2 * static_cast<int>(CommonVF);
14589 }) &&
14590 "All elements in mask must be less than 2 * CommonVF.");
14591 if (E2->Scalars.size() == VF && VF != CommonVF) {
14592 SmallVector<int> E2Mask = E2->getCommonMask();
14593 assert(!E2Mask.empty() && "Expected non-empty common mask.");
14594 for (int &Idx : CommonMask) {
14595 if (Idx == PoisonMaskElem)
14596 continue;
14597 if (Idx >= static_cast<int>(CommonVF))
14598 Idx = E2Mask[Idx - CommonVF] + VF;
14599 }
14600 CommonVF = VF;
14601 }
14602 ExtraCost += GetValueMinBWAffectedCost(V1);
14603 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14604 ExtraCost += GetNodeMinBWAffectedCost(
14605 *E2, std::min(a: CommonVF, b: E2->getVectorFactor()));
14606 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14607 } else if (!V1 && V2) {
14608 // Shuffle vector and tree node.
14609 unsigned VF = getVF(V: V2);
14610 const TreeEntry *E1 = cast<const TreeEntry *>(Val: P1);
14611 CommonVF = std::max(a: VF, b: E1->getVectorFactor());
14612 assert(all_of(Mask,
14613 [=](int Idx) {
14614 return Idx < 2 * static_cast<int>(CommonVF);
14615 }) &&
14616 "All elements in mask must be less than 2 * CommonVF.");
14617 if (E1->Scalars.size() == VF && VF != CommonVF) {
14618 SmallVector<int> E1Mask = E1->getCommonMask();
14619 assert(!E1Mask.empty() && "Expected non-empty common mask.");
14620 for (int &Idx : CommonMask) {
14621 if (Idx == PoisonMaskElem)
14622 continue;
14623 if (Idx >= static_cast<int>(CommonVF))
14624 Idx = E1Mask[Idx - CommonVF] + VF;
14625 else
14626 Idx = E1Mask[Idx];
14627 }
14628 CommonVF = VF;
14629 }
14630 ExtraCost += GetNodeMinBWAffectedCost(
14631 *E1, std::min(a: CommonVF, b: E1->getVectorFactor()));
14632 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14633 ExtraCost += GetValueMinBWAffectedCost(V2);
14634 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14635 } else {
14636 assert(V1 && V2 && "Expected both vectors.");
14637 unsigned VF = getVF(V: V1);
14638 CommonVF = std::max(a: VF, b: getVF(V: V2));
14639 assert(all_of(Mask,
14640 [=](int Idx) {
14641 return Idx < 2 * static_cast<int>(CommonVF);
14642 }) &&
14643 "All elements in mask must be less than 2 * CommonVF.");
14644 ExtraCost +=
14645 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
14646 if (V1->getType() != V2->getType()) {
14647 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14648 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14649 } else {
14650 if (cast<VectorType>(Val: V1->getType())->getElementType() != ScalarTy)
14651 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14652 if (cast<VectorType>(Val: V2->getType())->getElementType() != ScalarTy)
14653 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14654 }
14655 }
14656 InVectors.front() =
14657 Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
14658 if (InVectors.size() == 2)
14659 InVectors.pop_back();
14660 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
14661 V1, V2, Mask: CommonMask, Builder, ScalarTy);
14662 }
14663
14664public:
  /// Constructs the cost estimator.
  /// \param ScalarTy scalar element type of the node being costed (forwarded
  ///        to the base shuffle analysis).
  /// \param TTI target cost model used for all cost queries.
  /// \param VectorizedVals values already known to be vectorized; copied into
  ///        a local list.
  /// \param R the owning SLP vectorizer instance.
  /// \param CheckedExtracts shared set of extractelements whose cost was
  ///        already adjusted, to avoid double-counting across nodes.
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  /// Adjusts the cost for a gather node \p E built from extractelement
  /// instructions: extracts that become dead after vectorization get their
  /// scalarization cost subtracted. Returns the vector the scalars are
  /// extracted from (a single base, or a widened null-value placeholder when
  /// multiple bases are combined and \p UseVecBaseAsInput is set), or nullptr
  /// if \p Mask is empty.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    // Work on the scalars in their (possibly reordered) vectorized order.
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(Scalars&: VL, Mask: ReorderMask);
    }
    // Check if it can be considered reused if same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        Range: ArrayRef(R.VectorizableTree).take_front(N: E->Idx),
        P: [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(Range: enumerate(First&: TE->Scalars), P: [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
    // Per-base-vector bitmap of extracted lanes, used to subtract
    // scalarization overhead once per base below.
    SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
    for (unsigned Part : seq<unsigned>(Size: NumParts)) {
      unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
      for (auto [I, V] :
           enumerate(First: ArrayRef(VL).slice(N: Part * SliceSize, M: Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(Val: V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of instruction are going to be vectorized and this
        // instruction itself is not going to be vectorized, consider this
        // instruction as dead and remove its cost from the final cost of the
        // vectorized tree.
        // Also, avoid adjusting the cost for extractelements with multiple uses
        // in different graph entries.
        auto *EE = cast<ExtractElementInst>(Val: V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(Ptr: VecBase);
        ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
        if (!CheckedExtracts.insert(Ptr: V).second ||
            !R.areAllUsersVectorized(I: cast<Instruction>(Val: V), VectorizedVals: &VectorizedVals) ||
            any_of(Range&: VEs,
                   P: [&](const TreeEntry *TE) {
                     return R.DeletedNodes.contains(Ptr: TE) ||
                            R.TransformedToGatherNodes.contains(Val: TE);
                   }) ||
            (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
             !R.isVectorized(V: EE) &&
             count_if(Range: E->Scalars, P: [&](Value *V) { return V == EE; }) !=
                 count_if(Range&: E->UserTreeIndex.UserTE->Scalars,
                          P: [&](Value *V) { return V == EE; })) ||
            any_of(Range: EE->users(),
                   P: [&](User *U) {
                     return isa<GetElementPtrInst>(Val: U) &&
                            !R.areAllUsersVectorized(I: cast<Instruction>(Val: U),
                                                     VectorizedVals: &VectorizedVals);
                   }) ||
            (!VEs.empty() && !is_contained(Range&: VEs, Element: E)))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(E: EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Val: Ext) &&
              all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: EE->getVectorOperandType(),
                Index: Idx, CostKind);
            // Add back the cost of s|zext which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: EE->getType(),
                CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
            continue;
          }
        }
        // Record the extracted lane for this base vector; the overhead is
        // subtracted in bulk after the loop.
        APInt &DemandedElts =
            VectorOpsToExtracts
                .try_emplace(Key: VecBase,
                             Args: APInt::getZero(numBits: getNumElements(Ty: VecBase->getType())))
                .first->getSecond();
        DemandedElts.setBit(Idx);
      }
    }
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      Cost -= TTI.getScalarizationOverhead(Ty: cast<VectorType>(Val: Vec->getType()),
                                           DemandedElts, /*Insert=*/false,
                                           /*Extract=*/true, CostKind);
    // Check that gather of extractelements can be represented as just a
    // shuffle of a single/two vectors the scalars are extracted from.
    // Found the bunch of extractelement instructions that must be gathered
    // into a vector and can be represented as a permutation elements in a
    // single input vector or of 2 input vectors.
    // Done for reused if same extractelements were vectorized already.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // Seed the builder state with this node and the (normalized) mask.
    InVectors.assign(NumElts: 1, Elt: E);
    CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
    transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      // Several base vectors combined per-part: callers must use the widened
      // vector itself as the shuffle input.
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
    }
    return VecBase;
  }
14793 /// Checks if the specified entry \p E needs to be delayed because of its
14794 /// dependency nodes.
14795 std::optional<InstructionCost>
14796 needToDelay(const TreeEntry *,
14797 ArrayRef<SmallVector<const TreeEntry *>>) const {
14798 // No need to delay the cost estimation during analysis.
14799 return std::nullopt;
14800 }
14801 /// Reset the builder to handle perfect diamond match.
14802 void resetForSameNode() {
14803 IsFinalized = false;
14804 CommonMask.clear();
14805 InVectors.clear();
14806 Cost = 0;
14807 VectorizedVals.clear();
14808 SameNodesEstimated = true;
14809 }
  /// Adds two tree entries and the mask that shuffles their vectorized values.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      // Both operands are the same node - degenerate to the single-entry
      // overload; the mask must only reference the first vector.
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      // First inputs: just record entries and mask, no cost accounted yet.
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      InVectors.assign(IL: {&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size());
    unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
    // Locate the register-sized part holding the first defined mask element.
    const auto *It = find_if(Range&: Mask, P: not_equal_to(Arg: PoisonMaskElem));
    unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
    estimateNodesPermuteCost(E1, E2: &E2, Mask, Part, SliceSize);
  }
  /// Adds a single tree entry and the mask for shuffling its vectorized value.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      // First input: just record the entry and mask, no cost accounted yet.
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      InVectors.assign(NumElts: 1, Elt: &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size());
    unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
    // Locate the register-sized part holding the first defined mask element.
    const auto *It = find_if(Range&: Mask, P: not_equal_to(Arg: PoisonMaskElem));
    unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
    estimateNodesPermuteCost(E1, E2: nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(Args: &E1);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    // Deliberately a no-op at runtime: the assert only validates that every
    // defined lane of CommonMask extracts from V1 or V2.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      // First input: record vector and mask, no cost accounted yet.
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      InVectors.assign(NumElts: 1, Elt: V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar = cast<const TreeEntry *>(InVectors[0])
                                          ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      if (isa<Constant>(V1))
                        return true;
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V: V1);
    if (InVectors.size() == 2) {
      // Two inputs already queued: fold them into one shuffle first, then
      // treat the result as a single input of CommonMask's width.
      Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      VF = std::max<unsigned>(a: VF, b: CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(a: VF, b: InTE->getVectorFactor());
    } else {
      VF = std::max(
          a: VF, b: cast<FixedVectorType>(Val: cast<Value *>(Val&: InVectors.front())->getType())
                  ->getNumElements());
    }
    InVectors.push_back(Elt: V1);
    // Merge Mask into CommonMask, offsetting new-vector lanes by VF.
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  /// Accounts for the cost of building a vector from \p VL and returns a
  /// constant placeholder standing in for the gathered vector (real IR is not
  /// emitted during cost estimation). Poison/undef scalars are kept as
  /// poison/undef lanes; all other lanes become null values.
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      // MaskVF, if given, caps the number of lanes in the placeholder.
      if (MaskVF != 0)
        VF = std::min(a: VF, b: MaskVF);
      Type *VLScalarTy = VL.front()->getType();
      for (Value *V : VL.take_front(N: VF)) {
        Type *ScalarTy = VLScalarTy->getScalarType();
        if (isa<PoisonValue>(Val: V)) {
          Vals.push_back(Elt: PoisonValue::get(T: ScalarTy));
          continue;
        }
        if (isa<UndefValue>(Val: V)) {
          Vals.push_back(Elt: UndefValue::get(T: ScalarTy));
          continue;
        }
        Vals.push_back(Elt: Constant::getNullValue(Ty: ScalarTy));
      }
      if (auto *VecTy = dyn_cast<FixedVectorType>(Val: VLScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // When REVEC is enabled, we need to expand vector types into scalar
        // types.
        Vals = replicateMask(Val: Vals, VF: VecTy->getNumElements());
      }
      return ConstantVector::get(V: Vals);
    }
    // With a root given, return an all-ones splat matching the root's width.
    return ConstantVector::getSplat(
        EC: ElementCount::getFixed(
            MinVal: cast<FixedVectorType>(Val: Root->getType())->getNumElements()),
        Elt: getAllOnesValue(DL: *R.DL, Ty: ScalarTy->getScalarType()));
  }
14949 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  /// Flushes all pending inputs into shuffle costs, accounts for inserting the
  /// \p SubVectors (with optional \p SubVectorsMask permutation), applies the
  /// external mask \p ExtMask, and returns the total accumulated cost.
  /// \p Action, if provided, post-processes the intermediate vector (e.g. for
  /// resizing); \p VF must be non-zero in that case.
  InstructionCost finalize(
      ArrayRef<int> ExtMask,
      ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
      ArrayRef<int> SubVectorsMask, unsigned VF = 0,
      function_ref<void(Value *&, SmallVectorImpl<int> &,
                        function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
          Action = {}) {
    IsFinalized = true;
    if (Action) {
      // Flush pending inputs into a single shuffle before running the action.
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
      else
        Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Val: Vec);
      Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
        Cost += createShuffle(P1: V1, P2: V2, Mask);
        return V1;
      });
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      // Flush pending inputs first, then cost the subvector insertions.
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
      else
        Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        // Build a two-source mask: subvector lanes from the first source,
        // common-mask lanes shifted to the second source.
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(Range&: SubVectorsMask, Out: SVMask.begin());
        for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
                                 Tp: getWidenedType(ScalarTy, VF: CommonMask.size()),
                                 Mask: SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(Val: E); It != R.MinBWs.end()) {
          // The subvector was computed in a narrowed type; remember its
          // effective type and signedness for the cast cost below.
          EScalarTy =
              IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          // Account for the cast needed to match this node's element type.
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF: E->getVectorFactor()),
              Src: getWidenedType(ScalarTy: EScalarTy, VF: E->getVectorFactor()),
              CCH: TTI::CastContextHint::Normal, CostKind);
        }
        Cost += ::getShuffleCost(
            TTI, Kind: TTI::SK_InsertSubvector,
            Tp: getWidenedType(ScalarTy, VF: CommonMask.size()), Mask: {}, CostKind, Index: Idx,
            SubTp: getWidenedType(ScalarTy, VF: E->getVectorFactor()));
        if (!CommonMask.empty()) {
          // Mark inserted lanes as identity in the common mask.
          std::iota(first: std::next(x: CommonMask.begin(), n: Idx),
                    last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()),
                    value: Idx);
        }
      }
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
      } else {
        // Compose the external mask on top of the common mask.
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(RHS&: NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    // Emit the final shuffle of the remaining one or two inputs.
    return Cost +
           createShuffle(P1: InVectors.front(),
                         P2: InVectors.size() == 2 ? InVectors.back() : nullptr,
                         Mask: CommonMask);
  }
15051
  /// Verifies that finalize() was called whenever any mask state was built up.
  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
15056};
15057
// Returns the tree entry recorded for operand \p Idx of node \p E. The entry
// is required to exist (at() asserts/throws on a missing key) and must match
// the operand's scalars.
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at(Val: {E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}
15064
// Maps a tree entry's vectorization state to the cast-context hint used for
// TTI cast cost queries: scatter/strided loads -> GatherScatter, compressed
// loads -> Masked, reversed vectorized loads -> Reversed, plain vectorized
// loads -> Normal, everything else -> None.
TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
      !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    // Check whether the reorder amounts to a pure reversal of the loaded
    // elements.
    SmallVector<int> Mask;
    inversePermutation(Indices: TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts: Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
15082
15083InstructionCost
15084BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
15085 SmallPtrSetImpl<Value *> &CheckedExtracts) {
15086 ArrayRef<Value *> VL = E->Scalars;
15087
15088 Type *ScalarTy = getValueType(V: VL[0]);
15089 if (!isValidElementType(Ty: ScalarTy))
15090 return InstructionCost::getInvalid();
15091 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15092
15093 // If we have computed a smaller type for the expression, update VecTy so
15094 // that the costs will be accurate.
15095 auto It = MinBWs.find(Val: E);
15096 Type *OrigScalarTy = ScalarTy;
15097 if (It != MinBWs.end()) {
15098 auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy);
15099 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
15100 if (VecTy)
15101 ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements());
15102 } else if (E->Idx == 0 && isReducedBitcastRoot()) {
15103 const TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
15104 ScalarTy = cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy();
15105 }
15106 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
15107 unsigned EntryVF = E->getVectorFactor();
15108 auto *FinalVecTy = getWidenedType(ScalarTy, VF: EntryVF);
15109
15110 if (E->isGather() || TransformedToGatherNodes.contains(Val: E)) {
15111 if (allConstant(VL))
15112 return 0;
15113 if (isa<InsertElementInst>(Val: VL[0]))
15114 return InstructionCost::getInvalid();
15115 if (isa<CmpInst>(Val: VL.front()))
15116 ScalarTy = VL.front()->getType();
15117 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
15118 E, ScalarTy, Params&: *TTI, Params&: VectorizedVals, Params&: *this, Params&: CheckedExtracts);
15119 }
15120 if (E->State == TreeEntry::SplitVectorize) {
15121 assert(E->CombinedEntriesWithIndices.size() == 2 &&
15122 "Expected exactly 2 combined entries.");
15123 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
15124 InstructionCost VectorCost = 0;
15125 if (E->ReorderIndices.empty()) {
15126 VectorCost = ::getShuffleCost(
15127 TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: FinalVecTy, Mask: {}, CostKind,
15128 Index: E->CombinedEntriesWithIndices.back().second,
15129 SubTp: getWidenedType(
15130 ScalarTy,
15131 VF: VectorizableTree[E->CombinedEntriesWithIndices.back().first]
15132 ->getVectorFactor()));
15133 } else {
15134 unsigned CommonVF =
15135 std::max(a: VectorizableTree[E->CombinedEntriesWithIndices.front().first]
15136 ->getVectorFactor(),
15137 b: VectorizableTree[E->CombinedEntriesWithIndices.back().first]
15138 ->getVectorFactor());
15139 VectorCost = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc,
15140 Tp: getWidenedType(ScalarTy, VF: CommonVF),
15141 Mask: E->getSplitMask(), CostKind);
15142 }
15143 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
15144 return VectorCost;
15145 }
15146 InstructionCost CommonCost = 0;
15147 SmallVector<int> Mask;
15148 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
15149 (E->State != TreeEntry::StridedVectorize ||
15150 !isReverseOrder(Order: E->ReorderIndices))) {
15151 SmallVector<int> NewMask;
15152 if (E->getOpcode() == Instruction::Store) {
15153 // For stores the order is actually a mask.
15154 NewMask.resize(N: E->ReorderIndices.size());
15155 copy(Range: E->ReorderIndices, Out: NewMask.begin());
15156 } else {
15157 inversePermutation(Indices: E->ReorderIndices, Mask&: NewMask);
15158 }
15159 ::addMask(Mask, SubMask: NewMask);
15160 }
15161 if (!E->ReuseShuffleIndices.empty())
15162 ::addMask(Mask, SubMask: E->ReuseShuffleIndices);
15163 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
15164 CommonCost =
15165 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
15166 assert((E->State == TreeEntry::Vectorize ||
15167 E->State == TreeEntry::ScatterVectorize ||
15168 E->State == TreeEntry::StridedVectorize ||
15169 E->State == TreeEntry::CompressVectorize) &&
15170 "Unhandled state");
15171 assert(E->getOpcode() &&
15172 ((allSameType(VL) && allSameBlock(VL)) ||
15173 (E->getOpcode() == Instruction::GetElementPtr &&
15174 E->getMainOp()->getType()->isPointerTy()) ||
15175 E->hasCopyableElements()) &&
15176 "Invalid VL");
15177 Instruction *VL0 = E->getMainOp();
15178 unsigned ShuffleOrOp =
15179 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15180 if (E->CombinedOp != TreeEntry::NotCombinedOp)
15181 ShuffleOrOp = E->CombinedOp;
15182 SmallSetVector<Value *, 16> UniqueValues;
15183 SmallVector<unsigned, 16> UniqueIndexes;
15184 for (auto [Idx, V] : enumerate(First&: VL))
15185 if (UniqueValues.insert(X: V))
15186 UniqueIndexes.push_back(Elt: Idx);
15187 const unsigned Sz = UniqueValues.size();
15188 SmallBitVector UsedScalars(Sz, false);
15189 for (unsigned I = 0; I < Sz; ++I) {
15190 if (isa<Instruction>(Val: UniqueValues[I]) &&
15191 !E->isCopyableElement(V: UniqueValues[I]) &&
15192 getTreeEntries(V: UniqueValues[I]).front() == E)
15193 continue;
15194 UsedScalars.set(I);
15195 }
15196 auto GetCastContextHint = [&](Value *V) {
15197 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
15198 return getCastContextHint(TE: *OpTEs.front());
15199 InstructionsState SrcState = getSameOpcode(VL: E->getOperand(OpIdx: 0), TLI: *TLI);
15200 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
15201 !SrcState.isAltShuffle())
15202 return TTI::CastContextHint::GatherScatter;
15203 return TTI::CastContextHint::None;
15204 };
15205 auto GetCostDiff =
15206 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
15207 function_ref<InstructionCost(InstructionCost)> VectorCost) {
15208 // Calculate the cost of this instruction.
15209 InstructionCost ScalarCost = 0;
15210 if (isa<CastInst, CallInst>(Val: VL0)) {
15211 // For some of the instructions no need to calculate cost for each
15212 // particular instruction, we can use the cost of the single
15213 // instruction x total number of scalar instructions.
15214 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
15215 } else {
15216 for (unsigned I = 0; I < Sz; ++I) {
15217 if (UsedScalars.test(Idx: I))
15218 continue;
15219 ScalarCost += ScalarEltCost(I);
15220 }
15221 }
15222
15223 InstructionCost VecCost = VectorCost(CommonCost);
15224 // Check if the current node must be resized, if the parent node is not
15225 // resized.
15226 if (It != MinBWs.end() && !UnaryInstruction::isCast(Opcode: E->getOpcode()) &&
15227 E->Idx != 0 &&
15228 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
15229 const EdgeInfo &EI = E->UserTreeIndex;
15230 if (!EI.UserTE->hasState() ||
15231 EI.UserTE->getOpcode() != Instruction::Select ||
15232 EI.EdgeIdx != 0) {
15233 auto UserBWIt = MinBWs.find(Val: EI.UserTE);
15234 Type *UserScalarTy =
15235 (EI.UserTE->isGather() ||
15236 EI.UserTE->State == TreeEntry::SplitVectorize)
15237 ? EI.UserTE->Scalars.front()->getType()
15238 : EI.UserTE->getOperand(OpIdx: EI.EdgeIdx).front()->getType();
15239 if (UserBWIt != MinBWs.end())
15240 UserScalarTy = IntegerType::get(C&: ScalarTy->getContext(),
15241 NumBits: UserBWIt->second.first);
15242 if (ScalarTy != UserScalarTy) {
15243 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
15244 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: UserScalarTy);
15245 unsigned VecOpcode;
15246 auto *UserVecTy = getWidenedType(ScalarTy: UserScalarTy, VF: E->Scalars.size());
15247 if (BWSz > SrcBWSz)
15248 VecOpcode = Instruction::Trunc;
15249 else
15250 VecOpcode =
15251 It->second.second ? Instruction::SExt : Instruction::ZExt;
15252 TTI::CastContextHint CCH = GetCastContextHint(VL0);
15253 VecCost += TTI->getCastInstrCost(Opcode: VecOpcode, Dst: UserVecTy, Src: VecTy, CCH,
15254 CostKind);
15255 }
15256 }
15257 }
15258 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
15259 ScalarCost, "Calculated costs for Tree"));
15260 return VecCost - ScalarCost;
15261 };
15262 // Calculate cost difference from vectorizing set of GEPs.
15263 // Negative value means vectorizing is profitable.
15264 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
15265 assert((E->State == TreeEntry::Vectorize ||
15266 E->State == TreeEntry::StridedVectorize ||
15267 E->State == TreeEntry::CompressVectorize) &&
15268 "Entry state expected to be Vectorize, StridedVectorize or "
15269 "MaskedLoadCompressVectorize here.");
15270 InstructionCost ScalarCost = 0;
15271 InstructionCost VecCost = 0;
15272 std::tie(args&: ScalarCost, args&: VecCost) = getGEPCosts(
15273 TTI: *TTI, Ptrs, BasePtr, Opcode: E->getOpcode(), CostKind, ScalarTy: OrigScalarTy, VecTy);
15274 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
15275 "Calculated GEPs cost for Tree"));
15276
15277 return VecCost - ScalarCost;
15278 };
15279
15280 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
15281 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: VI ? VI : VL);
15282 if (MinMaxID == Intrinsic::not_intrinsic)
15283 return InstructionCost::getInvalid();
15284 Type *CanonicalType = Ty;
15285 if (CanonicalType->isPtrOrPtrVectorTy())
15286 CanonicalType = CanonicalType->getWithNewType(EltTy: IntegerType::get(
15287 C&: CanonicalType->getContext(),
15288 NumBits: DL->getTypeSizeInBits(Ty: CanonicalType->getScalarType())));
15289
15290 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
15291 {CanonicalType, CanonicalType});
15292 InstructionCost IntrinsicCost =
15293 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
15294 // If the selects are the only uses of the compares, they will be
15295 // dead and we can adjust the cost by removing their cost.
15296 if (VI && SelectOnly) {
15297 assert((!Ty->isVectorTy() || SLPReVec) &&
15298 "Expected only for scalar type.");
15299 auto *CI = cast<CmpInst>(Val: VI->getOperand(i: 0));
15300 IntrinsicCost -= TTI->getCmpSelInstrCost(
15301 Opcode: CI->getOpcode(), ValTy: Ty, CondTy: Builder.getInt1Ty(), VecPred: CI->getPredicate(),
15302 CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
15303 Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CI);
15304 }
15305 return IntrinsicCost;
15306 };
15307 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
15308 Instruction *VI) {
15309 InstructionCost Cost = canConvertToFMA(VL: VI, S, DT&: *DT, DL: *DL, TTI, TLI: *TLI);
15310 return Cost;
15311 };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Vector PHIs are free; only account for extra scalar cost coming from
    // operand entries that are reused via shuffles.
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(Val: V);
      if (!PHI)
        continue;

      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        Value *Op = PHI->getIncomingValue(i: I);
        Operands[I] = Op;
      }
      // Charge TCC_Basic per extra (reused) lane of the operand entry; each
      // operand entry is counted only once via CountedOps.
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(V: Operands.front(), VL: Operands))
        if (CountedOps.insert(Ptr: OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }

    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    // Extract lanes are collected into DemandedElts and charged in bulk as
    // scalarization overhead (credited against the vector cost below).
    APInt DemandedElts;
    VectorType *SrcVecTy = nullptr;
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);

      auto *I = cast<Instruction>(Val: UniqueValues[Idx]);
      // Lazily derive the source vector type from the first real scalar:
      // either the extractelement's vector operand or a widened type sized
      // by the aggregate's element count for extractvalue.
      if (!SrcVecTy) {
        if (ShuffleOrOp == Instruction::ExtractElement) {
          auto *EE = cast<ExtractElementInst>(Val: I);
          SrcVecTy = EE->getVectorOperandType();
        } else {
          auto *EV = cast<ExtractValueInst>(Val: I);
          Type *AggregateTy = EV->getAggregateOperand()->getType();
          unsigned NumElts;
          if (auto *ATy = dyn_cast<ArrayType>(Val: AggregateTy))
            NumElts = ATy->getNumElements();
          else
            NumElts = AggregateTy->getStructNumElements();
          SrcVecTy = getWidenedType(ScalarTy: OrigScalarTy, VF: NumElts);
        }
      }
      if (I->hasOneUse()) {
        Instruction *Ext = I->user_back();
        if ((isa<SExtInst>(Val: Ext) || isa<ZExtInst>(Val: Ext)) &&
            all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
          // Use getExtractWithExtendCost() to calculate the cost of
          // extractelement/ext pair.
          InstructionCost Cost = TTI->getExtractWithExtendCost(
              Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: SrcVecTy, Index: *getExtractIndex(E: I),
              CostKind);
          // Subtract the cost of s|zext which is subtracted separately.
          Cost -= TTI->getCastInstrCost(
              Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: I->getType(),
              CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
          return Cost;
        }
      }
      // Record this lane for the bulk scalarization-overhead credit.
      if (DemandedElts.isZero())
        DemandedElts = APInt::getZero(numBits: getNumElements(Ty: SrcVecTy));
      DemandedElts.setBit(*getExtractIndex(E: I));
      return InstructionCost(TTI::TCC_Free);
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      return CommonCost - (DemandedElts.isZero()
                               ? TTI::TCC_Free
                               : TTI.getScalarizationOverhead(
                                     Ty: SrcVecTy, DemandedElts, /*Insert=*/false,
                                     /*Extract=*/true, CostKind));
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(Val: VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();

    unsigned NumOfParts = ::getNumberOfParts(TTI: *TTI, VecTy: SrcVecTy);

    // Compute the range [OffsetBeg, OffsetEnd] of destination lanes and an
    // InsertMask mapping each destination lane to its source scalar (+1; 0
    // means "not written by this bundle").
    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getElementIndex(Inst: VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(First: VL.drop_front())) {
      unsigned Idx = *getElementIndex(Inst: V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    // Estimate the per-register chunk size used to place the subvector.
    unsigned VecScalarsSz = PowerOf2Ceil(A: NumElts);
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil(A: (NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        a: PowerOf2Ceil(A: OffsetEnd - OffsetBeg + 1),
        b: ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If it is not possible, just
    // generate a whole-sized vector and shuffle the source vector and the new
    // subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // Align OffsetBeg to generate correct mask.
      OffsetBeg = alignDown(Value: OffsetBeg, Align: VecSz, Skew: Offset);
      InsertVecSz = VecSz;
    }

    APInt DemandedElts = APInt::getZero(numBits: NumElts);
    // TODO: Add support for Instruction::InsertValue.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(Indices: E->ReorderIndices, Mask);
      Mask.append(NumInputs: InsertVecSz - Mask.size(), Elt: PoisonMaskElem);
    } else {
      Mask.assign(NumElts: VecSz, Elt: PoisonMaskElem);
      std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: InsertVecSz), value: 0);
    }
    // Rebuild Mask relative to OffsetBeg and record the demanded lanes;
    // IsIdentity tracks whether the scalars already land in order.
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(RHS&: PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(Inst: VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Credit: the scalar insertelements are replaced by building the
    // demanded lanes of the vector directly.
    InstructionCost Cost = 0;
    Cost -=
        getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: SrcVecTy, DemandedElts,
                                 /*Insert*/ true, /*Extract*/ false, CostKind);

    // First cost - resize to actual vector size if not identity shuffle or
    // need to shift the vector.
    // Do not calculate the cost if the actual size is the register size and
    // we can merge this shuffle with the following SK_Select.
    auto *InsertVecTy = getWidenedType(ScalarTy, VF: InsertVecSz);
    if (!IsIdentity)
      Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc,
                               Tp: InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: [E](Value *V) {
      return !is_contained(Range: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
    }));
    // Second cost - permutation with subvector, if some elements are from the
    // initial vector or inserting a subvector.
    // TODO: Implement the analysis of the FirstInsert->getOperand(0)
    // subvector of ActualVecTy.
    SmallBitVector InMask =
        isUndefVector(V: FirstInsert->getOperand(i: 0),
                      UseMask: buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        auto *ActualVecTy = getWidenedType(ScalarTy, VF: VecSz);
        Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: ActualVecTy, Mask: {},
                                 CostKind, Index: OffsetBeg - Offset, SubTp: InsertVecTy);
      } else {
        // Build a two-source permute: lanes outside [OffsetBeg, OffsetEnd]
        // come from the original vector, lanes inside come from the new
        // subvector (offset by VecSz).
        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
          Mask[I] = InMask.test(Idx: I) ? PoisonMaskElem : I;
        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
             I <= End; ++I)
          if (Mask[I] != PoisonMaskElem)
            Mask[I] = I + VecSz;
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          Mask[I] =
              ((I >= InMask.size()) || InMask.test(Idx: I)) ? PoisonMaskElem : I;
        Cost +=
            ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: InsertVecTy, Mask);
      }
    }
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // When minimum-bitwidth analysis (MinBWs) demoted this entry or its
    // operand entry, the vectorized cast opcode may differ from the scalar
    // one (e.g. a zext may become a trunc, bitcast, or disappear).
    auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
    Type *SrcScalarTy = VL0->getOperand(i: 0)->getType();
    auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        unsigned SrcScalarTyNumElements = getNumElements(Ty: SrcScalarTy);
        SrcScalarTy = IntegerType::get(C&: F->getContext(), NumBits: SrcBWSz);
        SrcVecTy =
            getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size() * SrcScalarTyNumElements);
      }
      // Pick the vector opcode from the relative demoted bitwidths.
      unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      // Source was demoted as unsigned, so the int-to-fp cast becomes UIToFP.
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, Dst: VL0->getType(),
                                   Src: VL0->getOperand(i: 0)->getType(),
                                   CCH: TTI::getCastContextHint(I: VL0), CostKind, I: VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count cost here if minimum bitwidth is in effect and it is just
      // a bitcast (here it is just a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(i: 0));

      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(Range: *UserIgnoreList, P: [](Value *V) {
            auto *I = cast<Instruction>(Val: V);
            return is_contained(Set: {Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                Element: I->getOpcode());
          });
      // An extend at the root of an arithmetic reduction is expected to fold
      // into the reduction, so it is not charged here.
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(Opcode: VecOpcode, Dst: VecTy, Src: SrcVecTy, CCH, CostKind,
                                   I: VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    // Derive the common predicate (and its swapped form) from VL0; if VL0 is
    // not a cmp or select-of-cmp, fall back to a "bad" sentinel predicate.
    CmpPredicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(Pred&: VecPred, L: m_Value(), R: m_Value());
    if (match(V: VL0, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) ||
        match(V: VL0, P: MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(pred: VecPred);
    else
      SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);

      if (!isa<SelectInst>(Val: UniqueValues[Idx]))
        return TTI->getInstructionCost(U: cast<Instruction>(Val: UniqueValues[Idx]),
                                       CostKind);

      auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
      CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
      auto MatchCmp = m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value());
      // If this lane's predicate disagrees with the common (or swapped)
      // predicate, demote the bundle-wide predicate to the "bad" sentinel.
      if ((!match(V: VI, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) &&
           !match(V: VI, P: MatchCmp)) ||
          (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
           CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
        VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
                                       ? CmpInst::BAD_FCMP_PREDICATE
                                       : CmpInst::BAD_ICMP_PREDICATE;

      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          Opcode: E->getOpcode(), ValTy: OrigScalarTy, CondTy: Builder.getInt1Ty(), VecPred: CurrentPred,
          CostKind, Op1Info: getOperandInfo(Ops: VI->getOperand(i: 0)),
          Op2Info: getOperandInfo(Ops: VI->getOperand(i: 1)), I: VI);
      // Prefer the min/max intrinsic cost when the cmp+select converts.
      InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
      if (IntrinsicCost.isValid())
        ScalarCost = IntrinsicCost;

      return ScalarCost;
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());

      InstructionCost VecCost =
          TTI->getCmpSelInstrCost(Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred,
                                  CostKind, Op1Info: getOperandInfo(Ops: E->getOperand(OpIdx: 0)),
                                  Op2Info: getOperandInfo(Ops: E->getOperand(OpIdx: 1)), I: VL0);
      if (auto *SI = dyn_cast<SelectInst>(Val: VL0)) {
        auto *CondType =
            getWidenedType(ScalarTy: SI->getCondition()->getType(), VF: VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(Ty: VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // When the return type is i1 but the source is fixed vector type, we
          // need to duplicate the condition value.
          VecCost += ::getShuffleCost(
              TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: CondType,
              Mask: createReplicatedMask(ReplicationFactor: VecTyNumElements / CondNumElements,
                                   VF: CondNumElements));
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    // Combined node: both scalar and vector sides are costed as the min/max
    // intrinsic form (scalar on OrigScalarTy, vector on VecTy).
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    // Combined node: fmul + fadd/fsub pairs costed as the fmuladd intrinsic.
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      return GetFMulAddCost(E->getOperations(),
                            cast<Instruction>(Val: UniqueValues[Idx]));
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      // Intersect fast-math flags across all scalars and their fmul
      // operands so the vector intrinsic is costed with the common flags.
      FastMathFlags FMF;
      FMF.set();
      for (Value *V : E->Scalars) {
        if (auto *FPCI = dyn_cast<FPMathOperator>(Val: V)) {
          FMF &= FPCI->getFastMathFlags();
          if (auto *FPCIOp = dyn_cast<FPMathOperator>(Val: FPCI->getOperand(i: 0)))
            FMF &= FPCIOp->getFastMathFlags();
        }
      }
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::ReducedBitcast:
  case TreeEntry::ReducedBitcastBSwap: {
    // Combined node: shl(zext(...)) chains replaced by a vector-to-integer
    // bitcast (plus a bswap for the byte-reversed variant).
    auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *Shl = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
      if (!Shl)
        return InstructionCost(TTI::TCC_Free);
      // Scalar side: the shl plus its feeding zext (when present).
      InstructionCost ScalarCost = TTI.getInstructionCost(U: Shl, CostKind);
      auto *ZExt = dyn_cast<Instruction>(Val: Shl->getOperand(i: 0));
      if (!ZExt)
        return ScalarCost;
      ScalarCost += TTI.getInstructionCost(U: ZExt, CostKind);
      return ScalarCost;
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
      TTI::CastContextHint CastCtx =
          getCastContextHint(TE: *getOperandEntry(E: LhsTE, /*Idx=*/0));
      Type *SrcScalarTy = cast<ZExtInst>(Val: LhsTE->getMainOp())->getSrcTy();
      auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: LhsTE->getVectorFactor());
      InstructionCost BitcastCost = TTI.getCastInstrCost(
          Opcode: Instruction::BitCast, Dst: ScalarTy, Src: SrcVecTy, CCH: CastCtx, CostKind);
      if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
        // Byte-reversed variant also needs a bswap on the packed integer,
        // plus a widening zext if the packed type is narrower than ScalarTy.
        auto *SrcType = IntegerType::getIntNTy(
            C&: ScalarTy->getContext(),
            N: DL->getTypeSizeInBits(Ty: SrcScalarTy) * EntryVF);
        IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
        InstructionCost IntrinsicCost =
            TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
        BitcastCost += IntrinsicCost;
        if (SrcType != ScalarTy) {
          BitcastCost +=
              TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: SrcType,
                                   CCH: TTI::CastContextHint::None, CostKind);
        }
      }
      return BitcastCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::ReducedBitcastLoads:
  case TreeEntry::ReducedBitcastBSwapLoads: {
    // Same as ReducedBitcast(BSwap), but the scalars load their inputs, so
    // the vector side is costed as one wide integer load instead of a cast.
    auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *Shl = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
      if (!Shl)
        return InstructionCost(TTI::TCC_Free);
      // Scalar side: shl + zext + load chain (whichever parts are present).
      InstructionCost ScalarCost = TTI.getInstructionCost(U: Shl, CostKind);
      auto *ZExt = dyn_cast<Instruction>(Val: Shl->getOperand(i: 0));
      if (!ZExt)
        return ScalarCost;
      ScalarCost += TTI.getInstructionCost(U: ZExt, CostKind);
      auto *Load = dyn_cast<Instruction>(Val: ZExt->getOperand(i: 0));
      if (!Load)
        return ScalarCost;
      ScalarCost += TTI.getInstructionCost(U: Load, CostKind);
      return ScalarCost;
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
      const TreeEntry *LoadTE = getOperandEntry(E: LhsTE, /*Idx=*/0);
      auto *LI0 = cast<LoadInst>(Val: LoadTE->getMainOp());
      auto *SrcType = IntegerType::getIntNTy(
          C&: ScalarTy->getContext(),
          N: DL->getTypeSizeInBits(Ty: LI0->getType()) * EntryVF);
      InstructionCost LoadCost =
          TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: SrcType, Alignment: LI0->getAlign(),
                              AddressSpace: LI0->getPointerAddressSpace(), CostKind);
      if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
        IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
        InstructionCost IntrinsicCost =
            TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
        LoadCost += IntrinsicCost;
        if (SrcType != ScalarTy) {
          LoadCost +=
              TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: SrcType,
                                   CCH: TTI::CastContextHint::None, CostKind);
        }
      }
      return LoadCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::ReducedCmpBitcast: {
    // Combined node: a bundle of selects replaced by bitcasting the vector
    // cmp result mask to an integer (plus a zext if narrower than ScalarTy).
    auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *Sel = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
      if (!Sel)
        return InstructionCost(TTI::TCC_Free);
      InstructionCost ScalarCost = TTI.getInstructionCost(U: Sel, CostKind);
      return ScalarCost;
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      Type *CmpTy = CmpInst::makeCmpResultType(opnd_type: VecTy);
      auto *DstTy =
          IntegerType::getIntNTy(C&: ScalarTy->getContext(), N: E->getVectorFactor());
      InstructionCost BitcastCost =
          TTI.getCastInstrCost(Opcode: Instruction::BitCast, Dst: DstTy, Src: CmpTy,
                               CCH: TTI::CastContextHint::None, CostKind);
      if (DstTy != ScalarTy) {
        BitcastCost +=
            TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: DstTy,
                                 CCH: TTI::CastContextHint::None, CostKind);
      }
      return BitcastCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);

      // We cannot retrieve the operand from UniqueValues[Idx] because an
      // interchangeable instruction may be used. The order and the actual
      // operand might differ from what is retrieved from UniqueValues[Idx].
      unsigned Lane = UniqueIndexes[Idx];
      Value *Op1 = E->getOperand(OpIdx: 0)[Lane];
      Value *Op2;
      SmallVector<const Value *, 2> Operands(1, Op1);
      if (isa<UnaryOperator>(Val: UniqueValues[Idx])) {
        Op2 = Op1;
      } else {
        Op2 = E->getOperand(OpIdx: 1)[Lane];
        Operands.push_back(Elt: Op2);
      }
      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(V: Op1);
      TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(V: Op2);
      InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
          Opcode: ShuffleOrOp, Ty: OrigScalarTy, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, Args: Operands);
      // An fadd/fsub fed by an fmul may fold into fmuladd; use that cost
      // when the conversion is possible (valid result).
      if (auto *I = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
          I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
        InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
        if (IntrinsicCost.isValid())
          ScalarCost = IntrinsicCost;
      }
      return ScalarCost;
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // An 'and' with a constant that keeps all demoted bits (enough
      // trailing ones) is a no-op after minimum-bitwidth demotion.
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
          ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
          if (all_of(Range&: Ops, P: [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Val: Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
      }
      unsigned OpIdx = isa<UnaryOperator>(Val: VL0) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
      TTI::OperandValueInfo Op2Info = getOperandInfo(Ops: E->getOperand(OpIdx));
      return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: VecTy, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info, Args: {}, CxtI: nullptr, TLibInfo: TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    // GEP bundles are costed purely as the scalar-vs-vector GEP difference.
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    // Scalar side: one scalar load per unique value.
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(Val: UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: OrigScalarTy,
                                  Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
                                  CostKind, OpdInfo: TTI::OperandValueInfo(), I: VI);
    };
    auto *LI0 = cast<LoadInst>(Val: VL0);
    // Vector side depends on how this entry will be materialized:
    // consecutive/interleaved load, strided load, compressed (masked)
    // load, or masked gather.
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Opcode: Instruction::Load, VecTy, Factor, Indices: {}, Alignment: LI0->getAlign(),
              AddressSpace: LI0->getPointerAddressSpace(), CostKind);

        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Opcode: Instruction::Load, Src: VecTy, Alignment: LI0->getAlign(),
              AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
        }
        break;
      case TreeEntry::StridedVectorize: {
        // Strided load via vp.strided.load on the recorded strided type,
        // plus a bitcast back to VecTy if the types differ.
        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(Val: E);
        FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
        assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
        VecLdCost = TTI->getMemIntrinsicInstrCost(
            MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
                                       StridedLoadTy, LI0->getPointerOperand(),
                                       /*VariableMask=*/false, CommonAlignment),
            CostKind);
        if (StridedLoadTy != VecTy)
          VecLdCost +=
              TTI->getCastInstrCost(Opcode: Instruction::BitCast, Dst: VecTy, Src: StridedLoadTy,
                                    CCH: getCastContextHint(TE: *E), CostKind);

        break;
      }
      case TreeEntry::CompressVectorize: {
        // Re-run the masked-load-compress analysis (on reorder-adjusted
        // scalars) to get the load type, mask, and interleave factor, and
        // cache the result for codegen in CompressEntryToData.
        bool IsMasked;
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        VectorType *LoadVecTy;
        SmallVector<Value *> Scalars(VL);
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
          reorderScalars(Scalars, Mask);
        }
        SmallVector<Value *> PointerOps(Scalars.size());
        for (auto [I, V] : enumerate(First&: Scalars))
          PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
            VL: Scalars, PointerOps, Order: E->ReorderIndices, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT,
            TLI: *TLI, AreAllUsersVectorized: [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(Key: E, Args&: CompressMask, Args&: LoadVecTy,
                                        Args&: InterleaveFactor, Args&: IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Opcode: Instruction::Load, VecTy: LoadVecTy, Factor: InterleaveFactor, Indices: {},
              Alignment: CommonAlignment, AddressSpace: LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMemIntrinsicInstrCost(
              MICA: MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
                                         CommonAlignment,
                                         LI0->getPointerAddressSpace()),
              CostKind);
          // TODO: include this cost into CommonCost.
          VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
                                        Tp: LoadVecTy, Mask: CompressMask, CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
              AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
          // TODO: include this cost into CommonCost.
          VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
                                        Tp: LoadVecTy, Mask: CompressMask, CostKind);
        }
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
        VecLdCost = TTI->getMemIntrinsicInstrCost(
            MICA: MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
                                       LI0->getPointerOperand(),
                                       /*VariableMask=*/false, CommonAlignment),
            CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };

    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates masked gather load then it is not a terminal node.
    // Hence address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;

    // Estimate cost of GEPs since this tree node is a terminator.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(First&: VL))
      PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    // Scalar side: one scalar store per lane.
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(Val: VL[Idx]);
      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: VI->getValueOperand());
      return TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: OrigScalarTy,
                                  Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
                                  CostKind, OpdInfo: OpInfo, I: VI);
    };
    // The base store: the first lane in (reorder-adjusted) memory order.
    auto *BaseSI =
        cast<StoreInst>(Val: IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(VL: UniqueValues.getArrayRef());
        VecStCost = TTI->getMemIntrinsicInstrCost(
            MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
                                       VecTy, BaseSI->getPointerOperand(),
                                       /*VariableMask=*/false, CommonAlignment),
            CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
          // Interleaved store cost already covers the shuffle: drop the
          // shared shuffle cost so it is not counted twice.
          CommonCost = 0;
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Opcode: Instruction::Store, VecTy, Factor, Indices: {}, Alignment: BaseSI->getAlign(),
              AddressSpace: BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          TTI::OperandValueInfo OpInfo = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
          VecStCost = TTI->getMemoryOpCost(
              Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
              AddressSpace: BaseSI->getPointerAddressSpace(), CostKind, OpdInfo: OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    // Collect pointer operands in memory order for the GEP cost estimate.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(First&: VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(Val: V)->getPointerOperand();
    }

    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    // Scalar side: intrinsic cost when the call maps to a vectorizable
    // intrinsic, otherwise a plain call cost.
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(Val: UniqueValues[Idx]);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (ID != Intrinsic::not_intrinsic) {
        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
        return TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
      }
      return TTI->getCallInstrCost(F: CI->getCalledFunction(),
                                   RetTy: CI->getFunctionType()->getReturnType(),
                                   Tys: CI->getFunctionType()->params(), CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(Val: VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VF: VecTy->getNumElements(),
          MinBW: It != MinBWs.end() ? It->second.first : 0, TTI);
      // Take the cheaper of the vector-intrinsic and vector-library costs.
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(a: VecCallCosts.first, b: VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
16052 case Instruction::ShuffleVector: {
16053 if (!SLPReVec || E->isAltShuffle())
16054 assert(E->isAltShuffle() &&
16055 ((Instruction::isBinaryOp(E->getOpcode()) &&
16056 Instruction::isBinaryOp(E->getAltOpcode())) ||
16057 (Instruction::isCast(E->getOpcode()) &&
16058 Instruction::isCast(E->getAltOpcode())) ||
16059 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16060 "Invalid Shuffle Vector Operand");
16061 // Try to find the previous shuffle node with the same operands and same
16062 // main/alternate ops.
16063 auto TryFindNodeWithEqualOperands = [=]() {
16064 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16065 if (TE.get() == E)
16066 break;
16067 if (TE->hasState() && TE->isAltShuffle() &&
16068 ((TE->getOpcode() == E->getOpcode() &&
16069 TE->getAltOpcode() == E->getAltOpcode()) ||
16070 (TE->getOpcode() == E->getAltOpcode() &&
16071 TE->getAltOpcode() == E->getOpcode())) &&
16072 TE->hasEqualOperands(TE: *E))
16073 return true;
16074 }
16075 return false;
16076 };
16077 auto GetScalarCost = [&](unsigned Idx) {
16078 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
16079 return InstructionCost(TTI::TCC_Free);
16080
16081 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
16082 assert(E->getMatchingMainOpOrAltOp(VI) &&
16083 "Unexpected main/alternate opcode");
16084 (void)E;
16085 return TTI->getInstructionCost(U: VI, CostKind);
16086 };
16087 // Need to clear CommonCost since the final shuffle cost is included into
16088 // vector cost.
16089 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
16090 // VecCost is equal to sum of the cost of creating 2 vectors
16091 // and the cost of creating shuffle.
16092 InstructionCost VecCost = 0;
16093 if (TryFindNodeWithEqualOperands()) {
16094 LLVM_DEBUG({
16095 dbgs() << "SLP: diamond match for alternate node found.\n";
16096 E->dump();
16097 });
16098 // No need to add new vector costs here since we're going to reuse
16099 // same main/alternate vector ops, just do different shuffling.
16100 } else if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
16101 VecCost =
16102 TTIRef.getArithmeticInstrCost(Opcode: E->getOpcode(), Ty: VecTy, CostKind);
16103 VecCost +=
16104 TTIRef.getArithmeticInstrCost(Opcode: E->getAltOpcode(), Ty: VecTy, CostKind);
16105 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
16106 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
16107 VecCost = TTIRef.getCmpSelInstrCost(
16108 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred: CI0->getPredicate(), CostKind,
16109 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
16110 I: VL0);
16111 VecCost += TTIRef.getCmpSelInstrCost(
16112 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy,
16113 VecPred: cast<CmpInst>(Val: E->getAltOp())->getPredicate(), CostKind,
16114 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
16115 I: E->getAltOp());
16116 } else {
16117 Type *SrcSclTy = E->getMainOp()->getOperand(i: 0)->getType();
16118 auto *SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
16119 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
16120 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
16121 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
16122 unsigned SrcBWSz =
16123 DL->getTypeSizeInBits(Ty: E->getMainOp()->getOperand(i: 0)->getType());
16124 if (SrcIt != MinBWs.end()) {
16125 SrcBWSz = SrcIt->second.first;
16126 SrcSclTy = IntegerType::get(C&: SrcSclTy->getContext(), NumBits: SrcBWSz);
16127 SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
16128 }
16129 if (BWSz <= SrcBWSz) {
16130 if (BWSz < SrcBWSz)
16131 VecCost =
16132 TTIRef.getCastInstrCost(Opcode: Instruction::Trunc, Dst: VecTy, Src: SrcTy,
16133 CCH: TTI::CastContextHint::None, CostKind);
16134 LLVM_DEBUG({
16135 dbgs()
16136 << "SLP: alternate extension, which should be truncated.\n";
16137 E->dump();
16138 });
16139 return VecCost;
16140 }
16141 }
16142 VecCost = TTIRef.getCastInstrCost(Opcode: E->getOpcode(), Dst: VecTy, Src: SrcTy,
16143 CCH: TTI::CastContextHint::None, CostKind);
16144 VecCost +=
16145 TTIRef.getCastInstrCost(Opcode: E->getAltOpcode(), Dst: VecTy, Src: SrcTy,
16146 CCH: TTI::CastContextHint::None, CostKind);
16147 }
16148 SmallVector<int> Mask;
16149 E->buildAltOpShuffleMask(
16150 IsAltOp: [&](Instruction *I) {
16151 assert(E->getMatchingMainOpOrAltOp(I) &&
16152 "Unexpected main/alternate opcode");
16153 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
16154 TLI: *TLI);
16155 },
16156 Mask);
16157 VecCost += ::getShuffleCost(TTI: TTIRef, Kind: TargetTransformInfo::SK_PermuteTwoSrc,
16158 Tp: FinalVecTy, Mask, CostKind);
16159 // Patterns like [fadd,fsub] can be combined into a single instruction
16160 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
16161 // need to take into account their order when looking for the most used
16162 // order.
16163 unsigned Opcode0 = E->getOpcode();
16164 unsigned Opcode1 = E->getAltOpcode();
16165 SmallBitVector OpcodeMask(
16166 getAltInstrMask(VL: E->Scalars, ScalarTy, Opcode0, Opcode1));
16167 // If this pattern is supported by the target then we consider the
16168 // order.
16169 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
16170 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
16171 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
16172 return AltVecCost < VecCost ? AltVecCost : VecCost;
16173 }
16174 // TODO: Check the reverse order too.
16175 return VecCost;
16176 };
16177 if (SLPReVec && !E->isAltShuffle())
16178 return GetCostDiff(
16179 GetScalarCost, [&](InstructionCost) -> InstructionCost {
16180 // If a group uses mask in order, the shufflevector can be
16181 // eliminated by instcombine. Then the cost is 0.
16182 assert(isa<ShuffleVectorInst>(VL.front()) &&
16183 "Not supported shufflevector usage.");
16184 auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
16185 unsigned SVNumElements =
16186 cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())
16187 ->getNumElements();
16188 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
16189 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
16190 ArrayRef<Value *> Group = VL.slice(N: I, M: GroupSize);
16191 int NextIndex = 0;
16192 if (!all_of(Range&: Group, P: [&](Value *V) {
16193 assert(isa<ShuffleVectorInst>(V) &&
16194 "Not supported shufflevector usage.");
16195 auto *SV = cast<ShuffleVectorInst>(Val: V);
16196 int Index;
16197 [[maybe_unused]] bool IsExtractSubvectorMask =
16198 SV->isExtractSubvectorMask(Index);
16199 assert(IsExtractSubvectorMask &&
16200 "Not supported shufflevector usage.");
16201 if (NextIndex != Index)
16202 return false;
16203 NextIndex += SV->getShuffleMask().size();
16204 return true;
16205 }))
16206 return ::getShuffleCost(
16207 TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: VecTy,
16208 Mask: calculateShufflevectorMask(VL: E->Scalars));
16209 }
16210 return TTI::TCC_Free;
16211 });
16212 return GetCostDiff(GetScalarCost, GetVectorCost);
16213 }
16214 case Instruction::Freeze:
16215 return CommonCost;
16216 default:
16217 llvm_unreachable("Unknown instruction");
16218 }
16219}
16220
/// Returns true if the whole (tiny, height 1 or 2) tree is profitable to
/// vectorize without further cost analysis. \p ForReduction relaxes the
/// check for trees that feed a horizontal reduction.
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");

  // Returns true if \p TE is a gather node that is cheap to materialize:
  // no ephemeral values, and either all-constant/splat, small (< Limit),
  // a shuffle of extractelements from fixed vectors, a non-alternate load
  // bundle, or it at least contains loads.
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(Range: TE->Scalars,
                   P: [this](Value *V) { return EphValues.contains(Ptr: V); }) &&
           (allConstant(VL: TE->Scalars) || isSplat(VL: TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(Range: TE->Scalars, P: IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(VL: TE->Scalars, Mask, AC)) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(Range: TE->Scalars, P: IsaPred<LoadInst>));
  };

  // We only handle trees of heights 1 and 2.
  // Height 1: a single vectorized node (or, for reductions, a cheap gather
  // with vector factor > 2).
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather nodes if they have less scalar operands rather than
  // the initial tree element (may be profitable to shuffle the second gather)
  // or they are extractelements, which form shuffle.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  // A gathered root, or a gathered second node under a plain (non-scatter,
  // non-strided, non-compress) vectorized root, kills profitability.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
16274
/// Returns true if the built tree is too small / unprofitable to vectorize.
/// This is a sequence of early-out heuristics; each one recognizes a known
/// unprofitable tree shape and bails out before the full cost model runs.
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // Debug counter allows bisecting which graphs get vectorized.
  if (!DebugCounter::shouldExecute(Counter&: VectorizedGraphs))
    return true;

  // Graph is empty - do nothing.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");

    return true;
  }

  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VL: VectorizableTree[1]->Scalars) ||
         allConstant(VL: VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization, we can skip it, if the cost threshold is
  // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
  // gathers/buildvectors.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

  // Do not vectorize small tree of phis only, if all vector phis are also
  // gathered.
  if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
      VectorizableTree.size() <= Limit &&
      all_of(Range: VectorizableTree,
             P: [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                       count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <=
                           Limit) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::InsertElement ||
                        (TE->getOpcode() == Instruction::PHI &&
                         all_of(Range&: TE->Scalars, P: [&](Value *V) {
                           return isa<PoisonValue>(Val: V) || MustGather.contains(Ptr: V);
                         }))));
             }) &&
      any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // If the tree contains only phis, buildvectors, split nodes and
  // small nodes with reuses, we can skip it.
  // Load/store nodes are collected separately: they may still justify
  // vectorization unless heavily outnumbered by the rest of the tree.
  SmallVector<const TreeEntry *> StoreLoadNodes;
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      all_of(Range: VectorizableTree,
             P: [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 StoreLoadNodes.push_back(Elt: TE.get());
                 return true;
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      (TE->isGather() &&
                       none_of(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>)) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::PHI ||
                        (TE->hasCopyableElements() &&
                         static_cast<unsigned>(count_if(
                             Range&: TE->Scalars, P: IsaPred<PHINode, Constant>)) >=
                             TE->Scalars.size() / 2) ||
                        ((!TE->ReuseShuffleIndices.empty() ||
                          !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
                         TE->Scalars.size() == 2)));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 || none_of(Range&: StoreLoadNodes, P: [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(Range: TE->Scalars, P: [&](Value *V) {
                    return !isa<LoadInst>(Val: V) ||
                           areAllUsersVectorized(I: cast<Instruction>(Val: V));
                  });
         })))))
    return true;

  // If the tree contains only buildvector, 2 non-buildvectors (with root user
  // tree node) and other buildvectors, we can skip it.
  if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
      VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      count_if(Range: ArrayRef(VectorizableTree).drop_front(),
               P: [&](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                        TE->UserTreeIndex.UserTE->Idx == 0;
               }) == 2)
    return true;

  // If the tree contains only vectorization of the phi node from the
  // buildvector - skip it.
  if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
      VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(
          Range: ArrayRef(VectorizableTree).drop_front(N: 2),
          P: [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
    return true;

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather node forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VL: VectorizableTree.front()->Scalars));
  if (any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(Range&: TE->Scalars, P: [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(Val: V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(N: UsesLimit) &&
                         any_of(Range: V->users(), P: IsaPred<InsertElementInst>));
               });
      }))
    return false;

  // A trailing alternate-opcode gather may still be worth keeping if the
  // scalarization (insertion) overhead alone already beats the threshold.
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VL: VectorizableTree.back()->Scalars) &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          Ty: getWidenedType(ScalarTy: VectorizableTree.back()->Scalars.front()->getType(),
                          VF: VectorizableTree.back()->getVectorFactor()),
          DemandedElts: APInt::getAllOnes(numBits: VectorizableTree.back()->getVectorFactor()),
          /*Insert=*/true, /*Extract=*/false,
          CostKind: TTI::TCK_RecipThroughput) > -SLPCostThreshold)
    return false;

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  // vectorizable.
  return true;
}
16450
16451bool BoUpSLP::isTreeNotExtendable() const {
16452 if (getCanonicalGraphSize() != getTreeSize()) {
16453 constexpr unsigned SmallTree = 3;
16454 if (VectorizableTree.front()->isNonPowOf2Vec() &&
16455 getCanonicalGraphSize() <= SmallTree &&
16456 count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()),
16457 P: [](const std::unique_ptr<TreeEntry> &TE) {
16458 return TE->isGather() && TE->hasState() &&
16459 TE->getOpcode() == Instruction::Load &&
16460 !allSameBlock(VL: TE->Scalars);
16461 }) == 1)
16462 return true;
16463 return false;
16464 }
16465 bool Res = false;
16466 for (unsigned Idx : seq<unsigned>(Size: getTreeSize())) {
16467 TreeEntry &E = *VectorizableTree[Idx];
16468 if (E.State == TreeEntry::SplitVectorize)
16469 return false;
16470 if (!E.isGather())
16471 continue;
16472 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
16473 (!E.hasState() &&
16474 all_of(Range&: E.Scalars, P: IsaPred<ExtractElementInst, LoadInst>)) ||
16475 (isa<ExtractElementInst>(Val: E.Scalars.front()) &&
16476 getSameOpcode(VL: ArrayRef(E.Scalars).drop_front(), TLI: *TLI).valid()))
16477 return false;
16478 if (isSplat(VL: E.Scalars) || allConstant(VL: E.Scalars))
16479 continue;
16480 Res = true;
16481 }
16482 return Res;
16483}
16484
/// Estimates the register-spill cost implied by keeping vector values live
/// across non-vectorized calls between tree entries and their operands.
InstructionCost BoUpSLP::getSpillCost() {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree,
  // query TTI to see if there is a cost to keeping values live over it
  // (for example, if spills and fills are required).

  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return 0;

  InstructionCost Cost = 0;
  // Maps a user entry to the list of its operand entries.
  SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
      EntriesToOperands;
  // Maps each vectorized entry to the last instruction in its bundle.
  SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
  // The set of all such last instructions, for fast membership tests.
  SmallPtrSet<const Instruction *, 8> LastInstructions;
  // Combined-op entries that stay scalar / are pseudo nodes; they carry no
  // vector live value of their own and are skipped during cost accounting.
  SmallPtrSet<const TreeEntry *, 8> ScalarOrPseudoEntries;
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
        TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
        TEPtr->CombinedOp == TreeEntry::ReducedBitcastLoads ||
        TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
        TEPtr->CombinedOp == TreeEntry::ReducedCmpBitcast) {
      ScalarOrPseudoEntries.insert(Ptr: TEPtr.get());
      continue;
    }
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(E: TEPtr.get());
      EntriesToLastInstruction.try_emplace(Key: TEPtr.get(), Args&: LastInst);
      LastInstructions.insert(Ptr: LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(Elt: TEPtr.get());
  }

  // Returns true if \p I is an intrinsic that will be lowered inline (no
  // actual call), so it cannot force a spill. Assume-like intrinsics never
  // lower to calls; otherwise compare intrinsic cost vs. call cost.
  auto NoCallIntrinsic = [this](const Instruction *I) {
    const auto *II = dyn_cast<IntrinsicInst>(Val: I);
    if (!II)
      return false;
    if (II->isAssumeLikeIntrinsic())
      return true;
    IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
    InstructionCost IntrCost =
        TTI->getIntrinsicInstrCost(ICA, CostKind: TTI::TCK_RecipThroughput);
    InstructionCost CallCost = TTI->getCallInstrCost(
        F: nullptr, RetTy: II->getType(), Tys: ICA.getArgTypes(), CostKind: TTI::TCK_RecipThroughput);
    return IntrCost < CallCost;
  };

  // Maps last instruction in the entry to the last instruction for the one of
  // operand entries and the flag. If the flag is true, there are no calls in
  // between these instructions.
  SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
      CheckedInstructions;
  // Shared scan budget: bounds the total number of instructions inspected so
  // the analysis stays linear-ish even for huge blocks.
  unsigned Budget = 0;
  const unsigned BudgetLimit =
      ScheduleRegionSizeBudget / VectorizableTree.size();
  // Scans backwards from \p Last (exclusive of \p First) within one basic
  // block; returns true if no non-vectorized call was found within budget.
  // Results are memoized in CheckedInstructions.
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Val: Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      if (Checked == First || Checked->comesBefore(Other: First))
        return It->second.getInt() != 0;
      // Only the tail [Checked, Last] is known; keep scanning from Checked.
      Last = Checked;
    } else if (Last == First || Last->comesBefore(Other: First)) {
      // Empty range - trivially call-free.
      return true;
    }
    BasicBlock::const_reverse_iterator InstIt =
                                           ++First->getIterator().getReverse(),
                                       PrevInstIt =
                                           Last->getIterator().getReverse();
    // Bundle-end instructions encountered while scanning; they get memoized
    // once the scan result is known.
    SmallVector<const Instruction *> LastInstsInRange;
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // Debug information does not impact spill cost.
      // Vectorized calls, represented as vector intrinsics, do not impact spill
      // cost.
      if (const auto *CB = dyn_cast<CallBase>(Val: &*PrevInstIt);
          CB && !NoCallIntrinsic(CB) && !isVectorized(V: CB)) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(Key: LastInst, Args: &*PrevInstIt, Args: 0);
        return false;
      }
      if (LastInstructions.contains(Ptr: &*PrevInstIt))
        LastInstsInRange.push_back(Elt: &*PrevInstIt);

      ++PrevInstIt;
      ++Budget;
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          Key: LastInst, Args: PrevInstIt == InstIt ? First : &*PrevInstIt,
          Args: Budget <= BudgetLimit ? 1 : 0);
    // Running out of budget conservatively counts as "call found".
    return Budget <= BudgetLimit;
  };
  // Charges the spill/fill cost for keeping \p Op's vector value live across
  // a call. For revec (vector-of-vectors) scalars, subtract the cost the
  // now-dead scalar vectors would have paid anyway.
  auto AddCosts = [&](const TreeEntry *Op) {
    if (ScalarOrPseudoEntries.contains(Ptr: Op))
      return;
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Val: Op);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
    auto *VecTy = getWidenedType(ScalarTy, VF: Op->getVectorFactor());
    Cost += TTI->getCostOfKeepingLiveOverCall(Tys: VecTy);
    if (ScalarTy->isVectorTy()) {
      // Handle revec dead vector instructions.
      Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(Tys: ScalarTy);
    }
  };
  // Memoize the relationship between blocks, i.e. if there is (at least one)
  // non-vectorized call between the blocks. This allows to skip the analysis of
  // the same block paths multiple times.
  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
      ParentOpParentToPreds;
  // Walks predecessors from \p Root (or from the specific \p Pred for PHIs)
  // up to \p OpParent; returns true if every path is free of non-vectorized
  // calls (within budget).
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(x&: Root, y&: OpParent);
    if (auto It = ParentOpParentToPreds.find(Val: Key);
        It != ParentOpParentToPreds.end())
      return It->second;
    SmallVector<BasicBlock *> Worklist;
    if (Pred)
      Worklist.push_back(Elt: Pred);
    else
      Worklist.append(in_start: pred_begin(BB: Root), in_end: pred_end(BB: Root));
    SmallPtrSet<const BasicBlock *, 16> Visited;
    SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
        ParentsPairsToAdd;
    bool Res = false;
    // On every exit path, record the final result for all block pairs seen
    // during this walk.
    llvm::scope_exit Cleanup([&]() {
      for (const auto &KeyPair : ParentsPairsToAdd) {
        assert(!ParentOpParentToPreds.contains(KeyPair) &&
               "Should not have been added before.");
        ParentOpParentToPreds.try_emplace(Key: KeyPair, Args&: Res);
      }
    });
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      if (BB == OpParent || !Visited.insert(Ptr: BB).second)
        continue;
      auto Pair = std::make_pair(x&: BB, y&: OpParent);
      if (auto It = ParentOpParentToPreds.find(Val: Pair);
          It != ParentOpParentToPreds.end()) {
        Res = It->second;
        return Res;
      }
      ParentsPairsToAdd.insert(V: Pair);
      unsigned BlockSize = BB->size();
      // Huge blocks (or exhausted budget) conservatively fail the walk.
      if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
        return Res;
      Budget += BlockSize;
      if (Budget > BudgetLimit)
        return Res;
      if (!isa<CatchSwitchInst>(Val: BB->getTerminator()) &&
          !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
                                          BB->getTerminator()))
        return Res;
      Worklist.append(in_start: pred_begin(BB), in_end: pred_end(BB));
    }
    Res = true;
    return Res;
  };
  SmallVector<const TreeEntry *> LiveEntries(1, Root);
  // For a scalar/pseudo entry, climbs the user chain to the nearest real
  // vectorized ancestor (or nullptr if the whole chain is scalar/pseudo).
  auto FindNonScalarParentEntry = [&](const TreeEntry *E) -> const TreeEntry * {
    assert(ScalarOrPseudoEntries.contains(E) &&
           "Expected scalar or pseudo entry.");
    const TreeEntry *Entry = E;
    while (Entry->UserTreeIndex) {
      Entry = Entry->UserTreeIndex.UserTE;
      if (!ScalarOrPseudoEntries.contains(Ptr: Entry))
        return Entry;
    }
    return nullptr;
  };
  // Top-down worklist walk over user->operand edges; charge each operand that
  // may be live across a non-vectorized call on any path to its user.
  while (!LiveEntries.empty()) {
    const TreeEntry *Entry = LiveEntries.pop_back_val();
    SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Val: Entry);
    if (Operands.empty())
      continue;
    if (ScalarOrPseudoEntries.contains(Ptr: Entry)) {
      Entry = FindNonScalarParentEntry(Entry);
      if (!Entry) {
        // No vectorized ancestor - just keep traversing the operands.
        for (const TreeEntry *Op : Operands) {
          if (!Op->isGather())
            LiveEntries.push_back(Elt: Op);
        }
        continue;
      }
    }
    Instruction *LastInst = EntriesToLastInstruction.at(Val: Entry);
    BasicBlock *Parent = LastInst->getParent();
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
        LiveEntries.push_back(Elt: Op);
      if (ScalarOrPseudoEntries.contains(Ptr: Op))
        continue;
      // Split nodes, gathers under non-PHI users and all-constant gathers
      // never carry a live vector value across calls.
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          (Op->isGather() && allConstant(VL: Op->Scalars)))
        continue;
      // Reset the scan budget per user/operand pair.
      Budget = 0;
      BasicBlock *Pred = nullptr;
      if (auto *Phi = dyn_cast<PHINode>(Val: Entry->getMainOp()))
        Pred = Phi->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx);
      BasicBlock *OpParent;
      Instruction *OpLastInst;
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        // For a gathered PHI operand, anchor at the incoming block's
        // terminator, or at the first vectorized scalar if one exists.
        OpParent = cast<PHINode>(Val: Entry->getMainOp())
                       ->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx);
        OpLastInst = OpParent->getTerminator();
        for (Value *V : Op->Scalars) {
          auto *Inst = dyn_cast<Instruction>(Val: V);
          if (!Inst)
            continue;
          if (isVectorized(V)) {
            OpParent = Inst->getParent();
            OpLastInst = Inst;
            break;
          }
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Val: Op);
        OpParent = OpLastInst->getParent();
      }
      // Check the call instructions within the same basic blocks.
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          // PHI users: the value is live from LastInst to the operand.
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            AddCosts(Op);
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          AddCosts(Op);
        continue;
      }
      // Check for call instruction in between blocks.
      // 1. Check entry's block to the head.
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*Parent->getFirstNonPHIOrDbgOrAlloca(), LastInst)) {
        AddCosts(Op);
        continue;
      }
      // 2. Check op's block from the end.
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                          OpParent->getTerminator())) {
        AddCosts(Op);
        continue;
      }
      // 3. Check the predecessors of entry's block till op's block.
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        AddCosts(Op);
        continue;
      }
    }
  }

  return Cost;
}
16747
/// Checks if the \p IE1 instruction is followed by \p IE2 instruction in the
/// buildvector sequence, i.e. returns true if \p IE1 comes first. Both
/// instructions must belong to the same insertelement chain; walks the two
/// chains towards the chain base simultaneously until one meets the other's
/// starting point.
static bool isFirstInsertElement(const InsertElementInst *IE1,
                                 const InsertElementInst *IE2) {
  if (IE1 == IE2)
    return false;
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  unsigned Idx1 = *getElementIndex(Inst: IE1);
  unsigned Idx2 = *getElementIndex(Inst: IE2);
  do {
    // IE2's chain reached IE1: IE1 precedes IE2 in the buildvector.
    if (I2 == IE1)
      return true;
    // IE1's chain reached IE2: IE2 precedes IE1.
    if (I1 == IE2)
      return false;
    PrevI1 = I1;
    PrevI2 = I2;
    // Step I1 down its chain, but stop on a multi-use insert (chain head of
    // another buildvector) or on an insert into the other chain's index.
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        getElementIndex(Inst: I1).value_or(u&: Idx2) != Idx2)
      I1 = dyn_cast<InsertElementInst>(Val: I1->getOperand(i_nocapture: 0));
    // Same stepping rule for I2, symmetrically.
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        getElementIndex(Inst: I2).value_or(u&: Idx1) != Idx1)
      I2 = dyn_cast<InsertElementInst>(Val: I2->getOperand(i_nocapture: 0));
    // Continue while at least one chain still made progress.
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  llvm_unreachable("Two different buildvectors not expected.");
}
16776
16777namespace {
16778/// Returns incoming Value *, if the requested type is Value * too, or a default
16779/// value, otherwise.
16780struct ValueSelect {
16781 template <typename U>
16782 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
16783 return V;
16784 }
16785 template <typename U>
16786 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
16787 return U();
16788 }
16789};
16790} // namespace
16791
/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps.
/// 1. If the Base vector is not undef vector, resizing the very first mask to
/// have common VF and perform action for 2 input vectors (including non-undef
/// Base). Other shuffle masks are combined with the resulting after the 1 stage
/// and processed as a shuffle of 2 elements.
/// 2. If the Base is undef vector and have only 1 shuffle mask, perform the
/// action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combing the masks properly between the steps.
///
/// \param ShuffleMask list of (vector, mask) pairs to combine.
/// \param Base the initial vector the shuffled elements are inserted into;
///        may be (partially) undef/poison.
/// \param GetVF returns the vector factor of the given vector.
/// \param ResizeAction resizes the given vector to the common VF; the bool in
///        the result reports that the accompanying mask is an identity mask.
/// \param Action performs the actual 1- or 2-source shuffle.
/// \returns the result of the last performed action.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(VF: Mask.size(), Mask, MaskArg: UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(V: Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    SmallBitVector IsBasePoison = isUndefVector<true>(V: Base, UseMask);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        // Lane comes from Base (the first shuffle source); keep it poison if
        // the corresponding Base lane is poison.
        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
      else
        // Lane comes from the (possibly identity-resized) first vector, which
        // is the second shuffle source, hence the +VF bias.
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    // nullptr as the first operand stands for the Base vector itself.
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only for
    // single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
    // shuffles step by step, combining shuffle between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size, we
      // can shuffle them directly.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          // Bias second-vector lanes by the first vector's VF.
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          // Lane comes from the first (resized) vector; rewrite to identity
          // index if the resize already produced an identity mask.
          assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          if (Res1.second)
            Mask[I] = I;
        } else if (SecMask[I] != PoisonMaskElem) {
          // Lane comes from the second (resized) vector - bias by VF.
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        // Keep the lane already produced by the previous combining step.
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
16899
/// Computes the total vectorization cost of the current tree and then
/// repeatedly trims subtrees whose aggregated cost exceeds the cost of simply
/// gathering (re-building) their scalars: trimmed nodes are recorded in
/// TransformedToGatherNodes / DeletedNodes. Returns the final (possibly
/// reduced) tree cost, or an invalid cost when the trimmed tree degenerates
/// into a plain buildvector (to avoid endless re-vectorization attempts).
InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
    ArrayRef<Value *> VectorizedVals) {
  // Per-node vectorization costs, filled during the initial cost walk.
  SmallDenseMap<const TreeEntry *, InstructionCost> NodesCosts;
  SmallPtrSet<Value *, 4> CheckedExtracts;
  // Root nodes of subtrees built for the final gathered-loads vectorization
  // attempt (nodes with Idx > 0 and no user).
  SmallSetVector<TreeEntry *, 4> GatheredLoadsNodes;
  // Estimated cost of extracting externally used scalars, per node.
  SmallDenseMap<const TreeEntry *, InstructionCost> ExtractCosts;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  // Returns true if scalar V of vector node TE is used outside of the tree,
  // i.e. an extractelement would be required for it.
  auto IsExternallyUsed = [&](const TreeEntry &TE, Value *V) {
    assert(TE.hasState() && !TE.isGather() &&
           TE.State != TreeEntry::SplitVectorize && "Expected vector node.");
    if (V->hasOneUse() || V->getType()->isVoidTy())
      return false;
    if (TE.hasCopyableElements() && TE.isCopyableElement(V))
      return false;
    // More users than there can be in-tree scalars - definitely external.
    const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
    if (V->hasNUsesOrMore(N: NumVectScalars))
      return true;
    auto *I = dyn_cast<Instruction>(Val: V);
    // Check if any user is used outside of the tree.
    return I && any_of(Range: I->users(), P: [&](const User *U) {
             // store/insertelt v, [cast]U will likely be vectorized.
             if (match(V: U, P: m_InsertElt(Val: m_Value(),
                                        Elt: m_OneUse(SubPattern: m_CastOrSelf(Op: m_Specific(V: I))),
                                        Idx: m_ConstantInt())))
               return false;
             if (match(V: U,
                       P: m_InsertElt(Val: m_Value(), Elt: m_Specific(V: I), Idx: m_ConstantInt())))
               return false;
             if (match(V: U, P: m_Store(ValueOp: m_OneUse(SubPattern: m_CastOrSelf(Op: m_Specific(V: I))),
                                   PointerOp: m_Value())))
               return false;
             if (match(V: U, P: m_Store(ValueOp: m_Specific(V: I), PointerOp: m_Value())))
               return false;
             // A user outside of any tree entry and not gathered - external.
             ArrayRef<TreeEntry *> Entries = getTreeEntries(V: U);
             if (Entries.empty() && !MustGather.contains(Ptr: U))
               return true;
             // Users in deleted (trimmed) nodes count as external too.
             if (any_of(Range&: Entries, P: [&](TreeEntry *TE) {
                   return DeletedNodes.contains(Ptr: TE);
                 }))
               return true;
             return any_of(Range: ValueToGatherNodes.lookup(Val: U),
                           P: [&](const TreeEntry *TE) {
                             return DeletedNodes.contains(Ptr: TE);
                           });
           });
  };
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  // Step 1: accumulate the cost of every node and estimate extract costs for
  // externally used scalars.
  for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
    TreeEntry &TE = *Ptr;
    // No need to count the cost for combined entries, they are combined and
    // just skip their cost.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      NodesCosts.try_emplace(Key: &TE);
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(V: TE.getMainOp(), VL: TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering, need to handle it.
        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
                          << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
                          << "SLP: Current total cost = " << Cost << "\n");
        NodesCosts.try_emplace(Key: &TE);
        continue;
      }
    }

    // Exclude cost of gather loads nodes which are not used. These nodes were
    // built as part of the final attempt to vectorize gathered loads.
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");

    InstructionCost C = getEntryCost(E: &TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    NodesCosts.try_emplace(Key: &TE, Args&: C);
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
                      << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
                      << "SLP: Current total cost = " << Cost << "\n");
    // Add gathered loads nodes to the set for later processing.
    if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
        TE.getOpcode() == Instruction::Load)
      GatheredLoadsNodes.insert(X: &TE);
    if (!TE.isGather() && TE.State != TreeEntry::SplitVectorize &&
        !(TE.Idx == 0 && (TE.getOpcode() == Instruction::InsertElement ||
                          TE.getOpcode() == Instruction::Store))) {
      // Calculate costs of external uses.
      APInt DemandedElts = APInt::getZero(numBits: TE.getVectorFactor());
      for (Value *V : TE.Scalars) {
        if (IsExternallyUsed(TE, V))
          DemandedElts.setBit(TE.findLaneForValue(V));
      }
      if (!DemandedElts.isZero()) {
        // Cost of extracting the externally used lanes from the vector;
        // account MinBWs-narrowed element type if the node was demoted.
        Type *ScalarTy = TE.Scalars.front()->getType();
        auto It = MinBWs.find(Val: &TE);
        if (It != MinBWs.end())
          ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
        auto *VecTy = getWidenedType(ScalarTy, VF: TE.getVectorFactor());
        InstructionCost ExtCost = ::getScalarizationOverhead(
            TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts, /*Insert=*/false,
            /*Extract=*/true, CostKind);
        ExtractCosts.try_emplace(Key: &TE, Args&: ExtCost);
      }
    }
  }
  // Bail out if the cost threshold is negative and cost already below it.
  if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 &&
      Cost < -SLPCostThreshold)
    return Cost;
  // The narrow non-profitable tree in loop? Skip, may cause regressions.
  constexpr unsigned PartLimit = 2;
  const unsigned Sz =
      getVectorElementSize(V: VectorizableTree.front()->Scalars.front());
  const unsigned MinVF = getMinVF(Sz);
  if (Cost >= -SLPCostThreshold &&
      VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
      (!VectorizableTree.front()->hasState() ||
       (VectorizableTree.front()->getOpcode() != Instruction::Store &&
        LI->getLoopFor(BB: VectorizableTree.front()->getMainOp()->getParent()))))
    return Cost;
  // Store the cost + external uses estimation as the first element of the
  // tuple, just the cost as the second element of the tuple. Required to return
  // correct cost estimation for the tree, extracts are calculated separately.
  // Extracts, calculated here, are just quick estimations.
  SmallVector<
      std::tuple<InstructionCost, InstructionCost, SmallVector<unsigned>>>
      SubtreeCosts(VectorizableTree.size());
  // Propagates node TE's costs up the chain of its users so that each entry
  // of SubtreeCosts holds the aggregated cost of its whole subtree; if
  // AddToList is set, TE is also recorded as a child of every user.
  auto UpdateParentNodes =
      [&](const TreeEntry *UserTE, const TreeEntry *TE,
          InstructionCost TotalCost, InstructionCost Cost,
          SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
              &VisitedUser,
          bool AddToList = true) {
        while (UserTE &&
               VisitedUser.insert(V: std::make_pair(x&: TE, y&: UserTE)).second) {
          std::get<0>(t&: SubtreeCosts[UserTE->Idx]) += TotalCost;
          std::get<1>(t&: SubtreeCosts[UserTE->Idx]) += Cost;
          if (AddToList)
            std::get<2>(t&: SubtreeCosts[UserTE->Idx]).push_back(Elt: TE->Idx);
          UserTE = UserTE->UserTreeIndex.UserTE;
        }
      };
  // Step 2: compute aggregated subtree costs for every node.
  for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
    TreeEntry &TE = *Ptr;
    InstructionCost C = NodesCosts.at(Val: &TE);
    InstructionCost ExtractCost = ExtractCosts.lookup(Val: &TE);
    std::get<0>(t&: SubtreeCosts[TE.Idx]) += C + ExtractCost;
    std::get<1>(t&: SubtreeCosts[TE.Idx]) += C;
    if (const TreeEntry *UserTE = TE.UserTreeIndex.UserTE) {
      SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
          VisitedUser;
      UpdateParentNodes(UserTE, &TE, C + ExtractCost, C, VisitedUser);
    }
  }
  SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4> Visited;
  // Attribute gathered-loads subtree costs to the buildvector nodes that
  // consume their scalars (without listing them as children).
  for (TreeEntry *TE : GatheredLoadsNodes) {
    InstructionCost TotalCost = std::get<0>(t&: SubtreeCosts[TE->Idx]);
    InstructionCost Cost = std::get<1>(t&: SubtreeCosts[TE->Idx]);
    for (Value *V : TE->Scalars) {
      for (const TreeEntry *BVTE : ValueToGatherNodes.lookup(Val: V))
        UpdateParentNodes(BVTE, TE, TotalCost, Cost, Visited,
                          /*AddToList=*/false);
    }
  }
  Visited.clear();
  using CostIndicesTy =
      std::pair<TreeEntry *, std::tuple<InstructionCost, InstructionCost,
                                        SmallVector<unsigned>>>;
  // Orders the worklist so the most expensive subtree is on top; ties are
  // broken towards the node with the larger index.
  struct FirstGreater {
    bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
      return std::get<0>(t: LHS.second) < std::get<0>(t: RHS.second) ||
             (std::get<0>(t: LHS.second) == std::get<0>(t: RHS.second) &&
              LHS.first->Idx < RHS.first->Idx);
    }
  };
  PriorityQueue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
      Worklist;
  for (const auto [Idx, P] : enumerate(First&: SubtreeCosts))
    Worklist.emplace(args: VectorizableTree[Idx].get(), args&: P);

  // Narrow store trees with non-profitable immediate values - exit.
  if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
      VectorizableTree.front()->hasState() &&
      VectorizableTree.front()->getOpcode() == Instruction::Store &&
      (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
    return Cost;

  bool Changed = false;
  // Step 3: greedily inspect subtrees from the most expensive one downwards
  // and turn them into gathers when that is cheaper than vectorizing them.
  while (!Worklist.empty() && std::get<0>(t: Worklist.top().second) > 0) {
    TreeEntry *TE = Worklist.top().first;
    if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(Ptr: TE) ||
        // Exit early if the parent node is split node and any of scalars is
        // used in other split nodes.
        (TE->UserTreeIndex &&
         TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
         any_of(Range&: TE->Scalars, P: [&](Value *V) {
           ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
           return Entries.size() > 1;
         }))) {
      Worklist.pop();
      continue;
    }
    // Skip inversed compare nodes, they cannot be transformed to buildvectors.
    if (TE->State == TreeEntry::Vectorize && !TE->isAltShuffle() &&
        (TE->getOpcode() == Instruction::ICmp ||
         TE->getOpcode() == Instruction::FCmp) &&
        any_of(Range&: TE->Scalars, P: [&](Value *V) {
          auto *I = dyn_cast<CmpInst>(Val: V);
          if (!I)
            return false;
          return I->getPredicate() !=
                 cast<CmpInst>(Val: TE->getMainOp())->getPredicate();
        })) {
      Worklist.pop();
      continue;
    }

    // Calculate the gather cost of the root node.
    InstructionCost TotalSubtreeCost = std::get<0>(t: Worklist.top().second);
    InstructionCost SubtreeCost = std::get<1>(t: Worklist.top().second);
    if (TotalSubtreeCost < TE->Scalars.size()) {
      Worklist.pop();
      continue;
    }
    // Re-adjust the subtree cost for children that were already turned into
    // gathers on previous iterations.
    if (!TransformedToGatherNodes.empty()) {
      for (unsigned Idx : std::get<2>(t: Worklist.top().second)) {
        auto It = TransformedToGatherNodes.find(Val: VectorizableTree[Idx].get());
        if (It != TransformedToGatherNodes.end()) {
          TotalSubtreeCost -= std::get<0>(t&: SubtreeCosts[Idx]);
          SubtreeCost -= std::get<1>(t&: SubtreeCosts[Idx]);
          TotalSubtreeCost += It->second;
          SubtreeCost += It->second;
        }
      }
    }
    if (TotalSubtreeCost < 0 || TotalSubtreeCost < TE->Scalars.size()) {
      Worklist.pop();
      continue;
    }
    const unsigned Sz = TE->Scalars.size();
    // Only non-constant lanes need an actual insert in the buildvector.
    APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
    for (auto [Idx, V] : enumerate(First&: TE->Scalars)) {
      if (isConstant(V))
        DemandedElts.clearBit(BitPosition: Idx);
    }

    Type *ScalarTy = getValueType(V: TE->Scalars.front());
    auto It = MinBWs.find(Val: TE);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
    // Compares keep their original (i1) result type regardless of MinBWs.
    if (isa<CmpInst>(Val: TE->Scalars.front()))
      ScalarTy = TE->Scalars.front()->getType();
    auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
    const unsigned EntryVF = TE->getVectorFactor();
    auto *FinalVecTy = getWidenedType(ScalarTy, VF: EntryVF);
    InstructionCost GatherCost = ::getScalarizationOverhead(
        TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
        /*Insert=*/true, /*Extract=*/false, CostKind);
    SmallVector<int> Mask;
    // Account a reorder/reuse shuffle that would be needed on top of the
    // plain buildvector to match the node's lane order.
    if (!TE->ReorderIndices.empty() &&
        TE->State != TreeEntry::CompressVectorize &&
        (TE->State != TreeEntry::StridedVectorize ||
         !isReverseOrder(Order: TE->ReorderIndices))) {
      SmallVector<int> NewMask;
      if (TE->getOpcode() == Instruction::Store) {
        // For stores the order is actually a mask.
        NewMask.resize(N: TE->ReorderIndices.size());
        copy(Range&: TE->ReorderIndices, Out: NewMask.begin());
      } else {
        inversePermutation(Indices: TE->ReorderIndices, Mask&: NewMask);
      }
      ::addMask(Mask, SubMask: NewMask);
    }
    if (!TE->ReuseShuffleIndices.empty())
      ::addMask(Mask, SubMask: TE->ReuseShuffleIndices);
    if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: EntryVF))
      GatherCost +=
          ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
    // If all scalars are reused in gather node(s) or other vector nodes, there
    // might be extra cost for inserting them.
    if ((!TE->hasState() || !TE->isAltShuffle()) &&
        all_of(Range&: TE->Scalars, P: [&](Value *V) {
          return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
                 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
        }))
      GatherCost *= 2;
    // Erase subtree if it is non-profitable.
    if (TotalSubtreeCost > GatherCost) {
      // If the remaining tree is just a buildvector - exit, it will cause
      // endless attempts to vectorize.
      if (VectorizableTree.front()->hasState() &&
          VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
          TE->Idx == 1)
        return InstructionCost::getInvalid();

      LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
                        << TE->Idx << " with cost "
                        << std::get<0>(Worklist.top().second)
                        << " and gather cost " << GatherCost << ".\n");
      if (TE->UserTreeIndex) {
        // The node has a user - turn it into a gather (buildvector) node.
        TransformedToGatherNodes.try_emplace(Key: TE, Args&: GatherCost);
        NodesCosts.erase(Val: TE);
      } else {
        // No user - the node (and, below, its children) is deleted outright.
        DeletedNodes.insert(Ptr: TE);
        TransformedToGatherNodes.erase(Val: TE);
        NodesCosts.erase(Val: TE);
      }
      // Every child of the trimmed subtree is deleted.
      for (unsigned Idx : std::get<2>(t: Worklist.top().second)) {
        TreeEntry &ChildTE = *VectorizableTree[Idx];
        DeletedNodes.insert(Ptr: &ChildTE);
        TransformedToGatherNodes.erase(Val: &ChildTE);
        NodesCosts.erase(Val: &ChildTE);
      }
      Changed = true;
    }
    Worklist.pop();
  }
  if (!Changed)
    return std::get<1>(t&: SubtreeCosts.front());

  SmallPtrSet<TreeEntry *, 4> GatheredLoadsToDelete;
  InstructionCost LoadsExtractsCost = 0;
  // Check if all loads of gathered loads nodes are marked for deletion. In this
  // case the whole gathered loads subtree must be deleted.
  // Also, try to account for extracts, which might be required, if only part of
  // gathered load must be vectorized. Keep partially vectorized nodes, if
  // extracts are cheaper than gathers.
  for (TreeEntry *TE : GatheredLoadsNodes) {
    if (DeletedNodes.contains(Ptr: TE) || TransformedToGatherNodes.contains(Val: TE))
      continue;
    GatheredLoadsToDelete.insert(Ptr: TE);
    APInt DemandedElts = APInt::getZero(numBits: TE->getVectorFactor());
    // All loads are removed from gathered? Need to delete the subtree.
    SmallDenseMap<const TreeEntry *, SmallVector<Value *>> ValuesToInsert;
    for (Value *V : TE->Scalars) {
      unsigned Pos = TE->findLaneForValue(V);
      for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
        if (DeletedNodes.contains(Ptr: BVE))
          continue;
        DemandedElts.setBit(Pos);
        ValuesToInsert.try_emplace(Key: BVE).first->second.push_back(Elt: V);
      }
    }
    if (!DemandedElts.isZero()) {
      // Some loads are still consumed by live buildvector nodes - compare the
      // cost of extracting them from the vector load against re-inserting the
      // scalars into those buildvectors.
      Type *ScalarTy = TE->Scalars.front()->getType();
      auto It = MinBWs.find(Val: TE);
      if (It != MinBWs.end())
        ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
      auto *VecTy = getWidenedType(ScalarTy, VF: TE->getVectorFactor());
      InstructionCost ExtractsCost = ::getScalarizationOverhead(
          TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
          /*Insert=*/false, /*Extract=*/true, CostKind);
      InstructionCost BVCost = 0;
      for (const auto &[BVE, Values] : ValuesToInsert) {
        APInt BVDemandedElts = APInt::getZero(numBits: BVE->getVectorFactor());
        SmallVector<Value *> BVValues(BVE->getVectorFactor(),
                                      PoisonValue::get(T: ScalarTy));
        for (Value *V : Values) {
          unsigned Pos = BVE->findLaneForValue(V);
          BVValues[Pos] = V;
          BVDemandedElts.setBit(Pos);
        }
        auto *BVVecTy = getWidenedType(ScalarTy, VF: BVE->getVectorFactor());
        BVCost += ::getScalarizationOverhead(
            TTI: *TTI, ScalarTy, Ty: BVVecTy, DemandedElts: BVDemandedElts,
            /*Insert=*/true, /*Extract=*/false, CostKind,
            ForPoisonSrc: BVDemandedElts.isAllOnes(), VL: BVValues);
      }
      if (ExtractsCost < BVCost) {
        // Extracts are cheaper - keep the vectorized loads node alive.
        LoadsExtractsCost += ExtractsCost;
        GatheredLoadsToDelete.erase(Ptr: TE);
        continue;
      }
      LoadsExtractsCost += BVCost;
    }
    NodesCosts.erase(Val: TE);
  }

  // Deleted all subtrees rooted at gathered loads nodes.
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->UserTreeIndex &&
        GatheredLoadsToDelete.contains(Ptr: TE->UserTreeIndex.UserTE)) {
      DeletedNodes.insert(Ptr: TE.get());
      NodesCosts.erase(Val: TE.get());
      GatheredLoadsToDelete.insert(Ptr: TE.get());
    }
    if (GatheredLoadsToDelete.contains(Ptr: TE.get()))
      DeletedNodes.insert(Ptr: TE.get());
  }

  // Recompute costs for surviving nodes whose cached cost was dropped during
  // trimming (their context changed, e.g. children became gathers).
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(Val: TE.get())) {
      assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
      continue;
    }
    if (DeletedNodes.contains(Ptr: TE.get()))
      continue;
    if (!NodesCosts.contains(Val: TE.get())) {
      InstructionCost C =
          getEntryCost(E: TE.get(), VectorizedVals, CheckedExtracts);
      NodesCosts.try_emplace(Key: TE.get(), Args&: C);
    }
  }

  LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
  InstructionCost NewCost = 0;
  for (const auto &P : NodesCosts) {
    NewCost += P.second;
    // NOTE(review): the debug line prints the pre-trim total Cost, not the
    // accumulating NewCost - possibly intentional as a baseline; confirm.
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
                      << shortBundleName(P.first->Scalars, P.first->Idx)
                      << ".\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }
  if (NewCost + LoadsExtractsCost >= Cost) {
    // Trimming did not pay off - restore the full tree and its cost.
    DeletedNodes.clear();
    TransformedToGatherNodes.clear();
    NewCost = Cost;
  } else {
    // If the remaining tree is just a buildvector - exit, it will cause
    // endless attempts to vectorize.
    if (VectorizableTree.size()>= 2 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
        TransformedToGatherNodes.contains(Val: VectorizableTree[1].get()))
      return InstructionCost::getInvalid();
    if (VectorizableTree.size() >= 3 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
        VectorizableTree[1]->hasState() &&
        VectorizableTree[1]->State == TreeEntry::Vectorize &&
        (VectorizableTree[1]->getOpcode() == Instruction::ZExt ||
         VectorizableTree[1]->getOpcode() == Instruction::SExt ||
         VectorizableTree[1]->getOpcode() == Instruction::Trunc) &&
        TransformedToGatherNodes.contains(Val: VectorizableTree[2].get()))
      return InstructionCost::getInvalid();
  }
  return NewCost;
}
17344
namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  /// Each mask entry maps a destination lane to the source lane in the
  /// corresponding parent vector (key T).
  MapVector<T, SmallVector<int>> ValueMasks;
};
} // namespace
17355
17356InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
17357 ArrayRef<Value *> VectorizedVals,
17358 InstructionCost ReductionCost) {
17359 InstructionCost Cost = TreeCost + ReductionCost;
17360
17361 if (Cost >= -SLPCostThreshold &&
17362 none_of(Range&: ExternalUses, P: [](const ExternalUser &EU) {
17363 return isa_and_nonnull<InsertElementInst>(Val: EU.User);
17364 }))
17365 return Cost;
17366
17367 SmallPtrSet<Value *, 16> ExtractCostCalculated;
17368 InstructionCost ExtractCost = 0;
17369 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
17370 SmallVector<APInt> DemandedElts;
17371 SmallDenseSet<Value *, 4> UsedInserts;
17372 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
17373 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
17374 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
17375 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
17376 // Keep track {Scalar, Index, User} tuple.
17377 // On AArch64, this helps in fusing a mov instruction, associated with
17378 // extractelement, with fmul in the backend so that extractelement is free.
17379 SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
17380 for (ExternalUser &EU : ExternalUses) {
17381 ScalarUserAndIdx.emplace_back(Args&: EU.Scalar, Args&: EU.User, Args&: EU.Lane);
17382 }
17383 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
17384 for (ExternalUser &EU : ExternalUses) {
17385 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
17386 << EU.E.Idx << " in lane " << EU.Lane << "\n");
17387 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
17388 else dbgs() << " User: nullptr\n");
17389 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
17390
17391 // Uses by ephemeral values are free (because the ephemeral value will be
17392 // removed prior to code generation, and so the extraction will be
17393 // removed as well).
17394 if (EphValues.count(Ptr: EU.User))
17395 continue;
17396
17397 // Check if the scalar for the given user or all users is accounted already.
17398 if (!CheckedScalarUser.insert(V: std::make_pair(x&: EU.Scalar, y&: EU.User)).second ||
17399 (EU.User &&
17400 CheckedScalarUser.contains(V: std::make_pair(x&: EU.Scalar, y: nullptr))))
17401 continue;
17402
17403 // Used in unreachable blocks or in EH pads (rarely executed) or is
17404 // terminated with unreachable instruction.
17405 if (BasicBlock *UserParent =
17406 EU.User ? cast<Instruction>(Val: EU.User)->getParent() : nullptr;
17407 UserParent &&
17408 (!DT->isReachableFromEntry(A: UserParent) || UserParent->isEHPad() ||
17409 isa_and_present<UnreachableInst>(Val: UserParent->getTerminator())))
17410 continue;
17411
17412 // We only add extract cost once for the same scalar.
17413 if (!isa_and_nonnull<InsertElementInst>(Val: EU.User) &&
17414 !ExtractCostCalculated.insert(Ptr: EU.Scalar).second)
17415 continue;
17416
17417 // No extract cost for vector "scalar" if REVEC is disabled
17418 if (!SLPReVec && isa<FixedVectorType>(Val: EU.Scalar->getType()))
17419 continue;
17420
17421 // If found user is an insertelement, do not calculate extract cost but try
17422 // to detect it as a final shuffled/identity match.
17423 // TODO: what if a user is insertvalue when REVEC is enabled?
17424 if (auto *VU = dyn_cast_or_null<InsertElementInst>(Val: EU.User);
17425 VU && VU->getOperand(i_nocapture: 1) == EU.Scalar) {
17426 if (auto *FTy = dyn_cast<FixedVectorType>(Val: VU->getType())) {
17427 if (!UsedInserts.insert(V: VU).second)
17428 continue;
17429 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
17430 if (InsertIdx) {
17431 const TreeEntry *ScalarTE = &EU.E;
17432 auto *It = find_if(
17433 Range&: ShuffledInserts,
17434 P: [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
17435 // Checks if 2 insertelements are from the same buildvector.
17436 InsertElementInst *VecInsert = Data.InsertElements.front();
17437 return areTwoInsertFromSameBuildVector(
17438 VU, V: VecInsert, GetBaseOperand: [this](InsertElementInst *II) -> Value * {
17439 Value *Op0 = II->getOperand(i_nocapture: 0);
17440 if (isVectorized(V: II) && !isVectorized(V: Op0))
17441 return nullptr;
17442 return Op0;
17443 });
17444 });
17445 int VecId = -1;
17446 if (It == ShuffledInserts.end()) {
17447 auto &Data = ShuffledInserts.emplace_back();
17448 Data.InsertElements.emplace_back(Args&: VU);
17449 DemandedElts.push_back(Elt: APInt::getZero(numBits: FTy->getNumElements()));
17450 VecId = ShuffledInserts.size() - 1;
17451 auto It = MinBWs.find(Val: ScalarTE);
17452 if (It != MinBWs.end() &&
17453 VectorCasts
17454 .insert(V: std::make_pair(x&: ScalarTE, y: FTy->getElementType()))
17455 .second) {
17456 unsigned BWSz = It->second.first;
17457 unsigned DstBWSz = DL->getTypeSizeInBits(Ty: FTy->getElementType());
17458 unsigned VecOpcode;
17459 if (DstBWSz < BWSz)
17460 VecOpcode = Instruction::Trunc;
17461 else
17462 VecOpcode =
17463 It->second.second ? Instruction::SExt : Instruction::ZExt;
17464 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17465 InstructionCost C = TTI->getCastInstrCost(
17466 Opcode: VecOpcode, Dst: FTy,
17467 Src: getWidenedType(ScalarTy: IntegerType::get(C&: FTy->getContext(), NumBits: BWSz),
17468 VF: FTy->getNumElements()),
17469 CCH: TTI::CastContextHint::None, CostKind);
17470 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17471 << " for extending externally used vector with "
17472 "non-equal minimum bitwidth.\n");
17473 Cost += C;
17474 }
17475 } else {
17476 if (isFirstInsertElement(IE1: VU, IE2: It->InsertElements.front()))
17477 It->InsertElements.front() = VU;
17478 VecId = std::distance(first: ShuffledInserts.begin(), last: It);
17479 }
17480 int InIdx = *InsertIdx;
17481 SmallVectorImpl<int> &Mask =
17482 ShuffledInserts[VecId].ValueMasks[ScalarTE];
17483 if (Mask.empty())
17484 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
17485 Mask[InIdx] = EU.Lane;
17486 DemandedElts[VecId].setBit(InIdx);
17487 continue;
17488 }
17489 }
17490 }
17491
17492 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17493 // If we plan to rewrite the tree in a smaller type, we will need to sign
17494 // extend the extracted value back to the original type. Here, we account
17495 // for the extract and the added cost of the sign extend if needed.
17496 InstructionCost ExtraCost = TTI::TCC_Free;
17497 auto *ScalarTy = EU.Scalar->getType();
17498 const unsigned BundleWidth = EU.E.getVectorFactor();
17499 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
17500 auto *VecTy = getWidenedType(ScalarTy, VF: BundleWidth);
17501 const TreeEntry *Entry = &EU.E;
17502 auto It = MinBWs.find(Val: Entry);
17503 if (It != MinBWs.end()) {
17504 Type *MinTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
17505 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy))
17506 MinTy = getWidenedType(ScalarTy: MinTy, VF: VecTy->getNumElements());
17507 unsigned Extend = isKnownNonNegative(V: EU.Scalar, SQ: SimplifyQuery(*DL))
17508 ? Instruction::ZExt
17509 : Instruction::SExt;
17510 VecTy = getWidenedType(ScalarTy: MinTy, VF: BundleWidth);
17511 ExtraCost =
17512 getExtractWithExtendCost(TTI: *TTI, Opcode: Extend, Dst: ScalarTy, VecTy, Index: EU.Lane);
17513 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
17514 << ExtraCost << "\n");
17515 } else {
17516 ExtraCost =
17517 getVectorInstrCost(TTI: *TTI, ScalarTy, Opcode: Instruction::ExtractElement, Val: VecTy,
17518 CostKind, Index: EU.Lane, Scalar: EU.Scalar, ScalarUserAndIdx);
17519 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
17520 << *VecTy << ": " << ExtraCost << "\n");
17521 }
17522 // Leave the scalar instructions as is if they are cheaper than extracts.
17523 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
17524 Entry->getOpcode() == Instruction::Load) {
17525 // Checks if the user of the external scalar is phi in loop body.
17526 auto IsPhiInLoop = [&](const ExternalUser &U) {
17527 if (auto *Phi = dyn_cast_if_present<PHINode>(Val: U.User)) {
17528 auto *I = cast<Instruction>(Val: U.Scalar);
17529 const Loop *L = LI->getLoopFor(BB: Phi->getParent());
17530 return L && (Phi->getParent() == I->getParent() ||
17531 L == LI->getLoopFor(BB: I->getParent()));
17532 }
17533 return false;
17534 };
17535 if (!ValueToExtUses) {
17536 ValueToExtUses.emplace();
17537 for (const auto &P : enumerate(First&: ExternalUses)) {
17538 // Ignore phis in loops.
17539 if (IsPhiInLoop(P.value()))
17540 continue;
17541
17542 ValueToExtUses->try_emplace(Key: P.value().Scalar, Args: P.index());
17543 }
17544 }
17545 // Can use original instruction, if no operands vectorized or they are
17546 // marked as externally used already.
17547 auto *Inst = cast<Instruction>(Val: EU.Scalar);
17548 InstructionCost ScalarCost = TTI->getInstructionCost(U: Inst, CostKind);
17549 auto OperandIsScalar = [&](Value *V) {
17550 if (!isVectorized(V)) {
17551 // Some extractelements might be not vectorized, but
17552 // transformed into shuffle and removed from the function,
17553 // consider it here.
17554 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V))
17555 return !EE->hasOneUse() || !MustGather.contains(Ptr: EE);
17556 return true;
17557 }
17558 return ValueToExtUses->contains(Val: V);
17559 };
17560 bool CanBeUsedAsScalar = all_of(Range: Inst->operands(), P: OperandIsScalar);
17561 bool CanBeUsedAsScalarCast = false;
17562 if (auto *CI = dyn_cast<CastInst>(Val: Inst); CI && !CanBeUsedAsScalar) {
17563 if (auto *Op = dyn_cast<Instruction>(Val: CI->getOperand(i_nocapture: 0));
17564 Op && all_of(Range: Op->operands(), P: OperandIsScalar)) {
17565 InstructionCost OpCost =
17566 (isVectorized(V: Op) && !ValueToExtUses->contains(Val: Op))
17567 ? TTI->getInstructionCost(U: Op, CostKind)
17568 : 0;
17569 if (ScalarCost + OpCost <= ExtraCost) {
17570 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
17571 ScalarCost += OpCost;
17572 }
17573 }
17574 }
17575 if (CanBeUsedAsScalar) {
17576 bool KeepScalar = ScalarCost <= ExtraCost;
17577 // Try to keep original scalar if the user is the phi node from the same
17578 // block as the root phis, currently vectorized. It allows to keep
17579 // better ordering info of PHIs, being vectorized currently.
17580 bool IsProfitablePHIUser =
17581 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
17582 VectorizableTree.front()->Scalars.size() > 2)) &&
17583 VectorizableTree.front()->hasState() &&
17584 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
17585 !Inst->hasNUsesOrMore(N: UsesLimit) &&
17586 none_of(Range: Inst->users(),
17587 P: [&](User *U) {
17588 auto *PHIUser = dyn_cast<PHINode>(Val: U);
17589 return (!PHIUser ||
17590 PHIUser->getParent() !=
17591 cast<Instruction>(
17592 Val: VectorizableTree.front()->getMainOp())
17593 ->getParent()) &&
17594 !isVectorized(V: U);
17595 }) &&
17596 count_if(Range: Entry->Scalars, P: [&](Value *V) {
17597 return ValueToExtUses->contains(Val: V);
17598 }) <= 2;
17599 if (IsProfitablePHIUser) {
17600 KeepScalar = true;
17601 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
17602 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
17603 (!GatheredLoadsEntriesFirst.has_value() ||
17604 Entry->Idx < *GatheredLoadsEntriesFirst)) {
17605 unsigned ScalarUsesCount = count_if(Range: Entry->Scalars, P: [&](Value *V) {
17606 return ValueToExtUses->contains(Val: V);
17607 });
17608 auto It = ExtractsCount.find(Val: Entry);
17609 if (It != ExtractsCount.end()) {
17610 assert(ScalarUsesCount >= It->getSecond().size() &&
17611 "Expected total number of external uses not less than "
17612 "number of scalar uses.");
17613 ScalarUsesCount -= It->getSecond().size();
17614 }
17615 // Keep original scalar if number of externally used instructions in
17616 // the same entry is not power of 2. It may help to do some extra
17617 // vectorization for now.
17618 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(Value: ScalarUsesCount);
17619 }
17620 if (KeepScalar) {
17621 ExternalUsesAsOriginalScalar.insert(Ptr: EU.Scalar);
17622 for (Value *V : Inst->operands()) {
17623 auto It = ValueToExtUses->find(Val: V);
17624 if (It != ValueToExtUses->end()) {
17625 // Replace all uses to avoid compiler crash.
17626 ExternalUses[It->second].User = nullptr;
17627 }
17628 }
17629 ExtraCost = ScalarCost;
17630 if (!IsPhiInLoop(EU))
17631 ExtractsCount[Entry].insert(V: Inst);
17632 if (CanBeUsedAsScalarCast) {
17633 ScalarOpsFromCasts.insert(Ptr: Inst->getOperand(i: 0));
17634 // Update the users of the operands of the cast operand to avoid
17635 // compiler crash.
17636 if (auto *IOp = dyn_cast<Instruction>(Val: Inst->getOperand(i: 0))) {
17637 for (Value *V : IOp->operands()) {
17638 auto It = ValueToExtUses->find(Val: V);
17639 if (It != ValueToExtUses->end()) {
17640 // Replace all uses to avoid compiler crash.
17641 ExternalUses[It->second].User = nullptr;
17642 }
17643 }
17644 }
17645 }
17646 }
17647 }
17648 }
17649
17650 ExtractCost += ExtraCost;
17651 }
17652 // Insert externals for extract of operands of casts to be emitted as scalars
17653 // instead of extractelement.
17654 for (Value *V : ScalarOpsFromCasts) {
17655 ExternalUsesAsOriginalScalar.insert(Ptr: V);
17656 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
17657 const auto *It = find_if_not(Range&: TEs, P: [&](TreeEntry *TE) {
17658 return TransformedToGatherNodes.contains(Val: TE) ||
17659 DeletedNodes.contains(Ptr: TE);
17660 });
17661 if (It != TEs.end()) {
17662 const TreeEntry *UserTE = *It;
17663 ExternalUses.emplace_back(Args&: V, Args: nullptr, Args: *UserTE,
17664 Args: UserTE->findLaneForValue(V));
17665 }
17666 }
17667 }
17668 // Add reduced value cost, if resized.
17669 if (!VectorizedVals.empty()) {
17670 const TreeEntry &Root = *VectorizableTree.front();
17671 auto BWIt = MinBWs.find(Val: &Root);
17672 if (BWIt != MinBWs.end()) {
17673 Type *DstTy = Root.Scalars.front()->getType();
17674 unsigned OriginalSz = DL->getTypeSizeInBits(Ty: DstTy->getScalarType());
17675 unsigned SrcSz =
17676 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
17677 if (OriginalSz != SrcSz) {
17678 unsigned Opcode = Instruction::Trunc;
17679 if (OriginalSz > SrcSz)
17680 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
17681 Type *SrcTy = IntegerType::get(C&: DstTy->getContext(), NumBits: SrcSz);
17682 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DstTy)) {
17683 assert(SLPReVec && "Only supported by REVEC.");
17684 SrcTy = getWidenedType(ScalarTy: SrcTy, VF: VecTy->getNumElements());
17685 }
17686 Cost += TTI->getCastInstrCost(Opcode, Dst: DstTy, Src: SrcTy,
17687 CCH: TTI::CastContextHint::None,
17688 CostKind: TTI::TCK_RecipThroughput);
17689 }
17690 }
17691 }
17692
17693 // Buildvector with externally used scalars, which should remain as scalars,
17694 // should not be vectorized, the compiler may hang.
17695 if (SLPCostThreshold < 0 && VectorizableTree.size() > 1 &&
17696 isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
17697 VectorizableTree[1]->hasState() &&
17698 VectorizableTree[1]->State == TreeEntry::Vectorize &&
17699 all_of(Range&: VectorizableTree[1]->Scalars, P: [&](Value *V) {
17700 return ExternalUsesAsOriginalScalar.contains(Ptr: V);
17701 }))
17702 return InstructionCost::getInvalid();
17703
17704 Cost += ExtractCost;
17705 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
17706 bool ForSingleMask) {
17707 InstructionCost C = 0;
17708 unsigned VF = Mask.size();
17709 unsigned VecVF = TE->getVectorFactor();
17710 bool HasLargeIndex =
17711 any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); });
17712 if ((VF != VecVF && HasLargeIndex) ||
17713 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) {
17714
17715 if (HasLargeIndex) {
17716 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
17717 std::copy(first: Mask.begin(), last: std::next(x: Mask.begin(), n: std::min(a: VF, b: VecVF)),
17718 result: OrigMask.begin());
17719 C = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
17720 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF),
17721 Mask: OrigMask);
17722 LLVM_DEBUG(
17723 dbgs() << "SLP: Adding cost " << C
17724 << " for final shuffle of insertelement external users.\n";
17725 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17726 Cost += C;
17727 return std::make_pair(x&: TE, y: true);
17728 }
17729
17730 if (!ForSingleMask) {
17731 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
17732 for (unsigned I = 0; I < VF; ++I) {
17733 if (Mask[I] != PoisonMaskElem)
17734 ResizeMask[Mask[I]] = Mask[I];
17735 }
17736 if (!ShuffleVectorInst::isIdentityMask(Mask: ResizeMask, NumSrcElts: VF))
17737 C = ::getShuffleCost(
17738 TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
17739 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF), Mask: ResizeMask);
17740 LLVM_DEBUG(
17741 dbgs() << "SLP: Adding cost " << C
17742 << " for final shuffle of insertelement external users.\n";
17743 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17744
17745 Cost += C;
17746 }
17747 }
17748 return std::make_pair(x&: TE, y: false);
17749 };
17750 // Calculate the cost of the reshuffled vectors, if any.
17751 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
17752 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(i_nocapture: 0);
17753 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
17754 unsigned VF = 0;
17755 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
17756 ArrayRef<const TreeEntry *> TEs) {
17757 assert((TEs.size() == 1 || TEs.size() == 2) &&
17758 "Expected exactly 1 or 2 tree entries.");
17759 if (TEs.size() == 1) {
17760 if (VF == 0)
17761 VF = TEs.front()->getVectorFactor();
17762 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
17763 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF) &&
17764 !all_of(Range: enumerate(First&: Mask), P: [=](const auto &Data) {
17765 return Data.value() == PoisonMaskElem ||
17766 (Data.index() < VF &&
17767 static_cast<int>(Data.index()) == Data.value());
17768 })) {
17769 InstructionCost C =
17770 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FTy, Mask);
17771 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17772 << " for final shuffle of insertelement "
17773 "external users.\n";
17774 TEs.front()->dump();
17775 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17776 Cost += C;
17777 }
17778 } else {
17779 if (VF == 0) {
17780 if (TEs.front() &&
17781 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
17782 VF = TEs.front()->getVectorFactor();
17783 else
17784 VF = Mask.size();
17785 }
17786 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
17787 InstructionCost C =
17788 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: FTy, Mask);
17789 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17790 << " for final shuffle of vector node and external "
17791 "insertelement users.\n";
17792 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
17793 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17794 Cost += C;
17795 }
17796 VF = Mask.size();
17797 return TEs.back();
17798 };
17799 (void)performExtractsShuffleAction<const TreeEntry>(
17800 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()), Base,
17801 GetVF: [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeAction: ResizeToVF,
17802 Action: EstimateShufflesCost);
17803 InstructionCost InsertCost = TTI->getScalarizationOverhead(
17804 Ty: cast<FixedVectorType>(
17805 Val: ShuffledInserts[I].InsertElements.front()->getType()),
17806 DemandedElts: DemandedElts[I],
17807 /*Insert*/ true, /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
17808 Cost -= InsertCost;
17809 }
17810
17811 // Add the cost for reduced value resize (if required).
17812 if (ReductionBitWidth != 0) {
17813 assert(UserIgnoreList && "Expected reduction tree.");
17814 const TreeEntry &E = *VectorizableTree.front();
17815 auto It = MinBWs.find(Val: &E);
17816 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
17817 unsigned SrcSize = It->second.first;
17818 unsigned DstSize = ReductionBitWidth;
17819 unsigned Opcode = Instruction::Trunc;
17820 if (SrcSize < DstSize) {
17821 bool IsArithmeticExtendedReduction =
17822 all_of(Range: *UserIgnoreList, P: [](Value *V) {
17823 auto *I = cast<Instruction>(Val: V);
17824 return is_contained(Set: {Instruction::Add, Instruction::FAdd,
17825 Instruction::Mul, Instruction::FMul,
17826 Instruction::And, Instruction::Or,
17827 Instruction::Xor},
17828 Element: I->getOpcode());
17829 });
17830 if (IsArithmeticExtendedReduction)
17831 Opcode =
17832 Instruction::BitCast; // Handle it by getExtendedReductionCost
17833 else
17834 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
17835 }
17836 if (Opcode != Instruction::BitCast) {
17837 auto *SrcVecTy =
17838 getWidenedType(ScalarTy: Builder.getIntNTy(N: SrcSize), VF: E.getVectorFactor());
17839 auto *DstVecTy =
17840 getWidenedType(ScalarTy: Builder.getIntNTy(N: DstSize), VF: E.getVectorFactor());
17841 TTI::CastContextHint CCH = getCastContextHint(TE: E);
17842 InstructionCost CastCost;
17843 switch (E.getOpcode()) {
17844 case Instruction::SExt:
17845 case Instruction::ZExt:
17846 case Instruction::Trunc: {
17847 const TreeEntry *OpTE = getOperandEntry(E: &E, Idx: 0);
17848 CCH = getCastContextHint(TE: *OpTE);
17849 break;
17850 }
17851 default:
17852 break;
17853 }
17854 CastCost += TTI->getCastInstrCost(Opcode, Dst: DstVecTy, Src: SrcVecTy, CCH,
17855 CostKind: TTI::TCK_RecipThroughput);
17856 Cost += CastCost;
17857 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
17858 << " for final resize for reduction from " << SrcVecTy
17859 << " to " << DstVecTy << "\n";
17860 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17861 }
17862 }
17863 }
17864
17865 std::optional<InstructionCost> SpillCost;
17866 if (Cost < -SLPCostThreshold) {
17867 SpillCost = getSpillCost();
17868 Cost += *SpillCost;
17869 }
17870#ifndef NDEBUG
17871 SmallString<256> Str;
17872 {
17873 raw_svector_ostream OS(Str);
17874 OS << "SLP: Spill Cost = ";
17875 if (SpillCost)
17876 OS << *SpillCost;
17877 else
17878 OS << "<skipped>";
17879 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
17880 << "SLP: Total Cost = " << Cost << ".\n";
17881 }
17882 LLVM_DEBUG(dbgs() << Str);
17883 if (ViewSLPTree)
17884 ViewGraph(this, "SLP" + F->getName(), false, Str);
17885#endif
17886
17887 return Cost;
17888}
17889
17890/// Tries to find extractelement instructions with constant indices from fixed
17891/// vector type and gather such instructions into a bunch, which highly likely
17892/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
17893/// successful, the matched scalars are replaced by poison values in \p VL for
17894/// future analysis.
17895std::optional<TTI::ShuffleKind>
17896BoUpSLP::tryToGatherSingleRegisterExtractElements(
17897 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
17898 // Scan list of gathered scalars for extractelements that can be represented
17899 // as shuffles.
17900 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
17901 SmallVector<int> UndefVectorExtracts;
17902 for (int I = 0, E = VL.size(); I < E; ++I) {
17903 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
17904 if (!EI) {
17905 if (isa<UndefValue>(Val: VL[I]))
17906 UndefVectorExtracts.push_back(Elt: I);
17907 continue;
17908 }
17909 auto *VecTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
17910 if (!VecTy || !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()))
17911 continue;
17912 std::optional<unsigned> Idx = getExtractIndex(E: EI);
17913 // Undefined index.
17914 if (!Idx) {
17915 UndefVectorExtracts.push_back(Elt: I);
17916 continue;
17917 }
17918 if (Idx >= VecTy->getNumElements()) {
17919 UndefVectorExtracts.push_back(Elt: I);
17920 continue;
17921 }
17922 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
17923 ExtractMask.reset(Idx: *Idx);
17924 if (isUndefVector(V: EI->getVectorOperand(), UseMask: ExtractMask).all()) {
17925 UndefVectorExtracts.push_back(Elt: I);
17926 continue;
17927 }
17928 VectorOpToIdx[EI->getVectorOperand()].push_back(Elt: I);
17929 }
17930 // Sort the vector operands by the maximum number of uses in extractelements.
17931 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
17932 VectorOpToIdx.takeVector();
17933 stable_sort(Range&: Vectors, C: [](const auto &P1, const auto &P2) {
17934 return P1.second.size() > P2.second.size();
17935 });
17936 // Find the best pair of the vectors or a single vector.
17937 const int UndefSz = UndefVectorExtracts.size();
17938 unsigned SingleMax = 0;
17939 unsigned PairMax = 0;
17940 if (!Vectors.empty()) {
17941 SingleMax = Vectors.front().second.size() + UndefSz;
17942 if (Vectors.size() > 1) {
17943 auto *ItNext = std::next(x: Vectors.begin());
17944 PairMax = SingleMax + ItNext->second.size();
17945 }
17946 }
17947 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
17948 return std::nullopt;
17949 // Check if better to perform a shuffle of 2 vectors or just of a single
17950 // vector.
17951 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
17952 SmallVector<Value *> GatheredExtracts(
17953 VL.size(), PoisonValue::get(T: VL.front()->getType()));
17954 if (SingleMax >= PairMax && SingleMax) {
17955 for (int Idx : Vectors.front().second)
17956 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
17957 } else if (!Vectors.empty()) {
17958 for (unsigned Idx : {0, 1})
17959 for (int Idx : Vectors[Idx].second)
17960 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
17961 }
17962 // Add extracts from undefs too.
17963 for (int Idx : UndefVectorExtracts)
17964 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
17965 // Check that gather of extractelements can be represented as just a
17966 // shuffle of a single/two vectors the scalars are extracted from.
17967 std::optional<TTI::ShuffleKind> Res =
17968 isFixedVectorShuffle(VL: GatheredExtracts, Mask, AC);
17969 if (!Res || all_of(Range&: Mask, P: equal_to(Arg: PoisonMaskElem))) {
17970 // TODO: try to check other subsets if possible.
17971 // Restore the original VL if attempt was not successful.
17972 copy(Range&: SavedVL, Out: VL.begin());
17973 return std::nullopt;
17974 }
17975 // Restore unused scalars from mask, if some of the extractelements were not
17976 // selected for shuffle.
17977 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
17978 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(Val: GatheredExtracts[I]) &&
17979 isa<UndefValue>(Val: GatheredExtracts[I])) {
17980 std::swap(a&: VL[I], b&: GatheredExtracts[I]);
17981 continue;
17982 }
17983 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
17984 if (!EI || !isa<FixedVectorType>(Val: EI->getVectorOperandType()) ||
17985 !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()) ||
17986 is_contained(Range&: UndefVectorExtracts, Element: I))
17987 continue;
17988 }
17989 return Res;
17990}
17991
17992/// Tries to find extractelement instructions with constant indices from fixed
17993/// vector type and gather such instructions into a bunch, which highly likely
17994/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
17995/// successful, the matched scalars are replaced by poison values in \p VL for
17996/// future analysis.
17997SmallVector<std::optional<TTI::ShuffleKind>>
17998BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
17999 SmallVectorImpl<int> &Mask,
18000 unsigned NumParts) const {
18001 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
18002 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
18003 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
18004 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
18005 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
18006 // Scan list of gathered scalars for extractelements that can be represented
18007 // as shuffles.
18008 const unsigned PartOffset = Part * SliceSize;
18009 const unsigned PartSize = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
18010 // It may happen in case of revec, need to check no access out of bounds.
18011 if (PartOffset + PartSize > VL.size())
18012 break;
18013 MutableArrayRef<Value *> SubVL =
18014 MutableArrayRef(VL).slice(N: PartOffset, M: PartSize);
18015 SmallVector<int> SubMask;
18016 std::optional<TTI::ShuffleKind> Res =
18017 tryToGatherSingleRegisterExtractElements(VL: SubVL, Mask&: SubMask);
18018 ShufflesRes[Part] = Res;
18019 copy(Range&: SubMask, Out: std::next(x: Mask.begin(), n: Part * SliceSize));
18020 }
18021 if (none_of(Range&: ShufflesRes, P: [](const std::optional<TTI::ShuffleKind> &Res) {
18022 return Res.has_value();
18023 }))
18024 ShufflesRes.clear();
18025 return ShufflesRes;
18026}
18027
18028std::optional<TargetTransformInfo::ShuffleKind>
18029BoUpSLP::isGatherShuffledSingleRegisterEntry(
18030 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
18031 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
18032 Entries.clear();
18033 if (TE->Idx == 0)
18034 return std::nullopt;
18035 // TODO: currently checking only for Scalars in the tree entry, need to count
18036 // reused elements too for better cost estimation.
18037 auto GetUserEntry = [&](const TreeEntry *TE) {
18038 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
18039 TE = TE->UserTreeIndex.UserTE;
18040 if (TE == VectorizableTree.front().get())
18041 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
18042 return TE->UserTreeIndex;
18043 };
18044 auto HasGatherUser = [&](const TreeEntry *TE) {
18045 while (TE->Idx != 0 && TE->UserTreeIndex) {
18046 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
18047 return true;
18048 TE = TE->UserTreeIndex.UserTE;
18049 }
18050 return false;
18051 };
18052 const EdgeInfo TEUseEI = GetUserEntry(TE);
18053 if (!TEUseEI || (TEUseEI.UserTE->Idx == 0 && TEUseEI.UserTE->isGather() &&
18054 !TEUseEI.UserTE->hasState()))
18055 return std::nullopt;
18056 const Instruction *TEInsertPt = &getLastInstructionInBundle(E: TEUseEI.UserTE);
18057 const BasicBlock *TEInsertBlock = nullptr;
18058 // Main node of PHI entries keeps the correct order of operands/incoming
18059 // blocks.
18060 if (auto *PHI = dyn_cast_or_null<PHINode>(
18061 Val: TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
18062 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
18063 TEInsertBlock = PHI->getIncomingBlock(i: TEUseEI.EdgeIdx);
18064 TEInsertPt = TEInsertBlock->getTerminator();
18065 } else {
18066 TEInsertBlock = TEInsertPt->getParent();
18067 }
18068 if (!DT->isReachableFromEntry(A: TEInsertBlock))
18069 return std::nullopt;
18070 auto *NodeUI = DT->getNode(BB: TEInsertBlock);
18071 assert(NodeUI && "Should only process reachable instructions");
18072 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
18073 auto CheckOrdering = [&](const Instruction *InsertPt) {
18074 // Argument InsertPt is an instruction where vector code for some other
18075 // tree entry (one that shares one or more scalars with TE) is going to be
18076 // generated. This lambda returns true if insertion point of vector code
18077 // for the TE dominates that point (otherwise dependency is the other way
18078 // around). The other node is not limited to be of a gather kind. Gather
18079 // nodes are not scheduled and their vector code is inserted before their
18080 // first user. If user is PHI, that is supposed to be at the end of a
18081 // predecessor block. Otherwise it is the last instruction among scalars of
18082 // the user node. So, instead of checking dependency between instructions
18083 // themselves, we check dependency between their insertion points for vector
18084 // code (since each scalar instruction ends up as a lane of a vector
18085 // instruction).
18086 const BasicBlock *InsertBlock = InsertPt->getParent();
18087 auto *NodeEUI = DT->getNode(BB: InsertBlock);
18088 if (!NodeEUI)
18089 return false;
18090 assert((NodeUI == NodeEUI) ==
18091 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
18092 "Different nodes should have different DFS numbers");
18093 // Check the order of the gather nodes users.
18094 if (TEInsertPt->getParent() != InsertBlock &&
18095 (DT->dominates(A: NodeUI, B: NodeEUI) || !DT->dominates(A: NodeEUI, B: NodeUI)))
18096 return false;
18097 if (TEInsertPt->getParent() == InsertBlock &&
18098 TEInsertPt->comesBefore(Other: InsertPt))
18099 return false;
18100 return true;
18101 };
18102 // Find all tree entries used by the gathered values. If no common entries
18103 // found - not a shuffle.
18104 // Here we build a set of tree nodes for each gathered value and trying to
18105 // find the intersection between these sets. If we have at least one common
18106 // tree node for each gathered value - we have just a permutation of the
18107 // single vector. If we have 2 different sets, we're in situation where we
18108 // have a permutation of 2 input vectors.
18109 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
18110 SmallDenseMap<Value *, int> UsedValuesEntry;
18111 SmallPtrSet<const Value *, 16> VisitedValue;
18112 bool IsReusedNodeFound = false;
18113 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
18114 // The node is reused - exit.
18115 if (IsReusedNodeFound)
18116 return false;
18117 if ((TEPtr->getVectorFactor() != VL.size() &&
18118 TEPtr->Scalars.size() != VL.size()) ||
18119 (!TEPtr->isSame(VL) && !TEPtr->isSame(VL: TE->Scalars)))
18120 return false;
18121 IsReusedNodeFound =
18122 equal(LRange: TE->Scalars, RRange: TEPtr->Scalars) &&
18123 equal(LRange: TE->ReorderIndices, RRange: TEPtr->ReorderIndices) &&
18124 equal(LRange: TE->ReuseShuffleIndices, RRange: TEPtr->ReuseShuffleIndices);
18125 UsedTEs.clear();
18126 UsedTEs.emplace_back().insert(Ptr: TEPtr);
18127 for (Value *V : VL) {
18128 if (isConstant(V))
18129 continue;
18130 UsedValuesEntry.try_emplace(Key: V, Args: 0);
18131 }
18132 return true;
18133 };
18134 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
18135 unsigned EdgeIdx) {
18136 const TreeEntry *Ptr1 = User1;
18137 const TreeEntry *Ptr2 = User2;
18138 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
18139 while (Ptr2) {
18140 PtrToIdx.try_emplace(Key: Ptr2, Args&: EdgeIdx);
18141 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
18142 Ptr2 = Ptr2->UserTreeIndex.UserTE;
18143 }
18144 while (Ptr1) {
18145 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
18146 Ptr1 = Ptr1->UserTreeIndex.UserTE;
18147 if (auto It = PtrToIdx.find(Val: Ptr1); It != PtrToIdx.end())
18148 return Idx < It->second;
18149 }
18150 return false;
18151 };
18152 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
18153 Instruction *InsertPt) {
18154 return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
18155 !TEUseEI.UserTE->isCopyableElement(
18156 V: const_cast<Instruction *>(TEInsertPt)) &&
18157 isUsedOutsideBlock(V: const_cast<Instruction *>(TEInsertPt)) &&
18158 InsertPt->getNextNode() == TEInsertPt &&
18159 (!E->hasCopyableElements() || !E->isCopyableElement(V: InsertPt) ||
18160 !isUsedOutsideBlock(V: InsertPt));
18161 };
18162 for (Value *V : VL) {
18163 if (isConstant(V) || !VisitedValue.insert(Ptr: V).second)
18164 continue;
18165 // Build a list of tree entries where V is used.
18166 SmallPtrSet<const TreeEntry *, 4> VToTEs;
18167 SmallVector<const TreeEntry *> GatherNodes(
18168 ValueToGatherNodes.lookup(Val: V).takeVector());
18169 if (TransformedToGatherNodes.contains(Val: TE)) {
18170 for (TreeEntry *E : getSplitTreeEntries(V)) {
18171 if (TE == E || !TransformedToGatherNodes.contains(Val: E) ||
18172 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
18173 continue;
18174 GatherNodes.push_back(Elt: E);
18175 }
18176 for (TreeEntry *E : getTreeEntries(V)) {
18177 if (TE == E || !TransformedToGatherNodes.contains(Val: E) ||
18178 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
18179 continue;
18180 GatherNodes.push_back(Elt: E);
18181 }
18182 }
18183 for (const TreeEntry *TEPtr : GatherNodes) {
18184 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(Ptr: TEPtr))
18185 continue;
18186 assert(any_of(TEPtr->Scalars,
18187 [&](Value *V) { return GatheredScalars.contains(V); }) &&
18188 "Must contain at least single gathered value.");
18189 assert(TEPtr->UserTreeIndex &&
18190 "Expected only single user of a gather node.");
18191 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
18192
18193 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
18194 UseEI.UserTE->hasState())
18195 ? dyn_cast<PHINode>(Val: UseEI.UserTE->getMainOp())
18196 : nullptr;
18197 Instruction *InsertPt =
18198 UserPHI ? UserPHI->getIncomingBlock(i: UseEI.EdgeIdx)->getTerminator()
18199 : &getLastInstructionInBundle(E: UseEI.UserTE);
18200 if (TEInsertPt == InsertPt) {
18201 // Check nodes, which might be emitted first.
18202 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
18203 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
18204 TEUseEI.UserTE->isAltShuffle()) &&
18205 all_of(Range&: TEUseEI.UserTE->Scalars, P: isUsedOutsideBlock)) {
18206 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
18207 (UseEI.UserTE->hasState() &&
18208 UseEI.UserTE->getOpcode() == Instruction::PHI &&
18209 !UseEI.UserTE->isAltShuffle()) ||
18210 !all_of(Range&: UseEI.UserTE->Scalars, P: isUsedOutsideBlock))
18211 continue;
18212 }
18213
18214 // If the schedulable insertion point is used in multiple entries - just
18215 // exit, no known ordering at this point, available only after real
18216 // scheduling.
18217 if (!doesNotNeedToBeScheduled(V: InsertPt) &&
18218 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
18219 continue;
18220 // If the users are the PHI nodes with the same incoming blocks - skip.
18221 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
18222 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
18223 UseEI.UserTE->State == TreeEntry::Vectorize &&
18224 UseEI.UserTE->getOpcode() == Instruction::PHI &&
18225 TEUseEI.UserTE != UseEI.UserTE)
18226 continue;
18227 // If 2 gathers are operands of the same entry (regardless of whether
18228 // user is PHI or else), compare operands indices, use the earlier one
18229 // as the base.
18230 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
18231 continue;
18232 // If the user instruction is used for some reason in different
18233 // vectorized nodes - make it depend on index.
18234 if (TEUseEI.UserTE != UseEI.UserTE &&
18235 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
18236 HasGatherUser(TEUseEI.UserTE)))
18237 continue;
18238 // If the user node is the operand of the other user node - skip.
18239 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
18240 continue;
18241 }
18242
18243 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
18244 TEUseEI.UserTE->doesNotNeedToSchedule() !=
18245 UseEI.UserTE->doesNotNeedToSchedule() &&
18246 is_contained(Range&: UseEI.UserTE->Scalars, Element: TEInsertPt))
18247 continue;
18248 // Check if the user node of the TE comes after user node of TEPtr,
18249 // otherwise TEPtr depends on TE.
18250 if ((TEInsertBlock != InsertPt->getParent() ||
18251 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
18252 (!CheckOrdering(InsertPt) ||
18253 (UseEI.UserTE->hasCopyableElements() &&
18254 isUsedOutsideBlock(V: const_cast<Instruction *>(TEInsertPt)) &&
18255 is_contained(Range&: UseEI.UserTE->Scalars, Element: TEInsertPt))))
18256 continue;
18257 // The node is reused - exit.
18258 if (CheckAndUseSameNode(TEPtr))
18259 break;
18260 // The parent node is copyable with last inst used outside? And the last
18261 // inst is the next inst for the lastinst of TEPtr? Exit, if yes, to
18262 // preserve def-use chain.
18263 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
18264 continue;
18265 VToTEs.insert(Ptr: TEPtr);
18266 }
18267 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
18268 const auto *It = find_if(Range&: VTEs, P: [&](const TreeEntry *MTE) {
18269 return MTE != TE && MTE != TEUseEI.UserTE &&
18270 !DeletedNodes.contains(Ptr: MTE) &&
18271 !TransformedToGatherNodes.contains(Val: MTE);
18272 });
18273 if (It != VTEs.end()) {
18274 const TreeEntry *VTE = *It;
18275 if (none_of(Range: TE->CombinedEntriesWithIndices,
18276 P: [&](const auto &P) { return P.first == VTE->Idx; })) {
18277 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
18278 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
18279 continue;
18280 }
18281 // The node is reused - exit.
18282 if (CheckAndUseSameNode(VTE))
18283 break;
18284 VToTEs.insert(Ptr: VTE);
18285 }
18286 }
18287 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
18288 const auto *It = find_if(Range&: VTEs, P: [&, MainTE = TE](const TreeEntry *TE) {
18289 return TE != MainTE && !DeletedNodes.contains(Ptr: TE) &&
18290 !TransformedToGatherNodes.contains(Val: TE);
18291 });
18292 if (It != VTEs.end()) {
18293 const TreeEntry *VTE = *It;
18294 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(u: 0) &&
18295 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
18296 VTEs = VTEs.drop_front();
18297 // Iterate through all vectorized nodes.
18298 const auto *MIt = find_if(Range&: VTEs, P: [](const TreeEntry *MTE) {
18299 return MTE->State == TreeEntry::Vectorize;
18300 });
18301 if (MIt == VTEs.end())
18302 continue;
18303 VTE = *MIt;
18304 }
18305 if (none_of(Range: TE->CombinedEntriesWithIndices,
18306 P: [&](const auto &P) { return P.first == VTE->Idx; })) {
18307 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
18308 if (&LastBundleInst == TEInsertPt ||
18309 !CheckOrdering(&LastBundleInst) ||
18310 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
18311 continue;
18312 }
18313 // The node is reused - exit.
18314 if (CheckAndUseSameNode(VTE))
18315 break;
18316 VToTEs.insert(Ptr: VTE);
18317 }
18318 }
18319 if (IsReusedNodeFound)
18320 break;
18321 if (VToTEs.empty())
18322 continue;
18323 if (UsedTEs.empty()) {
18324 // The first iteration, just insert the list of nodes to vector.
18325 UsedTEs.push_back(Elt: VToTEs);
18326 UsedValuesEntry.try_emplace(Key: V, Args: 0);
18327 } else {
18328 // Need to check if there are any previously used tree nodes which use V.
18329 // If there are no such nodes, consider that we have another one input
18330 // vector.
18331 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
18332 unsigned Idx = 0;
18333 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
18334 // Do we have a non-empty intersection of previously listed tree entries
18335 // and tree entries using current V?
18336 set_intersect(S1&: VToTEs, S2: Set);
18337 if (!VToTEs.empty()) {
18338 // Yes, write the new subset and continue analysis for the next
18339 // scalar.
18340 Set.swap(RHS&: VToTEs);
18341 break;
18342 }
18343 VToTEs = SavedVToTEs;
18344 ++Idx;
18345 }
18346 // No non-empty intersection found - need to add a second set of possible
18347 // source vectors.
18348 if (Idx == UsedTEs.size()) {
18349 // If the number of input vectors is greater than 2 - not a permutation,
18350 // fallback to the regular gather.
18351 // TODO: support multiple reshuffled nodes.
18352 if (UsedTEs.size() == 2)
18353 continue;
18354 UsedTEs.push_back(Elt: SavedVToTEs);
18355 Idx = UsedTEs.size() - 1;
18356 }
18357 UsedValuesEntry.try_emplace(Key: V, Args&: Idx);
18358 }
18359 }
18360
18361 if (UsedTEs.empty()) {
18362 Entries.clear();
18363 return std::nullopt;
18364 }
18365
18366 unsigned VF = 0;
18367 if (UsedTEs.size() == 1) {
18368 // Keep the order to avoid non-determinism.
18369 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
18370 UsedTEs.front().end());
18371 sort(C&: FirstEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
18372 return TE1->Idx < TE2->Idx;
18373 });
18374 // Try to find the perfect match in another gather node at first.
18375 auto *It = find_if(Range&: FirstEntries, P: [=](const TreeEntry *EntryPtr) {
18376 return EntryPtr->isSame(VL) || EntryPtr->isSame(VL: TE->Scalars);
18377 });
18378 if (It != FirstEntries.end() &&
18379 (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size() ||
18380 ((*It)->getVectorFactor() == TE->Scalars.size() &&
18381 TE->ReuseShuffleIndices.size() == VL.size() &&
18382 (*It)->isSame(VL: TE->Scalars)))) {
18383 Entries.push_back(Elt: *It);
18384 if (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size()) {
18385 std::iota(first: std::next(x: Mask.begin(), n: Part * VL.size()),
18386 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: 0);
18387 } else {
18388 SmallVector<int> CommonMask = TE->getCommonMask();
18389 copy(Range&: CommonMask, Out: Mask.begin());
18390 }
18391 // Clear undef scalars.
18392 for (unsigned I : seq<unsigned>(Size: VL.size()))
18393 if (isa<PoisonValue>(Val: VL[I]))
18394 Mask[Part * VL.size() + I] = PoisonMaskElem;
18395 return TargetTransformInfo::SK_PermuteSingleSrc;
18396 }
18397 // No perfect match, just shuffle, so choose the first tree node from the
18398 // tree.
18399 Entries.push_back(Elt: FirstEntries.front());
18400 // Update mapping between values and corresponding tree entries.
18401 for (auto &P : UsedValuesEntry)
18402 P.second = 0;
18403 VF = FirstEntries.front()->getVectorFactor();
18404 } else {
18405 // Try to find nodes with the same vector factor.
18406 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
18407 // Keep the order of tree nodes to avoid non-determinism.
18408 DenseMap<int, const TreeEntry *> VFToTE;
18409 for (const TreeEntry *TE : UsedTEs.front()) {
18410 unsigned VF = TE->getVectorFactor();
18411 auto It = VFToTE.find(Val: VF);
18412 if (It != VFToTE.end()) {
18413 if (It->second->Idx > TE->Idx)
18414 It->getSecond() = TE;
18415 continue;
18416 }
18417 VFToTE.try_emplace(Key: VF, Args&: TE);
18418 }
18419 // Same, keep the order to avoid non-determinism.
18420 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
18421 UsedTEs.back().end());
18422 sort(C&: SecondEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
18423 return TE1->Idx < TE2->Idx;
18424 });
18425 for (const TreeEntry *TE : SecondEntries) {
18426 auto It = VFToTE.find(Val: TE->getVectorFactor());
18427 if (It != VFToTE.end()) {
18428 VF = It->first;
18429 Entries.push_back(Elt: It->second);
18430 Entries.push_back(Elt: TE);
18431 break;
18432 }
18433 }
18434 // No 2 source vectors with the same vector factor - just choose 2 with max
18435 // index.
18436 if (Entries.empty()) {
18437 Entries.push_back(Elt: *llvm::max_element(
18438 Range&: UsedTEs.front(), C: [](const TreeEntry *TE1, const TreeEntry *TE2) {
18439 return TE1->Idx < TE2->Idx;
18440 }));
18441 Entries.push_back(Elt: SecondEntries.front());
18442 VF = std::max(a: Entries.front()->getVectorFactor(),
18443 b: Entries.back()->getVectorFactor());
18444 } else {
18445 VF = Entries.front()->getVectorFactor();
18446 }
18447 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
18448 for (const TreeEntry *E : Entries)
18449 ValuesToEntries.emplace_back().insert(I: E->Scalars.begin(),
18450 E: E->Scalars.end());
18451 // Update mapping between values and corresponding tree entries.
18452 for (auto &P : UsedValuesEntry) {
18453 for (unsigned Idx : seq<unsigned>(Size: ValuesToEntries.size()))
18454 if (ValuesToEntries[Idx].contains(Ptr: P.first)) {
18455 P.second = Idx;
18456 break;
18457 }
18458 }
18459 }
18460
18461 bool IsSplatOrUndefs = isSplat(VL) || all_of(Range&: VL, P: IsaPred<UndefValue>);
18462 // Checks if the 2 PHIs are compatible in terms of high possibility to be
18463 // vectorized.
18464 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
18465 auto *PHI = cast<PHINode>(Val: V);
18466 auto *PHI1 = cast<PHINode>(Val: V1);
18467 // Check that all incoming values are compatible/from same parent (if they
18468 // are instructions).
18469 // The incoming values are compatible if they all are constants, or
18470 // instruction with the same/alternate opcodes from the same basic block.
18471 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
18472 Value *In = PHI->getIncomingValue(i: I);
18473 Value *In1 = PHI1->getIncomingValue(i: I);
18474 if (isConstant(V: In) && isConstant(V: In1))
18475 continue;
18476 if (!getSameOpcode(VL: {In, In1}, TLI: *TLI))
18477 return false;
18478 if (cast<Instruction>(Val: In)->getParent() !=
18479 cast<Instruction>(Val: In1)->getParent())
18480 return false;
18481 }
18482 return true;
18483 };
18484 // Check if the value can be ignored during analysis for shuffled gathers.
18485 // We suppose it is better to ignore instruction, which do not form splats,
18486 // are not vectorized/not extractelements (these instructions will be handled
18487 // by extractelements processing) or may form vector node in future.
18488 auto MightBeIgnored = [=](Value *V) {
18489 auto *I = dyn_cast<Instruction>(Val: V);
18490 return I && !IsSplatOrUndefs && !isVectorized(V: I) &&
18491 !isVectorLikeInstWithConstOps(V: I) &&
18492 !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList) && isSimple(I);
18493 };
18494 // Check that the neighbor instruction may form a full vector node with the
18495 // current instruction V. It is possible, if they have same/alternate opcode
18496 // and same parent basic block.
18497 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
18498 Value *V1 = VL[Idx];
18499 bool UsedInSameVTE = false;
18500 auto It = UsedValuesEntry.find(Val: V1);
18501 if (It != UsedValuesEntry.end())
18502 UsedInSameVTE = It->second == UsedValuesEntry.find(Val: V)->second;
18503 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
18504 getSameOpcode(VL: {V, V1}, TLI: *TLI) &&
18505 cast<Instruction>(Val: V)->getParent() ==
18506 cast<Instruction>(Val: V1)->getParent() &&
18507 (!isa<PHINode>(Val: V1) || AreCompatiblePHIs(V, V1));
18508 };
18509 // Build a shuffle mask for better cost estimation and vector emission.
18510 SmallBitVector UsedIdxs(Entries.size());
18511 SmallVector<std::pair<unsigned, int>> EntryLanes;
18512 for (int I = 0, E = VL.size(); I < E; ++I) {
18513 Value *V = VL[I];
18514 auto It = UsedValuesEntry.find(Val: V);
18515 if (It == UsedValuesEntry.end())
18516 continue;
18517 // Do not try to shuffle scalars, if they are constants, or instructions
18518 // that can be vectorized as a result of the following vector build
18519 // vectorization.
18520 if (isConstant(V) || (MightBeIgnored(V) &&
18521 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
18522 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
18523 continue;
18524 unsigned Idx = It->second;
18525 EntryLanes.emplace_back(Args&: Idx, Args&: I);
18526 UsedIdxs.set(Idx);
18527 }
18528 // Iterate through all shuffled scalars and select entries, which can be used
18529 // for final shuffle.
18530 SmallVector<const TreeEntry *> TempEntries;
18531 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
18532 if (!UsedIdxs.test(Idx: I))
18533 continue;
18534 // Fix the entry number for the given scalar. If it is the first entry, set
18535 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
18536 // These indices are used when calculating final shuffle mask as the vector
18537 // offset.
18538 for (std::pair<unsigned, int> &Pair : EntryLanes)
18539 if (Pair.first == I)
18540 Pair.first = TempEntries.size();
18541 TempEntries.push_back(Elt: Entries[I]);
18542 }
18543 Entries.swap(RHS&: TempEntries);
18544 if (EntryLanes.size() == Entries.size() &&
18545 !VL.equals(RHS: ArrayRef(TE->Scalars)
18546 .slice(N: Part * VL.size(),
18547 M: std::min<int>(a: VL.size(), b: TE->Scalars.size())))) {
18548 // We may have here 1 or 2 entries only. If the number of scalars is equal
18549 // to the number of entries, no need to do the analysis, it is not very
18550 // profitable. Since VL is not the same as TE->Scalars, it means we already
18551 // have some shuffles before. Cut off not profitable case.
18552 Entries.clear();
18553 return std::nullopt;
18554 }
18555 // Build the final mask, check for the identity shuffle, if possible.
18556 bool IsIdentity = Entries.size() == 1;
18557 // Pair.first is the offset to the vector, while Pair.second is the index of
18558 // scalar in the list.
18559 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
18560 unsigned Idx = Part * VL.size() + Pair.second;
18561 Mask[Idx] =
18562 Pair.first * VF +
18563 (ForOrder ? std::distance(
18564 first: Entries[Pair.first]->Scalars.begin(),
18565 last: find(Range: Entries[Pair.first]->Scalars, Val: VL[Pair.second]))
18566 : Entries[Pair.first]->findLaneForValue(V: VL[Pair.second]));
18567 IsIdentity &= Mask[Idx] == Pair.second;
18568 }
18569 if (ForOrder || IsIdentity || Entries.empty()) {
18570 switch (Entries.size()) {
18571 case 1:
18572 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
18573 return TargetTransformInfo::SK_PermuteSingleSrc;
18574 break;
18575 case 2:
18576 if (EntryLanes.size() > 2 || VL.size() <= 2)
18577 return TargetTransformInfo::SK_PermuteTwoSrc;
18578 break;
18579 default:
18580 break;
18581 }
18582 } else if (!isa<VectorType>(Val: VL.front()->getType()) &&
18583 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
18584 // Do the cost estimation if shuffle beneficial than buildvector.
18585 SmallVector<int> SubMask(std::next(x: Mask.begin(), n: Part * VL.size()),
18586 std::next(x: Mask.begin(), n: (Part + 1) * VL.size()));
18587 int MinElement = SubMask.front(), MaxElement = SubMask.front();
18588 for (int Idx : SubMask) {
18589 if (Idx == PoisonMaskElem)
18590 continue;
18591 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
18592 MinElement = Idx;
18593 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
18594 MaxElement = Idx;
18595 }
18596 assert(MaxElement >= 0 && MinElement >= 0 &&
18597 MaxElement % VF >= MinElement % VF &&
18598 "Expected at least single element.");
18599 unsigned NewVF = std::max<unsigned>(
18600 a: VL.size(), b: getFullVectorNumberOfElements(TTI: *TTI, Ty: VL.front()->getType(),
18601 Sz: (MaxElement % VF) -
18602 (MinElement % VF) + 1));
18603 if (NewVF < VF) {
18604 for (int &Idx : SubMask) {
18605 if (Idx == PoisonMaskElem)
18606 continue;
18607 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
18608 (Idx >= static_cast<int>(VF) ? NewVF : 0);
18609 }
18610 } else {
18611 NewVF = VF;
18612 }
18613
18614 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18615 auto *VecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: NewVF);
18616 auto *MaskVecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: SubMask.size());
18617 auto GetShuffleCost = [&,
18618 &TTI = *TTI](ArrayRef<int> Mask,
18619 ArrayRef<const TreeEntry *> Entries,
18620 VectorType *VecTy) -> InstructionCost {
18621 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
18622 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
18623 Mask, Factor: Entries.front()->getInterleaveFactor()))
18624 return TTI::TCC_Free;
18625 return ::getShuffleCost(TTI,
18626 Kind: Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
18627 : TTI::SK_PermuteSingleSrc,
18628 Tp: VecTy, Mask, CostKind);
18629 };
18630 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
18631 InstructionCost FirstShuffleCost = 0;
18632 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
18633 if (Entries.size() == 1 || !Entries[0]->isGather()) {
18634 FirstShuffleCost = ShuffleCost;
18635 } else {
18636 // Transform mask to include only first entry.
18637 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
18638 bool IsIdentity = true;
18639 for (auto [I, Idx] : enumerate(First&: FirstMask)) {
18640 if (Idx >= static_cast<int>(NewVF)) {
18641 Idx = PoisonMaskElem;
18642 } else {
18643 DemandedElts.clearBit(BitPosition: I);
18644 if (Idx != PoisonMaskElem)
18645 IsIdentity &= static_cast<int>(I) == Idx;
18646 }
18647 }
18648 if (!IsIdentity)
18649 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
18650 FirstShuffleCost += getScalarizationOverhead(
18651 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
18652 /*Extract=*/false, CostKind);
18653 }
18654 InstructionCost SecondShuffleCost = 0;
18655 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
18656 if (Entries.size() == 1 || !Entries[1]->isGather()) {
18657 SecondShuffleCost = ShuffleCost;
18658 } else {
18659 // Transform mask to include only first entry.
18660 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
18661 bool IsIdentity = true;
18662 for (auto [I, Idx] : enumerate(First&: SecondMask)) {
18663 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
18664 Idx = PoisonMaskElem;
18665 } else {
18666 DemandedElts.clearBit(BitPosition: I);
18667 if (Idx != PoisonMaskElem) {
18668 Idx -= NewVF;
18669 IsIdentity &= static_cast<int>(I) == Idx;
18670 }
18671 }
18672 }
18673 if (!IsIdentity)
18674 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
18675 SecondShuffleCost += getScalarizationOverhead(
18676 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
18677 /*Extract=*/false, CostKind);
18678 }
18679 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
18680 for (auto [I, Idx] : enumerate(First&: SubMask))
18681 if (Idx == PoisonMaskElem)
18682 DemandedElts.clearBit(BitPosition: I);
18683 InstructionCost BuildVectorCost = getScalarizationOverhead(
18684 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
18685 /*Extract=*/false, CostKind);
18686 const TreeEntry *BestEntry = nullptr;
18687 if (FirstShuffleCost < ShuffleCost) {
18688 std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()),
18689 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()),
18690 f: [&](int &Idx) {
18691 if (Idx >= static_cast<int>(VF))
18692 Idx = PoisonMaskElem;
18693 });
18694 BestEntry = Entries.front();
18695 ShuffleCost = FirstShuffleCost;
18696 }
18697 if (SecondShuffleCost < ShuffleCost) {
18698 std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()),
18699 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()),
18700 f: [&](int &Idx) {
18701 if (Idx < static_cast<int>(VF))
18702 Idx = PoisonMaskElem;
18703 else
18704 Idx -= VF;
18705 });
18706 BestEntry = Entries[1];
18707 ShuffleCost = SecondShuffleCost;
18708 }
18709 if (BuildVectorCost >= ShuffleCost) {
18710 if (BestEntry) {
18711 Entries.clear();
18712 Entries.push_back(Elt: BestEntry);
18713 }
18714 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
18715 : TargetTransformInfo::SK_PermuteSingleSrc;
18716 }
18717 }
18718 Entries.clear();
18719 // Clear the corresponding mask elements.
18720 std::fill(first: std::next(x: Mask.begin(), n: Part * VL.size()),
18721 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: PoisonMaskElem);
18722 return std::nullopt;
18723}
18724
/// Tries to represent the scalars \p VL of the gather node \p TE as a series
/// of per-register shuffles of previously built tree entries, instead of
/// emitting a plain gather sequence.
///
/// \p VL is split into \p NumParts equally sized slices and each slice is
/// analyzed independently by isGatherShuffledSingleRegisterEntry(). On
/// success, \p Mask receives the combined shuffle mask (PoisonMaskElem for
/// scalars that still must be gathered) and \p Entries receives, per part, the
/// source tree entries to shuffle from. \p ForOrder is forwarded to the
/// per-register helper when only an ordering is being computed.
///
/// \returns One ShuffleKind per part (std::nullopt for parts that could not
/// be matched), or an empty vector if no shuffling is possible at all.
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
    bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(Range: ArrayRef(VectorizableTree).drop_front(),
               P: [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI))
    return {};
  Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  // Bail out for a gather node that directly feeds another gather node
  // (EdgeIdx == UINT_MAX) when it is the root, a node of extractelements, a
  // splat, or duplicates an existing entry — NOTE(review): presumably these
  // cases are handled by other gather-emission paths; confirm against the
  // builders of such nodes.
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (TE->Idx == 0 ||
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       isSplat(VL: TE->Scalars) ||
       (TE->hasState() &&
        getSameValuesTreeEntry(V: TE->getMainOp(), VL: TE->Scalars))))
    return {};
  // Analyze each register-sized slice of VL separately.
  unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(Size: NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, VL: SubVL, Mask, Entries&: SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(Elt: SubRes);
    // If a single source entry perfectly matches the whole node (its vector
    // factor equals VL.size() and its scalars match), drop the per-part
    // results and return one identity shuffle of that entry for the entire
    // node.
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(VL: TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(RHS&: SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(Val: VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(Args: 1, Args&: LocalSubEntries.front());
      Res.push_back(Elt: TargetTransformInfo::SK_PermuteSingleSrc);
      return Res;
    }
  }
  // If no part could be matched, report complete failure.
  if (all_of(Range&: Res,
             P: [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
18795
18796InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
18797 Type *ScalarTy) const {
18798 const unsigned VF = VL.size();
18799 auto *VecTy = getWidenedType(ScalarTy, VF);
18800 // Find the cost of inserting/extracting values from the vector.
18801 // Check if the same elements are inserted several times and count them as
18802 // shuffle candidates.
18803 APInt DemandedElements = APInt::getZero(numBits: VF);
18804 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18805 InstructionCost Cost;
18806 auto EstimateInsertCost = [&](unsigned I, Value *V) {
18807 DemandedElements.setBit(I);
18808 if (V->getType() != ScalarTy)
18809 Cost += TTI->getCastInstrCost(Opcode: Instruction::Trunc, Dst: ScalarTy, Src: V->getType(),
18810 CCH: TTI::CastContextHint::None, CostKind);
18811 };
18812 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
18813 std::iota(first: ConstantShuffleMask.begin(), last: ConstantShuffleMask.end(), value: 0);
18814 for (auto [I, V] : enumerate(First&: VL)) {
18815 // No need to shuffle duplicates for constants.
18816 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(Val: V))
18817 continue;
18818
18819 if (isConstant(V)) {
18820 ConstantShuffleMask[I] = I + VF;
18821 continue;
18822 }
18823 EstimateInsertCost(I, V);
18824 }
18825 // FIXME: add a cost for constant vector materialization.
18826 bool IsAnyNonUndefConst =
18827 any_of(Range&: VL, P: [](Value *V) { return !isa<UndefValue>(Val: V) && isConstant(V); });
18828 // 1. Shuffle input source vector and constant vector.
18829 if (!ForPoisonSrc && IsAnyNonUndefConst) {
18830 Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: VecTy,
18831 Mask: ConstantShuffleMask);
18832 }
18833
18834 // 2. Insert unique non-constants.
18835 if (!DemandedElements.isZero())
18836 Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: DemandedElements,
18837 /*Insert=*/true,
18838 /*Extract=*/false, CostKind,
18839 ForPoisonSrc: ForPoisonSrc && !IsAnyNonUndefConst, VL);
18840 return Cost;
18841}
18842
18843Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
18844 auto It = EntryToLastInstruction.find(Val: E);
18845 if (It != EntryToLastInstruction.end())
18846 return *cast<Instruction>(Val&: It->second);
18847 Instruction *Res = nullptr;
18848 // Get the basic block this bundle is in. All instructions in the bundle
18849 // should be in this block (except for extractelement-like instructions with
18850 // constant indices or gathered loads or copyables).
18851 Instruction *Front;
18852 unsigned Opcode;
18853 if (E->hasState()) {
18854 Front = E->getMainOp();
18855 Opcode = E->getOpcode();
18856 } else {
18857 Front = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: IsaPred<Instruction>));
18858 Opcode = Front->getOpcode();
18859 }
18860 auto *BB = Front->getParent();
18861 assert(
18862 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
18863 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
18864 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
18865 all_of(E->Scalars,
18866 [=](Value *V) -> bool {
18867 if (Opcode == Instruction::GetElementPtr &&
18868 !isa<GetElementPtrInst>(V))
18869 return true;
18870 auto *I = dyn_cast<Instruction>(V);
18871 return !I || !E->getMatchingMainOpOrAltOp(I) ||
18872 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
18873 })) &&
18874 "Expected gathered loads or GEPs or instructions from same basic "
18875 "block.");
18876
18877 auto FindLastInst = [&]() {
18878 Instruction *LastInst = Front;
18879 for (Value *V : E->Scalars) {
18880 auto *I = dyn_cast<Instruction>(Val: V);
18881 if (!I)
18882 continue;
18883 if (E->isCopyableElement(V: I))
18884 continue;
18885 if (LastInst->getParent() == I->getParent()) {
18886 if (LastInst->comesBefore(Other: I))
18887 LastInst = I;
18888 continue;
18889 }
18890 assert(((Opcode == Instruction::GetElementPtr &&
18891 !isa<GetElementPtrInst>(I)) ||
18892 E->State == TreeEntry::SplitVectorize ||
18893 (isVectorLikeInstWithConstOps(LastInst) &&
18894 isVectorLikeInstWithConstOps(I)) ||
18895 (GatheredLoadsEntriesFirst.has_value() &&
18896 Opcode == Instruction::Load && E->isGather() &&
18897 E->Idx < *GatheredLoadsEntriesFirst)) &&
18898 "Expected vector-like or non-GEP in GEP node insts only.");
18899 if (!DT->isReachableFromEntry(A: LastInst->getParent())) {
18900 LastInst = I;
18901 continue;
18902 }
18903 if (!DT->isReachableFromEntry(A: I->getParent()))
18904 continue;
18905 auto *NodeA = DT->getNode(BB: LastInst->getParent());
18906 auto *NodeB = DT->getNode(BB: I->getParent());
18907 assert(NodeA && "Should only process reachable instructions");
18908 assert(NodeB && "Should only process reachable instructions");
18909 assert((NodeA == NodeB) ==
18910 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18911 "Different nodes should have different DFS numbers");
18912 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
18913 LastInst = I;
18914 }
18915 BB = LastInst->getParent();
18916 return LastInst;
18917 };
18918
18919 auto FindFirstInst = [&]() {
18920 Instruction *FirstInst = Front;
18921 for (Value *V : E->Scalars) {
18922 auto *I = dyn_cast<Instruction>(Val: V);
18923 if (!I)
18924 continue;
18925 if (E->isCopyableElement(V: I))
18926 continue;
18927 if (FirstInst->getParent() == I->getParent()) {
18928 if (I->comesBefore(Other: FirstInst))
18929 FirstInst = I;
18930 continue;
18931 }
18932 assert(((Opcode == Instruction::GetElementPtr &&
18933 !isa<GetElementPtrInst>(I)) ||
18934 (isVectorLikeInstWithConstOps(FirstInst) &&
18935 isVectorLikeInstWithConstOps(I))) &&
18936 "Expected vector-like or non-GEP in GEP node insts only.");
18937 if (!DT->isReachableFromEntry(A: FirstInst->getParent())) {
18938 FirstInst = I;
18939 continue;
18940 }
18941 if (!DT->isReachableFromEntry(A: I->getParent()))
18942 continue;
18943 auto *NodeA = DT->getNode(BB: FirstInst->getParent());
18944 auto *NodeB = DT->getNode(BB: I->getParent());
18945 assert(NodeA && "Should only process reachable instructions");
18946 assert(NodeB && "Should only process reachable instructions");
18947 assert((NodeA == NodeB) ==
18948 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18949 "Different nodes should have different DFS numbers");
18950 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
18951 FirstInst = I;
18952 }
18953 return FirstInst;
18954 };
18955
18956 if (E->State == TreeEntry::SplitVectorize) {
18957 Res = FindLastInst();
18958 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V: Res); !Entries.empty()) {
18959 for (auto *E : Entries) {
18960 auto *I = dyn_cast_or_null<Instruction>(Val&: E->VectorizedValue);
18961 if (!I)
18962 I = &getLastInstructionInBundle(E);
18963 if (Res->getParent() == I->getParent() && Res->comesBefore(Other: I))
18964 Res = I;
18965 }
18966 }
18967 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
18968 return *Res;
18969 }
18970
18971 // Set insertpoint for gathered loads to the very first load.
18972 if (GatheredLoadsEntriesFirst.has_value() &&
18973 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18974 Opcode == Instruction::Load) {
18975 Res = FindFirstInst();
18976 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
18977 return *Res;
18978 }
18979
18980 // Set the insert point to the beginning of the basic block if the entry
18981 // should not be scheduled.
18982 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
18983 if (E->isGather())
18984 return nullptr;
18985 // Found previously that the instruction do not need to be scheduled.
18986 const auto *It = BlocksSchedules.find(Key: BB);
18987 if (It == BlocksSchedules.end())
18988 return nullptr;
18989 for (Value *V : E->Scalars) {
18990 auto *I = dyn_cast<Instruction>(Val: V);
18991 if (!I || isa<PHINode>(Val: I) ||
18992 (!E->isCopyableElement(V: I) && doesNotNeedToBeScheduled(V: I)))
18993 continue;
18994 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(V: I);
18995 if (Bundles.empty())
18996 continue;
18997 const auto *It = find_if(
18998 Range&: Bundles, P: [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
18999 if (It != Bundles.end())
19000 return *It;
19001 }
19002 return nullptr;
19003 };
19004 const ScheduleBundle *Bundle = FindScheduleBundle(E);
19005 if (!E->isGather() && !Bundle) {
19006 if ((Opcode == Instruction::GetElementPtr &&
19007 any_of(Range: E->Scalars,
19008 P: [](Value *V) {
19009 return !isa<GetElementPtrInst>(Val: V) && isa<Instruction>(Val: V);
19010 })) ||
19011 (all_of(Range: E->Scalars,
19012 P: [&](Value *V) {
19013 return isa<PoisonValue>(Val: V) ||
19014 (E->Idx == 0 && isa<InsertElementInst>(Val: V)) ||
19015 E->isCopyableElement(V) ||
19016 (!isVectorLikeInstWithConstOps(V) &&
19017 isUsedOutsideBlock(V));
19018 }) &&
19019 (!E->doesNotNeedToSchedule() ||
19020 any_of(Range: E->Scalars,
19021 P: [&](Value *V) {
19022 if (!isa<Instruction>(Val: V) ||
19023 (E->hasCopyableElements() && E->isCopyableElement(V)))
19024 return false;
19025 return !areAllOperandsNonInsts(V);
19026 }) ||
19027 none_of(Range: E->Scalars, P: [&](Value *V) {
19028 if (!isa<Instruction>(Val: V) ||
19029 (E->hasCopyableElements() && E->isCopyableElement(V)))
19030 return false;
19031 return MustGather.contains(Ptr: V);
19032 }))))
19033 Res = FindLastInst();
19034 else
19035 Res = FindFirstInst();
19036 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
19037 return *Res;
19038 }
19039
19040 // Find the last instruction. The common case should be that BB has been
19041 // scheduled, and the last instruction is VL.back(). So we start with
19042 // VL.back() and iterate over schedule data until we reach the end of the
19043 // bundle. The end of the bundle is marked by null ScheduleData.
19044 if (Bundle) {
19045 assert(!E->isGather() && "Gathered instructions should not be scheduled");
19046 Res = Bundle->getBundle().back()->getInst();
19047 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
19048 return *Res;
19049 }
19050
19051 // LastInst can still be null at this point if there's either not an entry
19052 // for BB in BlocksSchedules or there's no ScheduleData available for
19053 // VL.back(). This can be the case if buildTreeRec aborts for various
19054 // reasons (e.g., the maximum recursion depth is reached, the maximum region
19055 // size is reached, etc.). ScheduleData is initialized in the scheduling
19056 // "dry-run".
19057 //
19058 // If this happens, we can still find the last instruction by brute force. We
19059 // iterate forwards from Front (inclusive) until we either see all
19060 // instructions in the bundle or reach the end of the block. If Front is the
19061 // last instruction in program order, LastInst will be set to Front, and we
19062 // will visit all the remaining instructions in the block.
19063 //
19064 // One of the reasons we exit early from buildTreeRec is to place an upper
19065 // bound on compile-time. Thus, taking an additional compile-time hit here is
19066 // not ideal. However, this should be exceedingly rare since it requires that
19067 // we both exit early from buildTreeRec and that the bundle be out-of-order
19068 // (causing us to iterate all the way to the end of the block).
19069 if (!Res)
19070 Res = FindLastInst();
19071 assert(Res && "Failed to find last instruction in bundle");
19072 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
19073 return *Res;
19074}
19075
/// Sets the IRBuilder insertion point to the position where vectorized code
/// for the bundle of tree entry \p E must be emitted, and sets the current
/// debug location to that of the entry's main operation.
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(Val: LastInst);
  if (IsPHI) {
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
    // Nothing may be inserted before a landingpad instruction; step past it.
    if (LastInstIt != LastInst->getParent()->end() &&
        LastInstIt->getParent()->isLandingPad())
      LastInstIt = std::next(x: LastInstIt);
  }
  // Insert *at* the computed iterator for PHIs, for non-gather entries that do
  // not need scheduling (or whose non-copyable last instruction is used
  // outside the block), and for gathered-load entries; otherwise insert right
  // *after* the last instruction of the bundle.
  if (IsPHI ||
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       (E->doesNotNeedToSchedule() ||
        (E->hasCopyableElements() && !E->isCopyableElement(V: LastInst) &&
         isUsedOutsideBlock(V: LastInst)))) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set the
    // debug location to Front.
    Builder.SetInsertPoint(
        TheBB: LastInst->getParent(),
        IP: LastInst->getNextNode()->getIterator());
    // Cache a stable position marker per LastInst: emit a throw-away load of a
    // poison pointer right after it, remember that instruction, and erase it.
    // NOTE(review): this relies on eraseInstruction deferring the actual
    // removal so the erased placeholder remains a valid insertion anchor —
    // confirm against eraseInstruction's implementation.
    if (Instruction *Res = LastInstructionToPos.lookup(Val: LastInst)) {
      Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: Res->getIterator());
    } else {
      Res = Builder.CreateAlignedLoad(Ty: Builder.getPtrTy(),
                                      Ptr: PoisonValue::get(T: Builder.getPtrTy()),
                                      Align: MaybeAlign());
      Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: Res->getIterator());
      eraseInstruction(I: Res);
      LastInstructionToPos.try_emplace(Key: LastInst, Args&: Res);
    }
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
19117
/// Emits a gather sequence for the scalars \p VL: builds a vector of
/// VL.size() x \p ScalarTy by inserting the scalars one-by-one, optionally
/// blending the constant part into \p Root via \p CreateShuffle. Constants are
/// inserted first; instructions from the insertion block/loop are postponed to
/// the end to improve later hoisting. Returns the resulting vector value.
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from current block and/or the blocks which are
  // part of the current loop. These instructions will be inserted at the end to
  // make it possible to optimize loops and hoist invariant instructions out of
  // the loops body with better chances for success.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(BB: Builder.GetInsertBlock());
  // Returns true if InsertBB can be reached from InstBB by walking up a chain
  // of single predecessors, i.e. the insertion point is "below" InstBB.
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(Ptr: InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  // Collect the lanes whose insertion must be postponed to the end.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(Val: VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           isVectorized(V: Inst) ||
           (L && (!Root || L->isLoopInvariant(V: Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(V: I).second)
        PostponedInsts.emplace_back(Args&: Inst, Args&: I);
  }

  // Inserts scalar V into Vec at lane Pos, casting integers to Ty if needed,
  // and records an external use when V belongs to a live vectorized entry.
  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      assert(Scalar->getType()->isIntOrIntVectorTy() &&
             Ty->isIntOrIntVectorTy() && "Expected integer types only.");
      Value *V = Scalar;
      // Look through a sext/zext whose source is neither deleted nor
      // vectorized and cast the original (narrower) operand directly.
      if (auto *CI = dyn_cast<CastInst>(Val: Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(Val: CI)) {
        Value *Op = CI->getOperand(i_nocapture: 0);
        if (auto *IOp = dyn_cast<Instruction>(Val: Op);
            !IOp || !(isDeleted(I: IOp) || isVectorized(V: IOp)))
          V = Op;
      }
      Scalar = Builder.CreateIntCast(
          V, DestTy: Ty, isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
    }

    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Scalar->getType())) {
      // Revectorization: the "scalar" is itself a vector, insert it as a
      // subvector at the corresponding offset.
      assert(SLPReVec && "FixedVectorType is not expected.");
      Vec =
          createInsertVector(Builder, Vec, V: Scalar, Index: Pos * getNumElements(Ty: VecTy));
      auto *II = dyn_cast<Instruction>(Val: Vec);
      if (!II)
        return Vec;
      InsElt = II;
    } else {
      Vec = Builder.CreateInsertElement(Vec, NewElt: Scalar, Idx: Builder.getInt32(C: Pos));
      InsElt = dyn_cast<InsertElementInst>(Val: Vec);
      if (!InsElt)
        return Vec;
    }
    GatherShuffleExtractSeq.insert(X: InsElt);
    CSEBlocks.insert(V: InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(Val: V)) {
      // Only consider entries that are still alive (not deleted and not
      // turned back into gather nodes).
      ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
      const auto *It = find_if(Range&: Entries, P: [&](const TreeEntry *E) {
        return !TransformedToGatherNodes.contains(Val: E) &&
               !DeletedNodes.contains(Ptr: E);
      });
      if (It != Entries.end()) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          // The scalar was casted; the cast instruction is the user.
          if (auto *SI = dyn_cast<Instruction>(Val: Scalar))
            UserOp = SI;
        } else {
          if (V->getType()->isVectorTy()) {
            if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: InsElt);
                SV && SV->getOperand(i_nocapture: 0) != V && SV->getOperand(i_nocapture: 1) != V) {
              // Find shufflevector, caused by resize.
              auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
                if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Vec)) {
                  if (SV->getOperand(i_nocapture: 0) == V)
                    return SV;
                  if (SV->getOperand(i_nocapture: 1) == V)
                    return SV;
                }
                return nullptr;
              };
              InsElt = nullptr;
              if (Instruction *User = FindOperand(SV->getOperand(i_nocapture: 0), V))
                InsElt = User;
              else if (Instruction *User = FindOperand(SV->getOperand(i_nocapture: 1), V))
                InsElt = User;
              assert(InsElt &&
                     "Failed to find shufflevector, caused by resize.");
            }
          }
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = (*It)->findLaneForValue(V);
          ExternalUses.emplace_back(Args&: V, Args&: UserOp, Args&: **It, Args&: FoundLane);
        }
      }
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
  Value *Vec = PoisonValue::get(T: VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
  Value *OriginalRoot = Root;
  // Look through a single-source shuffle of the root: shuffle its source
  // directly using the root's mask instead.
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Val: Root);
      SV && isa<PoisonValue>(Val: SV->getOperand(i_nocapture: 1)) &&
      SV->getOperand(i_nocapture: 0)->getType() == VecTy) {
    Root = SV->getOperand(i_nocapture: 0);
    Mask.assign(in_start: SV->getShuffleMask().begin(), in_end: SV->getShuffleMask().end());
  }
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(V: I))
      continue;
    if (!isConstant(V: VL[I])) {
      NonConsts.push_back(Elt: I);
      continue;
    }
    if (isa<PoisonValue>(Val: VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    Mask[I] = I + E;
  }
  // Blend the constant vector into the root; erase the original root shuffle
  // if it became dead and is not used as a vectorized value.
  if (Root) {
    if (isa<PoisonValue>(Val: Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(Val: OriginalRoot);
          OI && OI->use_empty() &&
          none_of(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(I: OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, in the end to make
  // it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
19272
19273/// Merges shuffle masks and emits final shuffle instruction, if required. It
19274/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
19275/// when the actual shuffle instruction is generated only if this is actually
19276/// required. Otherwise, the shuffle instruction emission is delayed till the
19277/// end of the process, to reduce the number of emitted instructions and further
19278/// analysis/transformations.
19279/// The class also will look through the previously emitted shuffle instructions
19280/// and properly mark indices in mask as undef.
19281/// For example, given the code
19282/// \code
19283/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
19284/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
19285/// \endcode
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
19287/// look through %s1 and %s2 and emit
19288/// \code
19289/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
19290/// \endcode
19291/// instead.
19292/// If 2 operands are of different size, the smallest one will be resized and
19293/// the mask recalculated properly.
19294/// For example, given the code
19295/// \code
19296/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
19297/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
19298/// \endcode
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
19300/// look through %s1 and %s2 and emit
19301/// \code
19302/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
19303/// \endcode
19304/// instead.
19305class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
19306 bool IsFinalized = false;
19307 /// Combined mask for all applied operands and masks. It is built during
19308 /// analysis and actual emission of shuffle vector instructions.
19309 SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at most 2
19311 /// operands, if the 3rd is going to be added, the first 2 are combined into
19312 /// shuffle with \p CommonMask mask, the first operand sets to be the
19313 /// resulting shuffle and the second operand sets to be the newly added
19314 /// operand. The \p CommonMask is transformed in the proper way after that.
19315 SmallVector<Value *, 2> InVectors;
19316 IRBuilderBase &Builder;
19317 BoUpSLP &R;
19318
19319 class ShuffleIRBuilder {
19320 IRBuilderBase &Builder;
19321 /// Holds all of the instructions that we gathered.
19322 SetVector<Instruction *> &GatherShuffleExtractSeq;
19323 /// A list of blocks that we are going to CSE.
19324 DenseSet<BasicBlock *> &CSEBlocks;
19325 /// Data layout.
19326 const DataLayout &DL;
19327
19328 public:
19329 ShuffleIRBuilder(IRBuilderBase &Builder,
19330 SetVector<Instruction *> &GatherShuffleExtractSeq,
19331 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
19332 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
19333 CSEBlocks(CSEBlocks), DL(DL) {}
19334 ~ShuffleIRBuilder() = default;
19335 /// Creates shufflevector for the 2 operands with the given mask.
19336 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
19337 if (V1->getType() != V2->getType()) {
19338 assert(V1->getType()->isIntOrIntVectorTy() &&
19339 V1->getType()->isIntOrIntVectorTy() &&
19340 "Expected integer vector types only.");
19341 if (V1->getType() != V2->getType()) {
19342 if (cast<VectorType>(Val: V2->getType())
19343 ->getElementType()
19344 ->getIntegerBitWidth() < cast<VectorType>(Val: V1->getType())
19345 ->getElementType()
19346 ->getIntegerBitWidth())
19347 V2 = Builder.CreateIntCast(
19348 V: V2, DestTy: V1->getType(), isSigned: !isKnownNonNegative(V: V2, SQ: SimplifyQuery(DL)));
19349 else
19350 V1 = Builder.CreateIntCast(
19351 V: V1, DestTy: V2->getType(), isSigned: !isKnownNonNegative(V: V1, SQ: SimplifyQuery(DL)));
19352 }
19353 }
19354 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
19355 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
19356 GatherShuffleExtractSeq.insert(X: I);
19357 CSEBlocks.insert(V: I->getParent());
19358 }
19359 return Vec;
19360 }
19361 /// Creates permutation of the single vector operand with the given mask, if
19362 /// it is not identity mask.
19363 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
19364 if (Mask.empty())
19365 return V1;
19366 unsigned VF = Mask.size();
19367 unsigned LocalVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
19368 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))
19369 return V1;
19370 Value *Vec = Builder.CreateShuffleVector(V: V1, Mask);
19371 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
19372 GatherShuffleExtractSeq.insert(X: I);
19373 CSEBlocks.insert(V: I->getParent());
19374 }
19375 return Vec;
19376 }
19377 Value *createIdentity(Value *V) { return V; }
19378 Value *createPoison(Type *Ty, unsigned VF) {
19379 return PoisonValue::get(T: getWidenedType(ScalarTy: Ty, VF));
19380 }
19381 /// Resizes 2 input vector to match the sizes, if the they are not equal
19382 /// yet. The smallest vector is resized to the size of the larger vector.
19383 void resizeToMatch(Value *&V1, Value *&V2) {
19384 if (V1->getType() == V2->getType())
19385 return;
19386 int V1VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
19387 int V2VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
19388 int VF = std::max(a: V1VF, b: V2VF);
19389 int MinVF = std::min(a: V1VF, b: V2VF);
19390 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
19391 std::iota(first: IdentityMask.begin(), last: std::next(x: IdentityMask.begin(), n: MinVF),
19392 value: 0);
19393 Value *&Op = MinVF == V1VF ? V1 : V2;
19394 Op = Builder.CreateShuffleVector(V: Op, Mask: IdentityMask);
19395 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
19396 GatherShuffleExtractSeq.insert(X: I);
19397 CSEBlocks.insert(V: I->getParent());
19398 }
19399 if (MinVF == V1VF)
19400 V1 = Op;
19401 else
19402 V2 = Op;
19403 }
19404 };
19405
  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction. Delegates to BaseShuffleAnalysis::createShuffle with a
  /// ShuffleIRBuilder, which registers all emitted instructions for CSE.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(
        V1, V2, Mask, Builder&: ShuffleBuilder, ScalarTy);
  }
19416
  /// Cast value \p V to the vector type with the same number of elements, but
  /// the base type \p ScalarTy. Returns \p V unchanged when the element types
  /// already match. If \p IsSigned is not provided, signedness of the cast is
  /// derived from known-non-negativity of \p V.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(Val: V->getType());
    assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V, DestTy: VectorType::get(ElementType: ScalarTy->getScalarType(), EC: VecTy->getElementCount()),
        isSigned: IsSigned.value_or(u: !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL))));
  }
19429
  /// Returns the vectorized value of entry \p E, cast to this builder's
  /// ScalarTy element type if it is an integer vector. The cast is treated as
  /// signed if any non-poison scalar of \p E may be negative.
  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    if (!Vec->getType()->isIntOrIntVectorTy())
      return Vec;
    return castToScalarTyElem(V: Vec, IsSigned: any_of(Range: E.Scalars, P: [&](Value *V) {
                                return !isa<PoisonValue>(Val: V) &&
                                       !isKnownNonNegative(
                                           V, SQ: SimplifyQuery(*R.DL));
                              }));
  }
19440
public:
  /// \p ScalarTy is the element type of the vectors being built; \p R gives
  /// access to the vectorizer state (tree entries, CSE sets, data layout).
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
19444
  /// Adjusts extractelements after reusing them: rewrites \p Mask in terms of
  /// the (vectorized) source vectors of the extracts, erases extracts whose
  /// only use was vectorized, and returns the combined base vector.
  /// \p UseVecBaseAsInput is set to true when a multi-register shuffle had to
  /// be built and its result must be used as the input vector.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(Scalars&: VL, Mask: ReorderMask);
    }
    // Collect the unique source vectors of the used extracts and erase
    // extracts that became dead after vectorization.
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      int Idx = Mask[I];
      if (Idx == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(Val: VL[I]);
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(Ptr: VecBase);
      // If the only one use is vectorized - can delete the extractelement
      // itself.
      // Keep the extract alive if it has other uses, is used externally as an
      // original scalar, occurs a different number of times in this node than
      // in its user node, spans multiple parts, or any of its users is not
      // (fully) vectorized.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(Ptr: EI) ||
          (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
           !R.isVectorized(V: EI) &&
           count_if(Range: E->Scalars, P: [&](Value *V) { return V == EI; }) !=
               count_if(Range&: E->UserTreeIndex.UserTE->Scalars,
                        P: [&](Value *V) { return V == EI; })) ||
          (NumParts != 1 && count(Range&: VL, Element: EI) > 1) ||
          any_of(Range: EI->users(), P: [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(V: U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   any_of(Range&: UTEs,
                          P: [&](const TreeEntry *TE) {
                            return R.DeletedNodes.contains(Ptr: TE) ||
                                   R.TransformedToGatherNodes.contains(Val: TE);
                          }) ||
                   (isa<GetElementPtrInst>(Val: U) &&
                    !R.areAllUsersVectorized(I: cast<Instruction>(Val: U))) ||
                   (!UTEs.empty() &&
                    count_if(Range&: R.VectorizableTree,
                             P: [&](const std::unique_ptr<TreeEntry> &TE) {
                               return TE->UserTreeIndex.UserTE ==
                                          UTEs.front() &&
                                      is_contained(Range&: VL, Element: EI);
                             }) != 1);
          }))
        continue;
      R.eraseInstruction(I: EI);
    }
    // Single part or single source vector: no cross-register joining needed,
    // just return the base cast to the right element type.
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(V: VecBase);
    }
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(First&: Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform multi-register vector shuffle, joining them into a single virtual
    // long vector.
    // Need to shuffle each part independently and then insert all this parts
    // into a long virtual vector register, forming the original vector.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(Size: NumParts)) {
      unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
      ArrayRef<Value *> SubVL = ArrayRef(VL).slice(N: Part * SliceSize, M: Limit);
      MutableArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(t&: SubVL, u&: SubMask);
      // The part's VF is the widest of its (vectorized) source vectors.
      const unsigned VF = std::accumulate(
          first: VLMask.begin(), last: VLMask.end(), init: 0U, binary_op: [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(Val: VecOp->getType())->getNumElements();
            return std::max(a: S, b: Size);
          });
      // Assign each used source vector to one of the (at most 2) base slots.
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(Val: V)->getVectorOperand();
        if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp); !TEs.empty())
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(V: VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(V1: Bases.front(), V2: Bases.back(), Mask: SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 || all_of(seq<unsigned>(0, Part),
                                    [&](unsigned P) {
                                      ArrayRef<int> SubMask =
                                          Mask.slice(P * SliceSize,
                                                     getNumElems(Mask.size(),
                                                                 SliceSize, P));
                                      return all_of(SubMask, [](int Idx) {
                                        return Idx == PoisonMaskElem;
                                      });
                                    })) &&
               "Expected first part or all previous parts masked.");
        copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(Val: SubVec->getType())->getNumElements();
          NewVF = std::max(a: NewVF, b: SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
        Vec = createShuffle(V1: Vec, V2: SubVec, Mask: VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(Range&: VecMask, Out: Mask.begin());
    return Vec;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes. Returns std::nullopt when all dependencies are already
  /// vectorized; otherwise returns a placeholder value (a load of a poison
  /// pointer) to be replaced once the dependencies have been emitted.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Range&: Deps, P: [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              Range&: TEs, P: [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, VF: E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        Ty: ResVecTy,
        Ptr: PoisonValue::get(T: PointerType::getUnqual(C&: ScalarTy->getContext())),
        Align: MaybeAlign());
  }
19608 /// Reset the builder to handle perfect diamond match.
19609 void resetForSameNode() {
19610 IsFinalized = false;
19611 CommonMask.clear();
19612 InVectors.clear();
19613 }
19614 /// Adds 2 input vectors (in form of tree entries) and the mask for their
19615 /// shuffling.
19616 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
19617 Value *V1 = getVectorizedValue(E: E1);
19618 Value *V2 = getVectorizedValue(E: E2);
19619 add(V1, V2, Mask);
19620 }
19621 /// Adds single input vector (in form of tree entry) and the mask for its
19622 /// shuffling.
19623 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
19624 Value *V1 = getVectorizedValue(E: E1);
19625 add(V1, Mask);
19626 }
  /// Adds 2 input vectors and the mask for their shuffling. If operands are
  /// already queued, the queued pair is folded into a single vector first and
  /// the common mask is redirected to address the new operand's lanes.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V: V1);
    V2 = castToScalarTyElem(V: V2);
    if (InVectors.empty()) {
      InVectors.push_back(Elt: V1);
      InVectors.push_back(Elt: V2);
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      return;
    }
    // Operands already queued: fold them into a single vector so the new pair
    // can occupy the second slot.
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    } else if (cast<FixedVectorType>(Val: Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    // Lanes taken from the new operand are addressed past the first operand's
    // lanes in the combined mask.
    unsigned VF = std::max(a: getVF(V: V1), b: getVF(V: Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(Elt: V1);
  }
  /// Adds another one input vector and the mask for the shuffling. The unnamed
  /// bool parameter is unused in this overload (kept for interface
  /// compatibility).
  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V: V1);
    if (InVectors.empty()) {
      InVectors.push_back(Elt: V1);
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      return;
    }
    const auto *It = find(Range&: InVectors, Val: V1);
    if (It == InVectors.end()) {
      // V1 is a new vector. If both slots are taken or types mismatch, fold
      // the queued operands into a single vector first.
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
          transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
        } else if (cast<FixedVectorType>(Val: V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
          transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
        }
        unsigned VF = std::max(a: CommonMask.size(), b: Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? Idx + VF
                                  : Mask[Idx] + getVF(V: V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, V2: nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(Elt: V1);
        return;
      }
      // Check if second vector is required if the used elements are already
      // used from the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(Elt: V1);
          break;
        }
    }
    // Merge the new mask into the common mask, offsetting indices unless V1 is
    // (or matches) the first queued operand.
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(a: VF, b: getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
19714 /// Adds another one input vector and the mask for the shuffling.
19715 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
19716 SmallVector<int> NewMask;
19717 inversePermutation(Indices: Order, Mask&: NewMask);
19718 add(V1, Mask: NewMask);
19719 }
  /// Gathers the scalars \p VL into a vector (optionally blending into
  /// \p Root), delegating to BoUpSLP::gather with this builder's shuffle
  /// emission. Note: \p MaskVF is not used in this implementation.
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    CreateShuffle: [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  /// Emits a freeze instruction for \p V (stops poison/undef propagation).
  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
  /// Finalize emission of the shuffles.
  /// Flushes the accumulated input vectors and CommonMask into actual shuffle
  /// instructions, inserts the \p SubVectors at their positions, and applies
  /// the external \p ExtMask (e.g. reuse-shuffle indices) last.
  /// \param Action the action (if any) to be performed before final applying of
  /// the \p ExtMask mask.
  /// \param VF required vector length of the value handed to \p Action; must
  /// be non-zero when \p Action is set.
  Value *finalize(
      ArrayRef<int> ExtMask,
      ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
      ArrayRef<int> SubVectorsMask, unsigned VF = 0,
      function_ref<void(Value *&, SmallVectorImpl<int> &,
                        function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
          Action = {}) {
    IsFinalized = true;
    if (Action) {
      // Materialize the pending shuffle(s) into a single vector first, so the
      // callback sees one value plus an identity-transformed CommonMask.
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
      if (VecVF < VF) {
        // Widen the vector to VF lanes with poison tail before the action.
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
        Vec = createShuffle(V1: Vec, V2: nullptr, Mask: ResizeMask);
      }
      Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
        return createShuffle(V1, V2, Mask);
      });
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      // Flush pending shuffles, then insert each subvector entry at its
      // element offset via insertvector sequences.
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(E: *E);
          unsigned InsertionIndex = Idx * getNumElements(Ty: ScalarTy);
          // Use scalar version of the ScalarTy to correctly handle shuffles
          // for revectorization. The revectorization mode operates by the
          // vectors, but here we need to operate on the scalars, because the
          // masks were already transformed for the vector elements and we don't
          // need doing this transformation again.
          Type *OrigScalarTy = ScalarTy;
          ScalarTy = ScalarTy->getScalarType();
          Vec = createInsertVector(
              Builder, Vec, V, Index: InsertionIndex,
              Generator: std::bind(f: &ShuffleInstructionBuilder::createShuffle, args: this, args: _1, args: _2,
                        args: _3));
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            // The inserted lanes now come straight from the combined vector:
            // make the mask identity over the inserted range.
            std::iota(first: std::next(x: CommonMask.begin(), n: Idx),
                      last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()),
                      value: Idx);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        // Insert subvectors into a poison vector and blend with the original
        // via SVMask; CommonMask lanes are redirected to the second source.
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(Range&: SubVectorsMask, Out: SVMask.begin());
        for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(T: Vec->getType()), CommonMask);
        Vec = createShuffle(V1: InsertVec, V2: Vec, Mask: SVMask);
        transformMaskAfterShuffle(CommonMask, Mask: SVMask);
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
      } else {
        // Compose ExtMask on top of CommonMask.
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(RHS&: NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
    return createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
  }
19835
  ~ShuffleInstructionBuilder() {
    // Catch missed finalize() calls: a non-empty CommonMask means accumulated
    // shuffles were never emitted.
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
19840};
19841
/// Vectorizes the operand number \p NodeIdx of the node \p E by emitting code
/// for its corresponding operand tree entry.
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  return vectorizeTree(E: getOperandEntry(E, Idx: NodeIdx));
}
19845
/// Emits a gather/buildvector-like sequence (or, depending on \p BVTy, its
/// cost-model counterpart) for the scalars of the gather node \p E. Tries, in
/// order: reusing extractelement source vectors, reusing already vectorized
/// tree entries via shuffles, and finally building the vector directly from
/// (constant and non-constant) scalars, possibly with a broadcast + freeze.
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
         "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Do not process split vectorize node, marked to be gathers/buildvectors.
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  if (E->State == TreeEntry::SplitVectorize &&
      TransformedToGatherNodes.contains(Val: E)) {
    SubVectors.clear();
  } else {
    // Clear values, to be replaced by insertvector instructions.
    for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
      for_each(MutableArrayRef(GatheredScalars)
                   .slice(N: Idx, M: VectorizableTree[EIdx]->getVectorFactor()),
               [&](Value *&V) { V = PoisonValue::get(T: V->getType()); });
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
  }
  // Build a mask out of the reorder indices and reorder scalars per this
  // mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
  SmallVector<int> SubVectorsMask;
  inversePermutation(Indices: E->ReorderIndices, Mask&: SubVectorsMask);
  // Transform non-clustered elements in the mask to poison (-1).
  // "Clustered" operations will be reordered using this mask later.
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(Size: GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  // Keep the original (pre-poisoning) scalars for extractelement analysis.
  SmallVector<Value *> StoredGS(GatheredScalars);
  // Checks whether the gather node is a splat whose single input vector can be
  // reused directly; if so, rewrites the given mask slice into a splat mask
  // and reports whether the value is used in the expression directly.
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(VL: E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      // A possibly-poisonous splat is only safe when the sibling operand of
      // the same user proves the undef lanes are covered.
      auto *It = find_if(ArrayRef(VectorizableTree).drop_front(N: UserTE->Idx + 1),
                         [=](const std::unique_ptr<TreeEntry> &TE) {
                           return TE->UserTreeIndex.UserTE == UserTE &&
                                  TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                         });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(Scalars&: GS, Mask: ReorderMask);
      }
      if (!all_of(zip(t&: GatheredScalars, u&: GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(Val: V0) || isa<PoisonValue>(Val: V0) ||
                   (isa<UndefValue>(Val: V0) && !isa<PoisonValue>(Val: V0) &&
                    is_contained(Range: E->Scalars, Element: V1));
          }))
        return false;
    }
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: InputVF, Index&: Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))) {
      // Identity/leading-subvector use: make the slice an identity mask.
      std::iota(
          first: std::next(x: Mask.begin(), n: I * SliceSize),
          last: std::next(x: Mask.begin(),
                    n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
          value: 0);
    } else {
      // Otherwise broadcast the single used lane across the slice.
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(
          first: std::next(x: Mask.begin(), n: I * SliceSize),
          last: std::next(x: Mask.begin(),
                    n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
          value: IVal);
    }
    return true;
  };
  // Builder performing the actual emission (or cost estimation, depending on
  // BVTy).
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, VF: GatheredScalars.size());
  unsigned NumParts = ::getNumberOfParts(TTI: *TTI, VecTy, Limit: GatheredScalars.size());
  if (!all_of(Range&: GatheredScalars, P: IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(First&: ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
                V: cast<ExtractElementInst>(Val: StoredGS[Idx])->getVectorOperand());
            !TEs.empty())
          ExtractEntries.append(in_start: TEs.begin(), in_end: TEs.end());
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(X: E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(Val: VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            // Pad the scalars with poison to match the extract base width.
            Resized = true;
            GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
                                   Elt: PoisonValue::get(T: OrigScalarTy));
            NumParts =
                ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF), Limit: VF);
          }
      }
    }
    // Gather extracts after we check for full matched gathers only.
    if (!ExtractShuffles.empty() || !E->hasState() ||
        E->getOpcode() != Instruction::Load ||
        (((E->hasState() && E->getOpcode() == Instruction::Load) ||
          any_of(Range: E->Scalars, P: IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(Val: V) && isVectorized(V);
                })) ||
        (E->hasState() && E->isAltShuffle()) ||
        all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
        isSplat(VL: E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(TE: E, VL: GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(X: E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(VL: E->Scalars)) {
        // Perfect match in the graph, will reuse the previously vectorized
        // node. Cost is 0.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars, E->Idx) << ".\n");
        // Restore the mask for previous partially matched values.
        Mask.resize(N: E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
        } else {
          for (auto [I, V] : enumerate(First: E->Scalars)) {
            if (isa<PoisonValue>(Val: V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        // Reset the builder(s) to correctly handle perfect diamond matched
        // nodes.
        ShuffleBuilder.resetForSameNode();
        // Full matched entry found, no need to insert subvectors.
        if (equal(LRange: E->Scalars, RRange: FrontTE->Scalars) &&
            equal(LRange: E->ReorderIndices, RRange: FrontTE->ReorderIndices) &&
            equal(LRange: E->ReuseShuffleIndices, RRange: FrontTE->ReuseShuffleIndices)) {
          Mask.resize(N: FrontTE->getVectorFactor());
          std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
          ShuffleBuilder.add(*FrontTE, Mask);
          Res = ShuffleBuilder.finalize({}, {}, {});
        } else {
          ShuffleBuilder.add(*FrontTE, Mask);
          Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
        }
        return Res;
      }
      if (!Resized) {
        // Match the widest reused entry's vector factor by padding with
        // poison.
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
                                 Elt: PoisonValue::get(T: OrigScalarTy));
      }
      // Remove shuffled elements from list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
      }
    }
  }
  // Deduplicates \p Scalars in place and fills \p ReuseMask so repeated
  // values become shuffles of a single copy; detects splats and, for them,
  // may replace undef lanes by a known-safe broadcast or request a freeze.
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(VL: Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(NumInputs: VF - Scalars.size(), Elt: PoisonValue::get(T: OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(First&: Scalars)) {
      if (isa<UndefValue>(Val: V)) {
        if (!isa<PoisonValue>(Val: V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(Elt: I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(T: OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(Key: OrigV, Args&: I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        ReuseMask.assign(NumElts: VF, Elt: PoisonMaskElem);
        std::swap(a&: Scalars.front(), b&: Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(T: OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with the simple broadcast.
      // We can do it if the broadcasted value is guaranteed to be
      // non-poisonous, or by freezing the incoming scalar value first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(Val: V) &&
               (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
                (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
                   // Check if the value already used in the same operation in
                   // one of the nodes already.
                   return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                          is_contained(Range&: E->UserTreeIndex.UserTE->Scalars,
                                       Element: U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // Replace the undef by the poison, in the mask it is replaced by
          // non-poisoned scalar already.
          if (I != Pos)
            Scalars[I] = PoisonValue::get(T: OrigScalarTy);
        }
      } else {
        // Replace undefs by the poisons, emit broadcast and then emit
        // freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Val: Scalars[I]))
            Scalars[I] = PoisonValue::get(T: OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  // Combine the found extract vectors and/or reused entries with the
  // remaining constant/non-constant scalars.
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // a single/two vectors the scalars are extracted from.
      // Find input vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(Val: StoredGS[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(Val: StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &= isGuaranteedNotToBePoison(V: Vec1, AC) &&
                         isGuaranteedNotToBePoison(V: Vec2, AC);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(V: Vec1, AC);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Val: Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(T: VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      // Add per-part shuffles of the matched tree entries.
      unsigned SliceSize =
          getPartNumElems(Size: E->Scalars.size(),
                          NumParts: ::getNumberOfParts(TTI: *TTI, VecTy, Limit: E->Scalars.size()));
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(First&: Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I);
        auto SubMask = ArrayRef(Mask).slice(N: I * SliceSize, M: Limit);
        VecMask.assign(NumElts: VecMask.size(), Elt: PoisonMaskElem);
        copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC) &&
                isGuaranteedNotToBePoison(V: TEs.back()->VectorizedValue, AC);
        }
      }
    }
    // Try to figure out best way to combine values: build a shuffle and insert
    // elements or just build several shuffles.
    // Insert non-constant scalars.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build constant vector and shuffle with it only if currently we
    // have a single permutation and more than 1 scalar constants.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask: ExtractMask, NumSrcElts: EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(Val: V) && !isa<UndefValue>(Val: V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(Val: V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(Val: V) && !isa<PoisonValue>(Val: V);
         }) > 1);
    // NonConstants array contains just non-constant values, GatheredScalars
    // contains only constant to build final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(Val: GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(T: OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
    }
    // Generate constants for final shuffle and build a mask for them.
    if (!all_of(Range&: GatheredScalars, P: IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(Val: V) ||
                 (IsSingleShuffle && ((IsIdentityShuffle &&
                   IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(Val: V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(VL: NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
            auto CheckIfSplatIsProfitable = [&]() {
              // Estimate the cost of splatting + shuffle and compare with
              // insert + shuffle.
              constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(Val: V) || isVectorized(V))
                return false;
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Opcode: Instruction::InsertElement, Val: VecTy, CostKind, /*Index=*/0,
                  Op0: PoisonValue::get(T: VecTy), Op1: V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(First&: BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy,
                                            Mask: NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Opcode: Instruction::InsertElement, Val: VecTy, CostKind,
                  Index: *find_if(Range&: Mask, P: not_equal_to(Arg: PoisonMaskElem)), Op0: Vec, Op1: V);
              // Shuffle required?
              if (count(Range&: BVMask, Element: PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(First&: BVMask))
                  if (I != PoisonMaskElem)
                    NewMask[Idx] = I;
                BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
                                           Tp: VecTy, Mask: NewMask, CostKind);
              }
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              for (auto [Idx, I] : enumerate(First&: BVMask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = I;
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
            } else {
              // Splat is cheaper: broadcast the single non-undef value and
              // blend it into the vector via a two-source shuffle.
              Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>);
              SmallVector<Value *> Values(NonConstants.size(),
                                          PoisonValue::get(T: ScalarTy));
              Values[0] = V;
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              });
              if (!ShuffleVectorInst::isIdentityMask(Mask: SplatMask, NumSrcElts: VF))
                BV = CreateShuffle(BV, nullptr, SplatMask);
              for (auto [Idx, I] : enumerate(First&: BVMask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
              for (auto [Idx, I] : enumerate(First&: Mask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = Idx;
            }
          });
  } else if (!allConstant(VL: GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(First&: GatheredScalars)) {
      if (!isa<PoisonValue>(Val: V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  // Freeze the result if undef lanes were replaced by a possibly-poisonous
  // broadcast value.
  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
20408
/// Emits the actual IR for the gather/buildvector node \p E via
/// processBuildVector instantiated with the IR-emitting builder.
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  // Do not do this for split vectorize node, marked to be gathers/buildvectors.
  if (E->State != TreeEntry::SplitVectorize ||
      !TransformedToGatherNodes.contains(Val: E)) {
    // Vectorize the combined entries first so their values are available when
    // the subvectors are inserted.
    for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
      (void)vectorizeTree(E: VectorizableTree[EIdx].get());
  }
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Params&: Builder, Params&: *this);
}
20419
20420/// \returns \p I after propagating metadata from \p VL only for instructions in
20421/// \p VL.
20422static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
20423 SmallVector<Value *> Insts;
20424 for (Value *V : VL)
20425 if (isa<Instruction>(Val: V))
20426 Insts.push_back(Elt: V);
20427 return llvm::propagateMetadata(I: Inst, VL: Insts);
20428}
20429
20430static DebugLoc getDebugLocFromPHI(PHINode &PN) {
20431 if (DebugLoc DL = PN.getDebugLoc())
20432 return DL;
20433 return DebugLoc::getUnknown();
20434}
20435
20436Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
20437 IRBuilderBase::InsertPointGuard Guard(Builder);
20438
20439 Value *V = E->Scalars.front();
20440 Type *ScalarTy = V->getType();
20441 if (!isa<CmpInst>(Val: V))
20442 ScalarTy = getValueType(V);
20443 auto It = MinBWs.find(Val: E);
20444 if (It != MinBWs.end()) {
20445 auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy);
20446 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
20447 if (VecTy)
20448 ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements());
20449 }
20450 if (E->VectorizedValue)
20451 return E->VectorizedValue;
20452 auto *VecTy = getWidenedType(ScalarTy, VF: E->Scalars.size());
20453 if (E->isGather() || TransformedToGatherNodes.contains(Val: E)) {
20454 // Set insert point for non-reduction initial nodes.
20455 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
20456 setInsertPointAfterBundle(E);
20457 Value *Vec = createBuildVector(E, ScalarTy);
20458 E->VectorizedValue = Vec;
20459 return Vec;
20460 }
20461 if (E->State == TreeEntry::SplitVectorize) {
20462 assert(E->CombinedEntriesWithIndices.size() == 2 &&
20463 "Expected exactly 2 combined entries.");
20464 setInsertPointAfterBundle(E);
20465 TreeEntry &OpTE1 =
20466 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
20467 assert(OpTE1.isSame(
20468 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
20469 "Expected same first part of scalars.");
20470 Value *Op1 = vectorizeTree(E: &OpTE1);
20471 TreeEntry &OpTE2 =
20472 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
20473 assert(
20474 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
20475 "Expected same second part of scalars.");
20476 Value *Op2 = vectorizeTree(E: &OpTE2);
20477 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
20478 bool IsSigned = false;
20479 auto It = MinBWs.find(Val: OpE);
20480 if (It != MinBWs.end())
20481 IsSigned = It->second.second;
20482 else
20483 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
20484 if (isa<PoisonValue>(Val: V))
20485 return false;
20486 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
20487 });
20488 return IsSigned;
20489 };
20490 if (cast<VectorType>(Val: Op1->getType())->getElementType() !=
20491 ScalarTy->getScalarType()) {
20492 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20493 Op1 = Builder.CreateIntCast(
20494 V: Op1,
20495 DestTy: getWidenedType(
20496 ScalarTy,
20497 VF: cast<FixedVectorType>(Val: Op1->getType())->getNumElements()),
20498 isSigned: GetOperandSignedness(&OpTE1));
20499 }
20500 if (cast<VectorType>(Val: Op2->getType())->getElementType() !=
20501 ScalarTy->getScalarType()) {
20502 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20503 Op2 = Builder.CreateIntCast(
20504 V: Op2,
20505 DestTy: getWidenedType(
20506 ScalarTy,
20507 VF: cast<FixedVectorType>(Val: Op2->getType())->getNumElements()),
20508 isSigned: GetOperandSignedness(&OpTE2));
20509 }
20510 if (E->ReorderIndices.empty()) {
20511 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
20512 std::iota(
20513 first: Mask.begin(),
20514 last: std::next(x: Mask.begin(), n: E->CombinedEntriesWithIndices.back().second),
20515 value: 0);
20516 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
20517 if (ScalarTyNumElements != 1) {
20518 assert(SLPReVec && "Only supported by REVEC.");
20519 transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask);
20520 }
20521 Value *Vec = Builder.CreateShuffleVector(V: Op1, Mask);
20522 Vec = createInsertVector(Builder, Vec, V: Op2,
20523 Index: E->CombinedEntriesWithIndices.back().second *
20524 ScalarTyNumElements);
20525 E->VectorizedValue = Vec;
20526 return Vec;
20527 }
20528 unsigned CommonVF =
20529 std::max(a: OpTE1.getVectorFactor(), b: OpTE2.getVectorFactor());
20530 const unsigned Scale = getNumElements(Ty: ScalarTy);
20531 CommonVF *= Scale;
20532 if (getNumElements(Ty: Op1->getType()) != CommonVF) {
20533 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
20534 copy(Range: createReplicatedMask(ReplicationFactor: Scale, VF: OpTE1.getVectorFactor() * Scale),
20535 Out: Mask.begin());
20536 Op1 = Builder.CreateShuffleVector(V: Op1, Mask);
20537 }
20538 if (getNumElements(Ty: Op2->getType()) != CommonVF) {
20539 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
20540 copy(Range: createReplicatedMask(ReplicationFactor: Scale, VF: OpTE2.getVectorFactor() * Scale),
20541 Out: Mask.begin());
20542 Op2 = Builder.CreateShuffleVector(V: Op2, Mask);
20543 }
20544 Value *Vec = Builder.CreateShuffleVector(V1: Op1, V2: Op2, Mask: E->getSplitMask());
20545 E->VectorizedValue = Vec;
20546 return Vec;
20547 }
20548
20549 bool IsReverseOrder =
20550 !E->ReorderIndices.empty() && isReverseOrder(Order: E->ReorderIndices);
20551 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
20552 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
20553 if (E->getOpcode() == Instruction::Store &&
20554 E->State == TreeEntry::Vectorize) {
20555 ArrayRef<int> Mask =
20556 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
20557 E->ReorderIndices.size());
20558 ShuffleBuilder.add(V1: V, Mask);
20559 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
20560 E->State == TreeEntry::CompressVectorize) {
20561 ShuffleBuilder.addOrdered(V1: V, Order: {});
20562 } else {
20563 ShuffleBuilder.addOrdered(V1: V, Order: E->ReorderIndices);
20564 }
20565 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
20566 E->CombinedEntriesWithIndices.size());
20567 transform(
20568 Range: E->CombinedEntriesWithIndices, d_first: SubVectors.begin(), F: [&](const auto &P) {
20569 return std::make_pair(VectorizableTree[P.first].get(), P.second);
20570 });
20571 assert(
20572 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
20573 "Expected either combined subnodes or reordering");
20574 return ShuffleBuilder.finalize(ExtMask: E->ReuseShuffleIndices, SubVectors, SubVectorsMask: {});
20575 };
20576
20577 assert(!E->isGather() && "Unhandled state");
20578 unsigned ShuffleOrOp =
20579 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
20580 if (!E->isAltShuffle()) {
20581 switch (E->CombinedOp) {
20582 case TreeEntry::ReducedBitcast:
20583 case TreeEntry::ReducedBitcastBSwap:
20584 case TreeEntry::ReducedBitcastLoads:
20585 case TreeEntry::ReducedBitcastBSwapLoads:
20586 case TreeEntry::ReducedCmpBitcast:
20587 ShuffleOrOp = E->CombinedOp;
20588 break;
20589 default:
20590 break;
20591 }
20592 }
20593 Instruction *VL0 = E->getMainOp();
20594 auto GetOperandSignedness = [&](unsigned Idx) {
20595 const TreeEntry *OpE = getOperandEntry(E, Idx);
20596 bool IsSigned = false;
20597 auto It = MinBWs.find(Val: OpE);
20598 if (It != MinBWs.end())
20599 IsSigned = It->second.second;
20600 else
20601 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
20602 if (isa<PoisonValue>(Val: V))
20603 return false;
20604 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
20605 });
20606 return IsSigned;
20607 };
20608 switch (ShuffleOrOp) {
20609 case Instruction::PHI: {
20610 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
20611 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
20612 "PHI reordering is free.");
20613 auto *PH = cast<PHINode>(Val: VL0);
20614 Builder.SetInsertPoint(TheBB: PH->getParent(),
20615 IP: PH->getParent()->getFirstNonPHIIt());
20616 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
20617 PHINode *NewPhi = Builder.CreatePHI(Ty: VecTy, NumReservedValues: PH->getNumIncomingValues());
20618 Value *V = NewPhi;
20619
20620 // Adjust insertion point once all PHI's have been generated.
20621 Builder.SetInsertPoint(TheBB: PH->getParent(),
20622 IP: PH->getParent()->getFirstInsertionPt());
20623 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
20624
20625 V = FinalShuffle(V, E);
20626
20627 E->VectorizedValue = V;
20628 // If phi node is fully emitted - exit.
20629 if (NewPhi->getNumIncomingValues() != 0)
20630 return NewPhi;
20631
20632 // PHINodes may have multiple entries from the same block. We want to
20633 // visit every block once.
20634 SmallDenseMap<BasicBlock *, unsigned, 4> VisitedBBs;
20635 for (unsigned I : seq<unsigned>(Size: PH->getNumIncomingValues())) {
20636 BasicBlock *IBB = PH->getIncomingBlock(i: I);
20637
20638 // Stop emission if all incoming values are generated.
20639 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
20640 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
20641 return NewPhi;
20642 }
20643
20644 auto Res = VisitedBBs.try_emplace(Key: IBB, Args&: I);
20645 if (!Res.second) {
20646 TreeEntry *OpTE = getOperandEntry(E, Idx: I);
20647 if (OpTE->isGather() || DeletedNodes.contains(Ptr: OpTE) ||
20648 TransformedToGatherNodes.contains(Val: OpTE)) {
20649 Value *VecOp = NewPhi->getIncomingValue(i: Res.first->getSecond());
20650 NewPhi->addIncoming(V: VecOp, BB: IBB);
20651 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
20652 OpTE->VectorizedValue = VecOp;
20653 continue;
20654 }
20655 }
20656
20657 Builder.SetInsertPoint(IBB->getTerminator());
20658 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
20659 Value *Vec = vectorizeOperand(E, NodeIdx: I);
20660 if (VecTy != Vec->getType()) {
20661 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
20662 MinBWs.contains(getOperandEntry(E, I))) &&
20663 "Expected item in MinBWs.");
20664 Vec = Builder.CreateIntCast(V: Vec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
20665 }
20666 NewPhi->addIncoming(V: Vec, BB: IBB);
20667 }
20668
20669 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
20670 "Invalid number of incoming values");
20671 assert(E->VectorizedValue && "Expected vectorized value.");
20672 return E->VectorizedValue;
20673 }
20674
20675 case Instruction::ExtractElement: {
20676 Value *V = E->getSingleOperand(OpIdx: 0);
20677 setInsertPointAfterBundle(E);
20678 V = FinalShuffle(V, E);
20679 E->VectorizedValue = V;
20680 return V;
20681 }
20682 case Instruction::ExtractValue: {
20683 auto *LI = cast<LoadInst>(Val: E->getSingleOperand(OpIdx: 0));
20684 Builder.SetInsertPoint(LI);
20685 Value *Ptr = LI->getPointerOperand();
20686 LoadInst *V = Builder.CreateAlignedLoad(Ty: VecTy, Ptr, Align: LI->getAlign());
20687 Value *NewV = ::propagateMetadata(Inst: V, VL: E->Scalars);
20688 NewV = FinalShuffle(NewV, E);
20689 E->VectorizedValue = NewV;
20690 return NewV;
20691 }
20692 case Instruction::InsertElement: {
20693 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
20694 if (const TreeEntry *OpE = getOperandEntry(E, Idx: 1);
20695 OpE && !OpE->isGather() && OpE->hasState() &&
20696 !OpE->hasCopyableElements())
20697 Builder.SetInsertPoint(cast<Instruction>(Val: E->Scalars.back()));
20698 else
20699 setInsertPointAfterBundle(E);
20700 Value *V = vectorizeOperand(E, NodeIdx: 1);
20701 ArrayRef<Value *> Op = E->getOperand(OpIdx: 1);
20702 Type *ScalarTy = Op.front()->getType();
20703 if (cast<VectorType>(Val: V->getType())->getElementType() != ScalarTy) {
20704 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20705 std::pair<unsigned, bool> Res = MinBWs.lookup(Val: getOperandEntry(E, Idx: 1));
20706 assert(Res.first > 0 && "Expected item in MinBWs.");
20707 V = Builder.CreateIntCast(
20708 V,
20709 DestTy: getWidenedType(
20710 ScalarTy,
20711 VF: cast<FixedVectorType>(Val: V->getType())->getNumElements()),
20712 isSigned: Res.second);
20713 }
20714
20715 // Create InsertVector shuffle if necessary
20716 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range&: E->Scalars, P: [E](Value *V) {
20717 return !is_contained(Range&: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
20718 }));
20719 const unsigned NumElts =
20720 cast<FixedVectorType>(Val: FirstInsert->getType())->getNumElements();
20721 const unsigned NumScalars = E->Scalars.size();
20722
20723 unsigned Offset = *getElementIndex(Inst: VL0);
20724 assert(Offset < NumElts && "Failed to find vector index offset");
20725
20726 // Create shuffle to resize vector
20727 SmallVector<int> Mask;
20728 if (!E->ReorderIndices.empty()) {
20729 inversePermutation(Indices: E->ReorderIndices, Mask);
20730 Mask.append(NumInputs: NumElts - NumScalars, Elt: PoisonMaskElem);
20731 } else {
20732 Mask.assign(NumElts, Elt: PoisonMaskElem);
20733 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: NumScalars), value: 0);
20734 }
20735 // Create InsertVector shuffle if necessary
20736 bool IsIdentity = true;
20737 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
20738 Mask.swap(RHS&: PrevMask);
20739 for (unsigned I = 0; I < NumScalars; ++I) {
20740 Value *Scalar = E->Scalars[PrevMask[I]];
20741 unsigned InsertIdx = *getElementIndex(Inst: Scalar);
20742 IsIdentity &= InsertIdx - Offset == I;
20743 Mask[InsertIdx - Offset] = I;
20744 }
20745 if (!IsIdentity || NumElts != NumScalars) {
20746 Value *V2 = nullptr;
20747 bool IsVNonPoisonous =
20748 !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
20749 SmallVector<int> InsertMask(Mask);
20750 if (NumElts != NumScalars && Offset == 0) {
20751 // Follow all insert element instructions from the current buildvector
20752 // sequence.
20753 InsertElementInst *Ins = cast<InsertElementInst>(Val: VL0);
20754 do {
20755 std::optional<unsigned> InsertIdx = getElementIndex(Inst: Ins);
20756 if (!InsertIdx)
20757 break;
20758 if (InsertMask[*InsertIdx] == PoisonMaskElem)
20759 InsertMask[*InsertIdx] = *InsertIdx;
20760 if (!Ins->hasOneUse())
20761 break;
20762 Ins = dyn_cast_or_null<InsertElementInst>(
20763 Val: Ins->getUniqueUndroppableUser());
20764 } while (Ins);
20765 SmallBitVector UseMask =
20766 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
20767 SmallBitVector IsFirstPoison =
20768 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
20769 SmallBitVector IsFirstUndef =
20770 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
20771 if (!IsFirstPoison.all()) {
20772 unsigned Idx = 0;
20773 for (unsigned I = 0; I < NumElts; I++) {
20774 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I) &&
20775 IsFirstUndef.test(Idx: I)) {
20776 if (IsVNonPoisonous) {
20777 InsertMask[I] = I < NumScalars ? I : 0;
20778 continue;
20779 }
20780 if (!V2)
20781 V2 = UndefValue::get(T: V->getType());
20782 if (Idx >= NumScalars)
20783 Idx = NumScalars - 1;
20784 InsertMask[I] = NumScalars + Idx;
20785 ++Idx;
20786 } else if (InsertMask[I] != PoisonMaskElem &&
20787 Mask[I] == PoisonMaskElem) {
20788 InsertMask[I] = PoisonMaskElem;
20789 }
20790 }
20791 } else {
20792 InsertMask = Mask;
20793 }
20794 }
20795 if (!V2)
20796 V2 = PoisonValue::get(T: V->getType());
20797 V = Builder.CreateShuffleVector(V1: V, V2, Mask: InsertMask);
20798 if (auto *I = dyn_cast<Instruction>(Val: V)) {
20799 GatherShuffleExtractSeq.insert(X: I);
20800 CSEBlocks.insert(V: I->getParent());
20801 }
20802 }
20803
20804 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
20805 for (unsigned I = 0; I < NumElts; I++) {
20806 if (Mask[I] != PoisonMaskElem)
20807 InsertMask[Offset + I] = I;
20808 }
20809 SmallBitVector UseMask =
20810 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
20811 SmallBitVector IsFirstUndef =
20812 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
20813 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
20814 NumElts != NumScalars) {
20815 if (IsFirstUndef.all()) {
20816 if (!ShuffleVectorInst::isIdentityMask(Mask: InsertMask, NumSrcElts: NumElts)) {
20817 SmallBitVector IsFirstPoison =
20818 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
20819 if (!IsFirstPoison.all()) {
20820 for (unsigned I = 0; I < NumElts; I++) {
20821 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I))
20822 InsertMask[I] = I + NumElts;
20823 }
20824 }
20825 V = Builder.CreateShuffleVector(
20826 V1: V,
20827 V2: IsFirstPoison.all() ? PoisonValue::get(T: V->getType())
20828 : FirstInsert->getOperand(i: 0),
20829 Mask: InsertMask, Name: cast<Instruction>(Val: E->Scalars.back())->getName());
20830 if (auto *I = dyn_cast<Instruction>(Val: V)) {
20831 GatherShuffleExtractSeq.insert(X: I);
20832 CSEBlocks.insert(V: I->getParent());
20833 }
20834 }
20835 } else {
20836 SmallBitVector IsFirstPoison =
20837 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
20838 for (unsigned I = 0; I < NumElts; I++) {
20839 if (InsertMask[I] == PoisonMaskElem)
20840 InsertMask[I] = IsFirstPoison.test(Idx: I) ? PoisonMaskElem : I;
20841 else
20842 InsertMask[I] += NumElts;
20843 }
20844 V = Builder.CreateShuffleVector(
20845 V1: FirstInsert->getOperand(i: 0), V2: V, Mask: InsertMask,
20846 Name: cast<Instruction>(Val: E->Scalars.back())->getName());
20847 if (auto *I = dyn_cast<Instruction>(Val: V)) {
20848 GatherShuffleExtractSeq.insert(X: I);
20849 CSEBlocks.insert(V: I->getParent());
20850 }
20851 }
20852 }
20853
20854 ++NumVectorInstructions;
20855 E->VectorizedValue = V;
20856 return V;
20857 }
20858 case Instruction::ZExt:
20859 case Instruction::SExt:
20860 case Instruction::FPToUI:
20861 case Instruction::FPToSI:
20862 case Instruction::FPExt:
20863 case Instruction::PtrToInt:
20864 case Instruction::IntToPtr:
20865 case Instruction::SIToFP:
20866 case Instruction::UIToFP:
20867 case Instruction::Trunc:
20868 case Instruction::FPTrunc:
20869 case Instruction::BitCast: {
20870 setInsertPointAfterBundle(E);
20871
20872 Value *InVec = vectorizeOperand(E, NodeIdx: 0);
20873
20874 auto *CI = cast<CastInst>(Val: VL0);
20875 Instruction::CastOps VecOpcode = CI->getOpcode();
20876 Type *SrcScalarTy = cast<VectorType>(Val: InVec->getType())->getElementType();
20877 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
20878 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
20879 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
20880 SrcScalarTy != CI->getOperand(i_nocapture: 0)->getType()->getScalarType())) {
20881 // Check if the values are candidates to demote.
20882 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
20883 if (SrcIt != MinBWs.end())
20884 SrcBWSz = SrcIt->second.first;
20885 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType());
20886 if (BWSz == SrcBWSz) {
20887 VecOpcode = Instruction::BitCast;
20888 } else if (BWSz < SrcBWSz) {
20889 VecOpcode = Instruction::Trunc;
20890 } else if (It != MinBWs.end()) {
20891 assert(BWSz > SrcBWSz && "Invalid cast!");
20892 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
20893 } else if (SrcIt != MinBWs.end()) {
20894 assert(BWSz > SrcBWSz && "Invalid cast!");
20895 VecOpcode =
20896 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
20897 }
20898 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
20899 !SrcIt->second.second) {
20900 VecOpcode = Instruction::UIToFP;
20901 } else if (VecOpcode == Instruction::BitCast && SrcIt != MinBWs.end() &&
20902 ScalarTy->isFPOrFPVectorTy()) {
20903 Type *OrigSrcScalarTy = CI->getSrcTy();
20904 auto *OrigSrcVectorTy =
20905 getWidenedType(ScalarTy: OrigSrcScalarTy, VF: E->Scalars.size());
20906 InVec =
20907 Builder.CreateIntCast(V: InVec, DestTy: OrigSrcVectorTy, isSigned: SrcIt->second.second);
20908 }
20909 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
20910 ? InVec
20911 : Builder.CreateCast(Op: VecOpcode, V: InVec, DestTy: VecTy);
20912 V = FinalShuffle(V, E);
20913
20914 E->VectorizedValue = V;
20915 ++NumVectorInstructions;
20916 return V;
20917 }
20918 case Instruction::FCmp:
20919 case Instruction::ICmp: {
20920 setInsertPointAfterBundle(E);
20921
20922 Value *L = vectorizeOperand(E, NodeIdx: 0);
20923 Value *R = vectorizeOperand(E, NodeIdx: 1);
20924 if (L->getType() != R->getType()) {
20925 assert((getOperandEntry(E, 0)->isGather() ||
20926 getOperandEntry(E, 1)->isGather() ||
20927 MinBWs.contains(getOperandEntry(E, 0)) ||
20928 MinBWs.contains(getOperandEntry(E, 1))) &&
20929 "Expected item in MinBWs.");
20930 if (cast<VectorType>(Val: L->getType())
20931 ->getElementType()
20932 ->getIntegerBitWidth() < cast<VectorType>(Val: R->getType())
20933 ->getElementType()
20934 ->getIntegerBitWidth()) {
20935 Type *CastTy = R->getType();
20936 L = Builder.CreateIntCast(V: L, DestTy: CastTy, isSigned: GetOperandSignedness(0));
20937 } else {
20938 Type *CastTy = L->getType();
20939 R = Builder.CreateIntCast(V: R, DestTy: CastTy, isSigned: GetOperandSignedness(1));
20940 }
20941 }
20942
20943 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
20944 Value *V = Builder.CreateCmp(Pred: P0, LHS: L, RHS: R);
20945 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
20946 if (auto *ICmp = dyn_cast<ICmpInst>(Val: V); ICmp && It == MinBWs.end())
20947 ICmp->setSameSign(/*B=*/false);
20948 // Do not cast for cmps.
20949 VecTy = cast<FixedVectorType>(Val: V->getType());
20950 V = FinalShuffle(V, E);
20951
20952 E->VectorizedValue = V;
20953 ++NumVectorInstructions;
20954 return V;
20955 }
20956 case Instruction::Select: {
20957 setInsertPointAfterBundle(E);
20958
20959 Value *Cond = vectorizeOperand(E, NodeIdx: 0);
20960 Value *True = vectorizeOperand(E, NodeIdx: 1);
20961 Value *False = vectorizeOperand(E, NodeIdx: 2);
20962 if (True->getType() != VecTy || False->getType() != VecTy) {
20963 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
20964 getOperandEntry(E, 2)->isGather() ||
20965 MinBWs.contains(getOperandEntry(E, 1)) ||
20966 MinBWs.contains(getOperandEntry(E, 2))) &&
20967 "Expected item in MinBWs.");
20968 if (True->getType() != VecTy)
20969 True = Builder.CreateIntCast(V: True, DestTy: VecTy, isSigned: GetOperandSignedness(1));
20970 if (False->getType() != VecTy)
20971 False = Builder.CreateIntCast(V: False, DestTy: VecTy, isSigned: GetOperandSignedness(2));
20972 }
20973
20974 unsigned CondNumElements = getNumElements(Ty: Cond->getType());
20975 unsigned TrueNumElements = getNumElements(Ty: True->getType());
20976 assert(TrueNumElements >= CondNumElements &&
20977 TrueNumElements % CondNumElements == 0 &&
20978 "Cannot vectorize Instruction::Select");
20979 assert(TrueNumElements == getNumElements(False->getType()) &&
20980 "Cannot vectorize Instruction::Select");
20981 if (CondNumElements != TrueNumElements) {
20982 // When the return type is i1 but the source is fixed vector type, we
20983 // need to duplicate the condition value.
20984 Cond = Builder.CreateShuffleVector(
20985 V: Cond, Mask: createReplicatedMask(ReplicationFactor: TrueNumElements / CondNumElements,
20986 VF: CondNumElements));
20987 }
20988 assert(getNumElements(Cond->getType()) == TrueNumElements &&
20989 "Cannot vectorize Instruction::Select");
20990 Value *V =
20991 Builder.CreateSelectWithUnknownProfile(C: Cond, True, False, DEBUG_TYPE);
20992 V = FinalShuffle(V, E);
20993
20994 E->VectorizedValue = V;
20995 ++NumVectorInstructions;
20996 return V;
20997 }
20998 case Instruction::FNeg: {
20999 setInsertPointAfterBundle(E);
21000
21001 Value *Op = vectorizeOperand(E, NodeIdx: 0);
21002
21003 Value *V = Builder.CreateUnOp(
21004 Opc: static_cast<Instruction::UnaryOps>(E->getOpcode()), V: Op);
21005 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
21006 if (auto *I = dyn_cast<Instruction>(Val: V))
21007 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
21008
21009 V = FinalShuffle(V, E);
21010
21011 E->VectorizedValue = V;
21012 ++NumVectorInstructions;
21013
21014 return V;
21015 }
21016 case Instruction::Freeze: {
21017 setInsertPointAfterBundle(E);
21018
21019 Value *Op = vectorizeOperand(E, NodeIdx: 0);
21020
21021 if (Op->getType() != VecTy) {
21022 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
21023 MinBWs.contains(getOperandEntry(E, 0))) &&
21024 "Expected item in MinBWs.");
21025 Op = Builder.CreateIntCast(V: Op, DestTy: VecTy, isSigned: GetOperandSignedness(0));
21026 }
21027 Value *V = Builder.CreateFreeze(V: Op);
21028 V = FinalShuffle(V, E);
21029
21030 E->VectorizedValue = V;
21031 ++NumVectorInstructions;
21032
21033 return V;
21034 }
21035 case Instruction::Add:
21036 case Instruction::FAdd:
21037 case Instruction::Sub:
21038 case Instruction::FSub:
21039 case Instruction::Mul:
21040 case Instruction::FMul:
21041 case Instruction::UDiv:
21042 case Instruction::SDiv:
21043 case Instruction::FDiv:
21044 case Instruction::URem:
21045 case Instruction::SRem:
21046 case Instruction::FRem:
21047 case Instruction::Shl:
21048 case Instruction::LShr:
21049 case Instruction::AShr:
21050 case Instruction::And:
21051 case Instruction::Or:
21052 case Instruction::Xor: {
21053 setInsertPointAfterBundle(E);
21054
21055 Value *LHS = vectorizeOperand(E, NodeIdx: 0);
21056 Value *RHS = vectorizeOperand(E, NodeIdx: 1);
21057 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
21058 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
21059 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
21060 if (all_of(Range&: Ops, P: [&](Value *Op) {
21061 auto *CI = dyn_cast<ConstantInt>(Val: Op);
21062 return CI && CI->getValue().countr_one() >= It->second.first;
21063 })) {
21064 V = FinalShuffle(I == 0 ? RHS : LHS, E);
21065 E->VectorizedValue = V;
21066 ++NumVectorInstructions;
21067 return V;
21068 }
21069 }
21070 }
21071 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
21072 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
21073 getOperandEntry(E, 1)->isGather() ||
21074 MinBWs.contains(getOperandEntry(E, 0)) ||
21075 MinBWs.contains(getOperandEntry(E, 1))) &&
21076 "Expected item in MinBWs.");
21077 if (LHS->getType() != VecTy)
21078 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: GetOperandSignedness(0));
21079 if (RHS->getType() != VecTy)
21080 RHS = Builder.CreateIntCast(V: RHS, DestTy: VecTy, isSigned: GetOperandSignedness(1));
21081 }
21082
21083 Value *V = Builder.CreateBinOp(
21084 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
21085 RHS);
21086 propagateIRFlags(I: V, VL: E->Scalars, OpValue: nullptr, IncludeWrapFlags: It == MinBWs.end());
21087 if (auto *I = dyn_cast<Instruction>(Val: V)) {
21088 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
21089 // Drop nuw flags for abs(sub(commutative), true).
21090 if (!MinBWs.contains(Val: E) && ShuffleOrOp == Instruction::Sub &&
21091 any_of(Range&: E->Scalars, P: [E](Value *V) {
21092 return isa<PoisonValue>(Val: V) ||
21093 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
21094 isCommutative(I: cast<Instruction>(Val: V));
21095 }))
21096 I->setHasNoUnsignedWrap(/*b=*/false);
21097 }
21098
21099 V = FinalShuffle(V, E);
21100
21101 E->VectorizedValue = V;
21102 ++NumVectorInstructions;
21103
21104 return V;
21105 }
21106 case Instruction::Load: {
21107 // Loads are inserted at the head of the tree because we don't want to
21108 // sink them all the way down past store instructions.
21109 setInsertPointAfterBundle(E);
21110
21111 LoadInst *LI = cast<LoadInst>(Val: VL0);
21112 Instruction *NewLI;
21113 FixedVectorType *StridedLoadTy = nullptr;
21114 Value *PO = LI->getPointerOperand();
21115 if (E->State == TreeEntry::Vectorize) {
21116 NewLI = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: PO, Align: LI->getAlign());
21117 } else if (E->State == TreeEntry::CompressVectorize) {
21118 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
21119 CompressEntryToData.at(Val: E);
21120 Align CommonAlignment = LI->getAlign();
21121 if (IsMasked) {
21122 unsigned VF = getNumElements(Ty: LoadVecTy);
21123 SmallVector<Constant *> MaskValues(
21124 VF / getNumElements(Ty: LI->getType()),
21125 ConstantInt::getFalse(Context&: VecTy->getContext()));
21126 for (int I : CompressMask)
21127 MaskValues[I] = ConstantInt::getTrue(Context&: VecTy->getContext());
21128 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) {
21129 assert(SLPReVec && "Only supported by REVEC.");
21130 MaskValues = replicateMask(Val: MaskValues, VF: VecTy->getNumElements());
21131 }
21132 Constant *MaskValue = ConstantVector::get(V: MaskValues);
21133 NewLI = Builder.CreateMaskedLoad(Ty: LoadVecTy, Ptr: PO, Alignment: CommonAlignment,
21134 Mask: MaskValue);
21135 } else {
21136 NewLI = Builder.CreateAlignedLoad(Ty: LoadVecTy, Ptr: PO, Align: CommonAlignment);
21137 }
21138 NewLI = ::propagateMetadata(Inst: NewLI, VL: E->Scalars);
21139 // TODO: include this cost into CommonCost.
21140 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) {
21141 assert(SLPReVec && "FixedVectorType is not expected.");
21142 transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(),
21143 Mask&: CompressMask);
21144 }
21145 NewLI =
21146 cast<Instruction>(Val: Builder.CreateShuffleVector(V: NewLI, Mask: CompressMask));
21147 } else if (E->State == TreeEntry::StridedVectorize) {
21148 Value *Ptr0 = cast<LoadInst>(Val: E->Scalars.front())->getPointerOperand();
21149 Value *PtrN = cast<LoadInst>(Val: E->Scalars.back())->getPointerOperand();
21150 PO = IsReverseOrder ? PtrN : Ptr0;
21151 Type *StrideTy = DL->getIndexType(PtrTy: PO->getType());
21152 Value *StrideVal;
21153 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(Val: E);
21154 StridedLoadTy = SPtrInfo.Ty;
21155 assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
21156 unsigned StridedLoadEC =
21157 StridedLoadTy->getElementCount().getKnownMinValue();
21158
21159 Value *Stride = SPtrInfo.StrideVal;
21160 if (!Stride) {
21161 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
21162 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
21163 SCEVExpander Expander(*SE, "strided-load-vec");
21164 Stride = Expander.expandCodeFor(SH: StrideSCEV, Ty: StrideSCEV->getType(),
21165 I: &*Builder.GetInsertPoint());
21166 }
21167 Value *NewStride =
21168 Builder.CreateIntCast(V: Stride, DestTy: StrideTy, /*isSigned=*/true);
21169 StrideVal = Builder.CreateMul(
21170 LHS: NewStride, RHS: ConstantInt::getSigned(
21171 Ty: StrideTy, V: (IsReverseOrder ? -1 : 1) *
21172 static_cast<int>(
21173 DL->getTypeAllocSize(Ty: ScalarTy))));
21174 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
21175 auto *Inst = Builder.CreateIntrinsic(
21176 ID: Intrinsic::experimental_vp_strided_load,
21177 Types: {StridedLoadTy, PO->getType(), StrideTy},
21178 Args: {PO, StrideVal,
21179 Builder.getAllOnesMask(NumElts: ElementCount::getFixed(MinVal: StridedLoadEC)),
21180 Builder.getInt32(C: StridedLoadEC)});
21181 Inst->addParamAttr(
21182 /*ArgNo=*/0,
21183 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
21184 NewLI = Inst;
21185 } else {
21186 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
21187 Value *VecPtr = vectorizeOperand(E, NodeIdx: 0);
21188 if (isa<FixedVectorType>(Val: ScalarTy)) {
21189 assert(SLPReVec && "FixedVectorType is not expected.");
21190 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
21191 // to expand VecPtr if ScalarTy is a vector type.
21192 unsigned ScalarTyNumElements =
21193 cast<FixedVectorType>(Val: ScalarTy)->getNumElements();
21194 unsigned VecTyNumElements =
21195 cast<FixedVectorType>(Val: VecTy)->getNumElements();
21196 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
21197 "Cannot expand getelementptr.");
21198 unsigned VF = VecTyNumElements / ScalarTyNumElements;
21199 SmallVector<Constant *> Indices(VecTyNumElements);
21200 transform(Range: seq(Size: VecTyNumElements), d_first: Indices.begin(), F: [=](unsigned I) {
21201 return Builder.getInt64(C: I % ScalarTyNumElements);
21202 });
21203 VecPtr = Builder.CreateGEP(
21204 Ty: VecTy->getElementType(),
21205 Ptr: Builder.CreateShuffleVector(
21206 V: VecPtr, Mask: createReplicatedMask(ReplicationFactor: ScalarTyNumElements, VF)),
21207 IdxList: ConstantVector::get(V: Indices));
21208 }
21209 // Use the minimum alignment of the gathered loads.
21210 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
21211 NewLI = Builder.CreateMaskedGather(Ty: VecTy, Ptrs: VecPtr, Alignment: CommonAlignment);
21212 }
21213 Value *V = E->State == TreeEntry::CompressVectorize
21214 ? NewLI
21215 : ::propagateMetadata(Inst: NewLI, VL: E->Scalars);
21216
21217 if (StridedLoadTy != VecTy)
21218 V = Builder.CreateBitOrPointerCast(V, DestTy: VecTy);
21219 V = FinalShuffle(V, E);
21220 E->VectorizedValue = V;
21221 ++NumVectorInstructions;
21222 return V;
21223 }
21224 case Instruction::Store: {
21225 auto *SI = cast<StoreInst>(Val: VL0);
21226
21227 setInsertPointAfterBundle(E);
21228
21229 Value *VecValue = vectorizeOperand(E, NodeIdx: 0);
21230 if (VecValue->getType() != VecTy)
21231 VecValue =
21232 Builder.CreateIntCast(V: VecValue, DestTy: VecTy, isSigned: GetOperandSignedness(0));
21233 VecValue = FinalShuffle(VecValue, E);
21234
21235 Value *Ptr = SI->getPointerOperand();
21236 Instruction *ST;
21237 if (E->State == TreeEntry::Vectorize) {
21238 ST = Builder.CreateAlignedStore(Val: VecValue, Ptr, Align: SI->getAlign());
21239 } else {
21240 assert(E->State == TreeEntry::StridedVectorize &&
21241 "Expected either strided or consecutive stores.");
21242 if (!E->ReorderIndices.empty()) {
21243 SI = cast<StoreInst>(Val: E->Scalars[E->ReorderIndices.front()]);
21244 Ptr = SI->getPointerOperand();
21245 }
21246 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E->Scalars);
21247 Type *StrideTy = DL->getIndexType(PtrTy: SI->getPointerOperandType());
21248 auto *Inst = Builder.CreateIntrinsic(
21249 ID: Intrinsic::experimental_vp_strided_store,
21250 Types: {VecTy, Ptr->getType(), StrideTy},
21251 Args: {VecValue, Ptr,
21252 ConstantInt::getSigned(
21253 Ty: StrideTy, V: -static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))),
21254 Builder.getAllOnesMask(NumElts: VecTy->getElementCount()),
21255 Builder.getInt32(C: E->Scalars.size())});
21256 Inst->addParamAttr(
21257 /*ArgNo=*/1,
21258 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
21259 ST = Inst;
21260 }
21261
21262 Value *V = ::propagateMetadata(Inst: ST, VL: E->Scalars);
21263
21264 E->VectorizedValue = V;
21265 ++NumVectorInstructions;
21266 return V;
21267 }
21268 case Instruction::GetElementPtr: {
21269 auto *GEP0 = cast<GetElementPtrInst>(Val: VL0);
21270 setInsertPointAfterBundle(E);
21271
21272 Value *Op0 = vectorizeOperand(E, NodeIdx: 0);
21273
21274 SmallVector<Value *> OpVecs;
21275 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
21276 Value *OpVec = vectorizeOperand(E, NodeIdx: J);
21277 OpVecs.push_back(Elt: OpVec);
21278 }
21279
21280 Value *V = Builder.CreateGEP(Ty: GEP0->getSourceElementType(), Ptr: Op0, IdxList: OpVecs);
21281 if (Instruction *I = dyn_cast<GetElementPtrInst>(Val: V)) {
21282 SmallVector<Value *> GEPs;
21283 for (Value *V : E->Scalars) {
21284 if (isa<GetElementPtrInst>(Val: V))
21285 GEPs.push_back(Elt: V);
21286 }
21287 V = ::propagateMetadata(Inst: I, VL: GEPs);
21288 }
21289
21290 V = FinalShuffle(V, E);
21291
21292 E->VectorizedValue = V;
21293 ++NumVectorInstructions;
21294
21295 return V;
21296 }
21297 case Instruction::Call: {
21298 CallInst *CI = cast<CallInst>(Val: VL0);
21299 setInsertPointAfterBundle(E);
21300
21301 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
21302
21303 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
21304 CI, ID, VF: VecTy->getNumElements(),
21305 MinBW: It != MinBWs.end() ? It->second.first : 0, TTI);
21306 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
21307 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
21308 VecCallCosts.first <= VecCallCosts.second;
21309
21310 Value *ScalarArg = nullptr;
21311 SmallVector<Value *> OpVecs;
21312 SmallVector<Type *, 2> TysForDecl;
21313 // Add return type if intrinsic is overloaded on it.
21314 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: -1, TTI))
21315 TysForDecl.push_back(Elt: VecTy);
21316 auto *CEI = cast<CallInst>(Val: VL0);
21317 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
21318 // Some intrinsics have scalar arguments. This argument should not be
21319 // vectorized.
21320 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI)) {
21321 ScalarArg = CEI->getArgOperand(i: I);
        // If we decided to reduce the bitwidth of the abs intrinsic, its
        // second argument must be set to false (do not return poison if the
        // value is the signed minimum).
21324 if (ID == Intrinsic::abs && It != MinBWs.end() &&
21325 It->second.first < DL->getTypeSizeInBits(Ty: CEI->getType()))
21326 ScalarArg = Builder.getFalse();
21327 OpVecs.push_back(Elt: ScalarArg);
21328 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI))
21329 TysForDecl.push_back(Elt: ScalarArg->getType());
21330 continue;
21331 }
21332
21333 Value *OpVec = vectorizeOperand(E, NodeIdx: I);
21334 ScalarArg = CEI->getArgOperand(i: I);
21335 if (cast<VectorType>(Val: OpVec->getType())->getElementType() !=
21336 ScalarArg->getType()->getScalarType() &&
21337 It == MinBWs.end()) {
21338 auto *CastTy =
21339 getWidenedType(ScalarTy: ScalarArg->getType(), VF: VecTy->getNumElements());
21340 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: CastTy, isSigned: GetOperandSignedness(I));
21341 } else if (It != MinBWs.end()) {
21342 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
21343 }
21344 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
21345 OpVecs.push_back(Elt: OpVec);
21346 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI))
21347 TysForDecl.push_back(Elt: OpVec->getType());
21348 }
21349
21350 Function *CF;
21351 if (!UseIntrinsic) {
21352 VFShape Shape =
21353 VFShape::get(FTy: CI->getFunctionType(),
21354 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
21355 HasGlobalPred: false /*HasGlobalPred*/);
21356 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
21357 } else {
21358 CF = Intrinsic::getOrInsertDeclaration(M: F->getParent(), id: ID, Tys: TysForDecl);
21359 }
21360
21361 SmallVector<OperandBundleDef, 1> OpBundles;
21362 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
21363 Value *V = Builder.CreateCall(Callee: CF, Args: OpVecs, OpBundles);
21364
21365 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
21366 cast<CallInst>(Val: V)->setCallingConv(CF->getCallingConv());
21367 V = FinalShuffle(V, E);
21368
21369 E->VectorizedValue = V;
21370 ++NumVectorInstructions;
21371 return V;
21372 }
21373 case Instruction::ShuffleVector: {
21374 Value *V;
21375 if (SLPReVec && !E->isAltShuffle()) {
21376 setInsertPointAfterBundle(E);
21377 Value *Src = vectorizeOperand(E, NodeIdx: 0);
21378 SmallVector<int> ThisMask(calculateShufflevectorMask(VL: E->Scalars));
21379 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Val: Src)) {
21380 SmallVector<int> NewMask(ThisMask.size());
21381 transform(Range&: ThisMask, d_first: NewMask.begin(), F: [&SVSrc](int Mask) {
21382 return SVSrc->getShuffleMask()[Mask];
21383 });
21384 V = Builder.CreateShuffleVector(V1: SVSrc->getOperand(i_nocapture: 0),
21385 V2: SVSrc->getOperand(i_nocapture: 1), Mask: NewMask);
21386 } else {
21387 V = Builder.CreateShuffleVector(V: Src, Mask: ThisMask);
21388 }
21389 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
21390 if (auto *I = dyn_cast<Instruction>(Val: V))
21391 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
21392 V = FinalShuffle(V, E);
21393 } else {
21394 assert(E->isAltShuffle() &&
21395 ((Instruction::isBinaryOp(E->getOpcode()) &&
21396 Instruction::isBinaryOp(E->getAltOpcode())) ||
21397 (Instruction::isCast(E->getOpcode()) &&
21398 Instruction::isCast(E->getAltOpcode())) ||
21399 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
21400 "Invalid Shuffle Vector Operand");
21401
21402 Value *LHS = nullptr, *RHS = nullptr;
21403 if (Instruction::isBinaryOp(Opcode: E->getOpcode()) || isa<CmpInst>(Val: VL0)) {
21404 setInsertPointAfterBundle(E);
21405 LHS = vectorizeOperand(E, NodeIdx: 0);
21406 RHS = vectorizeOperand(E, NodeIdx: 1);
21407 } else {
21408 setInsertPointAfterBundle(E);
21409 LHS = vectorizeOperand(E, NodeIdx: 0);
21410 }
21411 if (LHS && RHS &&
21412 ((Instruction::isBinaryOp(Opcode: E->getOpcode()) &&
21413 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
21414 (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()))) {
21415 assert((It != MinBWs.end() ||
21416 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
21417 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
21418 MinBWs.contains(getOperandEntry(E, 0)) ||
21419 MinBWs.contains(getOperandEntry(E, 1))) &&
21420 "Expected item in MinBWs.");
21421 Type *CastTy = VecTy;
21422 if (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()) {
21423 if (cast<VectorType>(Val: LHS->getType())
21424 ->getElementType()
21425 ->getIntegerBitWidth() < cast<VectorType>(Val: RHS->getType())
21426 ->getElementType()
21427 ->getIntegerBitWidth())
21428 CastTy = RHS->getType();
21429 else
21430 CastTy = LHS->getType();
21431 }
21432 if (LHS->getType() != CastTy)
21433 LHS = Builder.CreateIntCast(V: LHS, DestTy: CastTy, isSigned: GetOperandSignedness(0));
21434 if (RHS->getType() != CastTy)
21435 RHS = Builder.CreateIntCast(V: RHS, DestTy: CastTy, isSigned: GetOperandSignedness(1));
21436 }
21437
21438 Value *V0, *V1;
21439 if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
21440 V0 = Builder.CreateBinOp(
21441 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
21442 V1 = Builder.CreateBinOp(
21443 Opc: static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
21444 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
21445 V0 = Builder.CreateCmp(Pred: CI0->getPredicate(), LHS, RHS);
21446 auto *AltCI = cast<CmpInst>(Val: E->getAltOp());
21447 CmpInst::Predicate AltPred = AltCI->getPredicate();
21448 V1 = Builder.CreateCmp(Pred: AltPred, LHS, RHS);
21449 } else {
21450 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
21451 unsigned SrcBWSz = DL->getTypeSizeInBits(
21452 Ty: cast<VectorType>(Val: LHS->getType())->getElementType());
21453 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
21454 if (BWSz <= SrcBWSz) {
21455 if (BWSz < SrcBWSz)
21456 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: It->second.first);
21457 assert(LHS->getType() == VecTy &&
21458 "Expected same type as operand.");
21459 if (auto *I = dyn_cast<Instruction>(Val: LHS))
21460 LHS = ::propagateMetadata(Inst: I, VL: E->Scalars);
21461 LHS = FinalShuffle(LHS, E);
21462 E->VectorizedValue = LHS;
21463 ++NumVectorInstructions;
21464 return LHS;
21465 }
21466 }
21467 V0 = Builder.CreateCast(
21468 Op: static_cast<Instruction::CastOps>(E->getOpcode()), V: LHS, DestTy: VecTy);
21469 V1 = Builder.CreateCast(
21470 Op: static_cast<Instruction::CastOps>(E->getAltOpcode()), V: LHS, DestTy: VecTy);
21471 }
21472 // Add V0 and V1 to later analysis to try to find and remove matching
21473 // instruction, if any.
21474 for (Value *V : {V0, V1}) {
21475 if (auto *I = dyn_cast<Instruction>(Val: V)) {
21476 GatherShuffleExtractSeq.insert(X: I);
21477 CSEBlocks.insert(V: I->getParent());
21478 }
21479 }
21480
21481 // Create shuffle to take alternate operations from the vector.
21482 // Also, gather up main and alt scalar ops to propagate IR flags to
21483 // each vector operation.
21484 ValueList OpScalars, AltScalars;
21485 SmallVector<int> Mask;
21486 E->buildAltOpShuffleMask(
21487 IsAltOp: [E, this](Instruction *I) {
21488 assert(E->getMatchingMainOpOrAltOp(I) &&
21489 "Unexpected main/alternate opcode");
21490 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
21491 TLI: *TLI);
21492 },
21493 Mask, OpScalars: &OpScalars, AltScalars: &AltScalars);
21494
21495 propagateIRFlags(I: V0, VL: OpScalars, OpValue: E->getMainOp(), IncludeWrapFlags: It == MinBWs.end());
21496 propagateIRFlags(I: V1, VL: AltScalars, OpValue: E->getAltOp(), IncludeWrapFlags: It == MinBWs.end());
21497 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
21498 // Drop nuw flags for abs(sub(commutative), true).
21499 if (auto *I = dyn_cast<Instruction>(Val: Vec);
21500 I && Opcode == Instruction::Sub && !MinBWs.contains(Val: E) &&
21501 any_of(Range&: E->Scalars, P: [E](Value *V) {
21502 if (isa<PoisonValue>(Val: V))
21503 return false;
21504 if (E->hasCopyableElements() && E->isCopyableElement(V))
21505 return false;
21506 auto *IV = cast<Instruction>(Val: V);
21507 return IV->getOpcode() == Instruction::Sub && isCommutative(I: IV);
21508 }))
21509 I->setHasNoUnsignedWrap(/*b=*/false);
21510 };
21511 DropNuwFlag(V0, E->getOpcode());
21512 DropNuwFlag(V1, E->getAltOpcode());
21513
21514 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
21515 assert(SLPReVec && "FixedVectorType is not expected.");
21516 transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(), Mask);
21517 }
21518 V = Builder.CreateShuffleVector(V1: V0, V2: V1, Mask);
21519 if (auto *I = dyn_cast<Instruction>(Val: V)) {
21520 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
21521 GatherShuffleExtractSeq.insert(X: I);
21522 CSEBlocks.insert(V: I->getParent());
21523 }
21524 }
21525
21526 E->VectorizedValue = V;
21527 ++NumVectorInstructions;
21528
21529 return V;
21530 }
21531 case TreeEntry::ReducedBitcast:
21532 case TreeEntry::ReducedBitcastBSwap: {
21533 assert(UserIgnoreList && "Expected reduction operations only.");
21534 setInsertPointAfterBundle(E);
21535 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
21536 ZExt->VectorizedValue = PoisonValue::get(T: getWidenedType(
21537 ScalarTy: ZExt->getMainOp()->getType(), VF: ZExt->getVectorFactor()));
21538 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
21539 Const->VectorizedValue = PoisonValue::get(T: getWidenedType(
21540 ScalarTy: Const->Scalars.front()->getType(), VF: Const->getVectorFactor()));
21541 Value *Op = vectorizeOperand(E: ZExt, NodeIdx: 0);
21542 auto *SrcType = IntegerType::get(
21543 C&: Op->getContext(),
21544 NumBits: DL->getTypeSizeInBits(Ty: cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy()) *
21545 E->getVectorFactor());
21546 auto *OrigScalarTy = ScalarTy;
21547 // Set the scalar type properly to avoid casting to the extending type.
21548 ScalarTy = cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy();
21549 Op = FinalShuffle(Op, E);
21550 auto *V = Builder.CreateBitCast(V: Op, DestTy: SrcType);
21551 ++NumVectorInstructions;
21552 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
21553 V = Builder.CreateUnaryIntrinsic(ID: Intrinsic::bswap, V);
21554 ++NumVectorInstructions;
21555 }
21556 if (SrcType != OrigScalarTy) {
21557 V = Builder.CreateIntCast(V, DestTy: OrigScalarTy, /*isSigned=*/false);
21558 ++NumVectorInstructions;
21559 }
21560 E->VectorizedValue = V;
21561 return V;
21562 }
21563 case TreeEntry::ReducedBitcastLoads:
21564 case TreeEntry::ReducedBitcastBSwapLoads: {
21565 assert(UserIgnoreList && "Expected reduction operations only.");
21566 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
21567 TreeEntry *Load = getOperandEntry(E: ZExt, /*Idx=*/0);
21568 setInsertPointAfterBundle(Load);
21569 ZExt->VectorizedValue = PoisonValue::get(T: getWidenedType(
21570 ScalarTy: ZExt->getMainOp()->getType(), VF: ZExt->getVectorFactor()));
21571 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
21572 Const->VectorizedValue = PoisonValue::get(T: getWidenedType(
21573 ScalarTy: Const->Scalars.front()->getType(), VF: Const->getVectorFactor()));
21574 Load->VectorizedValue = PoisonValue::get(T: getWidenedType(
21575 ScalarTy: Load->getMainOp()->getType(), VF: Load->getVectorFactor()));
21576 LoadInst *LI = cast<LoadInst>(Val: Load->getMainOp());
21577 Value *PO = LI->getPointerOperand();
21578 auto *SrcTy = IntegerType::get(
21579 C&: ScalarTy->getContext(),
21580 NumBits: DL->getTypeSizeInBits(Ty: cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy()) *
21581 E->getVectorFactor());
21582 auto *OrigScalarTy = ScalarTy;
21583 ScalarTy = ZExt->getMainOp()->getType();
21584 Value *V = Builder.CreateAlignedLoad(Ty: SrcTy, Ptr: PO, Align: LI->getAlign());
21585 ++NumVectorInstructions;
21586 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
21587 V = Builder.CreateUnaryIntrinsic(ID: Intrinsic::bswap, V);
21588 ++NumVectorInstructions;
21589 }
21590 if (SrcTy != OrigScalarTy) {
21591 V = Builder.CreateIntCast(V, DestTy: OrigScalarTy, /*isSigned=*/false);
21592 ++NumVectorInstructions;
21593 }
21594 E->VectorizedValue = V;
21595 return V;
21596 }
21597 case TreeEntry::ReducedCmpBitcast: {
21598 assert(UserIgnoreList && "Expected reduction operations only.");
21599 setInsertPointAfterBundle(E);
21600 TreeEntry *Op1TE = getOperandEntry(E, /*Idx=*/1);
21601 TreeEntry *Op2TE = getOperandEntry(E, /*Idx=*/2);
21602 Op1TE->VectorizedValue =
21603 PoisonValue::get(T: getWidenedType(ScalarTy, VF: Op1TE->getVectorFactor()));
21604 Op2TE->VectorizedValue =
21605 PoisonValue::get(T: getWidenedType(ScalarTy, VF: Op2TE->getVectorFactor()));
21606 Value *Cmp = vectorizeOperand(E, /*NodeIdx=*/0);
21607 // Set the scalar type properly to avoid casting to the extending type.
21608 auto *DstTy =
21609 IntegerType::getIntNTy(C&: ScalarTy->getContext(), N: E->getVectorFactor());
21610 auto *V = Builder.CreateBitCast(V: Cmp, DestTy: DstTy);
21611 ++NumVectorInstructions;
21612 if (DstTy != ScalarTy) {
21613 V = Builder.CreateIntCast(V, DestTy: ScalarTy, /*isSigned=*/false);
21614 ++NumVectorInstructions;
21615 }
21616 E->VectorizedValue = V;
21617 return V;
21618 }
21619 default:
21620 llvm_unreachable("unknown inst");
21621 }
21622 return nullptr;
21623}
21624
21625Value *BoUpSLP::vectorizeTree() {
21626 ExtraValueToDebugLocsMap ExternallyUsedValues;
21627 return vectorizeTree(ExternallyUsedValues);
21628}
21629
21630Value *BoUpSLP::vectorizeTree(
21631 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
21632 Instruction *ReductionRoot,
21633 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
21634 VectorValuesAndScales) {
21635 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
21636 // need to rebuild it.
21637 EntryToLastInstruction.clear();
21638 // All blocks must be scheduled before any instructions are inserted.
21639 for (auto &BSIter : BlocksSchedules)
21640 scheduleBlock(R: *this, BS: BSIter.second.get());
21641 // Cache last instructions for the nodes to avoid side effects, which may
21642 // appear during vectorization, like extra uses, etc.
21643 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21644 // Need to generate insertion point for loads nodes of the bitcast/bswap
21645 // ops.
21646 if (TE->isGather() || DeletedNodes.contains(Ptr: TE.get()) ||
21647 (TE->State == TreeEntry::CombinedVectorize &&
21648 (TE->CombinedOp == TreeEntry::ReducedBitcast ||
21649 TE->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
21650 ((TE->CombinedOp == TreeEntry::ReducedBitcastLoads ||
21651 TE->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
21652 TE->CombinedOp == TreeEntry::ReducedCmpBitcast) &&
21653 (!TE->hasState() || TE->getOpcode() != Instruction::Load)))))
21654 continue;
21655 (void)getLastInstructionInBundle(E: TE.get());
21656 }
21657
21658 if (ReductionRoot)
21659 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
21660 IP: ReductionRoot->getIterator());
21661 else
21662 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
21663
21664 // Vectorize gather operands of the nodes with the external uses only.
21665 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
21666 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21667 if (DeletedNodes.contains(Ptr: TE.get()))
21668 continue;
21669 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
21670 TE->UserTreeIndex.UserTE->hasState() &&
21671 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
21672 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
21673 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
21674 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
21675 all_of(Range&: TE->UserTreeIndex.UserTE->Scalars,
21676 P: [](Value *V) { return isUsedOutsideBlock(V); })) {
21677 Instruction &LastInst =
21678 getLastInstructionInBundle(E: TE->UserTreeIndex.UserTE);
21679 GatherEntries.emplace_back(Args: TE.get(), Args: &LastInst);
21680 }
21681 }
21682 for (auto &Entry : GatherEntries) {
21683 IRBuilderBase::InsertPointGuard Guard(Builder);
21684 Builder.SetInsertPoint(Entry.second);
21685 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
21686 (void)vectorizeTree(E: Entry.first);
21687 }
21688 // Emit gathered loads first to emit better code for the users of those
21689 // gathered loads.
21690 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21691 if (DeletedNodes.contains(Ptr: TE.get()))
21692 continue;
21693 if (GatheredLoadsEntriesFirst.has_value() &&
21694 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
21695 (!TE->isGather() || TE->UserTreeIndex)) {
21696 assert((TE->UserTreeIndex ||
21697 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
21698 "Expected gathered load node.");
21699 (void)vectorizeTree(E: TE.get());
21700 }
21701 }
21702 (void)vectorizeTree(E: VectorizableTree[0].get());
21703 // Run through the list of postponed gathers and emit them, replacing the temp
21704 // emitted allocas with actual vector instructions.
21705 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
21706 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
21707 for (const TreeEntry *E : PostponedNodes) {
21708 auto *TE = const_cast<TreeEntry *>(E);
21709 auto *PrevVec = cast<Instruction>(Val&: TE->VectorizedValue);
21710 TE->VectorizedValue = nullptr;
21711 auto *UserI = cast<Instruction>(Val&: TE->UserTreeIndex.UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were some
    // unresolved dependencies at the moment the stub instruction was emitted.
    // If any of those dependencies turn out to be an operand of another PHI
    // coming from this same block, the position of the stub instruction
    // becomes invalid, because the source vector that is supposed to feed this
    // gather node was inserted at the end of the block [after the stub
    // instruction]. So we need to adjust the insertion point to the end of the
    // block again.
21720 if (isa<PHINode>(Val: UserI) ||
21721 (TE->UserTreeIndex.UserTE->hasState() &&
21722 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
21723 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
21724 // Insert before all users.
21725 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
21726 for (User *U : PrevVec->users()) {
21727 if (U == UserI)
21728 continue;
21729 auto *UI = dyn_cast<Instruction>(Val: U);
21730 if (!UI || isa<PHINode>(Val: UI) || UI->getParent() != InsertPt->getParent())
21731 continue;
21732 if (UI->comesBefore(Other: InsertPt))
21733 InsertPt = UI;
21734 }
21735 Builder.SetInsertPoint(InsertPt);
21736 } else {
21737 Builder.SetInsertPoint(PrevVec);
21738 }
21739 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
21740 Value *Vec = vectorizeTree(E: TE);
21741 if (auto *VecI = dyn_cast<Instruction>(Val: Vec);
21742 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
21743 Builder.GetInsertPoint()->comesBefore(Other: VecI))
21744 VecI->moveBeforePreserving(BB&: *Builder.GetInsertBlock(),
21745 I: Builder.GetInsertPoint());
21746 if (Vec->getType() != PrevVec->getType()) {
21747 assert(Vec->getType()->isIntOrIntVectorTy() &&
21748 PrevVec->getType()->isIntOrIntVectorTy() &&
21749 "Expected integer vector types only.");
21750 std::optional<bool> IsSigned;
21751 for (Value *V : TE->Scalars) {
21752 if (isVectorized(V)) {
21753 for (const TreeEntry *MNTE : getTreeEntries(V)) {
21754 auto It = MinBWs.find(Val: MNTE);
21755 if (It != MinBWs.end()) {
21756 IsSigned = IsSigned.value_or(u: false) || It->second.second;
21757 if (*IsSigned)
21758 break;
21759 }
21760 }
21761 if (IsSigned.value_or(u: false))
21762 break;
21763 // Scan through gather nodes.
21764 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
21765 auto It = MinBWs.find(Val: BVE);
21766 if (It != MinBWs.end()) {
21767 IsSigned = IsSigned.value_or(u: false) || It->second.second;
21768 if (*IsSigned)
21769 break;
21770 }
21771 }
21772 if (IsSigned.value_or(u: false))
21773 break;
21774 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V)) {
21775 IsSigned =
21776 IsSigned.value_or(u: false) ||
21777 !isKnownNonNegative(V: EE->getVectorOperand(), SQ: SimplifyQuery(*DL));
21778 continue;
21779 }
21780 if (IsSigned.value_or(u: false))
21781 break;
21782 }
21783 }
21784 if (IsSigned.value_or(u: false)) {
21785 // Final attempt - check user node.
21786 auto It = MinBWs.find(Val: TE->UserTreeIndex.UserTE);
21787 if (It != MinBWs.end())
21788 IsSigned = It->second.second;
21789 }
21790 assert(IsSigned &&
21791 "Expected user node or perfect diamond match in MinBWs.");
21792 Vec = Builder.CreateIntCast(V: Vec, DestTy: PrevVec->getType(), isSigned: *IsSigned);
21793 }
21794 PrevVec->replaceAllUsesWith(V: Vec);
21795 PostponedValues.try_emplace(Key: Vec).first->second.push_back(Elt: TE);
21796 // Replace the stub vector node, if it was used before for one of the
21797 // buildvector nodes already.
21798 auto It = PostponedValues.find(Val: PrevVec);
21799 if (It != PostponedValues.end()) {
21800 for (TreeEntry *VTE : It->getSecond())
21801 VTE->VectorizedValue = Vec;
21802 }
21803 eraseInstruction(I: PrevVec);
21804 }
21805
21806 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
21807 << " values .\n");
21808
21809 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
21810 // Maps vector instruction to original insertelement instruction
21811 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
21812 // Maps extract Scalar to the corresponding extractelement instruction in the
21813 // basic block. Only one extractelement per block should be emitted.
21814 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
21815 ScalarToEEs;
21816 SmallDenseSet<Value *, 4> UsedInserts;
21817 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
21818 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
21819 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
21820 // Extract all of the elements with the external uses.
21821 for (const auto &ExternalUse : ExternalUses) {
21822 Value *Scalar = ExternalUse.Scalar;
21823 llvm::User *User = ExternalUse.User;
21824
21825 // Skip users that we already RAUW. This happens when one instruction
21826 // has multiple uses of the same value.
21827 if (User && !is_contained(Range: Scalar->users(), Element: User))
21828 continue;
21829 const TreeEntry *E = &ExternalUse.E;
21830 assert(E && "Invalid scalar");
21831 assert(!E->isGather() && "Extracting from a gather list");
21832 // Non-instruction pointers are not deleted, just skip them.
21833 if (E->getOpcode() == Instruction::GetElementPtr &&
21834 !isa<GetElementPtrInst>(Val: Scalar))
21835 continue;
21836
21837 Value *Vec = E->VectorizedValue;
21838 assert(Vec && "Can't find vectorizable value");
21839
21840 Value *Lane = Builder.getInt32(C: ExternalUse.Lane);
21841 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
21842 if (Scalar->getType() != Vec->getType()) {
21843 Value *Ex = nullptr;
21844 Value *ExV = nullptr;
21845 auto *Inst = dyn_cast<Instruction>(Val: Scalar);
21846 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Ptr: Inst);
21847 auto It = ScalarToEEs.find(Val: Scalar);
21848 if (It != ScalarToEEs.end()) {
21849 // No need to emit many extracts, just move the only one in the
21850 // current block.
21851 auto EEIt = It->second.find(Val: ReplaceInst ? Inst->getParent()
21852 : Builder.GetInsertBlock());
21853 if (EEIt != It->second.end()) {
21854 Value *PrevV = EEIt->second.first;
21855 if (auto *I = dyn_cast<Instruction>(Val: PrevV);
21856 I && !ReplaceInst &&
21857 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
21858 Builder.GetInsertPoint()->comesBefore(Other: I)) {
21859 I->moveBefore(BB&: *Builder.GetInsertPoint()->getParent(),
21860 I: Builder.GetInsertPoint());
21861 if (auto *CI = dyn_cast<Instruction>(Val: EEIt->second.second))
21862 CI->moveAfter(MovePos: I);
21863 }
21864 Ex = PrevV;
21865 ExV = EEIt->second.second ? EEIt->second.second : Ex;
21866 }
21867 }
21868 if (!Ex) {
21869 // "Reuse" the existing extract to improve final codegen.
21870 if (ReplaceInst) {
21871 // Leave the instruction as is, if it cheaper extracts and all
21872 // operands are scalar.
21873 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst)) {
21874 IgnoredExtracts.insert(V: EE);
21875 Ex = EE;
21876 } else {
21877 auto *CloneInst = Inst->clone();
21878 CloneInst->insertBefore(InsertPos: Inst->getIterator());
21879 if (Inst->hasName())
21880 CloneInst->takeName(V: Inst);
21881 Ex = CloneInst;
21882 }
21883 } else if (auto *ES = dyn_cast<ExtractElementInst>(Val: Scalar);
21884 ES && isa<Instruction>(Val: Vec)) {
21885 Value *V = ES->getVectorOperand();
21886 auto *IVec = cast<Instruction>(Val: Vec);
21887 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
21888 V = ETEs.front()->VectorizedValue;
21889 if (auto *IV = dyn_cast<Instruction>(Val: V);
21890 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
21891 IV->comesBefore(Other: IVec))
21892 Ex = Builder.CreateExtractElement(Vec: V, Idx: ES->getIndexOperand());
21893 else
21894 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
21895 } else if (auto *VecTy =
21896 dyn_cast<FixedVectorType>(Val: Scalar->getType())) {
21897 assert(SLPReVec && "FixedVectorType is not expected.");
21898 unsigned VecTyNumElements = VecTy->getNumElements();
21899 // When REVEC is enabled, we need to extract a vector.
21900 // Note: The element size of Scalar may be different from the
21901 // element size of Vec.
21902 Ex = createExtractVector(Builder, Vec, SubVecVF: VecTyNumElements,
21903 Index: ExternalUse.Lane * VecTyNumElements);
21904 } else {
21905 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
21906 }
21907 // If necessary, sign-extend or zero-extend ScalarRoot
21908 // to the larger type.
21909 ExV = Ex;
21910 if (Scalar->getType() != Ex->getType())
21911 ExV = Builder.CreateIntCast(
21912 V: Ex, DestTy: Scalar->getType(),
21913 isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
21914 auto *I = dyn_cast<Instruction>(Val: Ex);
21915 ScalarToEEs[Scalar].try_emplace(Key: I ? I->getParent()
21916 : &F->getEntryBlock(),
21917 Args: std::make_pair(x&: Ex, y&: ExV));
21918 }
21919 // The then branch of the previous if may produce constants, since 0
21920 // operand might be a constant.
21921 if (auto *ExI = dyn_cast<Instruction>(Val: Ex);
21922 ExI && !isa<PHINode>(Val: ExI) && !mayHaveNonDefUseDependency(I: *ExI)) {
21923 GatherShuffleExtractSeq.insert(X: ExI);
21924 CSEBlocks.insert(V: ExI->getParent());
21925 }
21926 return ExV;
21927 }
21928 assert(isa<FixedVectorType>(Scalar->getType()) &&
21929 isa<InsertElementInst>(Scalar) &&
21930 "In-tree scalar of vector type is not insertelement?");
21931 auto *IE = cast<InsertElementInst>(Val: Scalar);
21932 VectorToInsertElement.try_emplace(Key: Vec, Args&: IE);
21933 return Vec;
21934 };
21935 // If User == nullptr, the Scalar remains as scalar in vectorized
21936 // instructions or is used as extra arg. Generate ExtractElement instruction
21937 // and update the record for this scalar in ExternallyUsedValues.
21938 if (!User) {
21939 if (!ScalarsWithNullptrUser.insert(V: Scalar).second)
21940 continue;
21941 assert(
21942 (ExternallyUsedValues.count(Scalar) ||
21943 ExternalUsesWithNonUsers.count(Scalar) ||
21944 ExternalUsesAsOriginalScalar.contains(Scalar) ||
21945 any_of(
21946 Scalar->users(),
21947 [&, TTI = TTI](llvm::User *U) {
21948 if (ExternalUsesAsOriginalScalar.contains(U))
21949 return true;
21950 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
21951 return !UseEntries.empty() &&
21952 (E->State == TreeEntry::Vectorize ||
21953 E->State == TreeEntry::StridedVectorize ||
21954 E->State == TreeEntry::CompressVectorize) &&
21955 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
21956 return (UseEntry->State == TreeEntry::Vectorize ||
21957 UseEntry->State ==
21958 TreeEntry::StridedVectorize ||
21959 UseEntry->State ==
21960 TreeEntry::CompressVectorize) &&
21961 doesInTreeUserNeedToExtract(
21962 Scalar, getRootEntryInstruction(*UseEntry),
21963 TLI, TTI);
21964 });
21965 })) &&
21966 "Scalar with nullptr User must be registered in "
21967 "ExternallyUsedValues map or remain as scalar in vectorized "
21968 "instructions");
21969 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
21970 if (auto *PHI = dyn_cast<PHINode>(Val: VecI)) {
21971 if (PHI->getParent()->isLandingPad())
21972 Builder.SetInsertPoint(
21973 TheBB: PHI->getParent(),
21974 IP: std::next(
21975 x: PHI->getParent()->getLandingPadInst()->getIterator()));
21976 else
21977 Builder.SetInsertPoint(TheBB: PHI->getParent(),
21978 IP: PHI->getParent()->getFirstNonPHIIt());
21979 } else {
21980 Builder.SetInsertPoint(TheBB: VecI->getParent(),
21981 IP: std::next(x: VecI->getIterator()));
21982 }
21983 } else {
21984 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
21985 }
21986 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21987 // Required to update internally referenced instructions.
21988 if (Scalar != NewInst) {
21989 assert((!isa<ExtractElementInst>(Scalar) ||
21990 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
21991 "Extractelements should not be replaced.");
21992 Scalar->replaceAllUsesWith(V: NewInst);
21993 }
21994 continue;
21995 }
21996
21997 if (auto *VU = dyn_cast<InsertElementInst>(Val: User);
21998 VU && VU->getOperand(i_nocapture: 1) == Scalar) {
21999 // Skip if the scalar is another vector op or Vec is not an instruction.
22000 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Val: Vec)) {
22001 if (auto *FTy = dyn_cast<FixedVectorType>(Val: User->getType())) {
22002 if (!UsedInserts.insert(V: VU).second)
22003 continue;
22004 // Need to use original vector, if the root is truncated.
22005 auto BWIt = MinBWs.find(Val: E);
22006 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
22007 auto *ScalarTy = FTy->getElementType();
22008 auto Key = std::make_pair(x&: Vec, y&: ScalarTy);
22009 auto VecIt = VectorCasts.find(Val: Key);
22010 if (VecIt == VectorCasts.end()) {
22011 IRBuilderBase::InsertPointGuard Guard(Builder);
22012 if (auto *IVec = dyn_cast<PHINode>(Val: Vec)) {
22013 if (IVec->getParent()->isLandingPad())
22014 Builder.SetInsertPoint(TheBB: IVec->getParent(),
22015 IP: std::next(x: IVec->getParent()
22016 ->getLandingPadInst()
22017 ->getIterator()));
22018 else
22019 Builder.SetInsertPoint(
22020 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
22021 } else if (auto *IVec = dyn_cast<Instruction>(Val: Vec)) {
22022 Builder.SetInsertPoint(IVec->getNextNode());
22023 }
22024 Vec = Builder.CreateIntCast(
22025 V: Vec,
22026 DestTy: getWidenedType(
22027 ScalarTy,
22028 VF: cast<FixedVectorType>(Val: Vec->getType())->getNumElements()),
22029 isSigned: BWIt->second.second);
22030 VectorCasts.try_emplace(Key, Args&: Vec);
22031 } else {
22032 Vec = VecIt->second;
22033 }
22034 }
22035
22036 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
22037 if (InsertIdx) {
22038 auto *It = find_if(
22039 Range&: ShuffledInserts, P: [VU](const ShuffledInsertData<Value *> &Data) {
22040 // Checks if 2 insertelements are from the same buildvector.
22041 InsertElementInst *VecInsert = Data.InsertElements.front();
22042 return areTwoInsertFromSameBuildVector(
22043 VU, V: VecInsert,
22044 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); });
22045 });
22046 unsigned Idx = *InsertIdx;
22047 if (It == ShuffledInserts.end()) {
22048 (void)ShuffledInserts.emplace_back();
22049 It = std::next(x: ShuffledInserts.begin(),
22050 n: ShuffledInserts.size() - 1);
22051 }
22052 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
22053 if (Mask.empty())
22054 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
22055 Mask[Idx] = ExternalUse.Lane;
22056 It->InsertElements.push_back(Elt: cast<InsertElementInst>(Val: User));
22057 continue;
22058 }
22059 }
22060 }
22061 }
22062
22063 // Generate extracts for out-of-tree users.
22064 // Find the insertion point for the extractelement lane.
22065 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
22066 if (PHINode *PH = dyn_cast<PHINode>(Val: User)) {
22067 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
22068 if (PH->getIncomingValue(i: I) == Scalar) {
22069 Instruction *IncomingTerminator =
22070 PH->getIncomingBlock(i: I)->getTerminator();
22071 if (isa<CatchSwitchInst>(Val: IncomingTerminator)) {
22072 Builder.SetInsertPoint(TheBB: VecI->getParent(),
22073 IP: std::next(x: VecI->getIterator()));
22074 } else {
22075 Builder.SetInsertPoint(PH->getIncomingBlock(i: I)->getTerminator());
22076 }
22077 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22078 PH->setOperand(i_nocapture: I, Val_nocapture: NewInst);
22079 }
22080 }
22081 } else {
22082 Builder.SetInsertPoint(cast<Instruction>(Val: User));
22083 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22084 User->replaceUsesOfWith(From: Scalar, To: NewInst);
22085 }
22086 } else {
22087 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
22088 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22089 User->replaceUsesOfWith(From: Scalar, To: NewInst);
22090 }
22091
22092 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
22093 }
22094
22095 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
22096 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
22097 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
22098 int VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
22099 for (int I = 0, E = Mask.size(); I < E; ++I) {
22100 if (Mask[I] < VF)
22101 CombinedMask1[I] = Mask[I];
22102 else
22103 CombinedMask2[I] = Mask[I] - VF;
22104 }
22105 ShuffleInstructionBuilder ShuffleBuilder(
22106 cast<VectorType>(Val: V1->getType())->getElementType(), Builder, *this);
22107 ShuffleBuilder.add(V1, Mask: CombinedMask1);
22108 if (V2)
22109 ShuffleBuilder.add(V1: V2, Mask: CombinedMask2);
22110 return ShuffleBuilder.finalize(ExtMask: {}, SubVectors: {}, SubVectorsMask: {});
22111 };
22112
22113 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
22114 bool ForSingleMask) {
22115 unsigned VF = Mask.size();
22116 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
22117 if (VF != VecVF) {
22118 if (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
22119 Vec = CreateShuffle(Vec, nullptr, Mask);
22120 return std::make_pair(x&: Vec, y: true);
22121 }
22122 if (!ForSingleMask) {
22123 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
22124 for (unsigned I = 0; I < VF; ++I) {
22125 if (Mask[I] != PoisonMaskElem)
22126 ResizeMask[Mask[I]] = Mask[I];
22127 }
22128 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
22129 }
22130 }
22131
22132 return std::make_pair(x&: Vec, y: false);
22133 };
22134 // Perform shuffling of the vectorize tree entries for better handling of
22135 // external extracts.
22136 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
22137 // Find the first and the last instruction in the list of insertelements.
22138 sort(C&: ShuffledInserts[I].InsertElements, Comp: isFirstInsertElement);
22139 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
22140 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
22141 Builder.SetInsertPoint(LastInsert);
22142 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
22143 Value *NewInst = performExtractsShuffleAction<Value>(
22144 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()),
22145 Base: FirstInsert->getOperand(i_nocapture: 0),
22146 GetVF: [](Value *Vec) {
22147 return cast<VectorType>(Val: Vec->getType())
22148 ->getElementCount()
22149 .getKnownMinValue();
22150 },
22151 ResizeAction: ResizeToVF,
22152 Action: [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
22153 ArrayRef<Value *> Vals) {
22154 assert((Vals.size() == 1 || Vals.size() == 2) &&
22155 "Expected exactly 1 or 2 input values.");
22156 if (Vals.size() == 1) {
22157 // Do not create shuffle if the mask is a simple identity
22158 // non-resizing mask.
22159 if (Mask.size() != cast<FixedVectorType>(Val: Vals.front()->getType())
22160 ->getNumElements() ||
22161 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
22162 return CreateShuffle(Vals.front(), nullptr, Mask);
22163 return Vals.front();
22164 }
22165 return CreateShuffle(Vals.front() ? Vals.front()
22166 : FirstInsert->getOperand(i_nocapture: 0),
22167 Vals.back(), Mask);
22168 });
22169 auto It = ShuffledInserts[I].InsertElements.rbegin();
22170 // Rebuild buildvector chain.
22171 InsertElementInst *II = nullptr;
22172 if (It != ShuffledInserts[I].InsertElements.rend())
22173 II = *It;
22174 SmallVector<Instruction *> Inserts;
22175 while (It != ShuffledInserts[I].InsertElements.rend()) {
22176 assert(II && "Must be an insertelement instruction.");
22177 if (*It == II)
22178 ++It;
22179 else
22180 Inserts.push_back(Elt: cast<Instruction>(Val: II));
22181 II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
22182 }
22183 for (Instruction *II : reverse(C&: Inserts)) {
22184 II->replaceUsesOfWith(From: II->getOperand(i: 0), To: NewInst);
22185 if (auto *NewI = dyn_cast<Instruction>(Val: NewInst))
22186 if (II->getParent() == NewI->getParent() && II->comesBefore(Other: NewI))
22187 II->moveAfter(MovePos: NewI);
22188 NewInst = II;
22189 }
22190 LastInsert->replaceAllUsesWith(V: NewInst);
22191 for (InsertElementInst *IE : reverse(C&: ShuffledInserts[I].InsertElements)) {
22192 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 0),
22193 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 0)->getType()));
22194 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 1),
22195 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 1)->getType()));
22196 eraseInstruction(I: IE);
22197 }
22198 CSEBlocks.insert(V: LastInsert->getParent());
22199 }
22200
22201 SmallVector<Instruction *> RemovedInsts;
22202 // For each vectorized value:
22203 for (auto &TEPtr : VectorizableTree) {
22204 TreeEntry *Entry = TEPtr.get();
22205
22206 // No need to handle users of gathered values.
22207 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
22208 DeletedNodes.contains(Ptr: Entry) ||
22209 TransformedToGatherNodes.contains(Val: Entry))
22210 continue;
22211
22212 if (Entry->CombinedOp == TreeEntry::ReducedBitcast ||
22213 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
22214 Entry->CombinedOp == TreeEntry::ReducedBitcastLoads ||
22215 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
22216 Entry->CombinedOp == TreeEntry::ReducedCmpBitcast) {
22217 // Skip constant node
22218 if (!Entry->hasState()) {
22219 assert(allConstant(Entry->Scalars) && "Expected constants only.");
22220 continue;
22221 }
22222 for (Value *Scalar : Entry->Scalars) {
22223 auto *I = dyn_cast<Instruction>(Val: Scalar);
22224
22225 if (!I || Entry->isCopyableElement(V: I))
22226 continue;
22227 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *I << ".\n");
22228 RemovedInsts.push_back(Elt: I);
22229 }
22230 continue;
22231 }
22232
22233 assert(Entry->VectorizedValue && "Can't find vectorizable value");
22234
22235 // For each lane:
22236 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
22237 Value *Scalar = Entry->Scalars[Lane];
22238
22239 if (Entry->getOpcode() == Instruction::GetElementPtr &&
22240 !isa<GetElementPtrInst>(Val: Scalar))
22241 continue;
22242 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Scalar);
22243 EE && IgnoredExtracts.contains(V: EE))
22244 continue;
22245 if (!isa<Instruction>(Val: Scalar) || Entry->isCopyableElement(V: Scalar))
22246 continue;
22247#ifndef NDEBUG
22248 Type *Ty = Scalar->getType();
22249 if (!Ty->isVoidTy()) {
22250 for (User *U : Scalar->users()) {
22251 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
22252
22253 // It is legal to delete users in the ignorelist.
22254 assert((isVectorized(U) ||
22255 (UserIgnoreList && UserIgnoreList->contains(U)) ||
22256 (isa_and_nonnull<Instruction>(U) &&
22257 isDeleted(cast<Instruction>(U)))) &&
22258 "Deleting out-of-tree value");
22259 }
22260 }
22261#endif
22262 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
22263 auto *I = cast<Instruction>(Val: Scalar);
22264 RemovedInsts.push_back(Elt: I);
22265 }
22266 }
22267
22268 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
22269 // new vector instruction.
22270 if (auto *V = dyn_cast<Instruction>(Val&: VectorizableTree[0]->VectorizedValue))
22271 V->mergeDIAssignID(SourceInstructions: RemovedInsts);
22272
22273 // Clear up reduction references, if any.
22274 if (UserIgnoreList) {
22275 for (Instruction *I : RemovedInsts) {
22276 const TreeEntry *IE = getTreeEntries(V: I).front();
22277 if (ArrayRef<TreeEntry *> SplitEntries = getSplitTreeEntries(V: I);
22278 !SplitEntries.empty() && SplitEntries.front()->Idx < IE->Idx)
22279 IE = SplitEntries.front();
22280 if (IE->Idx != 0 &&
22281 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
22282 (ValueToGatherNodes.lookup(Val: I).contains(
22283 key: VectorizableTree.front().get()) ||
22284 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
22285 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
22286 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
22287 IE->UserTreeIndex &&
22288 is_contained(Range&: VectorizableTree.front()->Scalars, Element: I)) &&
22289 !(GatheredLoadsEntriesFirst.has_value() &&
22290 IE->Idx >= *GatheredLoadsEntriesFirst &&
22291 VectorizableTree.front()->isGather() &&
22292 is_contained(Range&: VectorizableTree.front()->Scalars, Element: I)) &&
22293 !(!VectorizableTree.front()->isGather() &&
22294 VectorizableTree.front()->isCopyableElement(V: I)))
22295 continue;
22296 SmallVector<SelectInst *> LogicalOpSelects;
22297 I->replaceUsesWithIf(New: PoisonValue::get(T: I->getType()), ShouldReplace: [&](Use &U) {
22298 // Do not replace condition of the logical op in form select <cond>.
22299 bool IsPoisoningLogicalOp = isa<SelectInst>(Val: U.getUser()) &&
22300 (match(V: U.getUser(), P: m_LogicalAnd()) ||
22301 match(V: U.getUser(), P: m_LogicalOr())) &&
22302 U.getOperandNo() == 0;
22303 if (IsPoisoningLogicalOp) {
22304 LogicalOpSelects.push_back(Elt: cast<SelectInst>(Val: U.getUser()));
22305 return false;
22306 }
22307 return UserIgnoreList->contains(V: U.getUser());
22308 });
22309 // Replace conditions of the poisoning logical ops with the non-poison
22310 // constant value.
22311 for (SelectInst *SI : LogicalOpSelects)
22312 SI->setCondition(Constant::getNullValue(Ty: SI->getCondition()->getType()));
22313 }
22314 }
22315 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
22316 // cache correctness.
22317 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
22318 // - instructions are not deleted until later.
22319 removeInstructionsAndOperands(DeadVals: ArrayRef(RemovedInsts), VectorValuesAndScales);
22320
22321 Builder.ClearInsertionPoint();
22322 InstrElementSize.clear();
22323
22324 const TreeEntry &RootTE = *VectorizableTree.front();
22325 Value *Vec = RootTE.VectorizedValue;
22326 if (auto It = MinBWs.find(Val: &RootTE); ReductionBitWidth != 0 &&
22327 It != MinBWs.end() &&
22328 ReductionBitWidth != It->second.first) {
22329 IRBuilder<>::InsertPointGuard Guard(Builder);
22330 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
22331 IP: ReductionRoot->getIterator());
22332 if (isReducedBitcastRoot() || isReducedCmpBitcastRoot()) {
22333 Vec = Builder.CreateIntCast(V: Vec, DestTy: Builder.getIntNTy(N: ReductionBitWidth),
22334 isSigned: It->second.second);
22335
22336 } else {
22337 Vec = Builder.CreateIntCast(
22338 V: Vec,
22339 DestTy: VectorType::get(ElementType: Builder.getIntNTy(N: ReductionBitWidth),
22340 EC: cast<VectorType>(Val: Vec->getType())->getElementCount()),
22341 isSigned: It->second.second);
22342 }
22343 }
22344 return Vec;
22345}
22346
// Post-process the emitted gather/shuffle/extract sequences in two phases:
// (1) hoist loop-invariant gather instructions into loop preheaders (a
// lightweight LICM), then (2) CSE the remaining sequences across the blocks
// recorded in CSEBlocks, walking blocks in dominator order so an earlier
// (dominating) copy can absorb a later one. Shuffles that differ only by
// poison lanes in their masks are merged into the more-defined copy.
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(BB: I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are
    // instructions that are defined in this basic block then we can't
    // hoist this instruction.
    if (any_of(Range: I->operands(), P: [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(Val: V);
          return OpI && L->contains(Inst: OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(InsertPos: PreHeader->getTerminator()->getIterator());
    // Record the preheader so phase (2) also scans it for CSE candidates.
    CSEBlocks.insert(V: PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(N: CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(Elt: N);
    }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(C&: CSEWorkList, Comp: [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector operands
  // and its mask indices are the same as in the first one or undefs. E.g.
  // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
  // poison, <0, 0, 0, 0>.
  // Returns true if I1 may be replaced by I2; for shuffles, NewMask is filled
  // with the combined (most-defined) mask that the surviving copy should use.
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(Val: I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(Val: I2);
    if (!SI1 || !SI2)
      // Non-shuffles only merge if they are exact duplicates.
      return I1->isIdenticalTo(I: I2);
    if (SI1->isIdenticalTo(I: SI2))
      return true;
    // Mask-based merging requires identical vector operands.
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(i_nocapture: I) != SI2->getOperand(i_nocapture: I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(in_start: SI2->getShuffleMask().begin(), in_end: SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      // A lane defined in both masks must agree, otherwise no merge.
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      // Fill poison lanes of the surviving mask from the other mask.
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used vector
    // registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           ::getNumberOfParts(TTI: *TTI, VecTy: SI1->getType()) ==
               ::getNumberOfParts(
                   TTI: *TTI, VecTy: getWidenedType(ScalarTy: SI1->getType()->getElementType(),
                                         VF: SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge identical
  // instructions. TODO: We can further optimize this scan if we split the
  // instructions into different buckets based on the insert lane.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
      if (isDeleted(I: &In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(Val: &In) &&
          !GatherShuffleExtractSeq.contains(key: &In))
        continue;

      // Check if we can replace this instruction with any of the
      // visited instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        // Case 1: a previously visited copy V dominates In and is at least as
        // defined, so In can be replaced by V (after widening V's mask).
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(A: V->getParent(), B: In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(I: &In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        // Case 2 (shuffles only): In dominates V and is more defined; keep In
        // (moved right after V to preserve operand dominance) and drop V,
        // updating the Visited slot in place so later scans see In.
        if (isa<ShuffleVectorInst>(Val: In) && isa<ShuffleVectorInst>(Val: V) &&
            GatherShuffleExtractSeq.contains(key: V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(A: In.getParent(), B: V->getParent())) {
          In.moveAfter(MovePos: V);
          V->replaceAllUsesWith(V: &In);
          eraseInstruction(I: V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: &In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(Elt: &In);
      }
    }
  }
  // The bookkeeping sets are only valid for one vectorization round.
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
22496
22497BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
22498 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
22499 auto &BundlePtr =
22500 ScheduledBundlesList.emplace_back(Args: std::make_unique<ScheduleBundle>());
22501 for (Value *V : VL) {
22502 if (S.isNonSchedulable(V))
22503 continue;
22504 auto *I = cast<Instruction>(Val: V);
22505 if (S.isCopyableElement(V)) {
22506 // Add a copyable element model.
22507 ScheduleCopyableData &SD =
22508 addScheduleCopyableData(EI, I, SchedulingRegionID, Bundle&: *BundlePtr);
22509 // Group the instructions to a bundle.
22510 BundlePtr->add(SD: &SD);
22511 continue;
22512 }
22513 ScheduleData *BundleMember = getScheduleData(V);
22514 assert(BundleMember && "no ScheduleData for bundle member "
22515 "(maybe not in same basic block)");
22516 // Group the instructions to a bundle.
22517 BundlePtr->add(SD: BundleMember);
22518 ScheduledBundles.try_emplace(Key: I).first->getSecond().push_back(
22519 Elt: BundlePtr.get());
22520 }
22521 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
22522 return *BundlePtr;
22523}
22524
22525// Groups the instructions to a bundle (which is then a single scheduling entity)
22526// and schedules instructions until the bundle gets ready.
22527std::optional<BoUpSLP::ScheduleBundle *>
22528BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
22529 const InstructionsState &S,
22530 const EdgeInfo &EI) {
22531 // No need to schedule PHIs, insertelement, extractelement and extractvalue
22532 // instructions.
22533 if (isa<PHINode>(Val: S.getMainOp()) ||
22534 isVectorLikeInstWithConstOps(V: S.getMainOp()))
22535 return nullptr;
22536 // If the parent node is non-schedulable and the current node is copyable, and
22537 // any of parent instructions are used outside several basic blocks or in
22538 // bin-op node - cancel scheduling, it may cause wrong def-use deps in
22539 // analysis, leading to a crash.
22540 // Non-scheduled nodes may not have related ScheduleData model, which may lead
22541 // to a skipped dep analysis.
22542 bool HasCopyables = S.areInstructionsWithCopyableElements();
22543 bool DoesNotRequireScheduling =
22544 (!HasCopyables && doesNotNeedToSchedule(VL)) ||
22545 all_of(Range&: VL, P: [&](Value *V) { return S.isNonSchedulable(V); });
22546 if (!DoesNotRequireScheduling && S.areInstructionsWithCopyableElements() &&
22547 EI && EI.UserTE->hasState() && EI.UserTE->doesNotNeedToSchedule() &&
22548 EI.UserTE->getOpcode() != Instruction::PHI &&
22549 EI.UserTE->getOpcode() != Instruction::InsertElement &&
22550 any_of(Range&: EI.UserTE->Scalars, P: [](Value *V) {
22551 auto *I = dyn_cast<Instruction>(Val: V);
22552 if (!I)
22553 return false;
22554 for (User *U : I->users()) {
22555 auto *UI = cast<Instruction>(Val: U);
22556 if (isa<BinaryOperator>(Val: UI))
22557 return true;
22558 }
22559 return false;
22560 }))
22561 return std::nullopt;
22562 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
22563 EI.UserTE->hasCopyableElements() &&
22564 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
22565 all_of(Range&: VL, P: [&](Value *V) {
22566 if (S.isCopyableElement(V))
22567 return true;
22568 return isUsedOutsideBlock(V);
22569 }))
22570 return std::nullopt;
22571 // If any instruction is used outside block only and its operand is placed
22572 // immediately before it, do not schedule, it may cause wrong def-use chain.
22573 if (S.areInstructionsWithCopyableElements() && any_of(Range&: VL, P: [&](Value *V) {
22574 if (isa<PoisonValue>(Val: V) || S.isCopyableElement(V))
22575 return false;
22576 if (isUsedOutsideBlock(V)) {
22577 for (Value *Op : cast<Instruction>(Val: V)->operands()) {
22578 auto *I = dyn_cast<Instruction>(Val: Op);
22579 if (!I)
22580 continue;
22581 return SLP->isVectorized(V: I) && I->getNextNode() == V;
22582 }
22583 }
22584 return false;
22585 }))
22586 return std::nullopt;
22587 if (S.areInstructionsWithCopyableElements() && EI) {
22588 bool IsNonSchedulableWithParentPhiNode =
22589 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
22590 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
22591 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
22592 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
22593 if (IsNonSchedulableWithParentPhiNode) {
22594 SmallSet<std::pair<Value *, Value *>, 4> Values;
22595 for (const auto [Idx, V] :
22596 enumerate(First&: EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
22597 Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
22598 OpIdx: EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
22599 auto *I = dyn_cast<Instruction>(Val: Op);
22600 if (!I || !isCommutative(I))
22601 continue;
22602 if (!Values.insert(V: std::make_pair(x&: V, y&: Op)).second)
22603 return std::nullopt;
22604 }
22605 } else {
22606 // If any of the parent requires scheduling - exit, complex dep between
22607 // schedulable/non-schedulable parents.
22608 if (any_of(Range&: EI.UserTE->Scalars, P: [&](Value *V) {
22609 if (EI.UserTE->hasCopyableElements() &&
22610 EI.UserTE->isCopyableElement(V))
22611 return false;
22612 ArrayRef<TreeEntry *> Entries = SLP->getTreeEntries(V);
22613 return any_of(Range&: Entries, P: [](const TreeEntry *TE) {
22614 return TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
22615 TE->UserTreeIndex.UserTE->hasState() &&
22616 TE->UserTreeIndex.UserTE->State !=
22617 TreeEntry::SplitVectorize &&
22618 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
22619 });
22620 }))
22621 return std::nullopt;
22622 }
22623 }
22624 if (DoesNotRequireScheduling) {
22625 // If all operands were replaced by copyables, the operands of this node
22626 // might be not, so need to recalculate dependencies for schedule data,
22627 // replaced by copyable schedule data.
22628 for (Value *V : VL) {
22629 auto *I = dyn_cast<Instruction>(Val: V);
22630 if (!I || (HasCopyables && S.isCopyableElement(V)))
22631 continue;
22632 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
22633 for (const Use &U : I->operands()) {
22634 unsigned &NumOps =
22635 UserOpToNumOps.try_emplace(Key: std::make_pair(x&: I, y: U.get()), Args: 0)
22636 .first->getSecond();
22637 ++NumOps;
22638 if (auto *Op = dyn_cast<Instruction>(Val: U.get());
22639 Op && areAllOperandsReplacedByCopyableData(User: I, Op, SLP&: *SLP, NumOps)) {
22640 if (ScheduleData *OpSD = getScheduleData(I: Op);
22641 OpSD && OpSD->hasValidDependencies())
22642 // TODO: investigate how to improve it instead of early exiting.
22643 return std::nullopt;
22644 }
22645 }
22646 }
22647 return nullptr;
22648 }
22649
22650 // Initialize the instruction bundle.
22651 Instruction *OldScheduleEnd = ScheduleEnd;
22652 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
22653
22654 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
22655 // Clear deps or recalculate the region, if the memory instruction is a
22656 // copyable. It may have memory deps, which must be recalculated.
22657 SmallVector<ScheduleData *> ControlDependentMembers;
22658 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
22659 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
22660 for (ScheduleEntity *SE : Bundle.getBundle()) {
22661 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(Val: SE)) {
22662 if (ScheduleData *BundleMember = getScheduleData(I: SD->getInst());
22663 BundleMember && BundleMember->hasValidDependencies()) {
22664 BundleMember->clearDirectDependencies();
22665 if (RegionHasStackSave ||
22666 !isGuaranteedToTransferExecutionToSuccessor(
22667 I: BundleMember->getInst()))
22668 ControlDependentMembers.push_back(Elt: BundleMember);
22669 }
22670 continue;
22671 }
22672 auto *SD = cast<ScheduleData>(Val: SE);
22673 if (SD->hasValidDependencies() &&
22674 (!S.areInstructionsWithCopyableElements() ||
22675 !S.isCopyableElement(V: SD->getInst())) &&
22676 !getScheduleCopyableData(I: SD->getInst()).empty() && EI.UserTE &&
22677 EI.UserTE->hasState() &&
22678 (!EI.UserTE->hasCopyableElements() ||
22679 !EI.UserTE->isCopyableElement(V: SD->getInst())))
22680 SD->clearDirectDependencies();
22681 for (const Use &U : SD->getInst()->operands()) {
22682 unsigned &NumOps =
22683 UserOpToNumOps
22684 .try_emplace(Key: std::make_pair(x: SD->getInst(), y: U.get()), Args: 0)
22685 .first->getSecond();
22686 ++NumOps;
22687 if (auto *Op = dyn_cast<Instruction>(Val: U.get());
22688 Op && areAllOperandsReplacedByCopyableData(User: SD->getInst(), Op,
22689 SLP&: *SLP, NumOps)) {
22690 if (ScheduleData *OpSD = getScheduleData(I: Op);
22691 OpSD && OpSD->hasValidDependencies()) {
22692 OpSD->clearDirectDependencies();
22693 if (RegionHasStackSave ||
22694 !isGuaranteedToTransferExecutionToSuccessor(I: OpSD->getInst()))
22695 ControlDependentMembers.push_back(Elt: OpSD);
22696 }
22697 }
22698 }
22699 }
22700 };
22701 // The scheduling region got new instructions at the lower end (or it is a
22702 // new region for the first bundle). This makes it necessary to
22703 // recalculate all dependencies.
22704 // It is seldom that this needs to be done a second time after adding the
22705 // initial bundle to the region.
22706 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
22707 for_each(Range&: ScheduleDataMap, F: [&](auto &P) {
22708 if (BB != P.first->getParent())
22709 return;
22710 ScheduleData *SD = P.second;
22711 if (isInSchedulingRegion(SD: *SD))
22712 SD->clearDependencies();
22713 });
22714 for_each(Range&: ScheduleCopyableDataMapByInst, F: [&](auto &P) {
22715 for_each(P.second, [&](ScheduleCopyableData *SD) {
22716 if (isInSchedulingRegion(SD: *SD))
22717 SD->clearDependencies();
22718 });
22719 });
22720 ReSchedule = true;
22721 }
22722 // Check if the bundle data has deps for copyable elements already. In
22723 // this case need to reset deps and recalculate it.
22724 if (Bundle && !Bundle.getBundle().empty()) {
22725 if (S.areInstructionsWithCopyableElements() ||
22726 !ScheduleCopyableDataMap.empty())
22727 CheckIfNeedToClearDeps(Bundle);
22728 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
22729 << BB->getName() << "\n");
22730 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
22731 ControlDeps: ControlDependentMembers);
22732 } else if (!ControlDependentMembers.empty()) {
22733 ScheduleBundle Invalid = ScheduleBundle::invalid();
22734 calculateDependencies(Bundle&: Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
22735 ControlDeps: ControlDependentMembers);
22736 }
22737
22738 if (ReSchedule) {
22739 resetSchedule();
22740 initialFillReadyList(ReadyList&: ReadyInsts);
22741 }
22742
22743 // Now try to schedule the new bundle or (if no bundle) just calculate
22744 // dependencies. As soon as the bundle is "ready" it means that there are no
22745 // cyclic dependencies and we can schedule it. Note that's important that we
22746 // don't "schedule" the bundle yet.
22747 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
22748 !ReadyInsts.empty()) {
22749 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
22750 assert(Picked->isReady() && "must be ready to schedule");
22751 schedule(R: *SLP, S, EI, Data: Picked, ReadyList&: ReadyInsts);
22752 if (Picked == &Bundle)
22753 break;
22754 }
22755 };
22756
22757 // Make sure that the scheduling region contains all
22758 // instructions of the bundle.
22759 for (Value *V : VL) {
22760 if (S.isNonSchedulable(V))
22761 continue;
22762 if (!extendSchedulingRegion(V, S)) {
22763 // If the scheduling region got new instructions at the lower end (or it
22764 // is a new region for the first bundle). This makes it necessary to
22765 // recalculate all dependencies.
22766 // Otherwise the compiler may crash trying to incorrectly calculate
22767 // dependencies and emit instruction in the wrong order at the actual
22768 // scheduling.
22769 ScheduleBundle Invalid = ScheduleBundle::invalid();
22770 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
22771 return std::nullopt;
22772 }
22773 }
22774
22775 bool ReSchedule = false;
22776 for (Value *V : VL) {
22777 if (S.isNonSchedulable(V))
22778 continue;
22779 SmallVector<ScheduleCopyableData *> CopyableData =
22780 getScheduleCopyableData(I: cast<Instruction>(Val: V));
22781 if (!CopyableData.empty()) {
22782 for (ScheduleCopyableData *SD : CopyableData)
22783 ReadyInsts.remove(X: SD);
22784 }
22785 ScheduleData *BundleMember = getScheduleData(V);
22786 assert((BundleMember || S.isCopyableElement(V)) &&
22787 "no ScheduleData for bundle member (maybe not in same basic block)");
22788 if (!BundleMember)
22789 continue;
22790
22791 // Make sure we don't leave the pieces of the bundle in the ready list when
22792 // whole bundle might not be ready.
22793 ReadyInsts.remove(X: BundleMember);
22794 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
22795 !Bundles.empty()) {
22796 for (ScheduleBundle *B : Bundles)
22797 ReadyInsts.remove(X: B);
22798 }
22799
22800 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
22801 continue;
22802 // A bundle member was scheduled as single instruction before and now
22803 // needs to be scheduled as part of the bundle. We just get rid of the
22804 // existing schedule.
22805 // A bundle member has deps calculated before it was copyable element - need
22806 // to reschedule.
22807 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
22808 << " was already scheduled\n");
22809 ReSchedule = true;
22810 }
22811
22812 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
22813 TryScheduleBundleImpl(ReSchedule, Bundle);
22814 if (!Bundle.isReady()) {
22815 for (ScheduleEntity *BD : Bundle.getBundle()) {
22816 // Copyable data scheduling is just removed.
22817 if (isa<ScheduleCopyableData>(Val: BD))
22818 continue;
22819 if (BD->isReady()) {
22820 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: BD->getInst());
22821 if (Bundles.empty()) {
22822 ReadyInsts.insert(X: BD);
22823 continue;
22824 }
22825 for (ScheduleBundle *B : Bundles)
22826 if (B->isReady())
22827 ReadyInsts.insert(X: B);
22828 }
22829 }
22830 ScheduledBundlesList.pop_back();
22831 SmallVector<ScheduleData *> ControlDependentMembers;
22832 for (Value *V : VL) {
22833 if (S.isNonSchedulable(V))
22834 continue;
22835 auto *I = cast<Instruction>(Val: V);
22836 if (S.isCopyableElement(V: I)) {
22837 // Remove the copyable data from the scheduling region and restore
22838 // previous mappings.
22839 auto KV = std::make_pair(x: EI, y&: I);
22840 assert(ScheduleCopyableDataMap.contains(KV) &&
22841 "no ScheduleCopyableData for copyable element");
22842 ScheduleCopyableData *SD =
22843 ScheduleCopyableDataMapByInst.find(Val: I)->getSecond().pop_back_val();
22844 ScheduleCopyableDataMapByUsers[I].remove(X: SD);
22845 if (EI.UserTE) {
22846 ArrayRef<Value *> Op = EI.UserTE->getOperand(OpIdx: EI.EdgeIdx);
22847 const auto *It = find(Range&: Op, Val: I);
22848 assert(It != Op.end() && "Lane not set");
22849 SmallPtrSet<Instruction *, 4> Visited;
22850 do {
22851 int Lane = std::distance(first: Op.begin(), last: It);
22852 assert(Lane >= 0 && "Lane not set");
22853 if (isa<StoreInst>(Val: EI.UserTE->Scalars[Lane]) &&
22854 !EI.UserTE->ReorderIndices.empty())
22855 Lane = EI.UserTE->ReorderIndices[Lane];
22856 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22857 "Couldn't find extract lane");
22858 auto *In = cast<Instruction>(Val: EI.UserTE->Scalars[Lane]);
22859 if (!Visited.insert(Ptr: In).second) {
22860 It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
22861 break;
22862 }
22863 ScheduleCopyableDataMapByInstUser
22864 [std::make_pair(x: std::make_pair(x&: In, y: EI.EdgeIdx), y&: I)]
22865 .pop_back();
22866 It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
22867 } while (It != Op.end());
22868 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
22869 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(EI: UserEI, V: I))
22870 ScheduleCopyableDataMapByUsers[I].insert(X: UserCD);
22871 }
22872 if (ScheduleCopyableDataMapByUsers[I].empty())
22873 ScheduleCopyableDataMapByUsers.erase(Val: I);
22874 ScheduleCopyableDataMap.erase(Val: KV);
22875 // Need to recalculate dependencies for the actual schedule data.
22876 if (ScheduleData *OpSD = getScheduleData(I);
22877 OpSD && OpSD->hasValidDependencies()) {
22878 OpSD->clearDirectDependencies();
22879 if (RegionHasStackSave ||
22880 !isGuaranteedToTransferExecutionToSuccessor(I: OpSD->getInst()))
22881 ControlDependentMembers.push_back(Elt: OpSD);
22882 }
22883 continue;
22884 }
22885 ScheduledBundles.find(Val: I)->getSecond().pop_back();
22886 }
22887 if (!ControlDependentMembers.empty()) {
22888 ScheduleBundle Invalid = ScheduleBundle::invalid();
22889 calculateDependencies(Bundle&: Invalid, /*InsertInReadyList=*/false, SLP,
22890 ControlDeps: ControlDependentMembers);
22891 }
22892 return std::nullopt;
22893 }
22894 return &Bundle;
22895}
22896
22897BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
22898 // Allocate a new ScheduleData for the instruction.
22899 if (ChunkPos >= ChunkSize) {
22900 ScheduleDataChunks.push_back(Elt: std::make_unique<ScheduleData[]>(num: ChunkSize));
22901 ChunkPos = 0;
22902 }
22903 return &(ScheduleDataChunks.back()[ChunkPos++]);
22904}
22905
// Grows the current scheduling region [ScheduleStart, ScheduleEnd) so that it
// covers the instruction behind \p V, walking the basic block simultaneously
// upwards and downwards from the current region boundaries.
// Returns false when growing would exceed ScheduleRegionSizeLimit (the caller
// then abandons the bundle); returns true otherwise.
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(Val: V);
  assert(I && "bundle member must be an instruction");
  // Already inside the region - nothing to extend.
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(FromI: I, ToI: I->getNextNode(), PrevLoadStore: nullptr, NextLoadStore: nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted
  // against the budget. Otherwise debug info could affect codegen.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  // Skip assume-like intrinsics before counting the first step in each
  // direction.
  UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
  DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
  // Advance both cursors in lock-step until one of them reaches I or falls off
  // the block; each paired step costs one unit of the region-size budget.
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
    DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
  }
  // I was found above the region (or the downward walk hit the end of the
  // block, which also means I must be above): extend the region start upwards.
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(FromI: I, ToI: ScheduleStart, PrevLoadStore: nullptr, NextLoadStore: FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  // Otherwise I lies below the region: extend the region end downwards past I.
  initScheduleData(FromI: ScheduleEnd, ToI: I->getNextNode(), PrevLoadStore: LastLoadStoreInRegion,
                   NextLoadStore: nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
22971
// Allocates/initializes ScheduleData for every instruction in the half-open
// range [FromI, ToI) and splices the range's memory-accessing instructions
// into the region-wide singly linked load/store list between PrevLoadStore
// and NextLoadStore. Also records whether the range contains stacksave /
// stackrestore calls (RegionHasStackSave), which later forces extra control
// dependencies.
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (isa<PHINode>(Val: I))
      continue;
    // Reuse a previously allocated ScheduleData (from an earlier scheduling
    // region) if one exists for this instruction.
    ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    // (Re-)initialize for the current region; SchedulingRegionID is what makes
    // isInSchedulingRegion() true for this SD.
    SD->init(BlockSchedulingRegionID: SchedulingRegionID, I);

    auto CanIgnoreLoad = [](const Instruction *I) {
      const auto *LI = dyn_cast<LoadInst>(Val: I);
      // If there is a simple load marked as invariant, we can ignore it.
      // But, in the (unlikely) case of non-simple invariant load,
      // we should not ignore it.
      return LI && LI->isSimple() &&
             LI->getMetadata(KindID: LLVMContext::MD_invariant_load);
    };

    // Thread memory-accessing instructions onto the load/store chain, except
    // invariant simple loads and the sideeffect/pseudoprobe intrinsics, which
    // need no memory-dependency tracking.
    if (I->mayReadOrWriteMemory() &&
        // Simple InvariantLoad does not depend on other memory accesses.
        !CanIgnoreLoad(I) &&
        (!isa<IntrinsicInst>(Val: I) ||
         (cast<IntrinsicInst>(Val: I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(Val: I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
        match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  // Reconnect the tail of the chain: either link into the existing suffix
  // (region was extended upwards) or record the new last load/store.
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
23026
// Computes (or refreshes) the dependency counts for \p Bundle and everything
// transitively reachable from it via a worklist: def-use dependencies,
// copyable-element pseudo-dependencies, control dependencies (non-speculatable
// instructions, stacksave/stackrestore interactions) and memory dependencies
// along the region's load/store chain. If \p InsertInReadyList is set, any
// entity/bundle that becomes ready during the walk is pushed into ReadyInsts.
// \p ControlDeps seeds the worklist with extra ScheduleData whose direct
// dependencies were invalidated by the caller.
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDeps) {
  SmallVector<ScheduleEntity *> WorkList;
  // Computes the dependencies of a single schedule entity and pushes any
  // user whose own deps are stale (or newly ready) onto the worklist.
  auto ProcessNode = [&](ScheduleEntity *SE) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(Val: SE)) {
      if (CD->hasValidDependencies())
        return;
      LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      if (EI.UserTE) {
        // A copyable element's only "users" are the lanes of the user tree
        // entry that consume it; walk every lane that holds this instruction.
        ArrayRef<Value *> Op = EI.UserTE->getOperand(OpIdx: EI.EdgeIdx);
        const auto *It = find(Range&: Op, Val: CD->getInst());
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(first: Op.begin(), last: It);
          assert(Lane >= 0 && "Lane not set");
          // Stores may have been reordered; map the operand lane back to the
          // scalar lane.
          if (isa<StoreInst>(Val: EI.UserTE->Scalars[Lane]) &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(Val: EI.UserTE->Scalars[Lane]);
          if (EI.UserTE->isCopyableElement(V: In)) {
            // We may not have related copyable scheduling data, if the
            // instruction is non-schedulable.
            if (ScheduleCopyableData *UseSD =
                    getScheduleCopyableData(EI: EI.UserTE->UserTreeIndex, V: In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(Incr: 1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(Elt: UseSD);
            }
          } else if (Visited.insert(Ptr: In).second) {
            if (ScheduleData *UseSD = getScheduleData(I: In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(Incr: 1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(Elt: UseSD);
            }
          }
          It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: CD->getInst());
        } while (It != Op.end());
        if (CD->isReady() && CD->getDependencies() == 0 &&
            (EI.UserTE->hasState() &&
             (EI.UserTE->getMainOp()->getParent() !=
                  CD->getInst()->getParent() ||
              (isa<PHINode>(Val: EI.UserTE->getMainOp()) &&
               (EI.UserTE->getMainOp()->hasNUsesOrMore(N: UsesLimit) ||
                any_of(Range: EI.UserTE->getMainOp()->users(), P: [&](User *U) {
                  auto *IU = dyn_cast<Instruction>(Val: U);
                  if (!IU)
                    return true;
                  return IU->getParent() == EI.UserTE->getMainOp()->getParent();
                })))))) {
          // If no uses in the block - mark as having pseudo-use, which cannot
          // be scheduled.
          // Prevents incorrect def-use tracking between external user and
          // actual instruction.
          CD->incDependencies();
          CD->incrementUnscheduledDeps(Incr: 1);
        }
      }
      return;
    }
    auto *BundleMember = cast<ScheduleData>(Val: SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      if (isa<PHINode>(Val: U))
        continue;
      if (ScheduleData *UseSD = getScheduleData(V: U)) {
        // The operand is a copyable element - skip.
        unsigned &NumOps = UserToNumOps.try_emplace(Key: U, Args: 0).first->getSecond();
        ++NumOps;
        if (areAllOperandsReplacedByCopyableData(
                User: cast<Instruction>(Val: U), Op: BundleMember->getInst(), SLP&: *SLP, NumOps))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(Incr: 1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(Elt: UseSD);
      }
    }
    // Copyable pseudo-instructions built on top of this instruction also
    // count as users.
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(User: BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(Incr: 1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(Elt: UseSD);
    }

    SmallPtrSet<const Instruction *, 4> Visited;
    // Records "I must not be reordered above BundleMember" as a control
    // dependency edge from I to BundleMember.
    auto MakeControlDependent = [&](Instruction *I) {
      // Do not mark control dependent twice.
      if (!Visited.insert(Ptr: I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(Dep: BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(Incr: 1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(Elt: DepDest);
    };

    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control depend on any early exit or non-willreturn call
    // which proceeds it.
    if (!isGuaranteedToTransferExecutionToSuccessor(I: BundleMember->getInst())) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        if (isSafeToSpeculativelyExecute(I, CtxI: &*BB->begin(), AC: SLP->AC))
          continue;

        // Add the dependency
        MakeControlDependent(I);

        if (!isGuaranteedToTransferExecutionToSuccessor(I))
          // Everything past here must be control dependent on I.
          break;
      }
    }

    if (RegionHasStackSave) {
      // If we have an inalloc alloca instruction, it needs to be scheduled
      // after any preceeding stacksave. We also need to prevent any alloca
      // from reordering above a preceeding stackrestore.
      if (match(V: BundleMember->getInst(), P: m_Intrinsic<Intrinsic::stacksave>()) ||
          match(V: BundleMember->getInst(),
                P: m_Intrinsic<Intrinsic::stackrestore>())) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
              match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
            // Any allocas past here must be control dependent on I, and I
            // must be memory dependend on BundleMember->Inst.
            break;

          if (!isa<AllocaInst>(Val: I))
            continue;

          // Add the dependency
          MakeControlDependent(I);
        }
      }

      // In addition to the cases handle just above, we need to prevent
      // allocas and loads/stores from moving below a stacksave or a
      // stackrestore. Avoiding moving allocas below stackrestore is currently
      // thought to be conservatism. Moving loads/stores below a stackrestore
      // can lead to incorrect code.
      if (isa<AllocaInst>(Val: BundleMember->getInst()) ||
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) &&
              !match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
            continue;

          // Add the dependency
          MakeControlDependent(I);
          break;
        }
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    Instruction *SrcInst = BundleMember->getInst();
    assert(SrcInst->mayReadOrWriteMemory() &&
           "NextLoadStore list for non memory effecting bundle?");
    MemoryLocation SrcLoc = getLocation(I: SrcInst);
    bool SrcMayWrite = SrcInst->mayWriteToMemory();
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(I: SrcInst);

    // Walk forward along the region's load/store chain and add a memory
    // dependency for every (possibly) aliasing write-involving pair.
    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");

      // We have two limits to reduce the complexity:
      // 1) AliasedCheckLimit: It's a small limit to reduce calls to
      //    SLP->isAliased (which is the expensive part in this loop).
      // 2) MaxMemDepDistance: It's for very large blocks and it aborts
      //    the whole loop (even if the loop is fast, it's quadratic).
      //    It's important for the loop break condition (see below) to
      //    check this limit even between two read-only instructions.
      if (DistToSrc >= MaxMemDepDistance ||
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
            SLP->isAliased(Loc1: SrcLoc, Inst1: SrcInst, Inst2: DepDest->getInst())))) {

        // We increment the counter only if the locations are aliased
        // (instead of counting all alias checks). This gives a better
        // balance between reduced runtime and accurate dependencies.
        NumAliased++;

        DepDest->addMemoryDependency(Dep: BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(Incr: 1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(Elt: DepDest);
      }

      // Example, explaining the loop break condition: Let's assume our
      // starting instruction is i0 and MaxMemDepDistance = 3.
      //
      //                      +--------v--v--v
      //             i0,i1,i2,i3,i4,i5,i6,i7,i8
      //             +--------^--^--^
      //
      // MaxMemDepDistance let us stop alias-checking at i3 and we add
      // dependencies from i0 to i3,i4,.. (even if they are not aliased).
      // Previously we already added dependencies from i3 to i6,i7,i8
      // (because of MaxMemDepDistance). As we added a dependency from
      // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
      // and we can abort this loop at i6.
      if (DistToSrc >= 2 * MaxMemDepDistance)
        break;
      DistToSrc++;
    }
  };

  assert((Bundle || !ControlDeps.empty()) &&
         "expected at least one instruction to schedule");
  // Seed the worklist with the bundle's first member plus any caller-supplied
  // control-dependent entries, then drain it.
  if (Bundle)
    WorkList.push_back(Elt: Bundle.getBundle().front());
  WorkList.append(in_start: ControlDeps.begin(), in_end: ControlDeps.end());
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    ScheduleEntity *SD = WorkList.pop_back_val();
    SmallVector<ScheduleBundle *, 1> CopyableBundle;
    ArrayRef<ScheduleBundle *> Bundles;
    if (auto *CD = dyn_cast<ScheduleCopyableData>(Val: SD)) {
      CopyableBundle.push_back(Elt: &CD->getBundle());
      Bundles = CopyableBundle;
    } else {
      Bundles = getScheduleBundles(V: SD->getInst());
    }
    if (Bundles.empty()) {
      // A standalone (non-bundled) entity: process it directly.
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(X: SD);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    // Otherwise process every member of every bundle the entity belongs to;
    // Visited ensures each bundle is expanded at most once.
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Ptr: Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Range: Bundle->getBundle(), F: ProcessNode);
    }
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(X: Bundle);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
                          << "\n");
      }
    }
  }
}
23319
23320void BoUpSLP::BlockScheduling::resetSchedule() {
23321 assert(ScheduleStart &&
23322 "tried to reset schedule on block which has not been scheduled");
23323 for_each(Range&: ScheduleDataMap, F: [&](auto &P) {
23324 if (BB != P.first->getParent())
23325 return;
23326 ScheduleData *SD = P.second;
23327 if (isInSchedulingRegion(SD: *SD)) {
23328 SD->setScheduled(/*Scheduled=*/false);
23329 SD->resetUnscheduledDeps();
23330 }
23331 });
23332 for_each(Range&: ScheduleCopyableDataMapByInst, F: [&](auto &P) {
23333 for_each(P.second, [&](ScheduleCopyableData *SD) {
23334 if (isInSchedulingRegion(SD: *SD)) {
23335 SD->setScheduled(/*Scheduled=*/false);
23336 SD->resetUnscheduledDeps();
23337 }
23338 });
23339 });
23340 for_each(Range&: ScheduledBundles, F: [&](auto &P) {
23341 for_each(P.second, [&](ScheduleBundle *Bundle) {
23342 if (isInSchedulingRegion(SD: *Bundle))
23343 Bundle->setScheduled(/*Scheduled=*/false);
23344 });
23345 });
23346 // Reset schedule data for copyable elements.
23347 for (auto &P : ScheduleCopyableDataMap) {
23348 if (isInSchedulingRegion(SD: *P.second)) {
23349 P.second->setScheduled(/*Scheduled=*/false);
23350 P.second->resetUnscheduledDeps();
23351 }
23352 }
23353 ReadyInsts.clear();
23354}
23355
// Performs the "real" scheduling for one basic block: recomputes dependencies
// for all entities in the scheduling region, then repeatedly picks the ready
// entity with the lowest original-order priority and physically moves its
// instruction(s) to the current insertion point. Finally clears ScheduleStart
// so the block is never scheduled twice.
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A key point - if we got here, pre-scheduling was able to find a valid
  // scheduling of the sub-graph of the scheduling window which consists
  // of all vector bundles and their transitive users. As such, we do not
  // need to reschedule anything *outside of* that subgraph.

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  // WARNING: If changing this order causes a correctness issue, that means
  // there is some missing dependence edge in the schedule data graph.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  // Idx increases in program order, so priorities encode original position.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(V: I);
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(Bundle&: *Bundle, /*InsertInReadyList=*/false, SLP: this);
      }
      SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
      for (ScheduleCopyableData *SD : reverse(C&: SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, SLP: this);
      }
      continue;
    }
    SmallVector<ScheduleCopyableData *> CopyableData =
        BS->getScheduleCopyableDataUsers(User: I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(V: I);
      assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
              SDTEs.front()->doesNotNeedToSchedule() ||
              doesNotNeedToBeScheduled(I)) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!CopyableData.empty() ||
          any_of(Range: R.ValueToGatherNodes.lookup(Val: I), P: [&](const TreeEntry *TE) {
            assert(TE->isGather() && "expected gather node");
            return TE->hasState() && TE->hasCopyableElements() &&
                   TE->isCopyableElement(V: I);
          })) {
        SD->clearDirectDependencies();
        // Need to calculate deps for these nodes to correctly handle copyable
        // dependencies, even if they were cancelled.
        // If copyables bundle was cancelled, the deps are cleared and need to
        // recalculate them.
        ScheduleBundle Bundle;
        Bundle.add(SD);
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, SLP: this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(C&: CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, SLP: this);
    }
  }
  BS->initialFillReadyList(ReadyList&: ReadyInsts);

  // Insertion point: instructions are moved to just before the last
  // scheduled one, so the schedule grows upwards from ScheduleEnd.
  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(position: ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Val: Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        // If copyable must be schedule as part of something else, skip it.
        bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(V: PickedInst);
        if ((IsCopyable && BS->getScheduleData(I: PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(Ptr: PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
        LastScheduledInst = PickedInst;
      }
      // Remember where this tree entry's code ends up, for vector emission.
      EntryToLastInstruction.try_emplace(Key: Bundle->getTreeEntry(),
                                         Args&: LastScheduledInst);
    } else {
      auto *SD = cast<ScheduleData>(Val: Picked);
      Instruction *PickedInst = SD->getInst();
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    // Decrement dependency counters of users; may make new entities ready.
    BS->schedule(R, S: Invalid, EI: EdgeInfo(), Data: Picked, ReadyList&: ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    assert(all_of(Bundles,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
23493
// Compute the preferred scalar element size (in bits) for the expression
// rooted at V. Where possible, the width is derived from the memory
// operations (loads/extracts) feeding the expression rather than from V's
// own type. Results are memoized in InstrElementSize for every instruction
// visited during the traversal.
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value (or value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(Val: V))
    return DL->getTypeSizeInBits(Ty: Store->getValueOperand()->getType());

  // For an insertelement, the interesting width is that of the inserted
  // scalar (operand 1), not the vector operand.
  if (auto *IEI = dyn_cast<InsertElementInst>(Val: V))
    return getVectorElementSize(V: IEI->getOperand(i_nocapture: 1));

  // Return the cached result if V was analyzed before.
  auto E = InstrElementSize.find(Val: V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  // Worklist items: (instruction, parent block of the chain, recursion depth).
  SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(Val: V)) {
    Worklist.emplace_back(Args&: I, Args: I->getParent(), Args: 0);
    Visited.insert(Ptr: I);
  }

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  auto Width = 0u;
  // First non-i1 value seen during the walk; used as a fallback when V itself
  // is i1 and no memory operation was found.
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent, Level] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Val: Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;
    // Bound the depth of the traversal.
    if (Level > RecursionMaxDepth)
      continue;

    // If the current instruction is a load (or an extract from a vector or
    // aggregate), update Width to reflect the width of the accessed value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: I))
      Width = std::max<unsigned>(a: Width, b: DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and from the same basic block as the
    // user or the use is a PHI node, we add it to the worklist.
    else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                 BinaryOperator, UnaryOperator>(Val: I)) {
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(Val: U.get()))
          if (Visited.insert(Ptr: J).second &&
              (isa<PHINode>(Val: I) || J->getParent() == Parent)) {
            Worklist.emplace_back(Args&: J, Args: J->getParent(), Args: Level + 1);
            continue;
          }
        // Operand not traversed; still remember it as a non-i1 fallback.
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      // Unhandled instruction kind: give up on the traversal.
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V. Otherwise, return the
  // maximum width we found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(Ty: V->getType());
  }

  // Cache the computed width for every instruction visited.
  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
23576
/// Checks whether the scalars of the tree entry \p E can be demoted to a
/// narrower integer type of \p BitWidth bits. On success the indices of
/// demotable tree entries are appended to \p ToDemote, and \p BitWidth may be
/// widened to the minimal width proven sufficient for correctness.
/// \p MaxDepthLevel tracks how deep the demotable (sub)tree is;
/// \p Visited / \p NodesToKeepBWs prevent reanalysis of entries; \p
/// IsTruncRoot indicates the analysis was seeded from a trunc root. Returns
/// true if the (sub)tree rooted at \p E can be demoted.
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(Range: E.Scalars, P: IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(Ty: E.Scalars.front()->getType()->getScalarType());
  // Already at the target width - trivially demotable.
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // Check if the node was analyzed already and must keep its original bitwidth.
  if (NodesToKeepBWs.contains(V: E.Idx))
    return false;

  // If the value is not a vectorized instruction in the expression and not used
  // by the insertelement instruction and not used in multiple vector nodes, it
  // cannot be demoted.
  // IsSignedNode: true if any scalar may be negative (sign bit matters).
  bool IsSignedNode = any_of(Range: E.Scalars, P: [&](Value *R) {
    if (isa<PoisonValue>(Val: R))
      return false;
    return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
  });
  // Checks whether a single scalar V can be computed in BitWidth bits;
  // may grow BitWidth (by reference) to the minimal width it can prove.
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(Val: V))
      return true;
    // A scalar shared between several vector nodes cannot be demoted safely.
    if (getTreeEntries(V).size() > 1)
      return false;
    // For last shuffle of sext/zext with many uses need to check the extra bit
    // for unsigned values, otherwise may have incorrect casting for reused
    // scalars.
    bool IsSignedVal = !isKnownNonNegative(V, SQ: SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
      if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL)))
        return true;
    }
    // BitWidth1: minimal width implied by the number of known sign bits
    // (plus one spare bit for potentially-negative nodes).
    unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedNode)
      ++BitWidth1;
    if (auto *I = dyn_cast<Instruction>(Val: V)) {
      // BitWidth2: minimal width implied by the demanded bits of I, widened
      // until the dropped high bits are provably zero (unsigned case only).
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(a: 1, b: Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth2 - 1);
        if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(a: BitWidth1, b: BitWidth2);
    }
    BitWidth = std::max(a: BitWidth, b: BitWidth1);
    // Only worthwhile if the width at least halves.
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  // Last-chance analysis: re-check all scalars for truncatability and handle
  // gather nodes (matching vectorized entries / extractelement bases)
  // specially.
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1, args: std::ref(t&: BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      if (E.hasState()) {
        // If the gather duplicates a vectorized entry, demote it iff that
        // entry is demotable.
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(V: E.getMainOp(), VL: E.Scalars);
            SameTE)
          if (collectValuesToDemote(E: *SameTE, IsProfitableToDemoteRoot, BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
                                    IsTruncRoot)) {
            ToDemote.push_back(Elt: E.Idx);
            return true;
          }
      }
      // Check possible extractelement instructions bases and final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(Val: V);
        if (!EE)
          continue;
        UniqueBases.insert(Ptr: EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      // Demote only if few bases are involved or the narrower type does not
      // increase the number of vector registers required.
      if (UniqueBases.size() <= 2 ||
          ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF)) >=
              ::getNumberOfParts(
                  TTI: *TTI,
                  VecTy: getWidenedType(
                      ScalarTy: IntegerType::get(C&: OrigScalarTy->getContext(), NumBits: BitWidth),
                      VF))) {
        ToDemote.push_back(Elt: E.Idx);
        return true;
      }
    }
    return Res;
  };
  // Gathers, already-visited entries, and scalars only feeding non-vectorized
  // insertelements get the last-chance analysis only.
  if (E.isGather() || !Visited.insert(V: &E).second ||
      any_of(Range: E.Scalars, P: [&](Value *V) {
        return !isa<Constant>(Val: V) && all_of(Range: V->users(), P: [&](User *U) {
          return isa<InsertElementInst>(Val: U) && !isVectorized(V: U);
        });
      }))
    return FinalAnalysis();

  // Reject entries with scalars that have external (non-vectorized,
  // non-narrow) users, unless the scalar can be truncated anyway.
  if (any_of(Range: E.Scalars, P: [&](Value *V) {
        return !isa<Constant>(Val: V) && !all_of(Range: V->users(), P: [=](User *U) {
          return isVectorized(V: U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(V: U)) ||
                 (!isa<CmpInst>(Val: U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(Ty: U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  // Recursively analyzes the given operand entries. Sets NeedToExit when the
  // caller should stop (after running FinalAnalysis); returns false on
  // definite failure.
  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(E: *Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel&: Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(a: MaxDepthLevel, b: Level);
    }
    return true;
  };
  // Searches for the smallest BitWidth (doubling each step) that satisfies
  // Checker; falls back to the best width for which FinalAnalysis succeeded.
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidth < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BitWidth >= OrigBitWidth) {
          if (BestFailBitwidth == 0) {
            BitWidth = OrigBitWidth;
            return false;
          }
          MaxDepthLevel = 1;
          BitWidth = BestFailBitwidth;
          NeedToExit = true;
          return true;
        }
        return false;
      };
  // Common driver for the per-opcode cases below: validates the entry's
  // scalars (and operands, if given) and records E.Idx in ToDemote.
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (any_of(Range: E.Scalars, P: [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(Elt: E.Idx);
        return IsProfitableToDemote;
      };

  // Split nodes: process both combined entries as operands.
  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});

  if (E.isAltShuffle()) {
    // Combining these opcodes may lead to incorrect analysis, skip for now.
    auto IsDangerousOpcode = [](unsigned Opcode) {
      switch (Opcode) {
      case Instruction::Shl:
      case Instruction::AShr:
      case Instruction::LShr:
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::URem:
      case Instruction::SRem:
        return true;
      default:
        break;
      }
      return false;
    };
    if (IsDangerousOpcode(E.getAltOpcode()))
      return FinalAnalysis();
  }

  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    // Do not demote an extension that feeds a bitcast to an FP type; the bit
    // pattern must be preserved exactly.
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(E: &E, Idx: 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // inrange amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        if (isa<PoisonValue>(Val: V))
          return true;
        if (E.isCopyableElement(V))
          return true;
        auto *I = cast<Instruction>(Val: V);
        KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
        return AmtKnownBits.getMaxValue().ult(RHS: BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        if (isa<PoisonValue>(Val: V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask: ShiftedBits, SQ: SimplifyQuery(*DL));
        auto *I = cast<Instruction>(Val: V);
        KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
        return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
               MaskedValueIsZero(V: I->getOperand(i: 0), Mask: ShiftedBits,
                                 SQ: SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits from the sign bit of the
    // original type and the sign bit of the truncate type are similar.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        if (isa<PoisonValue>(Val: V))
          return true;
        auto *I = cast<Instruction>(Val: V);
        KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
        if (E.hasCopyableElements() && E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL));
        auto *I = cast<Instruction>(Val: V);
        return MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL)) &&
               MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 1), getOperandEntry(E: &E, Idx: 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(Range: seq<unsigned>(Begin: 0, End: NumOps), d_first: Ops.begin(),
              F: [&](unsigned Idx) { return getOperandEntry(E: &E, Idx); });

    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    // Only a small set of intrinsics (abs and integer min/max) is supported.
    auto *IC = dyn_cast<IntrinsicInst>(Val: E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(E: &E, Idx: 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    // Checker for two-operand min/max: unsigned variants need the truncated
    // bits to be zero; signed variants need enough sign bits on both operands.
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        auto *I = cast<Instruction>(Val: V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
          return MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
                                   SQ: SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(Op: I->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
                MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
                                  SQ: SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(V: I->getOperand(i: 1), SQ: SimplifyQuery(*DL))) ||
                MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL)));
      });
    };
    // Checker for abs: the single operand must have enough sign bits.
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        auto *I = cast<Instruction>(Val: V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
                MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(Elt: getOperandEntry(E: &E, Idx: 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      unsigned MinBW = PowerOf2Ceil(A: BitWidth);
      SmallVector<Type *> ArgTys =
          buildIntrinsicArgTypes(CI: IC, ID, VF, MinBW, TTI);
      auto VecCallCosts = getVectorCallCosts(
          CI: IC, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: IC->getContext(), NumBits: MinBW), VF),
          TTI, TLI, ArgTys);
      InstructionCost Cost = std::min(a: VecCallCosts.first, b: VecCallCosts.second);
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      // Always return false so AttemptCheckBitwidth scans every width.
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
24017
24018static RecurKind getRdxKind(Value *V);
24019
24020void BoUpSLP::computeMinimumValueSizes() {
24021 // We only attempt to truncate integer expressions.
24022 bool IsStoreOrInsertElt =
24023 VectorizableTree.front()->hasState() &&
24024 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
24025 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
24026 if ((IsStoreOrInsertElt || UserIgnoreList) &&
24027 ExtraBitWidthNodes.size() <= 1 &&
24028 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
24029 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
24030 return;
24031
24032 unsigned NodeIdx = 0;
24033 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
24034 NodeIdx = 1;
24035
24036 // Ensure the roots of the vectorizable tree don't form a cycle.
24037 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
24038 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
24039 "Unexpected tree is graph.");
24040
24041 // The first value node for store/insertelement is sext/zext/trunc? Skip it,
24042 // resize to the final type.
24043 bool IsTruncRoot = false;
24044 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
24045 SmallVector<unsigned> RootDemotes;
24046 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
24047 if (NodeIdx != 0 &&
24048 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
24049 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
24050 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
24051 IsTruncRoot = true;
24052 RootDemotes.push_back(Elt: NodeIdx);
24053 IsProfitableToDemoteRoot = true;
24054 ++NodeIdx;
24055 }
24056
24057 // Analyzed the reduction already and not profitable - exit.
24058 if (AnalyzedMinBWVals.contains(V: VectorizableTree[NodeIdx]->Scalars.front()))
24059 return;
24060
24061 SmallVector<unsigned> ToDemote;
24062 auto ComputeMaxBitWidth =
24063 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
24064 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
24065 ToDemote.clear();
24066 // Check if the root is trunc and the next node is gather/buildvector, then
24067 // keep trunc in scalars, which is free in most cases.
24068 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
24069 !NodesToKeepBWs.contains(V: E.Idx) &&
24070 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
24071 all_of(Range: E.Scalars, P: [&](Value *V) {
24072 return V->hasOneUse() || isa<Constant>(Val: V) ||
24073 (!V->hasNUsesOrMore(N: UsesLimit) &&
24074 none_of(Range: V->users(), P: [&](User *U) {
24075 ArrayRef<TreeEntry *> TEs = getTreeEntries(V: U);
24076 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
24077 if (TEs.empty() || is_contained(Range&: TEs, Element: UserTE))
24078 return false;
24079 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
24080 SelectInst>(Val: U) ||
24081 isa<SIToFPInst, UIToFPInst>(Val: U) ||
24082 (UserTE->hasState() &&
24083 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
24084 SelectInst>(Val: UserTE->getMainOp()) ||
24085 isa<SIToFPInst, UIToFPInst>(Val: UserTE->getMainOp()))))
24086 return true;
24087 unsigned UserTESz = DL->getTypeSizeInBits(
24088 Ty: UserTE->Scalars.front()->getType());
24089 if (all_of(Range&: TEs, P: [&](const TreeEntry *TE) {
24090 auto It = MinBWs.find(Val: TE);
24091 return It != MinBWs.end() &&
24092 It->second.first > UserTESz;
24093 }))
24094 return true;
24095 return DL->getTypeSizeInBits(Ty: U->getType()) > UserTESz;
24096 }));
24097 })) {
24098 ToDemote.push_back(Elt: E.Idx);
24099 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
24100 auto It = MinBWs.find(Val: UserTE);
24101 if (It != MinBWs.end())
24102 return It->second.first;
24103 unsigned MaxBitWidth =
24104 DL->getTypeSizeInBits(Ty: UserTE->Scalars.front()->getType());
24105 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
24106 if (MaxBitWidth < 8 && MaxBitWidth > 1)
24107 MaxBitWidth = 8;
24108 return MaxBitWidth;
24109 }
24110
24111 if (!E.hasState())
24112 return 0u;
24113
24114 unsigned VF = E.getVectorFactor();
24115 Type *ScalarTy = E.Scalars.front()->getType();
24116 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
24117 auto *TreeRootIT = dyn_cast<IntegerType>(Val: ScalarTy->getScalarType());
24118 if (!TreeRootIT)
24119 return 0u;
24120
24121 if (any_of(Range: E.Scalars,
24122 P: [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
24123 return 0u;
24124
24125 unsigned NumParts = ::getNumberOfParts(
24126 TTI: *TTI, VecTy: getWidenedType(ScalarTy: TreeRootIT, VF: VF * ScalarTyNumElements));
24127
24128 // The maximum bit width required to represent all the values that can be
24129 // demoted without loss of precision. It would be safe to truncate the roots
24130 // of the expression to this width.
24131 unsigned MaxBitWidth = 1u;
24132
24133 // True if the roots can be zero-extended back to their original type,
24134 // rather than sign-extended. We know that if the leading bits are not
24135 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
24136 // True.
24137 // Determine if the sign bit of all the roots is known to be zero. If not,
24138 // IsKnownPositive is set to False.
24139 bool IsKnownPositive = !IsSignedCmp && all_of(Range: E.Scalars, P: [&](Value *R) {
24140 if (isa<PoisonValue>(Val: R))
24141 return true;
24142 KnownBits Known = computeKnownBits(V: R, DL: *DL);
24143 return Known.isNonNegative();
24144 });
24145
24146 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
24147 E.UserTreeIndex.UserTE->hasState() &&
24148 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
24149 MaxBitWidth =
24150 std::min(a: DL->getTypeSizeInBits(
24151 Ty: E.UserTreeIndex.UserTE->Scalars.front()->getType()),
24152 b: DL->getTypeSizeInBits(Ty: ScalarTy));
24153
24154 // We first check if all the bits of the roots are demanded. If they're not,
24155 // we can truncate the roots to this narrower type.
24156 for (Value *Root : E.Scalars) {
24157 if (isa<PoisonValue>(Val: Root))
24158 continue;
24159 unsigned NumSignBits = ComputeNumSignBits(Op: Root, DL: *DL, AC, CxtI: nullptr, DT);
24160 TypeSize NumTypeBits =
24161 DL->getTypeSizeInBits(Ty: Root->getType()->getScalarType());
24162 unsigned BitWidth1 = NumTypeBits - NumSignBits;
24163 // If we can't prove that the sign bit is zero, we must add one to the
24164 // maximum bit width to account for the unknown sign bit. This preserves
24165 // the existing sign bit so we can safely sign-extend the root back to the
24166 // original type. Otherwise, if we know the sign bit is zero, we will
24167 // zero-extend the root instead.
24168 //
24169 // FIXME: This is somewhat suboptimal, as there will be cases where adding
24170 // one to the maximum bit width will yield a larger-than-necessary
24171 // type. In general, we need to add an extra bit only if we can't
24172 // prove that the upper bit of the original type is equal to the
24173 // upper bit of the proposed smaller type. If these two bits are
24174 // the same (either zero or one) we know that sign-extending from
24175 // the smaller type will result in the same value. Here, since we
24176 // can't yet prove this, we are just making the proposed smaller
24177 // type larger to ensure correctness.
24178 if (!IsKnownPositive)
24179 ++BitWidth1;
24180
24181 auto *I = dyn_cast<Instruction>(Val: Root);
24182 if (!I) {
24183 MaxBitWidth = std::max(a: BitWidth1, b: MaxBitWidth);
24184 continue;
24185 }
24186 APInt Mask = DB->getDemandedBits(I);
24187 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
24188 MaxBitWidth =
24189 std::max<unsigned>(a: std::min(a: BitWidth1, b: BitWidth2), b: MaxBitWidth);
24190 }
24191
24192 if (MaxBitWidth < 8 && MaxBitWidth > 1)
24193 MaxBitWidth = 8;
24194
24195 // If the original type is large, but reduced type does not improve the reg
24196 // use - ignore it.
24197 if (NumParts > 1 &&
24198 NumParts ==
24199 ::getNumberOfParts(
24200 TTI: *TTI, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: F->getContext(),
24201 NumBits: bit_ceil(Value: MaxBitWidth)),
24202 VF)))
24203 return 0u;
24204
24205 unsigned Opcode = E.getOpcode();
24206 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
24207 Opcode == Instruction::SExt ||
24208 Opcode == Instruction::ZExt || NumParts > 1;
24209 // Conservatively determine if we can actually truncate the roots of the
24210 // expression. Collect the values that can be demoted in ToDemote and
24211 // additional roots that require investigating in Roots.
24212 DenseSet<const TreeEntry *> Visited;
24213 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
24214 bool NeedToDemote = IsProfitableToDemote;
24215
24216 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, BitWidth&: MaxBitWidth,
24217 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
24218 IsProfitableToDemote&: NeedToDemote, IsTruncRoot) ||
24219 (MaxDepthLevel <= Limit &&
24220 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
24221 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
24222 DL->getTypeSizeInBits(Ty: TreeRootIT) /
24223 DL->getTypeSizeInBits(
24224 Ty: E.getMainOp()->getOperand(i: 0)->getType()) >
24225 2)))))
24226 return 0u;
24227 // Round MaxBitWidth up to the next power-of-two.
24228 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
24229
24230 return MaxBitWidth;
24231 };
24232
24233 // If we can truncate the root, we must collect additional values that might
24234 // be demoted as a result. That is, those seeded by truncations we will
24235 // modify.
24236 // Add reduction ops sizes, if any.
24237 if (UserIgnoreList &&
24238 isa<IntegerType>(Val: VectorizableTree.front()->Scalars.front()->getType())) {
24239 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
24240 // x i1> to in)).
24241 if (all_of(Range: *UserIgnoreList,
24242 P: [](Value *V) {
24243 return isa<PoisonValue>(Val: V) ||
24244 cast<Instruction>(Val: V)->getOpcode() == Instruction::Add;
24245 }) &&
24246 VectorizableTree.front()->State == TreeEntry::Vectorize &&
24247 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
24248 cast<CastInst>(Val: VectorizableTree.front()->getMainOp())->getSrcTy() ==
24249 Builder.getInt1Ty()) {
24250 ReductionBitWidth = 1;
24251 } else {
24252 for (Value *V : *UserIgnoreList) {
24253 if (isa<PoisonValue>(Val: V))
24254 continue;
24255 unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT);
24256 TypeSize NumTypeBits = DL->getTypeSizeInBits(Ty: V->getType());
24257 unsigned BitWidth1 = NumTypeBits - NumSignBits;
24258 if (!isKnownNonNegative(V, SQ: SimplifyQuery(*DL)))
24259 ++BitWidth1;
24260 unsigned BitWidth2 = BitWidth1;
24261 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: ::getRdxKind(V))) {
24262 APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: V));
24263 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
24264 }
24265 ReductionBitWidth =
24266 std::max(a: std::min(a: BitWidth1, b: BitWidth2), b: ReductionBitWidth);
24267 }
24268 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
24269 ReductionBitWidth = 8;
24270
24271 ReductionBitWidth = bit_ceil(Value: ReductionBitWidth);
24272 }
24273 }
24274 bool IsTopRoot = NodeIdx == 0;
24275 while (NodeIdx < VectorizableTree.size() &&
24276 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
24277 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
24278 RootDemotes.push_back(Elt: NodeIdx);
24279 ++NodeIdx;
24280 IsTruncRoot = true;
24281 }
24282 bool IsSignedCmp = false;
24283 if (UserIgnoreList &&
24284 all_of(Range: *UserIgnoreList,
24285 P: match_fn(P: m_CombineOr(L: m_SMin(L: m_Value(), R: m_Value()),
24286 R: m_SMax(L: m_Value(), R: m_Value())))))
24287 IsSignedCmp = true;
24288 while (NodeIdx < VectorizableTree.size()) {
24289 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
24290 unsigned Limit = 2;
24291 if (IsTopRoot &&
24292 ReductionBitWidth ==
24293 DL->getTypeSizeInBits(
24294 Ty: VectorizableTree.front()->Scalars.front()->getType()))
24295 Limit = 3;
24296 unsigned MaxBitWidth = ComputeMaxBitWidth(
24297 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
24298 IsTruncRoot, IsSignedCmp);
24299 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
24300 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
24301 ReductionBitWidth = bit_ceil(Value: MaxBitWidth);
24302 else if (MaxBitWidth == 0)
24303 ReductionBitWidth = 0;
24304 }
24305
24306 for (unsigned Idx : RootDemotes) {
24307 if (all_of(Range&: VectorizableTree[Idx]->Scalars, P: [&](Value *V) {
24308 uint32_t OrigBitWidth =
24309 DL->getTypeSizeInBits(Ty: V->getType()->getScalarType());
24310 if (OrigBitWidth > MaxBitWidth) {
24311 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: MaxBitWidth);
24312 return MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL));
24313 }
24314 return false;
24315 }))
24316 ToDemote.push_back(Elt: Idx);
24317 }
24318 RootDemotes.clear();
24319 IsTopRoot = false;
24320 IsProfitableToDemoteRoot = true;
24321
24322 if (ExtraBitWidthNodes.empty()) {
24323 NodeIdx = VectorizableTree.size();
24324 } else {
24325 unsigned NewIdx = 0;
24326 do {
24327 NewIdx = *ExtraBitWidthNodes.begin();
24328 ExtraBitWidthNodes.erase(I: ExtraBitWidthNodes.begin());
24329 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
24330 NodeIdx = NewIdx;
24331 IsTruncRoot =
24332 NodeIdx < VectorizableTree.size() &&
24333 VectorizableTree[NodeIdx]->UserTreeIndex &&
24334 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
24335 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
24336 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
24337 Instruction::Trunc &&
24338 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
24339 IsSignedCmp =
24340 NodeIdx < VectorizableTree.size() &&
24341 VectorizableTree[NodeIdx]->UserTreeIndex &&
24342 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
24343 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
24344 Instruction::ICmp &&
24345 any_of(
24346 Range&: VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
24347 P: [&](Value *V) {
24348 auto *IC = dyn_cast<ICmpInst>(Val: V);
24349 return IC && (IC->isSigned() ||
24350 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 0),
24351 SQ: SimplifyQuery(*DL)) ||
24352 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 1),
24353 SQ: SimplifyQuery(*DL)));
24354 });
24355 }
24356
24357 // If the maximum bit width we compute is less than the width of the roots'
24358 // type, we can proceed with the narrowing. Otherwise, do nothing.
24359 if (MaxBitWidth == 0 ||
24360 MaxBitWidth >=
24361 cast<IntegerType>(Val: TreeRoot.front()->getType()->getScalarType())
24362 ->getBitWidth()) {
24363 if (UserIgnoreList)
24364 AnalyzedMinBWVals.insert_range(R&: TreeRoot);
24365 NodesToKeepBWs.insert_range(R&: ToDemote);
24366 continue;
24367 }
24368
24369 // Finally, map the values we can demote to the maximum bit with we
24370 // computed.
24371 for (unsigned Idx : ToDemote) {
24372 TreeEntry *TE = VectorizableTree[Idx].get();
24373 if (MinBWs.contains(Val: TE))
24374 continue;
24375 bool IsSigned = any_of(Range&: TE->Scalars, P: [&](Value *R) {
24376 if (isa<PoisonValue>(Val: R))
24377 return false;
24378 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
24379 });
24380 MinBWs.try_emplace(Key: TE, Args&: MaxBitWidth, Args&: IsSigned);
24381 }
24382 }
24383}
24384
24385PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
24386 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
24387 auto *TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
24388 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(IR&: F);
24389 auto *AA = &AM.getResult<AAManager>(IR&: F);
24390 auto *LI = &AM.getResult<LoopAnalysis>(IR&: F);
24391 auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
24392 auto *AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
24393 auto *DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
24394 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
24395
24396 bool Changed = runImpl(F, SE_: SE, TTI_: TTI, TLI_: TLI, AA_: AA, LI_: LI, DT_: DT, AC_: AC, DB_: DB, ORE_: ORE);
24397 if (!Changed)
24398 return PreservedAnalyses::all();
24399
24400 PreservedAnalyses PA;
24401 PA.preserveSet<CFGAnalyses>();
24402 return PA;
24403}
24404
24405bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
24406 TargetTransformInfo *TTI_,
24407 TargetLibraryInfo *TLI_, AAResults *AA_,
24408 LoopInfo *LI_, DominatorTree *DT_,
24409 AssumptionCache *AC_, DemandedBits *DB_,
24410 OptimizationRemarkEmitter *ORE_) {
24411 if (!RunSLPVectorization)
24412 return false;
24413 SE = SE_;
24414 TTI = TTI_;
24415 TLI = TLI_;
24416 AA = AA_;
24417 LI = LI_;
24418 DT = DT_;
24419 AC = AC_;
24420 DB = DB_;
24421 DL = &F.getDataLayout();
24422
24423 Stores.clear();
24424 GEPs.clear();
24425 bool Changed = false;
24426
24427 // If the target claims to have no vector registers don't attempt
24428 // vectorization.
24429 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true))) {
24430 LLVM_DEBUG(
24431 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
24432 return false;
24433 }
24434
24435 // Don't vectorize when the attribute NoImplicitFloat is used.
24436 if (F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
24437 return false;
24438
24439 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
24440
24441 // Use the bottom up slp vectorizer to construct chains that start with
24442 // store instructions.
24443 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
24444
24445 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
24446 // delete instructions.
24447
24448 // Update DFS numbers now so that we can use them for ordering.
24449 DT->updateDFSNumbers();
24450
24451 // Scan the blocks in the function in post order.
24452 for (auto *BB : post_order(G: &F.getEntryBlock())) {
24453 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator()))
24454 continue;
24455
24456 // Start new block - clear the list of reduction roots.
24457 R.clearReductionData();
24458 collectSeedInstructions(BB);
24459
24460 // Vectorize trees that end at stores.
24461 if (!Stores.empty()) {
24462 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
24463 << " underlying objects.\n");
24464 Changed |= vectorizeStoreChains(R);
24465 }
24466
24467 // Vectorize trees that end at reductions.
24468 Changed |= vectorizeChainsInBlock(BB, R);
24469
24470 // Vectorize the index computations of getelementptr instructions. This
24471 // is primarily intended to catch gather-like idioms ending at
24472 // non-consecutive loads.
24473 if (!GEPs.empty()) {
24474 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
24475 << " underlying objects.\n");
24476 Changed |= vectorizeGEPIndices(BB, R);
24477 }
24478 }
24479
24480 if (Changed) {
24481 R.optimizeGatherSequence();
24482 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
24483 }
24484 return Changed;
24485}
24486
/// Try to vectorize \p Chain, a list of stores already ordered by consecutive
/// pointer distance, which starts at offset \p Idx of the enclosing store
/// sequence.
/// \param MinVF minimum vectorization factor the caller will accept.
/// \param Size [out] bookkeeping value for the caller: 0 if no tree was
///        built, 1 or 2 for early-rejected/cut-off trees, otherwise the
///        canonical graph size of the built tree.
/// \returns true if the chain was vectorized, false if it was analyzed and
///          rejected, std::nullopt if the tree was gathered/not scheduled so
///          the caller may retry with a different slice.
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
                    << "\n");
  const unsigned Sz = R.getVectorElementSize(V: Chain[0]);
  unsigned VF = Chain.size();

  // Reject element sizes / chain lengths the target cannot form a full vector
  // from, unless the non-power-of-2 escape hatch below applies.
  if (!has_single_bit(Value: Sz) ||
      !hasFullVectorsOrPowerOf2(
          TTI: *TTI, Ty: cast<StoreInst>(Val: Chain.front())->getValueOperand()->getType(),
          Sz: VF) ||
      VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
    // all vector lanes are used.
    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
      return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  // Collect the unique stored values; these become the would-be tree roots.
  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(X: cast<StoreInst>(Val: V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL: ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
  if (all_of(Range&: ValOps, P: IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ValOps.front()->getType(),
                                 Sz: ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(Value: ValOps.size() + 1));
    // Bail out early when the operands either have disallowed sizes with
    // non-removable main op / external uses, or too many unique values with
    // no common state - building the tree would be wasted work.
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(Range: ValOps.getArrayRef(),
                 P: [&](Value *V) {
                   return !isa<ExtractElementInst>(Val: V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(Range: V->users(), P: [&](User *U) {
                             return !Stores.contains(V: U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  R.buildTree(Roots: Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    // Tri-state result: nullopt tells the caller the slice was unschedulable
    // or gathered, so a different slice may still succeed.
    if (R.isGathered(V: Chain.front()) ||
        R.isNotScheduled(V: cast<StoreInst>(Val: Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.computeMinimumValueSizes();

  InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
  R.buildExternalUses();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost(TreeCost);

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
  // Negative cost means the vector form is cheaper; require the saving to
  // beat the configured threshold before rewriting the IR.
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "StoresVectorized",
                                          cast<StoreInst>(Val: Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
24582
24583/// Checks if the quadratic mean deviation is less than 90% of the mean size.
24584static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes) {
24585 unsigned Num = 0;
24586 uint64_t Sum = std::accumulate(
24587 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
24588 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
24589 unsigned Size = Val.first;
24590 if (Size == 1)
24591 return V;
24592 ++Num;
24593 return V + Size;
24594 });
24595 if (Num == 0)
24596 return true;
24597 uint64_t Mean = Sum / Num;
24598 if (Mean == 0)
24599 return true;
24600 uint64_t Dev = std::accumulate(
24601 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
24602 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
24603 unsigned P = Val.first;
24604 if (P == 1)
24605 return V;
24606 return V + (P - Mean) * (P - Mean);
24607 }) /
24608 Num;
24609 return Dev * 96 / (Mean * Mean) == 0;
24610}
24611
24612namespace {
24613
24614/// A group of stores that we'll try to bundle together using vector ops.
24615/// They are ordered using the signed distance of their address operand to the
24616/// address of this group's BaseInstr.
24617class RelatedStoreInsts {
24618public:
24619 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
24620 : AllStores(AllStores) {
24621 reset(NewBaseInstr: BaseInstrIdx);
24622 }
24623
24624 void reset(unsigned NewBaseInstr) {
24625 assert(NewBaseInstr < AllStores.size() &&
24626 "Instruction index out of bounds");
24627 BaseInstrIdx = NewBaseInstr;
24628 Instrs.clear();
24629 insertOrLookup(InstrIdx: NewBaseInstr, PtrDist: 0);
24630 }
24631
24632 /// Tries to insert \p InstrIdx as the store with a pointer distance of
24633 /// \p PtrDist.
24634 /// Does nothing if there is already a store with that \p PtrDist.
24635 /// \returns The previously associated Instruction index, or std::nullopt
24636 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
24637 auto [It, Inserted] = Instrs.emplace(args&: PtrDist, args&: InstrIdx);
24638 return Inserted ? std::nullopt : std::make_optional(t&: It->second);
24639 }
24640
24641 using DistToInstMap = std::map<int64_t, unsigned>;
24642 const DistToInstMap &getStores() const { return Instrs; }
24643
24644 /// If \p SI is related to this group of stores, return the distance of its
24645 /// pointer operand to the one the group's BaseInstr.
24646 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
24647 ScalarEvolution &SE) const {
24648 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
24649 return getPointersDiff(
24650 ElemTyA: BaseStore.getValueOperand()->getType(), PtrA: BaseStore.getPointerOperand(),
24651 ElemTyB: SI.getValueOperand()->getType(), PtrB: SI.getPointerOperand(), DL, SE,
24652 /*StrictCheck=*/true);
24653 }
24654
24655 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
24656 /// Stores whose index is less than \p MinSafeIdx will be dropped.
24657 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
24658 int64_t DistFromCurBase) {
24659 DistToInstMap PrevSet = std::move(Instrs);
24660 reset(NewBaseInstr: NewBaseInstIdx);
24661
24662 // Re-insert stores that come after MinSafeIdx to try and vectorize them
24663 // again. Their distance will be "rebased" to use NewBaseInstIdx as
24664 // reference.
24665 for (auto [Dist, InstIdx] : PrevSet) {
24666 if (InstIdx >= MinSafeIdx)
24667 insertOrLookup(InstrIdx: InstIdx, PtrDist: Dist - DistFromCurBase);
24668 }
24669 }
24670
24671 /// Remove all stores that have been vectorized from this group.
24672 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
24673 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
24674 Range: reverse(C&: Instrs), P: [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
24675 return VectorizedStores.contains(Ptr: AllStores[DistAndIdx.second]);
24676 });
24677
24678 // Get a forward iterator pointing after the last vectorized store and erase
24679 // all stores before it so we don't try to vectorize them again.
24680 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
24681 Instrs.erase(first: Instrs.begin(), last: VectorizedStoresEnd);
24682 }
24683
24684private:
24685 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
24686 unsigned BaseInstrIdx;
24687
24688 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
24689 DistToInstMap Instrs;
24690
24691 /// Reference to all the stores in the BB being analyzed.
24692 ArrayRef<StoreInst *> AllStores;
24693};
24694
24695} // end anonymous namespace
24696
/// Groups the given \p Stores into chains of consecutive pointer distances
/// and tries to vectorize each chain with progressively smaller (and, on
/// later attempts, larger) vectorization factors. \p Visited memoizes
/// already-tried (front/back store, front/back value, size) combinations
/// across calls. Returns true if any store was vectorized.
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  // Walks a distance-sorted map of stores, cutting a chain at every gap in
  // the pointer distances, and attempts vectorization on each maximal
  // consecutive run.
  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(First: StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Elt: Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      // On every exit path below, restart the chain from the current store.
      llvm::scope_exit E([&, &Dist = Dist, &InstIdx = InstIdx]() {
        Operands.clear();
        Operands.push_back(Elt: Stores[InstIdx]);
        PrevDist = Dist;
      });

      // Skip trivially-short chains and chains we already tried.
      if (Operands.size() <= 1 ||
          !Visited
               .insert(V: {Operands.front(),
                          cast<StoreInst>(Val: Operands.front())->getValueOperand(),
                          Operands.back(),
                          cast<StoreInst>(Val: Operands.back())->getValueOperand(),
                          Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(V: Operands[0]);
      unsigned MaxElts = llvm::bit_floor(Value: MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(a: R.getMaximumVF(ElemWidth: EltSize, Opcode: Instruction::Store), b: MaxElts);
      auto *Store = cast<StoreInst>(Val: Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Val: Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
      // getStoreMinimumVF only support scalar type as arguments. As a result,
      // we need to use the element type of StoreTy and ValueTy to retrieve the
      // VF and then transform it back.
      // Remember: VF is defined as the number we want to vectorize, not the
      // number of elements in the final vector.
      Type *StoreScalarTy = StoreTy->getScalarType();
      unsigned MinVF = PowerOf2Ceil(A: TTI->getStoreMinimumVF(
          VF: R.getMinVF(Sz: DL->getTypeStoreSizeInBits(Ty: StoreScalarTy)), ScalarMemTy: StoreScalarTy,
          ScalarValTy: ValueTy->getScalarType()));
      MinVF /= getNumElements(Ty: StoreTy);
      MinVF = std::max<unsigned>(a: 2, b: MinVF);

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = std::clamp<unsigned>(val: Operands.size(), lo: MinVF, hi: MaxVF);
        if (has_single_bit(Value: CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      // MaxRegVF represents the number of instructions (scalar, or vector in
      // case of revec) that can be vectorized to naturally fit in a vector
      // register.
      unsigned MaxRegVF = MaxVF;

      MaxVF = std::min<unsigned>(a: MaxVF, b: bit_floor(Value: Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      // Candidate VFs, from largest to smallest (halving each step).
      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(a: MaxVF, b: NonPowerOf2VF); VF >= MinVF;
           VF = divideCeil(Numerator: VF, Denominator: 2))
        CandidateVFs.push_back(Elt: VF);

      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      // first: the best TreeSize from all prior loops over CandidateVFs, gets
      // updated after looping through CandidateVFs
      // second: the best TreeSize from all prior loops including the current
      // one
      llvm::SmallVector<std::pair<unsigned, unsigned>> RangeSizesStorage(
          Operands.size(), {1, 1});
      // The `slice` and `drop_front` interfaces are convenient
      const auto RangeSizes = MutableArrayRef(RangeSizesStorage);
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      // first == 0 marks a store that has already been vectorized.
      auto IsNotVectorized = [](const std::pair<unsigned, unsigned> &P) {
        return P.first > 0;
      };
      auto IsVectorized = [](const std::pair<unsigned, unsigned> &P) {
        return P.first == 0;
      };
      auto VFIsProfitable = [](unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return Size >= P.first;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned VF : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned FirstUnvecStore = std::distance(
              first: RangeSizes.begin(), last: find_if(Range: RangeSizes, P: IsNotVectorized));

          // Form slices of size VF starting from FirstUnvecStore and try to
          // vectorize them.
          while (FirstUnvecStore < End) {
            unsigned FirstVecStore = std::distance(
                first: RangeSizes.begin(),
                last: find_if(Range: RangeSizes.drop_front(N: FirstUnvecStore), P: IsVectorized));
            unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
            for (unsigned SliceStartIdx = FirstUnvecStore;
                 SliceStartIdx + VF <= MaxSliceEnd;) {
              // Skip slices whose recorded tree sizes are too non-uniform.
              if (!checkTreeSizes(Sizes: RangeSizes.slice(N: SliceStartIdx, M: VF))) {
                ++SliceStartIdx;
                continue;
              }
              ArrayRef<Value *> Slice =
                  ArrayRef(Operands).slice(N: SliceStartIdx, M: VF);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Val: Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
                  // VF is too ambitious. Try to vectorize another slice before
                  // trying a smaller VF.
                  SliceStartIdx += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Chain: Slice, R, Idx: SliceStartIdx, MinVF, Size&: TreeSize);
              if (!Res) {
                // Update the range of non schedulable VFs for slices starting
                // at SliceStartIdx.
                NonSchedulable
                    .try_emplace(Key: Slice.front(), Args: std::make_pair(x&: VF, y&: VF))
                    .first->getSecond()
                    .second = VF;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert_range(R&: Slice);
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized initial block, no need to try to vectorize
                // it again.
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(N: SliceStartIdx, M: VF))
                  P.first = P.second = 0;
                if (SliceStartIdx < FirstUnvecStore + MinVF) {
                  // The leftover prefix is too short to ever form a slice;
                  // mark it vectorized so it is not revisited.
                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
                           N: FirstUnvecStore, M: SliceStartIdx - FirstUnvecStore))
                    P.first = P.second = 0;
                  FirstUnvecStore = SliceStartIdx + VF;
                }
                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                  // Same for a too-short suffix; shrink the working range.
                  for (std::pair<unsigned, unsigned> &P :
                       RangeSizes.slice(N: SliceStartIdx + VF,
                                        M: MaxSliceEnd - (SliceStartIdx + VF)))
                    P.first = P.second = 0;
                  if (MaxSliceEnd == End)
                    End = SliceStartIdx;
                  MaxSliceEnd = SliceStartIdx;
                }
                SliceStartIdx += VF;
                continue;
              }
              if (VF > 2 && Res &&
                  !all_of(Range: RangeSizes.slice(N: SliceStartIdx, M: VF),
                          P: std::bind(f&: VFIsProfitable, args&: TreeSize, args: _1))) {
                SliceStartIdx += VF;
                continue;
              }
              // Check for the very big VFs that we're not rebuilding same
              // trees, just with larger number of elements.
              if (VF > MaxRegVF && TreeSize > 1 &&
                  all_of(Range: RangeSizes.slice(N: SliceStartIdx, M: VF),
                         P: std::bind(f&: FirstSizeSame, args&: TreeSize, args: _1))) {
                SliceStartIdx += VF;
                while (SliceStartIdx != MaxSliceEnd &&
                       RangeSizes[SliceStartIdx].first == TreeSize)
                  ++SliceStartIdx;
                continue;
              }
              if (TreeSize > 1)
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(N: SliceStartIdx, M: VF))
                  P.second = std::max(a: P.second, b: TreeSize);
              ++SliceStartIdx;
              AnyProfitableGraph = true;
            }
            if (FirstUnvecStore >= End)
              break;
            if (MaxSliceEnd - FirstUnvecStore < VF &&
                MaxSliceEnd - FirstUnvecStore >= MinVF)
              AnyProfitableGraph = true;
            FirstUnvecStore = std::distance(
                first: RangeSizes.begin(),
                last: find_if(Range: RangeSizes.drop_front(N: MaxSliceEnd), P: IsNotVectorized));
          }
          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(Value: VF))
            break;
          // For the MaxRegVF case, save RangeSizes to limit compile time
          if (VF == MaxRegVF)
            for (std::pair<unsigned, unsigned> &P : RangeSizes)
              if (P.first != 0)
                P.first = std::max(a: P.second, b: P.first);
        }
        // All values vectorized - exit.
        if (all_of(Range: RangeSizes, P: IsVectorized))
          break;
        // Check if tried all attempts or no need for the last attempts at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            a: Operands.size(),
            b: static_cast<unsigned>(
                End -
                std::distance(first: RangeSizes.begin(),
                              last: find_if(Range: RangeSizes, P: IsNotVectorized)) +
                1));
        unsigned VF = bit_ceil(Value: CandidateVFs.front()) * 2;
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for (std::pair<unsigned, unsigned> &P : RangeSizes) {
          if (P.first != 0)
            P.first = std::max(a: P.second, b: P.first);
        }
        // Attempt again to vectorize even larger chains if all previous
        // attempts were unsuccessful because of the cost issues.
        CandidateVFs.clear();
        unsigned Limit =
            getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: StoreTy, Sz: MaxTotalNum);
        if (bit_floor(Value: Limit) == VF && Limit != VF)
          CandidateVFs.push_back(Elt: Limit);
        CandidateVFs.push_back(Elt: VF);
      }
    }
  };

  /// Groups of stores to vectorize
  SmallVector<RelatedStoreInsts> SortedStores;

  // Inserts the specified store SI with the given index Idx to the set of the
  // stores. If the store with the same distance is found already - stop
  // insertion, try to vectorize already found stores. If some stores from this
  // sequence were not vectorized - try to vectorize them with the new store
  // later. But this logic is applied only to the stores, that come before the
  // previous store with the same distance.
  // Example:
  // 1. store x, %p
  // 2. store y, %p+1
  // 3. store z, %p+2
  // 4. store a, %p
  // 5. store b, %p+3
  // - Scan this from the last to first store. The very first bunch of stores is
  // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
  // vector).
  // - The next store in the list - #1 - has the same distance from store #5 as
  // the store #4.
  // - Try to vectorize sequence of stores 4,2,3,5.
  // - If all these stores are vectorized - just drop them.
  // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
  // - Start new stores sequence.
  // The new bunch of stores is {1, {1, 0}}.
  // - Add the stores from previous sequence, that were not vectorized.
  // Here we consider the stores in the reversed order, rather they are used in
  // the IR (Stores are reversed already, see vectorizeStoreChains() function).
  // Store #3 can be added -> comes after store #4 with the same distance as
  // store #1.
  // Store #5 cannot be added - comes before store #4.
  // This logic allows to improve the compile time, we assume that the stores
  // after previous store with the same distance most likely have memory
  // dependencies and no need to waste compile time to try to vectorize them.
  // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    // Find the first existing group SI has a computable pointer distance to.
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        Range&: SortedStores, P: [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(SI&: *SI, DL: *DL, SE&: *SE);
          return PtrDist.has_value();
        });

    // We did not find a comparable store, start a new group.
    if (RelatedStores == SortedStores.end()) {
      SortedStores.emplace_back(Args&: Idx, Args&: Stores);
      return;
    }

    // If there is already a store in the group with the same PtrDiff, try to
    // vectorize the existing instructions before adding the current store.
    // Otherwise, insert this store and keep collecting.
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(InstrIdx: Idx, PtrDist: *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
                            /*NewBaseInstIdx=*/Idx,
                            /*DistFromCurBase=*/*PtrDist);
    }
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(First&: Stores)) {
    if (R.isDeleted(I: SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());

  return Changed;
}
25062
25063void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
25064 // Initialize the collections. We will make a single pass over the block.
25065 Stores.clear();
25066 GEPs.clear();
25067
25068 // Visit the store and getelementptr instructions in BB and organize them in
25069 // Stores and GEPs according to the underlying objects of their pointer
25070 // operands.
25071 for (Instruction &I : *BB) {
25072 // Ignore store instructions that are volatile or have a pointer operand
25073 // that doesn't point to a scalar type.
25074 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
25075 if (!SI->isSimple())
25076 continue;
25077 if (!isValidElementType(Ty: SI->getValueOperand()->getType()))
25078 continue;
25079 Stores[getUnderlyingObject(V: SI->getPointerOperand())].push_back(Elt: SI);
25080 }
25081
25082 // Ignore getelementptr instructions that have more than one index, a
25083 // constant index, or a pointer operand that doesn't point to a scalar
25084 // type.
25085 else if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) {
25086 if (GEP->getNumIndices() != 1)
25087 continue;
25088 Value *Idx = GEP->idx_begin()->get();
25089 if (isa<Constant>(Val: Idx))
25090 continue;
25091 if (!isValidElementType(Ty: Idx->getType()))
25092 continue;
25093 if (GEP->getType()->isVectorTy())
25094 continue;
25095 GEPs[GEP->getPointerOperand()].push_back(Elt: GEP);
25096 }
25097 }
25098}
25099
/// Tries to vectorize the list of scalars \p VL as one or more bundles,
/// starting from the maximum vectorization factor and shrinking it until a
/// profitable SLP tree is found. \p MaxVFOnly restricts attempts to
/// full-width bundles only. Returns true if any vectorization happened.
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  // Nothing to vectorize with fewer than two scalars.
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type,
  // we permit an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, TLI: *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(Val: V) && !isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit(RemarkBuilder: [&]() {
        std::string TypeStr;
        llvm::raw_string_ostream OS(TypeStr);
        Ty->print(O&: OS);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  // Compute the range of vectorization factors to try: clamp the widest
  // full-register width to the target's maximum VF for this opcode.
  Type *ScalarTy = getValueType(V: VL[0]);
  unsigned Sz = R.getVectorElementSize(V: I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(
      a: getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: ScalarTy, Sz: VL.size()), b: MinVF);
  MaxVF = std::min(a: R.getMaximumVF(ElemWidth: Sz, Opcode: S.getOpcode()), b: MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();

  // Outer loop walks VF downwards; inner loop slides a window of ActualVF
  // scalars across VL starting at the first not-yet-vectorized element.
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       VF = getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: I0->getType(), Sz: VF - 1)) {
    // No actual vectorization should happen, if number of parts is the same as
    // provided vectorization factor (i.e. the scalar type is used for vector
    // code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(Tp: VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(a: MaxInst - I, b: VF);

      if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ScalarTy, Sz: ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;

      // Collect ActualVF live (not deleted) scalars into the candidate bundle.
      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(N: I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(Val: V);
            !Inst || !R.isDeleted(I: Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough vectorizable instructions - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      // Build the SLP tree for the bundle and evaluate its cost.
      R.buildTree(Roots: Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        R.reorderBottomToTop(IgnoreReorder: !isa<InsertElementInst>(Val: Ops.front()));
      }
      R.transformNodes();
      R.computeMinimumValueSizes();
      InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
      R.buildExternalUses();

      InstructionCost Cost = R.getTreeCost(TreeCost);
      CandidateFound = true;
      MinCost = std::min(a: MinCost, b: Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      // Negative cost (below -SLPCostThreshold) means vectorization pays off.
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "VectorizedList",
                                                  cast<Instruction>(Val: Ops[0]))
                                 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                                 << " and with tree size "
                                 << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  // Emit a remark explaining why nothing was vectorized.
  if (!Changed && CandidateFound) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
25244
namespace {

/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction instructions that has values
/// that can be put into a vector as its leaves. For example:
///
/// mul mul mul mul
///  \  /    \  /
///   +       +
///    \     /
///       +
/// This tree has "mul" as its leaf values and "+" as its reduction
/// instructions. A reduction can feed into a store or a binary operation
/// feeding a phi.
///    ...
///    \  /
///     +
///     |
///  phi +=
///
///  Or:
///    ...
///    \  /
///     +
///     |
///   *p =
///
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  /// Reduction operations, one list per role: for cmp+select min/max
  /// reductions list 0 holds compares and list 1 holds selects; otherwise a
  /// single list of the reduction instructions.
  ReductionOpsListType ReductionOps;
  /// List of possibly reduced values.
  SmallVector<SmallVector<Value *>> ReducedVals;
  /// Maps reduced value to the corresponding reduction operation.
  SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
  /// Root of the matched reduction tree (tracked weakly so deletion of the
  /// instruction invalidates the handle rather than dangling).
  WeakTrackingVH ReductionRoot;
  /// The type of reduction operation.
  RecurKind RdxKind;
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;
  /// The minimum number of the reduced values.
  const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
  /// Contains vector values for reduction including their scale factor and
  /// signedness. The last bool is true, if the value was reduced in-tree.
  SmallVector<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
      VectorValuesAndScales;
25293
25294 static bool isCmpSelMinMax(Instruction *I) {
25295 return match(V: I, P: m_Select(C: m_Cmp(), L: m_Value(), R: m_Value())) &&
25296 RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: getRdxKind(V: I));
25297 }
25298
25299 // And/or are potentially poison-safe logical patterns like:
25300 // select x, y, false
25301 // select x, true, y
25302 static bool isBoolLogicOp(Instruction *I) {
25303 return isa<SelectInst>(Val: I) &&
25304 (match(V: I, P: m_LogicalAnd()) || match(V: I, P: m_LogicalOr()));
25305 }
25306
25307 /// Checks if instruction is associative and can be vectorized.
25308 static bool isVectorizable(RecurKind Kind, Instruction *I,
25309 bool TwoElementReduction = false) {
25310 if (Kind == RecurKind::None)
25311 return false;
25312
25313 // Integer ops that map to select instructions or intrinsics are fine.
25314 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
25315 isBoolLogicOp(I))
25316 return true;
25317
25318 // No need to check for associativity, if 2 reduced values.
25319 if (TwoElementReduction)
25320 return true;
25321
25322 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
25323 // FP min/max are associative except for NaN and -0.0. We do not
25324 // have to rule out -0.0 here because the intrinsic semantics do not
25325 // specify a fixed result for it.
25326 return I->getFastMathFlags().noNaNs();
25327 }
25328
25329 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
25330 return true;
25331
25332 return I->isAssociative();
25333 }
25334
25335 static Value *getRdxOperand(Instruction *I, unsigned Index) {
25336 // Poison-safe 'or' takes the form: select X, true, Y
25337 // To make that work with the normal operand processing, we skip the
25338 // true value operand.
25339 // TODO: Change the code and data structures to handle this without a hack.
25340 if (getRdxKind(V: I) == RecurKind::Or && isa<SelectInst>(Val: I) && Index == 1)
25341 return I->getOperand(i: 2);
25342 return I->getOperand(i: Index);
25343 }
25344
  /// Creates reduction operation with the current opcode.
  /// \param UseSelect if true, min/max and logical and/or are emitted as
  /// cmp+select (or select with constant arm) instead of intrinsics/binops.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    Type *OpTy = LHS->getType();
    assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
    switch (Kind) {
    case RecurKind::Or: {
      // Poison-safe form: select LHS, true, RHS. Only valid when the operand
      // type matches the cmp result type (i1 or vector of i1).
      if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy))
        return Builder.CreateSelectWithUnknownProfile(
            C: LHS, True: ConstantInt::getAllOnesValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)),
            False: RHS, DEBUG_TYPE, Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::And: {
      // Poison-safe form: select LHS, RHS, false.
      if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy))
        return Builder.CreateSelectWithUnknownProfile(
            C: LHS, True: RHS,
            False: ConstantInt::getNullValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)),
            DEBUG_TYPE, Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      // Plain associative binops map directly to their IR opcode.
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      // Integer min/max: either cmp+select (when requested) or fall through
      // to the min/max intrinsic form below.
      if (UseSelect) {
        CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(RK: Kind);
        Value *Cmp = Builder.CreateICmp(P: Pred, LHS, RHS, Name);
        return Builder.CreateSelectWithUnknownProfile(C: Cmp, True: LHS, False: RHS, DEBUG_TYPE,
                                                      Name);
      }
      [[fallthrough]];
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(RK: Kind);
      return Builder.CreateBinaryIntrinsic(ID: Id, LHS, RHS);
    }
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }
25403
  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    // Use the select form when the reduction was matched as cmp+select pairs
    // (two op lists) or as poison-safe logical and/or (selects in list 0).
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(Range: ReductionOps.front(), P: IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, Kind: RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Val: Op)) {
        // For cmp+select min/max, propagate flags to the compare from the
        // original compares (list 0) and to the select from the selects
        // (list 1).
        propagateIRFlags(I: Sel->getCondition(), VL: ReductionOps[0], OpValue: nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(I: Op, VL: ReductionOps[1], OpValue: nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(I: Op, VL: ReductionOps[0], OpValue: nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }
25429
25430public:
  /// Determines the reduction kind \p V participates in, or RecurKind::None
  /// if \p V is not a recognized reduction operation. Matches plain binops,
  /// poison-safe logical and/or, min/max intrinsics, cmp+select min/max, and
  /// (as a fallback) selects over instructions producing identical values.
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I)
      return RecurKind::None;
    if (match(V: I, P: m_Add(L: m_Value(), R: m_Value())))
      return RecurKind::Add;
    if (match(V: I, P: m_Mul(L: m_Value(), R: m_Value())))
      return RecurKind::Mul;
    if (match(V: I, P: m_And(L: m_Value(), R: m_Value())) ||
        match(V: I, P: m_LogicalAnd(L: m_Value(), R: m_Value())))
      return RecurKind::And;
    if (match(V: I, P: m_Or(L: m_Value(), R: m_Value())) ||
        match(V: I, P: m_LogicalOr(L: m_Value(), R: m_Value())))
      return RecurKind::Or;
    if (match(V: I, P: m_Xor(L: m_Value(), R: m_Value())))
      return RecurKind::Xor;
    if (match(V: I, P: m_FAdd(L: m_Value(), R: m_Value())))
      return RecurKind::FAdd;
    if (match(V: I, P: m_FMul(L: m_Value(), R: m_Value())))
      return RecurKind::FMul;

    if (match(V: I, P: m_Intrinsic<Intrinsic::maxnum>(Op0: m_Value(), Op1: m_Value())))
      return RecurKind::FMax;
    if (match(V: I, P: m_Intrinsic<Intrinsic::minnum>(Op0: m_Value(), Op1: m_Value())))
      return RecurKind::FMin;

    if (match(V: I, P: m_FMaximum(Op0: m_Value(), Op1: m_Value())))
      return RecurKind::FMaximum;
    if (match(V: I, P: m_FMinimum(Op0: m_Value(), Op1: m_Value())))
      return RecurKind::FMinimum;
    // This matches either cmp+select or intrinsics. SLP is expected to handle
    // either form.
    // TODO: If we are canonicalizing to intrinsics, we can remove several
    // special-case paths that deal with selects.
    if (match(V: I, P: m_SMax(L: m_Value(), R: m_Value())))
      return RecurKind::SMax;
    if (match(V: I, P: m_SMin(L: m_Value(), R: m_Value())))
      return RecurKind::SMin;
    if (match(V: I, P: m_UMax(L: m_Value(), R: m_Value())))
      return RecurKind::UMax;
    if (match(V: I, P: m_UMin(L: m_Value(), R: m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(Val: I)) {
      // Try harder: look for min/max pattern based on instructions producing
      // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
      // During the intermediate stages of SLP, it's very common to have
      // pattern like this (since optimizeGatherSequence is run only once
      // at the end):
      // %1  = extractelement <2 x i32> %a, i32 0
      // %2  = extractelement <2 x i32> %a, i32 1
      // %cond = icmp sgt i32 %1, %2
      // %3  = extractelement <2 x i32> %a, i32 0
      // %4  = extractelement <2 x i32> %a, i32 1
      // %select = select i1 %cond, i32 %3, i32 %4
      CmpPredicate Pred;
      Instruction *L1;
      Instruction *L2;

      Value *LHS = Select->getTrueValue();
      Value *RHS = Select->getFalseValue();
      Value *Cond = Select->getCondition();

      // TODO: Support inverse predicates.
      // Require that the cmp operands are extractelements identical to the
      // select arms (exactly one side may be a m_Specific match).
      if (match(V: Cond, P: m_Cmp(Pred, L: m_Specific(V: LHS), R: m_Instruction(I&: L2)))) {
        if (!isa<ExtractElementInst>(Val: RHS) ||
            !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
          return RecurKind::None;
      } else if (match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Specific(V: RHS)))) {
        if (!isa<ExtractElementInst>(Val: LHS) ||
            !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)))
          return RecurKind::None;
      } else {
        if (!isa<ExtractElementInst>(Val: LHS) || !isa<ExtractElementInst>(Val: RHS))
          return RecurKind::None;
        if (!match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Instruction(I&: L2))) ||
            !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)) ||
            !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
          return RecurKind::None;
      }

      // The compare predicate determines the min/max flavor. Both strict and
      // non-strict forms map to the same recurrence kind.
      switch (Pred) {
      default:
        return RecurKind::None;
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }
25531
25532 /// Get the index of the first operand.
25533 static unsigned getFirstOperandIndex(Instruction *I) {
25534 return isCmpSelMinMax(I) ? 1 : 0;
25535 }
25536
25537private:
25538 /// Total number of operands in the reduction operation.
25539 static unsigned getNumberOfOperands(Instruction *I) {
25540 return isCmpSelMinMax(I) ? 3 : 2;
25541 }
25542
25543 /// Checks if the instruction is in basic block \p BB.
25544 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
25545 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
25546 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
25547 auto *Sel = cast<SelectInst>(Val: I);
25548 auto *Cmp = dyn_cast<Instruction>(Val: Sel->getCondition());
25549 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
25550 }
25551 return I->getParent() == BB;
25552 }
25553
25554 /// Expected number of uses for reduction operations/reduced values.
25555 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
25556 if (IsCmpSelMinMax) {
25557 // SelectInst must be used twice while the condition op must have single
25558 // use only.
25559 if (auto *Sel = dyn_cast<SelectInst>(Val: I))
25560 return Sel->hasNUses(N: 2) && Sel->getCondition()->hasOneUse();
25561 return I->hasNUses(N: 2);
25562 }
25563
25564 // Arithmetic reduction operation must be used once only.
25565 return I->hasOneUse();
25566 }
25567
25568 /// Initializes the list of reduction operations.
25569 void initReductionOps(Instruction *I) {
25570 if (isCmpSelMinMax(I))
25571 ReductionOps.assign(NumElts: 2, Elt: ReductionOpsType());
25572 else
25573 ReductionOps.assign(NumElts: 1, Elt: ReductionOpsType());
25574 }
25575
25576 /// Add all reduction operations for the reduction instruction \p I.
25577 void addReductionOps(Instruction *I) {
25578 if (isCmpSelMinMax(I)) {
25579 ReductionOps[0].emplace_back(Args: cast<SelectInst>(Val: I)->getCondition());
25580 ReductionOps[1].emplace_back(Args&: I);
25581 } else {
25582 ReductionOps[0].emplace_back(Args&: I);
25583 }
25584 }
25585
25586 static bool isGoodForReduction(ArrayRef<Value *> Data) {
25587 int Sz = Data.size();
25588 auto *I = dyn_cast<Instruction>(Val: Data.front());
25589 return Sz > 1 || isConstant(V: Data.front()) ||
25590 (I && !isa<LoadInst>(Val: I) && isValidForAlternation(Opcode: I->getOpcode()));
25591 }
25592
  /// Optimizes original placement of the reduced values for the reduction tree.
  /// For example, if there is a zext i1 + selects, we can merge select
  /// into zext and improve emission of the reductions.
  void optimizeReducedVals() {
    // Map opcode -> index of the first ReducedVals group led by that opcode.
    SmallDenseMap<unsigned, unsigned> UsedReductionOpIds;
    for (const auto [Idx, Vals] : enumerate(First&: ReducedVals)) {
      if (auto *I = dyn_cast<Instruction>(Val: Vals.front()))
        UsedReductionOpIds.try_emplace(Key: I->getOpcode(), Args&: Idx);
    }
    // Check if zext i1 can be merged with select.
    auto ZExtIt = UsedReductionOpIds.find(Val: Instruction::ZExt);
    auto SelectIt = UsedReductionOpIds.find(Val: Instruction::Select);
    if (ZExtIt != UsedReductionOpIds.end() &&
        SelectIt != UsedReductionOpIds.end()) {
      unsigned ZExtIdx = ZExtIt->second;
      unsigned SelectIdx = SelectIt->second;
      auto *ZExt = cast<ZExtInst>(Val: ReducedVals[ZExtIdx].front());
      // ZExt is compatible with Select? Merge select to zext, if so.
      // Compatible means: zext extends an i1 and produces the select's type.
      if (ZExt->getSrcTy()->isIntegerTy(Bitwidth: 1) &&
          ZExt->getType() == ReducedVals[SelectIdx].front()->getType()) {
        // Append first, then erase the select group (append does not
        // invalidate the erase position computed below).
        ReducedVals[ZExtIdx].append(RHS: ReducedVals[SelectIdx]);
        ReducedVals.erase(CI: std::next(x: ReducedVals.begin(), n: SelectIdx));
      }
    }
  }
25618
25619public:
  HorizontalReduction() = default;
  /// Constructs a pre-matched reduction rooted at \p I whose reduced values
  /// are exactly \p Ops. ReductionLimit is lowered to 2 since the operand
  /// list is already known.
  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(V: I);
    ReductionOps.emplace_back().push_back(Elt: I);
    ReducedVals.emplace_back().assign(in_start: Ops.begin(), in_end: Ops.end());
    for (Value *V : Ops)
      ReducedValsToOps[V].push_back(Elt: I);
  }
25629
25630 bool matchReductionForOperands() const {
25631 // Analyze "regular" integer/FP types for reductions - no target-specific
25632 // types or pointers.
25633 assert(ReductionRoot && "Reduction root is not set!");
25634 if (!isVectorizable(Kind: RdxKind, I: cast<Instruction>(Val: ReductionRoot),
25635 TwoElementReduction: all_of(Range: ReducedVals, P: [](ArrayRef<Value *> Ops) {
25636 return Ops.size() == 2;
25637 })))
25638 return false;
25639
25640 return true;
25641 }
25642
  /// Try to find a reduction tree.
  /// Walks the use-def chain from \p Root, collecting reduction operations
  /// (same RecurKind) and reduced values (leaves), then groups and sorts the
  /// leaves for better vectorization. Returns false if \p Root is not a
  /// vectorizable reduction.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(V: Root);
    if (!isVectorizable(Kind: RdxKind, I: Root))
      return false;

    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    Type *Ty = Root->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;

    // Though the ultimate reduction may have multiple uses, its condition must
    // have only single use.
    if (auto *Sel = dyn_cast<SelectInst>(Val: Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(I: Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(x&: Root, y: 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values or an extra argument,
    // which is not part of the reduction.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(C: seq<int>(Begin: getFirstOperandIndex(I: TreeN),
                                     End: getNumberOfOperands(I: TreeN)))) {
        Value *EdgeVal = getRdxOperand(I: TreeN, Index: I);
        ReducedValsToOps[EdgeVal].push_back(Elt: TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(Val: EdgeVal);
        // If the edge is not an instruction, or it is different from the main
        // reduction opcode or has too many uses - possible reduced value.
        // Also, do not try to reduce const values, if the operation is not
        // foldable.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(V: EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(I: EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, I: EdgeInst) ||
            !isVectorizable(Kind: RdxKind, I: EdgeInst) ||
            (R.isAnalyzedReductionRoot(I: EdgeInst) &&
             all_of(Range: EdgeInst->operands(), P: IsaPred<Constant>))) {
          PossibleReducedVals.push_back(Elt: EdgeVal);
          continue;
        }
        ReductionOps.push_back(Elt: EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, instructions - by
    // instruction op id and/or alternate op id, plus do extra analysis for
    // loads (grouping them by the distance between pointers) and cmp
    // instructions (grouping them by the predicate).
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(I: Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    // Produces a sub-key for a load so that loads with provably related
    // pointers (constant distance or compatible pointers) hash together.
    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Key = hash_combine(args: hash_value(ptr: LI->getParent()), args: Key);
      Value *Ptr =
          getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
      if (!LoadKeyUsed.insert(V: Key).second) {
        auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr));
        if (LIt != LoadsMap.end()) {
          // Prefer a load at a known constant pointer distance.
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
                                ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(ptr: RLI->getPointerOperand());
          }
          // Otherwise any compatible pointer will do.
          for (LoadInst *RLI : LIt->second) {
            if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
                                      Ptr2: LI->getPointerOperand(), TLI)) {
              hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
              return SubKey;
            }
          }
          if (LIt->second.size() > 2) {
            hash_code SubKey =
                hash_value(ptr: LIt->second.back()->getPointerOperand());
            return SubKey;
          }
        }
      }
      LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr))
          .first->second.push_back(Elt: LI);
      return hash_value(ptr: LI->getPointerOperand());
    };

    // Worklist traversal of the reduction tree; Level tracks how far an op is
    // from the root's block to bound recursion depth.
    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(I: TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
                                                       /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx].try_emplace(Key: V, Args: 0).first->second;
      }
      for (Instruction *I : reverse(C&: PossibleReductionOps))
        Worklist.emplace_back(Args&: I, Args: I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of values kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto &Slice : PossibleRedVals) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = Slice.second.takeVector();
        // Sort by occurrence count so repeated values end up adjacent.
        stable_sort(Range&: RedValsVect, C: llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(NumInputs: Data.second, Elt: Data.first);
      }
      stable_sort(Range&: PossibleRedValsVect, C: [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      bool First = true;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (First) {
          First = false;
          ReducedVals.emplace_back();
        } else if (!isGoodForReduction(Data)) {
          // Merge weak singleton groups into the previous group only when
          // both are loads off the same underlying object.
          auto *LI = dyn_cast<LoadInst>(Val: Data.front());
          auto *LastLI = dyn_cast<LoadInst>(Val: ReducedVals.back().front());
          if (!LI || !LastLI ||
              getUnderlyingObject(V: LI->getPointerOperand()) !=
                  getUnderlyingObject(V: LastLI->getPointerOperand()))
            ReducedVals.emplace_back();
        }
        ReducedVals.back().append(in_start: Data.rbegin(), in_end: Data.rend());
      }
    }
    // Post optimize reduced values to get better reduction sequences and sort
    // them by size.
    optimizeReducedVals();
    // Sort the reduced values by number of same/alternate opcode and/or pointer
    // operand.
    stable_sort(Range&: ReducedVals, C: [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
25804
25805 /// Attempt to vectorize the tree found by matchAssociativeReduction.
25806 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
25807 const TargetLibraryInfo &TLI, AssumptionCache *AC,
25808 DominatorTree &DT) {
25809 constexpr unsigned RegMaxNumber = 4;
25810 constexpr unsigned RedValsMaxNumber = 128;
25811 // If there are a sufficient number of reduction values, reduce
25812 // to a nearby power-of-2. We can safely generate oversized
25813 // vectors and rely on the backend to split them to legal sizes.
25814 if (unsigned NumReducedVals = std::accumulate(
25815 first: ReducedVals.begin(), last: ReducedVals.end(), init: 0,
25816 binary_op: [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
25817 if (!isGoodForReduction(Data: Vals))
25818 return Num;
25819 return Num + Vals.size();
25820 });
25821 NumReducedVals < ReductionLimit &&
25822 all_of(Range&: ReducedVals, P: [](ArrayRef<Value *> RedV) {
25823 return RedV.size() < 2 || !allConstant(VL: RedV) || !isSplat(VL: RedV);
25824 })) {
25825 for (ReductionOpsType &RdxOps : ReductionOps)
25826 for (Value *RdxOp : RdxOps)
25827 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
25828 return nullptr;
25829 }
25830
25831 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
25832 TargetFolder(DL));
25833 Builder.SetInsertPoint(cast<Instruction>(Val&: ReductionRoot));
25834
25835 // Track the reduced values in case if they are replaced by extractelement
25836 // because of the vectorization.
25837 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
25838 ReducedVals.front().size());
25839
25840 // The compare instruction of a min/max is the insertion point for new
25841 // instructions and may be replaced with a new compare instruction.
25842 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
25843 assert(isa<SelectInst>(RdxRootInst) &&
25844 "Expected min/max reduction to have select root instruction");
25845 Value *ScalarCond = cast<SelectInst>(Val: RdxRootInst)->getCondition();
25846 assert(isa<Instruction>(ScalarCond) &&
25847 "Expected min/max reduction to have compare condition");
25848 return cast<Instruction>(Val: ScalarCond);
25849 };
25850
25851 bool AnyBoolLogicOp = any_of(Range&: ReductionOps.back(), P: [](Value *V) {
25852 return isBoolLogicOp(I: cast<Instruction>(Val: V));
25853 });
25854 // Return new VectorizedTree, based on previous value.
25855 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
25856 if (VectorizedTree) {
25857 // Update the final value in the reduction.
25858 Builder.SetCurrentDebugLocation(
25859 cast<Instruction>(Val: ReductionOps.front().front())->getDebugLoc());
25860 if (AnyBoolLogicOp) {
25861 auto It = ReducedValsToOps.find(Val: VectorizedTree);
25862 auto It1 = ReducedValsToOps.find(Val: Res);
25863 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
25864 isGuaranteedNotToBePoison(V: VectorizedTree, AC) ||
25865 (It != ReducedValsToOps.end() &&
25866 any_of(Range&: It->getSecond(), P: [&](Instruction *I) {
25867 return isBoolLogicOp(I) &&
25868 getRdxOperand(I, Index: 0) == VectorizedTree;
25869 }))) {
25870 ;
25871 } else if (isGuaranteedNotToBePoison(V: Res, AC) ||
25872 (It1 != ReducedValsToOps.end() &&
25873 any_of(Range&: It1->getSecond(), P: [&](Instruction *I) {
25874 return isBoolLogicOp(I) && getRdxOperand(I, Index: 0) == Res;
25875 }))) {
25876 std::swap(a&: VectorizedTree, b&: Res);
25877 } else {
25878 VectorizedTree = Builder.CreateFreeze(V: VectorizedTree);
25879 }
25880 }
25881
25882 return createOp(Builder, RdxKind, LHS: VectorizedTree, RHS: Res, Name: "op.rdx",
25883 ReductionOps);
25884 }
25885 // Initialize the final value in the reduction.
25886 return Res;
25887 };
25888 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
25889 ReductionOps.front().size());
25890 for (ReductionOpsType &RdxOps : ReductionOps)
25891 for (Value *RdxOp : RdxOps) {
25892 if (!RdxOp)
25893 continue;
25894 IgnoreList.insert(V: RdxOp);
25895 }
25896 // Intersect the fast-math-flags from all reduction operations.
25897 FastMathFlags RdxFMF;
25898 RdxFMF.set();
25899 for (Value *U : IgnoreList)
25900 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: U))
25901 RdxFMF &= FPMO->getFastMathFlags();
25902 bool IsCmpSelMinMax = isCmpSelMinMax(I: cast<Instruction>(Val&: ReductionRoot));
25903
25904 // Need to track reduced vals, they may be changed during vectorization of
25905 // subvectors.
25906 for (ArrayRef<Value *> Candidates : ReducedVals)
25907 for (Value *V : Candidates)
25908 TrackedVals.try_emplace(Key: V, Args&: V);
25909
25910 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
25911 Value *V) -> unsigned & {
25912 auto *It = MV.find(Key: V);
25913 assert(It != MV.end() && "Unable to find given key.");
25914 return It->second;
25915 };
25916
25917 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
25918 // List of the values that were reduced in other trees as part of gather
25919 // nodes and thus requiring extract if fully vectorized in other trees.
25920 SmallPtrSet<Value *, 4> RequiredExtract;
25921 WeakTrackingVH VectorizedTree = nullptr;
25922 bool CheckForReusedReductionOps = false;
25923 // Try to vectorize elements based on their type.
25924 SmallVector<InstructionsState> States;
25925 SmallVector<SmallVector<Value *>> LocalReducedVals;
25926 // Try merge consecutive reduced values into a single vectorizable group and
25927 // check, if they can be vectorized as copyables.
25928 for (ArrayRef<Value *> RV : ReducedVals) {
25929 // Loads are not very compatible with undefs.
25930 if (isa<UndefValue>(Val: RV.front()) &&
25931 (States.empty() || !States.back() ||
25932 States.back().getOpcode() == Instruction::Load)) {
25933 LocalReducedVals.emplace_back().append(in_start: RV.begin(), in_end: RV.end());
25934 States.push_back(Elt: InstructionsState::invalid());
25935 continue;
25936 }
25937 if (!LocalReducedVals.empty() &&
25938 isa<UndefValue>(Val: LocalReducedVals.back().front()) &&
25939 isa<LoadInst>(Val: RV.front())) {
25940 LocalReducedVals.emplace_back().append(in_start: RV.begin(), in_end: RV.end());
25941 States.push_back(Elt: getSameOpcode(VL: RV, TLI));
25942 continue;
25943 }
25944 SmallVector<Value *> Ops;
25945 if (!LocalReducedVals.empty())
25946 Ops = LocalReducedVals.back();
25947 Ops.append(in_start: RV.begin(), in_end: RV.end());
25948 InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
25949 InstructionsState OpS =
25950 Analysis.buildInstructionsState(VL: Ops, R: V, TryCopyableElementsVectorization: VectorizeCopyableElements);
25951 if (LocalReducedVals.empty()) {
25952 LocalReducedVals.push_back(Elt: Ops);
25953 States.push_back(Elt: OpS);
25954 continue;
25955 }
25956 if (OpS) {
25957 LocalReducedVals.back().swap(RHS&: Ops);
25958 States.back() = OpS;
25959 continue;
25960 }
25961 LocalReducedVals.emplace_back().append(in_start: RV.begin(), in_end: RV.end());
25962 States.push_back(Elt: getSameOpcode(VL: RV, TLI));
25963 }
25964 ReducedVals.swap(RHS&: LocalReducedVals);
25965 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
25966 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
25967 InstructionsState S = States[I];
25968 SmallVector<Value *> Candidates;
25969 Candidates.reserve(N: 2 * OrigReducedVals.size());
25970 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
25971 for (Value *ReducedVal : OrigReducedVals) {
25972 Value *RdxVal = TrackedVals.at(Val: ReducedVal);
      // Check if the reduction value was not overridden by the extractelement
25974 // instruction because of the vectorization and exclude it, if it is not
25975 // compatible with other values.
25976 // Also check if the instruction was folded to constant/other value.
25977 auto *Inst = dyn_cast<Instruction>(Val: RdxVal);
25978 if ((Inst && isVectorLikeInstWithConstOps(V: Inst) &&
25979 (!S || (!S.getMatchingMainOpOrAltOp(I: Inst) &&
25980 !S.isCopyableElement(V: Inst)))) ||
25981 (S && !Inst && !isa<PoisonValue>(Val: RdxVal) &&
25982 !S.isCopyableElement(V: RdxVal)))
25983 continue;
25984 Candidates.push_back(Elt: RdxVal);
25985 TrackedToOrig.try_emplace(Key: RdxVal, Args&: ReducedVal);
25986 }
25987 bool ShuffledExtracts = false;
25988 // Try to handle shuffled extractelements.
25989 if (S && S.getOpcode() == Instruction::ExtractElement &&
25990 !S.isAltShuffle() && I + 1 < E) {
25991 SmallVector<Value *> CommonCandidates(Candidates);
25992 for (Value *RV : ReducedVals[I + 1]) {
25993 Value *RdxVal = TrackedVals.at(Val: RV);
        // Check if the reduction value was not overridden by the
25995 // extractelement instruction because of the vectorization and
25996 // exclude it, if it is not compatible with other values.
25997 auto *Inst = dyn_cast<ExtractElementInst>(Val: RdxVal);
25998 if (!Inst)
25999 continue;
26000 CommonCandidates.push_back(Elt: RdxVal);
26001 TrackedToOrig.try_emplace(Key: RdxVal, Args&: RV);
26002 }
26003 SmallVector<int> Mask;
26004 if (isFixedVectorShuffle(VL: CommonCandidates, Mask, AC)) {
26005 ++I;
26006 Candidates.swap(RHS&: CommonCandidates);
26007 ShuffledExtracts = true;
26008 }
26009 }
26010
26011 // Emit code for constant values.
26012 if (Candidates.size() > 1 && allConstant(VL: Candidates)) {
26013 Value *Res = Candidates.front();
26014 Value *OrigV = TrackedToOrig.at(Val: Candidates.front());
26015 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
26016 for (Value *VC : ArrayRef(Candidates).drop_front()) {
26017 Res = createOp(Builder, RdxKind, LHS: Res, RHS: VC, Name: "const.rdx", ReductionOps);
26018 Value *OrigV = TrackedToOrig.at(Val: VC);
26019 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
26020 if (auto *ResI = dyn_cast<Instruction>(Val: Res))
26021 V.analyzedReductionRoot(I: ResI);
26022 }
26023 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
26024 continue;
26025 }
26026
26027 unsigned NumReducedVals = Candidates.size();
26028 if (NumReducedVals < ReductionLimit &&
26029 (NumReducedVals < 2 || !isSplat(VL: Candidates)))
26030 continue;
26031
26032 // Check if we support repeated scalar values processing (optimization of
26033 // original scalar identity operations on matched horizontal reductions).
26034 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
26035 RdxKind != RecurKind::FMul &&
26036 RdxKind != RecurKind::FMulAdd;
26037 // Gather same values.
26038 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
26039 if (IsSupportedHorRdxIdentityOp)
26040 for (Value *V : Candidates) {
26041 Value *OrigV = TrackedToOrig.at(Val: V);
26042 ++SameValuesCounter.try_emplace(Key: OrigV).first->second;
26043 }
26044 // Used to check if the reduced values used same number of times. In this
26045 // case the compiler may produce better code. E.g. if reduced values are
26046 // aabbccdd (8 x values), then the first node of the tree will have a node
26047 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
26048 // Plus, the final reduction will be performed on <8 x aabbccdd>.
26049 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
26050 // x abcd) * 2.
26051 // Currently it only handles add/fadd/xor. and/or/min/max do not require
26052 // this analysis, other operations may require an extra estimation of
26053 // the profitability.
26054 bool SameScaleFactor = false;
26055 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
26056 SameValuesCounter.size() != Candidates.size();
26057 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
26058 if (OptReusedScalars) {
26059 SameScaleFactor =
26060 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
26061 RdxKind == RecurKind::Xor) &&
26062 all_of(Range: drop_begin(RangeOrContainer&: SameValuesCounter),
26063 P: [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
26064 return P.second == SameValuesCounter.front().second;
26065 });
26066 Candidates.resize(N: SameValuesCounter.size());
26067 transform(Range&: SameValuesCounter, d_first: Candidates.begin(),
26068 F: [&](const auto &P) { return TrackedVals.at(P.first); });
26069 NumReducedVals = Candidates.size();
26070 // Have a reduction of the same element.
26071 if (NumReducedVals == 1) {
26072 Value *OrigV = TrackedToOrig.at(Val: Candidates.front());
26073 unsigned Cnt = At(SameValuesCounter, OrigV);
26074 Value *RedVal =
26075 emitScaleForReusedOps(VectorizedValue: Candidates.front(), Builder, Cnt);
26076 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
26077 VectorizedVals.try_emplace(Key: OrigV, Args&: Cnt);
26078 ExternallyUsedValues.insert(V: OrigV);
26079 continue;
26080 }
26081 }
26082
26083 unsigned MaxVecRegSize = V.getMaxVecRegSize();
26084 unsigned EltSize = V.getVectorElementSize(V: Candidates[0]);
26085 const unsigned MaxElts = std::clamp<unsigned>(
26086 val: llvm::bit_floor(Value: MaxVecRegSize / EltSize), lo: RedValsMaxNumber,
26087 hi: RegMaxNumber * RedValsMaxNumber);
26088
26089 unsigned ReduxWidth = NumReducedVals;
26090 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
26091 unsigned NumParts, NumRegs;
26092 Type *ScalarTy = Candidates.front()->getType();
26093 ReduxWidth =
26094 getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: ReduxWidth);
26095 VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth);
26096 NumParts = ::getNumberOfParts(TTI, VecTy: Tp);
26097 NumRegs =
26098 TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp));
26099 while (NumParts > NumRegs) {
26100 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
26101 ReduxWidth = bit_floor(Value: ReduxWidth - 1);
26102 VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth);
26103 NumParts = ::getNumberOfParts(TTI, VecTy: Tp);
26104 NumRegs =
26105 TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp));
26106 }
26107 if (NumParts > NumRegs / 2)
26108 ReduxWidth = bit_floor(Value: ReduxWidth);
26109 return ReduxWidth;
26110 };
26111 if (!VectorizeNonPowerOf2 || !has_single_bit(Value: ReduxWidth + 1))
26112 ReduxWidth = GetVectorFactor(ReduxWidth);
26113 ReduxWidth = std::min(a: ReduxWidth, b: MaxElts);
26114
26115 unsigned Start = 0;
26116 unsigned Pos = Start;
26117 // Restarts vectorization attempt with lower vector factor.
26118 unsigned PrevReduxWidth = ReduxWidth;
26119 bool CheckForReusedReductionOpsLocal = false;
26120 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
26121 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(Vals: IgnoreList);
26122 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
26123 // Check if any of the reduction ops are gathered. If so, worth
26124 // trying again with less number of reduction ops.
26125 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
26126 }
26127 ++Pos;
26128 if (Pos < NumReducedVals - ReduxWidth + 1)
26129 return IsAnyRedOpGathered;
26130 Pos = Start;
26131 --ReduxWidth;
26132 if (ReduxWidth > 1)
26133 ReduxWidth = GetVectorFactor(ReduxWidth);
26134 return IsAnyRedOpGathered;
26135 };
26136 bool AnyVectorized = false;
26137 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
26138 while (Pos < NumReducedVals - ReduxWidth + 1 &&
26139 ReduxWidth >= ReductionLimit) {
26140 // Dependency in tree of the reduction ops - drop this attempt, try
26141 // later.
26142 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
26143 Start == 0) {
26144 CheckForReusedReductionOps = true;
26145 break;
26146 }
26147 PrevReduxWidth = ReduxWidth;
26148 ArrayRef<Value *> VL(std::next(x: Candidates.begin(), n: Pos), ReduxWidth);
26149 // Been analyzed already - skip.
26150 if (IgnoredCandidates.contains(V: std::make_pair(x&: Pos, y&: ReduxWidth)) ||
26151 (!has_single_bit(Value: ReduxWidth) &&
26152 (IgnoredCandidates.contains(
26153 V: std::make_pair(x&: Pos, y: bit_floor(Value: ReduxWidth))) ||
26154 IgnoredCandidates.contains(
26155 V: std::make_pair(x: Pos + (ReduxWidth - bit_floor(Value: ReduxWidth)),
26156 y: bit_floor(Value: ReduxWidth))))) ||
26157 V.areAnalyzedReductionVals(VL)) {
26158 (void)AdjustReducedVals(/*IgnoreVL=*/true);
26159 continue;
26160 }
26161 // Early exit if any of the reduction values were deleted during
26162 // previous vectorization attempts.
26163 if (any_of(Range&: VL, P: [&V](Value *RedVal) {
26164 auto *RedValI = dyn_cast<Instruction>(Val: RedVal);
26165 return RedValI && V.isDeleted(I: RedValI);
26166 }))
26167 break;
26168 V.buildTree(Roots: VL, UserIgnoreLst: IgnoreList);
26169 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
26170 if (!AdjustReducedVals())
26171 V.analyzedReductionVals(VL);
26172 continue;
26173 }
26174 V.reorderTopToBottom();
26175 // No need to reorder the root node at all for reassociative reduction.
26176 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
26177 VL.front()->getType()->isIntOrIntVectorTy() ||
26178 ReductionLimit > 2);
26179 // Keep extracted other reduction values, if they are used in the
26180 // vectorization trees.
26181 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
26182 ExternallyUsedValues);
26183 // The reduction root is used as the insertion point for new
26184 // instructions, so set it as externally used to prevent it from being
26185 // deleted.
26186 LocalExternallyUsedValues.insert(V: ReductionRoot);
26187 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
26188 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
26189 continue;
26190 for (Value *V : ReducedVals[Cnt])
26191 if (isa<Instruction>(Val: V))
26192 LocalExternallyUsedValues.insert(V: TrackedVals[V]);
26193 }
26194 if (!IsSupportedHorRdxIdentityOp) {
26195 // Number of uses of the candidates in the vector of values.
26196 assert(SameValuesCounter.empty() &&
26197 "Reused values counter map is not empty");
26198 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
26199 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
26200 continue;
26201 Value *V = Candidates[Cnt];
26202 Value *OrigV = TrackedToOrig.at(Val: V);
26203 ++SameValuesCounter.try_emplace(Key: OrigV).first->second;
26204 }
26205 }
26206 V.transformNodes();
26207 V.computeMinimumValueSizes();
26208 InstructionCost TreeCost = V.calculateTreeCostAndTrimNonProfitable(VectorizedVals: VL);
26209
26210 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
26211 // Gather externally used values.
26212 SmallPtrSet<Value *, 4> Visited;
26213 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
26214 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
26215 continue;
26216 Value *RdxVal = Candidates[Cnt];
26217 if (auto It = TrackedVals.find(Val: RdxVal); It != TrackedVals.end())
26218 RdxVal = It->second;
26219 if (!Visited.insert(Ptr: RdxVal).second)
26220 continue;
26221 // Check if the scalar was vectorized as part of the vectorization
26222 // tree but not the top node.
26223 if (!VLScalars.contains(Ptr: RdxVal) && V.isVectorized(V: RdxVal)) {
26224 LocalExternallyUsedValues.insert(V: RdxVal);
26225 continue;
26226 }
26227 Value *OrigV = TrackedToOrig.at(Val: RdxVal);
26228 unsigned NumOps =
26229 VectorizedVals.lookup(Val: OrigV) + At(SameValuesCounter, OrigV);
26230 if (NumOps != ReducedValsToOps.at(Val: OrigV).size())
26231 LocalExternallyUsedValues.insert(V: RdxVal);
26232 }
26233 // Do not need the list of reused scalars in regular mode anymore.
26234 if (!IsSupportedHorRdxIdentityOp)
26235 SameValuesCounter.clear();
26236 for (Value *RdxVal : VL)
26237 if (RequiredExtract.contains(Ptr: RdxVal))
26238 LocalExternallyUsedValues.insert(V: RdxVal);
26239 V.buildExternalUses(ExternallyUsedValues: LocalExternallyUsedValues);
26240
26241 // Estimate cost.
26242 InstructionCost ReductionCost;
26243 if (V.isReducedBitcastRoot() || V.isReducedCmpBitcastRoot())
26244 ReductionCost = 0;
26245 else
26246 ReductionCost =
26247 getReductionCost(TTI, ReducedVals: VL, IsCmpSelMinMax, FMF: RdxFMF, R: V, DT, DL, TLI);
26248 InstructionCost Cost = V.getTreeCost(TreeCost, VectorizedVals: VL, ReductionCost);
26249 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
26250 << " for reduction\n");
26251 if (!Cost.isValid())
26252 break;
26253 if (Cost >= -SLPCostThreshold) {
26254 V.getORE()->emit(RemarkBuilder: [&]() {
26255 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
26256 ReducedValsToOps.at(Val: VL[0]).front())
26257 << "Vectorizing horizontal reduction is possible "
26258 << "but not beneficial with cost " << ore::NV("Cost", Cost)
26259 << " and threshold "
26260 << ore::NV("Threshold", -SLPCostThreshold);
26261 });
26262 if (!AdjustReducedVals()) {
26263 V.analyzedReductionVals(VL);
26264 unsigned Offset = Pos == Start ? Pos : Pos - 1;
26265 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
26266 // Add subvectors of VL to the list of the analyzed values.
26267 for (unsigned VF = getFloorFullVectorNumberOfElements(
26268 TTI: *TTI, Ty: VL.front()->getType(), Sz: ReduxWidth - 1);
26269 VF >= ReductionLimit;
26270 VF = getFloorFullVectorNumberOfElements(
26271 TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) {
26272 if (has_single_bit(Value: VF) &&
26273 V.getCanonicalGraphSize() != V.getTreeSize())
26274 continue;
26275 for (unsigned Idx : seq<unsigned>(Size: ReduxWidth - VF))
26276 IgnoredCandidates.insert(V: std::make_pair(x: Offset + Idx, y&: VF));
26277 }
26278 }
26279 }
26280 continue;
26281 }
26282
26283 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
26284 << Cost << ". (HorRdx)\n");
26285 V.getORE()->emit(RemarkBuilder: [&]() {
26286 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
26287 ReducedValsToOps.at(Val: VL[0]).front())
26288 << "Vectorized horizontal reduction with cost "
26289 << ore::NV("Cost", Cost) << " and with tree size "
26290 << ore::NV("TreeSize", V.getTreeSize());
26291 });
26292
26293 Builder.setFastMathFlags(RdxFMF);
26294
26295 // Emit a reduction. If the root is a select (min/max idiom), the insert
26296 // point is the compare condition of that select.
26297 Instruction *RdxRootInst = cast<Instruction>(Val&: ReductionRoot);
26298 Instruction *InsertPt = RdxRootInst;
26299 if (IsCmpSelMinMax)
26300 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
26301
26302 // Vectorize a tree.
26303 Value *VectorizedRoot = V.vectorizeTree(
26304 ExternallyUsedValues: LocalExternallyUsedValues, ReductionRoot: InsertPt, VectorValuesAndScales);
26305 // Update TrackedToOrig mapping, since the tracked values might be
26306 // updated.
26307 for (Value *RdxVal : Candidates) {
26308 Value *OrigVal = TrackedToOrig.at(Val: RdxVal);
26309 Value *TransformedRdxVal = TrackedVals.at(Val: OrigVal);
26310 if (TransformedRdxVal != RdxVal)
26311 TrackedToOrig.try_emplace(Key: TransformedRdxVal, Args&: OrigVal);
26312 }
26313
26314 Builder.SetInsertPoint(InsertPt);
26315
26316 // To prevent poison from leaking across what used to be sequential,
26317 // safe, scalar boolean logic operations, the reduction operand must be
26318 // frozen.
26319 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(V: VectorizedRoot, AC))
26320 VectorizedRoot = Builder.CreateFreeze(V: VectorizedRoot);
26321
26322 // Emit code to correctly handle reused reduced values, if required.
26323 if (OptReusedScalars && !SameScaleFactor) {
26324 VectorizedRoot = emitReusedOps(VectorizedValue: VectorizedRoot, Builder, R&: V,
26325 SameValuesCounter, TrackedToOrig);
26326 }
26327
26328 Type *ScalarTy = VL.front()->getType();
26329 Type *VecTy = VectorizedRoot->getType();
26330 Type *RedScalarTy = VecTy->getScalarType();
26331 VectorValuesAndScales.emplace_back(
26332 Args&: VectorizedRoot,
26333 Args: OptReusedScalars && SameScaleFactor
26334 ? SameValuesCounter.front().second
26335 : 1,
26336 Args: RedScalarTy != ScalarTy->getScalarType()
26337 ? V.isSignedMinBitwidthRootNode()
26338 : true,
26339 Args: V.isReducedBitcastRoot() || V.isReducedCmpBitcastRoot());
26340
26341 // Count vectorized reduced values to exclude them from final reduction.
26342 for (Value *RdxVal : VL) {
26343 Value *OrigV = TrackedToOrig.at(Val: RdxVal);
26344 if (IsSupportedHorRdxIdentityOp) {
26345 VectorizedVals.try_emplace(Key: OrigV, Args&: At(SameValuesCounter, OrigV));
26346 continue;
26347 }
26348 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
26349 if (!V.isVectorized(V: RdxVal))
26350 RequiredExtract.insert(Ptr: RdxVal);
26351 }
26352 Pos += ReduxWidth;
26353 Start = Pos;
26354 ReduxWidth = NumReducedVals - Pos;
26355 if (ReduxWidth > 1)
26356 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
26357 AnyVectorized = true;
26358 }
26359 if (OptReusedScalars && !AnyVectorized) {
26360 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
26361 Value *RdxVal = TrackedVals.at(Val: P.first);
26362 Value *RedVal = emitScaleForReusedOps(VectorizedValue: RdxVal, Builder, Cnt: P.second);
26363 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
26364 VectorizedVals.try_emplace(Key: P.first, Args: P.second);
26365 }
26366 continue;
26367 }
26368 }
26369 if (!VectorValuesAndScales.empty())
26370 VectorizedTree = GetNewVectorizedTree(
26371 VectorizedTree,
26372 emitReduction(Builder, TTI: *TTI, DestTy: ReductionRoot->getType()));
26373
26374 if (!VectorizedTree) {
26375 if (!CheckForReusedReductionOps) {
26376 for (ReductionOpsType &RdxOps : ReductionOps)
26377 for (Value *RdxOp : RdxOps)
26378 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
26379 }
26380 return nullptr;
26381 }
26382
26383 // Reorder operands of bool logical op in the natural order to avoid
26384 // possible problem with poison propagation. If not possible to reorder
26385 // (both operands are originally RHS), emit an extra freeze instruction
26386 // for the LHS operand.
26387 // I.e., if we have original code like this:
26388 // RedOp1 = select i1 ?, i1 LHS, i1 false
26389 // RedOp2 = select i1 RHS, i1 ?, i1 false
26390
26391 // Then, we swap LHS/RHS to create a new op that matches the poison
26392 // semantics of the original code.
26393
26394 // If we have original code like this and both values could be poison:
26395 // RedOp1 = select i1 ?, i1 LHS, i1 false
26396 // RedOp2 = select i1 ?, i1 RHS, i1 false
26397
26398 // Then, we must freeze LHS in the new op.
26399 auto FixBoolLogicalOps =
26400 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
26401 Instruction *RedOp2, bool InitStep) {
26402 if (!AnyBoolLogicOp)
26403 return;
26404 if (isBoolLogicOp(I: RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
26405 getRdxOperand(I: RedOp1, Index: 0) == LHS ||
26406 isGuaranteedNotToBePoison(V: LHS, AC)))
26407 return;
26408 bool NeedFreeze = LHS != VectorizedTree;
26409 if (isBoolLogicOp(I: RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
26410 getRdxOperand(I: RedOp2, Index: 0) == RHS ||
26411 isGuaranteedNotToBePoison(V: RHS, AC))) {
26412 // If RedOp2 was used as a second operand - do not swap.
26413 if ((InitStep || RHS != VectorizedTree) &&
26414 getRdxOperand(I: RedOp2, Index: 0) == RHS &&
26415 ((isBoolLogicOp(I: RedOp1) &&
26416 getRdxOperand(I: RedOp1, Index: 1) == RedOp2) ||
26417 any_of(Range&: ReductionOps, P: [&](ArrayRef<Value *> Ops) {
26418 return any_of(Range&: Ops, P: [&](Value *Op) {
26419 auto *OpI = dyn_cast<Instruction>(Val: Op);
26420 return OpI && isBoolLogicOp(I: OpI) &&
26421 getRdxOperand(I: OpI, Index: 1) == RedOp2;
26422 });
26423 }))) {
26424 NeedFreeze = false;
26425 } else {
26426 std::swap(a&: LHS, b&: RHS);
26427 return;
26428 }
26429 }
26430 if (NeedFreeze)
26431 LHS = Builder.CreateFreeze(V: LHS);
26432 };
26433 // Finish the reduction.
26434 // Need to add extra arguments and not vectorized possible reduction values.
26435 // Try to avoid dependencies between the scalar remainders after reductions.
26436 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
26437 bool InitStep) {
26438 unsigned Sz = InstVals.size();
26439 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
26440 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
26441 Instruction *RedOp = InstVals[I + 1].first;
26442 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
26443 Value *RdxVal1 = InstVals[I].second;
26444 Value *StableRdxVal1 = RdxVal1;
26445 auto It1 = TrackedVals.find(Val: RdxVal1);
26446 if (It1 != TrackedVals.end())
26447 StableRdxVal1 = It1->second;
26448 Value *RdxVal2 = InstVals[I + 1].second;
26449 Value *StableRdxVal2 = RdxVal2;
26450 auto It2 = TrackedVals.find(Val: RdxVal2);
26451 if (It2 != TrackedVals.end())
26452 StableRdxVal2 = It2->second;
26453 // To prevent poison from leaking across what used to be sequential,
26454 // safe, scalar boolean logic operations, the reduction operand must be
26455 // frozen.
26456 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
26457 RedOp, InitStep);
26458 Value *ExtraRed = createOp(Builder, RdxKind, LHS: StableRdxVal1,
26459 RHS: StableRdxVal2, Name: "op.rdx", ReductionOps);
26460 ExtraReds[I / 2] = std::make_pair(x: InstVals[I].first, y&: ExtraRed);
26461 }
26462 if (Sz % 2 == 1)
26463 ExtraReds[Sz / 2] = InstVals.back();
26464 return ExtraReds;
26465 };
26466 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
26467 ExtraReductions.emplace_back(Args: cast<Instruction>(Val&: ReductionRoot),
26468 Args&: VectorizedTree);
26469 SmallPtrSet<Value *, 8> Visited;
26470 for (ArrayRef<Value *> Candidates : ReducedVals) {
26471 for (Value *RdxVal : Candidates) {
26472 if (!Visited.insert(Ptr: RdxVal).second)
26473 continue;
26474 unsigned NumOps = VectorizedVals.lookup(Val: RdxVal);
26475 for (Instruction *RedOp :
26476 ArrayRef(ReducedValsToOps.at(Val: RdxVal)).drop_back(N: NumOps))
26477 ExtraReductions.emplace_back(Args&: RedOp, Args&: RdxVal);
26478 }
26479 }
26480 // Iterate through all not-vectorized reduction values/extra arguments.
26481 bool InitStep = true;
26482 while (ExtraReductions.size() > 1) {
26483 SmallVector<std::pair<Instruction *, Value *>> NewReds =
26484 FinalGen(ExtraReductions, InitStep);
26485 ExtraReductions.swap(RHS&: NewReds);
26486 InitStep = false;
26487 }
26488 VectorizedTree = ExtraReductions.front().second;
26489
26490 ReductionRoot->replaceAllUsesWith(V: VectorizedTree);
26491
26492 // The original scalar reduction is expected to have no remaining
26493 // uses outside the reduction tree itself. Assert that we got this
26494 // correct, replace internal uses with undef, and mark for eventual
26495 // deletion.
26496#ifndef NDEBUG
26497 SmallPtrSet<Value *, 4> IgnoreSet;
26498 for (ArrayRef<Value *> RdxOps : ReductionOps)
26499 IgnoreSet.insert_range(RdxOps);
26500#endif
26501 for (ArrayRef<Value *> RdxOps : ReductionOps) {
26502 for (Value *Ignore : RdxOps) {
26503 if (!Ignore)
26504 continue;
26505#ifndef NDEBUG
26506 for (auto *U : Ignore->users()) {
26507 assert(IgnoreSet.count(U) &&
26508 "All users must be either in the reduction ops list.");
26509 }
26510#endif
26511 if (!Ignore->use_empty()) {
26512 Value *P = PoisonValue::get(T: Ignore->getType());
26513 Ignore->replaceAllUsesWith(V: P);
26514 }
26515 }
26516 V.removeInstructionsAndOperands(DeadVals: RdxOps, VectorValuesAndScales);
26517 }
26518 return VectorizedTree;
26519 }
26520
26521private:
26522 /// Creates the reduction from the given \p Vec vector value with the given
26523 /// scale \p Scale and signedness \p IsSigned.
26524 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
26525 Value *Vec, unsigned Scale, bool IsSigned, Type *DestTy,
26526 bool ReducedInTree) {
26527 Value *Rdx;
26528 if (ReducedInTree) {
26529 Rdx = Vec;
26530 } else if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DestTy)) {
26531 unsigned DestTyNumElements = getNumElements(Ty: VecTy);
26532 unsigned VF = getNumElements(Ty: Vec->getType()) / DestTyNumElements;
26533 Rdx = PoisonValue::get(
26534 T: getWidenedType(ScalarTy: Vec->getType()->getScalarType(), VF: DestTyNumElements));
26535 for (unsigned I : seq<unsigned>(Size: DestTyNumElements)) {
26536 // Do reduction for each lane.
26537 // e.g., do reduce add for
26538 // VL[0] = <4 x Ty> <a, b, c, d>
26539 // VL[1] = <4 x Ty> <e, f, g, h>
26540 // Lane[0] = <2 x Ty> <a, e>
26541 // Lane[1] = <2 x Ty> <b, f>
26542 // Lane[2] = <2 x Ty> <c, g>
26543 // Lane[3] = <2 x Ty> <d, h>
26544 // result[0] = reduce add Lane[0]
26545 // result[1] = reduce add Lane[1]
26546 // result[2] = reduce add Lane[2]
26547 // result[3] = reduce add Lane[3]
26548 SmallVector<int, 16> Mask = createStrideMask(Start: I, Stride: DestTyNumElements, VF);
26549 Value *Lane = Builder.CreateShuffleVector(V: Vec, Mask);
26550 Rdx = Builder.CreateInsertElement(
26551 Vec: Rdx, NewElt: emitReduction(VectorizedValue: Lane, Builder, TTI: &TTI, DestTy), Idx: I);
26552 }
26553 } else {
26554 Rdx = emitReduction(VectorizedValue: Vec, Builder, TTI: &TTI, DestTy);
26555 }
26556 if (Rdx->getType() != DestTy)
26557 Rdx = Builder.CreateIntCast(V: Rdx, DestTy, isSigned: IsSigned);
26558 // Improved analysis for add/fadd/xor reductions with same scale
26559 // factor for all operands of reductions. We can emit scalar ops for
26560 // them instead.
26561 if (Scale > 1)
26562 Rdx = emitScaleForReusedOps(VectorizedValue: Rdx, Builder, Cnt: Scale);
26563 return Rdx;
26564 }
26565
  /// Calculate the cost of a reduction.
  /// Returns (vector cost - scalar cost); a negative value means the
  /// vectorized reduction is profitable. The vector side accounts for the
  /// reduction op (or, when a reduction was already emitted, just the extra
  /// vector arithmetic plus casts); the scalar side sums the costs of the
  /// N-1 scalar reduction ops that become dead.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R, DominatorTree &DT,
                                   const DataLayout &DL,
                                   const TargetLibraryInfo &TLI) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = R.getReductionType();
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at the compile time.
    bool AllConsts = allConstant(VL: ReducedVals);
    // Sums the cost of the scalar reduction ops that vectorization removes.
    // \p GenCostFn yields the generic cost of one scalar reduction op; it is
    // the fallback when the finer, user-based estimate below is invalid.
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (!isa<Instruction>(Val: RdxVal))
          continue;
        // Only N-1 scalar ops are needed for an N-wide reduction.
        if (Cnt == 1)
          break;
        --Cnt;
        // A value with extra (non-reduction) uses keeps its op alive; charge
        // the generic per-op cost for it.
        if (RdxVal->hasNUsesOrMore(N: IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(Val: U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, I: RdxOp)) {
            if (RdxKind == RecurKind::FAdd) {
              // If this fadd user can fuse with a preceding fmul into an FMA,
              // cost it as (FMA - scalar fmul) instead of a plain fadd.
              InstructionCost FMACost = canConvertToFMA(
                  VL: RdxOp, S: getSameOpcode(VL: RdxOp, TLI), DT, DL, TTI&: *TTI, TLI);
              if (FMACost.isValid()) {
                LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
                if (auto *I = dyn_cast<Instruction>(Val: RdxVal)) {
                  // Also, exclude scalar fmul cost.
                  InstructionCost FMulCost =
                      TTI->getInstructionCost(U: I, CostKind);
                  LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
                  FMACost -= FMulCost;
                }
                ScalarCost += FMACost;
                continue;
              }
            }
            ScalarCost += TTI->getInstructionCost(U: RdxOp, CostKind);
            continue;
          }
          // Unexpected use count invalidates the user-based estimate.
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    // Require reduction cost if:
    // 1. This type is not a full register type and no other vectors with the
    // same type in the storage (first vector with small type).
    // 2. The storage does not have any vector with full vector use (first
    // vector with full register use).
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
            // Revectorization (SLPReVec): each "scalar" is itself a vector;
            // reduce each lane group through a strided shuffle + reduction.
            assert(SLPReVec && "FixedVectorType is not expected.");
            unsigned ScalarTyNumElements = VecTy->getNumElements();
            for (unsigned I : seq<unsigned>(Size: ReducedVals.size())) {
              VectorCost += TTI->getShuffleCost(
                  Kind: TTI::SK_PermuteSingleSrc,
                  DstTy: FixedVectorType::get(ElementType: VecTy->getScalarType(),
                                              NumElts: ReducedVals.size()),
                  SrcTy: VectorTy,
                  Mask: createStrideMask(Start: I, Stride: ScalarTyNumElements, VF: ReducedVals.size()));
              VectorCost += TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VecTy,
                                                            FMF, CostKind);
            }
            VectorCost += TTI->getScalarizationOverhead(
                Ty: VecTy, DemandedElts: APInt::getAllOnes(numBits: ScalarTyNumElements), /*Insert*/ true,
                /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
          } else {
            Type *RedTy = VectorTy->getElementType();
            auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
                u: std::make_pair(x&: RedTy, y: true));
            if (RType == RedTy) {
              VectorCost = TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VectorTy,
                                                           FMF, CostKind);
            } else {
              // Root is computed in a narrower type; use the extended
              // reduction cost, which folds the extension in.
              VectorCost = TTI->getExtendedReductionCost(
                  Opcode: RdxOpcode, IsUnsigned: !IsSigned, ResTy: RedTy,
                  Ty: getWidenedType(ScalarTy: RType, VF: ReduxWidth), FMF, CostKind);
            }
          }
        } else {
          // A reduction is already accounted for elsewhere; this vector adds
          // only one vector arithmetic op (possibly an FMA) plus a cast.
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              u: std::make_pair(x&: RedTy, y: true));
          VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth);
          InstructionCost FMACost = InstructionCost::getInvalid();
          if (RdxKind == RecurKind::FAdd) {
            // Check if the reduction operands can be converted to FMA.
            SmallVector<Value *> Ops;
            FastMathFlags FMF;
            FMF.set();
            for (Value *RdxVal : ReducedVals) {
              // Multi-use values cannot be folded into the FMA; give up.
              if (!RdxVal->hasOneUse()) {
                Ops.clear();
                break;
              }
              if (auto *FPCI = dyn_cast<FPMathOperator>(Val: RdxVal))
                FMF &= FPCI->getFastMathFlags();
              Ops.push_back(Elt: RdxVal->user_back());
            }
            if (!Ops.empty()) {
              FMACost = canConvertToFMA(VL: Ops, S: getSameOpcode(VL: Ops, TLI), DT, DL,
                                        TTI&: *TTI, TLI);
              if (FMACost.isValid()) {
                // Calculate actual FMAD cost.
                IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
                                            {RVecTy, RVecTy, RVecTy}, FMF);
                FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);

                LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
                // Also, exclude vector fmul cost.
                InstructionCost FMulCost = TTI->getArithmeticInstrCost(
                    Opcode: Instruction::FMul, Ty: RVecTy, CostKind);
                LLVM_DEBUG(dbgs()
                           << "Minus vector FMul cost: " << FMulCost << "\n");
                FMACost -= FMulCost;
              }
            }
          }
          if (FMACost.isValid())
            VectorCost += FMACost;
          else
            VectorCost +=
                TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: RVecTy, CostKind);
          if (RType != RedTy) {
            // Add the cost of casting between the narrowed and root types.
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost += TTI->getCastInstrCost(
                Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        return TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: ScalarTy, CostKind);
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          VectorCost = TTI->getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF, CostKind);
        } else {
          // Check if the previous reduction already exists and account it as
          // series of operations + single reduction.
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              u: std::make_pair(x&: RedTy, y: true));
          VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth);
          IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
          VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
          if (RType != RedTy) {
            // Add the cost of casting between the narrowed and root types.
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost += TTI->getCastInstrCost(
                Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        return TTI->getIntrinsicInstrCost(ICA, CostKind);
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " for reduction of " << shortBundleName(ReducedVals)
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
26777
  /// Splits the values, stored in VectorValuesAndScales, into registers/free
  /// sub-registers, combines them with the given reduction operation as a
  /// vector operation and then performs single (small enough) reduction.
  /// Each entry of VectorValuesAndScales is (vector, repeat-count, signed?,
  /// already-reduced?). Returns the final combined reduction value.
  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // Creates reduction and combines with the previous reduction.
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned,
                              bool ReducedInTree) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy,
                                  ReducedInTree);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, LHS: ReducedSubTree, RHS: Rdx,
                                  Name: "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      // Single vector: no cross-register combining needed.
      const auto &[Vec, Scale, IsSigned, ReducedInTree] =
          VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned, ReducedInTree);
      return ReducedSubTree;
    }
    // Scales Vec using given Cnt scale factor and then performs vector combine
    // with previous value of VecOp.
    Value *VecRes = nullptr;
    // Signedness of the value currently held in VecRes (used for casts).
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned,
                           bool ReducedInTree) {
      if (ReducedInTree) {
        // Value was already reduced inside the tree: fold it in scalarly.
        CreateSingleOp(Vec, Cnt, IsSigned, ReducedInTree);
        return;
      }
      Type *ScalarTy = Vec->getType()->getScalarType();
      // Scale Vec using given Cnt scale factor.
      if (Cnt > 1) {
        ElementCount EC = cast<VectorType>(Val: Vec->getType())->getElementCount();
        switch (RdxKind) {
        case RecurKind::Add: {
          if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
            // i1 add-reduction becomes a ctpop later; repeat the mask lanes
            // Cnt times via a shuffle instead of multiplying.
            unsigned VF = getNumElements(Ty: Vec->getType());
            LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
                              << ". (HorRdx)\n");
            SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
            for (unsigned I : seq<unsigned>(Size: Cnt))
              std::iota(first: std::next(x: Mask.begin(), n: VF * I),
                        last: std::next(x: Mask.begin(), n: VF * (I + 1)), value: 0);
            ++NumVectorInstructions;
            Vec = Builder.CreateShuffleVector(V: Vec, Mask);
            break;
          }
          // res = mul vv, n
          if (ScalarTy != DestTy->getScalarType())
            Vec = Builder.CreateIntCast(
                V: Vec, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: Vec->getType())),
                isSigned: IsSigned);
          Value *Scale = ConstantVector::getSplat(
              EC, Elt: ConstantInt::get(Ty: DestTy->getScalarType(), V: Cnt));
          LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateMul(LHS: Vec, RHS: Scale);
          break;
        }
        case RecurKind::Xor: {
          // res = n % 2 ? 0 : vv
          // An even number of xors of the same value cancels to zero.
          LLVM_DEBUG(dbgs()
                     << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
          if (Cnt % 2 == 0)
            Vec = Constant::getNullValue(Ty: Vec->getType());
          break;
        }
        case RecurKind::FAdd: {
          // res = fmul v, n
          Value *Scale =
              ConstantVector::getSplat(EC, Elt: ConstantFP::get(Ty: ScalarTy, V: Cnt));
          LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateFMul(L: Vec, R: Scale);
          break;
        }
        case RecurKind::And:
        case RecurKind::Or:
        case RecurKind::SMax:
        case RecurKind::SMin:
        case RecurKind::UMax:
        case RecurKind::UMin:
        case RecurKind::FMax:
        case RecurKind::FMin:
        case RecurKind::FMaximum:
        case RecurKind::FMinimum:
          // res = vv
          // Idempotent reductions: repeating a value does not change the
          // result, so no scaling is required.
          break;
        case RecurKind::Sub:
        case RecurKind::AddChainWithSubs:
        case RecurKind::Mul:
        case RecurKind::FMul:
        case RecurKind::FMulAdd:
        case RecurKind::AnyOf:
        case RecurKind::FindIV:
        case RecurKind::FindLast:
        case RecurKind::FMaxNum:
        case RecurKind::FMinNum:
        case RecurKind::FMaximumNum:
        case RecurKind::FMinimumNum:
        case RecurKind::None:
          llvm_unreachable("Unexpected reduction kind for repeated scalar.");
        }
      }
      // Combine Vec with the previous VecOp.
      if (!VecRes) {
        VecRes = Vec;
        VecResSignedness = IsSigned;
      } else {
        ++NumVectorInstructions;
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
            VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
          // Handle ctpop.
          // Concatenate both i1 masks so the final popcount sees all bits.
          unsigned VecResVF = getNumElements(Ty: VecRes->getType());
          unsigned VecVF = getNumElements(Ty: Vec->getType());
          SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
          std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
          // Ensure that VecRes is always larger than Vec
          if (VecResVF < VecVF) {
            std::swap(a&: VecRes, b&: Vec);
            std::swap(a&: VecResVF, b&: VecVF);
          }
          if (VecResVF != VecVF) {
            // Widen the narrower vector with poison lanes to match VecRes.
            SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
            std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
            Vec = Builder.CreateShuffleVector(V: Vec, Mask: ResizeMask);
          }
          VecRes = Builder.CreateShuffleVector(V1: VecRes, V2: Vec, Mask, Name: "rdx.op");
          return;
        }
        // Normalize both operands to DestTy's scalar type before combining.
        if (VecRes->getType()->getScalarType() != DestTy->getScalarType()) {
          assert(getNumElements(VecRes->getType()) % getNumElements(DestTy) ==
                     0 &&
                 "Expected the number of elements in VecRes to be a multiple "
                 "of the number of elements in DestTy");
          VecRes = Builder.CreateIntCast(
              V: VecRes,
              DestTy: getWidenedType(ScalarTy: DestTy->getScalarType(),
                                     VF: getNumElements(Ty: VecRes->getType())),
              isSigned: VecResSignedness);
        }
        if (ScalarTy != DestTy->getScalarType())
          Vec = Builder.CreateIntCast(
              V: Vec,
              DestTy: getWidenedType(ScalarTy: DestTy->getScalarType(),
                                     VF: getNumElements(Ty: Vec->getType())),
              isSigned: IsSigned);
        unsigned VecResVF = getNumElements(Ty: VecRes->getType());
        unsigned VecVF = getNumElements(Ty: Vec->getType());
        // Ensure that VecRes is always larger than Vec
        if (VecResVF < VecVF) {
          std::swap(a&: VecRes, b&: Vec);
          std::swap(a&: VecResVF, b&: VecVF);
        }
        // extract + op + insert
        // For mismatched widths, combine only the low VecVF lanes and put
        // the result back into the wider vector.
        Value *Op = VecRes;
        if (VecResVF != VecVF)
          Op = createExtractVector(Builder, Vec: VecRes, SubVecVF: VecVF, /*Index=*/0);
        Op = createOp(Builder, RdxKind, LHS: Op, RHS: Vec, Name: "rdx.op", ReductionOps);
        if (VecResVF != VecVF)
          Op = createInsertVector(Builder, Vec: VecRes, V: Op, /*Index=*/0);
        VecRes = Op;
      }
    };
    for (auto [Vec, Scale, IsSigned, ReducedInTree] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned, ReducedInTree);
    // Finally reduce the combined vector down to a single scalar value.
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false,
                   /*ReducedInTree=*/false);

    return ReducedSubTree;
  }
26955
26956 /// Emit a horizontal reduction of the vectorized value.
26957 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
26958 const TargetTransformInfo *TTI, Type *DestTy) {
26959 assert(VectorizedValue && "Need to have a vectorized tree node");
26960 assert(RdxKind != RecurKind::FMulAdd &&
26961 "A call to the llvm.fmuladd intrinsic is not handled yet");
26962
26963 auto *FTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
26964 if (FTy->getScalarType() == Builder.getInt1Ty() &&
26965 RdxKind == RecurKind::Add &&
26966 DestTy->getScalarType() != FTy->getScalarType()) {
26967 // Convert vector_reduce_add(ZExt(<n x i1>)) to
26968 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
26969 Value *V = Builder.CreateBitCast(
26970 V: VectorizedValue, DestTy: Builder.getIntNTy(N: FTy->getNumElements()));
26971 ++NumVectorInstructions;
26972 return Builder.CreateUnaryIntrinsic(ID: Intrinsic::ctpop, V);
26973 }
26974 ++NumVectorInstructions;
26975 return createSimpleReduction(B&: Builder, Src: VectorizedValue, RdxKind);
26976 }
26977
  /// Emits optimized code for unique scalar value reused \p Cnt times.
  /// The vectorized reduction contains the value once; the \p Cnt repeats
  /// are folded back algebraically (add -> mul by Cnt, fadd -> fmul, xor ->
  /// parity) instead of widening the vector.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale =
          ConstantInt::get(Ty: VectorizedValue->getType(), V: Cnt,
                           /*IsSigned=*/false, /*ImplicitTrunc=*/true);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      // An even number of xors of the same value cancels out to zero.
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(Ty: VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(Ty: VectorizedValue->getType(), V: Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(L: VectorizedValue, R: Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      // Idempotent reductions: repeats do not change the result.
      return VectorizedValue;
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindIV:
    case RecurKind::FindLast:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
27040
  /// Emits actual operation for the scalar identity values, found during
  /// horizontal reduction analysis.
  /// \p SameValuesCounter maps each (original) scalar to the number of times
  /// it is repeated in the reduction; \p TrackedToOrig maps tracked values
  /// back to their original scalars. Per-lane repeat counts are folded in as
  /// a constant-vector multiply (add/fadd), a parity shuffle (xor), or a
  /// no-op (idempotent min/max/and/or kinds).
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
    if (VTy->getElementType() != VL.front()->getType()) {
      // Cast back from the min-bitwidth type to the original scalar type.
      VectorizedValue = Builder.CreateIntCast(
          V: VectorizedValue,
          DestTy: getWidenedType(ScalarTy: VL.front()->getType(), VF: VTy->getNumElements()),
          isSigned: R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      // Each lane is multiplied by its repeat count.
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
        Vals.push_back(Elt: ConstantInt::get(Ty: V->getType(), V: Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(V: Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with even number of repeats with 0, since
      // x xor x = 0.
      // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6,
      // 7>, if elements 4th and 6th elements have even number of repeats.
      SmallVector<int> Mask(
          cast<FixedVectorType>(Val: VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
        if (Cnt % 2 == 0) {
          // Index VF selects the zero lane from the second shuffle operand.
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
                                              : Mask) dbgs()
                                         << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            V1: VectorizedValue,
            V2: ConstantVector::getNullValue(Ty: VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
        Vals.push_back(Elt: ConstantFP::get(Ty: V->getType(), V: Cnt));
      }
      auto *Scale = ConstantVector::get(V: Vals);
      return Builder.CreateFMul(L: VectorizedValue, R: Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindIV:
    case RecurKind::FindLast:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
27144};
27145} // end anonymous namespace
27146
/// Gets recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  // File-scope convenience wrapper around the HorizontalReduction helper.
  return HorizontalReduction::getRdxKind(V);
}
27151static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
27152 if (auto *IE = dyn_cast<InsertElementInst>(Val: InsertInst))
27153 return cast<FixedVectorType>(Val: IE->getType())->getNumElements();
27154
27155 unsigned AggregateSize = 1;
27156 auto *IV = cast<InsertValueInst>(Val: InsertInst);
27157 Type *CurrentType = IV->getType();
27158 do {
27159 if (auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
27160 for (auto *Elt : ST->elements())
27161 if (Elt != ST->getElementType(N: 0)) // check homogeneity
27162 return std::nullopt;
27163 AggregateSize *= ST->getNumElements();
27164 CurrentType = ST->getElementType(N: 0);
27165 } else if (auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
27166 AggregateSize *= AT->getNumElements();
27167 CurrentType = AT->getElementType();
27168 } else if (auto *VT = dyn_cast<FixedVectorType>(Val: CurrentType)) {
27169 AggregateSize *= VT->getNumElements();
27170 return AggregateSize;
27171 } else if (CurrentType->isSingleValueType()) {
27172 return AggregateSize;
27173 } else {
27174 return std::nullopt;
27175 }
27176 } while (true);
27177}
27178
/// Walks a chain of insertelement/insertvalue instructions ending at
/// \p LastInsertInst, recording each inserted scalar (by its flattened
/// aggregate index) in \p BuildVectorOpds and the inserting instruction in
/// \p InsertElts. \p OperandOffset is the index offset of the current
/// aggregate nesting level. The walk stops at instructions already deleted
/// by the vectorizer (\p R), at unknown indices, or at multi-use inserts.
static void findBuildAggregateRec(Instruction *LastInsertInst,
                                  TargetTransformInfo *TTI,
                                  SmallVectorImpl<Value *> &BuildVectorOpds,
                                  SmallVectorImpl<Value *> &InsertElts,
                                  unsigned OperandOffset, const BoUpSLP &R) {
  do {
    // Operand 1 is the value being inserted at this level.
    Value *InsertedOperand = LastInsertInst->getOperand(i: 1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(Inst: LastInsertInst, Offset: OperandOffset);
    if (!OperandIndex || R.isDeleted(I: LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(Val: InsertedOperand)) {
      // Nested aggregate: recurse with the adjusted flattened index.
      findBuildAggregateRec(LastInsertInst: cast<Instruction>(Val: InsertedOperand), TTI,
                            BuildVectorOpds, InsertElts, OperandOffset: *OperandIndex, R);

    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    // Continue up the chain through the aggregate operand (operand 0).
    LastInsertInst = dyn_cast<Instruction>(Val: LastInsertInst->getOperand(i: 0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(Val: LastInsertInst) &&
           LastInsertInst->hasOneUse());
}
27203
27204/// Recognize construction of vectors like
27205/// %ra = insertelement <4 x float> poison, float %s0, i32 0
27206/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
27207/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
27208/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
27209/// starting from the last insertelement or insertvalue instruction.
27210///
27211/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
27212/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
27213/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
27214///
27215/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
27216///
27217/// \return true if it matches.
27218static bool findBuildAggregate(Instruction *LastInsertInst,
27219 TargetTransformInfo *TTI,
27220 SmallVectorImpl<Value *> &BuildVectorOpds,
27221 SmallVectorImpl<Value *> &InsertElts,
27222 const BoUpSLP &R) {
27223
27224 assert((isa<InsertElementInst>(LastInsertInst) ||
27225 isa<InsertValueInst>(LastInsertInst)) &&
27226 "Expected insertelement or insertvalue instruction!");
27227
27228 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
27229 "Expected empty result vectors!");
27230
27231 std::optional<unsigned> AggregateSize = getAggregateSize(InsertInst: LastInsertInst);
27232 if (!AggregateSize)
27233 return false;
27234 BuildVectorOpds.resize(N: *AggregateSize);
27235 InsertElts.resize(N: *AggregateSize);
27236
27237 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, OperandOffset: 0, R);
27238 llvm::erase(C&: BuildVectorOpds, V: nullptr);
27239 llvm::erase(C&: InsertElts, V: nullptr);
27240 if (BuildVectorOpds.size() >= 2)
27241 return true;
27242
27243 return false;
27244}
27245
27246/// Try and get a reduction instruction from a phi node.
27247///
27248/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
27249/// if they come from either \p ParentBB or a containing loop latch.
27250///
27251/// \returns A candidate reduction value if possible, or \code nullptr \endcode
27252/// if not possible.
27253static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
27254 BasicBlock *ParentBB, LoopInfo *LI) {
27255 // There are situations where the reduction value is not dominated by the
27256 // reduction phi. Vectorizing such cases has been reported to cause
27257 // miscompiles. See PR25787.
27258 auto DominatedReduxValue = [&](Value *R) {
27259 return isa<Instruction>(Val: R) &&
27260 DT->dominates(A: P->getParent(), B: cast<Instruction>(Val: R)->getParent());
27261 };
27262
27263 Instruction *Rdx = nullptr;
27264
27265 // Return the incoming value if it comes from the same BB as the phi node.
27266 if (P->getIncomingBlock(i: 0) == ParentBB) {
27267 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
27268 } else if (P->getIncomingBlock(i: 1) == ParentBB) {
27269 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
27270 }
27271
27272 if (Rdx && DominatedReduxValue(Rdx))
27273 return Rdx;
27274
27275 // Otherwise, check whether we have a loop latch to look at.
27276 Loop *BBL = LI->getLoopFor(BB: ParentBB);
27277 if (!BBL)
27278 return nullptr;
27279 BasicBlock *BBLatch = BBL->getLoopLatch();
27280 if (!BBLatch)
27281 return nullptr;
27282
27283 // There is a loop latch, return the incoming value if it comes from
27284 // that. This reduction pattern occasionally turns up.
27285 if (P->getIncomingBlock(i: 0) == BBLatch) {
27286 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
27287 } else if (P->getIncomingBlock(i: 1) == BBLatch) {
27288 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
27289 }
27290
27291 if (Rdx && DominatedReduxValue(Rdx))
27292 return Rdx;
27293
27294 return nullptr;
27295}
27296
27297static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
27298 if (match(V: I, P: m_BinOp(L: m_Value(V&: V0), R: m_Value(V&: V1))))
27299 return true;
27300 if (match(V: I, P: m_FMaxNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27301 return true;
27302 if (match(V: I, P: m_FMinNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27303 return true;
27304 if (match(V: I, P: m_FMaximum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27305 return true;
27306 if (match(V: I, P: m_FMinimum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27307 return true;
27308 if (match(V: I, P: m_Intrinsic<Intrinsic::smax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27309 return true;
27310 if (match(V: I, P: m_Intrinsic<Intrinsic::smin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27311 return true;
27312 if (match(V: I, P: m_Intrinsic<Intrinsic::umax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27313 return true;
27314 if (match(V: I, P: m_Intrinsic<Intrinsic::umin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27315 return true;
27316 return false;
27317}
27318
27319/// We could have an initial reduction that is not an add.
27320/// r *= v1 + v2 + v3 + v4
27321/// In such a case start looking for a tree rooted in the first '+'.
27322/// \Returns the new root if found, which may be nullptr if not an instruction.
27323static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
27324 Instruction *Root) {
27325 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
27326 isa<IntrinsicInst>(Root)) &&
27327 "Expected binop, select, or intrinsic for reduction matching");
27328 Value *LHS =
27329 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root));
27330 Value *RHS =
27331 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root) + 1);
27332 if (LHS == Phi)
27333 return dyn_cast<Instruction>(Val: RHS);
27334 if (RHS == Phi)
27335 return dyn_cast<Instruction>(Val: LHS);
27336 return nullptr;
27337}
27338
27339/// \p Returns the first operand of \p I that does not match \p Phi. If
27340/// operand is not an instruction it returns nullptr.
27341static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
27342 Value *Op0 = nullptr;
27343 Value *Op1 = nullptr;
27344 if (!matchRdxBop(I, V0&: Op0, V1&: Op1))
27345 return nullptr;
27346 return dyn_cast<Instruction>(Val: Op0 == Phi ? Op1 : Op0);
27347}
27348
27349/// \Returns true if \p I is a candidate instruction for reduction vectorization.
27350static bool isReductionCandidate(Instruction *I) {
27351 bool IsSelect = match(V: I, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value()));
27352 Value *B0 = nullptr, *B1 = nullptr;
27353 bool IsBinop = matchRdxBop(I, V0&: B0, V1&: B1);
27354 return IsBinop || IsSelect;
27355}
27356
/// Tries to match and vectorize a horizontal reduction rooted at \p Root in
/// block \p BB. \p P, when non-null, is a phi node associated with the
/// reduction; it enables using the operands of \p Root as new reduction
/// seeds. Instructions that could not be vectorized as part of a reduction
/// are appended to \p PostponedInsts so the caller can retry them as plain
/// vectorization seeds later. \returns true if anything was vectorized.
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  // Only use the operands of Root as new seeds when Root is a binary op
  // associated with a phi node.
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Val: Root);

  if (Root->getParent() != BB || isa<PHINode>(Val: Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(I: Root) &&
        HorizontalReduction::getRdxKind(V: Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(Phi: P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis starting from Root instruction. If horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction or
  // vectorization is not possible or not effective, and currently analyzed
  // instruction is a binary operation, try to vectorize the operands, using
  // pre-order DFS traversal order. If the operands were not vectorized, repeat
  // the same procedure considering each operand as a possible root of the
  // horizontal reduction.
  // Interrupt the process if the Root instruction itself was vectorized or all
  // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
  // If a horizontal reduction was not matched or vectorized we collect
  // instructions for possible later attempts for vectorization.
  // NOTE(review): despite the name, this is a std::queue (FIFO), so the
  // traversal is effectively breadth-first.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(args: SelectRoot(), args: 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  // Tries to vectorize Inst as a reduction root; returns the value the
  // reduction was reduced to, or nullptr if no reduction was matched or
  // vectorized.
  auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(I: Inst))
      return nullptr;
    if (!isReductionCandidate(I: Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Root: Inst, SE&: *SE, DL: *DL, TLI: *TLI))
      return nullptr;
    return HorRdx.tryToReduce(V&: R, DL: *DL, TTI, TLI: *TLI, AC, DT&: *DT);
  };
  // Records FutureSeed (or, when it is the root, its non-phi operand) for a
  // later plain vectorization attempt. Returns false only if no usable seed
  // could be found.
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(I: Root, Phi: P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Val: FutureSeed))
      PostponedInsts.push_back(Elt: FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(args&: Inst, args&: Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while stack was populated before that happened.
    if (R.isDeleted(I: Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(Val: VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(args&: I, args&: Level);
        continue;
      }
      if (R.isDeleted(I: Inst))
        continue;
    } else {
      // We could not vectorize `Inst` so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize operands.
    // Continue analysis for the instruction from the same basic block only to
    // save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Ptr: Op).second)
          if (auto *I = dyn_cast<Instruction>(Val: Op))
            // Do not try to vectorize CmpInst operands, this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(Val: I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(args&: I, args&: Level);
  }
  return Res;
}
27456
/// Tries to vectorize a two-element bundle rooted at binary/compare
/// instruction \p I: either as a small (two-lane) reduction over its
/// operands, or as a plain two-element list. Also considers "skipping" one
/// level of a single-use binary operand to find a better pairing.
/// \returns true if anything was vectorized.
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(Val: I) || isa<VectorType>(Val: I->getType()))
    return false;
  // Skip potential FMA candidates.
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      canConvertToFMA(VL: I, S: getSameOpcode(VL: I, TLI: *TLI), DT&: *DT, DL: *DL, TTI&: *TTI, TLI: *TLI)
          .isValid())
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
  auto *Op1 = dyn_cast<Instruction>(Val: I->getOperand(i: 1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(I: Op0) || R.isDeleted(I: Op1))
    return false;

  // First collect all possible candidates
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Args&: Op0, Args&: Op1);

  auto *A = dyn_cast<BinaryOperator>(Val: Op0);
  auto *B = dyn_cast<BinaryOperator>(Val: Op1);
  // Try to skip B. Pair A with B's operands instead, if B has a single use
  // (so skipping it leaves no extra scalar work).
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 0));
    auto *B1 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 1));
    if (B0 && B0->getParent() == P && !R.isDeleted(I: B0))
      Candidates.emplace_back(Args&: A, Args&: B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(I: B1))
      Candidates.emplace_back(Args&: A, Args&: B1);
  }
  // Try to skip A. Symmetric to the case above.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 0));
    auto *A1 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 1));
    if (A0 && A0->getParent() == P && !R.isDeleted(I: A0))
      Candidates.emplace_back(Args&: A0, Args&: B);
    if (A1 && A1->getParent() == P && !R.isDeleted(I: A1))
      Candidates.emplace_back(Args&: A1, Args&: B);
  }

  // Tries to reduce Ops (the operands of Inst) as a small reduction, gated by
  // a cost comparison against the scalar form. Returns true on success.
  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    if (!isReductionCandidate(I: Inst))
      return false;
    Type *Ty = Inst->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Check the cost of operations.
    // Scalar cost = cost of extracting all lanes + the scalar op itself.
    VectorType *VecTy = getWidenedType(ScalarTy: Ty, VF: Ops.size());
    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(
            Ty: VecTy, DemandedElts: APInt::getAllOnes(numBits: getNumElements(Ty: VecTy)), /*Insert=*/false,
            /*Extract=*/true, CostKind) +
        TTI.getInstructionCost(U: Inst, CostKind);
    InstructionCost RedCost;
    switch (::getRdxKind(V: Inst)) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      FastMathFlags FMF;
      if (auto *FPCI = dyn_cast<FPMathOperator>(Val: Inst))
        FMF = FPCI->getFastMathFlags();
      RedCost = TTI.getArithmeticReductionCost(Opcode: Inst->getOpcode(), Ty: VecTy, FMF,
                                               CostKind);
      break;
    }
    default:
      // Other reduction kinds are not handled here.
      return false;
    }
    // Only reduce if the vector reduction is strictly cheaper.
    if (RedCost >= ScalarCost)
      return false;

    return HorRdx.tryToReduce(V&: R, DL: *DL, TTI: &TTI, TLI: *TLI, AC, DT&: *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList(VL: {Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  // Note: the reduction path is only attempted for the original (Op0, Op1)
  // pair, which is always candidate 0.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList(VL: {Candidates[*BestCandidate].first,
                               Candidates[*BestCandidate].second},
                            R);
}
27560
27561bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
27562 BasicBlock *BB, BoUpSLP &R) {
27563 SmallVector<WeakTrackingVH> PostponedInsts;
27564 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
27565 Res |= tryToVectorize(Insts: PostponedInsts, R);
27566 return Res;
27567}
27568
27569bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
27570 BoUpSLP &R) {
27571 bool Res = false;
27572 for (Value *V : Insts)
27573 if (auto *Inst = dyn_cast<Instruction>(Val: V); Inst && !R.isDeleted(I: Inst))
27574 Res |= tryToVectorize(I: Inst, R);
27575 return Res;
27576}
27577
/// Tries to vectorize the aggregate-build sequence ending at insertvalue
/// \p IVI as a vectorized list of the inserted operands. With \p MaxVFOnly
/// set, two-element sequences are skipped (reduction matching gets the first
/// chance at those). \returns true on successful vectorization.
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  // The aggregate type must be representable as a vector.
  if (!R.canMapToVector(T: IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(LastInsertInst: IVI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(VL: BuildVectorOpds, R, MaxVFOnly);
}
27601
/// Tries to vectorize the buildvector sequence ending at insertelement
/// \p IEI as a vectorized list of the inserted scalars. With \p MaxVFOnly
/// set, two-element buildvectors are skipped (reduction matching gets the
/// first chance at those). \returns true on successful vectorization.
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  // Bail out if this is not a buildvector, or if all the inserted operands
  // are extractelements/undefs that already form a fixed-vector shuffle
  // (presumably a pattern handled elsewhere — nothing to gain here).
  if (!findBuildAggregate(LastInsertInst: IEI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts, R) ||
      (all_of(Range&: BuildVectorOpds, P: IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(VL: BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(VL: BuildVectorInsts, R, MaxVFOnly);
}
27624
/// Groups the (sorted) values in \p Incoming into runs of mutually compatible
/// elements (per \p AreCompatible) and feeds each run to
/// \p TryToVectorizeHelper. Elements left over from unsuccessful runs of the
/// same type are pooled into a second, type-only attempt; with \p MaxVFOnly
/// a final small-vector retry over that pool is performed. \returns true if
/// any call to the helper succeeded.
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements base on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    // Advance past deleted/non-instruction entries too, so a stale element
    // does not split an otherwise-compatible run.
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(VL, *SameTypeIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a 3-state attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with the
    // size of maximal register at first.
    // 2. Try to vectorize remaining instructions with the same type, if
    // possible. This may result in the better vectorization results rather than
    // if we try just to vectorize instructions with the same/alternate opcodes.
    // 3. Final attempt to try to vectorize all instructions with the
    // same/alternate ops only, this may result in some extra final
    // vectorization.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success start over because instructions might have been changed.
      Changed = true;
      // Keep the survivors (not deleted by the successful attempt) as
      // candidates for the type-only pass.
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL) {
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
      }
    } else {
      /// \Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(a: 2U, b: R.getMaxVecRegSize() / EltSize);
      };
      // Pool short runs of the same type for a combined later attempt.
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL) {
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
        }
      }
    }
    // Final attempt to vectorize instructions with the same types.
    // Triggered once the type of the upcoming run changes (or at the end).
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          // NOTE(review): AreCompatible takes (ArrayRef<T *>, T *); the call
          // below relies on ArrayRef's implicit single-element conversion, so
          // compatibility here is checked against *It only.
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
27736
/// Compare two cmp instructions. If IsCompatibility is true, function returns
/// true if 2 cmps have same/swapped predicates and most compatible
/// corresponding operands. If IsCompatibility is false, function implements
/// strict weak ordering relation between two cmp instructions, returning true
/// if the first instruction is "less" than the second, i.e. its predicate is
/// less than the predicate of the second or the operands IDs are less than
/// the operands IDs of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(Val: V);
  auto *CI2 = cast<CmpInst>(Val: V2);
  // Order primarily by operand type ID, then by scalar bit width.
  if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() <
      CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() >
      CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits())
    return false;
  // Canonicalize each predicate to min(pred, swapped pred) so that a cmp and
  // its operand-swapped form compare as equal.
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(pred: Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(pred: Pred2);
  CmpInst::Predicate BasePred1 = std::min(a: Pred1, b: SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(a: Pred2, b: SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  // If a cmp uses the swapped predicate, walk its operands in reverse so that
  // corresponding operands line up.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(i_nocapture: CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(i_nocapture: CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Val: Op1))
      if (auto *I2 = dyn_cast<Instruction>(Val: Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(BB: I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(BB: I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
27818
/// Tries to vectorize the compare instructions in \p CmpInsts of block \p BB:
/// first as reduction roots (via their operands), then as individual
/// two-element bundles, and finally as sorted lists of compatible compares.
/// \returns true if anything was vectorized.
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Val: Op)) {
        Changed |= vectorizeRootInstruction(P: nullptr, Root: RootOp, BB, R);
        // Stop looking at operands once the cmp itself is gone.
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize list of compares.
  // Sort by type, compare predicate, etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, TLI&: *TLI, DT: *DT);
  };

  // Compatibility is checked against the last element collected so far.
  auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
    if (VL.empty() || VL.back() == V1)
      return true;
    return compareCmp<true>(V: V1, V2: VL.back(), TLI&: *TLI, DT: *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(I: V) && isValidElementType(Ty: getValueType(V)))
      Vals.push_back(Elt: V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(Val: U);
            return Select &&
                   Select->getParent() != cast<Instruction>(Val: V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
27878
/// Tries to vectorize the postponed insertelement/insertvalue instructions in
/// \p Instructions (processed in reverse order) using three passes per
/// instruction: buildvector matching at max VF only, reduction matching, and
/// finally unrestricted buildvector matching. Clears \p Instructions.
/// \returns true if anything was vectorized.
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(C&: Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
    if (R.isDeleted(I) || isa<CmpInst>(Val: I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
      OpsChanged |=
          vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
      OpsChanged |=
          vectorizeInsertElementInst(IEI: LastInsertElem, BB, R, /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(P: nullptr, Root: I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(Val: I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
      OpsChanged |=
          vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
      OpsChanged |= vectorizeInsertElementInst(IEI: LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(Insts: PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
27917
27918bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
27919 bool Changed = false;
27920 SmallVector<Value *, 4> Incoming;
27921 SmallPtrSet<Value *, 16> VisitedInstrs;
27922 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
27923 // node. Allows better to identify the chains that can be vectorized in the
27924 // better way.
27925 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
27926 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
27927 assert(isValidElementType(V1->getType()) &&
27928 isValidElementType(V2->getType()) &&
27929 "Expected vectorizable types only.");
27930 if (V1 == V2)
27931 return false;
27932 // It is fine to compare type IDs here, since we expect only vectorizable
27933 // types, like ints, floats and pointers, we don't care about other type.
27934 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
27935 return true;
27936 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
27937 return false;
27938 if (V1->getType()->getScalarSizeInBits() <
27939 V2->getType()->getScalarSizeInBits())
27940 return true;
27941 if (V1->getType()->getScalarSizeInBits() >
27942 V2->getType()->getScalarSizeInBits())
27943 return false;
27944 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
27945 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
27946 if (Opcodes1.size() < Opcodes2.size())
27947 return true;
27948 if (Opcodes1.size() > Opcodes2.size())
27949 return false;
27950 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27951 {
27952 // Instructions come first.
27953 auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]);
27954 auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I]);
27955 if (I1 && I2) {
27956 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
27957 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
27958 if (!NodeI1)
27959 return NodeI2 != nullptr;
27960 if (!NodeI2)
27961 return false;
27962 assert((NodeI1 == NodeI2) ==
27963 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27964 "Different nodes should have different DFS numbers");
27965 if (NodeI1 != NodeI2)
27966 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27967 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
27968 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
27969 const auto *E1 = dyn_cast<ExtractElementInst>(Val: I1);
27970 const auto *E2 = dyn_cast<ExtractElementInst>(Val: I2);
27971 if (!E1 || !E2)
27972 continue;
27973
27974 // Sort on ExtractElementInsts primarily by vector operands. Prefer
27975 // program order of the vector operands.
27976 const auto *V1 = dyn_cast<Instruction>(Val: E1->getVectorOperand());
27977 const auto *V2 = dyn_cast<Instruction>(Val: E2->getVectorOperand());
27978 if (V1 != V2) {
27979 if (V1 && !V2)
27980 return true;
27981 if (!V1 && V2)
27982 return false;
27983 DomTreeNodeBase<BasicBlock> *NodeI1 =
27984 DT->getNode(BB: V1->getParent());
27985 DomTreeNodeBase<BasicBlock> *NodeI2 =
27986 DT->getNode(BB: V2->getParent());
27987 if (!NodeI1)
27988 return NodeI2 != nullptr;
27989 if (!NodeI2)
27990 return false;
27991 assert((NodeI1 == NodeI2) ==
27992 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27993 "Different nodes should have different DFS numbers");
27994 if (NodeI1 != NodeI2)
27995 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27996 return V1->comesBefore(Other: V2);
27997 }
27998 // If we have the same vector operand, try to sort by constant
27999 // index.
28000 std::optional<unsigned> Id1 = getExtractIndex(E: E1);
28001 std::optional<unsigned> Id2 = getExtractIndex(E: E2);
28002 // Bring constants to the top
28003 if (Id1 && !Id2)
28004 return true;
28005 if (!Id1 && Id2)
28006 return false;
28007 // First elements come first.
28008 if (Id1 && Id2)
28009 return *Id1 < *Id2;
28010
28011 continue;
28012 }
28013 if (I1->getOpcode() == I2->getOpcode())
28014 continue;
28015 return I1->getOpcode() < I2->getOpcode();
28016 }
28017 if (I1)
28018 return true;
28019 if (I2)
28020 return false;
28021 }
28022 {
28023 // Non-undef constants come next.
28024 bool C1 = isa<Constant>(Val: Opcodes1[I]) && !isa<UndefValue>(Val: Opcodes1[I]);
28025 bool C2 = isa<Constant>(Val: Opcodes2[I]) && !isa<UndefValue>(Val: Opcodes2[I]);
28026 if (C1 && C2)
28027 continue;
28028 if (C1)
28029 return true;
28030 if (C2)
28031 return false;
28032 }
28033 bool U1 = isa<UndefValue>(Val: Opcodes1[I]);
28034 bool U2 = isa<UndefValue>(Val: Opcodes2[I]);
28035 {
28036 // Non-constant non-instructions come next.
28037 if (!U1 && !U2) {
28038 auto ValID1 = Opcodes1[I]->getValueID();
28039 auto ValID2 = Opcodes2[I]->getValueID();
28040 if (ValID1 == ValID2)
28041 continue;
28042 if (ValID1 < ValID2)
28043 return true;
28044 if (ValID1 > ValID2)
28045 return false;
28046 }
28047 if (!U1)
28048 return true;
28049 if (!U2)
28050 return false;
28051 }
28052 // Undefs come last.
28053 assert(U1 && U2 && "The only thing left should be undef & undef.");
28054 }
28055 return false;
28056 };
28057 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
28058 Value *V1) {
28059 if (VL.empty() || V1 == VL.back())
28060 return true;
28061 Value *V2 = VL.back();
28062 if (V1->getType() != V2->getType())
28063 return false;
28064 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
28065 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
28066 if (Opcodes1.size() != Opcodes2.size())
28067 return false;
28068 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
28069 // Undefs are compatible with any other value.
28070 if (isa<UndefValue>(Val: Opcodes1[I]) || isa<UndefValue>(Val: Opcodes2[I]))
28071 continue;
28072 if (auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]))
28073 if (auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I])) {
28074 if (R.isDeleted(I: I1) || R.isDeleted(I: I2))
28075 return false;
28076 if (I1->getParent() != I2->getParent())
28077 return false;
28078 if (getSameOpcode(VL: {I1, I2}, TLI: *TLI))
28079 continue;
28080 return false;
28081 }
28082 if (isa<Constant>(Val: Opcodes1[I]) && isa<Constant>(Val: Opcodes2[I]))
28083 continue;
28084 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
28085 return false;
28086 }
28087 return true;
28088 };
28089
28090 bool HaveVectorizedPhiNodes = false;
28091 do {
28092 // Collect the incoming values from the PHIs.
28093 Incoming.clear();
28094 for (Instruction &I : *BB) {
28095 auto *P = dyn_cast<PHINode>(Val: &I);
28096 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
28097 break;
28098
28099 // No need to analyze deleted, vectorized and non-vectorizable
28100 // instructions.
28101 if (!VisitedInstrs.count(Ptr: P) && !R.isDeleted(I: P) &&
28102 isValidElementType(Ty: P->getType()))
28103 Incoming.push_back(Elt: P);
28104 }
28105
28106 if (Incoming.size() <= 1)
28107 break;
28108
28109 // Find the corresponding non-phi nodes for better matching when trying to
28110 // build the tree.
28111 for (Value *V : Incoming) {
28112 SmallVectorImpl<Value *> &Opcodes =
28113 PHIToOpcodes.try_emplace(Key: V).first->getSecond();
28114 if (!Opcodes.empty())
28115 continue;
28116 SmallVector<Value *, 4> Nodes(1, V);
28117 SmallPtrSet<Value *, 4> Visited;
28118 while (!Nodes.empty()) {
28119 auto *PHI = cast<PHINode>(Val: Nodes.pop_back_val());
28120 if (!Visited.insert(Ptr: PHI).second)
28121 continue;
28122 for (Value *V : PHI->incoming_values()) {
28123 if (auto *PHI1 = dyn_cast<PHINode>(Val: (V))) {
28124 Nodes.push_back(Elt: PHI1);
28125 continue;
28126 }
28127 Opcodes.emplace_back(Args&: V);
28128 }
28129 }
28130 }
28131
28132 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
28133 Incoming, Comparator: PHICompare, AreCompatible: AreCompatiblePHIs,
28134 TryToVectorizeHelper: [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
28135 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
28136 },
28137 /*MaxVFOnly=*/true, R);
28138 Changed |= HaveVectorizedPhiNodes;
28139 if (HaveVectorizedPhiNodes && any_of(Range&: PHIToOpcodes, P: [&](const auto &P) {
28140 auto *PHI = dyn_cast<PHINode>(P.first);
28141 return !PHI || R.isDeleted(I: PHI);
28142 }))
28143 PHIToOpcodes.clear();
28144 VisitedInstrs.insert_range(R&: Incoming);
28145 } while (HaveVectorizedPhiNodes);
28146
28147 VisitedInstrs.clear();
28148
28149 InstSetVector PostProcessInserts;
28150 SmallSetVector<CmpInst *, 8> PostProcessCmps;
28151 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
28152 // also vectorizes `PostProcessCmps`.
28153 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
28154 bool Changed = vectorizeInserts(Instructions&: PostProcessInserts, BB, R);
28155 if (VectorizeCmps) {
28156 Changed |= vectorizeCmpInsts(CmpInsts: reverse(C&: PostProcessCmps), BB, R);
28157 PostProcessCmps.clear();
28158 }
28159 PostProcessInserts.clear();
28160 return Changed;
28161 };
28162 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
28163 auto IsInPostProcessInstrs = [&](Instruction *I) {
28164 if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
28165 return PostProcessCmps.contains(key: Cmp);
28166 return isa<InsertElementInst, InsertValueInst>(Val: I) &&
28167 PostProcessInserts.contains(key: I);
28168 };
28169 // Returns true if `I` is an instruction without users, like terminator, or
28170 // function call with ignored return value, store. Ignore unused instructions
28171 // (basing on instruction type, except for CallInst and InvokeInst).
28172 auto HasNoUsers = [](Instruction *I) {
28173 return I->use_empty() &&
28174 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(Val: I));
28175 };
28176 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
28177 // Skip instructions with scalable type. The num of elements is unknown at
28178 // compile-time for scalable type.
28179 if (isa<ScalableVectorType>(Val: It->getType()))
28180 continue;
28181
28182 // Skip instructions marked for the deletion.
28183 if (R.isDeleted(I: &*It))
28184 continue;
28185 // We may go through BB multiple times so skip the one we have checked.
28186 if (!VisitedInstrs.insert(Ptr: &*It).second) {
28187 if (HasNoUsers(&*It) &&
28188 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
28189 // We would like to start over since some instructions are deleted
28190 // and the iterator may become invalid value.
28191 Changed = true;
28192 It = BB->begin();
28193 E = BB->end();
28194 }
28195 continue;
28196 }
28197
28198 // Try to vectorize reductions that use PHINodes.
28199 if (PHINode *P = dyn_cast<PHINode>(Val&: It)) {
28200 // Check that the PHI is a reduction PHI.
28201 if (P->getNumIncomingValues() == 2) {
28202 // Try to match and vectorize a horizontal reduction.
28203 Instruction *Root = getReductionInstr(DT, P, ParentBB: BB, LI);
28204 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
28205 Changed = true;
28206 It = BB->begin();
28207 E = BB->end();
28208 continue;
28209 }
28210 }
28211 // Try to vectorize the incoming values of the PHI, to catch reductions
28212 // that feed into PHIs.
28213 for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
28214 // Skip if the incoming block is the current BB for now. Also, bypass
28215 // unreachable IR for efficiency and to avoid crashing.
28216 // TODO: Collect the skipped incoming values and try to vectorize them
28217 // after processing BB.
28218 if (BB == P->getIncomingBlock(i: I) ||
28219 !DT->isReachableFromEntry(A: P->getIncomingBlock(i: I)))
28220 continue;
28221
28222 // Postponed instructions should not be vectorized here, delay their
28223 // vectorization.
28224 if (auto *PI = dyn_cast<Instruction>(Val: P->getIncomingValue(i: I));
28225 PI && !IsInPostProcessInstrs(PI)) {
28226 bool Res =
28227 vectorizeRootInstruction(P: nullptr, Root: PI, BB: P->getIncomingBlock(i: I), R);
28228 Changed |= Res;
28229 if (Res && R.isDeleted(I: P)) {
28230 It = BB->begin();
28231 E = BB->end();
28232 break;
28233 }
28234 }
28235 }
28236 continue;
28237 }
28238
28239 if (HasNoUsers(&*It)) {
28240 bool OpsChanged = false;
28241 auto *SI = dyn_cast<StoreInst>(Val&: It);
28242 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
28243 if (SI) {
28244 auto *I = Stores.find(Key: getUnderlyingObject(V: SI->getPointerOperand()));
28245 // Try to vectorize chain in store, if this is the only store to the
28246 // address in the block.
28247 // TODO: This is just a temporarily solution to save compile time. Need
28248 // to investigate if we can safely turn on slp-vectorize-hor-store
28249 // instead to allow lookup for reduction chains in all non-vectorized
28250 // stores (need to check side effects and compile time).
28251 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
28252 SI->getValueOperand()->hasOneUse();
28253 }
28254 if (TryToVectorizeRoot) {
28255 for (auto *V : It->operand_values()) {
28256 // Postponed instructions should not be vectorized here, delay their
28257 // vectorization.
28258 if (auto *VI = dyn_cast<Instruction>(Val: V);
28259 VI && !IsInPostProcessInstrs(VI))
28260 // Try to match and vectorize a horizontal reduction.
28261 OpsChanged |= vectorizeRootInstruction(P: nullptr, Root: VI, BB, R);
28262 }
28263 }
28264 // Start vectorization of post-process list of instructions from the
28265 // top-tree instructions to try to vectorize as many instructions as
28266 // possible.
28267 OpsChanged |=
28268 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
28269 if (OpsChanged) {
28270 // We would like to start over since some instructions are deleted
28271 // and the iterator may become invalid value.
28272 Changed = true;
28273 It = BB->begin();
28274 E = BB->end();
28275 continue;
28276 }
28277 }
28278
28279 if (isa<InsertElementInst, InsertValueInst>(Val: It))
28280 PostProcessInserts.insert(X: &*It);
28281 else if (isa<CmpInst>(Val: It))
28282 PostProcessCmps.insert(X: cast<CmpInst>(Val: &*It));
28283 }
28284
28285 return Changed;
28286}
28287
/// Tries to vectorize the *index* computations of the previously collected
/// getelementptr lists (one list per map entry in the `GEPs` member). The
/// GEPs themselves are not vectorized here; only their single, non-constant
/// index operands are bundled and handed to tryToVectorizeList.
/// \param BB currently unreferenced in this body; kept for interface
///        symmetry with the other per-block vectorization entry points.
/// \param R the SLP tree builder/vectorizer used for queries and codegen.
/// \returns true if any bundle of GEP indices was vectorized.
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done. We
    // are trying to vectorize the index computations, so the maximum number of
    // elements is based on the size of the index expression, rather than the
    // size of the GEP itself (the target's pointer size).
    // Find the first not-yet-deleted GEP; its first index supplies the element
    // type whose size determines the chunk width below.
    auto *It = find_if(Range&: Entry.second, P: [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(I: GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(V: *(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(a: BE - BI, b: MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set a candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(llvm::from_range, GEPList);

      // Some of the candidates may have already been vectorized after we
      // initially collected them or their index is optimized to constant value.
      // If so, they are marked as deleted, so remove them from the set of
      // candidates.
      Candidates.remove_if(P: [&R](Value *I) {
        return R.isDeleted(I: cast<Instruction>(Val: I)) ||
               isa<Constant>(Val: cast<GetElementPtrInst>(Val: I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      // Both loops bail out early once fewer than two candidates remain,
      // since a bundle needs at least two elements.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(key: GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(V: GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          if (!Candidates.count(key: GEPJ))
            continue;
          const SCEV *SCEVJ = SE->getSCEV(V: GEPList[J]);
          if (isa<SCEVConstant>(Val: SE->getMinusSCEV(LHS: SCEVI, RHS: SCEVJ))) {
            // GEPI and GEPJ differ by a compile-time constant: drop both, as
            // one address is trivially derivable from the other.
            Candidates.remove(X: GEPI);
            Candidates.remove(X: GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            // Identical index value: keep only the first occurrence so the
            // bundle contains unique index computations.
            Candidates.remove(X: GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(Val: V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(VL: Bundle, R);
    }
  }
  return Changed;
}
28389
/// Tries to vectorize the previously collected store groups (the `Stores`
/// member, one group per map entry). Each group is sorted so that compatible
/// stores become adjacent, partitioned into compatible runs, and each run is
/// handed to vectorizeStores.
/// \param R the SLP tree builder/vectorizer used for queries and codegen.
/// \returns true if any store chain was vectorized.
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  // This is a strict weak ordering: each criterion returns only on a strict
  // inequality and falls through to the next tie-breaker otherwise.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    // 1) Value operand type ID.
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    // 2) Pointer operand type ID.
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    // 3) Scalar bit width of the stored value.
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    // 4) For instruction value operands: order by the dominator-tree DFS-in
    //    number of the defining block, then by opcode, so stores of values
    //    defined in the same/nearby blocks with matching opcodes cluster.
    auto *I1 = dyn_cast<Instruction>(Val: V->getValueOperand());
    auto *I2 = dyn_cast<Instruction>(Val: V2->getValueOperand());
    if (I1 && I2) {
      DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
      DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
      assert(NodeI1 && "Should only process reachable instructions");
      assert(NodeI2 && "Should only process reachable instructions");
      assert((NodeI1 == NodeI2) ==
                 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeI1 != NodeI2)
        return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
      return I1->getOpcode() < I2->getOpcode();
    }
    // Instructions sort before non-instruction values.
    if (I1 && !I2)
      return true;
    if (!I1 && I2)
      return false;
    // 5) Final tie-breaker: the ValueID of the stored value.
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  // Tracks whether all instruction value operands seen so far in the current
  // candidate run share one parent block; reset when a new run starts (i.e.
  // when AreCompatibleStores is called with an empty list). State is carried
  // across successive AreCompatibleStores invocations on purpose.
  bool SameParent = true;
  // Predicate deciding whether store V1 can extend the candidate run VL.
  auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
    if (VL.empty()) {
      SameParent = true;
      return true;
    }
    // Only compare against the most recently accepted store.
    StoreInst *V2 = VL.back();
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(Val: V1->getValueOperand()) ||
        isa<UndefValue>(Val: V2->getValueOperand()))
      return true;
    if (isa<Constant>(Val: V1->getValueOperand()) &&
        isa<Constant>(Val: V2->getValueOperand()))
      return true;
    // Check if the operands of the stores can be vectorized. They can be
    // vectorized, if they have compatible operands or have operands, which can
    // be vectorized as copyables.
    auto *I1 = dyn_cast<Instruction>(Val: V1->getValueOperand());
    auto *I2 = dyn_cast<Instruction>(Val: V2->getValueOperand());
    if (I1 || I2) {
      // Accept only tail-following non-compatible values for now.
      // TODO: investigate if it is possible to vectorize incompatible values,
      // if the copyables are first in the list.
      if (I1 && !I2)
        return false;
      SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
      // Build the would-be value list (existing run + V1) and ask the
      // compatibility analysis whether it forms a vectorizable state.
      SmallVector<Value *> NewVL(VL.size() + 1);
      for (auto [SI, V] : zip(t&: VL, u&: NewVL))
        V = SI->getValueOperand();
      NewVL.back() = V1->getValueOperand();
      InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
      InstructionsState S = Analysis.buildInstructionsState(
          VL: NewVL, R, TryCopyableElementsVectorization: VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
          /*SkipSameCodeCheck=*/!SameParent);
      if (S)
        return true;
      if (!SameParent)
        return false;
    }
    // Fall back to comparing the ValueID of the stored values.
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  // `Attempted` is threaded through vectorizeStores as its `Visited` set,
  // presumably to avoid re-analyzing identical candidate bundles across
  // groups — confirm against vectorizeStores' definition.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    // A chain needs at least two stores.
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Ty: Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stores to the same addresses several times, in this case need
    // to follow the stores order (reversed to meet the memory dependecies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        Incoming&: ReversedStores, Comparator: StoreSorter, AreCompatible: AreCompatibleStores,
        TryToVectorizeHelper: [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Stores: Candidates, R, Visited&: Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
28512