//===----------- VectorUtils.cpp - Vectorizer utility functions -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines vectorizer utilities.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"

#define DEBUG_TYPE "vectorutils"

using namespace llvm;
using namespace llvm::PatternMatch;

/// Maximum factor for an interleaved memory access.
static cl::opt<unsigned> MaxInterleaveGroupFactor(
    "max-interleave-group-factor", cl::Hidden,
    cl::desc("Maximum factor for an interleaved access group (default = 8)"),
    cl::init(8));

/// Return true if all of the intrinsic's arguments and return type are scalars
/// for the scalar form of the intrinsic, and vectors for the vector form of the
/// intrinsic (except operands that are marked as always being scalar by
/// isVectorIntrinsicWithScalarOpAtArg).
bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::abs: // Begin integer bit-manipulation.
  case Intrinsic::bswap:
  case Intrinsic::bitreverse:
  case Intrinsic::ctpop:
  case Intrinsic::ctlz:
  case Intrinsic::cttz:
  case Intrinsic::fshl:
  case Intrinsic::fshr:
  case Intrinsic::smax:
  case Intrinsic::smin:
  case Intrinsic::umax:
  case Intrinsic::umin:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::smul_fix:
  case Intrinsic::smul_fix_sat:
  case Intrinsic::umul_fix:
  case Intrinsic::umul_fix_sat:
  case Intrinsic::sqrt: // Begin floating-point.
  case Intrinsic::asin:
  case Intrinsic::acos:
  case Intrinsic::atan:
  case Intrinsic::atan2:
  case Intrinsic::sin:
  case Intrinsic::cos:
  case Intrinsic::sincos:
  case Intrinsic::sincospi:
  case Intrinsic::tan:
  case Intrinsic::sinh:
  case Intrinsic::cosh:
  case Intrinsic::tanh:
  case Intrinsic::exp:
  case Intrinsic::exp10:
  case Intrinsic::exp2:
  case Intrinsic::log:
  case Intrinsic::log10:
  case Intrinsic::log2:
  case Intrinsic::fabs:
  case Intrinsic::minnum:
  case Intrinsic::maxnum:
  case Intrinsic::minimum:
  case Intrinsic::maximum:
  case Intrinsic::minimumnum:
  case Intrinsic::maximumnum:
  case Intrinsic::modf:
  case Intrinsic::copysign:
  case Intrinsic::floor:
  case Intrinsic::ceil:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::nearbyint:
  case Intrinsic::round:
  case Intrinsic::roundeven:
  case Intrinsic::pow:
  case Intrinsic::fma:
  case Intrinsic::fmuladd:
  case Intrinsic::is_fpclass:
  case Intrinsic::powi:
  case Intrinsic::canonicalize:
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
  case Intrinsic::ucmp:
  case Intrinsic::scmp:
    return true;
  default:
    return false;
  }
}

bool llvm::isTriviallyScalarizable(Intrinsic::ID ID,
                                   const TargetTransformInfo *TTI) {
  if (isTriviallyVectorizable(ID))
    return true;

  if (TTI && Intrinsic::isTargetIntrinsic(ID))
    return TTI->isTargetIntrinsicTriviallyScalarizable(ID);

  // TODO: Move frexp to isTriviallyVectorizable.
  // https://github.com/llvm/llvm-project/issues/112408
  switch (ID) {
  case Intrinsic::frexp:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::umul_with_overflow:
  case Intrinsic::smul_with_overflow:
    return true;
  }
  return false;
}

/// Identifies if the vector form of the intrinsic has a scalar operand.
bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
                                              unsigned ScalarOpdIdx,
                                              const TargetTransformInfo *TTI) {

  if (TTI && Intrinsic::isTargetIntrinsic(ID))
    return TTI->isTargetIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx);

  // Vector predication intrinsics have the EVL as the last operand.
  if (VPIntrinsic::getVectorLengthParamPos(ID) == ScalarOpdIdx)
    return true;

  switch (ID) {
  case Intrinsic::abs:
  case Intrinsic::vp_abs:
  case Intrinsic::ctlz:
  case Intrinsic::vp_ctlz:
  case Intrinsic::cttz:
  case Intrinsic::vp_cttz:
  case Intrinsic::is_fpclass:
  case Intrinsic::vp_is_fpclass:
  case Intrinsic::powi:
    return (ScalarOpdIdx == 1);
  case Intrinsic::smul_fix:
  case Intrinsic::smul_fix_sat:
  case Intrinsic::umul_fix:
  case Intrinsic::umul_fix_sat:
    return (ScalarOpdIdx == 2);
  case Intrinsic::experimental_vp_splice:
    return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
  default:
    return false;
  }
}

bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
    Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI) {
  assert(ID != Intrinsic::not_intrinsic && "Not an intrinsic!");

  if (TTI && Intrinsic::isTargetIntrinsic(ID))
    return TTI->isTargetIntrinsicWithOverloadTypeAtArg(ID, OpdIdx);

  if (VPCastIntrinsic::isVPCast(ID))
    return OpdIdx == -1 || OpdIdx == 0;

  switch (ID) {
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
  case Intrinsic::vp_lrint:
  case Intrinsic::vp_llrint:
  case Intrinsic::ucmp:
  case Intrinsic::scmp:
    return OpdIdx == -1 || OpdIdx == 0;
  case Intrinsic::modf:
  case Intrinsic::sincos:
  case Intrinsic::sincospi:
  case Intrinsic::is_fpclass:
  case Intrinsic::vp_is_fpclass:
    return OpdIdx == 0;
  case Intrinsic::powi:
    return OpdIdx == -1 || OpdIdx == 1;
  default:
    return OpdIdx == -1;
  }
}

bool llvm::isVectorIntrinsicWithStructReturnOverloadAtField(
    Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI) {

  if (TTI && Intrinsic::isTargetIntrinsic(ID))
    return TTI->isTargetIntrinsicWithStructReturnOverloadAtField(ID, RetIdx);

  switch (ID) {
  case Intrinsic::frexp:
    return RetIdx == 0 || RetIdx == 1;
  default:
    return RetIdx == 0;
  }
}

/// Returns the intrinsic ID for the given call.
/// For the input call instruction it finds the mapped intrinsic and returns
/// its ID; if no mapping is found, it returns not_intrinsic.
Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
                                                const TargetLibraryInfo *TLI) {
  Intrinsic::ID ID = getIntrinsicForCallSite(*CI, TLI);
  if (ID == Intrinsic::not_intrinsic)
    return Intrinsic::not_intrinsic;

  if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start ||
      ID == Intrinsic::lifetime_end || ID == Intrinsic::assume ||
      ID == Intrinsic::experimental_noalias_scope_decl ||
      ID == Intrinsic::sideeffect || ID == Intrinsic::pseudoprobe)
    return ID;
  return Intrinsic::not_intrinsic;
}

struct InterleaveIntrinsic {
  Intrinsic::ID Interleave, Deinterleave;
};

static InterleaveIntrinsic InterleaveIntrinsics[] = {
    {Intrinsic::vector_interleave2, Intrinsic::vector_deinterleave2},
    {Intrinsic::vector_interleave3, Intrinsic::vector_deinterleave3},
    {Intrinsic::vector_interleave4, Intrinsic::vector_deinterleave4},
    {Intrinsic::vector_interleave5, Intrinsic::vector_deinterleave5},
    {Intrinsic::vector_interleave6, Intrinsic::vector_deinterleave6},
    {Intrinsic::vector_interleave7, Intrinsic::vector_deinterleave7},
    {Intrinsic::vector_interleave8, Intrinsic::vector_deinterleave8},
};

Intrinsic::ID llvm::getInterleaveIntrinsicID(unsigned Factor) {
  assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
  return InterleaveIntrinsics[Factor - 2].Interleave;
}

Intrinsic::ID llvm::getDeinterleaveIntrinsicID(unsigned Factor) {
  assert(Factor >= 2 && Factor <= 8 && "Unexpected factor");
  return InterleaveIntrinsics[Factor - 2].Deinterleave;
}

/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
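/// For example, given the (illustrative) IR
///   %w = insertelement <4 x float> %v, float %s, i32 2
/// findScalarElement(%w, 2) returns %s, while findScalarElement(%w, 1)
/// recurses into %v.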
Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
  assert(V->getType()->isVectorTy() && "Not looking at a vector?");
  VectorType *VTy = cast<VectorType>(V->getType());
  // For fixed-length vector, return poison for out of range access.
  if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
    unsigned Width = FVTy->getNumElements();
    if (EltNo >= Width)
      return PoisonValue::get(FVTy->getElementType());
  }

  if (Constant *C = dyn_cast<Constant>(V))
    return C->getAggregateElement(EltNo);

  if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
    // If this is an insert to a variable element, we don't know what it is.
    if (!isa<ConstantInt>(III->getOperand(2)))
      return nullptr;
    unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();

    // If this is an insert to the element we are looking for, return the
    // inserted value.
    if (EltNo == IIElt)
      return III->getOperand(1);

    // Guard against infinite loop on malformed, unreachable IR.
    if (III == III->getOperand(0))
      return nullptr;

    // Otherwise, the insertelement doesn't modify the value, recurse on its
    // vector input.
    return findScalarElement(III->getOperand(0), EltNo);
  }

  ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
  // Restrict the following transformation to fixed-length vector.
  if (SVI && isa<FixedVectorType>(SVI->getType())) {
    unsigned LHSWidth =
        cast<FixedVectorType>(SVI->getOperand(0)->getType())->getNumElements();
    int InEl = SVI->getMaskValue(EltNo);
    if (InEl < 0)
      return PoisonValue::get(VTy->getElementType());
    if (InEl < (int)LHSWidth)
      return findScalarElement(SVI->getOperand(0), InEl);
    return findScalarElement(SVI->getOperand(1), InEl - LHSWidth);
  }

  // Extract a value from a vector add operation with a constant zero.
  // TODO: Use getBinOpIdentity() to generalize this.
  Value *Val; Constant *C;
  if (match(V, m_Add(m_Value(Val), m_Constant(C))))
    if (Constant *Elt = C->getAggregateElement(EltNo))
      if (Elt->isNullValue())
        return findScalarElement(Val, EltNo);

  // If the vector is a splat then we can trivially find the scalar element.
  if (isa<ScalableVectorType>(VTy))
    if (Value *Splat = getSplatValue(V))
      if (EltNo < VTy->getElementCount().getKnownMinValue())
        return Splat;

  // Otherwise, we don't know.
  return nullptr;
}

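/// For example, the shuffle mask <2, -1, 2, 2> yields a splat index of 2,
/// while <0, 1, 0, 1> references two different elements and yields -1.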
int llvm::getSplatIndex(ArrayRef<int> Mask) {
  int SplatIndex = -1;
  for (int M : Mask) {
    // Ignore invalid (undefined) mask elements.
    if (M < 0)
      continue;

    // There can be only 1 non-negative mask element value if this is a splat.
    if (SplatIndex != -1 && SplatIndex != M)
      return -1;

    // Initialize the splat index to the 1st non-negative mask element.
    SplatIndex = M;
  }
  assert((SplatIndex == -1 || SplatIndex >= 0) && "Negative index?");
  return SplatIndex;
}

/// Get splat value if the input is a splat vector or return nullptr.
/// This function is not fully general. It checks only 2 cases:
/// the input value is (1) a splat constant vector or (2) a sequence
/// of instructions that broadcasts a scalar at element 0.
Value *llvm::getSplatValue(const Value *V) {
  if (isa<VectorType>(V->getType()))
    if (auto *C = dyn_cast<Constant>(V))
      return C->getSplatValue();

  // shuf (inselt ?, Splat, 0), ?, <0, undef, 0, ...>
  Value *Splat;
  if (match(V,
            m_Shuffle(m_InsertElt(m_Value(), m_Value(Splat), m_ZeroInt()),
                      m_Value(), m_ZeroMask())))
    return Splat;

  return nullptr;
}

bool llvm::isSplatValue(const Value *V, int Index, unsigned Depth) {
  assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");

  if (isa<VectorType>(V->getType())) {
    if (isa<UndefValue>(V))
      return true;
    // FIXME: We can allow undefs, but if Index was specified, we may want to
    // check that the constant is defined at that index.
    if (auto *C = dyn_cast<Constant>(V))
      return C->getSplatValue() != nullptr;
  }

  if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) {
    // FIXME: We can safely allow undefs here. If Index was specified, we will
    // check that the mask elt is defined at the required index.
    if (!all_equal(Shuf->getShuffleMask()))
      return false;

    // Match any index.
    if (Index == -1)
      return true;

    // Match a specific element. The mask should be defined at and match the
    // specified index.
    return Shuf->getMaskValue(Index) == Index;
  }

  // The remaining tests are all recursive, so bail out if we hit the limit.
  if (Depth++ == MaxAnalysisRecursionDepth)
    return false;

  // If both operands of a binop are splats, the result is a splat.
  Value *X, *Y, *Z;
  if (match(V, m_BinOp(m_Value(X), m_Value(Y))))
    return isSplatValue(X, Index, Depth) && isSplatValue(Y, Index, Depth);

  // If all operands of a select are splats, the result is a splat.
  if (match(V, m_Select(m_Value(X), m_Value(Y), m_Value(Z))))
    return isSplatValue(X, Index, Depth) && isSplatValue(Y, Index, Depth) &&
           isSplatValue(Z, Index, Depth);

  // TODO: Add support for unary ops (fneg), casts, intrinsics (overflow ops).

  return false;
}

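/// For example, with SrcWidth = 4, Mask = <0, 5, 2, 7> and all result
/// elements demanded, DemandedLHS ends up as {0, 2} and DemandedRHS as
/// {1, 3}.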
bool llvm::getShuffleDemandedElts(int SrcWidth, ArrayRef<int> Mask,
                                  const APInt &DemandedElts, APInt &DemandedLHS,
                                  APInt &DemandedRHS, bool AllowUndefElts) {
  DemandedLHS = DemandedRHS = APInt::getZero(SrcWidth);

  // Early out if we don't demand any elements.
  if (DemandedElts.isZero())
    return true;

  // Simple case of a shuffle with zeroinitializer.
  if (all_of(Mask, [](int Elt) { return Elt == 0; })) {
    DemandedLHS.setBit(0);
    return true;
  }

  for (unsigned I = 0, E = Mask.size(); I != E; ++I) {
    int M = Mask[I];
    assert((-1 <= M) && (M < (SrcWidth * 2)) &&
           "Invalid shuffle mask constant");

    if (!DemandedElts[I] || (AllowUndefElts && (M < 0)))
      continue;

    // For undef elements, we don't know anything about the common state of
    // the shuffle result.
    if (M < 0)
      return false;

    if (M < SrcWidth)
      DemandedLHS.setBit(M);
    else
      DemandedRHS.setBit(M - SrcWidth);
  }

  return true;
}

bool llvm::isMaskedSlidePair(ArrayRef<int> Mask, int NumElts,
                             std::array<std::pair<int, int>, 2> &SrcInfo) {
  const int SignalValue = NumElts * 2;
  SrcInfo[0] = {-1, SignalValue};
  SrcInfo[1] = {-1, SignalValue};
  for (auto [i, M] : enumerate(Mask)) {
    if (M < 0)
      continue;
    int Src = M >= (int)NumElts;
    int Diff = (int)i - (M % NumElts);
    bool Match = false;
    for (int j = 0; j < 2; j++) {
      auto &[SrcE, DiffE] = SrcInfo[j];
      if (SrcE == -1) {
        assert(DiffE == SignalValue);
        SrcE = Src;
        DiffE = Diff;
      }
      if (SrcE == Src && DiffE == Diff) {
        Match = true;
        break;
      }
    }
    if (!Match)
      return false;
  }
  // Avoid all undef masks
  return SrcInfo[0].first != -1;
}

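/// For example, the mask <1, 2, 3> scaled by 2 becomes <2, 3, 4, 5, 6, 7>.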
void llvm::narrowShuffleMaskElts(int Scale, ArrayRef<int> Mask,
                                 SmallVectorImpl<int> &ScaledMask) {
  assert(Scale > 0 && "Unexpected scaling factor");

  // Fast-path: if no scaling, then it is just a copy.
  if (Scale == 1) {
    ScaledMask.assign(Mask.begin(), Mask.end());
    return;
  }

  ScaledMask.clear();
  for (int MaskElt : Mask) {
    if (MaskElt >= 0) {
      assert(((uint64_t)Scale * MaskElt + (Scale - 1)) <= INT32_MAX &&
             "Overflowed 32-bits");
    }
    for (int SliceElt = 0; SliceElt != Scale; ++SliceElt)
      ScaledMask.push_back(MaskElt < 0 ? MaskElt : Scale * MaskElt + SliceElt);
  }
}

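/// For example, with Scale = 2 the mask <0, 1, 4, 5> widens to <0, 2>,
/// whereas <0, 2, 1, 3> cannot be widened and the function returns false.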
bool llvm::widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &ScaledMask) {
  assert(Scale > 0 && "Unexpected scaling factor");

  // Fast-path: if no scaling, then it is just a copy.
  if (Scale == 1) {
    ScaledMask.assign(Mask.begin(), Mask.end());
    return true;
  }

  // We must map the original elements down evenly to a type with fewer elements.
  int NumElts = Mask.size();
  if (NumElts % Scale != 0)
    return false;

  ScaledMask.clear();
  ScaledMask.reserve(NumElts / Scale);

  // Step through the input mask by splitting into Scale-sized slices.
  do {
    ArrayRef<int> MaskSlice = Mask.take_front(Scale);
    assert((int)MaskSlice.size() == Scale && "Expected Scale-sized slice.");

    // The first element of the slice determines how we evaluate this slice.
    int SliceFront = MaskSlice.front();
    if (SliceFront < 0) {
      // Negative values (undef or other "sentinel" values) must be equal across
      // the entire slice.
      if (!all_equal(MaskSlice))
        return false;
      ScaledMask.push_back(SliceFront);
    } else {
      // A positive mask element must be cleanly divisible.
      if (SliceFront % Scale != 0)
        return false;
      // Elements of the slice must be consecutive.
      for (int i = 1; i < Scale; ++i)
        if (MaskSlice[i] != SliceFront + i)
          return false;
      ScaledMask.push_back(SliceFront / Scale);
    }
    Mask = Mask.drop_front(Scale);
  } while (!Mask.empty());

  assert((int)ScaledMask.size() * Scale == NumElts && "Unexpected scaled mask");

  // All elements of the original mask can be scaled down to map to the elements
  // of a mask with wider elements.
  return true;
}

bool llvm::widenShuffleMaskElts(ArrayRef<int> M,
                                SmallVectorImpl<int> &NewMask) {
  unsigned NumElts = M.size();
  if (NumElts % 2 != 0)
    return false;

  NewMask.clear();
  for (unsigned i = 0; i < NumElts; i += 2) {
    int M0 = M[i];
    int M1 = M[i + 1];

    // If both elements are undef, new mask is undef too.
    if (M0 == -1 && M1 == -1) {
      NewMask.push_back(-1);
      continue;
    }

    if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
      NewMask.push_back(M1 / 2);
      continue;
    }

    if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
      NewMask.push_back(M0 / 2);
      continue;
    }

    NewMask.clear();
    return false;
  }

  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
  return true;
}

bool llvm::scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &ScaledMask) {
  unsigned NumSrcElts = Mask.size();
  assert(NumSrcElts > 0 && NumDstElts > 0 && "Unexpected scaling factor");

  // Fast-path: if no scaling, then it is just a copy.
  if (NumSrcElts == NumDstElts) {
    ScaledMask.assign(Mask.begin(), Mask.end());
    return true;
  }

  // Ensure we can find a whole scale factor.
  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
         "Unexpected scaling factor");

  if (NumSrcElts > NumDstElts) {
    int Scale = NumSrcElts / NumDstElts;
    return widenShuffleMaskElts(Scale, Mask, ScaledMask);
  }

  int Scale = NumDstElts / NumSrcElts;
  narrowShuffleMaskElts(Scale, Mask, ScaledMask);
  return true;
}

void llvm::getShuffleMaskWithWidestElts(ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &ScaledMask) {
  std::array<SmallVector<int, 16>, 2> TmpMasks;
  SmallVectorImpl<int> *Output = &TmpMasks[0], *Tmp = &TmpMasks[1];
  ArrayRef<int> InputMask = Mask;
  for (unsigned Scale = 2; Scale <= InputMask.size(); ++Scale) {
    while (widenShuffleMaskElts(Scale, InputMask, *Output)) {
      InputMask = *Output;
      std::swap(Output, Tmp);
    }
  }
  ScaledMask.assign(InputMask.begin(), InputMask.end());
}

void llvm::processShuffleMasks(
    ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
    unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
    function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
    function_ref<void(ArrayRef<int>, unsigned, unsigned, bool)>
        ManyInputsAction) {
  SmallVector<SmallVector<SmallVector<int>>> Res(NumOfDestRegs);
  // Try to perform better estimation of the permutation.
  // 1. Split the source/destination vectors into real registers.
  // 2. Do the mask analysis to identify which real registers are
  // permuted.
  int Sz = Mask.size();
  unsigned SzDest = Sz / NumOfDestRegs;
  unsigned SzSrc = Sz / NumOfSrcRegs;
  for (unsigned I = 0; I < NumOfDestRegs; ++I) {
    auto &RegMasks = Res[I];
    RegMasks.assign(2 * NumOfSrcRegs, {});
    // Check that the values in dest registers are in the one src
    // register.
    for (unsigned K = 0; K < SzDest; ++K) {
      int Idx = I * SzDest + K;
      if (Idx == Sz)
        break;
      if (Mask[Idx] >= 2 * Sz || Mask[Idx] == PoisonMaskElem)
        continue;
      int MaskIdx = Mask[Idx] % Sz;
      int SrcRegIdx = MaskIdx / SzSrc + (Mask[Idx] >= Sz ? NumOfSrcRegs : 0);
      // Add a cost of PermuteTwoSrc for each new source register permute,
      // if we have more than one source registers.
      if (RegMasks[SrcRegIdx].empty())
        RegMasks[SrcRegIdx].assign(SzDest, PoisonMaskElem);
      RegMasks[SrcRegIdx][K] = MaskIdx % SzSrc;
    }
  }
  // Process split mask.
  for (unsigned I : seq<unsigned>(NumOfUsedRegs)) {
    auto &Dest = Res[I];
    int NumSrcRegs =
        count_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
    switch (NumSrcRegs) {
    case 0:
      // No input vectors were used!
      NoInputAction();
      break;
    case 1: {
      // Find the single non-empty source register mask.
      auto *It =
          find_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
      unsigned SrcReg = std::distance(Dest.begin(), It);
      SingleInputAction(*It, SrcReg, I);
      break;
    }
    default: {
      // The first mask is a permutation of a single register. Since we have >2
      // input registers to shuffle, we merge the masks for 2 first registers
      // and generate a shuffle of 2 registers rather than the reordering of the
      // first register and then shuffle with the second register. Next,
      // generate the shuffles of the resulting register + the remaining
      // registers from the list.
      auto &&CombineMasks = [](MutableArrayRef<int> FirstMask,
                               ArrayRef<int> SecondMask) {
        for (int Idx = 0, VF = FirstMask.size(); Idx < VF; ++Idx) {
          if (SecondMask[Idx] != PoisonMaskElem) {
            assert(FirstMask[Idx] == PoisonMaskElem &&
                   "Expected undefined mask element.");
            FirstMask[Idx] = SecondMask[Idx] + VF;
          }
        }
      };
      auto &&NormalizeMask = [](MutableArrayRef<int> Mask) {
        for (int Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
          if (Mask[Idx] != PoisonMaskElem)
            Mask[Idx] = Idx;
        }
      };
      int SecondIdx;
      bool NewReg = true;
      do {
        int FirstIdx = -1;
        SecondIdx = -1;
        MutableArrayRef<int> FirstMask, SecondMask;
        for (unsigned I : seq<unsigned>(2 * NumOfSrcRegs)) {
          SmallVectorImpl<int> &RegMask = Dest[I];
          if (RegMask.empty())
            continue;

          if (FirstIdx == SecondIdx) {
            FirstIdx = I;
            FirstMask = RegMask;
            continue;
          }
          SecondIdx = I;
          SecondMask = RegMask;
          CombineMasks(FirstMask, SecondMask);
          ManyInputsAction(FirstMask, FirstIdx, SecondIdx, NewReg);
          NewReg = false;
          NormalizeMask(FirstMask);
          RegMask.clear();
          SecondMask = FirstMask;
          SecondIdx = FirstIdx;
        }
        if (FirstIdx != SecondIdx && SecondIdx >= 0) {
          CombineMasks(SecondMask, FirstMask);
          ManyInputsAction(SecondMask, SecondIdx, FirstIdx, NewReg);
          NewReg = false;
          Dest[FirstIdx].clear();
          NormalizeMask(SecondMask);
        }
      } while (SecondIdx >= 0);
      break;
    }
    }
  }
}

void llvm::getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth,
                                               const APInt &DemandedElts,
                                               APInt &DemandedLHS,
                                               APInt &DemandedRHS) {
  assert(VectorBitWidth >= 128 && "Vectors smaller than 128 bit not supported");
  int NumLanes = VectorBitWidth / 128;
  int NumElts = DemandedElts.getBitWidth();
  int NumEltsPerLane = NumElts / NumLanes;
  int HalfEltsPerLane = NumEltsPerLane / 2;

  DemandedLHS = APInt::getZero(NumElts);
  DemandedRHS = APInt::getZero(NumElts);

  // Map DemandedElts to the horizontal operands.
  for (int Idx = 0; Idx != NumElts; ++Idx) {
    if (!DemandedElts[Idx])
      continue;
    int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
    int LocalIdx = Idx % NumEltsPerLane;
    if (LocalIdx < HalfEltsPerLane) {
      DemandedLHS.setBit(LaneIdx + 2 * LocalIdx);
    } else {
      LocalIdx -= HalfEltsPerLane;
      DemandedRHS.setBit(LaneIdx + 2 * LocalIdx);
    }
  }
}

MapVector<Instruction *, uint64_t>
llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
                               const TargetTransformInfo *TTI) {

  // DemandedBits will give us every value's live-out bits. But we want
  // to ensure no extra casts would need to be inserted, so every DAG
  // of connected values must have the same minimum bitwidth.
  EquivalenceClasses<Value *> ECs;
  SmallVector<Instruction *, 16> Worklist;
  SmallPtrSet<Instruction *, 4> Roots;
  SmallPtrSet<Instruction *, 16> Visited;
  DenseMap<Value *, uint64_t> DBits;
  SmallPtrSet<Instruction *, 4> InstructionSet;
  MapVector<Instruction *, uint64_t> MinBWs;

  // Determine the roots. We work bottom-up, from truncs or icmps.
  bool SeenExtFromIllegalType = false;
  for (auto *BB : Blocks)
    for (auto &I : *BB) {
      InstructionSet.insert(&I);

      if (TTI && (isa<ZExtInst>(&I) || isa<SExtInst>(&I)) &&
          !TTI->isTypeLegal(I.getOperand(0)->getType()))
        SeenExtFromIllegalType = true;

      // Only deal with non-vector integers up to 64-bits wide.
      if ((isa<TruncInst>(&I) || isa<ICmpInst>(&I)) &&
          !I.getType()->isVectorTy() &&
          I.getOperand(0)->getType()->getScalarSizeInBits() <= 64) {
        // Don't make work for ourselves. If we know the loaded type is legal,
        // don't add it to the worklist.
        if (TTI && isa<TruncInst>(&I) && TTI->isTypeLegal(I.getType()))
          continue;

        Worklist.push_back(&I);
        Roots.insert(&I);
      }
    }
  // Early exit.
  if (Worklist.empty() || (TTI && !SeenExtFromIllegalType))
    return MinBWs;

  // Now proceed breadth-first, unioning values together.
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    Value *Leader = ECs.getOrInsertLeaderValue(I);

    if (!Visited.insert(I).second)
      continue;

    // If we encounter a type that is larger than 64 bits, we can't represent
    // it so bail out.
    if (DB.getDemandedBits(I).getBitWidth() > 64)
      return MapVector<Instruction *, uint64_t>();

    uint64_t V = DB.getDemandedBits(I).getZExtValue();
    DBits[Leader] |= V;
    DBits[I] = V;

    // Casts, loads and instructions outside of our range terminate a chain
    // successfully.
    if (isa<SExtInst>(I) || isa<ZExtInst>(I) || isa<LoadInst>(I) ||
        !InstructionSet.count(I))
      continue;

    // Unsafe casts terminate a chain unsuccessfully. We can't do anything
    // useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to
    // transform anything that relies on them.
    if (isa<BitCastInst>(I) || isa<PtrToIntInst>(I) || isa<IntToPtrInst>(I) ||
        !I->getType()->isIntegerTy()) {
      DBits[Leader] |= ~0ULL;
      continue;
    }

    // We don't modify the types of PHIs. Reductions will already have been
    // truncated if possible, and inductions' sizes will have been chosen by
    // indvars.
    if (isa<PHINode>(I))
      continue;

    // Don't modify the types of operands of a call, as doing that would cause a
    // signature mismatch.
    if (isa<CallBase>(I))
      continue;

    if (DBits[Leader] == ~0ULL)
      // All bits demanded, no point continuing.
      continue;

    for (Value *O : I->operands()) {
      ECs.unionSets(Leader, O);
      if (auto *OI = dyn_cast<Instruction>(O))
        Worklist.push_back(OI);
    }
  }

  // Now we've discovered all values, walk them to see if there are
  // any users we didn't see. If there are, we can't optimize that
  // chain.
  for (auto &I : DBits)
    for (auto *U : I.first->users())
      if (U->getType()->isIntegerTy() && DBits.count(U) == 0)
        DBits[ECs.getOrInsertLeaderValue(I.first)] |= ~0ULL;

  for (const auto &E : ECs) {
    if (!E->isLeader())
      continue;
    uint64_t LeaderDemandedBits = 0;
    for (Value *M : ECs.members(*E))
      LeaderDemandedBits |= DBits[M];

    uint64_t MinBW = llvm::bit_width(LeaderDemandedBits);
    // Round up to a power of 2
    MinBW = llvm::bit_ceil(MinBW);

    // We don't modify the types of PHIs. Reductions will already have been
    // truncated if possible, and inductions' sizes will have been chosen by
    // indvars.
    // If we are required to shrink a PHI, abandon this entire equivalence class.
    bool Abort = false;
    for (Value *M : ECs.members(*E))
      if (isa<PHINode>(M) && MinBW < M->getType()->getScalarSizeInBits()) {
        Abort = true;
        break;
      }
    if (Abort)
      continue;

    for (Value *M : ECs.members(*E)) {
      auto *MI = dyn_cast<Instruction>(M);
      if (!MI)
        continue;
      Type *Ty = M->getType();
      if (Roots.count(MI))
        Ty = MI->getOperand(0)->getType();

      if (MinBW >= Ty->getScalarSizeInBits())
        continue;

      // If any of M's operands demand more bits than MinBW then M cannot be
      // performed safely in MinBW.
      auto *Call = dyn_cast<CallBase>(MI);
      auto Ops = Call ? Call->args() : MI->operands();
      if (any_of(Ops, [&DB, MinBW](Use &U) {
            auto *CI = dyn_cast<ConstantInt>(U);
            // For constant shift amounts, check if the shift would result in
            // poison.
            if (CI &&
                isa<ShlOperator, LShrOperator, AShrOperator>(U.getUser()) &&
                U.getOperandNo() == 1)
              return CI->uge(MinBW);
            uint64_t BW = bit_width(DB.getDemandedBits(&U).getZExtValue());
            return bit_ceil(BW) > MinBW;
          }))
        continue;

      MinBWs[MI] = MinBW;
    }
  }

  return MinBWs;
}

/// Add all access groups in @p AccGroups to @p List.
template <typename ListT>
static void addToAccessGroupList(ListT &List, MDNode *AccGroups) {
  // Interpret an access group as a list containing itself.
  if (AccGroups->getNumOperands() == 0) {
    assert(isValidAsAccessGroup(AccGroups) && "Node must be an access group");
    List.insert(AccGroups);
    return;
  }

  for (const auto &AccGroupListOp : AccGroups->operands()) {
    auto *Item = cast<MDNode>(AccGroupListOp.get());
    assert(isValidAsAccessGroup(Item) && "List item must be an access group");
    List.insert(Item);
  }
}

MDNode *llvm::uniteAccessGroups(MDNode *AccGroups1, MDNode *AccGroups2) {
  if (!AccGroups1)
    return AccGroups2;
  if (!AccGroups2)
    return AccGroups1;
  if (AccGroups1 == AccGroups2)
    return AccGroups1;

  SmallSetVector<Metadata *, 4> Union;
  addToAccessGroupList(Union, AccGroups1);
  addToAccessGroupList(Union, AccGroups2);

  if (Union.size() == 0)
    return nullptr;
  if (Union.size() == 1)
    return cast<MDNode>(Union.front());

  LLVMContext &Ctx = AccGroups1->getContext();
  return MDNode::get(Ctx, Union.getArrayRef());
}

MDNode *llvm::intersectAccessGroups(const Instruction *Inst1,
                                    const Instruction *Inst2) {
  bool MayAccessMem1 = Inst1->mayReadOrWriteMemory();
  bool MayAccessMem2 = Inst2->mayReadOrWriteMemory();

  if (!MayAccessMem1 && !MayAccessMem2)
    return nullptr;
  if (!MayAccessMem1)
    return Inst2->getMetadata(LLVMContext::MD_access_group);
  if (!MayAccessMem2)
    return Inst1->getMetadata(LLVMContext::MD_access_group);

  MDNode *MD1 = Inst1->getMetadata(LLVMContext::MD_access_group);
  MDNode *MD2 = Inst2->getMetadata(LLVMContext::MD_access_group);
  if (!MD1 || !MD2)
    return nullptr;
  if (MD1 == MD2)
    return MD1;

  // Use set for scalable 'contains' check.
  SmallPtrSet<Metadata *, 4> AccGroupSet2;
  addToAccessGroupList(AccGroupSet2, MD2);

  SmallVector<Metadata *, 4> Intersection;
  if (MD1->getNumOperands() == 0) {
    assert(isValidAsAccessGroup(MD1) && "Node must be an access group");
    if (AccGroupSet2.count(MD1))
      Intersection.push_back(MD1);
  } else {
    for (const MDOperand &Node : MD1->operands()) {
      auto *Item = cast<MDNode>(Node.get());
      assert(isValidAsAccessGroup(Item) && "List item must be an access group");
      if (AccGroupSet2.count(Item))
        Intersection.push_back(Item);
    }
  }

  if (Intersection.size() == 0)
    return nullptr;
  if (Intersection.size() == 1)
    return cast<MDNode>(Intersection.front());

  LLVMContext &Ctx = Inst1->getContext();
  return MDNode::get(Ctx, Intersection);
}

/// Add metadata from \p Inst to \p Metadata, if it can be preserved after
/// vectorization.
void llvm::getMetadataToPropagate(
    Instruction *Inst,
    SmallVectorImpl<std::pair<unsigned, MDNode *>> &Metadata) {
  Inst->getAllMetadataOtherThanDebugLoc(Metadata);
  static const unsigned SupportedIDs[] = {
      LLVMContext::MD_tbaa,         LLVMContext::MD_alias_scope,
      LLVMContext::MD_noalias,      LLVMContext::MD_fpmath,
      LLVMContext::MD_nontemporal,  LLVMContext::MD_invariant_load,
      LLVMContext::MD_access_group, LLVMContext::MD_mmra};

  // Remove any unsupported metadata kinds from Metadata.
  for (unsigned Idx = 0; Idx != Metadata.size();) {
    if (is_contained(SupportedIDs, Metadata[Idx].first)) {
      ++Idx;
    } else {
      // Swap element to end and remove it.
      std::swap(Metadata[Idx], Metadata.back());
      Metadata.pop_back();
    }
  }
}

/// \returns \p I after propagating metadata from \p VL.
Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
  if (VL.empty())
    return Inst;
  SmallVector<std::pair<unsigned, MDNode *>> Metadata;
  getMetadataToPropagate(cast<Instruction>(VL[0]), Metadata);

  for (auto &[Kind, MD] : Metadata) {
    for (int J = 1, E = VL.size(); MD && J != E; ++J) {
      const Instruction *IJ = cast<Instruction>(VL[J]);
      MDNode *IMD = IJ->getMetadata(Kind);

      switch (Kind) {
      case LLVMContext::MD_mmra: {
        MD = MMRAMetadata::combine(Inst->getContext(), MD, IMD);
        break;
      }
      case LLVMContext::MD_tbaa:
        MD = MDNode::getMostGenericTBAA(MD, IMD);
        break;
      case LLVMContext::MD_alias_scope:
        MD = MDNode::getMostGenericAliasScope(MD, IMD);
        break;
      case LLVMContext::MD_fpmath:
        MD = MDNode::getMostGenericFPMath(MD, IMD);
        break;
      case LLVMContext::MD_noalias:
      case LLVMContext::MD_nontemporal:
      case LLVMContext::MD_invariant_load:
        MD = MDNode::intersect(MD, IMD);
        break;
      case LLVMContext::MD_access_group:
        MD = intersectAccessGroups(Inst, IJ);
        break;
      default:
        llvm_unreachable("unhandled metadata");
      }
    }

    Inst->setMetadata(Kind, MD);
  }

  return Inst;
}

Constant *
llvm::createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF,
                           const InterleaveGroup<Instruction> &Group) {
  // All 1's means mask is not needed.
  if (Group.getNumMembers() == Group.getFactor())
    return nullptr;

  // TODO: support reversed access.
  assert(!Group.isReverse() && "Reversed group not supported.");

  SmallVector<Constant *, 16> Mask;
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < Group.getFactor(); ++j) {
      unsigned HasMember = Group.getMember(j) ? 1 : 0;
      Mask.push_back(Builder.getInt1(HasMember));
    }

  return ConstantVector::get(Mask);
}

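/// For example, the replicated mask for ReplicationFactor = 3 and VF = 4 is
///   <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>.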
llvm::SmallVector<int, 16>
llvm::createReplicatedMask(unsigned ReplicationFactor, unsigned VF) {
  SmallVector<int, 16> MaskVec;
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < ReplicationFactor; j++)
      MaskVec.push_back(i);

  return MaskVec;
}

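/// For example, the interleave mask for VF = 4 and NumVecs = 2 is
///   <0, 4, 1, 5, 2, 6, 3, 7>.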
llvm::SmallVector<int, 16> llvm::createInterleaveMask(unsigned VF,
                                                      unsigned NumVecs) {
  SmallVector<int, 16> Mask;
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < NumVecs; j++)
      Mask.push_back(j * VF + i);

  return Mask;
}

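/// For example, the stride mask for Start = 0, Stride = 2 and VF = 4 is
///   <0, 2, 4, 6>.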
llvm::SmallVector<int, 16>
llvm::createStrideMask(unsigned Start, unsigned Stride, unsigned VF) {
  SmallVector<int, 16> Mask;
  for (unsigned i = 0; i < VF; i++)
    Mask.push_back(Start + i * Stride);

  return Mask;
}

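/// For example, the sequential mask for Start = 0, NumInts = 4 and
/// NumUndefs = 4 is <0, 1, 2, 3, undef, undef, undef, undef>.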
llvm::SmallVector<int, 16> llvm::createSequentialMask(unsigned Start,
                                                      unsigned NumInts,
                                                      unsigned NumUndefs) {
  SmallVector<int, 16> Mask;
  for (unsigned i = 0; i < NumInts; i++)
    Mask.push_back(Start + i);

  for (unsigned i = 0; i < NumUndefs; i++)
    Mask.push_back(-1);

  return Mask;
}

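/// For example, with NumElts = 4 the two-operand mask <0, 5, 2, 7> becomes
/// the unary mask <0, 1, 2, 3>.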
llvm::SmallVector<int, 16> llvm::createUnaryMask(ArrayRef<int> Mask,
                                                 unsigned NumElts) {
  // Avoid casts in the loop and make sure we have a reasonable number.
  int NumEltsSigned = NumElts;
  assert(NumEltsSigned > 0 && "Expected smaller or non-zero element count");

  // If the mask chooses an element from operand 1, reduce it to choose from the
  // corresponding element of operand 0. Undef mask elements are unchanged.
  SmallVector<int, 16> UnaryMask;
  for (int MaskElt : Mask) {
    assert((MaskElt < NumEltsSigned * 2) && "Expected valid shuffle mask");
    int UnaryElt = MaskElt >= NumEltsSigned ? MaskElt - NumEltsSigned : MaskElt;
    UnaryMask.push_back(UnaryElt);
  }
  return UnaryMask;
}

/// A helper function for concatenating vectors. This function concatenates two
/// vectors having the same element type. If the second vector has fewer
/// elements than the first, it is padded with undefs.
static Value *concatenateTwoVectors(IRBuilderBase &Builder, Value *V1,
                                    Value *V2) {
  VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
  VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
  assert(VecTy1 && VecTy2 &&
         VecTy1->getScalarType() == VecTy2->getScalarType() &&
         "Expect two vectors with the same element type");

  unsigned NumElts1 = cast<FixedVectorType>(VecTy1)->getNumElements();
  unsigned NumElts2 = cast<FixedVectorType>(VecTy2)->getNumElements();
  assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");

  if (NumElts1 > NumElts2) {
    // Extend with UNDEFs.
    V2 = Builder.CreateShuffleVector(
        V2, createSequentialMask(0, NumElts2, NumElts1 - NumElts2));
  }

  return Builder.CreateShuffleVector(
      V1, V2, createSequentialMask(0, NumElts1 + NumElts2, 0));
}

Value *llvm::concatenateVectors(IRBuilderBase &Builder,
                                ArrayRef<Value *> Vecs) {
  unsigned NumVecs = Vecs.size();
  assert(NumVecs > 1 && "Should be at least two vectors");

  SmallVector<Value *, 8> ResList;
  ResList.append(Vecs.begin(), Vecs.end());
  do {
    SmallVector<Value *, 8> TmpList;
    for (unsigned i = 0; i < NumVecs - 1; i += 2) {
      Value *V0 = ResList[i], *V1 = ResList[i + 1];
      assert((V0->getType() == V1->getType() || i == NumVecs - 2) &&
             "Only the last vector may have a different type");

      TmpList.push_back(concatenateTwoVectors(Builder, V0, V1));
    }

    // Push the last vector if the total number of vectors is odd.
    if (NumVecs % 2 != 0)
      TmpList.push_back(ResList[NumVecs - 1]);

    ResList = TmpList;
    NumVecs = ResList.size();
  } while (NumVecs > 1);

  return ResList[0];
}

bool llvm::maskIsAllZeroOrUndef(Value *Mask) {
  assert(isa<VectorType>(Mask->getType()) &&
         isa<IntegerType>(Mask->getType()->getScalarType()) &&
         cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
             1 &&
         "Mask must be a vector of i1");

  auto *ConstMask = dyn_cast<Constant>(Mask);
  if (!ConstMask)
    return false;
  if (ConstMask->isNullValue() || isa<UndefValue>(ConstMask))
    return true;
  if (isa<ScalableVectorType>(ConstMask->getType()))
    return false;
  for (unsigned
           I = 0,
           E = cast<FixedVectorType>(ConstMask->getType())->getNumElements();
       I != E; ++I) {
    if (auto *MaskElt = ConstMask->getAggregateElement(I))
      if (MaskElt->isNullValue() || isa<UndefValue>(MaskElt))
        continue;
    return false;
  }
  return true;
}

bool llvm::maskIsAllOneOrUndef(Value *Mask) {
  assert(isa<VectorType>(Mask->getType()) &&
         isa<IntegerType>(Mask->getType()->getScalarType()) &&
         cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
             1 &&
         "Mask must be a vector of i1");

  auto *ConstMask = dyn_cast<Constant>(Mask);
  if (!ConstMask)
    return false;
  if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
    return true;
  if (isa<ScalableVectorType>(ConstMask->getType()))
    return false;
  for (unsigned
           I = 0,
           E = cast<FixedVectorType>(ConstMask->getType())->getNumElements();
       I != E; ++I) {
    if (auto *MaskElt = ConstMask->getAggregateElement(I))
      if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
        continue;
    return false;
  }
  return true;
}

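/// Unlike maskIsAllOneOrUndef, this returns true as soon as one lane is
/// all-ones or undef; e.g. for the fixed-width mask <i1 1, i1 0> it returns
/// true while maskIsAllOneOrUndef returns false.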
bool llvm::maskContainsAllOneOrUndef(Value *Mask) {
  assert(isa<VectorType>(Mask->getType()) &&
         isa<IntegerType>(Mask->getType()->getScalarType()) &&
         cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
             1 &&
         "Mask must be a vector of i1");

  auto *ConstMask = dyn_cast<Constant>(Mask);
  if (!ConstMask)
    return false;
  if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
    return true;
  if (isa<ScalableVectorType>(ConstMask->getType()))
    return false;
  for (unsigned
           I = 0,
           E = cast<FixedVectorType>(ConstMask->getType())->getNumElements();
       I != E; ++I) {
    if (auto *MaskElt = ConstMask->getAggregateElement(I))
      if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
        return true;
  }
  return false;
}

/// TODO: This is a lot like known bits, but for
/// vectors. Is there something we can common this with?
APInt llvm::possiblyDemandedEltsInMask(Value *Mask) {
  assert(isa<FixedVectorType>(Mask->getType()) &&
         isa<IntegerType>(Mask->getType()->getScalarType()) &&
         cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
             1 &&
         "Mask must be a fixed width vector of i1");

  const unsigned VWidth =
      cast<FixedVectorType>(Mask->getType())->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);
  if (auto *CV = dyn_cast<ConstantVector>(Mask))
    for (unsigned i = 0; i < VWidth; i++)
      if (CV->getAggregateElement(i)->isNullValue())
        DemandedElts.clearBit(i);
  return DemandedElts;
}

bool InterleavedAccessInfo::isStrided(int Stride) {
  unsigned Factor = std::abs(Stride);
  return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
}

void InterleavedAccessInfo::collectConstStrideAccesses(
    MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
    const DenseMap<Value*, const SCEV*> &Strides) {
  auto &DL = TheLoop->getHeader()->getDataLayout();

  // Since it's desired that the load/store instructions be maintained in
  // "program order" for the interleaved access analysis, we have to visit the
  // blocks in the loop in reverse postorder (i.e., in a topological order).
  // Such an ordering will ensure that any load/store that may be executed
  // before a second load/store will precede the second load/store in
  // AccessStrideInfo.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
    for (auto &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;
      Type *ElementTy = getLoadStoreType(&I);

      // Currently, codegen doesn't support cases where the type size doesn't
      // match the alloc size. Skip them for now.
      uint64_t Size = DL.getTypeAllocSize(ElementTy);
      if (Size * 8 != DL.getTypeSizeInBits(ElementTy))
        continue;

      // We don't check wrapping here because we don't know yet if Ptr will be
      // part of a full group or a group with gaps. Checking wrapping for all
      // pointers (even those that end up in groups with no gaps) will be overly
      // conservative. For full groups, wrapping should be ok since if we would
      // wrap around the address space we would do a memory access at nullptr
      // even without the transformation. The wrapping checks are therefore
      // deferred until after we've formed the interleaved groups.
      int64_t Stride =
          getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides,
                       /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0);

      const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
      AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size,
                                              getLoadStoreAlignment(&I));
    }
}
| 1364 | |
| 1365 | // Analyze interleaved accesses and collect them into interleaved load and |
| 1366 | // store groups. |
| 1367 | // |
| 1368 | // When generating code for an interleaved load group, we effectively hoist all |
| 1369 | // loads in the group to the location of the first load in program order. When |
| 1370 | // generating code for an interleaved store group, we sink all stores to the |
| 1371 | // location of the last store. This code motion can change the order of load |
| 1372 | // and store instructions and may break dependences. |
| 1373 | // |
| 1374 | // The code generation strategy mentioned above ensures that we won't violate |
| 1375 | // any write-after-read (WAR) dependences. |
| 1376 | // |
| 1377 | // E.g., for the WAR dependence: a = A[i]; // (1) |
| 1378 | // A[i] = b; // (2) |
| 1379 | // |
| 1380 | // The store group of (2) is always inserted at or below (2), and the load |
| 1381 | // group of (1) is always inserted at or above (1). Thus, the instructions will |
| 1382 | // never be reordered. All other dependences are checked to ensure the |
| 1383 | // correctness of the instruction reordering. |
| 1384 | // |
| 1385 | // The algorithm visits all memory accesses in the loop in bottom-up program |
| 1386 | // order. Program order is established by traversing the blocks in the loop in |
| 1387 | // reverse postorder when collecting the accesses. |
| 1388 | // |
| 1389 | // We visit the memory accesses in bottom-up order because it can simplify the |
| 1390 | // construction of store groups in the presence of write-after-write (WAW) |
| 1391 | // dependences. |
| 1392 | // |
| 1393 | // E.g., for the WAW dependence: A[i] = a; // (1) |
| 1394 | // A[i] = b; // (2) |
| 1395 | // A[i + 1] = c; // (3) |
| 1396 | // |
| 1397 | // We will first create a store group with (3) and (2). (1) can't be added to |
| 1398 | // this group because it and (2) are dependent. However, (1) can be grouped |
| 1399 | // with other accesses that may precede it in program order. Note that a |
| 1400 | // bottom-up order does not imply that WAW dependences should not be checked. |
| 1401 | void InterleavedAccessInfo::analyzeInterleaving( |
| 1402 | bool EnablePredicatedInterleavedMemAccesses) { |
| 1403 | LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n" ); |
| 1404 | const auto &Strides = LAI->getSymbolicStrides(); |
| 1405 | |
| 1406 | // Holds all accesses with a constant stride. |
| 1407 | MapVector<Instruction *, StrideDescriptor> AccessStrideInfo; |
| 1408 | collectConstStrideAccesses(AccessStrideInfo, Strides); |
| 1409 | |
| 1410 | if (AccessStrideInfo.empty()) |
| 1411 | return; |
| 1412 | |
| 1413 | // Collect the dependences in the loop. |
| 1414 | collectDependences(); |
| 1415 | |
| 1416 | // Holds all interleaved store groups temporarily. |
| 1417 | SmallSetVector<InterleaveGroup<Instruction> *, 4> StoreGroups; |
| 1418 | // Holds all interleaved load groups temporarily. |
| 1419 | SmallSetVector<InterleaveGroup<Instruction> *, 4> LoadGroups; |
| 1420 | // Groups added to this set cannot have new members added. |
| 1421 | SmallPtrSet<InterleaveGroup<Instruction> *, 4> CompletedLoadGroups; |
| 1422 | |
| 1423 | // Search in bottom-up program order for pairs of accesses (A and B) that can |
| 1424 | // form interleaved load or store groups. In the algorithm below, access A |
| 1425 | // precedes access B in program order. We initialize a group for B in the |
| 1426 | // outer loop of the algorithm, and then in the inner loop, we attempt to |
| 1427 | // insert each A into B's group if: |
| 1428 | // |
| 1429 | // 1. A and B have the same stride, |
| 1430 | // 2. A and B have the same memory object size, and |
| 1431 | // 3. A belongs in B's group according to its distance from B. |
| 1432 | // |
| 1433 | // Special care is taken to ensure group formation will not break any |
| 1434 | // dependences. |
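  //
  // For instance, in a stride-2 loop over i32 elements, loads of A[i] and
  // A[i+1] satisfy rules 1-3 (same stride of 2, same 4-byte size, and a
  // distance of 4 bytes, i.e. exactly one element apart), so they can be
  // placed in the same group.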
  for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
       BI != E; ++BI) {
    Instruction *B = BI->first;
    StrideDescriptor DesB = BI->second;

    // Initialize a group for B if it has an allowable stride. Even if we don't
    // create a group for B, we continue with the bottom-up algorithm to ensure
    // we don't break any of B's dependences.
    InterleaveGroup<Instruction> *GroupB = nullptr;
    if (isStrided(DesB.Stride) &&
        (!isPredicated(B->getParent()) ||
         EnablePredicatedInterleavedMemAccesses)) {
      GroupB = getInterleaveGroup(B);
      if (!GroupB) {
        LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
                          << '\n');
        GroupB = createInterleaveGroup(B, DesB.Stride, DesB.Alignment);
        if (B->mayWriteToMemory())
          StoreGroups.insert(GroupB);
        else
          LoadGroups.insert(GroupB);
      }
    }

    for (auto AI = std::next(BI); AI != E; ++AI) {
      Instruction *A = AI->first;
      StrideDescriptor DesA = AI->second;

      // Our code motion strategy implies that we can't have dependences
      // between accesses in an interleaved group and other accesses located
      // between the first and last member of the group. Note that this also
      // means that a group can't have more than one member at a given offset.
      // The accesses in a group can have dependences with other accesses, but
      // we must ensure we don't extend the boundaries of the group such that
      // we encompass those dependent accesses.
      //
      // For example, assume we have the sequence of accesses shown below in a
      // stride-2 loop:
      //
      //  (1, 2) is a group | A[i]   = a; // (1)
      //                    | A[i-1] = b; // (2) |
      //                      A[i-3] = c; // (3)
      //                      A[i]   = d; // (4) | (2, 4) is not a group
      //
      // Because accesses (2) and (3) are dependent, we can group (2) with (1)
      // but not with (4). If we did, the dependent access (3) would be within
      // the boundaries of the (2, 4) group.
      auto DependentMember = [&](InterleaveGroup<Instruction> *Group,
                                 StrideEntry *A) -> Instruction * {
        for (uint32_t Index = 0; Index < Group->getFactor(); ++Index) {
          Instruction *MemberOfGroupB = Group->getMember(Index);
          if (MemberOfGroupB &&
              !canReorderMemAccessesForInterleavedGroups(
                  A, &*AccessStrideInfo.find(MemberOfGroupB)))
            return MemberOfGroupB;
        }
        return nullptr;
      };

      auto GroupA = getInterleaveGroup(A);
      // If A is a load, dependences are tolerable, so there's nothing to do
      // here.
      // If both A and B belong to the same (store) group, they are
      // independent, even if dependences have not been recorded.
      // If both GroupA and GroupB are null, there's nothing to do here.
      if (A->mayWriteToMemory() && GroupA != GroupB) {
        Instruction *DependentInst = nullptr;
        // If GroupB is a load group, we have to compare AI against all
        // members of GroupB because if any load within GroupB has a dependency
        // on AI, we need to mark GroupB as complete and also release the
        // store GroupA (if A belongs to one). The former prevents incorrect
        // hoisting of load B above store A while the latter prevents incorrect
        // sinking of store A below load B.
        if (GroupB && LoadGroups.contains(GroupB))
          DependentInst = DependentMember(GroupB, &*AI);
        else if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI))
          DependentInst = B;

        if (DependentInst) {
          // A has a store dependence on B (or on some load within GroupB) and
          // is part of a store group. Release A's group to prevent illegal
          // sinking of A below B. A will then be free to form another group
          // with instructions that precede it.
          if (GroupA && StoreGroups.contains(GroupA)) {
            LLVM_DEBUG(dbgs() << "LV: Invalidated store group due to "
                                 "dependence between "
                              << *A << " and " << *DependentInst << '\n');
            StoreGroups.remove(GroupA);
            releaseGroup(GroupA);
          }
          // If B is a load and part of an interleave group, no earlier loads
          // can be added to B's interleave group, because this would mean the
          // DependentInst would move across store A. Mark the interleave group
          // as complete.
          if (GroupB && LoadGroups.contains(GroupB)) {
            LLVM_DEBUG(dbgs() << "LV: Marking interleave group for " << *B
                              << " as complete.\n");
            CompletedLoadGroups.insert(GroupB);
          }
        }
      }
      if (CompletedLoadGroups.contains(GroupB)) {
        // Skip trying to add A to B; continue looking for other conflicting
        // A's in groups to be released.
        continue;
      }

      // At this point, we've checked for illegal code motion. If either A or B
      // isn't strided, there's nothing left to do.
      if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
        continue;

      // Ignore A if it's already in a group or isn't the same kind of memory
      // operation as B.
      // Note that mayReadFromMemory() isn't mutually exclusive with
      // mayWriteToMemory() in the case of atomic loads. We shouldn't see those
      // here; canVectorizeMemory() should have returned false, except in the
      // case where we only asked for optimization remarks.
      if (isInterleaved(A) ||
          (A->mayReadFromMemory() != B->mayReadFromMemory()) ||
          (A->mayWriteToMemory() != B->mayWriteToMemory()))
        continue;

      // Check rules 1 and 2. Ignore A if its stride or size is different from
      // that of B.
      if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
        continue;

      // Ignore A if the memory objects of A and B don't belong to the same
      // address space.
      if (getLoadStoreAddressSpace(A) != getLoadStoreAddressSpace(B))
        continue;

      // Calculate the distance from A to B.
      const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
          PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
      if (!DistToB)
        continue;
      int64_t DistanceToB = DistToB->getAPInt().getSExtValue();

      // Check rule 3. Ignore A if its distance to B is not a multiple of the
      // size.
      if (DistanceToB % static_cast<int64_t>(DesB.Size))
        continue;

      // All members of a predicated interleave-group must have the same
      // predicate, and currently must reside in the same BB.
      BasicBlock *BlockA = A->getParent();
      BasicBlock *BlockB = B->getParent();
      if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
          (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))
        continue;

      // The index of A is the index of B plus A's distance to B in multiples
      // of the size.
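      // E.g., for 4-byte accesses with A at A[i+2] and B at A[i], DistanceToB
      // is 8, so A is placed at B's index plus 2.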
      int IndexA =
          GroupB->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);

      // Try to insert A into B's group.
      if (GroupB->insertMember(A, IndexA, DesA.Alignment)) {
        LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
                          << " into the interleave group with" << *B << '\n');
        InterleaveGroupMap[A] = GroupB;

        // Set the first load in program order as the insert position.
        if (A->mayReadFromMemory())
          GroupB->setInsertPos(A);
      }
    } // Iteration over A accesses.
  } // Iteration over B accesses.

  auto InvalidateGroupIfMemberMayWrap = [&](InterleaveGroup<Instruction> *Group,
                                            int Index,
                                            const char *FirstOrLast) -> bool {
    Instruction *Member = Group->getMember(Index);
    assert(Member && "Group member does not exist");
    Value *MemberPtr = getLoadStorePointerOperand(Member);
    Type *AccessTy = getLoadStoreType(Member);
    if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides,
                     /*Assume=*/false, /*ShouldCheckWrap=*/true)
            .value_or(0))
      return false;
    LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
                      << FirstOrLast
                      << " group member potentially pointer-wrapping.\n");
    releaseGroup(Group);
    return true;
  };

  // Remove interleaved groups with gaps whose memory accesses may wrap around.
  // We have to revisit the getPtrStride analysis, this time with
  // ShouldCheckWrap=true, since collectConstStrideAccesses does not check
  // wrapping (see documentation there).
  // FORNOW we use Assume=false;
  // TODO: Change to Assume=true but making sure we don't exceed the threshold
  // of runtime SCEV assumptions checks (thereby potentially failing to
  // vectorize altogether).
  // Additional optional optimizations:
  // TODO: If we are peeling the loop and we know that the first pointer
  // doesn't wrap then we can deduce that all pointers in the group don't wrap.
  // This means that we can forcefully peel the loop in order to only have to
  // check the first pointer for no-wrap. When we change to Assume=true, we'll
  // only need at most one runtime check per interleaved group.
  for (auto *Group : LoadGroups) {
    // Case 1: A full group. We can skip the checks; for full groups, if the
    // wide load would wrap around the address space we would do a memory
    // access at nullptr even without the transformation.
    if (Group->getNumMembers() == Group->getFactor())
      continue;

    // Case 2: If the first and last members of the group don't wrap, this
    // implies that all the pointers in the group don't wrap. So we check only
    // group member 0 (which is always guaranteed to exist) and group member
    // Factor - 1; if the latter doesn't exist we rely on
    // peeling (if it is a non-reversed access -- see Case 3).
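    // (Within each iteration, every other member's address lies between those
    // of the first and last members, so it cannot wrap if they do not.)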
    if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))
      continue;
    if (Group->getMember(Group->getFactor() - 1))
      InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1, "last");
    else {
      // Case 3: A non-reversed interleaved load group with gaps: we need to
      // execute at least one scalar epilogue iteration. This will ensure
      // we don't speculatively access memory out-of-bounds. We only need
      // to look for a member at index factor - 1, since every group must have
      // a member at index zero.
      if (Group->isReverse()) {
        LLVM_DEBUG(
            dbgs() << "LV: Invalidate candidate interleaved group due to "
                      "a reverse access with gaps.\n");
        releaseGroup(Group);
        continue;
      }
      LLVM_DEBUG(
          dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
      RequiresScalarEpilogue = true;
    }
  }

  for (auto *Group : StoreGroups) {
    // Case 1: A full group. We can skip the checks; for full groups, if the
    // wide store would wrap around the address space we would do a memory
    // access at nullptr even without the transformation.
    if (Group->getNumMembers() == Group->getFactor())
      continue;

    // An interleaved store group with gaps is implemented using a masked wide
    // store. Remove interleaved store groups with gaps if
    // masked-interleaved-accesses are not enabled by the target.
    if (!EnablePredicatedInterleavedMemAccesses) {
      LLVM_DEBUG(
          dbgs() << "LV: Invalidate candidate interleaved store group due "
                    "to gaps.\n");
      releaseGroup(Group);
      continue;
    }

    // Case 2: If the first and last members of the group don't wrap, this
    // implies that all the pointers in the group don't wrap. So we check only
    // group member 0 (which is always guaranteed to exist) and the last group
    // member. Case 3 (scalar epilogue) is not relevant for stores with gaps,
    // which are implemented with a masked store (rather than speculative
    // access, as in loads).
    if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))
      continue;
    for (int Index = Group->getFactor() - 1; Index > 0; Index--)
      if (Group->getMember(Index)) {
        InvalidateGroupIfMemberMayWrap(Group, Index, "last");
        break;
      }
  }
}

void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
  // If no group had triggered the requirement to create an epilogue loop,
  // there is nothing to do.
  if (!requiresScalarEpilogue())
    return;

  // Release groups requiring scalar epilogues. Note that this also removes
  // them from InterleaveGroups.
  bool ReleasedGroup = InterleaveGroups.remove_if([&](auto *Group) {
    if (!Group->requiresScalarEpilogue())
      return false;
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate candidate interleaved group due to gaps that "
           "require a scalar epilogue (not allowed under optsize) and cannot "
           "be masked (not enabled).\n");
    releaseGroupWithoutRemovingFromSet(Group);
    return true;
  });
  assert(ReleasedGroup && "At least one group must be invalidated, as a "
                          "scalar epilogue was required");
  (void)ReleasedGroup;
  RequiresScalarEpilogue = false;
}

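// The generic template below is intentionally unreachable: metadata
// propagation is only meaningful for IR Instructions and is provided by the
// explicit specialization that follows.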
template <typename InstT>
void InterleaveGroup<InstT>::addMetadata(InstT *NewInst) const {
  llvm_unreachable("addMetadata can only be used for Instruction");
}

namespace llvm {
template <>
void InterleaveGroup<Instruction>::addMetadata(Instruction *NewInst) const {
  SmallVector<Value *, 4> VL(make_second_range(Members));
  propagateMetadata(NewInst, VL);
}
} // namespace llvm