VectorUtils.cpp source code [llvm_projects/llvm/lib/Analysis/VectorUtils.cpp]

1	//===----------- VectorUtils.cpp - Vectorizer utility functions -----------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This file defines vectorizer utilities.
10	//
11	//===----------------------------------------------------------------------===//
12
13	#include "llvm/Analysis/VectorUtils.h"
14	#include "llvm/ADT/EquivalenceClasses.h"
15	#include "llvm/ADT/SmallVector.h"
16	#include "llvm/Analysis/DemandedBits.h"
17	#include "llvm/Analysis/LoopInfo.h"
18	#include "llvm/Analysis/LoopIterator.h"
19	#include "llvm/Analysis/ScalarEvolution.h"
20	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
21	#include "llvm/Analysis/TargetTransformInfo.h"
22	#include "llvm/Analysis/ValueTracking.h"
23	#include "llvm/IR/Constants.h"
24	#include "llvm/IR/DerivedTypes.h"
25	#include "llvm/IR/IRBuilder.h"
26	#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
27	#include "llvm/IR/PatternMatch.h"
28	#include "llvm/IR/Value.h"
29	#include "llvm/Support/CommandLine.h"
30
31	#define DEBUG_TYPE "vectorutils"
32
33	using namespace llvm;
34	using namespace llvm::PatternMatch;
35
36	/// Maximum factor for an interleaved memory access.
37	static cl::opt<unsigned> MaxInterleaveGroupFactor(
38	"max-interleave-group-factor", cl::Hidden,
39	cl::desc("Maximum factor for an interleaved access group (default = 8)"),
40	cl::init(Val: `8`));
41
42	/// Return true if all of the intrinsic's arguments and return type are scalars
43	/// for the scalar form of the intrinsic, and vectors for the vector form of the
44	/// intrinsic (except operands that are marked as always being scalar by
45	/// isVectorIntrinsicWithScalarOpAtArg).
46	bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
47	switch (ID) {
48	case Intrinsic::abs: // Begin integer bit-manipulation.
49	case Intrinsic::bswap:
50	case Intrinsic::bitreverse:
51	case Intrinsic::ctpop:
52	case Intrinsic::ctlz:
53	case Intrinsic::cttz:
54	case Intrinsic::fshl:
55	case Intrinsic::fshr:
56	case Intrinsic::smax:
57	case Intrinsic::smin:
58	case Intrinsic::umax:
59	case Intrinsic::umin:
60	case Intrinsic::sadd_sat:
61	case Intrinsic::ssub_sat:
62	case Intrinsic::uadd_sat:
63	case Intrinsic::usub_sat:
64	case Intrinsic::smul_fix:
65	case Intrinsic::smul_fix_sat:
66	case Intrinsic::umul_fix:
67	case Intrinsic::umul_fix_sat:
68	case Intrinsic::uadd_with_overflow:
69	case Intrinsic::sadd_with_overflow:
70	case Intrinsic::usub_with_overflow:
71	case Intrinsic::ssub_with_overflow:
72	case Intrinsic::umul_with_overflow:
73	case Intrinsic::smul_with_overflow:
74	case Intrinsic::sqrt: // Begin floating-point.
75	case Intrinsic::asin:
76	case Intrinsic::acos:
77	case Intrinsic::atan:
78	case Intrinsic::atan2:
79	case Intrinsic::sin:
80	case Intrinsic::cos:
81	case Intrinsic::sincos:
82	case Intrinsic::sincospi:
83	case Intrinsic::tan:
84	case Intrinsic::sinh:
85	case Intrinsic::cosh:
86	case Intrinsic::tanh:
87	case Intrinsic::exp:
88	case Intrinsic::exp10:
89	case Intrinsic::exp2:
90	case Intrinsic::frexp:
91	case Intrinsic::ldexp:
92	case Intrinsic::log:
93	case Intrinsic::log10:
94	case Intrinsic::log2:
95	case Intrinsic::fabs:
96	case Intrinsic::minnum:
97	case Intrinsic::maxnum:
98	case Intrinsic::minimum:
99	case Intrinsic::maximum:
100	case Intrinsic::minimumnum:
101	case Intrinsic::maximumnum:
102	case Intrinsic::modf:
103	case Intrinsic::copysign:
104	case Intrinsic::floor:
105	case Intrinsic::ceil:
106	case Intrinsic::trunc:
107	case Intrinsic::rint:
108	case Intrinsic::nearbyint:
109	case Intrinsic::round:
110	case Intrinsic::roundeven:
111	case Intrinsic::pow:
112	case Intrinsic::fma:
113	case Intrinsic::fmuladd:
114	case Intrinsic::is_fpclass:
115	case Intrinsic::powi:
116	case Intrinsic::canonicalize:
117	case Intrinsic::fptosi_sat:
118	case Intrinsic::fptoui_sat:
119	case Intrinsic::lround:
120	case Intrinsic::llround:
121	case Intrinsic::lrint:
122	case Intrinsic::llrint:
123	case Intrinsic::ucmp:
124	case Intrinsic::scmp:
125	case Intrinsic::clmul:
126	return true;
127	default:
128	return false;
129	}
130	}
131
132	bool llvm::isTriviallyScalarizable(Intrinsic::ID ID) {
133	if (isTriviallyVectorizable(ID))
134	return true;
135
136	return Intrinsic::isTriviallyScalarizable(id: ID);
137	}
138
139	/// Identifies if the vector form of the intrinsic has a scalar operand.
140	bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
141	unsigned ScalarOpdIdx,
142	const TargetTransformInfo *TTI) {
143
144	if (TTI && Intrinsic::isTargetIntrinsic(IID: ID))
145	return TTI->isTargetIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx);
146
147	// Vector predication intrinsics have the EVL as the last operand.
148	if (VPIntrinsic::getVectorLengthParamPos(IntrinsicID: ID) == ScalarOpdIdx)
149	return true;
150
151	switch (ID) {
152	case Intrinsic::abs:
153	case Intrinsic::vp_abs:
154	case Intrinsic::ctlz:
155	case Intrinsic::vp_ctlz:
156	case Intrinsic::cttz:
157	case Intrinsic::vp_cttz:
158	case Intrinsic::is_fpclass:
159	case Intrinsic::vp_is_fpclass:
160	case Intrinsic::powi:
161	case Intrinsic::vector_extract:
162	return (ScalarOpdIdx == `1`);
163	case Intrinsic::smul_fix:
164	case Intrinsic::smul_fix_sat:
165	case Intrinsic::umul_fix:
166	case Intrinsic::umul_fix_sat:
167	case Intrinsic::vector_splice_left:
168	case Intrinsic::vector_splice_right:
169	return (ScalarOpdIdx == `2`);
170	case Intrinsic::experimental_vp_splice:
171	return ScalarOpdIdx == `2` \|\| ScalarOpdIdx == `4`;
172	case Intrinsic::experimental_vp_strided_load:
173	return ScalarOpdIdx == `0` \|\| ScalarOpdIdx == `1`;
174	case Intrinsic::loop_dependence_war_mask:
175	return true;
176	default:
177	return false;
178	}
179	}
180
181	bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
182	Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI) {
183	assert(ID != Intrinsic::not_intrinsic && "Not an intrinsic!");
184
185	if (TTI && Intrinsic::isTargetIntrinsic(IID: ID))
186	return TTI->isTargetIntrinsicWithOverloadTypeAtArg(ID, OpdIdx);
187
188	if (VPCastIntrinsic::isVPCast(ID))
189	return OpdIdx == -`1` \|\| OpdIdx == `0`;
190
191	switch (ID) {
192	case Intrinsic::fptosi_sat:
193	case Intrinsic::fptoui_sat:
194	case Intrinsic::lround:
195	case Intrinsic::llround:
196	case Intrinsic::lrint:
197	case Intrinsic::llrint:
198	case Intrinsic::vp_lrint:
199	case Intrinsic::vp_llrint:
200	case Intrinsic::ucmp:
201	case Intrinsic::scmp:
202	case Intrinsic::vector_extract:
203	case Intrinsic::loop_dependence_war_mask:
204	return OpdIdx == -`1` \|\| OpdIdx == `0`;
205	case Intrinsic::modf:
206	case Intrinsic::sincos:
207	case Intrinsic::sincospi:
208	case Intrinsic::is_fpclass:
209	case Intrinsic::vp_is_fpclass:
210	return OpdIdx == `0`;
211	case Intrinsic::powi:
212	case Intrinsic::ldexp:
213	return OpdIdx == -`1` \|\| OpdIdx == `1`;
214	case Intrinsic::experimental_vp_strided_load:
215	return OpdIdx == -`1` \|\| OpdIdx == `0` \|\| OpdIdx == `1`;
216	default:
217	return OpdIdx == -`1`;
218	}
219	}
220
221	bool llvm::isVectorIntrinsicWithStructReturnOverloadAtField(
222	Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI) {
223
224	if (TTI && Intrinsic::isTargetIntrinsic(IID: ID))
225	return TTI->isTargetIntrinsicWithStructReturnOverloadAtField(ID, RetIdx);
226
227	switch (ID) {
228	case Intrinsic::frexp:
229	return RetIdx == `0` \|\| RetIdx == `1`;
230	default:
231	return RetIdx == `0`;
232	}
233	}
234
235	/// Returns intrinsic ID for call.
236	/// For the input call instruction it finds mapping intrinsic and returns
237	/// its ID, in case it does not found it return not_intrinsic.
238	Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
239	const TargetLibraryInfo *TLI) {
240	Intrinsic::ID ID = getIntrinsicForCallSite(CB: *CI, TLI);
241	if (ID == Intrinsic::not_intrinsic)
242	return Intrinsic::not_intrinsic;
243
244	if (isTriviallyVectorizable(ID) \|\| ID == Intrinsic::lifetime_start \|\|
245	ID == Intrinsic::lifetime_end \|\| ID == Intrinsic::assume \|\|
246	ID == Intrinsic::experimental_noalias_scope_decl \|\|
247	ID == Intrinsic::sideeffect \|\| ID == Intrinsic::pseudoprobe)
248	return ID;
249	return Intrinsic::not_intrinsic;
250	}
251
252	unsigned llvm::getInterleaveIntrinsicFactor(Intrinsic::ID ID) {
253	switch (ID) {
254	case Intrinsic::vector_interleave2:
255	return `2`;
256	case Intrinsic::vector_interleave3:
257	return `3`;
258	case Intrinsic::vector_interleave4:
259	return `4`;
260	case Intrinsic::vector_interleave5:
261	return `5`;
262	case Intrinsic::vector_interleave6:
263	return `6`;
264	case Intrinsic::vector_interleave7:
265	return `7`;
266	case Intrinsic::vector_interleave8:
267	return `8`;
268	default:
269	return `0`;
270	}
271	}
272
273	unsigned llvm::getDeinterleaveIntrinsicFactor(Intrinsic::ID ID) {
274	switch (ID) {
275	case Intrinsic::vector_deinterleave2:
276	return `2`;
277	case Intrinsic::vector_deinterleave3:
278	return `3`;
279	case Intrinsic::vector_deinterleave4:
280	return `4`;
281	case Intrinsic::vector_deinterleave5:
282	return `5`;
283	case Intrinsic::vector_deinterleave6:
284	return `6`;
285	case Intrinsic::vector_deinterleave7:
286	return `7`;
287	case Intrinsic::vector_deinterleave8:
288	return `8`;
289	default:
290	return `0`;
291	}
292	}
293
294	VectorType llvm::getDeinterleavedVectorType(IntrinsicInst DI) {
295	[[maybe_unused]] unsigned Factor =
296	getDeinterleaveIntrinsicFactor(ID: DI->getIntrinsicID());
297	ArrayRef<Type *> DISubtypes = DI->getType()->subtypes();
298	assert(Factor && Factor == DISubtypes.size() &&
299	"unexpected deinterleave factor or result type");
300	return cast<VectorType>(Val: DISubtypes [`0`]);
301	}
302
303	/// Given a vector and an element number, see if the scalar value is
304	/// already around as a register, for example if it were inserted then extracted
305	/// from the vector.
306	Value llvm::findScalarElement(Value V, unsigned EltNo) {
307	assert(V->getType()->isVectorTy() && "Not looking at a vector?");
308	VectorType *VTy = cast<VectorType>(Val: V->getType());
309	// For fixed-length vector, return poison for out of range access.
310	if (auto *FVTy = dyn_cast<FixedVectorType>(Val: VTy)) {
311	unsigned Width = FVTy->getNumElements();
312	if (EltNo >= Width)
313	return PoisonValue::get(T: FVTy->getElementType());
314	}
315
316	if (Constant *C = dyn_cast<Constant>(Val: V))
317	return C->getAggregateElement(Elt: EltNo);
318
319	if (InsertElementInst *III = dyn_cast<InsertElementInst>(Val: V)) {
320	// If this is an insert to a variable element, we don't know what it is.
321	uint64_t IIElt;
322	if (!match(V: III->getOperand(i_nocapture: `2`), P: m_ConstantInt(V&: IIElt)))
323	return nullptr;
324
325	// If this is an insert to the element we are looking for, return the
326	// inserted value.
327	if (EltNo == IIElt)
328	return III->getOperand(i_nocapture: `1`);
329
330	// Guard against infinite loop on malformed, unreachable IR.
331	if (III == III->getOperand(i_nocapture: `0`))
332	return nullptr;
333
334	// Otherwise, the insertelement doesn't modify the value, recurse on its
335	// vector input.
336	return findScalarElement(V: III->getOperand(i_nocapture: `0`), EltNo);
337	}
338
339	ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Val: V);
340	// Restrict the following transformation to fixed-length vector.
341	if (SVI && isa<FixedVectorType>(Val: SVI->getType())) {
342	unsigned LHSWidth =
343	cast<FixedVectorType>(Val: SVI->getOperand(i_nocapture: `0`)->getType())->getNumElements();
344	int InEl = SVI->getMaskValue(Elt: EltNo);
345	if (InEl < `0`)
346	return PoisonValue::get(T: VTy->getElementType());
347	if (InEl < (int)LHSWidth)
348	return findScalarElement(V: SVI->getOperand(i_nocapture: `0`), EltNo: InEl);
349	return findScalarElement(V: SVI->getOperand(i_nocapture: `1`), EltNo: InEl - LHSWidth);
350	}
351
352	// Extract a value from a vector add operation with a constant zero.
353	// TODO: Use getBinOpIdentity() to generalize this.
354	Value Val; Constant C;
355	if (match(V, P: m_Add(L: m_Value(V&: Val), R: m_Constant(C))))
356	if (Constant *Elt = C->getAggregateElement(Elt: EltNo))
357	if (Elt->isNullValue())
358	return findScalarElement(V: Val, EltNo);
359
360	// If the vector is a splat then we can trivially find the scalar element.
361	if (isa<ScalableVectorType>(Val: VTy))
362	if (Value *Splat = getSplatValue(V))
363	if (EltNo < VTy->getElementCount().getKnownMinValue())
364	return Splat;
365
366	// Otherwise, we don't know.
367	return nullptr;
368	}
369
370	int llvm::getSplatIndex(ArrayRef<int> Mask) {
371	int SplatIndex = -`1`;
372	for (int M : Mask) {
373	// Ignore invalid (undefined) mask elements.
374	if (M < `0`)
375	continue;
376
377	// There can be only 1 non-negative mask element value if this is a splat.
378	if (SplatIndex != -`1` && SplatIndex != M)
379	return -`1`;
380
381	// Initialize the splat index to the 1st non-negative mask element.
382	SplatIndex = M;
383	}
384	assert((SplatIndex == -`1` \|\| SplatIndex >= `0`) && "Negative index?");
385	return SplatIndex;
386	}
387
388	/// Get splat value if the input is a splat vector or return nullptr.
389	/// This function is not fully general. It checks only 2 cases:
390	/// the input value is (1) a splat constant vector or (2) a sequence
391	/// of instructions that broadcasts a scalar at element 0.
392	Value llvm::getSplatValue(const* Value *V) {
393	if (isa<VectorType>(Val: V->getType()))
394	if (auto *C = dyn_cast<Constant>(Val: V))
395	return C->getSplatValue();
396
397	// shuf (inselt ?, Splat, 0), ?, <0, undef, 0, ...>
398	Value *Splat;
399	if (match(V,
400	P: m_Shuffle(v1: m_InsertElt(Val: m_Value(), Elt: m_Value(V&: Splat), Idx: m_ZeroInt()),
401	v2: m_Value(), mask: m_ZeroMask ())))
402	return Splat;
403
404	return nullptr;
405	}
406
407	bool llvm::isSplatValue(const Value V, int* Index, unsigned Depth) {
408	assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");
409
410	if (isa<VectorType>(Val: V->getType())) {
411	if (isa<UndefValue>(Val: V))
412	return true;
413	// FIXME: We can allow undefs, but if Index was specified, we may want to
414	// check that the constant is defined at that index.
415	if (auto *C = dyn_cast<Constant>(Val: V))
416	return C->getSplatValue() != nullptr;
417	}
418
419	if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Val: V)) {
420	// FIXME: We can safely allow undefs here. If Index was specified, we will
421	// check that the mask elt is defined at the required index.
422	if (!all_equal(Range: Shuf->getShuffleMask()))
423	return false;
424
425	// Match any index.
426	if (Index == -`1`)
427	return true;
428
429	// Match a specific element. The mask should be defined at and match the
430	// specified index.
431	return Shuf->getMaskValue(Elt: Index) == Index;
432	}
433
434	// The remaining tests are all recursive, so bail out if we hit the limit.
435	if (Depth++ == MaxAnalysisRecursionDepth)
436	return false;
437
438	// If both operands of a binop are splats, the result is a splat.
439	Value X, Y, *Z;
440	if (match(V, P: m_BinOp(L: m_Value(V&: X), R: m_Value(V&: Y))))
441	return isSplatValue(V: X, Index, Depth) && isSplatValue(V: Y, Index, Depth);
442
443	// If all operands of a select are splats, the result is a splat.
444	if (match(V, P: m_Select(C: m_Value(V&: X), L: m_Value(V&: Y), R: m_Value(V&: Z))))
445	return isSplatValue(V: X, Index, Depth) && isSplatValue(V: Y, Index, Depth) &&
446	isSplatValue(V: Z, Index, Depth);
447
448	// TODO: Add support for unary ops (fneg), casts, intrinsics (overflow ops).
449
450	return false;
451	}
452
453	bool llvm::getShuffleDemandedElts(int SrcWidth, ArrayRef<int> Mask,
454	const APInt &DemandedElts, APInt &DemandedLHS,
455	APInt &DemandedRHS, bool AllowUndefElts) {
456	DemandedLHS = DemandedRHS = APInt::getZero(numBits: SrcWidth);
457
458	// Early out if we don't demand any elements.
459	if (DemandedElts.isZero())
460	return true;
461
462	// Simple case of a shuffle with zeroinitializer.
463	if (all_of(Range&: Mask, P: equal_to(Arg: `0`))) {
464	DemandedLHS.setBit(`0`);
465	return true;
466	}
467
468	for (unsigned I = `0`, E = Mask.size(); I != E; ++I) {
469	int M = Mask [I];
470	assert((-`1` <= M) && (M < (SrcWidth * `2`)) &&
471	"Invalid shuffle mask constant");
472
473	if (!DemandedElts [I] \|\| (AllowUndefElts && (M < `0`)))
474	continue;
475
476	// For undef elements, we don't know anything about the common state of
477	// the shuffle result.
478	if (M < `0`)
479	return false;
480
481	if (M < SrcWidth)
482	DemandedLHS.setBit(M);
483	else
484	DemandedRHS.setBit(M - SrcWidth);
485	}
486
487	return true;
488	}
489
490	bool llvm::isMaskedSlidePair(ArrayRef<int> Mask, int NumElts,
491	std::array<std::pair<int, int>, `2`> &SrcInfo) {
492	const int SignalValue = NumElts * `2`;
493	SrcInfo [`0`] = {-`1`, SignalValue};
494	SrcInfo [`1`] = {-`1`, SignalValue};
495	for (auto [i, M] : enumerate(First&: Mask)) {
496	if (M < `0`)
497	continue;
498	int Src = M >= NumElts;
499	int Diff = (int)i - (M % NumElts);
500	bool Match = false;
501	for (int j = `0`; j < `2`; j++) {
502	auto &[SrcE, DiffE] = SrcInfo [j];
503	if (SrcE == -`1`) {
504	assert(DiffE == SignalValue);
505	SrcE = Src;
506	DiffE = Diff;
507	}
508	if (SrcE == Src && DiffE == Diff) {
509	Match = true;
510	break;
511	}
512	}
513	if (!Match)
514	return false;
515	}
516	// Avoid all undef masks
517	return SrcInfo [`0`].first != -`1`;
518	}
519
520	void llvm::narrowShuffleMaskElts(int Scale, ArrayRef<int> Mask,
521	SmallVectorImpl<int> &ScaledMask) {
522	assert(Scale > `0` && "Unexpected scaling factor");
523
524	// Fast-path: if no scaling, then it is just a copy.
525	if (Scale == `1`) {
526	ScaledMask.assign(in_start: Mask.begin(), in_end: Mask.end());
527	return;
528	}
529
530	ScaledMask.clear();
531	for (int MaskElt : Mask) {
532	if (MaskElt >= `0`) {
533	assert(((uint64_t)Scale * MaskElt + (Scale - `1`)) <= INT32_MAX &&
534	"Overflowed 32-bits");
535	}
536	for (int SliceElt = `0`; SliceElt != Scale; ++SliceElt)
537	ScaledMask.push_back(Elt: MaskElt < `0` ? MaskElt : Scale * MaskElt + SliceElt);
538	}
539	}
540
541	bool llvm::widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
542	SmallVectorImpl<int> &ScaledMask) {
543	assert(Scale > `0` && "Unexpected scaling factor");
544
545	// Fast-path: if no scaling, then it is just a copy.
546	if (Scale == `1`) {
547	ScaledMask.assign(in_start: Mask.begin(), in_end: Mask.end());
548	return true;
549	}
550
551	// We must map the original elements down evenly to a type with less elements.
552	int NumElts = Mask.size();
553	if (NumElts % Scale != `0`)
554	return false;
555
556	ScaledMask.clear();
557	ScaledMask.reserve(N: NumElts / Scale);
558
559	// Step through the input mask by splitting into Scale-sized slices.
560	do {
561	ArrayRef<int> MaskSlice = Mask.take_front(N: Scale);
562	assert((int)MaskSlice.size() == Scale && "Expected Scale-sized slice.");
563
564	// The first element of the slice determines how we evaluate this slice.
565	int SliceFront = MaskSlice.front();
566	if (SliceFront < `0`) {
567	// Negative values (undef or other "sentinel" values) must be equal across
568	// the entire slice.
569	if (!all_equal(Range&: MaskSlice))
570	return false;
571	ScaledMask.push_back(Elt: SliceFront);
572	} else {
573	// A positive mask element must be cleanly divisible.
574	if (SliceFront % Scale != `0`)
575	return false;
576	// Elements of the slice must be consecutive.
577	for (int i = `1`; i < Scale; ++i)
578	if (MaskSlice [i] != SliceFront + i)
579	return false;
580	ScaledMask.push_back(Elt: SliceFront / Scale);
581	}
582	Mask = Mask.drop_front(N: Scale);
583	} while (!Mask.empty());
584
585	assert((int)ScaledMask.size() * Scale == NumElts && "Unexpected scaled mask");
586
587	// All elements of the original mask can be scaled down to map to the elements
588	// of a mask with wider elements.
589	return true;
590	}
591
592	bool llvm::widenShuffleMaskElts(ArrayRef<int> M,
593	SmallVectorImpl<int> &NewMask) {
594	unsigned NumElts = M.size();
595	if (NumElts % `2` != `0`)
596	return false;
597
598	NewMask.clear();
599	for (unsigned i = `0`; i < NumElts; i += `2`) {
600	int M0 = M [i];
601	int M1 = M [i + `1`];
602
603	// If both elements are undef, new mask is undef too.
604	if (M0 == -`1` && M1 == -`1`) {
605	NewMask.push_back(Elt: -`1`);
606	continue;
607	}
608
609	if (M0 == -`1` && M1 != -`1` && (M1 % `2`) == `1`) {
610	NewMask.push_back(Elt: M1 / `2`);
611	continue;
612	}
613
614	if (M0 != -`1` && (M0 % `2`) == `0` && ((M0 + `1`) == M1 \|\| M1 == -`1`)) {
615	NewMask.push_back(Elt: M0 / `2`);
616	continue;
617	}
618
619	NewMask.clear();
620	return false;
621	}
622
623	assert(NewMask.size() == NumElts / `2` && "Incorrect size for mask!");
624	return true;
625	}
626
627	bool llvm::scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef<int> Mask,
628	SmallVectorImpl<int> &ScaledMask) {
629	unsigned NumSrcElts = Mask.size();
630	assert(NumSrcElts > `0` && NumDstElts > `0` && "Unexpected scaling factor");
631
632	// Fast-path: if no scaling, then it is just a copy.
633	if (NumSrcElts == NumDstElts) {
634	ScaledMask.assign(in_start: Mask.begin(), in_end: Mask.end());
635	return true;
636	}
637
638	// Ensure we can find a whole scale factor.
639	assert(((NumSrcElts % NumDstElts) == `0` \|\| (NumDstElts % NumSrcElts) == `0`) &&
640	"Unexpected scaling factor");
641
642	if (NumSrcElts > NumDstElts) {
643	int Scale = NumSrcElts / NumDstElts;
644	return widenShuffleMaskElts(Scale, Mask, ScaledMask);
645	}
646
647	int Scale = NumDstElts / NumSrcElts;
648	narrowShuffleMaskElts(Scale, Mask, ScaledMask);
649	return true;
650	}
651
652	void llvm::getShuffleMaskWithWidestElts(ArrayRef<int> Mask,
653	SmallVectorImpl<int> &ScaledMask) {
654	std::array<SmallVector<int, `16`>, `2`> TmpMasks;
655	SmallVectorImpl<int> Output = &TmpMasks [`0`], Tmp = &TmpMasks [`1`];
656	ArrayRef<int> InputMask = Mask;
657	for (unsigned Scale = `2`; Scale <= InputMask.size(); ++Scale) {
658	while (widenShuffleMaskElts(Scale, Mask: InputMask, ScaledMask&: *Output)) {
659	InputMask = *Output;
660	std::swap(a&: Output, b&: Tmp);
661	}
662	}
663	ScaledMask.assign(in_start: InputMask.begin(), in_end: InputMask.end());
664	}
665
666	void llvm::processShuffleMasks(
667	ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
668	unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
669	function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
670	function_ref<void(ArrayRef<int>, unsigned, unsigned, bool)>
671	ManyInputsAction) {
672	SmallVector<SmallVector<SmallVector<int>>> Res(NumOfDestRegs);
673	// Try to perform better estimation of the permutation.
674	// 1. Split the source/destination vectors into real registers.
675	// 2. Do the mask analysis to identify which real registers are
676	// permuted.
677	int Sz = Mask.size();
678	unsigned SzDest = Sz / NumOfDestRegs;
679	unsigned SzSrc = Sz / NumOfSrcRegs;
680	for (unsigned I = `0`; I < NumOfDestRegs; ++I) {
681	auto &RegMasks = Res [I];
682	RegMasks.assign(NumElts: `2` * NumOfSrcRegs, Elt: {});
683	// Check that the values in dest registers are in the one src
684	// register.
685	for (unsigned K = `0`; K < SzDest; ++K) {
686	int Idx = I * SzDest + K;
687	if (Idx == Sz)
688	break;
689	if (Mask [Idx] >= `2` * Sz \|\| Mask [Idx] == PoisonMaskElem)
690	continue;
691	int MaskIdx = Mask [Idx] % Sz;
692	int SrcRegIdx = MaskIdx / SzSrc + (Mask [Idx] >= Sz ? NumOfSrcRegs : `0`);
693	// Add a cost of PermuteTwoSrc for each new source register permute,
694	// if we have more than one source registers.
695	if (RegMasks [SrcRegIdx].empty())
696	RegMasks [SrcRegIdx].assign(NumElts: SzDest, Elt: PoisonMaskElem);
697	RegMasks [SrcRegIdx][K] = MaskIdx % SzSrc;
698	}
699	}
700	// Process split mask.
701	for (unsigned I : seq<unsigned>(Size: NumOfUsedRegs)) {
702	auto &Dest = Res [I];
703	int NumSrcRegs =
704	count_if(Range&: Dest, P: [](ArrayRef<int> Mask) { return !Mask.empty(); });
705	switch (NumSrcRegs) {
706	case `0`:
707	// No input vectors were used!
708	NoInputAction ();
709	break;
710	case `1`: {
711	// Find the only mask with at least single undef mask elem.
712	auto *It =
713	find_if(Range&: Dest, P: [](ArrayRef<int> Mask) { return !Mask.empty(); });
714	unsigned SrcReg = std::distance(first: Dest.begin(), last: It);
715	SingleInputAction (*It, SrcReg, I);
716	break;
717	}
718	default: {
719	// The first mask is a permutation of a single register. Since we have >2
720	// input registers to shuffle, we merge the masks for 2 first registers
721	// and generate a shuffle of 2 registers rather than the reordering of the
722	// first register and then shuffle with the second register. Next,
723	// generate the shuffles of the resulting register + the remaining
724	// registers from the list.
725	auto &&CombineMasks = [](MutableArrayRef<int> FirstMask,
726	ArrayRef<int> SecondMask) {
727	for (int Idx = `0`, VF = FirstMask.size(); Idx < VF; ++Idx) {
728	if (SecondMask [Idx] != PoisonMaskElem) {
729	assert(FirstMask[Idx] == PoisonMaskElem &&
730	"Expected undefined mask element.");
731	FirstMask [Idx] = SecondMask [Idx] + VF;
732	}
733	}
734	};
735	auto &&NormalizeMask = [](MutableArrayRef<int> Mask) {
736	for (int Idx = `0`, VF = Mask.size(); Idx < VF; ++Idx) {
737	if (Mask [Idx] != PoisonMaskElem)
738	Mask [Idx] = Idx;
739	}
740	};
741	int SecondIdx;
742	bool NewReg = true;
743	do {
744	int FirstIdx = -`1`;
745	SecondIdx = -`1`;
746	MutableArrayRef<int> FirstMask, SecondMask;
747	for (unsigned I : seq<unsigned>(Size: `2` * NumOfSrcRegs)) {
748	SmallVectorImpl<int> &RegMask = Dest [I];
749	if (RegMask.empty())
750	continue;
751
752	if (FirstIdx == SecondIdx) {
753	FirstIdx = I;
754	FirstMask = RegMask;
755	continue;
756	}
757	SecondIdx = I;
758	SecondMask = RegMask;
759	CombineMasks (FirstMask, SecondMask);
760	ManyInputsAction (FirstMask, FirstIdx, SecondIdx, NewReg);
761	NewReg = false;
762	NormalizeMask (FirstMask);
763	RegMask.clear();
764	SecondMask = FirstMask;
765	SecondIdx = FirstIdx;
766	}
767	if (FirstIdx != SecondIdx && SecondIdx >= `0`) {
768	CombineMasks (SecondMask, FirstMask);
769	ManyInputsAction (SecondMask, SecondIdx, FirstIdx, NewReg);
770	NewReg = false;
771	Dest [FirstIdx].clear();
772	NormalizeMask (SecondMask);
773	}
774	} while (SecondIdx >= `0`);
775	break;
776	}
777	}
778	}
779	}
780
781	void llvm::getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth,
782	const APInt &DemandedElts,
783	APInt &DemandedLHS,
784	APInt &DemandedRHS) {
785	assert(VectorBitWidth >= `128` && "Vectors smaller than 128 bit not supported");
786	int NumLanes = VectorBitWidth / `128`;
787	int NumElts = DemandedElts.getBitWidth();
788	int NumEltsPerLane = NumElts / NumLanes;
789	int HalfEltsPerLane = NumEltsPerLane / `2`;
790
791	DemandedLHS = APInt::getZero(numBits: NumElts);
792	DemandedRHS = APInt::getZero(numBits: NumElts);
793
794	// Map DemandedElts to the horizontal operands.
795	for (int Idx = `0`; Idx != NumElts; ++Idx) {
796	if (!DemandedElts [Idx])
797	continue;
798	int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
799	int LocalIdx = Idx % NumEltsPerLane;
800	if (LocalIdx < HalfEltsPerLane) {
801	DemandedLHS.setBit(LaneIdx + `2` * LocalIdx);
802	} else {
803	LocalIdx -= HalfEltsPerLane;
804	DemandedRHS.setBit(LaneIdx + `2` * LocalIdx);
805	}
806	}
807	}
808
809	MapVector<Instruction *, uint64_t>
810	llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
811	const TargetTransformInfo *TTI) {
812
813	// DemandedBits will give us every value's live-out bits. But we want
814	// to ensure no extra casts would need to be inserted, so every DAG
815	// of connected values must have the same minimum bitwidth.
816	EquivalenceClasses<Value *> ECs;
817	SmallVector<Instruction *, `16`> Worklist;
818	SmallPtrSet<Instruction *, `4`> Roots;
819	SmallPtrSet<Instruction *, `16`> Visited;
820	DenseMap<Value *, uint64_t> DBits;
821	SmallPtrSet<Instruction *, `4`> InstructionSet;
822	MapVector<Instruction *, uint64_t> MinBWs;
823
824	// Determine the roots. We work bottom-up, from truncs or icmps.
825	bool SeenExtFromIllegalType = false;
826	for (auto *BB : Blocks)
827	for (auto &I : *BB) {
828	InstructionSet.insert(Ptr: &I);
829
830	if (TTI && (isa<ZExtInst>(Val: &I) \|\| isa<SExtInst>(Val: &I)) &&
831	!TTI->isTypeLegal(Ty: I.getOperand(i: `0`)->getType()))
832	SeenExtFromIllegalType = true;
833
834	// Only deal with non-vector integers up to 64-bits wide.
835	if ((isa<TruncInst>(Val: &I) \|\| isa<ICmpInst>(Val: &I)) &&
836	!I.getType()->isVectorTy() &&
837	I.getOperand(i: `0`)->getType()->getScalarSizeInBits() <= `64`) {
838	// Don't make work for ourselves. If we know the loaded type is legal,
839	// don't add it to the worklist.
840	if (TTI && isa<TruncInst>(Val: &I) && TTI->isTypeLegal(Ty: I.getType()))
841	continue;
842
843	Worklist.push_back(Elt: &I);
844	Roots.insert(Ptr: &I);
845	}
846	}
847	// Early exit.
848	if (Worklist.empty() \|\| (TTI && !SeenExtFromIllegalType))
849	return MinBWs;
850
851	// Now proceed breadth-first, unioning values together.
852	while (!Worklist.empty()) {
853	Instruction *I = Worklist.pop_back_val();
854	Value *Leader = ECs.getOrInsertLeaderValue(V: I);
855
856	if (!Visited.insert(Ptr: I).second)
857	continue;
858
859	// If we encounter a type that is larger than 64 bits, we can't represent
860	// it so bail out.
861	if (DB.getDemandedBits(I).getBitWidth() > `64`)
862	return MapVector<Instruction *, uint64_t>();
863
864	uint64_t V = DB.getDemandedBits(I).getZExtValue();
865	DBits [Leader] \|= V;
866	DBits [I] = V;
867
868	// Casts, loads and instructions outside of our range terminate a chain
869	// successfully.
870	if (isa<SExtInst>(Val: I) \|\| isa<ZExtInst>(Val: I) \|\| isa<LoadInst>(Val: I) \|\|
871	!InstructionSet.count(Ptr: I))
872	continue;
873
874	// Unsafe casts terminate a chain unsuccessfully. We can't do anything
875	// useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to
876	// transform anything that relies on them.
877	if (isa<BitCastInst>(Val: I) \|\| isa<PtrToIntInst>(Val: I) \|\| isa<IntToPtrInst>(Val: I) \|\|
878	!I->getType()->isIntegerTy()) {
879	DBits [Leader] \|= ~`0ULL`;
880	continue;
881	}
882
883	// We don't modify the types of PHIs. Reductions will already have been
884	// truncated if possible, and inductions' sizes will have been chosen by
885	// indvars.
886	if (isa<PHINode>(Val: I))
887	continue;
888
889	// Don't modify the types of operands of a call, as doing that would cause a
890	// signature mismatch.
891	if (isa<CallBase>(Val: I))
892	continue;
893
894	if (DBits [Leader] == ~`0ULL`)
895	// All bits demanded, no point continuing.
896	continue;
897
898	for (Value *O : I->operands()) {
899	ECs.unionSets(V1: Leader, V2: O);
900	if (auto *OI = dyn_cast<Instruction>(Val: O))
901	Worklist.push_back(Elt: OI);
902	}
903	}
904
905	// Now we've discovered all values, walk them to see if there are
906	// any users we didn't see. If there are, we can't optimize that
907	// chain.
908	for (auto &I : DBits)
909	for (auto *U : I.first->users())
910	if (U->getType()->isIntegerTy() && DBits.count(Val: U) == `0`)
911	DBits [ECs.getOrInsertLeaderValue(V: I.first)] \|= ~`0ULL`;
912
913	for (const auto &E : ECs) {
914	if (!E->isLeader())
915	continue;
916	uint64_t LeaderDemandedBits = `0`;
917	for (Value M : ECs.members(ECV: E))
918	LeaderDemandedBits \|= DBits [M];
919
920	uint64_t MinBW = llvm::bit_width(Value: LeaderDemandedBits);
921	// Round up to a power of 2
922	MinBW = llvm::bit_ceil(Value: MinBW);
923
924	// We don't modify the types of PHIs. Reductions will already have been
925	// truncated if possible, and inductions' sizes will have been chosen by
926	// indvars.
927	// If we are required to shrink a PHI, abandon this entire equivalence class.
928	bool Abort = false;
929	for (Value M : ECs.members(ECV: E))
930	if (isa<PHINode>(Val: M) && MinBW < M->getType()->getScalarSizeInBits()) {
931	Abort = true;
932	break;
933	}
934	if (Abort)
935	continue;
936
937	for (Value M : ECs.members(ECV: E)) {
938	auto *MI = dyn_cast<Instruction>(Val: M);
939	if (!MI)
940	continue;
941	Type *Ty = M->getType();
942	if (Roots.count(Ptr: MI))
943	Ty = MI->getOperand(i: `0`)->getType();
944
945	if (MinBW >= Ty->getScalarSizeInBits())
946	continue;
947
948	// If any of M's operands demand more bits than MinBW then M cannot be
949	// performed safely in MinBW.
950	auto *Call = dyn_cast<CallBase>(Val: MI);
951	auto Ops = Call ? Call->args() : MI->operands();
952	if (any_of(Range&: Ops, P: [&DB, MinBW](Use &U) {
953	auto *CI = dyn_cast<ConstantInt>(Val&: U);
954	// For constants shift amounts, check if the shift would result in
955	// poison.
956	if (CI &&
957	isa<ShlOperator, LShrOperator, AShrOperator>(Val: U.getUser()) &&
958	U.getOperandNo() == `1`)
959	return CI->uge(Num: MinBW);
960	uint64_t BW = bit_width(Value: DB.getDemandedBits(U: &U).getZExtValue());
961	return bit_ceil(Value: BW) > MinBW;
962	}))
963	continue;
964
965	MinBWs [MI] = MinBW;
966	}
967	}
968
969	return MinBWs;
970	}
971
972	/// Add all access groups in @p AccGroups to @p List.
973	template <typename ListT>
974	static void addToAccessGroupList(ListT &List, MDNode *AccGroups) {
975	// Interpret an access group as a list containing itself.
976	if (AccGroups->getNumOperands() == `0`) {
977	assert(isValidAsAccessGroup(AccGroups) && "Node must be an access group");
978	List.insert(AccGroups);
979	return;
980	}
981
982	for (const auto &AccGroupListOp : AccGroups->operands()) {
983	auto *Item = cast<MDNode>(Val: AccGroupListOp.get());
984	assert(isValidAsAccessGroup(Item) && "List item must be an access group");
985	List.insert(Item);
986	}
987	}
988
989	MDNode llvm::uniteAccessGroups(MDNode AccGroups1, MDNode *AccGroups2) {
990	if (!AccGroups1)
991	return AccGroups2;
992	if (!AccGroups2)
993	return AccGroups1;
994	if (AccGroups1 == AccGroups2)
995	return AccGroups1;
996
997	SmallSetVector<Metadata *, `4`> Union;
998	addToAccessGroupList(List&: Union, AccGroups: AccGroups1);
999	addToAccessGroupList(List&: Union, AccGroups: AccGroups2);
1000
1001	if (Union.size() == `0`)
1002	return nullptr;
1003	if (Union.size() == `1`)
1004	return cast<MDNode>(Val: Union.front());
1005
1006	LLVMContext &Ctx = AccGroups1->getContext();
1007	return MDNode::get(Context&: Ctx, MDs: Union.getArrayRef());
1008	}
1009
1010	MDNode llvm::intersectAccessGroups(const* Instruction *Inst1,
1011	const Instruction *Inst2) {
1012	bool MayAccessMem1 = Inst1->mayReadOrWriteMemory();
1013	bool MayAccessMem2 = Inst2->mayReadOrWriteMemory();
1014
1015	if (!MayAccessMem1 && !MayAccessMem2)
1016	return nullptr;
1017	if (!MayAccessMem1)
1018	return Inst2->getMetadata(KindID: LLVMContext::MD_access_group);
1019	if (!MayAccessMem2)
1020	return Inst1->getMetadata(KindID: LLVMContext::MD_access_group);
1021
1022	MDNode *MD1 = Inst1->getMetadata(KindID: LLVMContext::MD_access_group);
1023	MDNode *MD2 = Inst2->getMetadata(KindID: LLVMContext::MD_access_group);
1024	if (!MD1 \|\| !MD2)
1025	return nullptr;
1026	if (MD1 == MD2)
1027	return MD1;
1028
1029	// Use set for scalable 'contains' check.
1030	SmallPtrSet<Metadata *, `4`> AccGroupSet2;
1031	addToAccessGroupList(List&: AccGroupSet2, AccGroups: MD2);
1032
1033	SmallVector<Metadata *, `4`> Intersection;
1034	if (MD1->getNumOperands() == `0`) {
1035	assert(isValidAsAccessGroup(MD1) && "Node must be an access group");
1036	if (AccGroupSet2.count(Ptr: MD1))
1037	Intersection.push_back(Elt: MD1);
1038	} else {
1039	for (const MDOperand &Node : MD1->operands()) {
1040	auto *Item = cast<MDNode>(Val: Node.get());
1041	assert(isValidAsAccessGroup(Item) && "List item must be an access group");
1042	if (AccGroupSet2.count(Ptr: Item))
1043	Intersection.push_back(Elt: Item);
1044	}
1045	}
1046
1047	if (Intersection.size() == `0`)
1048	return nullptr;
1049	if (Intersection.size() == `1`)
1050	return cast<MDNode>(Val: Intersection.front());
1051
1052	LLVMContext &Ctx = Inst1->getContext();
1053	return MDNode::get(Context&: Ctx, MDs: Intersection);
1054	}
1055
1056	/// Add metadata from \p Inst to \p Metadata, if it can be preserved after
1057	/// vectorization.
1058	void llvm::getMetadataToPropagate(
1059	Instruction *Inst,
1060	SmallVectorImpl<std::pair<unsigned, MDNode *>> &Metadata) {
1061	Inst->getAllMetadataOtherThanDebugLoc(MDs&: Metadata);
1062	static const unsigned SupportedIDs[] = {
1063	LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
1064	LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
1065	LLVMContext::MD_nontemporal, LLVMContext::MD_invariant_load,
1066	LLVMContext::MD_access_group, LLVMContext::MD_mmra};
1067
1068	// Remove any unsupported metadata kinds from Metadata.
1069	for (unsigned Idx = `0`; Idx != Metadata.size();) {
1070	if (is_contained(Range: SupportedIDs, Element: Metadata [Idx].first)) {
1071	++Idx;
1072	} else {
1073	// Swap element to end and remove it.
1074	std::swap(x&: Metadata [Idx], y&: Metadata.back());
1075	Metadata.pop_back();
1076	}
1077	}
1078	}
1079
1080	/// \returns \p I after propagating metadata from \p VL.
1081	Instruction llvm::propagateMetadata(Instruction Inst, ArrayRef<Value *> VL) {
1082	if (VL.empty())
1083	return Inst;
1084	SmallVector<std::pair<unsigned, MDNode *>> Metadata;
1085	getMetadataToPropagate(Inst: cast<Instruction>(Val: VL [`0`]), Metadata);
1086
1087	for (auto &[Kind, MD] : Metadata) {
1088	// Skip MMRA metadata if the instruction cannot have it.
1089	if (Kind == LLVMContext::MD_mmra && !canInstructionHaveMMRAs(I: *Inst))
1090	continue;
1091
1092	for (int J = `1`, E = VL.size(); MD && J != E; ++J) {
1093	const Instruction *IJ = cast<Instruction>(Val: VL [J]);
1094	MDNode *IMD = IJ->getMetadata(KindID: Kind);
1095
1096	switch (Kind) {
1097	case LLVMContext::MD_mmra: {
1098	MD = MMRAMetadata::combine(Ctx&: Inst->getContext(), A: MD, B: IMD);
1099	break;
1100	}
1101	case LLVMContext::MD_tbaa:
1102	MD = MDNode::getMostGenericTBAA(A: MD, B: IMD);
1103	break;
1104	case LLVMContext::MD_alias_scope:
1105	MD = MDNode::getMostGenericAliasScope(A: MD, B: IMD);
1106	break;
1107	case LLVMContext::MD_fpmath:
1108	MD = MDNode::getMostGenericFPMath(A: MD, B: IMD);
1109	break;
1110	case LLVMContext::MD_noalias:
1111	case LLVMContext::MD_nontemporal:
1112	case LLVMContext::MD_invariant_load:
1113	MD = MDNode::intersect(A: MD, B: IMD);
1114	break;
1115	case LLVMContext::MD_access_group:
1116	MD = intersectAccessGroups(Inst1: Inst, Inst2: IJ);
1117	break;
1118	default:
1119	llvm_unreachable("unhandled metadata");
1120	}
1121	}
1122
1123	Inst->setMetadata(KindID: Kind, Node: MD);
1124	}
1125
1126	return Inst;
1127	}
1128
1129	Constant *
1130	llvm::createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF,
1131	const InterleaveGroup<Instruction> &Group) {
1132	// All 1's means mask is not needed.
1133	if (Group.isFull())
1134	return nullptr;
1135
1136	// TODO: support reversed access.
1137	assert(!Group.isReverse() && "Reversed group not supported.");
1138
1139	SmallVector<Constant *, `16`> Mask;
1140	for (unsigned i = `0`; i < VF; i++)
1141	for (unsigned j = `0`; j < Group.getFactor(); ++j) {
1142	unsigned HasMember = Group.getMember(Index: j) ? `1` : `0`;
1143	Mask.push_back(Elt: Builder.getInt1(V: HasMember));
1144	}
1145
1146	return ConstantVector::get(V: Mask);
1147	}
1148
1149	llvm::SmallVector<int, `16`>
1150	llvm::createReplicatedMask(unsigned ReplicationFactor, unsigned VF) {
1151	SmallVector<int, `16`> MaskVec;
1152	for (unsigned i = `0`; i < VF; i++)
1153	for (unsigned j = `0`; j < ReplicationFactor; j++)
1154	MaskVec.push_back(Elt: i);
1155
1156	return MaskVec;
1157	}
1158
1159	llvm::SmallVector<int, `16`> llvm::createInterleaveMask(unsigned VF,
1160	unsigned NumVecs) {
1161	SmallVector<int, `16`> Mask;
1162	for (unsigned i = `0`; i < VF; i++)
1163	for (unsigned j = `0`; j < NumVecs; j++)
1164	Mask.push_back(Elt: j * VF + i);
1165
1166	return Mask;
1167	}
1168
1169	llvm::SmallVector<int, `16`>
1170	llvm::createStrideMask(unsigned Start, unsigned Stride, unsigned VF) {
1171	SmallVector<int, `16`> Mask;
1172	for (unsigned i = `0`; i < VF; i++)
1173	Mask.push_back(Elt: Start + i * Stride);
1174
1175	return Mask;
1176	}
1177
1178	llvm::SmallVector<int, `16`> llvm::createSequentialMask(unsigned Start,
1179	unsigned NumInts,
1180	unsigned NumUndefs) {
1181	SmallVector<int, `16`> Mask;
1182	for (unsigned i = `0`; i < NumInts; i++)
1183	Mask.push_back(Elt: Start + i);
1184
1185	for (unsigned i = `0`; i < NumUndefs; i++)
1186	Mask.push_back(Elt: -`1`);
1187
1188	return Mask;
1189	}
1190
1191	llvm::SmallVector<int, `16`> llvm::createUnaryMask(ArrayRef<int> Mask,
1192	unsigned NumElts) {
1193	// Avoid casts in the loop and make sure we have a reasonable number.
1194	int NumEltsSigned = NumElts;
1195	assert(NumEltsSigned > `0` && "Expected smaller or non-zero element count");
1196
1197	// If the mask chooses an element from operand 1, reduce it to choose from the
1198	// corresponding element of operand 0. Undef mask elements are unchanged.
1199	SmallVector<int, `16`> UnaryMask;
1200	for (int MaskElt : Mask) {
1201	assert((MaskElt < NumEltsSigned * `2`) && "Expected valid shuffle mask");
1202	int UnaryElt = MaskElt >= NumEltsSigned ? MaskElt - NumEltsSigned : MaskElt;
1203	UnaryMask.push_back(Elt: UnaryElt);
1204	}
1205	return UnaryMask;
1206	}
1207
1208	/// A helper function for concatenating vectors. This function concatenates two
1209	/// vectors having the same element type. If the second vector has fewer
1210	/// elements than the first, it is padded with undefs.
1211	static Value concatenateTwoVectors(IRBuilderBase &Builder, Value V1,
1212	Value *V2) {
1213	VectorType *VecTy1 = dyn_cast<VectorType>(Val: V1->getType());
1214	VectorType *VecTy2 = dyn_cast<VectorType>(Val: V2->getType());
1215	assert(VecTy1 && VecTy2 &&
1216	VecTy1->getScalarType() == VecTy2->getScalarType() &&
1217	"Expect two vectors with the same element type");
1218
1219	unsigned NumElts1 = cast<FixedVectorType>(Val: VecTy1)->getNumElements();
1220	unsigned NumElts2 = cast<FixedVectorType>(Val: VecTy2)->getNumElements();
1221	assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");
1222
1223	if (NumElts1 > NumElts2) {
1224	// Extend with UNDEFs.
1225	V2 = Builder.CreateShuffleVector(
1226	V: V2, Mask: createSequentialMask(Start: `0`, NumInts: NumElts2, NumUndefs: NumElts1 - NumElts2));
1227	}
1228
1229	return Builder.CreateShuffleVector(
1230	V1, V2, Mask: createSequentialMask(Start: `0`, NumInts: NumElts1 + NumElts2, NumUndefs: `0`));
1231	}
1232
1233	Value *llvm::concatenateVectors(IRBuilderBase &Builder,
1234	ArrayRef<Value *> Vecs) {
1235	unsigned NumVecs = Vecs.size();
1236	assert(NumVecs > `1` && "Should be at least two vectors");
1237
1238	SmallVector<Value *, `8`> ResList;
1239	ResList.append(in_start: Vecs.begin(), in_end: Vecs.end());
1240	do {
1241	SmallVector<Value *, `8`> TmpList;
1242	for (unsigned i = `0`; i < NumVecs - `1`; i += `2`) {
1243	Value V0 = ResList [i], V1 = ResList [i + `1`];
1244	assert((V0->getType() == V1->getType() \|\| i == NumVecs - `2`) &&
1245	"Only the last vector may have a different type");
1246
1247	TmpList.push_back(Elt: concatenateTwoVectors(Builder, V1: V0, V2: V1));
1248	}
1249
1250	// Push the last vector if the total number of vectors is odd.
1251	if (NumVecs % `2` != `0`)
1252	TmpList.push_back(Elt: ResList [NumVecs - `1`]);
1253
1254	ResList = TmpList;
1255	NumVecs = ResList.size();
1256	} while (NumVecs > `1`);
1257
1258	return ResList [`0`];
1259	}
1260
1261	bool llvm::maskContainsAllOneOrUndef(Value *Mask) {
1262	assert(isa<VectorType>(Mask->getType()) &&
1263	isa<IntegerType>(Mask->getType()->getScalarType()) &&
1264	cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
1265	`1` &&
1266	"Mask must be a vector of i1");
1267
1268	auto AllOneOrUndef = m_CombineOr(Ps: m_AllOnes(), Ps: m_UndefValue());
1269	return match(V: Mask, P: m_CombineOr(Ps: AllOneOrUndef, Ps: m_ContainsMatchingVectorElement(
1270	SubPattern: AllOneOrUndef)));
1271	}
1272
1273	/// TODO: This is a lot like known bits, but for
1274	/// vectors. Is there something we can common this with?
1275	APInt llvm::possiblyDemandedEltsInMask(Value *Mask) {
1276	assert(isa<FixedVectorType>(Mask->getType()) &&
1277	isa<IntegerType>(Mask->getType()->getScalarType()) &&
1278	cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
1279	`1` &&
1280	"Mask must be a fixed width vector of i1");
1281
1282	const unsigned VWidth =
1283	cast<FixedVectorType>(Val: Mask->getType())->getNumElements();
1284	APInt DemandedElts = APInt::getAllOnes(numBits: VWidth);
1285	if (auto *CV = dyn_cast<ConstantVector>(Val: Mask))
1286	for (unsigned i = `0`; i < VWidth; i++)
1287	if (CV->getAggregateElement(Elt: i)->isNullValue())
1288	DemandedElts.clearBit(BitPosition: i);
1289	return DemandedElts;
1290	}
1291
1292	bool InterleavedAccessInfo::isStrided(int Stride) {
1293	unsigned Factor = std::abs(x: Stride);
1294	return Factor >= `2` && Factor <= MaxInterleaveGroupFactor;
1295	}
1296
1297	void InterleavedAccessInfo::collectConstStrideAccesses(
1298	MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
1299	const DenseMap<Value , const* SCEV *> &Strides,
1300	SmallVectorImpl<const SCEVPredicate > Predicates) {
1301	auto &DL = TheLoop->getHeader()->getDataLayout();
1302
1303	// Since it's desired that the load/store instructions be maintained in
1304	// "program order" for the interleaved access analysis, we have to visit the
1305	// blocks in the loop in reverse postorder (i.e., in a topological order).
1306	// Such an ordering will ensure that any load/store that may be executed
1307	// before a second load/store will precede the second load/store in
1308	// AccessStrideInfo.
1309	LoopBlocksDFS DFS(TheLoop);
1310	DFS.perform(LI);
1311	for (BasicBlock *BB : make_range(x: DFS.beginRPO(), y: DFS.endRPO()))
1312	for (auto &I : *BB) {
1313	Value *Ptr = getLoadStorePointerOperand(V: &I);
1314	if (!Ptr)
1315	continue;
1316	Type *ElementTy = getLoadStoreType(I: &I);
1317
1318	// Currently, codegen doesn't support cases where the type size doesn't
1319	// match the alloc size. Skip them for now.
1320	uint64_t Size = DL.getTypeAllocSize(Ty: ElementTy);
1321	if (Size * `8` != DL.getTypeSizeInBits(Ty: ElementTy))
1322	continue;
1323
1324	// We don't check wrapping here because we don't know yet if Ptr will be
1325	// part of a full group or a group with gaps. Checking wrapping for all
1326	// pointers (even those that end up in groups with no gaps) will be overly
1327	// conservative. For full groups, wrapping should be ok since if we would
1328	// wrap around the address space we would do a memory access at nullptr
1329	// even without the transformation. The wrapping checks are therefore
1330	// deferred until after we've formed the interleaved groups.
1331	int64_t Stride = getPtrStride(PSE, AccessTy: ElementTy, Ptr, Lp: TheLoop, DT: *DT, StridesMap: Strides,
1332	/ShouldCheckWrap=/false, Predicates)
1333	.value_or(u: `0`);
1334
1335	const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, PtrToStride: Strides, Ptr);
1336	AccessStrideInfo [&I] = StrideDescriptor (Stride, Scev, Size,
1337	getLoadStoreAlignment(I: &I));
1338	}
1339	}
1340
1341	// Analyze interleaved accesses and collect them into interleaved load and
1342	// store groups.
1343	//
1344	// When generating code for an interleaved load group, we effectively hoist all
1345	// loads in the group to the location of the first load in program order. When
1346	// generating code for an interleaved store group, we sink all stores to the
1347	// location of the last store. This code motion can change the order of load
1348	// and store instructions and may break dependences.
1349	//
1350	// The code generation strategy mentioned above ensures that we won't violate
1351	// any write-after-read (WAR) dependences.
1352	//
1353	// E.g., for the WAR dependence: a = A[i]; // (1)
1354	// A[i] = b; // (2)
1355	//
1356	// The store group of (2) is always inserted at or below (2), and the load
1357	// group of (1) is always inserted at or above (1). Thus, the instructions will
1358	// never be reordered. All other dependences are checked to ensure the
1359	// correctness of the instruction reordering.
1360	//
1361	// The algorithm visits all memory accesses in the loop in bottom-up program
1362	// order. Program order is established by traversing the blocks in the loop in
1363	// reverse postorder when collecting the accesses.
1364	//
1365	// We visit the memory accesses in bottom-up order because it can simplify the
1366	// construction of store groups in the presence of write-after-write (WAW)
1367	// dependences.
1368	//
1369	// E.g., for the WAW dependence: A[i] = a; // (1)
1370	// A[i] = b; // (2)
1371	// A[i + 1] = c; // (3)
1372	//
1373	// We will first create a store group with (3) and (2). (1) can't be added to
1374	// this group because it and (2) are dependent. However, (1) can be grouped
1375	// with other accesses that may precede it in program order. Note that a
1376	// bottom-up order does not imply that WAW dependences should not be checked.
1377	void InterleavedAccessInfo::analyzeInterleaving(
1378	bool EnablePredicatedInterleavedMemAccesses) {
1379	LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
1380	const auto &Strides = LAI->getSymbolicStrides();
1381
1382	// Holds all accesses with a constant stride.
1383	MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
1384	SmallVector<const SCEVPredicate *> Predicates;
1385	collectConstStrideAccesses(AccessStrideInfo, Strides,
1386	Predicates: OptForSize ? nullptr : &Predicates);
1387
1388	if (AccessStrideInfo.empty())
1389	return;
1390
1391	// Collect the dependences in the loop.
1392	collectDependences();
1393
1394	// Holds all interleaved store groups temporarily.
1395	SmallSetVector<InterleaveGroup<Instruction> *, `4`> StoreGroups;
1396	// Holds all interleaved load groups temporarily.
1397	SmallSetVector<InterleaveGroup<Instruction> *, `4`> LoadGroups;
1398	// Groups added to this set cannot have new members added.
1399	SmallPtrSet<InterleaveGroup<Instruction> *, `4`> CompletedLoadGroups;
1400
1401	// Search in bottom-up program order for pairs of accesses (A and B) that can
1402	// form interleaved load or store groups. In the algorithm below, access A
1403	// precedes access B in program order. We initialize a group for B in the
1404	// outer loop of the algorithm, and then in the inner loop, we attempt to
1405	// insert each A into B's group if:
1406	//
1407	// 1. A and B have the same stride,
1408	// 2. A and B have the same memory object size, and
1409	// 3. A belongs in B's group according to its distance from B.
1410	//
1411	// Special care is taken to ensure group formation will not break any
1412	// dependences.
1413	for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
1414	BI != E; ++BI) {
1415	Instruction *B = BI ->first;
1416	StrideDescriptor DesB = BI ->second;
1417
1418	// Initialize a group for B if it has an allowable stride. Even if we don't
1419	// create a group for B, we continue with the bottom-up algorithm to ensure
1420	// we don't break any of B's dependences.
1421	InterleaveGroup<Instruction> GroupB = nullptr*;
1422	if (isStrided(Stride: DesB.Stride) &&
1423	(!isPredicated(BB: B->getParent()) \|\| EnablePredicatedInterleavedMemAccesses)) {
1424	GroupB = getInterleaveGroup(Instr: B);
1425	if (!GroupB) {
1426	LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
1427	<< `'\n'`);
1428	GroupB = createInterleaveGroup(Instr: B, Stride: DesB.Stride, Alignment: DesB.Alignment);
1429	if (B->mayWriteToMemory())
1430	StoreGroups.insert(X: GroupB);
1431	else
1432	LoadGroups.insert(X: GroupB);
1433	}
1434	}
1435
1436	for (auto AI = std::next(x: BI); AI != E; ++AI) {
1437	Instruction *A = AI ->first;
1438	StrideDescriptor DesA = AI ->second;
1439
1440	// Our code motion strategy implies that we can't have dependences
1441	// between accesses in an interleaved group and other accesses located
1442	// between the first and last member of the group. Note that this also
1443	// means that a group can't have more than one member at a given offset.
1444	// The accesses in a group can have dependences with other accesses, but
1445	// we must ensure we don't extend the boundaries of the group such that
1446	// we encompass those dependent accesses.
1447	//
1448	// For example, assume we have the sequence of accesses shown below in a
1449	// stride-2 loop:
1450	//
1451	// (1, 2) is a group \| A[i] = a; // (1)
1452	// \| A[i-1] = b; // (2) \|
1453	// A[i-3] = c; // (3)
1454	// A[i] = d; // (4) \| (2, 4) is not a group
1455	//
1456	// Because accesses (2) and (3) are dependent, we can group (2) with (1)
1457	// but not with (4). If we did, the dependent access (3) would be within
1458	// the boundaries of the (2, 4) group.
1459	auto DependentMember = [&](InterleaveGroup<Instruction> *Group,
1460	StrideEntry A) -> Instruction {
1461	for (uint32_t Index = `0`; Index < Group->getFactor(); ++Index) {
1462	Instruction *MemberOfGroupB = Group->getMember(Index);
1463	if (MemberOfGroupB && !canReorderMemAccessesForInterleavedGroups(
1464	A, B: &*AccessStrideInfo.find(Key: MemberOfGroupB)))
1465	return MemberOfGroupB;
1466	}
1467	return nullptr;
1468	};
1469
1470	auto GroupA = getInterleaveGroup(Instr: A);
1471	// If A is a load, dependencies are tolerable, there's nothing to do here.
1472	// If both A and B belong to the same (store) group, they are independent,
1473	// even if dependencies have not been recorded.
1474	// If both GroupA and GroupB are null, there's nothing to do here.
1475	if (A->mayWriteToMemory() && GroupA != GroupB) {
1476	Instruction DependentInst = nullptr*;
1477	// If GroupB is a load group, we have to compare AI against all
1478	// members of GroupB because if any load within GroupB has a dependency
1479	// on AI, we need to mark GroupB as complete and also release the
1480	// store GroupA (if A belongs to one). The former prevents incorrect
1481	// hoisting of load B above store A while the latter prevents incorrect
1482	// sinking of store A below load B.
1483	if (GroupB && LoadGroups.contains(key: GroupB))
1484	DependentInst = DependentMember (GroupB, &*AI);
1485	else if (!canReorderMemAccessesForInterleavedGroups(A: &AI, B: &BI))
1486	DependentInst = B;
1487
1488	if (DependentInst) {
1489	// A has a store dependence on B (or on some load within GroupB) and
1490	// is part of a store group. Release A's group to prevent illegal
1491	// sinking of A below B. A will then be free to form another group
1492	// with instructions that precede it.
1493	if (GroupA && StoreGroups.contains(key: GroupA)) {
1494	LLVM_DEBUG(dbgs() << "LV: Invalidated store group due to "
1495	"dependence between "
1496	<< A << " and " << DependentInst << `'\n'`);
1497	StoreGroups.remove(X: GroupA);
1498	releaseGroup(Group: GroupA);
1499	}
1500	// If B is a load and part of an interleave group, no earlier loads
1501	// can be added to B's interleave group, because this would mean the
1502	// DependentInst would move across store A. Mark the interleave group
1503	// as complete.
1504	if (GroupB && LoadGroups.contains(key: GroupB)) {
1505	LLVM_DEBUG(dbgs() << "LV: Marking interleave group for " << *B
1506	<< " as complete.\n");
1507	CompletedLoadGroups.insert(Ptr: GroupB);
1508	}
1509	}
1510	}
1511	if (CompletedLoadGroups.contains(Ptr: GroupB)) {
1512	// Skip trying to add A to B, continue to look for other conflicting A's
1513	// in groups to be released.
1514	continue;
1515	}
1516
1517	// At this point, we've checked for illegal code motion. If either A or B
1518	// isn't strided, there's nothing left to do.
1519	if (!isStrided(Stride: DesA.Stride) \|\| !isStrided(Stride: DesB.Stride))
1520	continue;
1521
1522	// Ignore A if it's already in a group or isn't the same kind of memory
1523	// operation as B.
1524	// Note that mayReadFromMemory() isn't mutually exclusive to
1525	// mayWriteToMemory in the case of atomic loads. We shouldn't see those
1526	// here, canVectorizeMemory() should have returned false - except for the
1527	// case we asked for optimization remarks.
1528	if (isInterleaved(Instr: A) \|\|
1529	(A->mayReadFromMemory() != B->mayReadFromMemory()) \|\|
1530	(A->mayWriteToMemory() != B->mayWriteToMemory()))
1531	continue;
1532
1533	// Check rules 1 and 2. Ignore A if its stride or size is different from
1534	// that of B.
1535	if (DesA.Stride != DesB.Stride \|\| DesA.Size != DesB.Size)
1536	continue;
1537
1538	// Ignore A if the memory object of A and B don't belong to the same
1539	// address space
1540	if (getLoadStoreAddressSpace(I: A) != getLoadStoreAddressSpace(I: B))
1541	continue;
1542
1543	// Calculate the distance from A to B.
1544	const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
1545	Val: PSE.getSE()->getMinusSCEV(LHS: DesA.Scev, RHS: DesB.Scev));
1546	if (!DistToB)
1547	continue;
1548	int64_t DistanceToB = DistToB->getAPInt().getSExtValue();
1549
1550	// Check rule 3. Ignore A if its distance to B is not a multiple of the
1551	// size.
1552	if (DistanceToB % static_cast<int64_t>(DesB.Size))
1553	continue;
1554
1555	// All members of a predicated interleave-group must have the same predicate,
1556	// and currently must reside in the same BB.
1557	BasicBlock *BlockA = A->getParent();
1558	BasicBlock *BlockB = B->getParent();
1559	if ((isPredicated(BB: BlockA) \|\| isPredicated(BB: BlockB)) &&
1560	(!EnablePredicatedInterleavedMemAccesses \|\| BlockA != BlockB))
1561	continue;
1562
1563	// The index of A is the index of B plus A's distance to B in multiples
1564	// of the size.
1565	int IndexA =
1566	GroupB->getIndex(Instr: B) + DistanceToB / static_cast<int64_t>(DesB.Size);
1567
1568	// Try to insert A into B's group.
1569	if (GroupB->insertMember(Instr: A, Index: IndexA, NewAlign: DesA.Alignment)) {
1570	LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << `'\n'`
1571	<< " into the interleave group with" << *B
1572	<< `'\n'`);
1573	InterleaveGroupMap [A] = GroupB;
1574
1575	// Set the first load in program order as the insert position.
1576	if (A->mayReadFromMemory())
1577	GroupB->setInsertPos(A);
1578	}
1579	} // Iteration over A accesses.
1580	} // Iteration over B accesses.
1581
1582	// Commit the collected predicates to PSE if any candidate group was formed.
1583	if (!LoadGroups.empty() \|\| !StoreGroups.empty())
1584	PSE.addPredicates(Preds: Predicates);
1585
1586	auto InvalidateGroupIfMemberMayWrap = [&](InterleaveGroup<Instruction> *Group,
1587	int Index,
1588	const char FirstOrLast) -> bool* {
1589	Instruction *Member = Group->getMember(Index);
1590	assert(Member && "Group member does not exist");
1591	Value *MemberPtr = getLoadStorePointerOperand(V: Member);
1592	Type *AccessTy = getLoadStoreType(I: Member);
1593	if (getPtrStride(PSE, AccessTy, Ptr: MemberPtr, Lp: TheLoop, DT: *DT, StridesMap: Strides,
1594	/Assume=/false, /ShouldCheckWrap=/true)
1595	.value_or(u: `0`))
1596	return false;
1597	LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
1598	<< FirstOrLast
1599	<< " group member potentially pointer-wrapping.\n");
1600	releaseGroup(Group);
1601	return true;
1602	};
1603
1604	// Remove interleaved groups with gaps whose memory
1605	// accesses may wrap around. We have to revisit the getPtrStride analysis,
1606	// this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
1607	// not check wrapping (see documentation there).
1608	// FORNOW we use Assume=false;
1609	// TODO: Change to Assume=true but making sure we don't exceed the threshold
1610	// of runtime SCEV assumptions checks (thereby potentially failing to
1611	// vectorize altogether).
1612	// Additional optional optimizations:
1613	// TODO: If we are peeling the loop and we know that the first pointer doesn't
1614	// wrap then we can deduce that all pointers in the group don't wrap.
1615	// This means that we can forcefully peel the loop in order to only have to
1616	// check the first pointer for no-wrap. When we'll change to use Assume=true
1617	// we'll only need at most one runtime check per interleaved group.
1618	for (auto *Group : LoadGroups) {
1619	// Case 1: A full group. Can Skip the checks; For full groups, if the wide
1620	// load would wrap around the address space we would do a memory access at
1621	// nullptr even without the transformation.
1622	if (Group->isFull())
1623	continue;
1624
1625	// Case 2: If first and last members of the group don't wrap this implies
1626	// that all the pointers in the group don't wrap.
1627	// So we check only group member 0 (which is always guaranteed to exist),
1628	// and group member Factor - 1; If the latter doesn't exist we rely on
1629	// peeling (if it is a non-reversed access -- see Case 3).
1630	if (InvalidateGroupIfMemberMayWrap (Group, `0`, "first"))
1631	continue;
1632	if (Group->getMember(Index: Group->getFactor() - `1`))
1633	InvalidateGroupIfMemberMayWrap (Group, Group->getFactor() - `1`, "last");
1634	else {
1635	// Case 3: A non-reversed interleaved load group with gaps: We need
1636	// to execute at least one scalar epilogue iteration. This will ensure
1637	// we don't speculatively access memory out-of-bounds. We only need
1638	// to look for a member at index factor - 1, since every group must have
1639	// a member at index zero.
1640	if (Group->isReverse()) {
1641	LLVM_DEBUG(
1642	dbgs() << "LV: Invalidate candidate interleaved group due to "
1643	"a reverse access with gaps.\n");
1644	releaseGroup(Group);
1645	continue;
1646	}
1647	LLVM_DEBUG(
1648	dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
1649	RequiresScalarEpilogue = true;
1650	}
1651	}
1652
1653	for (auto *Group : StoreGroups) {
1654	// Case 1: A full group. Can Skip the checks; For full groups, if the wide
1655	// store would wrap around the address space we would do a memory access at
1656	// nullptr even without the transformation.
1657	if (Group->isFull())
1658	continue;
1659
1660	// Interleave-store-group with gaps is implemented using masked wide store.
1661	// Remove interleaved store groups with gaps if
1662	// masked-interleaved-accesses are not enabled by the target.
1663	if (!EnablePredicatedInterleavedMemAccesses) {
1664	LLVM_DEBUG(
1665	dbgs() << "LV: Invalidate candidate interleaved store group due "
1666	"to gaps.\n");
1667	releaseGroup(Group);
1668	continue;
1669	}
1670
1671	// Case 2: If first and last members of the group don't wrap this implies
1672	// that all the pointers in the group don't wrap.
1673	// So we check only group member 0 (which is always guaranteed to exist),
1674	// and the last group member. Case 3 (scalar epilog) is not relevant for
1675	// stores with gaps, which are implemented with masked-store (rather than
1676	// speculative access, as in loads).
1677	if (InvalidateGroupIfMemberMayWrap (Group, `0`, "first"))
1678	continue;
1679	for (int Index = Group->getFactor() - `1`; Index > `0`; Index--)
1680	if (Group->getMember(Index)) {
1681	InvalidateGroupIfMemberMayWrap (Group, Index, "last");
1682	break;
1683	}
1684	}
1685	}
1686
1687	void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
1688	// If no group had triggered the requirement to create an epilogue loop,
1689	// there is nothing to do.
1690	if (!requiresScalarEpilogue())
1691	return;
1692
1693	// Release groups requiring scalar epilogues. Note that this also removes them
1694	// from InterleaveGroups.
1695	bool ReleasedGroup = InterleaveGroups.remove_if(P: [&](auto *Group) {
1696	if (!Group->requiresScalarEpilogue())
1697	return false;
1698	LLVM_DEBUG(
1699	dbgs()
1700	<< "LV: Invalidate candidate interleaved group due to gaps that "
1701	"require a scalar epilogue (not allowed under optsize) and cannot "
1702	"be masked (not enabled). \n");
1703	releaseGroupWithoutRemovingFromSet(Group);
1704	return true;
1705	});
1706	assert(ReleasedGroup && "At least one group must be invalidated, as a "
1707	"scalar epilogue was required");
1708	(void)ReleasedGroup;
1709	RequiresScalarEpilogue = false;
1710	}
1711
1712	template <typename InstT>
1713	void InterleaveGroup<InstT>::addMetadata(InstT NewInst) const* {
1714	llvm_unreachable("addMetadata can only be used for Instruction");
1715	}
1716
1717	namespace llvm {
1718	template <>
1719	void InterleaveGroup<Instruction>::addMetadata(Instruction NewInst) const* {
1720	SmallVector<Value *, `4`> VL(make_second_range(c: Members));
1721	propagateMetadata(Inst: NewInst, VL);
1722	}
1723	} // namespace llvm
1724

Browse the source code of llvm_projects/llvm/lib/Analysis/VectorUtils.cpp