//===----------- VectorUtils.cpp - Vectorizer utility functions -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines vectorizer utilities.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MemoryModelRelaxationAnnotations.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"

#define DEBUG_TYPE "vectorutils"

using namespace llvm;
using namespace llvm::PatternMatch;

/// Maximum factor for an interleaved memory access.
static cl::opt<unsigned> MaxInterleaveGroupFactor(
    "max-interleave-group-factor", cl::Hidden,
    cl::desc("Maximum factor for an interleaved access group (default = 8)"),
    cl::init(8));

/// Return true if all of the intrinsic's arguments and return type are scalars
/// for the scalar form of the intrinsic, and vectors for the vector form of the
/// intrinsic (except operands that are marked as always being scalar by
/// isVectorIntrinsicWithScalarOpAtArg).
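/// For example, the scalar form llvm.smax.i32(i32, i32) -> i32 maps to the
/// vector form llvm.smax.v4i32(<4 x i32>, <4 x i32>) -> <4 x i32>, while
/// llvm.ctlz keeps its i1 is_zero_poison flag scalar in both forms.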
bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
  switch (ID) {
  case Intrinsic::abs:   // Begin integer bit-manipulation.
  case Intrinsic::bswap:
  case Intrinsic::bitreverse:
  case Intrinsic::ctpop:
  case Intrinsic::ctlz:
  case Intrinsic::cttz:
  case Intrinsic::fshl:
  case Intrinsic::fshr:
  case Intrinsic::smax:
  case Intrinsic::smin:
  case Intrinsic::umax:
  case Intrinsic::umin:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::smul_fix:
  case Intrinsic::smul_fix_sat:
  case Intrinsic::umul_fix:
  case Intrinsic::umul_fix_sat:
  case Intrinsic::sqrt: // Begin floating-point.
  case Intrinsic::sin:
  case Intrinsic::cos:
  case Intrinsic::tan:
  case Intrinsic::exp:
  case Intrinsic::exp2:
  case Intrinsic::log:
  case Intrinsic::log10:
  case Intrinsic::log2:
  case Intrinsic::fabs:
  case Intrinsic::minnum:
  case Intrinsic::maxnum:
  case Intrinsic::minimum:
  case Intrinsic::maximum:
  case Intrinsic::copysign:
  case Intrinsic::floor:
  case Intrinsic::ceil:
  case Intrinsic::trunc:
  case Intrinsic::rint:
  case Intrinsic::nearbyint:
  case Intrinsic::round:
  case Intrinsic::roundeven:
  case Intrinsic::pow:
  case Intrinsic::fma:
  case Intrinsic::fmuladd:
  case Intrinsic::is_fpclass:
  case Intrinsic::powi:
  case Intrinsic::canonicalize:
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
    return true;
  default:
    return false;
  }
}

/// Identifies if the vector form of the intrinsic has a scalar operand.
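/// For example, the i1 is_zero_poison flag of llvm.ctlz/llvm.cttz (operand 1)
/// and the i32 exponent of llvm.powi stay scalar even when operand 0 is
/// widened to a vector.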
bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
                                              unsigned ScalarOpdIdx) {
  switch (ID) {
  case Intrinsic::abs:
  case Intrinsic::ctlz:
  case Intrinsic::cttz:
  case Intrinsic::is_fpclass:
  case Intrinsic::powi:
    return (ScalarOpdIdx == 1);
  case Intrinsic::smul_fix:
  case Intrinsic::smul_fix_sat:
  case Intrinsic::umul_fix:
  case Intrinsic::umul_fix_sat:
    return (ScalarOpdIdx == 2);
  default:
    return false;
  }
}

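/// Identifies if the vector form of the intrinsic is overloaded on the type of
/// the operand at index \p OpdIdx, or on the return type if \p OpdIdx is -1.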
bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID,
                                                  int OpdIdx) {
  assert(ID != Intrinsic::not_intrinsic && "Not an intrinsic!");

  switch (ID) {
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat:
  case Intrinsic::lrint:
  case Intrinsic::llrint:
    return OpdIdx == -1 || OpdIdx == 0;
  case Intrinsic::is_fpclass:
    return OpdIdx == 0;
  case Intrinsic::powi:
    return OpdIdx == -1 || OpdIdx == 1;
  default:
    return OpdIdx == -1;
  }
}

/// Returns the intrinsic ID for the call.
/// For the given call instruction it finds the matching vectorizable
/// intrinsic and returns its ID; if none is found, it returns not_intrinsic.
Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
                                                const TargetLibraryInfo *TLI) {
  Intrinsic::ID ID = getIntrinsicForCallSite(*CI, TLI);
  if (ID == Intrinsic::not_intrinsic)
    return Intrinsic::not_intrinsic;

  if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start ||
      ID == Intrinsic::lifetime_end || ID == Intrinsic::assume ||
      ID == Intrinsic::experimental_noalias_scope_decl ||
      ID == Intrinsic::sideeffect || ID == Intrinsic::pseudoprobe)
    return ID;
  return Intrinsic::not_intrinsic;
}

/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
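/// For example, given
///   %v = insertelement <4 x i32> %w, i32 %x, i32 3
/// findScalarElement(%v, 3) returns %x, and lookups of other lanes recurse
/// into %w.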
Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
  assert(V->getType()->isVectorTy() && "Not looking at a vector?");
  VectorType *VTy = cast<VectorType>(V->getType());
  // For a fixed-length vector, return poison for out-of-range access.
  if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
    unsigned Width = FVTy->getNumElements();
    if (EltNo >= Width)
      return PoisonValue::get(FVTy->getElementType());
  }

  if (Constant *C = dyn_cast<Constant>(V))
    return C->getAggregateElement(EltNo);

  if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
    // If this is an insert to a variable element, we don't know what it is.
    if (!isa<ConstantInt>(III->getOperand(2)))
      return nullptr;
    unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();

    // If this is an insert to the element we are looking for, return the
    // inserted value.
    if (EltNo == IIElt)
      return III->getOperand(1);

    // Guard against infinite loop on malformed, unreachable IR.
    if (III == III->getOperand(0))
      return nullptr;

    // Otherwise, the insertelement doesn't modify the value, recurse on its
    // vector input.
    return findScalarElement(III->getOperand(0), EltNo);
  }

  ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
  // Restrict the following transformation to fixed-length vectors.
  if (SVI && isa<FixedVectorType>(SVI->getType())) {
    unsigned LHSWidth =
        cast<FixedVectorType>(SVI->getOperand(0)->getType())->getNumElements();
    int InEl = SVI->getMaskValue(EltNo);
    if (InEl < 0)
      return PoisonValue::get(VTy->getElementType());
    if (InEl < (int)LHSWidth)
      return findScalarElement(SVI->getOperand(0), InEl);
    return findScalarElement(SVI->getOperand(1), InEl - LHSWidth);
  }

  // Extract a value from a vector add operation with a constant zero.
  // TODO: Use getBinOpIdentity() to generalize this.
  Value *Val; Constant *C;
  if (match(V, m_Add(m_Value(Val), m_Constant(C))))
    if (Constant *Elt = C->getAggregateElement(EltNo))
      if (Elt->isNullValue())
        return findScalarElement(Val, EltNo);

  // If the vector is a splat then we can trivially find the scalar element.
  if (isa<ScalableVectorType>(VTy))
    if (Value *Splat = getSplatValue(V))
      if (EltNo < VTy->getElementCount().getKnownMinValue())
        return Splat;

  // Otherwise, we don't know.
  return nullptr;
}

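/// If all non-negative elements of \p Mask select the same source element,
/// return that element's index (e.g. <3, undef, 3, 3> -> 3); otherwise
/// return -1 (e.g. <0, 1, 0, 1> -> -1).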
int llvm::getSplatIndex(ArrayRef<int> Mask) {
  int SplatIndex = -1;
  for (int M : Mask) {
    // Ignore invalid (undefined) mask elements.
    if (M < 0)
      continue;

    // There can be only 1 non-negative mask element value if this is a splat.
    if (SplatIndex != -1 && SplatIndex != M)
      return -1;

    // Initialize the splat index to the 1st non-negative mask element.
    SplatIndex = M;
  }
  assert((SplatIndex == -1 || SplatIndex >= 0) && "Negative index?");
  return SplatIndex;
}

/// Get splat value if the input is a splat vector or return nullptr.
/// This function is not fully general. It checks only 2 cases:
/// the input value is (1) a splat constant vector or (2) a sequence
/// of instructions that broadcasts a scalar at element 0.
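/// For example, both of these return %x:
///   %i = insertelement <4 x float> poison, float %x, i32 0
///   %s = shufflevector <4 x float> %i, <4 x float> poison,
///                      <4 x i32> zeroinitializer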
Value *llvm::getSplatValue(const Value *V) {
  if (isa<VectorType>(V->getType()))
    if (auto *C = dyn_cast<Constant>(V))
      return C->getSplatValue();

  // shuf (inselt ?, Splat, 0), ?, <0, undef, 0, ...>
  Value *Splat;
  if (match(V,
            m_Shuffle(m_InsertElt(m_Value(), m_Value(Splat), m_ZeroInt()),
                      m_Value(), m_ZeroMask())))
    return Splat;

  return nullptr;
}

bool llvm::isSplatValue(const Value *V, int Index, unsigned Depth) {
  assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");

  if (isa<VectorType>(V->getType())) {
    if (isa<UndefValue>(V))
      return true;
    // FIXME: We can allow undefs, but if Index was specified, we may want to
    // check that the constant is defined at that index.
    if (auto *C = dyn_cast<Constant>(V))
      return C->getSplatValue() != nullptr;
  }

  if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) {
    // FIXME: We can safely allow undefs here. If Index was specified, we will
    // check that the mask elt is defined at the required index.
    if (!all_equal(Shuf->getShuffleMask()))
      return false;

    // Match any index.
    if (Index == -1)
      return true;

    // Match a specific element. The mask should be defined at and match the
    // specified index.
    return Shuf->getMaskValue(Index) == Index;
  }

  // The remaining tests are all recursive, so bail out if we hit the limit.
  if (Depth++ == MaxAnalysisRecursionDepth)
    return false;

  // If both operands of a binop are splats, the result is a splat.
  Value *X, *Y, *Z;
  if (match(V, m_BinOp(m_Value(X), m_Value(Y))))
    return isSplatValue(X, Index, Depth) && isSplatValue(Y, Index, Depth);

  // If all operands of a select are splats, the result is a splat.
  if (match(V, m_Select(m_Value(X), m_Value(Y), m_Value(Z))))
    return isSplatValue(X, Index, Depth) && isSplatValue(Y, Index, Depth) &&
           isSplatValue(Z, Index, Depth);

  // TODO: Add support for unary ops (fneg), casts, intrinsics (overflow ops).

  return false;
}

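/// Transform a shuffle mask's output demanded element mask into demanded
/// element masks for the two shuffle operands. Returns false if the mask
/// isn't valid, e.g. if a demanded result lane reads an undef mask element
/// and \p AllowUndefElts is false. For a two-source shuffle of <2 x T>
/// vectors with mask <1, 3> and both result lanes demanded, DemandedLHS
/// becomes 0b10 and DemandedRHS becomes 0b10.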
bool llvm::getShuffleDemandedElts(int SrcWidth, ArrayRef<int> Mask,
                                  const APInt &DemandedElts, APInt &DemandedLHS,
                                  APInt &DemandedRHS, bool AllowUndefElts) {
  DemandedLHS = DemandedRHS = APInt::getZero(SrcWidth);

  // Early out if we don't demand any elements.
  if (DemandedElts.isZero())
    return true;

  // Simple case of a shuffle with zeroinitializer.
  if (all_of(Mask, [](int Elt) { return Elt == 0; })) {
    DemandedLHS.setBit(0);
    return true;
  }

  for (unsigned I = 0, E = Mask.size(); I != E; ++I) {
    int M = Mask[I];
    assert((-1 <= M) && (M < (SrcWidth * 2)) &&
           "Invalid shuffle mask constant");

    if (!DemandedElts[I] || (AllowUndefElts && (M < 0)))
      continue;

    // For undef elements, we don't know anything about the common state of
    // the shuffle result.
    if (M < 0)
      return false;

    if (M < SrcWidth)
      DemandedLHS.setBit(M);
    else
      DemandedRHS.setBit(M - SrcWidth);
  }

  return true;
}

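/// Replace each mask element with \p Scale consecutive narrow elements,
/// e.g. narrowShuffleMaskElts(2, <1, 0>, ...) produces <2, 3, 0, 1>.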
void llvm::narrowShuffleMaskElts(int Scale, ArrayRef<int> Mask,
                                 SmallVectorImpl<int> &ScaledMask) {
  assert(Scale > 0 && "Unexpected scaling factor");

  // Fast-path: if no scaling, then it is just a copy.
  if (Scale == 1) {
    ScaledMask.assign(Mask.begin(), Mask.end());
    return;
  }

  ScaledMask.clear();
  for (int MaskElt : Mask) {
    if (MaskElt >= 0) {
      assert(((uint64_t)Scale * MaskElt + (Scale - 1)) <= INT32_MAX &&
             "Overflowed 32-bits");
    }
    for (int SliceElt = 0; SliceElt != Scale; ++SliceElt)
      ScaledMask.push_back(MaskElt < 0 ? MaskElt : Scale * MaskElt + SliceElt);
  }
}

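/// Attempt the inverse of narrowShuffleMaskElts: fold each run of \p Scale
/// consecutive mask elements into one wide element, e.g.
/// widenShuffleMaskElts(2, <2, 3, 0, 1>, ...) succeeds with <1, 0>, while
/// <0, 2, 1, 3> cannot be widened and the function returns false.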
bool llvm::widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &ScaledMask) {
  assert(Scale > 0 && "Unexpected scaling factor");

  // Fast-path: if no scaling, then it is just a copy.
  if (Scale == 1) {
    ScaledMask.assign(Mask.begin(), Mask.end());
    return true;
  }

  // We must map the original elements down evenly to a type with fewer
  // elements.
  int NumElts = Mask.size();
  if (NumElts % Scale != 0)
    return false;

  ScaledMask.clear();
  ScaledMask.reserve(NumElts / Scale);

  // Step through the input mask by splitting into Scale-sized slices.
  do {
    ArrayRef<int> MaskSlice = Mask.take_front(Scale);
    assert((int)MaskSlice.size() == Scale && "Expected Scale-sized slice.");

    // The first element of the slice determines how we evaluate this slice.
    int SliceFront = MaskSlice.front();
    if (SliceFront < 0) {
      // Negative values (undef or other "sentinel" values) must be equal across
      // the entire slice.
      if (!all_equal(MaskSlice))
        return false;
      ScaledMask.push_back(SliceFront);
    } else {
      // A positive mask element must be cleanly divisible.
      if (SliceFront % Scale != 0)
        return false;
      // Elements of the slice must be consecutive.
      for (int i = 1; i < Scale; ++i)
        if (MaskSlice[i] != SliceFront + i)
          return false;
      ScaledMask.push_back(SliceFront / Scale);
    }
    Mask = Mask.drop_front(Scale);
  } while (!Mask.empty());

  assert((int)ScaledMask.size() * Scale == NumElts && "Unexpected scaled mask");

  // All elements of the original mask can be scaled down to map to the elements
  // of a mask with wider elements.
  return true;
}

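/// A convenience wrapper around narrowShuffleMaskElts/widenShuffleMaskElts:
/// rescale \p Mask from Mask.size() elements to \p NumDstElts elements in
/// whichever direction the sizes imply.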
bool llvm::scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &ScaledMask) {
  unsigned NumSrcElts = Mask.size();
  assert(NumSrcElts > 0 && NumDstElts > 0 && "Unexpected scaling factor");

  // Fast-path: if no scaling, then it is just a copy.
  if (NumSrcElts == NumDstElts) {
    ScaledMask.assign(Mask.begin(), Mask.end());
    return true;
  }

  // Ensure we can find a whole scale factor.
  assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
         "Unexpected scaling factor");

  if (NumSrcElts > NumDstElts) {
    int Scale = NumSrcElts / NumDstElts;
    return widenShuffleMaskElts(Scale, Mask, ScaledMask);
  }

  int Scale = NumDstElts / NumSrcElts;
  narrowShuffleMaskElts(Scale, Mask, ScaledMask);
  return true;
}

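/// Repeatedly widen the mask until no further scale factor applies, e.g.
/// <0, 1, 2, 3, 8, 9, 10, 11> widens by 2 to <0, 1, 4, 5> and then again
/// to <0, 2>.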
void llvm::getShuffleMaskWithWidestElts(ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &ScaledMask) {
  std::array<SmallVector<int, 16>, 2> TmpMasks;
  SmallVectorImpl<int> *Output = &TmpMasks[0], *Tmp = &TmpMasks[1];
  ArrayRef<int> InputMask = Mask;
  for (unsigned Scale = 2; Scale <= InputMask.size(); ++Scale) {
    while (widenShuffleMaskElts(Scale, InputMask, *Output)) {
      InputMask = *Output;
      std::swap(Output, Tmp);
    }
  }
  ScaledMask.assign(InputMask.begin(), InputMask.end());
}

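/// Split a shuffle of \p NumOfSrcRegs source registers into per-destination
/// sub-masks and invoke one of the callbacks for each destination register:
/// \p NoInputAction if no source register feeds it, \p SingleInputAction if
/// exactly one does, and \p ManyInputsAction (repeatedly, pairwise) if
/// several do.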
void llvm::processShuffleMasks(
    ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
    unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
    function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
    function_ref<void(ArrayRef<int>, unsigned, unsigned)> ManyInputsAction) {
  SmallVector<SmallVector<SmallVector<int>>> Res(NumOfDestRegs);
  // Try to perform better estimation of the permutation.
  // 1. Split the source/destination vectors into real registers.
  // 2. Do the mask analysis to identify which real registers are
  // permuted.
  int Sz = Mask.size();
  unsigned SzDest = Sz / NumOfDestRegs;
  unsigned SzSrc = Sz / NumOfSrcRegs;
  for (unsigned I = 0; I < NumOfDestRegs; ++I) {
    auto &RegMasks = Res[I];
    RegMasks.assign(NumOfSrcRegs, {});
    // Check that the values in dest registers are in the one src
    // register.
    for (unsigned K = 0; K < SzDest; ++K) {
      int Idx = I * SzDest + K;
      if (Idx == Sz)
        break;
      if (Mask[Idx] >= Sz || Mask[Idx] == PoisonMaskElem)
        continue;
      int SrcRegIdx = Mask[Idx] / SzSrc;
      // Add a cost of PermuteTwoSrc for each new source register permute,
      // if we have more than one source register.
      if (RegMasks[SrcRegIdx].empty())
        RegMasks[SrcRegIdx].assign(SzDest, PoisonMaskElem);
      RegMasks[SrcRegIdx][K] = Mask[Idx] % SzSrc;
    }
  }
  // Process split mask.
  for (unsigned I = 0; I < NumOfUsedRegs; ++I) {
    auto &Dest = Res[I];
    int NumSrcRegs =
        count_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
    switch (NumSrcRegs) {
    case 0:
      // No input vectors were used!
      NoInputAction();
      break;
    case 1: {
      // Find the only non-empty source register mask.
      auto *It =
          find_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
      unsigned SrcReg = std::distance(Dest.begin(), It);
      SingleInputAction(*It, SrcReg, I);
      break;
    }
    default: {
      // The first mask is a permutation of a single register. Since we have
      // two or more input registers to shuffle, we merge the masks for the
      // first 2 registers and generate a shuffle of 2 registers rather than
      // the reordering of the first register and then shuffle with the second
      // register. Next, generate the shuffles of the resulting register + the
      // remaining registers from the list.
      auto &&CombineMasks = [](MutableArrayRef<int> FirstMask,
                               ArrayRef<int> SecondMask) {
        for (int Idx = 0, VF = FirstMask.size(); Idx < VF; ++Idx) {
          if (SecondMask[Idx] != PoisonMaskElem) {
            assert(FirstMask[Idx] == PoisonMaskElem &&
                   "Expected undefined mask element.");
            FirstMask[Idx] = SecondMask[Idx] + VF;
          }
        }
      };
      auto &&NormalizeMask = [](MutableArrayRef<int> Mask) {
        for (int Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
          if (Mask[Idx] != PoisonMaskElem)
            Mask[Idx] = Idx;
        }
      };
      int SecondIdx;
      do {
        int FirstIdx = -1;
        SecondIdx = -1;
        MutableArrayRef<int> FirstMask, SecondMask;
        for (unsigned I = 0; I < NumOfDestRegs; ++I) {
          SmallVectorImpl<int> &RegMask = Dest[I];
          if (RegMask.empty())
            continue;

          if (FirstIdx == SecondIdx) {
            FirstIdx = I;
            FirstMask = RegMask;
            continue;
          }
          SecondIdx = I;
          SecondMask = RegMask;
          CombineMasks(FirstMask, SecondMask);
          ManyInputsAction(FirstMask, FirstIdx, SecondIdx);
          NormalizeMask(FirstMask);
          RegMask.clear();
          SecondMask = FirstMask;
          SecondIdx = FirstIdx;
        }
        if (FirstIdx != SecondIdx && SecondIdx >= 0) {
          CombineMasks(SecondMask, FirstMask);
          ManyInputsAction(SecondMask, SecondIdx, FirstIdx);
          Dest[FirstIdx].clear();
          NormalizeMask(SecondMask);
        }
      } while (SecondIdx >= 0);
      break;
    }
    }
  }
}

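/// Map the demanded elements of the result of a horizontal (pairwise) binop
/// back to its operands, 128 bits at a time: within each 128-bit lane, the
/// low half of the result elements maps to \p DemandedLHS and the high half
/// to \p DemandedRHS. Per the function's name, only the first element of
/// each horizontal source pair is marked.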
void llvm::getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth,
                                               const APInt &DemandedElts,
                                               APInt &DemandedLHS,
                                               APInt &DemandedRHS) {
  assert(VectorBitWidth >= 128 && "Vectors smaller than 128 bit not supported");
  int NumLanes = VectorBitWidth / 128;
  int NumElts = DemandedElts.getBitWidth();
  int NumEltsPerLane = NumElts / NumLanes;
  int HalfEltsPerLane = NumEltsPerLane / 2;

  DemandedLHS = APInt::getZero(NumElts);
  DemandedRHS = APInt::getZero(NumElts);

  // Map DemandedElts to the horizontal operands.
  for (int Idx = 0; Idx != NumElts; ++Idx) {
    if (!DemandedElts[Idx])
      continue;
    int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
    int LocalIdx = Idx % NumEltsPerLane;
    if (LocalIdx < HalfEltsPerLane) {
      DemandedLHS.setBit(LaneIdx + 2 * LocalIdx);
    } else {
      LocalIdx -= HalfEltsPerLane;
      DemandedRHS.setBit(LaneIdx + 2 * LocalIdx);
    }
  }
}

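/// Compute, for each integer instruction reachable from a trunc or icmp in
/// \p Blocks, the minimum bit width at which it can legally be executed, by
/// unioning DemandedBits over equivalence classes of connected values.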
MapVector<Instruction *, uint64_t>
llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
                               const TargetTransformInfo *TTI) {

  // DemandedBits will give us every value's live-out bits. But we want
  // to ensure no extra casts would need to be inserted, so every DAG
  // of connected values must have the same minimum bitwidth.
  EquivalenceClasses<Value *> ECs;
  SmallVector<Value *, 16> Worklist;
  SmallPtrSet<Value *, 4> Roots;
  SmallPtrSet<Value *, 16> Visited;
  DenseMap<Value *, uint64_t> DBits;
  SmallPtrSet<Instruction *, 4> InstructionSet;
  MapVector<Instruction *, uint64_t> MinBWs;

  // Determine the roots. We work bottom-up, from truncs or icmps.
  bool SeenExtFromIllegalType = false;
  for (auto *BB : Blocks)
    for (auto &I : *BB) {
      InstructionSet.insert(&I);

      if (TTI && (isa<ZExtInst>(&I) || isa<SExtInst>(&I)) &&
          !TTI->isTypeLegal(I.getOperand(0)->getType()))
        SeenExtFromIllegalType = true;

      // Only deal with non-vector integers up to 64-bits wide.
      if ((isa<TruncInst>(&I) || isa<ICmpInst>(&I)) &&
          !I.getType()->isVectorTy() &&
          I.getOperand(0)->getType()->getScalarSizeInBits() <= 64) {
        // Don't make work for ourselves. If we know the loaded type is legal,
        // don't add it to the worklist.
        if (TTI && isa<TruncInst>(&I) && TTI->isTypeLegal(I.getType()))
          continue;

        Worklist.push_back(&I);
        Roots.insert(&I);
      }
    }
  // Early exit.
  if (Worklist.empty() || (TTI && !SeenExtFromIllegalType))
    return MinBWs;

  // Now proceed breadth-first, unioning values together.
  while (!Worklist.empty()) {
    Value *Val = Worklist.pop_back_val();
    Value *Leader = ECs.getOrInsertLeaderValue(Val);

    if (!Visited.insert(Val).second)
      continue;

    // Non-instructions terminate a chain successfully.
    if (!isa<Instruction>(Val))
      continue;
    Instruction *I = cast<Instruction>(Val);

    // If we encounter a type that is larger than 64 bits, we can't represent
    // it so bail out.
    if (DB.getDemandedBits(I).getBitWidth() > 64)
      return MapVector<Instruction *, uint64_t>();

    uint64_t V = DB.getDemandedBits(I).getZExtValue();
    DBits[Leader] |= V;
    DBits[I] = V;

    // Casts, loads and instructions outside of our range terminate a chain
    // successfully.
    if (isa<SExtInst>(I) || isa<ZExtInst>(I) || isa<LoadInst>(I) ||
        !InstructionSet.count(I))
      continue;

    // Unsafe casts terminate a chain unsuccessfully. We can't do anything
    // useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to
    // transform anything that relies on them.
    if (isa<BitCastInst>(I) || isa<PtrToIntInst>(I) || isa<IntToPtrInst>(I) ||
        !I->getType()->isIntegerTy()) {
      DBits[Leader] |= ~0ULL;
      continue;
    }

    // We don't modify the types of PHIs. Reductions will already have been
    // truncated if possible, and inductions' sizes will have been chosen by
    // indvars.
    if (isa<PHINode>(I))
      continue;

    if (DBits[Leader] == ~0ULL)
      // All bits demanded, no point continuing.
      continue;

    for (Value *O : cast<User>(I)->operands()) {
      ECs.unionSets(Leader, O);
      Worklist.push_back(O);
    }
  }

  // Now we've discovered all values, walk them to see if there are
  // any users we didn't see. If there are, we can't optimize that
  // chain.
  for (auto &I : DBits)
    for (auto *U : I.first->users())
      if (U->getType()->isIntegerTy() && DBits.count(U) == 0)
        DBits[ECs.getOrInsertLeaderValue(I.first)] |= ~0ULL;

  for (auto I = ECs.begin(), E = ECs.end(); I != E; ++I) {
    uint64_t LeaderDemandedBits = 0;
    for (Value *M : llvm::make_range(ECs.member_begin(I), ECs.member_end()))
      LeaderDemandedBits |= DBits[M];

    uint64_t MinBW = llvm::bit_width(LeaderDemandedBits);
    // Round up to a power of 2.
    MinBW = llvm::bit_ceil(MinBW);

    // We don't modify the types of PHIs. Reductions will already have been
    // truncated if possible, and inductions' sizes will have been chosen by
    // indvars.
    // If we are required to shrink a PHI, abandon this entire equivalence
    // class.
    bool Abort = false;
    for (Value *M : llvm::make_range(ECs.member_begin(I), ECs.member_end()))
      if (isa<PHINode>(M) && MinBW < M->getType()->getScalarSizeInBits()) {
        Abort = true;
        break;
      }
    if (Abort)
      continue;

    for (Value *M : llvm::make_range(ECs.member_begin(I), ECs.member_end())) {
      auto *MI = dyn_cast<Instruction>(M);
      if (!MI)
        continue;
      Type *Ty = M->getType();
      if (Roots.count(M))
        Ty = MI->getOperand(0)->getType();

      if (MinBW >= Ty->getScalarSizeInBits())
        continue;

      // If any of M's operands demand more bits than MinBW then M cannot be
      // performed safely in MinBW.
      if (any_of(MI->operands(), [&DB, MinBW](Use &U) {
            auto *CI = dyn_cast<ConstantInt>(U);
            // For constant shift amounts, check if the shift would result in
            // poison.
            if (CI &&
                isa<ShlOperator, LShrOperator, AShrOperator>(U.getUser()) &&
                U.getOperandNo() == 1)
              return CI->uge(MinBW);
            uint64_t BW = bit_width(DB.getDemandedBits(&U).getZExtValue());
            return bit_ceil(BW) > MinBW;
          }))
        continue;

      MinBWs[MI] = MinBW;
    }
  }

  return MinBWs;
}

/// Add all access groups in @p AccGroups to @p List.
template <typename ListT>
static void addToAccessGroupList(ListT &List, MDNode *AccGroups) {
  // Interpret an access group as a list containing itself.
  if (AccGroups->getNumOperands() == 0) {
    assert(isValidAsAccessGroup(AccGroups) && "Node must be an access group");
    List.insert(AccGroups);
    return;
  }

  for (const auto &AccGroupListOp : AccGroups->operands()) {
    auto *Item = cast<MDNode>(AccGroupListOp.get());
    assert(isValidAsAccessGroup(Item) && "List item must be an access group");
    List.insert(Item);
  }
}

MDNode *llvm::uniteAccessGroups(MDNode *AccGroups1, MDNode *AccGroups2) {
  if (!AccGroups1)
    return AccGroups2;
  if (!AccGroups2)
    return AccGroups1;
  if (AccGroups1 == AccGroups2)
    return AccGroups1;

  SmallSetVector<Metadata *, 4> Union;
  addToAccessGroupList(Union, AccGroups1);
  addToAccessGroupList(Union, AccGroups2);

  if (Union.size() == 0)
    return nullptr;
  if (Union.size() == 1)
    return cast<MDNode>(Union.front());

  LLVMContext &Ctx = AccGroups1->getContext();
  return MDNode::get(Ctx, Union.getArrayRef());
}

MDNode *llvm::intersectAccessGroups(const Instruction *Inst1,
                                    const Instruction *Inst2) {
  bool MayAccessMem1 = Inst1->mayReadOrWriteMemory();
  bool MayAccessMem2 = Inst2->mayReadOrWriteMemory();

  if (!MayAccessMem1 && !MayAccessMem2)
    return nullptr;
  if (!MayAccessMem1)
    return Inst2->getMetadata(LLVMContext::MD_access_group);
  if (!MayAccessMem2)
    return Inst1->getMetadata(LLVMContext::MD_access_group);

  MDNode *MD1 = Inst1->getMetadata(LLVMContext::MD_access_group);
  MDNode *MD2 = Inst2->getMetadata(LLVMContext::MD_access_group);
  if (!MD1 || !MD2)
    return nullptr;
  if (MD1 == MD2)
    return MD1;

  // Use set for scalable 'contains' check.
  SmallPtrSet<Metadata *, 4> AccGroupSet2;
  addToAccessGroupList(AccGroupSet2, MD2);

  SmallVector<Metadata *, 4> Intersection;
  if (MD1->getNumOperands() == 0) {
    assert(isValidAsAccessGroup(MD1) && "Node must be an access group");
    if (AccGroupSet2.count(MD1))
      Intersection.push_back(MD1);
  } else {
    for (const MDOperand &Node : MD1->operands()) {
      auto *Item = cast<MDNode>(Node.get());
      assert(isValidAsAccessGroup(Item) && "List item must be an access group");
      if (AccGroupSet2.count(Item))
        Intersection.push_back(Item);
    }
  }

  if (Intersection.size() == 0)
    return nullptr;
  if (Intersection.size() == 1)
    return cast<MDNode>(Intersection.front());

  LLVMContext &Ctx = Inst1->getContext();
  return MDNode::get(Ctx, Intersection);
}

/// \returns \p Inst after propagating metadata from \p VL.
Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
  if (VL.empty())
    return Inst;
  Instruction *I0 = cast<Instruction>(VL[0]);
  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  I0->getAllMetadataOtherThanDebugLoc(Metadata);

  for (auto Kind : {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
                    LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
                    LLVMContext::MD_nontemporal, LLVMContext::MD_invariant_load,
                    LLVMContext::MD_access_group, LLVMContext::MD_mmra}) {
    MDNode *MD = I0->getMetadata(Kind);
    for (int J = 1, E = VL.size(); MD && J != E; ++J) {
      const Instruction *IJ = cast<Instruction>(VL[J]);
      MDNode *IMD = IJ->getMetadata(Kind);

      switch (Kind) {
      case LLVMContext::MD_mmra: {
        MD = MMRAMetadata::combine(Inst->getContext(), MD, IMD);
        break;
      }
      case LLVMContext::MD_tbaa:
        MD = MDNode::getMostGenericTBAA(MD, IMD);
        break;
      case LLVMContext::MD_alias_scope:
        MD = MDNode::getMostGenericAliasScope(MD, IMD);
        break;
      case LLVMContext::MD_fpmath:
        MD = MDNode::getMostGenericFPMath(MD, IMD);
        break;
      case LLVMContext::MD_noalias:
      case LLVMContext::MD_nontemporal:
      case LLVMContext::MD_invariant_load:
        MD = MDNode::intersect(MD, IMD);
        break;
      case LLVMContext::MD_access_group:
        MD = intersectAccessGroups(Inst, IJ);
        break;
      default:
        llvm_unreachable("unhandled metadata");
      }
    }

    Inst->setMetadata(Kind, MD);
  }

  return Inst;
}

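/// Build the i1 gap mask for an interleave group that has missing members.
/// For example, a factor-4 group with members only at indices 0 and 2 and
/// VF = 2 yields <1, 0, 1, 0, 1, 0, 1, 0>.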
Constant *
llvm::createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF,
                           const InterleaveGroup<Instruction> &Group) {
  // All 1's means mask is not needed.
  if (Group.getNumMembers() == Group.getFactor())
    return nullptr;

  // TODO: support reversed access.
  assert(!Group.isReverse() && "Reversed group not supported.");

  SmallVector<Constant *, 16> Mask;
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < Group.getFactor(); ++j) {
      unsigned HasMember = Group.getMember(j) ? 1 : 0;
      Mask.push_back(Builder.getInt1(HasMember));
    }

  return ConstantVector::get(Mask);
}

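/// e.g. createReplicatedMask(3, 4) returns <0,0,0,1,1,1,2,2,2,3,3,3>:
/// each of the 4 lanes is replicated 3 times.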
llvm::SmallVector<int, 16>
llvm::createReplicatedMask(unsigned ReplicationFactor, unsigned VF) {
  SmallVector<int, 16> MaskVec;
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < ReplicationFactor; j++)
      MaskVec.push_back(i);

  return MaskVec;
}

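/// e.g. createInterleaveMask(4, 2) returns <0, 4, 1, 5, 2, 6, 3, 7>,
/// interleaving the lanes of two concatenated 4-element vectors.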
llvm::SmallVector<int, 16> llvm::createInterleaveMask(unsigned VF,
                                                      unsigned NumVecs) {
  SmallVector<int, 16> Mask;
  for (unsigned i = 0; i < VF; i++)
    for (unsigned j = 0; j < NumVecs; j++)
      Mask.push_back(j * VF + i);

  return Mask;
}

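/// e.g. createStrideMask(0, 2, 4) returns <0, 2, 4, 6>, selecting every
/// second element starting at index 0.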
llvm::SmallVector<int, 16>
llvm::createStrideMask(unsigned Start, unsigned Stride, unsigned VF) {
  SmallVector<int, 16> Mask;
  for (unsigned i = 0; i < VF; i++)
    Mask.push_back(Start + i * Stride);

  return Mask;
}

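/// e.g. createSequentialMask(0, 4, 2) returns <0, 1, 2, 3, -1, -1>: four
/// consecutive elements followed by two undef sentinels.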
llvm::SmallVector<int, 16> llvm::createSequentialMask(unsigned Start,
                                                      unsigned NumInts,
                                                      unsigned NumUndefs) {
  SmallVector<int, 16> Mask;
  for (unsigned i = 0; i < NumInts; i++)
    Mask.push_back(Start + i);

  for (unsigned i = 0; i < NumUndefs; i++)
    Mask.push_back(-1);

  return Mask;
}

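/// e.g. for NumElts = 4, the two-operand mask <0, 5, 2, 7> becomes the
/// single-operand mask <0, 1, 2, 3>.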
llvm::SmallVector<int, 16> llvm::createUnaryMask(ArrayRef<int> Mask,
                                                 unsigned NumElts) {
  // Avoid casts in the loop and make sure we have a reasonable number.
  int NumEltsSigned = NumElts;
  assert(NumEltsSigned > 0 && "Expected smaller or non-zero element count");

  // If the mask chooses an element from operand 1, reduce it to choose from the
  // corresponding element of operand 0. Undef mask elements are unchanged.
  SmallVector<int, 16> UnaryMask;
  for (int MaskElt : Mask) {
    assert((MaskElt < NumEltsSigned * 2) && "Expected valid shuffle mask");
    int UnaryElt = MaskElt >= NumEltsSigned ? MaskElt - NumEltsSigned : MaskElt;
    UnaryMask.push_back(UnaryElt);
  }
  return UnaryMask;
}

/// A helper function for concatenating vectors. This function concatenates two
/// vectors having the same element type. If the second vector has fewer
/// elements than the first, it is padded with undefs.
static Value *concatenateTwoVectors(IRBuilderBase &Builder, Value *V1,
                                    Value *V2) {
  VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
  VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
  assert(VecTy1 && VecTy2 &&
         VecTy1->getScalarType() == VecTy2->getScalarType() &&
         "Expect two vectors with the same element type");

  unsigned NumElts1 = cast<FixedVectorType>(VecTy1)->getNumElements();
  unsigned NumElts2 = cast<FixedVectorType>(VecTy2)->getNumElements();
  assert(NumElts1 >= NumElts2 &&
         "Expect the first vector to have at least as many elements");

  if (NumElts1 > NumElts2) {
    // Extend with UNDEFs.
    V2 = Builder.CreateShuffleVector(
        V2, createSequentialMask(0, NumElts2, NumElts1 - NumElts2));
  }

  return Builder.CreateShuffleVector(
      V1, V2, createSequentialMask(0, NumElts1 + NumElts2, 0));
}

Value *llvm::concatenateVectors(IRBuilderBase &Builder,
                                ArrayRef<Value *> Vecs) {
  unsigned NumVecs = Vecs.size();
  assert(NumVecs > 1 && "Should be at least two vectors");

  SmallVector<Value *, 8> ResList;
  ResList.append(Vecs.begin(), Vecs.end());
  do {
    SmallVector<Value *, 8> TmpList;
    for (unsigned i = 0; i < NumVecs - 1; i += 2) {
      Value *V0 = ResList[i], *V1 = ResList[i + 1];
      assert((V0->getType() == V1->getType() || i == NumVecs - 2) &&
             "Only the last vector may have a different type");

      TmpList.push_back(concatenateTwoVectors(Builder, V0, V1));
    }

    // Push the last vector if the total number of vectors is odd.
    if (NumVecs % 2 != 0)
      TmpList.push_back(ResList[NumVecs - 1]);

    ResList = TmpList;
    NumVecs = ResList.size();
  } while (NumVecs > 1);

  return ResList[0];
}

bool llvm::maskIsAllZeroOrUndef(Value *Mask) {
  assert(isa<VectorType>(Mask->getType()) &&
         isa<IntegerType>(Mask->getType()->getScalarType()) &&
         cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
             1 &&
         "Mask must be a vector of i1");

  auto *ConstMask = dyn_cast<Constant>(Mask);
  if (!ConstMask)
    return false;
  if (ConstMask->isNullValue() || isa<UndefValue>(ConstMask))
    return true;
  if (isa<ScalableVectorType>(ConstMask->getType()))
    return false;
  for (unsigned
           I = 0,
           E = cast<FixedVectorType>(ConstMask->getType())->getNumElements();
       I != E; ++I) {
    if (auto *MaskElt = ConstMask->getAggregateElement(I))
      if (MaskElt->isNullValue() || isa<UndefValue>(MaskElt))
        continue;
    return false;
  }
  return true;
}

bool llvm::maskIsAllOneOrUndef(Value *Mask) {
  assert(isa<VectorType>(Mask->getType()) &&
         isa<IntegerType>(Mask->getType()->getScalarType()) &&
         cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
             1 &&
         "Mask must be a vector of i1");

  auto *ConstMask = dyn_cast<Constant>(Mask);
  if (!ConstMask)
    return false;
  if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
    return true;
  if (isa<ScalableVectorType>(ConstMask->getType()))
    return false;
  for (unsigned
           I = 0,
           E = cast<FixedVectorType>(ConstMask->getType())->getNumElements();
       I != E; ++I) {
    if (auto *MaskElt = ConstMask->getAggregateElement(I))
      if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
        continue;
    return false;
  }
  return true;
}

bool llvm::maskContainsAllOneOrUndef(Value *Mask) {
  assert(isa<VectorType>(Mask->getType()) &&
         isa<IntegerType>(Mask->getType()->getScalarType()) &&
         cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
             1 &&
         "Mask must be a vector of i1");

  auto *ConstMask = dyn_cast<Constant>(Mask);
  if (!ConstMask)
    return false;
  if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
    return true;
  if (isa<ScalableVectorType>(ConstMask->getType()))
    return false;
  for (unsigned
           I = 0,
           E = cast<FixedVectorType>(ConstMask->getType())->getNumElements();
       I != E; ++I) {
    if (auto *MaskElt = ConstMask->getAggregateElement(I))
      if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
        return true;
  }
  return false;
}

/// TODO: This is a lot like known bits, but for
/// vectors. Is there something we can common this with?
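/// e.g. for <i1 1, i1 0, i1 1, i1 1> the result is 0b1101: element 1 can
/// never be demanded since its mask bit is a constant zero.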
APInt llvm::possiblyDemandedEltsInMask(Value *Mask) {
  assert(isa<FixedVectorType>(Mask->getType()) &&
         isa<IntegerType>(Mask->getType()->getScalarType()) &&
         cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
             1 &&
         "Mask must be a fixed width vector of i1");

  const unsigned VWidth =
      cast<FixedVectorType>(Mask->getType())->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);
  if (auto *CV = dyn_cast<ConstantVector>(Mask))
    for (unsigned i = 0; i < VWidth; i++)
      if (CV->getAggregateElement(i)->isNullValue())
        DemandedElts.clearBit(i);
  return DemandedElts;
}

bool InterleavedAccessInfo::isStrided(int Stride) {
  unsigned Factor = std::abs(Stride);
  return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
}

void InterleavedAccessInfo::collectConstStrideAccesses(
    MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
    const DenseMap<Value *, const SCEV *> &Strides) {
  auto &DL = TheLoop->getHeader()->getDataLayout();

  // Since it's desired that the load/store instructions be maintained in
  // "program order" for the interleaved access analysis, we have to visit the
  // blocks in the loop in reverse postorder (i.e., in a topological order).
  // Such an ordering will ensure that any load/store that may be executed
  // before a second load/store will precede the second load/store in
  // AccessStrideInfo.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
    for (auto &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;
      Type *ElementTy = getLoadStoreType(&I);

      // Currently, codegen doesn't support cases where the type size doesn't
      // match the alloc size. Skip them for now.
      uint64_t Size = DL.getTypeAllocSize(ElementTy);
      if (Size * 8 != DL.getTypeSizeInBits(ElementTy))
        continue;

      // We don't check wrapping here because we don't know yet if Ptr will be
      // part of a full group or a group with gaps. Checking wrapping for all
      // pointers (even those that end up in groups with no gaps) will be overly
      // conservative. For full groups, wrapping should be ok since if we would
      // wrap around the address space we would do a memory access at nullptr
      // even without the transformation. The wrapping checks are therefore
      // deferred until after we've formed the interleaved groups.
      int64_t Stride =
          getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides,
                       /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0);

      const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
      AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size,
                                              getLoadStoreAlignment(&I));
    }
}

// Analyze interleaved accesses and collect them into interleaved load and
// store groups.
//
// When generating code for an interleaved load group, we effectively hoist all
// loads in the group to the location of the first load in program order. When
// generating code for an interleaved store group, we sink all stores to the
// location of the last store. This code motion can change the order of load
// and store instructions and may break dependences.
//
// The code generation strategy mentioned above ensures that we won't violate
// any write-after-read (WAR) dependences.
//
// E.g., for the WAR dependence:  a = A[i];      // (1)
//                                A[i] = b;      // (2)
//
// The store group of (2) is always inserted at or below (2), and the load
// group of (1) is always inserted at or above (1). Thus, the instructions will
// never be reordered. All other dependences are checked to ensure the
// correctness of the instruction reordering.
//
// The algorithm visits all memory accesses in the loop in bottom-up program
// order. Program order is established by traversing the blocks in the loop in
// reverse postorder when collecting the accesses.
//
// We visit the memory accesses in bottom-up order because it can simplify the
// construction of store groups in the presence of write-after-write (WAW)
// dependences.
//
// E.g., for the WAW dependence:  A[i] = a;      // (1)
//                                A[i] = b;      // (2)
//                                A[i + 1] = c;  // (3)
//
// We will first create a store group with (3) and (2). (1) can't be added to
// this group because it and (2) are dependent. However, (1) can be grouped
// with other accesses that may precede it in program order. Note that a
// bottom-up order does not imply that WAW dependences should not be checked.
1202 | void InterleavedAccessInfo::analyzeInterleaving( |
1203 | bool EnablePredicatedInterleavedMemAccesses) { |
1204 | LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n" ); |
1205 | const auto &Strides = LAI->getSymbolicStrides(); |
1206 | |
1207 | // Holds all accesses with a constant stride. |
1208 | MapVector<Instruction *, StrideDescriptor> AccessStrideInfo; |
1209 | collectConstStrideAccesses(AccessStrideInfo, Strides); |
1210 | |
1211 | if (AccessStrideInfo.empty()) |
1212 | return; |
1213 | |
1214 | // Collect the dependences in the loop. |
1215 | collectDependences(); |
1216 | |
1217 | // Holds all interleaved store groups temporarily. |
1218 | SmallSetVector<InterleaveGroup<Instruction> *, 4> StoreGroups; |
1219 | // Holds all interleaved load groups temporarily. |
1220 | SmallSetVector<InterleaveGroup<Instruction> *, 4> LoadGroups; |
1221 | // Groups added to this set cannot have new members added. |
1222 | SmallPtrSet<InterleaveGroup<Instruction> *, 4> CompletedLoadGroups; |
1223 | |
1224 | // Search in bottom-up program order for pairs of accesses (A and B) that can |
1225 | // form interleaved load or store groups. In the algorithm below, access A |
1226 | // precedes access B in program order. We initialize a group for B in the |
1227 | // outer loop of the algorithm, and then in the inner loop, we attempt to |
1228 | // insert each A into B's group if: |
1229 | // |
1230 | // 1. A and B have the same stride, |
1231 | // 2. A and B have the same memory object size, and |
1232 | // 3. A belongs in B's group according to its distance from B. |
1233 | // |
1234 | // Special care is taken to ensure group formation will not break any |
1235 | // dependences. |
1236 | for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend(); |
1237 | BI != E; ++BI) { |
1238 | Instruction *B = BI->first; |
1239 | StrideDescriptor DesB = BI->second; |
1240 | |
1241 | // Initialize a group for B if it has an allowable stride. Even if we don't |
1242 | // create a group for B, we continue with the bottom-up algorithm to ensure |
1243 | // we don't break any of B's dependences. |
1244 | InterleaveGroup<Instruction> *GroupB = nullptr; |
1245 | if (isStrided(Stride: DesB.Stride) && |
1246 | (!isPredicated(BB: B->getParent()) || EnablePredicatedInterleavedMemAccesses)) { |
1247 | GroupB = getInterleaveGroup(Instr: B); |
1248 | if (!GroupB) { |
1249 | LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B |
1250 | << '\n'); |
1251 | GroupB = createInterleaveGroup(Instr: B, Stride: DesB.Stride, Alignment: DesB.Alignment); |
1252 | if (B->mayWriteToMemory()) |
1253 | StoreGroups.insert(X: GroupB); |
1254 | else |
1255 | LoadGroups.insert(X: GroupB); |
1256 | } |
1257 | } |
1258 | |
1259 | for (auto AI = std::next(x: BI); AI != E; ++AI) { |
1260 | Instruction *A = AI->first; |
1261 | StrideDescriptor DesA = AI->second; |
1262 | |
1263 | // Our code motion strategy implies that we can't have dependences |
1264 | // between accesses in an interleaved group and other accesses located |
1265 | // between the first and last member of the group. Note that this also |
1266 | // means that a group can't have more than one member at a given offset. |
1267 | // The accesses in a group can have dependences with other accesses, but |
1268 | // we must ensure we don't extend the boundaries of the group such that |
1269 | // we encompass those dependent accesses. |
1270 | // |
1271 | // For example, assume we have the sequence of accesses shown below in a |
1272 | // stride-2 loop: |
1273 | // |
1274 | // (1, 2) is a group | A[i] = a; // (1) |
1275 | // | A[i-1] = b; // (2) | |
1276 | // A[i-3] = c; // (3) |
1277 | // A[i] = d; // (4) | (2, 4) is not a group |
1278 | // |
1279 | // Because accesses (2) and (3) are dependent, we can group (2) with (1) |
1280 | // but not with (4). If we did, the dependent access (3) would be within |
1281 | // the boundaries of the (2, 4) group. |
1282 | auto DependentMember = [&](InterleaveGroup<Instruction> *Group, |
1283 | StrideEntry *A) -> Instruction * { |
1284 | for (uint32_t Index = 0; Index < Group->getFactor(); ++Index) { |
1285 | Instruction *MemberOfGroupB = Group->getMember(Index); |
1286 | if (MemberOfGroupB && !canReorderMemAccessesForInterleavedGroups( |
1287 | A, B: &*AccessStrideInfo.find(Key: MemberOfGroupB))) |
1288 | return MemberOfGroupB; |
1289 | } |
1290 | return nullptr; |
1291 | }; |
1292 | |
1293 | auto GroupA = getInterleaveGroup(Instr: A); |
1294 | // If A is a load, dependencies are tolerable, there's nothing to do here. |
1295 | // If both A and B belong to the same (store) group, they are independent, |
1296 | // even if dependencies have not been recorded. |
1297 | // If both GroupA and GroupB are null, there's nothing to do here. |
1298 | if (A->mayWriteToMemory() && GroupA != GroupB) { |
1299 | Instruction *DependentInst = nullptr; |
1300 | // If GroupB is a load group, we have to compare AI against all |
1301 | // members of GroupB because if any load within GroupB has a dependency |
1302 | // on AI, we need to mark GroupB as complete and also release the |
1303 | // store GroupA (if A belongs to one). The former prevents incorrect |
1304 | // hoisting of load B above store A while the latter prevents incorrect |
1305 | // sinking of store A below load B. |
1306 | if (GroupB && LoadGroups.contains(key: GroupB)) |
1307 | DependentInst = DependentMember(GroupB, &*AI); |
1308 | else if (!canReorderMemAccessesForInterleavedGroups(A: &*AI, B: &*BI)) |
1309 | DependentInst = B; |
1310 | |
1311 | if (DependentInst) { |
1312 | // A has a store dependence on B (or on some load within GroupB) and |
1313 | // is part of a store group. Release A's group to prevent illegal |
1314 | // sinking of A below B. A will then be free to form another group |
1315 | // with instructions that precede it. |
          if (GroupA && StoreGroups.contains(GroupA)) {
            LLVM_DEBUG(dbgs() << "LV: Invalidated store group due to "
                                 "dependence between "
                              << *A << " and " << *DependentInst << '\n');
            StoreGroups.remove(GroupA);
            releaseGroup(GroupA);
          }
          // If B is a load and part of an interleave group, no earlier loads
          // can be added to B's interleave group, because this would mean the
          // DependentInst would move across store A. Mark the interleave group
          // as complete.
          if (GroupB && LoadGroups.contains(GroupB)) {
            LLVM_DEBUG(dbgs() << "LV: Marking interleave group for " << *B
                              << " as complete.\n");
            CompletedLoadGroups.insert(GroupB);
          }
        }
      }
      if (CompletedLoadGroups.contains(GroupB)) {
        // Skip trying to add A to B; continue to look for other conflicting
        // A's in groups to be released.
        continue;
      }

      // At this point, we've checked for illegal code motion. If either A or B
      // isn't strided, there's nothing left to do.
      if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
        continue;

      // Ignore A if it's already in a group or isn't the same kind of memory
      // operation as B.
      // Note that mayReadFromMemory() isn't mutually exclusive with
      // mayWriteToMemory() in the case of atomic loads. We shouldn't see those
      // here; canVectorizeMemory() should have returned false, except in the
      // case where we asked for optimization remarks.
      if (isInterleaved(A) ||
          (A->mayReadFromMemory() != B->mayReadFromMemory()) ||
          (A->mayWriteToMemory() != B->mayWriteToMemory()))
        continue;

      // Check rules 1 and 2. Ignore A if its stride or size is different from
      // that of B.
      if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
        continue;

      // Ignore A if the memory objects of A and B don't belong to the same
      // address space.
      if (getLoadStoreAddressSpace(A) != getLoadStoreAddressSpace(B))
        continue;

      // Calculate the distance from A to B.
      const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
          PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
      if (!DistToB)
        continue;
      int64_t DistanceToB = DistToB->getAPInt().getSExtValue();

      // Check rule 3. Ignore A if its distance to B is not a multiple of the
      // size.
      if (DistanceToB % static_cast<int64_t>(DesB.Size))
        continue;
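      // For example, with hypothetical values DesB.Size == 4 and
      // DistanceToB == -8, the distance is exactly -2 elements, so A can
      // occupy a slot in B's group; a distance of -6 would straddle element
      // boundaries and A would be skipped here.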

      // All members of a predicated interleave-group must have the same
      // predicate, and currently must reside in the same BB.
      BasicBlock *BlockA = A->getParent();
      BasicBlock *BlockB = B->getParent();
      if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
          (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))
        continue;

      // The index of A is the index of B plus A's distance to B in multiples
      // of the size.
      int IndexA =
          GroupB->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);
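      // For example (hypothetical): if B is at index 2, DesB.Size == 4, and
      // DistanceToB == -4, then A is one element before B and
      // IndexA == 2 + (-4 / 4) == 1.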

      // Try to insert A into B's group.
      if (GroupB->insertMember(A, IndexA, DesA.Alignment)) {
        LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
                          << "    into the interleave group with" << *B
                          << '\n');
        InterleaveGroupMap[A] = GroupB;

        // Set the first load in program order as the insert position.
        if (A->mayReadFromMemory())
          GroupB->setInsertPos(A);
      }
    } // Iteration over A accesses.
  } // Iteration over B accesses.

  auto InvalidateGroupIfMemberMayWrap = [&](InterleaveGroup<Instruction> *Group,
                                            int Index,
                                            std::string FirstOrLast) -> bool {
    Instruction *Member = Group->getMember(Index);
    assert(Member && "Group member does not exist");
    Value *MemberPtr = getLoadStorePointerOperand(Member);
    Type *AccessTy = getLoadStoreType(Member);
    if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides,
                     /*Assume=*/false, /*ShouldCheckWrap=*/true)
            .value_or(0))
      return false;
    LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
                      << FirstOrLast
                      << " group member potentially pointer-wrapping.\n");
    releaseGroup(Group);
    return true;
  };

  // Remove interleaved groups with gaps whose memory accesses may wrap
  // around. We have to revisit the getPtrStride analysis, this time with
  // ShouldCheckWrap=true, since collectConstStrideAccesses does not check
  // wrapping (see documentation there).
  // FORNOW we use Assume=false;
  // TODO: Change to Assume=true, but make sure we don't exceed the threshold
  // of runtime SCEV assumption checks (thereby potentially failing to
  // vectorize altogether).
  // Additional optional optimizations:
  // TODO: If we are peeling the loop and we know that the first pointer
  // doesn't wrap, then we can deduce that all pointers in the group don't
  // wrap. This means that we can forcefully peel the loop in order to only
  // have to check the first pointer for no-wrap. Once we change to
  // Assume=true, we'll only need at most one runtime check per interleaved
  // group.
  for (auto *Group : LoadGroups) {
    // Case 1: A full group. We can skip the checks; for full groups, if the
    // wide load would wrap around the address space, we would do a memory
    // access at nullptr even without the transformation.
    if (Group->getNumMembers() == Group->getFactor())
      continue;

    // Case 2: If the first and last members of the group don't wrap, this
    // implies that all the pointers in the group don't wrap. So we check only
    // group member 0 (which is always guaranteed to exist) and group member
    // Factor - 1; if the latter doesn't exist, we rely on peeling (if it is a
    // non-reversed access -- see Case 3).
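    // For example (hypothetical): a factor-4 load group with members at
    // indices {0, 1, 3} is bounded by members 0 and 3, so proving that those
    // two pointers don't wrap covers every pointer in the group.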
    if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first")))
      continue;
    if (Group->getMember(Group->getFactor() - 1))
      InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1,
                                     std::string("last"));
    else {
      // Case 3: A non-reversed interleaved load group with gaps: We need
      // to execute at least one scalar epilogue iteration. This will ensure
      // we don't speculatively access memory out-of-bounds. We only need
      // to look for a member at index factor - 1, since every group must have
      // a member at index zero.
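      // For example (hypothetical): a factor-2 group {L0, <gap>} is emitted
      // as a single wide load spanning both slots; on the last vector
      // iteration the gap slot reads past the final element accessed by L0,
      // which is only safe if that iteration runs in the scalar epilogue.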
      if (Group->isReverse()) {
        LLVM_DEBUG(
            dbgs() << "LV: Invalidate candidate interleaved group due to "
                      "a reverse access with gaps.\n");
        releaseGroup(Group);
        continue;
      }
      LLVM_DEBUG(
          dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
      RequiresScalarEpilogue = true;
    }
  }

  for (auto *Group : StoreGroups) {
    // Case 1: A full group. We can skip the checks; for full groups, if the
    // wide store would wrap around the address space, we would do a memory
    // access at nullptr even without the transformation.
    if (Group->getNumMembers() == Group->getFactor())
      continue;

    // Interleave-store-group with gaps is implemented using a masked wide
    // store. Remove interleaved store groups with gaps if
    // masked-interleaved-accesses are not enabled by the target.
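    // For example (hypothetical): a factor-2 store group {S0, <gap>} becomes
    // a wide store whose mask enables only S0's lanes; without target support
    // for masking, the lanes of the gap would be clobbered.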
    if (!EnablePredicatedInterleavedMemAccesses) {
      LLVM_DEBUG(
          dbgs() << "LV: Invalidate candidate interleaved store group due "
                    "to gaps.\n");
      releaseGroup(Group);
      continue;
    }

    // Case 2: If the first and last members of the group don't wrap, this
    // implies that all the pointers in the group don't wrap. So we check only
    // group member 0 (which is always guaranteed to exist) and the last group
    // member. Case 3 (scalar epilogue) is not relevant for stores with gaps,
    // which are implemented with a masked store (rather than speculative
    // access, as in loads).
    if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first")))
      continue;
    for (int Index = Group->getFactor() - 1; Index > 0; Index--)
      if (Group->getMember(Index)) {
        InvalidateGroupIfMemberMayWrap(Group, Index, std::string("last"));
        break;
      }
  }
}

void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
  // If no group had triggered the requirement to create an epilogue loop,
  // there is nothing to do.
  if (!requiresScalarEpilogue())
    return;

  // Release groups requiring scalar epilogues. Note that this also removes
  // them from InterleaveGroups.
  bool ReleasedGroup = InterleaveGroups.remove_if([&](auto *Group) {
    if (!Group->requiresScalarEpilogue())
      return false;
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate candidate interleaved group due to gaps that "
           "require a scalar epilogue (not allowed under optsize) and cannot "
           "be masked (not enabled).\n");
    releaseGroupWithoutRemovingFromSet(Group);
    return true;
  });
  assert(ReleasedGroup && "At least one group must be invalidated, as a "
                          "scalar epilogue was required");
  (void)ReleasedGroup;
  RequiresScalarEpilogue = false;
}

template <typename InstT>
void InterleaveGroup<InstT>::addMetadata(InstT *NewInst) const {
  llvm_unreachable("addMetadata can only be used for Instruction");
}

namespace llvm {
template <>
void InterleaveGroup<Instruction>::addMetadata(Instruction *NewInst) const {
  SmallVector<Value *, 4> VL;
  std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
                 [](std::pair<int, Instruction *> p) { return p.second; });
  propagateMetadata(NewInst, VL);
}
} // namespace llvm