X86InterleavedAccess.cpp source code [llvm_projects/llvm/lib/Target/X86/X86InterleavedAccess.cpp]

1	//===- X86InterleavedAccess.cpp -------------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	/// \file
10	/// This file contains the X86 implementation of the interleaved accesses
11	/// optimization generating X86-specific instructions/intrinsics for
12	/// interleaved access groups.
13	//
14	//===----------------------------------------------------------------------===//
15
16	#include "X86ISelLowering.h"
17	#include "X86Subtarget.h"
18	#include "llvm/ADT/ArrayRef.h"
19	#include "llvm/ADT/SmallVector.h"
20	#include "llvm/Analysis/VectorUtils.h"
21	#include "llvm/CodeGenTypes/MachineValueType.h"
22	#include "llvm/IR/DataLayout.h"
23	#include "llvm/IR/DerivedTypes.h"
24	#include "llvm/IR/IRBuilder.h"
25	#include "llvm/IR/Instruction.h"
26	#include "llvm/IR/Instructions.h"
27	#include "llvm/IR/Type.h"
28	#include "llvm/IR/Value.h"
29	#include "llvm/Support/Casting.h"
30	#include <algorithm>
31	#include <cassert>
32	#include <cmath>
33
34	using namespace llvm;
35
36	namespace {
37
38	/// This class holds necessary information to represent an interleaved
39	/// access group and supports utilities to lower the group into
40	/// X86-specific instructions/intrinsics.
41	/// E.g. A group of interleaving access loads (Factor = 2; accessing every
42	/// other element)
43	/// %wide.vec = load <8 x i32>, <8 x i32> %ptr*
44	/// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
45	/// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
46	class X86InterleavedAccessGroup {
47	/// Reference to the wide-load instruction of an interleaved access
48	/// group.
49	Instruction *const Inst;
50
51	/// Reference to the shuffle(s), consumer(s) of the (load) 'Inst'.
52	ArrayRef<ShuffleVectorInst *> Shuffles;
53
54	/// Reference to the starting index of each user-shuffle.
55	ArrayRef<unsigned> Indices;
56
57	/// Reference to the interleaving stride in terms of elements.
58	const unsigned Factor;
59
60	/// Reference to the underlying target.
61	const X86Subtarget &Subtarget;
62
63	const DataLayout &DL;
64
65	IRBuilder<> &Builder;
66
67	/// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
68	/// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
69	void decompose(Instruction Inst, unsigned* NumSubVectors, FixedVectorType *T,
70	SmallVectorImpl<Instruction *> &DecomposedVectors);
71
72	/// Performs matrix transposition on a 4x4 matrix \p InputVectors and
73	/// returns the transposed-vectors in \p TransposedVectors.
74	/// E.g.
75	/// InputVectors:
76	/// In-V0 = p1, p2, p3, p4
77	/// In-V1 = q1, q2, q3, q4
78	/// In-V2 = r1, r2, r3, r4
79	/// In-V3 = s1, s2, s3, s4
80	/// OutputVectors:
81	/// Out-V0 = p1, q1, r1, s1
82	/// Out-V1 = p2, q2, r2, s2
83	/// Out-V2 = p3, q3, r3, s3
84	/// Out-V3 = P4, q4, r4, s4
85	void transpose_4x4(ArrayRef<Instruction *> InputVectors,
86	SmallVectorImpl<Value *> &TransposedMatrix);
87	void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
88	SmallVectorImpl<Value *> &TransposedMatrix,
89	unsigned NumSubVecElems);
90	void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
91	SmallVectorImpl<Value *> &TransposedMatrix);
92	void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
93	SmallVectorImpl<Value *> &TransposedMatrix,
94	unsigned NumSubVecElems);
95	void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
96	SmallVectorImpl<Value *> &TransposedMatrix,
97	unsigned NumSubVecElems);
98
99	public:
100	/// In order to form an interleaved access group X86InterleavedAccessGroup
101	/// requires a wide-load instruction \p 'I', a group of interleaved-vectors
102	/// \p Shuffs, reference to the first indices of each interleaved-vector
103	/// \p 'Ind' and the interleaving stride factor \p F. In order to generate
104	/// X86-specific instructions/intrinsics it also requires the underlying
105	/// target information \p STarget.
106	explicit X86InterleavedAccessGroup(Instruction *I,
107	ArrayRef<ShuffleVectorInst *> Shuffs,
108	ArrayRef<unsigned> Ind, const unsigned F,
109	const X86Subtarget &STarget,
110	IRBuilder<> &B)
111	: Inst(I), Shuffles (Shuffs), Indices (Ind), Factor(F), Subtarget(STarget),
112	DL(Inst->getDataLayout()), Builder(B) {}
113
114	/// Returns true if this interleaved access group can be lowered into
115	/// x86-specific instructions/intrinsics, false otherwise.
116	bool isSupported() const;
117
118	/// Lowers this interleaved access group into X86-specific
119	/// instructions/intrinsics.
120	bool lowerIntoOptimizedSequence();
121	};
122
123	} // end anonymous namespace
124
125	bool X86InterleavedAccessGroup::isSupported() const {
126	VectorType *ShuffleVecTy = Shuffles [`0`]->getType();
127	Type *ShuffleEltTy = ShuffleVecTy->getElementType();
128	unsigned ShuffleElemSize = DL.getTypeSizeInBits(Ty: ShuffleEltTy);
129	unsigned WideInstSize;
130
131	// Currently, lowering is supported for the following vectors:
132	// Stride 4:
133	// 1. Store and load of 4-element vectors of 64 bits on AVX.
134	// 2. Store of 16/32-element vectors of 8 bits on AVX.
135	// Stride 3:
136	// 1. Load of 16/32-element vectors of 8 bits on AVX.
137	if (!Subtarget.hasAVX() \|\| (Factor != `4` && Factor != `3`))
138	return false;
139
140	if (isa<LoadInst>(Val: Inst)) {
141	WideInstSize = DL.getTypeSizeInBits(Ty: Inst->getType());
142	if (cast<LoadInst>(Val: Inst)->getPointerAddressSpace())
143	return false;
144	} else
145	WideInstSize = DL.getTypeSizeInBits(Ty: Shuffles [`0`]->getType());
146
147	// We support shuffle represents stride 4 for byte type with size of
148	// WideInstSize.
149	if (ShuffleElemSize == `64` && WideInstSize == `1024` && Factor == `4`)
150	return true;
151
152	if (ShuffleElemSize == `8` && isa<StoreInst>(Val: Inst) && Factor == `4` &&
153	(WideInstSize == `256` \|\| WideInstSize == `512` \|\| WideInstSize == `1024` \|\|
154	WideInstSize == `2048`))
155	return true;
156
157	if (ShuffleElemSize == `8` && Factor == `3` &&
158	(WideInstSize == `384` \|\| WideInstSize == `768` \|\| WideInstSize == `1536`))
159	return true;
160
161	return false;
162	}
163
164	void X86InterleavedAccessGroup::decompose(
165	Instruction VecInst, unsigned* NumSubVectors, FixedVectorType *SubVecTy,
166	SmallVectorImpl<Instruction *> &DecomposedVectors) {
167	assert((isa<LoadInst>(VecInst) \|\| isa<ShuffleVectorInst>(VecInst)) &&
168	"Expected Load or Shuffle");
169
170	Type *VecWidth = VecInst->getType();
171	(void)VecWidth;
172	assert(VecWidth->isVectorTy() &&
173	DL.getTypeSizeInBits(VecWidth) >=
174	DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
175	"Invalid Inst-size!!!");
176
177	if (auto *SVI = dyn_cast<ShuffleVectorInst>(Val: VecInst)) {
178	Value *Op0 = SVI->getOperand(i_nocapture: `0`);
179	Value *Op1 = SVI->getOperand(i_nocapture: `1`);
180
181	// Generate N(= NumSubVectors) shuffles of T(= SubVecTy) type.
182	for (unsigned i = `0`; i < NumSubVectors; ++i)
183	DecomposedVectors.push_back(
184	Elt: cast<ShuffleVectorInst>(Val: Builder.CreateShuffleVector(
185	V1: Op0, V2: Op1,
186	Mask: createSequentialMask(Start: Indices [i], NumInts: SubVecTy->getNumElements(),
187	NumUndefs: `0`))));
188	return;
189	}
190
191	// Decompose the load instruction.
192	LoadInst *LI = cast<LoadInst>(Val: VecInst);
193	Type *VecBaseTy;
194	unsigned int NumLoads = NumSubVectors;
195	// In the case of stride 3 with a vector of 32 elements load the information
196	// in the following way:
197	// [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
198	unsigned VecLength = DL.getTypeSizeInBits(Ty: VecWidth);
199	Value *VecBasePtr = LI->getPointerOperand();
200	if (VecLength == `768` \|\| VecLength == `1536`) {
201	VecBaseTy = FixedVectorType::get(ElementType: Type::getInt8Ty(C&: LI->getContext()), NumElts: `16`);
202	NumLoads = NumSubVectors * (VecLength / `384`);
203	} else {
204	VecBaseTy = SubVecTy;
205	}
206	// Generate N loads of T type.
207	assert(VecBaseTy->getPrimitiveSizeInBits().isKnownMultipleOf(`8`) &&
208	"VecBaseTy's size must be a multiple of 8");
209	const Align FirstAlignment = LI->getAlign();
210	const Align SubsequentAlignment = commonAlignment(
211	A: FirstAlignment, Offset: VecBaseTy->getPrimitiveSizeInBits().getFixedValue() / `8`);
212	Align Alignment = FirstAlignment;
213	for (unsigned i = `0`; i < NumLoads; i++) {
214	// TODO: Support inbounds GEP.
215	Value *NewBasePtr =
216	Builder.CreateGEP(Ty: VecBaseTy, Ptr: VecBasePtr, IdxList: Builder.getInt32(C: i));
217	Instruction *NewLoad =
218	Builder.CreateAlignedLoad(Ty: VecBaseTy, Ptr: NewBasePtr, Align: Alignment);
219	DecomposedVectors.push_back(Elt: NewLoad);
220	Alignment = SubsequentAlignment;
221	}
222	}
223
224	// Changing the scale of the vector type by reducing the number of elements and
225	// doubling the scalar size.
226	static MVT scaleVectorType(MVT VT) {
227	unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * `2`;
228	return MVT::getVectorVT(VT: MVT::getIntegerVT(BitWidth: ScalarSize),
229	NumElements: VT.getVectorNumElements() / `2`);
230	}
231
232	static constexpr int Concat[] = {
233	`0`, `1`, `2`, `3`, `4`, `5`, `6`, `7`, `8`, `9`, `10`, `11`, `12`, `13`, `14`, `15`,
234	`16`, `17`, `18`, `19`, `20`, `21`, `22`, `23`, `24`, `25`, `26`, `27`, `28`, `29`, `30`, `31`,
235	`32`, `33`, `34`, `35`, `36`, `37`, `38`, `39`, `40`, `41`, `42`, `43`, `44`, `45`, `46`, `47`,
236	`48`, `49`, `50`, `51`, `52`, `53`, `54`, `55`, `56`, `57`, `58`, `59`, `60`, `61`, `62`, `63`};
237
238	// genShuffleBland - Creates shuffle according to two vectors.This function is
239	// only works on instructions with lane inside 256 registers. According to
240	// the mask 'Mask' creates a new Mask 'Out' by the offset of the mask. The
241	// offset amount depends on the two integer, 'LowOffset' and 'HighOffset'.
242	// Where the 'LowOffset' refers to the first vector and the highOffset refers to
243	// the second vector.
244	// \|a0....a5,b0....b4,c0....c4\|a16..a21,b16..b20,c16..c20\|
245	// \|c5...c10,a5....a9,b5....b9\|c21..c26,a22..a26,b21..b25\|
246	// \|b10..b15,c11..c15,a10..a15\|b26..b31,c27..c31,a27..a31\|
247	// For the sequence to work as a mirror to the load.
248	// We must consider the elements order as above.
249	// In this function we are combining two types of shuffles.
250	// The first one is vpshufed and the second is a type of "blend" shuffle.
251	// By computing the shuffle on a sequence of 16 elements(one lane) and add the
252	// correct offset. We are creating a vpsuffed + blend sequence between two
253	// shuffles.
254	static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
255	SmallVectorImpl<int> &Out, int LowOffset,
256	int HighOffset) {
257	assert(VT.getSizeInBits() >= `256` &&
258	"This function doesn't accept width smaller then 256");
259	unsigned NumOfElm = VT.getVectorNumElements();
260	for (int I : Mask)
261	Out.push_back(Elt: I + LowOffset);
262	for (int I : Mask)
263	Out.push_back(Elt: I + HighOffset + NumOfElm);
264	}
265
266	// reorderSubVector returns the data to is the original state. And de-facto is
267	// the opposite of the function concatSubVector.
268
269	// For VecElems = 16
270	// Invec[0] - \|0\| TransposedMatrix[0] - \|0\|
271	// Invec[1] - \|1\| => TransposedMatrix[1] - \|1\|
272	// Invec[2] - \|2\| TransposedMatrix[2] - \|2\|
273
274	// For VecElems = 32
275	// Invec[0] - \|0\|3\| TransposedMatrix[0] - \|0\|1\|
276	// Invec[1] - \|1\|4\| => TransposedMatrix[1] - \|2\|3\|
277	// Invec[2] - \|2\|5\| TransposedMatrix[2] - \|4\|5\|
278
279	// For VecElems = 64
280	// Invec[0] - \|0\|3\|6\|9 \| TransposedMatrix[0] - \|0\|1\|2 \|3 \|
281	// Invec[1] - \|1\|4\|7\|10\| => TransposedMatrix[1] - \|4\|5\|6 \|7 \|
282	// Invec[2] - \|2\|5\|8\|11\| TransposedMatrix[2] - \|8\|9\|10\|11\|
283
284	static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
285	ArrayRef<Value > Vec, ArrayRef<int*> VPShuf,
286	unsigned VecElems, unsigned Stride,
287	IRBuilder<> &Builder) {
288
289	if (VecElems == `16`) {
290	for (unsigned i = `0`; i < Stride; i++)
291	TransposedMatrix [i] = Builder.CreateShuffleVector(V: Vec [i], Mask: VPShuf);
292	return;
293	}
294
295	SmallVector<int, `32`> OptimizeShuf;
296	Value *Temp[`8`];
297
298	for (unsigned i = `0`; i < (VecElems / `16`) * Stride; i += `2`) {
299	genShuffleBland(VT, Mask: VPShuf, Out&: OptimizeShuf, LowOffset: (i / Stride) * `16`,
300	HighOffset: (i + `1`) / Stride * `16`);
301	Temp[i / `2`] = Builder.CreateShuffleVector(
302	V1: Vec [i % Stride], V2: Vec [(i + `1`) % Stride], Mask: OptimizeShuf);
303	OptimizeShuf.clear();
304	}
305
306	if (VecElems == `32`) {
307	std::copy(first: Temp, last: Temp + Stride, result: TransposedMatrix.begin());
308	return;
309	} else
310	for (unsigned i = `0`; i < Stride; i++)
311	TransposedMatrix [i] =
312	Builder.CreateShuffleVector(V1: Temp[`2` * i], V2: Temp[`2` * i + `1`], Mask: Concat);
313	}
314
315	void X86InterleavedAccessGroup::interleave8bitStride4VF8(
316	ArrayRef<Instruction *> Matrix,
317	SmallVectorImpl<Value *> &TransposedMatrix) {
318	// Assuming we start from the following vectors:
319	// Matrix[0]= c0 c1 c2 c3 c4 ... c7
320	// Matrix[1]= m0 m1 m2 m3 m4 ... m7
321	// Matrix[2]= y0 y1 y2 y3 y4 ... y7
322	// Matrix[3]= k0 k1 k2 k3 k4 ... k7
323
324	MVT VT = MVT::v8i16;
325	TransposedMatrix.resize(N: `2`);
326	SmallVector<int, `16`> MaskLow;
327	SmallVector<int, `32`> MaskLowTemp1, MaskLowWord;
328	SmallVector<int, `32`> MaskHighTemp1, MaskHighWord;
329
330	for (unsigned i = `0`; i < `8`; ++i) {
331	MaskLow.push_back(Elt: i);
332	MaskLow.push_back(Elt: i + `8`);
333	}
334
335	createUnpackShuffleMask(VT, Mask&: MaskLowTemp1, Lo: true, Unary: false);
336	createUnpackShuffleMask(VT, Mask&: MaskHighTemp1, Lo: false, Unary: false);
337	narrowShuffleMaskElts(Scale: `2`, Mask: MaskHighTemp1, ScaledMask&: MaskHighWord);
338	narrowShuffleMaskElts(Scale: `2`, Mask: MaskLowTemp1, ScaledMask&: MaskLowWord);
339	// IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
340	// IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
341	Value *IntrVec1Low =
342	Builder.CreateShuffleVector(V1: Matrix [`0`], V2: Matrix [`1`], Mask: MaskLow);
343	Value *IntrVec2Low =
344	Builder.CreateShuffleVector(V1: Matrix [`2`], V2: Matrix [`3`], Mask: MaskLow);
345
346	// TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
347	// TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7
348
349	TransposedMatrix [`0`] =
350	Builder.CreateShuffleVector(V1: IntrVec1Low, V2: IntrVec2Low, Mask: MaskLowWord);
351	TransposedMatrix [`1`] =
352	Builder.CreateShuffleVector(V1: IntrVec1Low, V2: IntrVec2Low, Mask: MaskHighWord);
353	}
354
355	void X86InterleavedAccessGroup::interleave8bitStride4(
356	ArrayRef<Instruction > Matrix, SmallVectorImpl<Value > &TransposedMatrix,
357	unsigned NumOfElm) {
358	// Example: Assuming we start from the following vectors:
359	// Matrix[0]= c0 c1 c2 c3 c4 ... c31
360	// Matrix[1]= m0 m1 m2 m3 m4 ... m31
361	// Matrix[2]= y0 y1 y2 y3 y4 ... y31
362	// Matrix[3]= k0 k1 k2 k3 k4 ... k31
363
364	MVT VT = MVT::getVectorVT(VT: MVT::i8, NumElements: NumOfElm);
365	MVT HalfVT = scaleVectorType(VT);
366
367	TransposedMatrix.resize(N: `4`);
368	SmallVector<int, `32`> MaskHigh;
369	SmallVector<int, `32`> MaskLow;
370	SmallVector<int, `32`> LowHighMask[`2`];
371	SmallVector<int, `32`> MaskHighTemp;
372	SmallVector<int, `32`> MaskLowTemp;
373
374	// MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
375	// shuffle pattern.
376
377	createUnpackShuffleMask(VT, Mask&: MaskLow, Lo: true, Unary: false);
378	createUnpackShuffleMask(VT, Mask&: MaskHigh, Lo: false, Unary: false);
379
380	// MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
381	// shuffle pattern.
382
383	createUnpackShuffleMask(VT: HalfVT, Mask&: MaskLowTemp, Lo: true, Unary: false);
384	createUnpackShuffleMask(VT: HalfVT, Mask&: MaskHighTemp, Lo: false, Unary: false);
385	narrowShuffleMaskElts(Scale: `2`, Mask: MaskLowTemp, ScaledMask&: LowHighMask[`0`]);
386	narrowShuffleMaskElts(Scale: `2`, Mask: MaskHighTemp, ScaledMask&: LowHighMask[`1`]);
387
388	// IntrVec1Low = c0 m0 c1 m1 ... c7 m7 \| c16 m16 c17 m17 ... c23 m23
389	// IntrVec1High = c8 m8 c9 m9 ... c15 m15 \| c24 m24 c25 m25 ... c31 m31
390	// IntrVec2Low = y0 k0 y1 k1 ... y7 k7 \| y16 k16 y17 k17 ... y23 k23
391	// IntrVec2High = y8 k8 y9 k9 ... y15 k15 \| y24 k24 y25 k25 ... y31 k31
392	Value *IntrVec[`4`];
393
394	IntrVec[`0`] = Builder.CreateShuffleVector(V1: Matrix [`0`], V2: Matrix [`1`], Mask: MaskLow);
395	IntrVec[`1`] = Builder.CreateShuffleVector(V1: Matrix [`0`], V2: Matrix [`1`], Mask: MaskHigh);
396	IntrVec[`2`] = Builder.CreateShuffleVector(V1: Matrix [`2`], V2: Matrix [`3`], Mask: MaskLow);
397	IntrVec[`3`] = Builder.CreateShuffleVector(V1: Matrix [`2`], V2: Matrix [`3`], Mask: MaskHigh);
398
399	// cmyk4 cmyk5 cmyk6 cmyk7 \| cmyk20 cmyk21 cmyk22 cmyk23
400	// cmyk12 cmyk13 cmyk14 cmyk15 \| cmyk28 cmyk29 cmyk30 cmyk31
401	// cmyk0 cmyk1 cmyk2 cmyk3 \| cmyk16 cmyk17 cmyk18 cmyk19
402	// cmyk8 cmyk9 cmyk10 cmyk11 \| cmyk24 cmyk25 cmyk26 cmyk27
403
404	Value *VecOut[`4`];
405	for (int i = `0`; i < `4`; i++)
406	VecOut[i] = Builder.CreateShuffleVector(V1: IntrVec[i / `2`], V2: IntrVec[i / `2` + `2`],
407	Mask: LowHighMask[i % `2`]);
408
409	// cmyk0 cmyk1 cmyk2 cmyk3 \| cmyk4 cmyk5 cmyk6 cmyk7
410	// cmyk8 cmyk9 cmyk10 cmyk11 \| cmyk12 cmyk13 cmyk14 cmyk15
411	// cmyk16 cmyk17 cmyk18 cmyk19 \| cmyk20 cmyk21 cmyk22 cmyk23
412	// cmyk24 cmyk25 cmyk26 cmyk27 \| cmyk28 cmyk29 cmyk30 cmyk31
413
414	if (VT == MVT::v16i8) {
415	std::copy(first: VecOut, last: VecOut + `4`, result: TransposedMatrix.begin());
416	return;
417	}
418
419	reorderSubVector(VT, TransposedMatrix, Vec: VecOut, VPShuf: ArrayRef(Concat, `16`), VecElems: NumOfElm,
420	Stride: `4`, Builder);
421	}
422
423	// createShuffleStride returns shuffle mask of size N.
424	// The shuffle pattern is as following :
425	// {0, Stride%(VF/Lane), (2Stride%(VF/Lane))...(VFStride/Lane)%(VF/Lane),
426	// (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
427	// (VF / Lane)+(VFStride/Lane)%(VF/Lane)}*
428	// Where Lane is the # of lanes in a register:
429	// VectorSize = 128 => Lane = 1
430	// VectorSize = 256 => Lane = 2
431	// For example shuffle pattern for VF 16 register size 256 -> lanes = 2
432	// {<[0\|3\|6\|1\|4\|7\|2\|5]-[8\|11\|14\|9\|12\|15\|10\|13]>}
433	static void createShuffleStride(MVT VT, int Stride,
434	SmallVectorImpl<int> &Mask) {
435	int VectorSize = VT.getSizeInBits();
436	int VF = VT.getVectorNumElements();
437	int LaneCount = std::max(a: VectorSize / `128`, b: `1`);
438	for (int Lane = `0`; Lane < LaneCount; Lane++)
439	for (int i = `0`, LaneSize = VF / LaneCount; i != LaneSize; ++i)
440	Mask.push_back(Elt: (i * Stride) % LaneSize + LaneSize * Lane);
441	}
442
443	// setGroupSize sets 'SizeInfo' to the size(number of elements) of group
444	// inside mask a shuffleMask. A mask contains exactly 3 groups, where
445	// each group is a monotonically increasing sequence with stride 3.
446	// For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
447	static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
448	int VectorSize = VT.getSizeInBits();
449	int VF = VT.getVectorNumElements() / std::max(a: VectorSize / `128`, b: `1`);
450	for (int i = `0`, FirstGroupElement = `0`; i < `3`; i++) {
451	int GroupSize = std::ceil(x: (VF - FirstGroupElement) / `3.0`);
452	SizeInfo.push_back(Elt: GroupSize);
453	FirstGroupElement = ((GroupSize)*`3` + FirstGroupElement) % VF;
454	}
455	}
456
457	// DecodePALIGNRMask returns the shuffle mask of vpalign instruction.
458	// vpalign works according to lanes
459	// Where Lane is the # of lanes in a register:
460	// VectorWide = 128 => Lane = 1
461	// VectorWide = 256 => Lane = 2
462	// For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
463	// For Lane = 2 shuffle pattern is:
464	// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
465	// Imm variable sets the offset amount. The result of the
466	// function is stored inside ShuffleMask vector and it built as described in
467	// the begin of the description. AlignDirection is a boolean that indicates the
468	// direction of the alignment. (false - align to the "right" side while true -
469	// align to the "left" side)
470	static void DecodePALIGNRMask(MVT VT, unsigned Imm,
471	SmallVectorImpl<int> &ShuffleMask,
472	bool AlignDirection = true, bool Unary = false) {
473	unsigned NumElts = VT.getVectorNumElements();
474	unsigned NumLanes = std::max(a: (int)VT.getSizeInBits() / `128`, b: `1`);
475	unsigned NumLaneElts = NumElts / NumLanes;
476
477	Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
478	unsigned Offset = Imm * (VT.getScalarSizeInBits() / `8`);
479
480	for (unsigned l = `0`; l != NumElts; l += NumLaneElts) {
481	for (unsigned i = `0`; i != NumLaneElts; ++i) {
482	unsigned Base = i + Offset;
483	// if i+offset is out of this lane then we actually need the other source
484	// If Unary the other source is the first source.
485	if (Base >= NumLaneElts)
486	Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
487	ShuffleMask.push_back(Elt: Base + l);
488	}
489	}
490	}
491
492	// concatSubVector - The function rebuilds the data to a correct expected
493	// order. An assumption(The shape of the matrix) was taken for the
494	// deinterleaved to work with lane's instructions like 'vpalign' or 'vphuf'.
495	// This function ensures that the data is built in correct way for the lane
496	// instructions. Each lane inside the vector is a 128-bit length.
497	//
498	// The 'InVec' argument contains the data in increasing order. In InVec[0] You
499	// can find the first 128 bit data. The number of different lanes inside a
500	// vector depends on the 'VecElems'.In general, the formula is
501	// VecElems type / 128. The size of the array 'InVec' depends and equal to*
502	// 'VecElems'.
503
504	// For VecElems = 16
505	// Invec[0] - \|0\| Vec[0] - \|0\|
506	// Invec[1] - \|1\| => Vec[1] - \|1\|
507	// Invec[2] - \|2\| Vec[2] - \|2\|
508
509	// For VecElems = 32
510	// Invec[0] - \|0\|1\| Vec[0] - \|0\|3\|
511	// Invec[1] - \|2\|3\| => Vec[1] - \|1\|4\|
512	// Invec[2] - \|4\|5\| Vec[2] - \|2\|5\|
513
514	// For VecElems = 64
515	// Invec[0] - \|0\|1\|2 \|3 \| Vec[0] - \|0\|3\|6\|9 \|
516	// Invec[1] - \|4\|5\|6 \|7 \| => Vec[1] - \|1\|4\|7\|10\|
517	// Invec[2] - \|8\|9\|10\|11\| Vec[2] - \|2\|5\|8\|11\|
518
519	static void concatSubVector(Value *Vec, ArrayRef<Instruction > InVec,
520	unsigned VecElems, IRBuilder<> &Builder) {
521	if (VecElems == `16`) {
522	for (int i = `0`; i < `3`; i++)
523	Vec[i] = InVec [i];
524	return;
525	}
526
527	for (unsigned j = `0`; j < VecElems / `32`; j++)
528	for (int i = `0`; i < `3`; i++)
529	Vec[i + j * `3`] = Builder.CreateShuffleVector(
530	V1: InVec [j * `6` + i], V2: InVec [j * `6` + i + `3`], Mask: ArrayRef(Concat, `32`));
531
532	if (VecElems == `32`)
533	return;
534
535	for (int i = `0`; i < `3`; i++)
536	Vec[i] = Builder.CreateShuffleVector(V1: Vec[i], V2: Vec[i + `3`], Mask: Concat);
537	}
538
539	void X86InterleavedAccessGroup::deinterleave8bitStride3(
540	ArrayRef<Instruction > InVec, SmallVectorImpl<Value > &TransposedMatrix,
541	unsigned VecElems) {
542	// Example: Assuming we start from the following vectors:
543	// Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
544	// Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
545	// Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7
546
547	TransposedMatrix.resize(N: `3`);
548	SmallVector<int, `32`> VPShuf;
549	SmallVector<int, `32`> VPAlign[`2`];
550	SmallVector<int, `32`> VPAlign2;
551	SmallVector<int, `32`> VPAlign3;
552	SmallVector<int, `3`> GroupSize;
553	Value Vec[`6`], TempVector[`3`];
554
555	MVT VT = MVT::getVT(Ty: Shuffles [`0`]->getType());
556
557	createShuffleStride(VT, Stride: `3`, Mask&: VPShuf);
558	setGroupSize(VT, SizeInfo&: GroupSize);
559
560	for (int i = `0`; i < `2`; i++)
561	DecodePALIGNRMask(VT, Imm: GroupSize [`2` - i], ShuffleMask&: VPAlign[i], AlignDirection: false);
562
563	DecodePALIGNRMask(VT, Imm: GroupSize [`2`] + GroupSize [`1`], ShuffleMask&: VPAlign2, AlignDirection: true, Unary: true);
564	DecodePALIGNRMask(VT, Imm: GroupSize [`1`], ShuffleMask&: VPAlign3, AlignDirection: true, Unary: true);
565
566	concatSubVector(Vec, InVec, VecElems, Builder);
567	// Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
568	// Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
569	// Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
570
571	for (int i = `0`; i < `3`; i++)
572	Vec[i] = Builder.CreateShuffleVector(V: Vec[i], Mask: VPShuf);
573
574	// TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
575	// TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
576	// TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7
577
578	for (int i = `0`; i < `3`; i++)
579	TempVector[i] =
580	Builder.CreateShuffleVector(V1: Vec[(i + `2`) % `3`], V2: Vec[i], Mask: VPAlign[`0`]);
581
582	// Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
583	// Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
584	// Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
585
586	for (int i = `0`; i < `3`; i++)
587	Vec[i] = Builder.CreateShuffleVector(V1: TempVector[(i + `1`) % `3`], V2: TempVector[i],
588	Mask: VPAlign[`1`]);
589
590	// TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
591	// TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
592	// TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
593
594	Value *TempVec = Builder.CreateShuffleVector(V: Vec[`1`], Mask: VPAlign3);
595	TransposedMatrix [`0`] = Builder.CreateShuffleVector(V: Vec[`0`], Mask: VPAlign2);
596	TransposedMatrix [`1`] = VecElems == `8` ? Vec[`2`] : TempVec;
597	TransposedMatrix [`2`] = VecElems == `8` ? TempVec : Vec[`2`];
598	}
599
600	// group2Shuffle reorder the shuffle stride back into continuous order.
601	// For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
602	// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
603	static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
604	SmallVectorImpl<int> &Output) {
605	int IndexGroup[`3`] = {`0`, `0`, `0`};
606	int Index = `0`;
607	int VectorWidth = VT.getSizeInBits();
608	int VF = VT.getVectorNumElements();
609	// Find the index of the different groups.
610	int Lane = (VectorWidth / `128` > `0`) ? VectorWidth / `128` : `1`;
611	for (int i = `0`; i < `3`; i++) {
612	IndexGroup[(Index * `3`) % (VF / Lane)] = Index;
613	Index += Mask [i];
614	}
615	// According to the index compute the convert mask.
616	for (int i = `0`; i < VF / Lane; i++) {
617	Output.push_back(Elt: IndexGroup[i % `3`]);
618	IndexGroup[i % `3`]++;
619	}
620	}
621
622	void X86InterleavedAccessGroup::interleave8bitStride3(
623	ArrayRef<Instruction > InVec, SmallVectorImpl<Value > &TransposedMatrix,
624	unsigned VecElems) {
625	// Example: Assuming we start from the following vectors:
626	// Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
627	// Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
628	// Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7
629
630	TransposedMatrix.resize(N: `3`);
631	SmallVector<int, `3`> GroupSize;
632	SmallVector<int, `32`> VPShuf;
633	SmallVector<int, `32`> VPAlign[`3`];
634	SmallVector<int, `32`> VPAlign2;
635	SmallVector<int, `32`> VPAlign3;
636
637	Value Vec[`3`], TempVector[`3`];
638	MVT VT = MVT::getVectorVT(VT: MVT::i8, NumElements: VecElems);
639
640	setGroupSize(VT, SizeInfo&: GroupSize);
641
642	for (int i = `0`; i < `3`; i++)
643	DecodePALIGNRMask(VT, Imm: GroupSize [i], ShuffleMask&: VPAlign[i]);
644
645	DecodePALIGNRMask(VT, Imm: GroupSize [`1`] + GroupSize [`2`], ShuffleMask&: VPAlign2, AlignDirection: false, Unary: true);
646	DecodePALIGNRMask(VT, Imm: GroupSize [`1`], ShuffleMask&: VPAlign3, AlignDirection: false, Unary: true);
647
648	// Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
649	// Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
650	// Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
651
652	Vec[`0`] = Builder.CreateShuffleVector(V: InVec [`0`], Mask: VPAlign2);
653	Vec[`1`] = Builder.CreateShuffleVector(V: InVec [`1`], Mask: VPAlign3);
654	Vec[`2`] = InVec [`2`];
655
656	// Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
657	// Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
658	// Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7
659
660	for (int i = `0`; i < `3`; i++)
661	TempVector[i] =
662	Builder.CreateShuffleVector(V1: Vec[i], V2: Vec[(i + `2`) % `3`], Mask: VPAlign[`1`]);
663
664	// Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
665	// Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
666	// Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
667
668	for (int i = `0`; i < `3`; i++)
669	Vec[i] = Builder.CreateShuffleVector(V1: TempVector[i], V2: TempVector[(i + `1`) % `3`],
670	Mask: VPAlign[`2`]);
671
672	// TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
673	// TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
674	// TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7
675
676	unsigned NumOfElm = VT.getVectorNumElements();
677	group2Shuffle(VT, Mask&: GroupSize, Output&: VPShuf);
678	reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, VecElems: NumOfElm, Stride: `3`, Builder);
679	}
680
681	void X86InterleavedAccessGroup::transpose_4x4(
682	ArrayRef<Instruction *> Matrix,
683	SmallVectorImpl<Value *> &TransposedMatrix) {
684	assert(Matrix.size() == `4` && "Invalid matrix size");
685	TransposedMatrix.resize(N: `4`);
686
687	// dst = src1[0,1],src2[0,1]
688	static constexpr int IntMask1[] = {`0`, `1`, `4`, `5`};
689	ArrayRef<int> Mask = ArrayRef(IntMask1, `4`);
690	Value *IntrVec1 = Builder.CreateShuffleVector(V1: Matrix [`0`], V2: Matrix [`2`], Mask);
691	Value *IntrVec2 = Builder.CreateShuffleVector(V1: Matrix [`1`], V2: Matrix [`3`], Mask);
692
693	// dst = src1[2,3],src2[2,3]
694	static constexpr int IntMask2[] = {`2`, `3`, `6`, `7`};
695	Mask = ArrayRef(IntMask2, `4`);
696	Value *IntrVec3 = Builder.CreateShuffleVector(V1: Matrix [`0`], V2: Matrix [`2`], Mask);
697	Value *IntrVec4 = Builder.CreateShuffleVector(V1: Matrix [`1`], V2: Matrix [`3`], Mask);
698
699	// dst = src1[0],src2[0],src1[2],src2[2]
700	static constexpr int IntMask3[] = {`0`, `4`, `2`, `6`};
701	Mask = ArrayRef(IntMask3, `4`);
702	TransposedMatrix [`0`] = Builder.CreateShuffleVector(V1: IntrVec1, V2: IntrVec2, Mask);
703	TransposedMatrix [`2`] = Builder.CreateShuffleVector(V1: IntrVec3, V2: IntrVec4, Mask);
704
705	// dst = src1[1],src2[1],src1[3],src2[3]
706	static constexpr int IntMask4[] = {`1`, `5`, `3`, `7`};
707	Mask = ArrayRef(IntMask4, `4`);
708	TransposedMatrix [`1`] = Builder.CreateShuffleVector(V1: IntrVec1, V2: IntrVec2, Mask);
709	TransposedMatrix [`3`] = Builder.CreateShuffleVector(V1: IntrVec3, V2: IntrVec4, Mask);
710	}
711
712	// Lowers this interleaved access group into X86-specific
713	// instructions/intrinsics.
714	bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
715	SmallVector<Instruction *, `4`> DecomposedVectors;
716	SmallVector<Value *, `4`> TransposedVectors;
717	auto *ShuffleTy = cast<FixedVectorType>(Val: Shuffles [`0`]->getType());
718
719	if (isa<LoadInst>(Val: Inst)) {
720	auto *ShuffleEltTy = cast<FixedVectorType>(Val: Inst->getType());
721	unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
722	switch (NumSubVecElems) {
723	default:
724	return false;
725	case `4`:
726	case `8`:
727	case `16`:
728	case `32`:
729	case `64`:
730	if (ShuffleTy->getNumElements() != NumSubVecElems)
731	return false;
732	break;
733	}
734
735	// Try to generate target-sized register(/instruction).
736	decompose(VecInst: Inst, NumSubVectors: Factor, SubVecTy: ShuffleTy, DecomposedVectors);
737
738	// Perform matrix-transposition in order to compute interleaved
739	// results by generating some sort of (optimized) target-specific
740	// instructions.
741
742	if (NumSubVecElems == `4`)
743	transpose_4x4(Matrix: DecomposedVectors, TransposedMatrix&: TransposedVectors);
744	else
745	deinterleave8bitStride3(InVec: DecomposedVectors, TransposedMatrix&: TransposedVectors,
746	VecElems: NumSubVecElems);
747
748	// Now replace the unoptimized-interleaved-vectors with the
749	// transposed-interleaved vectors.
750	for (unsigned i = `0`, e = Shuffles.size(); i < e; ++i)
751	Shuffles [i]->replaceAllUsesWith(V: TransposedVectors [Indices [i]]);
752
753	return true;
754	}
755
756	Type *ShuffleEltTy = ShuffleTy->getElementType();
757	unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;
758
759	// Lower the interleaved stores:
760	// 1. Decompose the interleaved wide shuffle into individual shuffle
761	// vectors.
762	decompose(VecInst: Shuffles [`0`], NumSubVectors: Factor,
763	SubVecTy: FixedVectorType::get(ElementType: ShuffleEltTy, NumElts: NumSubVecElems),
764	DecomposedVectors);
765
766	// 2. Transpose the interleaved-vectors into vectors of contiguous
767	// elements.
768	switch (NumSubVecElems) {
769	case `4`:
770	transpose_4x4(Matrix: DecomposedVectors, TransposedMatrix&: TransposedVectors);
771	break;
772	case `8`:
773	interleave8bitStride4VF8(Matrix: DecomposedVectors, TransposedMatrix&: TransposedVectors);
774	break;
775	case `16`:
776	case `32`:
777	case `64`:
778	if (Factor == `4`)
779	interleave8bitStride4(Matrix: DecomposedVectors, TransposedMatrix&: TransposedVectors,
780	NumOfElm: NumSubVecElems);
781	if (Factor == `3`)
782	interleave8bitStride3(InVec: DecomposedVectors, TransposedMatrix&: TransposedVectors,
783	VecElems: NumSubVecElems);
784	break;
785	default:
786	return false;
787	}
788
789	// 3. Concatenate the contiguous-vectors back into a wide vector.
790	Value *WideVec = concatenateVectors(Builder, Vecs: TransposedVectors);
791
792	// 4. Generate a store instruction for wide-vec.
793	StoreInst *SI = cast<StoreInst>(Val: Inst);
794	Builder.CreateAlignedStore(Val: WideVec, Ptr: SI->getPointerOperand(), Align: SI->getAlign());
795
796	return true;
797	}
798
799	// Lower interleaved load(s) into target specific instructions/
800	// intrinsics. Lowering sequence varies depending on the vector-types, factor,
801	// number of shuffles and ISA.
802	// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
803	bool X86TargetLowering::lowerInterleavedLoad(
804	LoadInst LI, ArrayRef<ShuffleVectorInst > Shuffles,
805	ArrayRef<unsigned> Indices, unsigned Factor) const {
806	assert(Factor >= `2` && Factor <= getMaxSupportedInterleaveFactor() &&
807	"Invalid interleave factor");
808	assert(!Shuffles.empty() && "Empty shufflevector input");
809	assert(Shuffles.size() == Indices.size() &&
810	"Unmatched number of shufflevectors and indices");
811
812	// Create an interleaved access group.
813	IRBuilder<> Builder(LI);
814	X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget,
815	Builder);
816
817	return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
818	}
819
820	bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
821	ShuffleVectorInst *SVI,
822	unsigned Factor) const {
823	assert(Factor >= `2` && Factor <= getMaxSupportedInterleaveFactor() &&
824	"Invalid interleave factor");
825
826	assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
827	`0` &&
828	"Invalid interleaved store");
829
830	// Holds the indices of SVI that correspond to the starting index of each
831	// interleaved shuffle.
832	auto Mask = SVI->getShuffleMask();
833	SmallVector<unsigned, `4`> Indices(Mask.take_front(N: Factor));
834
835	ArrayRef<ShuffleVectorInst *> Shuffles = ArrayRef(SVI);
836
837	// Create an interleaved access group.
838	IRBuilder<> Builder(SI);
839	X86InterleavedAccessGroup Grp(SI, Shuffles, Indices, Factor, Subtarget,
840	Builder);
841
842	return Grp.isSupported() && Grp.lowerIntoOptimizedSequence();
843	}
844

Browse the source code of llvm_projects/llvm/lib/Target/X86/X86InterleavedAccess.cpp