//===- InterleavedAccessPass.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Interleaved Access pass, which identifies
// interleaved memory accesses and transforms them into target specific
// intrinsics.
//
// An interleaved load reads data from memory into several vectors,
// DE-interleaving the data by a factor. An interleaved store RE-interleaves
// several vectors by a factor and writes them to memory.
//
// As interleaved accesses are difficult to identify in CodeGen (mainly
// because the VECTOR_SHUFFLE DAG node is quite different from the
// shufflevector IR instruction), we identify and transform them into
// intrinsics in this pass so the intrinsics can be easily matched into target
// specific instructions later in CodeGen.
//
// E.g. An interleaved load (Factor = 2):
//        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
//        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
//        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
//
// It could be transformed into a ld2 intrinsic in the AArch64 backend or a
// vld2 intrinsic in the ARM backend.
//
// On X86, this can be further optimized into a set of target
// specific loads followed by an optimized sequence of shuffles.
//
// E.g. An interleaved store (Factor = 3):
//        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
//                         <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
//        store <12 x i32> %i.vec, <12 x i32>* %ptr
//
// It could be transformed into a st3 intrinsic in the AArch64 backend or a
// vst3 intrinsic in the ARM backend.
//
// Similarly, a set of interleaved stores can be transformed into an optimized
// sequence of shuffles followed by a set of target specific stores for X86.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/InterleavedAccess.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "interleaved-access"

static cl::opt<bool> LowerInterleavedAccesses(
    "lower-interleaved-accesses",
    cl::desc("Enable lowering interleaved accesses to intrinsics"),
    cl::init(true), cl::Hidden);

namespace {

class InterleavedAccessImpl {
  friend class InterleavedAccess;

public:
  InterleavedAccessImpl() = default;
  InterleavedAccessImpl(DominatorTree *DT, const TargetLowering *TLI)
      : DT(DT), TLI(TLI), MaxFactor(TLI->getMaxSupportedInterleaveFactor()) {}
  bool runOnFunction(Function &F);

private:
  DominatorTree *DT = nullptr;
  const TargetLowering *TLI = nullptr;

  /// The maximum supported interleave factor.
  unsigned MaxFactor = 0u;

  /// Transform an interleaved load into target specific intrinsics.
  bool lowerInterleavedLoad(Instruction *Load,
                            SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Transform an interleaved store into target specific intrinsics.
  bool lowerInterleavedStore(Instruction *Store,
                             SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Transform a load and a deinterleave intrinsic into target specific
  /// instructions.
  bool lowerDeinterleaveIntrinsic(IntrinsicInst *II,
                                  SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Transform an interleave intrinsic and a store into target specific
  /// instructions.
  bool lowerInterleaveIntrinsic(IntrinsicInst *II,
                                SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Returns true if the uses of an interleaved load by the
  /// extractelement instructions in \p Extracts can be replaced by uses of the
  /// shufflevector instructions in \p Shuffles instead. If so, the necessary
  /// replacements are also performed.
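  ///
  /// For example (illustrative), with a factor-2 load %wide.vec whose
  /// even-element de-interleave shuffle is %v0 (mask <0, 2, 4, 6>), an
  /// "extractelement <8 x i32> %wide.vec, i32 2" can be rewritten as
  /// "extractelement <4 x i32> %v0, i32 1".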
  bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
                          ArrayRef<ShuffleVectorInst *> Shuffles);

  /// Given a number of shuffles of the form shuffle(binop(x,y)), convert them
  /// to binop(shuffle(x), shuffle(y)) to allow the formation of an
  /// interleaving load. Any newly created shuffles that operate on \p LI will
  /// be added to \p Shuffles. Returns true if any changes to the IR have been
  /// made.
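  ///
  /// For example (illustrative, Factor = 2):
  ///   %bin  = add <8 x i32> %wide, %other
  ///   %even = shufflevector <8 x i32> %bin, <8 x i32> poison, <0, 2, 4, 6>
  /// becomes
  ///   %s0   = shufflevector <8 x i32> %wide,  <8 x i32> poison, <0, 2, 4, 6>
  ///   %s1   = shufflevector <8 x i32> %other, <8 x i32> poison, <0, 2, 4, 6>
  ///   %even = add <4 x i32> %s0, %s1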
  bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
                            SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
                            Instruction *LI);
};

class InterleavedAccess : public FunctionPass {
  InterleavedAccessImpl Impl;

public:
  static char ID;

  InterleavedAccess() : FunctionPass(ID) {}

  StringRef getPassName() const override { return "Interleaved Access Pass"; }

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace.

PreservedAnalyses InterleavedAccessPass::run(Function &F,
                                             FunctionAnalysisManager &FAM) {
  auto *DT = &FAM.getResult<DominatorTreeAnalysis>(F);
  auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
  InterleavedAccessImpl Impl(DT, TLI);
  bool Changed = Impl.runOnFunction(F);

  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

char InterleavedAccess::ID = 0;

bool InterleavedAccess::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC || !LowerInterleavedAccesses)
    return false;

  LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");

  Impl.DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  auto &TM = TPC->getTM<TargetMachine>();
  Impl.TLI = TM.getSubtargetImpl(F)->getTargetLowering();
  Impl.MaxFactor = Impl.TLI->getMaxSupportedInterleaveFactor();

  return Impl.runOnFunction(F);
}

INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE,
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE,
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)

FunctionPass *llvm::createInterleavedAccessPass() {
  return new InterleavedAccess();
}

/// Check if the mask is a DE-interleave mask for an interleaved load.
///
/// E.g. DE-interleave masks (Factor = 2) could be:
///     <0, 2, 4, 6>    (mask of index 0 to extract even elements)
///     <1, 3, 5, 7>    (mask of index 1 to extract odd elements)
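/// For a larger factor (illustrative), a Factor = 4 DE-interleave mask of
/// index 1 would be <1, 5, 9, 13>.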
static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
                               unsigned &Index, unsigned MaxFactor,
                               unsigned NumLoadElements) {
  if (Mask.size() < 2)
    return false;

  // Check potential Factors.
  for (Factor = 2; Factor <= MaxFactor; Factor++) {
    // Make sure we don't produce a load wider than the input load.
    if (Mask.size() * Factor > NumLoadElements)
      return false;
    if (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor, Index))
      return true;
  }

  return false;
}

/// Check if the mask can be used in an interleaved store.
///
/// It checks for a more general pattern than the RE-interleave mask.
/// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
/// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
/// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
///
/// The particular case of an RE-interleave mask is:
/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
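/// E.g. For a Factor of 3 (LaneLen=4), the RE-interleave mask (illustrative)
/// would be <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>.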
static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
                               unsigned MaxFactor) {
  unsigned NumElts = SVI->getShuffleMask().size();
  if (NumElts < 4)
    return false;

  // Check potential Factors.
  for (Factor = 2; Factor <= MaxFactor; Factor++) {
    if (SVI->isInterleave(Factor))
      return true;
  }

  return false;
}

static Value *getMaskOperand(IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic");
  case Intrinsic::vp_load:
  case Intrinsic::masked_load:
    return II->getOperand(1);
  case Intrinsic::vp_store:
  case Intrinsic::masked_store:
    return II->getOperand(2);
  }
}
// Return a pair of
// (1) The corresponding de-interleaved mask, or nullptr if there is no valid
//     mask.
// (2) A gap mask: some wide masks effectively skip certain fields (i.e.
//     "gaps"), and in this APInt the inactive bits represent the fields that
//     are skipped.
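// For example (illustrative, Factor = 2): a wide mask built as
//   %wide.mask = call <8 x i1> @llvm.vector.interleave2.v8i1(<4 x i1> %m,
//                                                            <4 x i1> %m)
// de-interleaves back to the leaf mask %m with no gaps.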
static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
                                         ElementCount LeafValueEC);

static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
                                         VectorType *LeafValueTy) {
  return getMask(WideMask, Factor, LeafValueTy->getElementCount());
}

bool InterleavedAccessImpl::lowerInterleavedLoad(
    Instruction *Load, SmallSetVector<Instruction *, 32> &DeadInsts) {
  if (isa<ScalableVectorType>(Load->getType()))
    return false;

  auto *LI = dyn_cast<LoadInst>(Load);
  auto *II = dyn_cast<IntrinsicInst>(Load);
  if (!LI && !II)
    return false;

  if (LI && !LI->isSimple())
    return false;

  // Check if all users of this load are shufflevectors. If we encounter any
  // users that are extractelement instructions or binary operators, we save
  // them to later check if they can be modified to extract from one of the
  // shufflevectors instead of the load.
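  // (Illustrative: a user such as "%x = extractelement <8 x i32> %wide.vec,
  // i32 5", or a binary operator "%b = add <8 x i32> %wide.vec, %other" whose
  // only users are shuffles, can still be handled.)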

  SmallVector<ShuffleVectorInst *, 4> Shuffles;
  SmallVector<ExtractElementInst *, 4> Extracts;
  // BinOpShuffles need to be handled a single time in case both operands of
  // the binop are the same load.
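  // (E.g., illustratively, "%b = add <8 x i32> %wide.vec, %wide.vec" followed
  // by a single de-interleave shuffle of %b would otherwise be visited twice.)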
  SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;

  for (auto *User : Load->users()) {
    auto *Extract = dyn_cast<ExtractElementInst>(User);
    if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
      Extracts.push_back(Extract);
      continue;
    }
    if (auto *BI = dyn_cast<BinaryOperator>(User)) {
      using namespace PatternMatch;
      if (!BI->user_empty() &&
          all_of(BI->users(), match_fn(m_Shuffle(m_Value(), m_Undef())))) {
        for (auto *SVI : BI->users())
          BinOpShuffles.insert(cast<ShuffleVectorInst>(SVI));
        continue;
      }
    }
    auto *SVI = dyn_cast<ShuffleVectorInst>(User);
    if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
      return false;

    Shuffles.push_back(SVI);
  }

  if (Shuffles.empty() && BinOpShuffles.empty())
    return false;

  unsigned Factor, Index;

  unsigned NumLoadElements =
      cast<FixedVectorType>(Load->getType())->getNumElements();
  auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
  // Check if the first shufflevector is a DE-interleave shuffle.
  if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
                          NumLoadElements))
    return false;

  // Holds the corresponding index for each DE-interleave shuffle.
  SmallVector<unsigned, 4> Indices;

  VectorType *VecTy = cast<VectorType>(FirstSVI->getType());

  // Check if other shufflevectors are also DE-interleaved of the same type
  // and factor as the first shufflevector.
  for (auto *Shuffle : Shuffles) {
    if (Shuffle->getType() != VecTy)
      return false;
    if (!ShuffleVectorInst::isDeInterleaveMaskOfFactor(
            Shuffle->getShuffleMask(), Factor, Index))
      return false;

    assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
    Indices.push_back(Index);
  }
  for (auto *Shuffle : BinOpShuffles) {
    if (Shuffle->getType() != VecTy)
      return false;
    if (!ShuffleVectorInst::isDeInterleaveMaskOfFactor(
            Shuffle->getShuffleMask(), Factor, Index))
      return false;

    assert(Shuffle->getShuffleMask().size() <= NumLoadElements);

    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == Load)
      Indices.push_back(Index);
    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == Load)
      Indices.push_back(Index);
  }

  // Try and modify users of the load that are extractelement instructions to
  // use the shufflevector instructions instead of the load.
  if (!tryReplaceExtracts(Extracts, Shuffles))
    return false;

  bool BinOpShuffleChanged =
      replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);

  Value *Mask = nullptr;
  auto GapMask = APInt::getAllOnes(Factor);
  if (LI) {
    LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
  } else {
    // Check mask operand. Handle both all-true/false and interleaved mask.
    std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, VecTy);
    if (!Mask)
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: "
                      << *Load << "\n");
    LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
                      << " and actual factor " << GapMask.popcount() << "\n");
  }

  // Try to create target specific intrinsics to replace the load and
  // shuffles.
  if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles,
                                 Indices, Factor, GapMask))
    // If Extracts is not empty, tryReplaceExtracts made changes earlier.
    return !Extracts.empty() || BinOpShuffleChanged;

  DeadInsts.insert_range(Shuffles);

  DeadInsts.insert(Load);
  return true;
}

bool InterleavedAccessImpl::replaceBinOpShuffles(
    ArrayRef<ShuffleVectorInst *> BinOpShuffles,
    SmallVectorImpl<ShuffleVectorInst *> &Shuffles, Instruction *Load) {
  for (auto *SVI : BinOpShuffles) {
    BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
    Type *BIOp0Ty = BI->getOperand(0)->getType();
    ArrayRef<int> Mask = SVI->getShuffleMask();
    assert(all_of(Mask, [&](int Idx) {
      return Idx < (int)cast<FixedVectorType>(BIOp0Ty)->getNumElements();
    }));

    BasicBlock::iterator insertPos = SVI->getIterator();
    auto *NewSVI1 =
        new ShuffleVectorInst(BI->getOperand(0), PoisonValue::get(BIOp0Ty),
                              Mask, SVI->getName(), insertPos);
    auto *NewSVI2 = new ShuffleVectorInst(
        BI->getOperand(1), PoisonValue::get(BI->getOperand(1)->getType()), Mask,
        SVI->getName(), insertPos);
    BinaryOperator *NewBI = BinaryOperator::CreateWithCopiedFlags(
        BI->getOpcode(), NewSVI1, NewSVI2, BI, BI->getName(), insertPos);
    SVI->replaceAllUsesWith(NewBI);
    LLVM_DEBUG(dbgs() << " Replaced: " << *BI << "\n And : " << *SVI
                      << "\n With : " << *NewSVI1 << "\n And : "
                      << *NewSVI2 << "\n And : " << *NewBI << "\n");
    RecursivelyDeleteTriviallyDeadInstructions(SVI);
    if (NewSVI1->getOperand(0) == Load)
      Shuffles.push_back(NewSVI1);
    if (NewSVI2->getOperand(0) == Load)
      Shuffles.push_back(NewSVI2);
  }

  return !BinOpShuffles.empty();
}

bool InterleavedAccessImpl::tryReplaceExtracts(
    ArrayRef<ExtractElementInst *> Extracts,
    ArrayRef<ShuffleVectorInst *> Shuffles) {
  // If there aren't any extractelement instructions to modify, there's nothing
  // to do.
  if (Extracts.empty())
    return true;

  // Maps extractelement instructions to vector-index pairs. The extractelement
  // instructions will be modified to use the new vector and index operands.
  DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;

  for (auto *Extract : Extracts) {
    // The vector index that is extracted.
    auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
    auto Index = IndexOperand->getSExtValue();

    // Look for a suitable shufflevector instruction. The goal is to modify the
    // extractelement instruction (which uses an interleaved load) to use one
    // of the shufflevector instructions instead of the load.
    for (auto *Shuffle : Shuffles) {
      // If the shufflevector instruction doesn't dominate the extract, we
      // can't create a use of it.
      if (!DT->dominates(Shuffle, Extract))
        continue;

      // Inspect the indices of the shufflevector instruction. If the shuffle
      // selects the same index that is extracted, we can modify the
      // extractelement instruction.
      SmallVector<int, 4> Indices;
      Shuffle->getShuffleMask(Indices);
      for (unsigned I = 0; I < Indices.size(); ++I)
        if (Indices[I] == Index) {
          assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
                 "Vector operations do not match");
          ReplacementMap[Extract] = std::make_pair(Shuffle, I);
          break;
        }

      // If we found a suitable shufflevector instruction, stop looking.
      if (ReplacementMap.count(Extract))
        break;
    }

    // If we did not find a suitable shufflevector instruction, the
    // extractelement instruction cannot be modified, so we must give up.
    if (!ReplacementMap.count(Extract))
      return false;
  }

  // Finally, perform the replacements.
  IRBuilder<> Builder(Extracts[0]->getContext());
  for (auto &Replacement : ReplacementMap) {
    auto *Extract = Replacement.first;
    auto *Vector = Replacement.second.first;
    auto Index = Replacement.second.second;
    Builder.SetInsertPoint(Extract);
    Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
    Extract->eraseFromParent();
  }

  return true;
}

bool InterleavedAccessImpl::lowerInterleavedStore(
    Instruction *Store, SmallSetVector<Instruction *, 32> &DeadInsts) {
  Value *StoredValue;
  auto *SI = dyn_cast<StoreInst>(Store);
  auto *II = dyn_cast<IntrinsicInst>(Store);
  if (SI) {
    if (!SI->isSimple())
      return false;
    StoredValue = SI->getValueOperand();
  } else {
    assert(II->getIntrinsicID() == Intrinsic::vp_store ||
           II->getIntrinsicID() == Intrinsic::masked_store);
    StoredValue = II->getArgOperand(0);
  }

  auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
  if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
    return false;

  unsigned NumStoredElements =
      cast<FixedVectorType>(SVI->getType())->getNumElements();
  // Check if the shufflevector is a RE-interleave shuffle.
  unsigned Factor;
  if (!isReInterleaveMask(SVI, Factor, MaxFactor))
    return false;
  assert(NumStoredElements % Factor == 0 &&
         "number of stored elements should be a multiple of Factor");

  Value *Mask = nullptr;
  auto GapMask = APInt::getAllOnes(Factor);
  if (SI) {
    LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
  } else {
    // Check mask operand. Handle both all-true/false and interleaved mask.
    unsigned LaneMaskLen = NumStoredElements / Factor;
    std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor,
                                      ElementCount::getFixed(LaneMaskLen));
    if (!Mask)
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: "
                      << *Store << "\n");
    LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
                      << " and actual factor " << GapMask.popcount() << "\n");
  }

  // Try to create target specific intrinsics to replace the store and
  // shuffle.
  if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor, GapMask))
    return false;

  // Already have a new target specific interleaved store. Erase the old store.
  DeadInsts.insert(Store);
  DeadInsts.insert(SVI);
  return true;
}

// A wide mask <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0> could be used to skip the
// last field in a factor-of-three interleaved store or deinterleaved load (in
// which case LeafMaskLen is 4). Such a (wide) mask is also known as a gap
// mask. This helper function tries to detect this pattern and clears the
// corresponding bits in GapMask, so that the actual factor being accessed is
// GapMask.popcount() (2 in this example).
static void getGapMask(const Constant &MaskConst, unsigned Factor,
                       unsigned LeafMaskLen, APInt &GapMask) {
  assert(GapMask.getBitWidth() == Factor);
  for (unsigned F = 0U; F < Factor; ++F) {
    bool AllZero = true;
    for (unsigned Idx = 0U; Idx < LeafMaskLen; ++Idx) {
      Constant *C = MaskConst.getAggregateElement(F + Idx * Factor);
      if (!C->isZeroValue()) {
        AllZero = false;
        break;
      }
    }
    // All mask bits on this field are zero, skipping it.
    if (AllZero)
      GapMask.clearBit(F);
  }
}

static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
                                         ElementCount LeafValueEC) {
  auto GapMask = APInt::getAllOnes(Factor);

  if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
    if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID());
        F && F == Factor) {
      Value *RefArg = nullptr;
      // Check if all the intrinsic arguments are the same, except those that
      // are zeros, which we mark as gaps in the gap mask.
      for (auto [Idx, Arg] : enumerate(IMI->args())) {
        if (auto *C = dyn_cast<Constant>(Arg); C && C->isZeroValue()) {
          GapMask.clearBit(Idx);
          continue;
        }

        if (!RefArg)
          RefArg = Arg;
        else if (RefArg != Arg)
          return {nullptr, GapMask};
      }

      // In the very rare case where all the intrinsic arguments are zeros, we
      // still want to return an all-zeros constant instead of nullptr.
      return {RefArg ? RefArg : IMI->getArgOperand(0), GapMask};
    }
  }

  // Masks that are assembled from bitwise AND.
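  // For a wide mask such as "%wide.mask = and <8 x i1> %interleaved.mask,
  // %other" (illustrative), de-interleave each operand separately and
  // recombine the resulting leaf masks with a new 'and'; the gap masks are
  // intersected.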
  if (auto *AndOp = dyn_cast<BinaryOperator>(WideMask);
      AndOp && AndOp->getOpcode() == Instruction::And) {
    auto [MaskLHS, GapMaskLHS] =
        getMask(AndOp->getOperand(0), Factor, LeafValueEC);
    auto [MaskRHS, GapMaskRHS] =
        getMask(AndOp->getOperand(1), Factor, LeafValueEC);
    if (!MaskLHS || !MaskRHS)
      return {nullptr, GapMask};
    // Using IRBuilder here so that any trivial constants could be folded right
    // away.
    return {IRBuilder<>(AndOp).CreateAnd(MaskLHS, MaskRHS),
            GapMaskLHS & GapMaskRHS};
  }

  if (auto *ConstMask = dyn_cast<Constant>(WideMask)) {
    if (auto *Splat = ConstMask->getSplatValue())
      // All-ones or all-zeros mask.
      return {ConstantVector::getSplat(LeafValueEC, Splat), GapMask};

    if (LeafValueEC.isFixed()) {
      unsigned LeafMaskLen = LeafValueEC.getFixedValue();
      // First, check if we use a gap mask to skip some of the factors / fields.
      getGapMask(*ConstMask, Factor, LeafMaskLen, GapMask);

      SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr);
      // If this is a fixed-length constant mask, each lane / leaf has to
      // use the same mask. This is done by checking if every group with Factor
      // number of elements in the interleaved mask has homogeneous values.
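      // E.g. (illustrative) with Factor = 2, <1, 1, 1, 1, 0, 0, 0, 0>
      // de-interleaves to the leaf mask <1, 1, 0, 0>, whereas a group with
      // mixed values such as <1, 0, ...> is rejected.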
      for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) {
        if (!GapMask[Idx % Factor])
          continue;
        Constant *C = ConstMask->getAggregateElement(Idx);
        if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C)
          return {nullptr, GapMask};
        LeafMask[Idx / Factor] = C;
      }

      return {ConstantVector::get(LeafMask), GapMask};
    }
  }

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(WideMask)) {
    Type *Op1Ty = SVI->getOperand(1)->getType();
    if (!isa<FixedVectorType>(Op1Ty))
      return {nullptr, GapMask};

    // Check that the shuffle mask is: a) an interleave, b) all of the same
    // set of the elements, and c) contained by the first source. (c) could
    // be relaxed if desired.
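    // E.g. (illustrative) "shufflevector <4 x i1> %m, <4 x i1> poison,
    // <0, 0, 1, 1, 2, 2, 3, 3>" is an interleave-by-2 of %m drawn entirely
    // from the first source, so %m itself serves as the leaf mask.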
    unsigned NumSrcElts =
        cast<FixedVectorType>(SVI->getOperand(1)->getType())->getNumElements();
    SmallVector<unsigned> StartIndexes;
    if (ShuffleVectorInst::isInterleaveMask(SVI->getShuffleMask(), Factor,
                                            NumSrcElts * 2, StartIndexes) &&
        llvm::all_of(StartIndexes, equal_to(0)) &&
        llvm::all_of(SVI->getShuffleMask(), [&NumSrcElts](int Idx) {
          return Idx < (int)NumSrcElts;
        })) {
      auto *LeafMaskTy =
          VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC);
      IRBuilder<> Builder(SVI);
      return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0),
                                          uint64_t(0)),
              GapMask};
    }
  }

  return {nullptr, GapMask};
}

bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
    IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
  Instruction *LoadedVal = dyn_cast<Instruction>(DI->getOperand(0));
  if (!LoadedVal || !LoadedVal->hasOneUse())
    return false;

  auto *LI = dyn_cast<LoadInst>(LoadedVal);
  auto *II = dyn_cast<IntrinsicInst>(LoadedVal);
  if (!LI && !II)
    return false;

  const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
  assert(Factor && "unexpected deinterleave intrinsic");

  Value *Mask = nullptr;
  if (LI) {
    if (!LI->isSimple())
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
                      << " and factor = " << Factor << "\n");
  } else {
    assert(II);
    if (II->getIntrinsicID() != Intrinsic::masked_load &&
        II->getIntrinsicID() != Intrinsic::vp_load)
      return false;

    // Check mask operand. Handle both all-true/false and interleaved mask.
    APInt GapMask(Factor, 0);
    std::tie(Mask, GapMask) =
        getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
    if (!Mask)
      return false;
    // Gap masks are not yet supported when deinterleaving via intrinsics.
    // However, getMask may already have changed the IR, hence return true
    // here.
    if (GapMask.popcount() != Factor)
      return true;

    LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave"
                      << " intrinsic " << *DI << " and factor = "
                      << Factor << "\n");
  }

  // Try and match this with target specific intrinsics.
  if (!TLI->lowerDeinterleaveIntrinsicToLoad(LoadedVal, Mask, DI))
    return false;

  DeadInsts.insert(DI);
  // We now have a target-specific load, so delete the old one.
  DeadInsts.insert(LoadedVal);
  return true;
}

bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
    IntrinsicInst *IntII, SmallSetVector<Instruction *, 32> &DeadInsts) {
  if (!IntII->hasOneUse())
    return false;
  Instruction *StoredBy = dyn_cast<Instruction>(IntII->user_back());
  if (!StoredBy)
    return false;
  auto *SI = dyn_cast<StoreInst>(StoredBy);
  auto *II = dyn_cast<IntrinsicInst>(StoredBy);
  if (!SI && !II)
    return false;

  SmallVector<Value *, 8> InterleaveValues(IntII->args());
  const unsigned Factor = getInterleaveIntrinsicFactor(IntII->getIntrinsicID());
  assert(Factor && "unexpected interleave intrinsic");

  Value *Mask = nullptr;
  if (II) {
    if (II->getIntrinsicID() != Intrinsic::masked_store &&
        II->getIntrinsicID() != Intrinsic::vp_store)
      return false;
    // Check mask operand. Handle both all-true/false and interleaved mask.
    APInt GapMask(Factor, 0);
    std::tie(Mask, GapMask) =
        getMask(getMaskOperand(II), Factor,
                cast<VectorType>(InterleaveValues[0]->getType()));
    if (!Mask)
      return false;
    // Gap masks are not yet supported when interleaving via intrinsics.
    // However, getMask may already have changed the IR, hence return true
    // here.
    if (GapMask.popcount() != Factor)
      return true;

    LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave"
                      << " intrinsic " << *IntII << " and factor = "
                      << Factor << "\n");
  } else {
    if (!SI->isSimple())
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic "
                      << *IntII << " and factor = " << Factor << "\n");
  }

  // Try and match this with target specific intrinsics.
  if (!TLI->lowerInterleaveIntrinsicToStore(StoredBy, Mask, InterleaveValues))
    return false;

  // We now have a target-specific store, so delete the old one.
  DeadInsts.insert(StoredBy);
  DeadInsts.insert(IntII);
  return true;
}

bool InterleavedAccessImpl::runOnFunction(Function &F) {
  // Holds dead instructions that will be erased later.
  SmallSetVector<Instruction *, 32> DeadInsts;
  bool Changed = false;

  using namespace PatternMatch;
  for (auto &I : instructions(F)) {
    if (match(&I, m_CombineOr(m_Load(m_Value()),
                              m_Intrinsic<Intrinsic::vp_load>())) ||
        match(&I, m_Intrinsic<Intrinsic::masked_load>()))
      Changed |= lowerInterleavedLoad(&I, DeadInsts);

    if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
                              m_Intrinsic<Intrinsic::vp_store>())) ||
        match(&I, m_Intrinsic<Intrinsic::masked_store>()))
      Changed |= lowerInterleavedStore(&I, DeadInsts);

    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
      if (getDeinterleaveIntrinsicFactor(II->getIntrinsicID()))
        Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
      else if (getInterleaveIntrinsicFactor(II->getIntrinsicID()))
        Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
    }
  }

  for (auto *I : DeadInsts)
    I->eraseFromParent();

  return Changed;
}