//===- InterleavedAccessPass.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Interleaved Access pass, which identifies
// interleaved memory accesses and transforms them into target specific
// intrinsics.
//
// An interleaved load reads data from memory into several vectors,
// DE-interleaving the data by a factor. An interleaved store RE-interleaves
// several vectors by a factor and writes them to memory.
//
// As interleaved accesses are difficult to identify in CodeGen (mainly
// because the VECTOR_SHUFFLE DAG node is quite different from the
// shufflevector IR), we identify and transform them to intrinsics in this
// pass so the intrinsics can be easily matched into target specific
// instructions later in CodeGen.
//
// E.g. An interleaved load (Factor = 2):
//        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
//        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
//        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
//
// It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
// intrinsic in ARM backend.
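//
// Purely for illustration (the exact form is chosen by the target hook, and
// the intrinsic name below is an assumption about the AArch64 lowering), the
// load above could become roughly:
//        %ld2 = call { <4 x i32>, <4 x i32> }
//                    @llvm.aarch64.neon.ld2.v4i32.p0(ptr %ptr)
//        %v0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
//        %v1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1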
//
// In X86, this can be further optimized into a set of target
// specific loads followed by an optimized sequence of shuffles.
//
// E.g. An interleaved store (Factor = 3):
//        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
//                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
//        store <12 x i32> %i.vec, <12 x i32>* %ptr
//
// It could be transformed into a st3 intrinsic in AArch64 backend or a vst3
// intrinsic in ARM backend.
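//
// Again as an illustration only (the intrinsic name and operand order are
// assumptions about the AArch64 lowering, not something this pass emits
// directly), the store above could become roughly:
//        call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %sub.v0,
//                  <4 x i32> %sub.v1, <4 x i32> %sub.v2, ptr %ptr)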
//
// Similarly, a set of interleaved stores can be transformed into an optimized
// sequence of shuffles followed by a set of target specific stores for X86.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/InterleavedAccess.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "interleaved-access"

static cl::opt<bool> LowerInterleavedAccesses(
    "lower-interleaved-accesses",
    cl::desc("Enable lowering interleaved accesses to intrinsics"),
    cl::init(true), cl::Hidden);

namespace {

class InterleavedAccessImpl {
  friend class InterleavedAccess;

public:
  InterleavedAccessImpl() = default;
  InterleavedAccessImpl(DominatorTree *DT, const TargetLowering *TLI)
      : DT(DT), TLI(TLI), MaxFactor(TLI->getMaxSupportedInterleaveFactor()) {}
  bool runOnFunction(Function &F);

private:
  DominatorTree *DT = nullptr;
  const TargetLowering *TLI = nullptr;

  /// The maximum supported interleave factor.
  unsigned MaxFactor = 0u;

  /// Transform an interleaved load into target specific intrinsics.
  bool lowerInterleavedLoad(Instruction *Load,
                            SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Transform an interleaved store into target specific intrinsics.
  bool lowerInterleavedStore(Instruction *Store,
                             SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Transform a load and a deinterleave intrinsic into target specific
  /// instructions.
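  /// A minimal sketch of the IR shape this handles (illustrative only; the
  /// value names and exact intrinsic mangling here are assumptions):
  ///   %wide = load <8 x i32>, ptr %ptr
  ///   %di = call { <4 x i32>, <4 x i32> }
  ///              @llvm.vector.deinterleave2.v8i32(<8 x i32> %wide)
  ///   %v0 = extractvalue { <4 x i32>, <4 x i32> } %di, 0
  ///   %v1 = extractvalue { <4 x i32>, <4 x i32> } %di, 1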
  bool lowerDeinterleaveIntrinsic(IntrinsicInst *II,
                                  SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Transform an interleave intrinsic and a store into target specific
  /// instructions.
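  /// A minimal sketch of the IR shape this handles (illustrative only; the
  /// value names and exact intrinsic mangling here are assumptions):
  ///   %iv = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> %v0,
  ///                                                       <4 x i32> %v1)
  ///   store <8 x i32> %iv, ptr %ptr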
  bool lowerInterleaveIntrinsic(IntrinsicInst *II,
                                SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Returns true if the uses of an interleaved load by the
  /// extractelement instructions in \p Extracts can be replaced by uses of the
  /// shufflevector instructions in \p Shuffles instead. If so, the necessary
  /// replacements are also performed.
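  /// E.g. (illustrative, Factor = 2): an extract of an even lane such as
  ///   %e = extractelement <8 x i32> %wide.vec, i32 2
  /// may be rewritten to extract from the even-lane shuffle instead:
  ///   %e = extractelement <4 x i32> %v0, i32 1
  /// where %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>.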
  bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
                          ArrayRef<ShuffleVectorInst *> Shuffles);

  /// Given a number of shuffles of the form shuffle(binop(x,y)), convert them
  /// to binop(shuffle(x), shuffle(y)) to allow the formation of an
  /// interleaving load. Any newly created shuffles that operate on \p LI will
  /// be added to \p Shuffles. Returns true if any changes to the IR have been
  /// made.
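  /// E.g. (illustrative, Factor = 2): a pattern such as
  ///   %add = add <8 x i32> %wide.vec, %wide.vec2
  ///   %s   = shuffle <8 x i32> %add, <8 x i32> poison, <0, 2, 4, 6>
  /// is rewritten to shuffle the binop operands first:
  ///   %s0    = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
  ///   %s1    = shuffle <8 x i32> %wide.vec2, <8 x i32> poison, <0, 2, 4, 6>
  ///   %add.s = add <4 x i32> %s0, %s1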
  bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
                            SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
                            Instruction *LI);
};

class InterleavedAccess : public FunctionPass {
  InterleavedAccessImpl Impl;

public:
  static char ID;

  InterleavedAccess() : FunctionPass(ID) {
    initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override { return "Interleaved Access Pass"; }

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace.

PreservedAnalyses InterleavedAccessPass::run(Function &F,
                                             FunctionAnalysisManager &FAM) {
  auto *DT = &FAM.getResult<DominatorTreeAnalysis>(F);
  auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
  InterleavedAccessImpl Impl(DT, TLI);
  bool Changed = Impl.runOnFunction(F);

  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

char InterleavedAccess::ID = 0;

bool InterleavedAccess::runOnFunction(Function &F) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC || !LowerInterleavedAccesses)
    return false;

  LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");

  Impl.DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  auto &TM = TPC->getTM<TargetMachine>();
  Impl.TLI = TM.getSubtargetImpl(F)->getTargetLowering();
  Impl.MaxFactor = Impl.TLI->getMaxSupportedInterleaveFactor();

  return Impl.runOnFunction(F);
}

INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE,
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE,
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)

FunctionPass *llvm::createInterleavedAccessPass() {
  return new InterleavedAccess();
}

/// Check if the mask is a DE-interleave mask for an interleaved load.
///
/// E.g. DE-interleave masks (Factor = 2) could be:
///     <0, 2, 4, 6>    (mask of index 0 to extract even elements)
///     <1, 3, 5, 7>    (mask of index 1 to extract odd elements)
static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
                               unsigned &Index, unsigned MaxFactor,
                               unsigned NumLoadElements) {
  if (Mask.size() < 2)
    return false;

  // Check potential Factors.
  for (Factor = 2; Factor <= MaxFactor; Factor++) {
    // Make sure we don't produce a load wider than the input load.
    if (Mask.size() * Factor > NumLoadElements)
      return false;
    if (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor, Index))
      return true;
  }

  return false;
}

/// Check if the mask can be used in an interleaved store.
///
/// It checks for a more general pattern than the RE-interleave mask.
/// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
/// E.g. For a Factor of 3 (LaneLen=4):
///   <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
/// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
///
/// The particular case of an RE-interleave mask is:
/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
                               unsigned MaxFactor) {
  unsigned NumElts = SVI->getShuffleMask().size();
  if (NumElts < 4)
    return false;

  // Check potential Factors.
  for (Factor = 2; Factor <= MaxFactor; Factor++) {
    if (SVI->isInterleave(Factor))
      return true;
  }

  return false;
}

// Return the corresponding de-interleaved mask, or nullptr if there is no
// valid mask.
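// E.g. (illustrative): for Factor = 2, a wide mask <1, 1, 0, 0> on the
// interleaved value corresponds to the per-lane mask <1, 0> on each
// de-interleaved leaf value.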
static Value *getMask(Value *WideMask, unsigned Factor,
                      ElementCount LeafValueEC);

static Value *getMask(Value *WideMask, unsigned Factor,
                      VectorType *LeafValueTy) {
  return getMask(WideMask, Factor, LeafValueTy->getElementCount());
}

bool InterleavedAccessImpl::lowerInterleavedLoad(
    Instruction *Load, SmallSetVector<Instruction *, 32> &DeadInsts) {
  if (isa<ScalableVectorType>(Load->getType()))
    return false;

  if (auto *LI = dyn_cast<LoadInst>(Load)) {
    if (!LI->isSimple())
      return false;
  } else if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
    assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
    // Require a constant mask.
    if (!isa<ConstantVector>(VPLoad->getMaskParam()))
      return false;
  } else {
    llvm_unreachable("unsupported load operation");
  }

  // Check if all users of this load are shufflevectors. If we encounter any
  // users that are extractelement instructions or binary operators, we save
  // them to later check if they can be modified to extract from one of the
  // shufflevectors instead of the load.

  SmallVector<ShuffleVectorInst *, 4> Shuffles;
  SmallVector<ExtractElementInst *, 4> Extracts;
  // BinOpShuffles need to be handled a single time in case both operands of
  // the binop are the same load.
  SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;

  for (auto *User : Load->users()) {
    auto *Extract = dyn_cast<ExtractElementInst>(User);
    if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
      Extracts.push_back(Extract);
      continue;
    }
    if (auto *BI = dyn_cast<BinaryOperator>(User)) {
      if (!BI->user_empty() && all_of(BI->users(), [](auto *U) {
            auto *SVI = dyn_cast<ShuffleVectorInst>(U);
            return SVI && isa<UndefValue>(SVI->getOperand(1));
          })) {
        for (auto *SVI : BI->users())
          BinOpShuffles.insert(cast<ShuffleVectorInst>(SVI));
        continue;
      }
    }
    auto *SVI = dyn_cast<ShuffleVectorInst>(User);
    if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
      return false;

    Shuffles.push_back(SVI);
  }

  if (Shuffles.empty() && BinOpShuffles.empty())
    return false;

  unsigned Factor, Index;

  unsigned NumLoadElements =
      cast<FixedVectorType>(Load->getType())->getNumElements();
  auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
  // Check if the first shufflevector is a DE-interleave shuffle.
  if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
                          NumLoadElements))
    return false;

  // Holds the corresponding index for each DE-interleave shuffle.
  SmallVector<unsigned, 4> Indices;

  Type *VecTy = FirstSVI->getType();

  // Check if the other shufflevectors are also DE-interleave shuffles of the
  // same type and factor as the first shufflevector.
  for (auto *Shuffle : Shuffles) {
    if (Shuffle->getType() != VecTy)
      return false;
    if (!ShuffleVectorInst::isDeInterleaveMaskOfFactor(
            Shuffle->getShuffleMask(), Factor, Index))
      return false;

    assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
    Indices.push_back(Index);
  }
  for (auto *Shuffle : BinOpShuffles) {
    if (Shuffle->getType() != VecTy)
      return false;
    if (!ShuffleVectorInst::isDeInterleaveMaskOfFactor(
            Shuffle->getShuffleMask(), Factor, Index))
      return false;

    assert(Shuffle->getShuffleMask().size() <= NumLoadElements);

    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == Load)
      Indices.push_back(Index);
    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == Load)
      Indices.push_back(Index);
  }

  // Try and modify users of the load that are extractelement instructions to
  // use the shufflevector instructions instead of the load.
  if (!tryReplaceExtracts(Extracts, Shuffles))
    return false;

  bool BinOpShuffleChanged =
      replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);

  if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
    Value *LaneMask =
        getMask(VPLoad->getMaskParam(), Factor, cast<VectorType>(VecTy));
    if (!LaneMask)
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n");

    // Sometimes the number of Shuffles might be less than Factor, so we have
    // to fill the gaps with null. Also, lowerInterleavedVPLoad expects them to
    // be sorted.
    SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
    for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
      ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
    if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues))
      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
      return !Extracts.empty() || BinOpShuffleChanged;
  } else {
    LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");

    // Try to create target specific intrinsics to replace the load and
    // shuffles.
    if (!TLI->lowerInterleavedLoad(cast<LoadInst>(Load), Shuffles, Indices,
                                   Factor))
      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
      return !Extracts.empty() || BinOpShuffleChanged;
  }

  DeadInsts.insert_range(Shuffles);

  DeadInsts.insert(Load);
  return true;
}

bool InterleavedAccessImpl::replaceBinOpShuffles(
    ArrayRef<ShuffleVectorInst *> BinOpShuffles,
    SmallVectorImpl<ShuffleVectorInst *> &Shuffles, Instruction *Load) {
  for (auto *SVI : BinOpShuffles) {
    BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
    Type *BIOp0Ty = BI->getOperand(0)->getType();
    ArrayRef<int> Mask = SVI->getShuffleMask();
    assert(all_of(Mask, [&](int Idx) {
      return Idx < (int)cast<FixedVectorType>(BIOp0Ty)->getNumElements();
    }));

    BasicBlock::iterator insertPos = SVI->getIterator();
    auto *NewSVI1 =
        new ShuffleVectorInst(BI->getOperand(0), PoisonValue::get(BIOp0Ty),
                              Mask, SVI->getName(), insertPos);
    auto *NewSVI2 = new ShuffleVectorInst(
        BI->getOperand(1), PoisonValue::get(BI->getOperand(1)->getType()), Mask,
        SVI->getName(), insertPos);
    BinaryOperator *NewBI = BinaryOperator::CreateWithCopiedFlags(
        BI->getOpcode(), NewSVI1, NewSVI2, BI, BI->getName(), insertPos);
    SVI->replaceAllUsesWith(NewBI);
    LLVM_DEBUG(dbgs() << "  Replaced: " << *BI << "\n    And   : " << *SVI
                      << "\n  With    : " << *NewSVI1 << "\n    And   : "
                      << *NewSVI2 << "\n    And   : " << *NewBI << "\n");
    RecursivelyDeleteTriviallyDeadInstructions(SVI);
    if (NewSVI1->getOperand(0) == Load)
      Shuffles.push_back(NewSVI1);
    if (NewSVI2->getOperand(0) == Load)
      Shuffles.push_back(NewSVI2);
  }

  return !BinOpShuffles.empty();
}
433
434bool InterleavedAccessImpl::tryReplaceExtracts(
435 ArrayRef<ExtractElementInst *> Extracts,
436 ArrayRef<ShuffleVectorInst *> Shuffles) {
437 // If there aren't any extractelement instructions to modify, there's nothing
438 // to do.
439 if (Extracts.empty())
440 return true;
441
442 // Maps extractelement instructions to vector-index pairs. The extractlement
443 // instructions will be modified to use the new vector and index operands.
444 DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;

  for (auto *Extract : Extracts) {
    // The vector index that is extracted.
    auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
    auto Index = IndexOperand->getSExtValue();

    // Look for a suitable shufflevector instruction. The goal is to modify the
    // extractelement instruction (which uses an interleaved load) to use one
    // of the shufflevector instructions instead of the load.
    for (auto *Shuffle : Shuffles) {
      // If the shufflevector instruction doesn't dominate the extract, we
      // can't create a use of it.
      if (!DT->dominates(Shuffle, Extract))
        continue;

      // Inspect the indices of the shufflevector instruction. If the shuffle
      // selects the same index that is extracted, we can modify the
      // extractelement instruction.
      SmallVector<int, 4> Indices;
      Shuffle->getShuffleMask(Indices);
      for (unsigned I = 0; I < Indices.size(); ++I)
        if (Indices[I] == Index) {
          assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
                 "Vector operations do not match");
          ReplacementMap[Extract] = std::make_pair(Shuffle, I);
          break;
        }

      // If we found a suitable shufflevector instruction, stop looking.
      if (ReplacementMap.count(Extract))
        break;
    }

    // If we did not find a suitable shufflevector instruction, the
    // extractelement instruction cannot be modified, so we must give up.
    if (!ReplacementMap.count(Extract))
      return false;
  }

  // Finally, perform the replacements.
  IRBuilder<> Builder(Extracts[0]->getContext());
  for (auto &Replacement : ReplacementMap) {
    auto *Extract = Replacement.first;
    auto *Vector = Replacement.second.first;
    auto Index = Replacement.second.second;
    Builder.SetInsertPoint(Extract);
    Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
    Extract->eraseFromParent();
  }

  return true;
}

bool InterleavedAccessImpl::lowerInterleavedStore(
    Instruction *Store, SmallSetVector<Instruction *, 32> &DeadInsts) {
  Value *StoredValue;
  if (auto *SI = dyn_cast<StoreInst>(Store)) {
    if (!SI->isSimple())
      return false;
    StoredValue = SI->getValueOperand();
  } else if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
    assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
    // Require a constant mask.
    if (!isa<ConstantVector>(VPStore->getMaskParam()))
      return false;
    StoredValue = VPStore->getArgOperand(0);
  } else {
    llvm_unreachable("unsupported store operation");
  }

  auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
  if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
    return false;

  unsigned NumStoredElements =
      cast<FixedVectorType>(SVI->getType())->getNumElements();
  // Check if the shufflevector is a RE-interleave shuffle.
  unsigned Factor;
  if (!isReInterleaveMask(SVI, Factor, MaxFactor))
    return false;
  assert(NumStoredElements % Factor == 0 &&
         "number of stored elements should be a multiple of Factor");

  if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
    unsigned LaneMaskLen = NumStoredElements / Factor;
    Value *LaneMask = getMask(VPStore->getMaskParam(), Factor,
                              ElementCount::getFixed(LaneMaskLen));
    if (!LaneMask)
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store
                      << "\n");

    IRBuilder<> Builder(VPStore);
    // We need to effectively de-interleave the shuffle mask because
    // lowerInterleavedVPStore expects individual de-interleaved values.
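    // E.g. (illustrative, Factor = 2, LaneLen = 4): the RE-interleave mask
    // <0, 4, 1, 5, 2, 6, 3, 7> is split into the per-lane masks
    // <0, 1, 2, 3> (i = 0) and <4, 5, 6, 7> (i = 1) by the loop below.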
    SmallVector<Value *, 10> NewShuffles;
    SmallVector<int, 16> NewShuffleMask(LaneMaskLen);
    auto ShuffleMask = SVI->getShuffleMask();

    for (unsigned i = 0; i < Factor; i++) {
      for (unsigned j = 0; j < LaneMaskLen; j++)
        NewShuffleMask[j] = ShuffleMask[i + Factor * j];

      NewShuffles.push_back(Builder.CreateShuffleVector(
          SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
    }

    // Try to create target specific intrinsics to replace the vp.store and
    // shuffle.
    if (!TLI->lowerInterleavedVPStore(VPStore, LaneMask, NewShuffles))
      // We already created new shuffles.
      return true;
  } else {
    LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");

    // Try to create target specific intrinsics to replace the store and
    // shuffle.
    if (!TLI->lowerInterleavedStore(cast<StoreInst>(Store), SVI, Factor))
      return false;
  }

  // Already have a new target specific interleaved store. Erase the old store.
  DeadInsts.insert(Store);
  DeadInsts.insert(SVI);
  return true;
}

static bool isInterleaveIntrinsic(Intrinsic::ID IID) {
  switch (IID) {
  case Intrinsic::vector_interleave2:
  case Intrinsic::vector_interleave3:
  case Intrinsic::vector_interleave4:
  case Intrinsic::vector_interleave5:
  case Intrinsic::vector_interleave6:
  case Intrinsic::vector_interleave7:
  case Intrinsic::vector_interleave8:
    return true;
  default:
    return false;
  }
}

static bool isDeinterleaveIntrinsic(Intrinsic::ID IID) {
  switch (IID) {
  case Intrinsic::vector_deinterleave2:
  case Intrinsic::vector_deinterleave3:
  case Intrinsic::vector_deinterleave4:
  case Intrinsic::vector_deinterleave5:
  case Intrinsic::vector_deinterleave6:
  case Intrinsic::vector_deinterleave7:
  case Intrinsic::vector_deinterleave8:
    return true;
  default:
    return false;
  }
}

static unsigned getIntrinsicFactor(const IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  case Intrinsic::vector_deinterleave2:
  case Intrinsic::vector_interleave2:
    return 2;
  case Intrinsic::vector_deinterleave3:
  case Intrinsic::vector_interleave3:
    return 3;
  case Intrinsic::vector_deinterleave4:
  case Intrinsic::vector_interleave4:
    return 4;
  case Intrinsic::vector_deinterleave5:
  case Intrinsic::vector_interleave5:
    return 5;
  case Intrinsic::vector_deinterleave6:
  case Intrinsic::vector_interleave6:
    return 6;
  case Intrinsic::vector_deinterleave7:
  case Intrinsic::vector_interleave7:
    return 7;
  case Intrinsic::vector_deinterleave8:
  case Intrinsic::vector_interleave8:
    return 8;
  default:
    llvm_unreachable("Unexpected intrinsic");
  }
}

static Value *getMask(Value *WideMask, unsigned Factor,
                      ElementCount LeafValueEC) {
  if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
    if (isInterleaveIntrinsic(IMI->getIntrinsicID()) &&
        getIntrinsicFactor(IMI) == Factor && llvm::all_equal(IMI->args())) {
      return IMI->getArgOperand(0);
    }
  }

  if (auto *ConstMask = dyn_cast<Constant>(WideMask)) {
    if (auto *Splat = ConstMask->getSplatValue())
      // All-ones or all-zeros mask.
      return ConstantVector::getSplat(LeafValueEC, Splat);

    if (LeafValueEC.isFixed()) {
      unsigned LeafMaskLen = LeafValueEC.getFixedValue();
      SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr);
      // If this is a fixed-length constant mask, each lane / leaf has to
      // use the same mask. This is done by checking if every group with Factor
      // number of elements in the interleaved mask has homogeneous values.
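      // E.g. (illustrative, Factor = 3): a wide mask <1, 1, 1, 0, 0, 0>
      // groups as (1, 1, 1)(0, 0, 0) and yields the leaf mask <1, 0>, while
      // <1, 0, 1, 0, 0, 0> has a mixed first group and is rejected (nullptr
      // is returned).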
      for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) {
        Constant *C = ConstMask->getAggregateElement(Idx);
        if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C)
          return nullptr;
        LeafMask[Idx / Factor] = C;
      }

      return ConstantVector::get(LeafMask);
    }
  }

  return nullptr;
}

bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
    IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
  Value *LoadedVal = DI->getOperand(0);
  if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(LoadedVal))
    return false;

  const unsigned Factor = getIntrinsicFactor(DI);
  if (!DI->hasNUses(Factor))
    return false;
  SmallVector<Value *, 8> DeinterleaveValues(Factor);
  for (auto *User : DI->users()) {
    auto *Extract = dyn_cast<ExtractValueInst>(User);
    if (!Extract || Extract->getNumIndices() != 1)
      return false;
    unsigned Idx = Extract->getIndices()[0];
    if (DeinterleaveValues[Idx])
      return false;
    DeinterleaveValues[Idx] = Extract;
  }

  if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadedVal)) {
    if (VPLoad->getIntrinsicID() != Intrinsic::vp_load)
      return false;
    // Check mask operand. Handle both all-true/false and interleaved mask.
    Value *WideMask = VPLoad->getOperand(1);
    Value *Mask = getMask(WideMask, Factor,
                          cast<VectorType>(DeinterleaveValues[0]->getType()));
    if (!Mask)
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic "
                      << *DI << " and factor = " << Factor << "\n");

    // Since lowerInterleavedLoad expects Shuffles and a LoadInst, use a
    // special TLI function to emit the target-specific interleaved
    // instruction.
    if (!TLI->lowerInterleavedVPLoad(VPLoad, Mask, DeinterleaveValues))
      return false;

  } else {
    auto *LI = cast<LoadInst>(LoadedVal);
    if (!LI->isSimple())
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
                      << " and factor = " << Factor << "\n");

    // Try and match this with target specific intrinsics.
    if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues))
      return false;
  }

  for (Value *V : DeinterleaveValues)
    DeadInsts.insert(cast<Instruction>(V));
  DeadInsts.insert(DI);
  // We now have a target-specific load, so delete the old one.
  DeadInsts.insert(cast<Instruction>(LoadedVal));
  return true;
}

bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
    IntrinsicInst *II, SmallSetVector<Instruction *, 32> &DeadInsts) {
  if (!II->hasOneUse())
    return false;
  Value *StoredBy = II->user_back();
  if (!isa<StoreInst, VPIntrinsic>(StoredBy))
    return false;

  SmallVector<Value *, 8> InterleaveValues(II->args());
  const unsigned Factor = getIntrinsicFactor(II);

  if (auto *VPStore = dyn_cast<VPIntrinsic>(StoredBy)) {
    if (VPStore->getIntrinsicID() != Intrinsic::vp_store)
      return false;

    Value *WideMask = VPStore->getOperand(2);
    Value *Mask = getMask(WideMask, Factor,
                          cast<VectorType>(InterleaveValues[0]->getType()));
    if (!Mask)
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found a vp.store with interleave intrinsic "
                      << *II << " and factor = " << Factor << "\n");

    // Since lowerInterleavedStore expects a Shuffle and a StoreInst, use a
    // special TLI function to emit the target-specific interleaved
    // instruction.
    if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues))
      return false;
  } else {
    auto *SI = cast<StoreInst>(StoredBy);
    if (!SI->isSimple())
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " << *II
                      << " and factor = " << Factor << "\n");

    // Try and match this with target specific intrinsics.
    if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues))
      return false;
  }

  // We now have a target-specific store, so delete the old one.
  DeadInsts.insert(cast<Instruction>(StoredBy));
  DeadInsts.insert(II);
  return true;
}

bool InterleavedAccessImpl::runOnFunction(Function &F) {
  // Holds dead instructions that will be erased later.
  SmallSetVector<Instruction *, 32> DeadInsts;
  bool Changed = false;

  using namespace PatternMatch;
  for (auto &I : instructions(F)) {
    if (match(&I, m_CombineOr(m_Load(m_Value()),
                              m_Intrinsic<Intrinsic::vp_load>())))
      Changed |= lowerInterleavedLoad(&I, DeadInsts);

    if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
                              m_Intrinsic<Intrinsic::vp_store>())))
      Changed |= lowerInterleavedStore(&I, DeadInsts);

    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
      if (isDeinterleaveIntrinsic(II->getIntrinsicID()))
        Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
      else if (isInterleaveIntrinsic(II->getIntrinsicID()))
        Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
    }
  }

  for (auto *I : DeadInsts)
    I->eraseFromParent();

  return Changed;
}