1 | //===- InterleavedAccessPass.cpp ------------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // This file implements the Interleaved Access pass, which identifies |
10 | // interleaved memory accesses and transforms them into target specific |
11 | // intrinsics. |
12 | // |
13 | // An interleaved load reads data from memory into several vectors, with |
14 | // DE-interleaving the data on a factor. An interleaved store writes several |
15 | // vectors to memory with RE-interleaving the data on a factor. |
16 | // |
17 | // As interleaved accesses are difficult to identified in CodeGen (mainly |
18 | // because the VECTOR_SHUFFLE DAG node is quite different from the shufflevector |
19 | // IR), we identify and transform them to intrinsics in this pass so the |
20 | // intrinsics can be easily matched into target specific instructions later in |
21 | // CodeGen. |
22 | // |
23 | // E.g. An interleaved load (Factor = 2): |
24 | // %wide.vec = load <8 x i32>, <8 x i32>* %ptr |
25 | // %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6> |
26 | // %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7> |
27 | // |
28 | // It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2 |
29 | // intrinsic in ARM backend. |
30 | // |
31 | // In X86, this can be further optimized into a set of target |
32 | // specific loads followed by an optimized sequence of shuffles. |
33 | // |
34 | // E.g. An interleaved store (Factor = 3): |
35 | // %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, |
36 | // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> |
37 | // store <12 x i32> %i.vec, <12 x i32>* %ptr |
38 | // |
39 | // It could be transformed into a st3 intrinsic in AArch64 backend or a vst3 |
40 | // intrinsic in ARM backend. |
41 | // |
42 | // Similarly, a set of interleaved stores can be transformed into an optimized |
43 | // sequence of shuffles followed by a set of target specific stores for X86. |
44 | // |
45 | //===----------------------------------------------------------------------===// |
46 | |
47 | #include "llvm/ADT/ArrayRef.h" |
48 | #include "llvm/ADT/DenseMap.h" |
49 | #include "llvm/ADT/SetVector.h" |
50 | #include "llvm/ADT/SmallVector.h" |
51 | #include "llvm/CodeGen/InterleavedAccess.h" |
52 | #include "llvm/CodeGen/TargetLowering.h" |
53 | #include "llvm/CodeGen/TargetPassConfig.h" |
54 | #include "llvm/CodeGen/TargetSubtargetInfo.h" |
55 | #include "llvm/IR/Constants.h" |
56 | #include "llvm/IR/Dominators.h" |
57 | #include "llvm/IR/Function.h" |
58 | #include "llvm/IR/IRBuilder.h" |
59 | #include "llvm/IR/InstIterator.h" |
60 | #include "llvm/IR/Instruction.h" |
61 | #include "llvm/IR/Instructions.h" |
62 | #include "llvm/IR/IntrinsicInst.h" |
63 | #include "llvm/IR/PatternMatch.h" |
64 | #include "llvm/InitializePasses.h" |
65 | #include "llvm/Pass.h" |
66 | #include "llvm/Support/Casting.h" |
67 | #include "llvm/Support/CommandLine.h" |
68 | #include "llvm/Support/Debug.h" |
69 | #include "llvm/Support/raw_ostream.h" |
70 | #include "llvm/Target/TargetMachine.h" |
71 | #include "llvm/Transforms/Utils/Local.h" |
72 | #include <cassert> |
73 | #include <utility> |
74 | |
75 | using namespace llvm; |
76 | |
77 | #define DEBUG_TYPE "interleaved-access" |
78 | |
79 | static cl::opt<bool> LowerInterleavedAccesses( |
80 | "lower-interleaved-accesses" , |
81 | cl::desc("Enable lowering interleaved accesses to intrinsics" ), |
82 | cl::init(Val: true), cl::Hidden); |
83 | |
84 | namespace { |
85 | |
86 | class InterleavedAccessImpl { |
87 | friend class InterleavedAccess; |
88 | |
89 | public: |
90 | InterleavedAccessImpl() = default; |
91 | InterleavedAccessImpl(DominatorTree *DT, const TargetLowering *TLI) |
92 | : DT(DT), TLI(TLI), MaxFactor(TLI->getMaxSupportedInterleaveFactor()) {} |
93 | bool runOnFunction(Function &F); |
94 | |
95 | private: |
96 | DominatorTree *DT = nullptr; |
97 | const TargetLowering *TLI = nullptr; |
98 | |
99 | /// The maximum supported interleave factor. |
100 | unsigned MaxFactor = 0u; |
101 | |
102 | /// Transform an interleaved load into target specific intrinsics. |
103 | bool lowerInterleavedLoad(Instruction *Load, |
104 | SmallSetVector<Instruction *, 32> &DeadInsts); |
105 | |
106 | /// Transform an interleaved store into target specific intrinsics. |
107 | bool lowerInterleavedStore(Instruction *Store, |
108 | SmallSetVector<Instruction *, 32> &DeadInsts); |
109 | |
110 | /// Transform a load and a deinterleave intrinsic into target specific |
111 | /// instructions. |
112 | bool lowerDeinterleaveIntrinsic(IntrinsicInst *II, |
113 | SmallSetVector<Instruction *, 32> &DeadInsts); |
114 | |
115 | /// Transform an interleave intrinsic and a store into target specific |
116 | /// instructions. |
117 | bool lowerInterleaveIntrinsic(IntrinsicInst *II, |
118 | SmallSetVector<Instruction *, 32> &DeadInsts); |
119 | |
120 | /// Returns true if the uses of an interleaved load by the |
121 | /// extractelement instructions in \p Extracts can be replaced by uses of the |
122 | /// shufflevector instructions in \p Shuffles instead. If so, the necessary |
123 | /// replacements are also performed. |
124 | bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> , |
125 | ArrayRef<ShuffleVectorInst *> Shuffles); |
126 | |
127 | /// Given a number of shuffles of the form shuffle(binop(x,y)), convert them |
128 | /// to binop(shuffle(x), shuffle(y)) to allow the formation of an |
129 | /// interleaving load. Any newly created shuffles that operate on \p LI will |
130 | /// be added to \p Shuffles. Returns true, if any changes to the IR have been |
131 | /// made. |
132 | bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles, |
133 | SmallVectorImpl<ShuffleVectorInst *> &Shuffles, |
134 | Instruction *LI); |
135 | }; |
136 | |
137 | class InterleavedAccess : public FunctionPass { |
138 | InterleavedAccessImpl Impl; |
139 | |
140 | public: |
141 | static char ID; |
142 | |
143 | InterleavedAccess() : FunctionPass(ID) { |
144 | initializeInterleavedAccessPass(*PassRegistry::getPassRegistry()); |
145 | } |
146 | |
147 | StringRef getPassName() const override { return "Interleaved Access Pass" ; } |
148 | |
149 | bool runOnFunction(Function &F) override; |
150 | |
151 | void getAnalysisUsage(AnalysisUsage &AU) const override { |
152 | AU.addRequired<DominatorTreeWrapperPass>(); |
153 | AU.setPreservesCFG(); |
154 | } |
155 | }; |
156 | |
157 | } // end anonymous namespace. |
158 | |
159 | PreservedAnalyses InterleavedAccessPass::run(Function &F, |
160 | FunctionAnalysisManager &FAM) { |
161 | auto *DT = &FAM.getResult<DominatorTreeAnalysis>(IR&: F); |
162 | auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering(); |
163 | InterleavedAccessImpl Impl(DT, TLI); |
164 | bool Changed = Impl.runOnFunction(F); |
165 | |
166 | if (!Changed) |
167 | return PreservedAnalyses::all(); |
168 | |
169 | PreservedAnalyses PA; |
170 | PA.preserveSet<CFGAnalyses>(); |
171 | return PA; |
172 | } |
173 | |
174 | char InterleavedAccess::ID = 0; |
175 | |
176 | bool InterleavedAccess::runOnFunction(Function &F) { |
177 | auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); |
178 | if (!TPC || !LowerInterleavedAccesses) |
179 | return false; |
180 | |
181 | LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n" ); |
182 | |
183 | Impl.DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); |
184 | auto &TM = TPC->getTM<TargetMachine>(); |
185 | Impl.TLI = TM.getSubtargetImpl(F)->getTargetLowering(); |
186 | Impl.MaxFactor = Impl.TLI->getMaxSupportedInterleaveFactor(); |
187 | |
188 | return Impl.runOnFunction(F); |
189 | } |
190 | |
191 | INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE, |
192 | "Lower interleaved memory accesses to target specific intrinsics" , false, |
193 | false) |
194 | INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) |
195 | INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE, |
196 | "Lower interleaved memory accesses to target specific intrinsics" , false, |
197 | false) |
198 | |
199 | FunctionPass *llvm::createInterleavedAccessPass() { |
200 | return new InterleavedAccess(); |
201 | } |
202 | |
203 | /// Check if the mask is a DE-interleave mask for an interleaved load. |
204 | /// |
205 | /// E.g. DE-interleave masks (Factor = 2) could be: |
206 | /// <0, 2, 4, 6> (mask of index 0 to extract even elements) |
207 | /// <1, 3, 5, 7> (mask of index 1 to extract odd elements) |
208 | static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor, |
209 | unsigned &Index, unsigned MaxFactor, |
210 | unsigned NumLoadElements) { |
211 | if (Mask.size() < 2) |
212 | return false; |
213 | |
214 | // Check potential Factors. |
215 | for (Factor = 2; Factor <= MaxFactor; Factor++) { |
216 | // Make sure we don't produce a load wider than the input load. |
217 | if (Mask.size() * Factor > NumLoadElements) |
218 | return false; |
219 | if (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor, Index)) |
220 | return true; |
221 | } |
222 | |
223 | return false; |
224 | } |
225 | |
226 | /// Check if the mask can be used in an interleaved store. |
227 | // |
228 | /// It checks for a more general pattern than the RE-interleave mask. |
229 | /// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...> |
230 | /// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35> |
231 | /// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> |
232 | /// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5> |
233 | /// |
234 | /// The particular case of an RE-interleave mask is: |
235 | /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...> |
236 | /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7> |
237 | static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, |
238 | unsigned MaxFactor) { |
239 | unsigned NumElts = SVI->getShuffleMask().size(); |
240 | if (NumElts < 4) |
241 | return false; |
242 | |
243 | // Check potential Factors. |
244 | for (Factor = 2; Factor <= MaxFactor; Factor++) { |
245 | if (SVI->isInterleave(Factor)) |
246 | return true; |
247 | } |
248 | |
249 | return false; |
250 | } |
251 | |
252 | // Return the corresponded deinterleaved mask, or nullptr if there is no valid |
253 | // mask. |
254 | static Value *getMask(Value *WideMask, unsigned Factor, |
255 | ElementCount LeafValueEC); |
256 | |
257 | static Value *getMask(Value *WideMask, unsigned Factor, |
258 | VectorType *LeafValueTy) { |
259 | return getMask(WideMask, Factor, LeafValueEC: LeafValueTy->getElementCount()); |
260 | } |
261 | |
262 | bool InterleavedAccessImpl::lowerInterleavedLoad( |
263 | Instruction *Load, SmallSetVector<Instruction *, 32> &DeadInsts) { |
264 | if (isa<ScalableVectorType>(Val: Load->getType())) |
265 | return false; |
266 | |
267 | if (auto *LI = dyn_cast<LoadInst>(Val: Load)) { |
268 | if (!LI->isSimple()) |
269 | return false; |
270 | } else if (auto *VPLoad = dyn_cast<VPIntrinsic>(Val: Load)) { |
271 | assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load); |
272 | // Require a constant mask. |
273 | if (!isa<ConstantVector>(Val: VPLoad->getMaskParam())) |
274 | return false; |
275 | } else { |
276 | llvm_unreachable("unsupported load operation" ); |
277 | } |
278 | |
279 | // Check if all users of this load are shufflevectors. If we encounter any |
280 | // users that are extractelement instructions or binary operators, we save |
281 | // them to later check if they can be modified to extract from one of the |
282 | // shufflevectors instead of the load. |
283 | |
284 | SmallVector<ShuffleVectorInst *, 4> Shuffles; |
285 | SmallVector<ExtractElementInst *, 4> ; |
286 | // BinOpShuffles need to be handled a single time in case both operands of the |
287 | // binop are the same load. |
288 | SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles; |
289 | |
290 | for (auto *User : Load->users()) { |
291 | auto * = dyn_cast<ExtractElementInst>(Val: User); |
292 | if (Extract && isa<ConstantInt>(Val: Extract->getIndexOperand())) { |
293 | Extracts.push_back(Elt: Extract); |
294 | continue; |
295 | } |
296 | if (auto *BI = dyn_cast<BinaryOperator>(Val: User)) { |
297 | if (!BI->user_empty() && all_of(Range: BI->users(), P: [](auto *U) { |
298 | auto *SVI = dyn_cast<ShuffleVectorInst>(U); |
299 | return SVI && isa<UndefValue>(SVI->getOperand(1)); |
300 | })) { |
301 | for (auto *SVI : BI->users()) |
302 | BinOpShuffles.insert(X: cast<ShuffleVectorInst>(Val: SVI)); |
303 | continue; |
304 | } |
305 | } |
306 | auto *SVI = dyn_cast<ShuffleVectorInst>(Val: User); |
307 | if (!SVI || !isa<UndefValue>(Val: SVI->getOperand(i_nocapture: 1))) |
308 | return false; |
309 | |
310 | Shuffles.push_back(Elt: SVI); |
311 | } |
312 | |
313 | if (Shuffles.empty() && BinOpShuffles.empty()) |
314 | return false; |
315 | |
316 | unsigned Factor, Index; |
317 | |
318 | unsigned NumLoadElements = |
319 | cast<FixedVectorType>(Val: Load->getType())->getNumElements(); |
320 | auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0]; |
321 | // Check if the first shufflevector is DE-interleave shuffle. |
322 | if (!isDeInterleaveMask(Mask: FirstSVI->getShuffleMask(), Factor, Index, MaxFactor, |
323 | NumLoadElements)) |
324 | return false; |
325 | |
326 | // Holds the corresponding index for each DE-interleave shuffle. |
327 | SmallVector<unsigned, 4> Indices; |
328 | |
329 | Type *VecTy = FirstSVI->getType(); |
330 | |
331 | // Check if other shufflevectors are also DE-interleaved of the same type |
332 | // and factor as the first shufflevector. |
333 | for (auto *Shuffle : Shuffles) { |
334 | if (Shuffle->getType() != VecTy) |
335 | return false; |
336 | if (!ShuffleVectorInst::isDeInterleaveMaskOfFactor( |
337 | Mask: Shuffle->getShuffleMask(), Factor, Index)) |
338 | return false; |
339 | |
340 | assert(Shuffle->getShuffleMask().size() <= NumLoadElements); |
341 | Indices.push_back(Elt: Index); |
342 | } |
343 | for (auto *Shuffle : BinOpShuffles) { |
344 | if (Shuffle->getType() != VecTy) |
345 | return false; |
346 | if (!ShuffleVectorInst::isDeInterleaveMaskOfFactor( |
347 | Mask: Shuffle->getShuffleMask(), Factor, Index)) |
348 | return false; |
349 | |
350 | assert(Shuffle->getShuffleMask().size() <= NumLoadElements); |
351 | |
352 | if (cast<Instruction>(Val: Shuffle->getOperand(i_nocapture: 0))->getOperand(i: 0) == Load) |
353 | Indices.push_back(Elt: Index); |
354 | if (cast<Instruction>(Val: Shuffle->getOperand(i_nocapture: 0))->getOperand(i: 1) == Load) |
355 | Indices.push_back(Elt: Index); |
356 | } |
357 | |
358 | // Try and modify users of the load that are extractelement instructions to |
359 | // use the shufflevector instructions instead of the load. |
360 | if (!tryReplaceExtracts(Extracts, Shuffles)) |
361 | return false; |
362 | |
363 | bool BinOpShuffleChanged = |
364 | replaceBinOpShuffles(BinOpShuffles: BinOpShuffles.getArrayRef(), Shuffles, LI: Load); |
365 | |
366 | if (auto *VPLoad = dyn_cast<VPIntrinsic>(Val: Load)) { |
367 | Value *LaneMask = |
368 | getMask(WideMask: VPLoad->getMaskParam(), Factor, LeafValueTy: cast<VectorType>(Val: VecTy)); |
369 | if (!LaneMask) |
370 | return false; |
371 | |
372 | LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n" ); |
373 | |
374 | // Sometimes the number of Shuffles might be less than Factor, we have to |
375 | // fill the gaps with null. Also, lowerInterleavedVPLoad |
376 | // expects them to be sorted. |
377 | SmallVector<Value *, 4> ShuffleValues(Factor, nullptr); |
378 | for (auto [Idx, ShuffleMaskIdx] : enumerate(First&: Indices)) |
379 | ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx]; |
380 | if (!TLI->lowerInterleavedVPLoad(Load: VPLoad, Mask: LaneMask, DeinterleaveRes: ShuffleValues)) |
381 | // If Extracts is not empty, tryReplaceExtracts made changes earlier. |
382 | return !Extracts.empty() || BinOpShuffleChanged; |
383 | } else { |
384 | LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n" ); |
385 | |
386 | // Try to create target specific intrinsics to replace the load and |
387 | // shuffles. |
388 | if (!TLI->lowerInterleavedLoad(LI: cast<LoadInst>(Val: Load), Shuffles, Indices, |
389 | Factor)) |
390 | // If Extracts is not empty, tryReplaceExtracts made changes earlier. |
391 | return !Extracts.empty() || BinOpShuffleChanged; |
392 | } |
393 | |
394 | DeadInsts.insert_range(R&: Shuffles); |
395 | |
396 | DeadInsts.insert(X: Load); |
397 | return true; |
398 | } |
399 | |
400 | bool InterleavedAccessImpl::replaceBinOpShuffles( |
401 | ArrayRef<ShuffleVectorInst *> BinOpShuffles, |
402 | SmallVectorImpl<ShuffleVectorInst *> &Shuffles, Instruction *Load) { |
403 | for (auto *SVI : BinOpShuffles) { |
404 | BinaryOperator *BI = cast<BinaryOperator>(Val: SVI->getOperand(i_nocapture: 0)); |
405 | Type *BIOp0Ty = BI->getOperand(i_nocapture: 0)->getType(); |
406 | ArrayRef<int> Mask = SVI->getShuffleMask(); |
407 | assert(all_of(Mask, [&](int Idx) { |
408 | return Idx < (int)cast<FixedVectorType>(BIOp0Ty)->getNumElements(); |
409 | })); |
410 | |
411 | BasicBlock::iterator insertPos = SVI->getIterator(); |
412 | auto *NewSVI1 = |
413 | new ShuffleVectorInst(BI->getOperand(i_nocapture: 0), PoisonValue::get(T: BIOp0Ty), |
414 | Mask, SVI->getName(), insertPos); |
415 | auto *NewSVI2 = new ShuffleVectorInst( |
416 | BI->getOperand(i_nocapture: 1), PoisonValue::get(T: BI->getOperand(i_nocapture: 1)->getType()), Mask, |
417 | SVI->getName(), insertPos); |
418 | BinaryOperator *NewBI = BinaryOperator::CreateWithCopiedFlags( |
419 | Opc: BI->getOpcode(), V1: NewSVI1, V2: NewSVI2, CopyO: BI, Name: BI->getName(), InsertBefore: insertPos); |
420 | SVI->replaceAllUsesWith(V: NewBI); |
421 | LLVM_DEBUG(dbgs() << " Replaced: " << *BI << "\n And : " << *SVI |
422 | << "\n With : " << *NewSVI1 << "\n And : " |
423 | << *NewSVI2 << "\n And : " << *NewBI << "\n" ); |
424 | RecursivelyDeleteTriviallyDeadInstructions(V: SVI); |
425 | if (NewSVI1->getOperand(i_nocapture: 0) == Load) |
426 | Shuffles.push_back(Elt: NewSVI1); |
427 | if (NewSVI2->getOperand(i_nocapture: 0) == Load) |
428 | Shuffles.push_back(Elt: NewSVI2); |
429 | } |
430 | |
431 | return !BinOpShuffles.empty(); |
432 | } |
433 | |
434 | bool InterleavedAccessImpl::( |
435 | ArrayRef<ExtractElementInst *> , |
436 | ArrayRef<ShuffleVectorInst *> Shuffles) { |
437 | // If there aren't any extractelement instructions to modify, there's nothing |
438 | // to do. |
439 | if (Extracts.empty()) |
440 | return true; |
441 | |
442 | // Maps extractelement instructions to vector-index pairs. The extractlement |
443 | // instructions will be modified to use the new vector and index operands. |
444 | DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap; |
445 | |
446 | for (auto * : Extracts) { |
447 | // The vector index that is extracted. |
448 | auto *IndexOperand = cast<ConstantInt>(Val: Extract->getIndexOperand()); |
449 | auto Index = IndexOperand->getSExtValue(); |
450 | |
451 | // Look for a suitable shufflevector instruction. The goal is to modify the |
452 | // extractelement instruction (which uses an interleaved load) to use one |
453 | // of the shufflevector instructions instead of the load. |
454 | for (auto *Shuffle : Shuffles) { |
455 | // If the shufflevector instruction doesn't dominate the extract, we |
456 | // can't create a use of it. |
457 | if (!DT->dominates(Def: Shuffle, User: Extract)) |
458 | continue; |
459 | |
460 | // Inspect the indices of the shufflevector instruction. If the shuffle |
461 | // selects the same index that is extracted, we can modify the |
462 | // extractelement instruction. |
463 | SmallVector<int, 4> Indices; |
464 | Shuffle->getShuffleMask(Result&: Indices); |
465 | for (unsigned I = 0; I < Indices.size(); ++I) |
466 | if (Indices[I] == Index) { |
467 | assert(Extract->getOperand(0) == Shuffle->getOperand(0) && |
468 | "Vector operations do not match" ); |
469 | ReplacementMap[Extract] = std::make_pair(x&: Shuffle, y&: I); |
470 | break; |
471 | } |
472 | |
473 | // If we found a suitable shufflevector instruction, stop looking. |
474 | if (ReplacementMap.count(Val: Extract)) |
475 | break; |
476 | } |
477 | |
478 | // If we did not find a suitable shufflevector instruction, the |
479 | // extractelement instruction cannot be modified, so we must give up. |
480 | if (!ReplacementMap.count(Val: Extract)) |
481 | return false; |
482 | } |
483 | |
484 | // Finally, perform the replacements. |
485 | IRBuilder<> Builder(Extracts[0]->getContext()); |
486 | for (auto &Replacement : ReplacementMap) { |
487 | auto * = Replacement.first; |
488 | auto *Vector = Replacement.second.first; |
489 | auto Index = Replacement.second.second; |
490 | Builder.SetInsertPoint(Extract); |
491 | Extract->replaceAllUsesWith(V: Builder.CreateExtractElement(Vec: Vector, Idx: Index)); |
492 | Extract->eraseFromParent(); |
493 | } |
494 | |
495 | return true; |
496 | } |
497 | |
498 | bool InterleavedAccessImpl::lowerInterleavedStore( |
499 | Instruction *Store, SmallSetVector<Instruction *, 32> &DeadInsts) { |
500 | Value *StoredValue; |
501 | if (auto *SI = dyn_cast<StoreInst>(Val: Store)) { |
502 | if (!SI->isSimple()) |
503 | return false; |
504 | StoredValue = SI->getValueOperand(); |
505 | } else if (auto *VPStore = dyn_cast<VPIntrinsic>(Val: Store)) { |
506 | assert(VPStore->getIntrinsicID() == Intrinsic::vp_store); |
507 | // Require a constant mask. |
508 | if (!isa<ConstantVector>(Val: VPStore->getMaskParam())) |
509 | return false; |
510 | StoredValue = VPStore->getArgOperand(i: 0); |
511 | } else { |
512 | llvm_unreachable("unsupported store operation" ); |
513 | } |
514 | |
515 | auto *SVI = dyn_cast<ShuffleVectorInst>(Val: StoredValue); |
516 | if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(Val: SVI->getType())) |
517 | return false; |
518 | |
519 | unsigned NumStoredElements = |
520 | cast<FixedVectorType>(Val: SVI->getType())->getNumElements(); |
521 | // Check if the shufflevector is RE-interleave shuffle. |
522 | unsigned Factor; |
523 | if (!isReInterleaveMask(SVI, Factor, MaxFactor)) |
524 | return false; |
525 | assert(NumStoredElements % Factor == 0 && |
526 | "number of stored element should be a multiple of Factor" ); |
527 | |
528 | if (auto *VPStore = dyn_cast<VPIntrinsic>(Val: Store)) { |
529 | unsigned LaneMaskLen = NumStoredElements / Factor; |
530 | Value *LaneMask = getMask(WideMask: VPStore->getMaskParam(), Factor, |
531 | LeafValueEC: ElementCount::getFixed(MinVal: LaneMaskLen)); |
532 | if (!LaneMask) |
533 | return false; |
534 | |
535 | LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store: " << *Store |
536 | << "\n" ); |
537 | |
538 | IRBuilder<> Builder(VPStore); |
539 | // We need to effectively de-interleave the shufflemask |
540 | // because lowerInterleavedVPStore expects individual de-interleaved |
541 | // values. |
542 | SmallVector<Value *, 10> NewShuffles; |
543 | SmallVector<int, 16> NewShuffleMask(LaneMaskLen); |
544 | auto ShuffleMask = SVI->getShuffleMask(); |
545 | |
546 | for (unsigned i = 0; i < Factor; i++) { |
547 | for (unsigned j = 0; j < LaneMaskLen; j++) |
548 | NewShuffleMask[j] = ShuffleMask[i + Factor * j]; |
549 | |
550 | NewShuffles.push_back(Elt: Builder.CreateShuffleVector( |
551 | V1: SVI->getOperand(i_nocapture: 0), V2: SVI->getOperand(i_nocapture: 1), Mask: NewShuffleMask)); |
552 | } |
553 | |
554 | // Try to create target specific intrinsics to replace the vp.store and |
555 | // shuffle. |
556 | if (!TLI->lowerInterleavedVPStore(Store: VPStore, Mask: LaneMask, InterleaveOps: NewShuffles)) |
557 | // We already created new shuffles. |
558 | return true; |
559 | } else { |
560 | LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n" ); |
561 | |
562 | // Try to create target specific intrinsics to replace the store and |
563 | // shuffle. |
564 | if (!TLI->lowerInterleavedStore(SI: cast<StoreInst>(Val: Store), SVI, Factor)) |
565 | return false; |
566 | } |
567 | |
568 | // Already have a new target specific interleaved store. Erase the old store. |
569 | DeadInsts.insert(X: Store); |
570 | DeadInsts.insert(X: SVI); |
571 | return true; |
572 | } |
573 | |
574 | static bool isInterleaveIntrinsic(Intrinsic::ID IID) { |
575 | switch (IID) { |
576 | case Intrinsic::vector_interleave2: |
577 | case Intrinsic::vector_interleave3: |
578 | case Intrinsic::vector_interleave4: |
579 | case Intrinsic::vector_interleave5: |
580 | case Intrinsic::vector_interleave6: |
581 | case Intrinsic::vector_interleave7: |
582 | case Intrinsic::vector_interleave8: |
583 | return true; |
584 | default: |
585 | return false; |
586 | } |
587 | } |
588 | |
589 | static bool isDeinterleaveIntrinsic(Intrinsic::ID IID) { |
590 | switch (IID) { |
591 | case Intrinsic::vector_deinterleave2: |
592 | case Intrinsic::vector_deinterleave3: |
593 | case Intrinsic::vector_deinterleave4: |
594 | case Intrinsic::vector_deinterleave5: |
595 | case Intrinsic::vector_deinterleave6: |
596 | case Intrinsic::vector_deinterleave7: |
597 | case Intrinsic::vector_deinterleave8: |
598 | return true; |
599 | default: |
600 | return false; |
601 | } |
602 | } |
603 | |
604 | static unsigned getIntrinsicFactor(const IntrinsicInst *II) { |
605 | switch (II->getIntrinsicID()) { |
606 | case Intrinsic::vector_deinterleave2: |
607 | case Intrinsic::vector_interleave2: |
608 | return 2; |
609 | case Intrinsic::vector_deinterleave3: |
610 | case Intrinsic::vector_interleave3: |
611 | return 3; |
612 | case Intrinsic::vector_deinterleave4: |
613 | case Intrinsic::vector_interleave4: |
614 | return 4; |
615 | case Intrinsic::vector_deinterleave5: |
616 | case Intrinsic::vector_interleave5: |
617 | return 5; |
618 | case Intrinsic::vector_deinterleave6: |
619 | case Intrinsic::vector_interleave6: |
620 | return 6; |
621 | case Intrinsic::vector_deinterleave7: |
622 | case Intrinsic::vector_interleave7: |
623 | return 7; |
624 | case Intrinsic::vector_deinterleave8: |
625 | case Intrinsic::vector_interleave8: |
626 | return 8; |
627 | default: |
628 | llvm_unreachable("Unexpected intrinsic" ); |
629 | } |
630 | } |
631 | |
632 | static Value *getMask(Value *WideMask, unsigned Factor, |
633 | ElementCount LeafValueEC) { |
634 | if (auto *IMI = dyn_cast<IntrinsicInst>(Val: WideMask)) { |
635 | if (isInterleaveIntrinsic(IID: IMI->getIntrinsicID()) && |
636 | getIntrinsicFactor(II: IMI) == Factor && llvm::all_equal(Range: IMI->args())) { |
637 | return IMI->getArgOperand(i: 0); |
638 | } |
639 | } |
640 | |
641 | if (auto *ConstMask = dyn_cast<Constant>(Val: WideMask)) { |
642 | if (auto *Splat = ConstMask->getSplatValue()) |
643 | // All-ones or all-zeros mask. |
644 | return ConstantVector::getSplat(EC: LeafValueEC, Elt: Splat); |
645 | |
646 | if (LeafValueEC.isFixed()) { |
647 | unsigned LeafMaskLen = LeafValueEC.getFixedValue(); |
648 | SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr); |
649 | // If this is a fixed-length constant mask, each lane / leaf has to |
650 | // use the same mask. This is done by checking if every group with Factor |
651 | // number of elements in the interleaved mask has homogeneous values. |
652 | for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) { |
653 | Constant *C = ConstMask->getAggregateElement(Elt: Idx); |
654 | if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C) |
655 | return nullptr; |
656 | LeafMask[Idx / Factor] = C; |
657 | } |
658 | |
659 | return ConstantVector::get(V: LeafMask); |
660 | } |
661 | } |
662 | |
663 | return nullptr; |
664 | } |
665 | |
666 | bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( |
667 | IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) { |
668 | Value *LoadedVal = DI->getOperand(i_nocapture: 0); |
669 | if (!LoadedVal->hasOneUse() || !isa<LoadInst, VPIntrinsic>(Val: LoadedVal)) |
670 | return false; |
671 | |
672 | const unsigned Factor = getIntrinsicFactor(II: DI); |
673 | if (!DI->hasNUses(N: Factor)) |
674 | return false; |
675 | SmallVector<Value *, 8> DeinterleaveValues(Factor); |
676 | for (auto *User : DI->users()) { |
677 | auto * = dyn_cast<ExtractValueInst>(Val: User); |
678 | if (!Extract || Extract->getNumIndices() != 1) |
679 | return false; |
680 | unsigned Idx = Extract->getIndices()[0]; |
681 | if (DeinterleaveValues[Idx]) |
682 | return false; |
683 | DeinterleaveValues[Idx] = Extract; |
684 | } |
685 | |
686 | if (auto *VPLoad = dyn_cast<VPIntrinsic>(Val: LoadedVal)) { |
687 | if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) |
688 | return false; |
689 | // Check mask operand. Handle both all-true/false and interleaved mask. |
690 | Value *WideMask = VPLoad->getOperand(i_nocapture: 1); |
691 | Value *Mask = getMask(WideMask, Factor, |
692 | LeafValueTy: cast<VectorType>(Val: DeinterleaveValues[0]->getType())); |
693 | if (!Mask) |
694 | return false; |
695 | |
696 | LLVM_DEBUG(dbgs() << "IA: Found a vp.load with deinterleave intrinsic " |
697 | << *DI << " and factor = " << Factor << "\n" ); |
698 | |
699 | // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special |
700 | // TLI function to emit target-specific interleaved instruction. |
701 | if (!TLI->lowerInterleavedVPLoad(Load: VPLoad, Mask, DeinterleaveRes: DeinterleaveValues)) |
702 | return false; |
703 | |
704 | } else { |
705 | auto *LI = cast<LoadInst>(Val: LoadedVal); |
706 | if (!LI->isSimple()) |
707 | return false; |
708 | |
709 | LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI |
710 | << " and factor = " << Factor << "\n" ); |
711 | |
712 | // Try and match this with target specific intrinsics. |
713 | if (!TLI->lowerDeinterleaveIntrinsicToLoad(LI, DeinterleaveValues)) |
714 | return false; |
715 | } |
716 | |
717 | for (Value *V : DeinterleaveValues) |
718 | DeadInsts.insert(X: cast<Instruction>(Val: V)); |
719 | DeadInsts.insert(X: DI); |
720 | // We now have a target-specific load, so delete the old one. |
721 | DeadInsts.insert(X: cast<Instruction>(Val: LoadedVal)); |
722 | return true; |
723 | } |
724 | |
725 | bool InterleavedAccessImpl::lowerInterleaveIntrinsic( |
726 | IntrinsicInst *II, SmallSetVector<Instruction *, 32> &DeadInsts) { |
727 | if (!II->hasOneUse()) |
728 | return false; |
729 | Value *StoredBy = II->user_back(); |
730 | if (!isa<StoreInst, VPIntrinsic>(Val: StoredBy)) |
731 | return false; |
732 | |
733 | SmallVector<Value *, 8> InterleaveValues(II->args()); |
734 | const unsigned Factor = getIntrinsicFactor(II); |
735 | |
736 | if (auto *VPStore = dyn_cast<VPIntrinsic>(Val: StoredBy)) { |
737 | if (VPStore->getIntrinsicID() != Intrinsic::vp_store) |
738 | return false; |
739 | |
740 | Value *WideMask = VPStore->getOperand(i_nocapture: 2); |
741 | Value *Mask = getMask(WideMask, Factor, |
742 | LeafValueTy: cast<VectorType>(Val: InterleaveValues[0]->getType())); |
743 | if (!Mask) |
744 | return false; |
745 | |
746 | LLVM_DEBUG(dbgs() << "IA: Found a vp.store with interleave intrinsic " |
747 | << *II << " and factor = " << Factor << "\n" ); |
748 | |
749 | // Since lowerInterleavedStore expects Shuffle and StoreInst, use special |
750 | // TLI function to emit target-specific interleaved instruction. |
751 | if (!TLI->lowerInterleavedVPStore(Store: VPStore, Mask, InterleaveOps: InterleaveValues)) |
752 | return false; |
753 | } else { |
754 | auto *SI = cast<StoreInst>(Val: StoredBy); |
755 | if (!SI->isSimple()) |
756 | return false; |
757 | |
758 | LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " << *II |
759 | << " and factor = " << Factor << "\n" ); |
760 | |
761 | // Try and match this with target specific intrinsics. |
762 | if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues)) |
763 | return false; |
764 | } |
765 | |
766 | // We now have a target-specific store, so delete the old one. |
767 | DeadInsts.insert(X: cast<Instruction>(Val: StoredBy)); |
768 | DeadInsts.insert(X: II); |
769 | return true; |
770 | } |
771 | |
772 | bool InterleavedAccessImpl::runOnFunction(Function &F) { |
773 | // Holds dead instructions that will be erased later. |
774 | SmallSetVector<Instruction *, 32> DeadInsts; |
775 | bool Changed = false; |
776 | |
777 | using namespace PatternMatch; |
778 | for (auto &I : instructions(F)) { |
779 | if (match(V: &I, P: m_CombineOr(L: m_Load(Op: m_Value()), |
780 | R: m_Intrinsic<Intrinsic::vp_load>()))) |
781 | Changed |= lowerInterleavedLoad(Load: &I, DeadInsts); |
782 | |
783 | if (match(V: &I, P: m_CombineOr(L: m_Store(ValueOp: m_Value(), PointerOp: m_Value()), |
784 | R: m_Intrinsic<Intrinsic::vp_store>()))) |
785 | Changed |= lowerInterleavedStore(Store: &I, DeadInsts); |
786 | |
787 | if (auto *II = dyn_cast<IntrinsicInst>(Val: &I)) { |
788 | if (isDeinterleaveIntrinsic(IID: II->getIntrinsicID())) |
789 | Changed |= lowerDeinterleaveIntrinsic(DI: II, DeadInsts); |
790 | else if (isInterleaveIntrinsic(IID: II->getIntrinsicID())) |
791 | Changed |= lowerInterleaveIntrinsic(II, DeadInsts); |
792 | } |
793 | } |
794 | |
795 | for (auto *I : DeadInsts) |
796 | I->eraseFromParent(); |
797 | |
798 | return Changed; |
799 | } |
800 | |