//===- InterleavedAccessPass.cpp ------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Interleaved Access pass, which identifies
// interleaved memory accesses and transforms them into target specific
// intrinsics.
//
// An interleaved load reads data from memory into several vectors,
// DE-interleaving the data by a factor. An interleaved store RE-interleaves
// several vectors by a factor and writes them to memory.
//
// As interleaved accesses are difficult to identify in CodeGen (mainly
// because the VECTOR_SHUFFLE DAG node is quite different from the
// shufflevector IR instruction), we identify and transform them into
// intrinsics in this pass so the intrinsics can be easily matched into target
// specific instructions later in CodeGen.
//
// E.g. An interleaved load (Factor = 2):
//        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
//        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <0, 2, 4, 6>
//        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> poison, <1, 3, 5, 7>
//
// It could be transformed into a ld2 intrinsic in the AArch64 backend or a
// vld2 intrinsic in the ARM backend.
//
// On X86, this can be further optimized into a set of target
// specific loads followed by an optimized sequence of shuffles.
//
// E.g. An interleaved store (Factor = 3):
//        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
//                         <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
//        store <12 x i32> %i.vec, <12 x i32>* %ptr
//
// It could be transformed into a st3 intrinsic in the AArch64 backend or a
// vst3 intrinsic in the ARM backend.
//
// Similarly, a set of interleaved stores can be transformed into an optimized
// sequence of shuffles followed by a set of target specific stores for X86.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/InterleavedAccess.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "interleaved-access"

static cl::opt<bool> LowerInterleavedAccesses(
    "lower-interleaved-accesses",
    cl::desc("Enable lowering interleaved accesses to intrinsics"),
    cl::init(true), cl::Hidden);

namespace {

class InterleavedAccessImpl {
  friend class InterleavedAccess;

public:
  InterleavedAccessImpl() = default;
  InterleavedAccessImpl(DominatorTree *DT, const TargetLowering *TLI)
      : DT(DT), TLI(TLI), MaxFactor(TLI->getMaxSupportedInterleaveFactor()) {}
  bool runOnFunction(Function &F);

private:
  DominatorTree *DT = nullptr;
  const TargetLowering *TLI = nullptr;

  /// The maximum supported interleave factor.
  unsigned MaxFactor = 0u;

  /// Transform an interleaved load into target specific intrinsics.
  bool lowerInterleavedLoad(Instruction *Load,
                            SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Transform an interleaved store into target specific intrinsics.
  bool lowerInterleavedStore(Instruction *Store,
                             SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Transform a load and a deinterleave intrinsic into target specific
  /// instructions.
  bool lowerDeinterleaveIntrinsic(IntrinsicInst *II,
                                  SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Transform an interleave intrinsic and a store into target specific
  /// instructions.
  bool lowerInterleaveIntrinsic(IntrinsicInst *II,
                                SmallSetVector<Instruction *, 32> &DeadInsts);

  /// Returns true if the uses of an interleaved load by the
  /// extractelement instructions in \p Extracts can be replaced by uses of the
  /// shufflevector instructions in \p Shuffles instead. If so, the necessary
  /// replacements are also performed.
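  ///
  /// For example (illustrative), with a factor-2 load %wide.vec whose
  /// even-element de-interleave shuffle is %v0 (mask <0, 2, 4, 6>), an
  /// "extractelement <8 x i32> %wide.vec, i32 2" can be rewritten as
  /// "extractelement <4 x i32> %v0, i32 1".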
  bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
                          ArrayRef<ShuffleVectorInst *> Shuffles);

  /// Given a number of shuffles of the form shuffle(binop(x,y)), convert them
  /// to binop(shuffle(x), shuffle(y)) to allow the formation of an
  /// interleaving load. Any newly created shuffles that operate on \p LI will
  /// be added to \p Shuffles. Returns true if any changes to the IR have been
  /// made.
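  ///
  /// For example (illustrative, Factor = 2):
  ///   %bin  = add <8 x i32> %wide, %other
  ///   %even = shufflevector <8 x i32> %bin, <8 x i32> poison, <0, 2, 4, 6>
  /// becomes
  ///   %s0   = shufflevector <8 x i32> %wide,  <8 x i32> poison, <0, 2, 4, 6>
  ///   %s1   = shufflevector <8 x i32> %other, <8 x i32> poison, <0, 2, 4, 6>
  ///   %even = add <4 x i32> %s0, %s1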
  bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
                            SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
                            Instruction *LI);
};

class InterleavedAccess : public FunctionPass {
  InterleavedAccessImpl Impl;

public:
  static char ID;

  InterleavedAccess() : FunctionPass(ID) {}

  StringRef getPassName() const override { return "Interleaved Access Pass"; }

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace.

PreservedAnalyses InterleavedAccessPass::run(Function &F,
                                             FunctionAnalysisManager &FAM) {
  auto *DT = &FAM.getResult<DominatorTreeAnalysis>(F);
  auto *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
  InterleavedAccessImpl Impl(DT, TLI);
  bool Changed = Impl.runOnFunction(F);

  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

char InterleavedAccess::ID = 0;

bool InterleavedAccess::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC || !LowerInterleavedAccesses)
    return false;

  LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");

  Impl.DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  auto &TM = TPC->getTM<TargetMachine>();
  Impl.TLI = TM.getSubtargetImpl(F)->getTargetLowering();
  Impl.MaxFactor = Impl.TLI->getMaxSupportedInterleaveFactor();

  return Impl.runOnFunction(F);
}

INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE,
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE,
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)

FunctionPass *llvm::createInterleavedAccessPass() {
  return new InterleavedAccess();
}

/// Check if the mask is a DE-interleave mask for an interleaved load.
///
/// E.g. DE-interleave masks (Factor = 2) could be:
///     <0, 2, 4, 6>    (mask of index 0 to extract even elements)
///     <1, 3, 5, 7>    (mask of index 1 to extract odd elements)
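/// For a larger factor (illustrative), a Factor = 4 DE-interleave mask of
/// index 1 would be <1, 5, 9, 13>.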
static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
                               unsigned &Index, unsigned MaxFactor,
                               unsigned NumLoadElements) {
  if (Mask.size() < 2)
    return false;

  // Check potential Factors.
  for (Factor = 2; Factor <= MaxFactor; Factor++) {
    // Make sure we don't produce a load wider than the input load.
    if (Mask.size() * Factor > NumLoadElements)
      return false;
    if (ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask, Factor, Index))
      return true;
  }

  return false;
}

/// Check if the mask can be used in an interleaved store.
///
/// It checks for a more general pattern than the RE-interleave mask.
/// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
/// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
/// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
///
/// The particular case of an RE-interleave mask is:
/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
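/// E.g. For a Factor of 3 (LaneLen=4), the RE-interleave mask (illustrative)
/// would be <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>.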
static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
                               unsigned MaxFactor) {
  unsigned NumElts = SVI->getShuffleMask().size();
  if (NumElts < 4)
    return false;

  // Check potential Factors.
  for (Factor = 2; Factor <= MaxFactor; Factor++) {
    if (SVI->isInterleave(Factor))
      return true;
  }

  return false;
}

static Value *getMaskOperand(IntrinsicInst *II) {
  switch (II->getIntrinsicID()) {
  default:
    llvm_unreachable("Unexpected intrinsic");
  case Intrinsic::vp_load:
  case Intrinsic::masked_load:
    return II->getOperand(1);
  case Intrinsic::vp_store:
  case Intrinsic::masked_store:
    return II->getOperand(2);
  }
}
// Return a pair of
// (1) The corresponding de-interleaved mask, or nullptr if there is no valid
//     mask.
// (2) A gap mask: some wide masks effectively skip certain fields (i.e.
//     "gaps"), and in this APInt the inactive bits represent the fields that
//     are skipped.
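// For example (illustrative, Factor = 2): a wide mask built as
//   %wide.mask = call <8 x i1> @llvm.vector.interleave2.v8i1(<4 x i1> %m,
//                                                            <4 x i1> %m)
// de-interleaves back to the leaf mask %m with no gaps.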
static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
                                         ElementCount LeafValueEC);

static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
                                         VectorType *LeafValueTy) {
  return getMask(WideMask, Factor, LeafValueTy->getElementCount());
}

bool InterleavedAccessImpl::lowerInterleavedLoad(
    Instruction *Load, SmallSetVector<Instruction *, 32> &DeadInsts) {
  if (isa<ScalableVectorType>(Load->getType()))
    return false;

  auto *LI = dyn_cast<LoadInst>(Load);
  auto *II = dyn_cast<IntrinsicInst>(Load);
  if (!LI && !II)
    return false;

  if (LI && !LI->isSimple())
    return false;

  // Check if all users of this load are shufflevectors. If we encounter any
  // users that are extractelement instructions or binary operators, we save
  // them to later check if they can be modified to extract from one of the
  // shufflevectors instead of the load.
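  // (Illustrative: a user such as "%x = extractelement <8 x i32> %wide.vec,
  // i32 5", or a binary operator "%b = add <8 x i32> %wide.vec, %other" whose
  // only users are shuffles, can still be handled.)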

  SmallVector<ShuffleVectorInst *, 4> Shuffles;
  SmallVector<ExtractElementInst *, 4> Extracts;
  // BinOpShuffles need to be handled a single time in case both operands of
  // the binop are the same load.
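  // (E.g., illustratively, "%b = add <8 x i32> %wide.vec, %wide.vec" followed
  // by a single de-interleave shuffle of %b would otherwise be visited twice.)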
  SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;

  for (auto *User : Load->users()) {
    auto *Extract = dyn_cast<ExtractElementInst>(User);
    if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
      Extracts.push_back(Extract);
      continue;
    }
    if (auto *BI = dyn_cast<BinaryOperator>(User)) {
      using namespace PatternMatch;
      if (!BI->user_empty() &&
          all_of(BI->users(), match_fn(m_Shuffle(m_Value(), m_Undef())))) {
        for (auto *SVI : BI->users())
          BinOpShuffles.insert(cast<ShuffleVectorInst>(SVI));
        continue;
      }
    }
    auto *SVI = dyn_cast<ShuffleVectorInst>(User);
    if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
      return false;

    Shuffles.push_back(SVI);
  }

  if (Shuffles.empty() && BinOpShuffles.empty())
    return false;

  unsigned Factor, Index;

  unsigned NumLoadElements =
      cast<FixedVectorType>(Load->getType())->getNumElements();
  auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
  // Check if the first shufflevector is a DE-interleave shuffle.
  if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
                          NumLoadElements))
    return false;

  // Holds the corresponding index for each DE-interleave shuffle.
  SmallVector<unsigned, 4> Indices;

  VectorType *VecTy = cast<VectorType>(FirstSVI->getType());

  // Check if other shufflevectors are also DE-interleaved of the same type
  // and factor as the first shufflevector.
  for (auto *Shuffle : Shuffles) {
    if (Shuffle->getType() != VecTy)
      return false;
    if (!ShuffleVectorInst::isDeInterleaveMaskOfFactor(
            Shuffle->getShuffleMask(), Factor, Index))
      return false;

    assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
    Indices.push_back(Index);
  }
  for (auto *Shuffle : BinOpShuffles) {
    if (Shuffle->getType() != VecTy)
      return false;
    if (!ShuffleVectorInst::isDeInterleaveMaskOfFactor(
            Shuffle->getShuffleMask(), Factor, Index))
      return false;

    assert(Shuffle->getShuffleMask().size() <= NumLoadElements);

    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == Load)
      Indices.push_back(Index);
    if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == Load)
      Indices.push_back(Index);
  }

  // Try and modify users of the load that are extractelement instructions to
  // use the shufflevector instructions instead of the load.
  if (!tryReplaceExtracts(Extracts, Shuffles))
    return false;

  bool BinOpShuffleChanged =
      replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);

  Value *Mask = nullptr;
  auto GapMask = APInt::getAllOnes(Factor);
  if (LI) {
    LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
  } else {
    // Check mask operand. Handle both all-true/false and interleaved mask.
    std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, VecTy);
    if (!Mask)
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: "
                      << *Load << "\n");
    LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
                      << " and actual factor " << GapMask.popcount() << "\n");
  }

  // Try to create target specific intrinsics to replace the load and
  // shuffles.
  if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles,
                                 Indices, Factor, GapMask))
    // If Extracts is not empty, tryReplaceExtracts made changes earlier.
    return !Extracts.empty() || BinOpShuffleChanged;

  DeadInsts.insert_range(Shuffles);

  DeadInsts.insert(Load);
  return true;
}

bool InterleavedAccessImpl::replaceBinOpShuffles(
    ArrayRef<ShuffleVectorInst *> BinOpShuffles,
    SmallVectorImpl<ShuffleVectorInst *> &Shuffles, Instruction *Load) {
  for (auto *SVI : BinOpShuffles) {
    BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
    Type *BIOp0Ty = BI->getOperand(0)->getType();
    ArrayRef<int> Mask = SVI->getShuffleMask();
    assert(all_of(Mask, [&](int Idx) {
      return Idx < (int)cast<FixedVectorType>(BIOp0Ty)->getNumElements();
    }));

    BasicBlock::iterator insertPos = SVI->getIterator();
    auto *NewSVI1 =
        new ShuffleVectorInst(BI->getOperand(0), PoisonValue::get(BIOp0Ty),
                              Mask, SVI->getName(), insertPos);
    auto *NewSVI2 = new ShuffleVectorInst(
        BI->getOperand(1), PoisonValue::get(BI->getOperand(1)->getType()), Mask,
        SVI->getName(), insertPos);
    BinaryOperator *NewBI = BinaryOperator::CreateWithCopiedFlags(
        BI->getOpcode(), NewSVI1, NewSVI2, BI, BI->getName(), insertPos);
    SVI->replaceAllUsesWith(NewBI);
    LLVM_DEBUG(dbgs() << " Replaced: " << *BI << "\n And : " << *SVI
                      << "\n With : " << *NewSVI1 << "\n And : "
                      << *NewSVI2 << "\n And : " << *NewBI << "\n");
    RecursivelyDeleteTriviallyDeadInstructions(SVI);
    if (NewSVI1->getOperand(0) == Load)
      Shuffles.push_back(NewSVI1);
    if (NewSVI2->getOperand(0) == Load)
      Shuffles.push_back(NewSVI2);
  }

  return !BinOpShuffles.empty();
}

bool InterleavedAccessImpl::tryReplaceExtracts(
    ArrayRef<ExtractElementInst *> Extracts,
    ArrayRef<ShuffleVectorInst *> Shuffles) {
  // If there aren't any extractelement instructions to modify, there's nothing
  // to do.
  if (Extracts.empty())
    return true;

  // Maps extractelement instructions to vector-index pairs. The extractelement
  // instructions will be modified to use the new vector and index operands.
  DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;

  for (auto *Extract : Extracts) {
    // The vector index that is extracted.
    auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
    auto Index = IndexOperand->getSExtValue();

    // Look for a suitable shufflevector instruction. The goal is to modify the
    // extractelement instruction (which uses an interleaved load) to use one
    // of the shufflevector instructions instead of the load.
    for (auto *Shuffle : Shuffles) {
      // If the shufflevector instruction doesn't dominate the extract, we
      // can't create a use of it.
      if (!DT->dominates(Shuffle, Extract))
        continue;

      // Inspect the indices of the shufflevector instruction. If the shuffle
      // selects the same index that is extracted, we can modify the
      // extractelement instruction.
      SmallVector<int, 4> Indices;
      Shuffle->getShuffleMask(Indices);
      for (unsigned I = 0; I < Indices.size(); ++I)
        if (Indices[I] == Index) {
          assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
                 "Vector operations do not match");
          ReplacementMap[Extract] = std::make_pair(Shuffle, I);
          break;
        }

      // If we found a suitable shufflevector instruction, stop looking.
      if (ReplacementMap.count(Extract))
        break;
    }

    // If we did not find a suitable shufflevector instruction, the
    // extractelement instruction cannot be modified, so we must give up.
    if (!ReplacementMap.count(Extract))
      return false;
  }

  // Finally, perform the replacements.
  IRBuilder<> Builder(Extracts[0]->getContext());
  for (auto &Replacement : ReplacementMap) {
    auto *Extract = Replacement.first;
    auto *Vector = Replacement.second.first;
    auto Index = Replacement.second.second;
    Builder.SetInsertPoint(Extract);
    Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
    Extract->eraseFromParent();
  }

  return true;
}

bool InterleavedAccessImpl::lowerInterleavedStore(
    Instruction *Store, SmallSetVector<Instruction *, 32> &DeadInsts) {
  Value *StoredValue;
  auto *SI = dyn_cast<StoreInst>(Store);
  auto *II = dyn_cast<IntrinsicInst>(Store);
  if (SI) {
    if (!SI->isSimple())
      return false;
    StoredValue = SI->getValueOperand();
  } else {
    assert(II->getIntrinsicID() == Intrinsic::vp_store ||
           II->getIntrinsicID() == Intrinsic::masked_store);
    StoredValue = II->getArgOperand(0);
  }

  auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
  if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
    return false;

  unsigned NumStoredElements =
      cast<FixedVectorType>(SVI->getType())->getNumElements();
  // Check if the shufflevector is a RE-interleave shuffle.
  unsigned Factor;
  if (!isReInterleaveMask(SVI, Factor, MaxFactor))
    return false;
  assert(NumStoredElements % Factor == 0 &&
         "number of stored elements should be a multiple of Factor");

  Value *Mask = nullptr;
  auto GapMask = APInt::getAllOnes(Factor);
  if (SI) {
    LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *Store << "\n");
  } else {
    // Check mask operand. Handle both all-true/false and interleaved mask.
    unsigned LaneMaskLen = NumStoredElements / Factor;
    std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor,
                                      ElementCount::getFixed(LaneMaskLen));
    if (!Mask)
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: "
                      << *Store << "\n");
    LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
                      << " and actual factor " << GapMask.popcount() << "\n");
  }

  // Try to create target specific intrinsics to replace the store and
  // shuffle.
  if (!TLI->lowerInterleavedStore(Store, Mask, SVI, Factor, GapMask))
    return false;

  // Already have a new target specific interleaved store. Erase the old store.
  DeadInsts.insert(Store);
  DeadInsts.insert(SVI);
  return true;
}

// A wide mask <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0> could be used to skip the
// last field in a factor-of-three interleaved store or deinterleaved load (in
// which case LeafMaskLen is 4). Such a (wide) mask is also known as a gap
// mask. This helper function tries to detect this pattern and clears the
// corresponding bits in GapMask, so that the actual factor being accessed is
// GapMask.popcount() (2 in this example).
static void getGapMask(const Constant &MaskConst, unsigned Factor,
                       unsigned LeafMaskLen, APInt &GapMask) {
  assert(GapMask.getBitWidth() == Factor);
  for (unsigned F = 0U; F < Factor; ++F) {
    bool AllZero = true;
    for (unsigned Idx = 0U; Idx < LeafMaskLen; ++Idx) {
      Constant *C = MaskConst.getAggregateElement(F + Idx * Factor);
      if (!C->isZeroValue()) {
        AllZero = false;
        break;
      }
    }
    // All mask bits on this field are zero, skipping it.
    if (AllZero)
      GapMask.clearBit(F);
  }
}

static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
                                         ElementCount LeafValueEC) {
  auto GapMask = APInt::getAllOnes(Factor);

  if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
    if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID());
        F && F == Factor) {
      Value *RefArg = nullptr;
      // Check if all the intrinsic arguments are the same, except those that
      // are zeros, which we mark as gaps in the gap mask.
      for (auto [Idx, Arg] : enumerate(IMI->args())) {
        if (auto *C = dyn_cast<Constant>(Arg); C && C->isZeroValue()) {
          GapMask.clearBit(Idx);
          continue;
        }

        if (!RefArg)
          RefArg = Arg;
        else if (RefArg != Arg)
          return {nullptr, GapMask};
      }

      // In the very rare case where all the intrinsic arguments are zeros, we
      // still want to return an all-zeros constant instead of nullptr.
      return {RefArg ? RefArg : IMI->getArgOperand(0), GapMask};
    }
  }

  // Masks that are assembled from bitwise AND.
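  // For a wide mask such as "%wide.mask = and <8 x i1> %interleaved.mask,
  // %other" (illustrative), de-interleave each operand separately and
  // recombine the resulting leaf masks with a new 'and'; the gap masks are
  // intersected.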
  if (auto *AndOp = dyn_cast<BinaryOperator>(WideMask);
      AndOp && AndOp->getOpcode() == Instruction::And) {
    auto [MaskLHS, GapMaskLHS] =
        getMask(AndOp->getOperand(0), Factor, LeafValueEC);
    auto [MaskRHS, GapMaskRHS] =
        getMask(AndOp->getOperand(1), Factor, LeafValueEC);
    if (!MaskLHS || !MaskRHS)
      return {nullptr, GapMask};
    // Using IRBuilder here so that any trivial constants could be folded right
    // away.
    return {IRBuilder<>(AndOp).CreateAnd(MaskLHS, MaskRHS),
            GapMaskLHS & GapMaskRHS};
  }

  if (auto *ConstMask = dyn_cast<Constant>(WideMask)) {
    if (auto *Splat = ConstMask->getSplatValue())
      // All-ones or all-zeros mask.
      return {ConstantVector::getSplat(LeafValueEC, Splat), GapMask};

    if (LeafValueEC.isFixed()) {
      unsigned LeafMaskLen = LeafValueEC.getFixedValue();
      // First, check if we use a gap mask to skip some of the factors / fields.
      getGapMask(*ConstMask, Factor, LeafMaskLen, GapMask);

      SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr);
      // If this is a fixed-length constant mask, each lane / leaf has to
      // use the same mask. This is done by checking if every group with Factor
      // number of elements in the interleaved mask has homogeneous values.
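      // E.g. (illustrative) with Factor = 2, <1, 1, 1, 1, 0, 0, 0, 0>
      // de-interleaves to the leaf mask <1, 1, 0, 0>, whereas a group with
      // mixed values such as <1, 0, ...> is rejected.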
      for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) {
        if (!GapMask[Idx % Factor])
          continue;
        Constant *C = ConstMask->getAggregateElement(Idx);
        if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C)
          return {nullptr, GapMask};
        LeafMask[Idx / Factor] = C;
      }

      return {ConstantVector::get(LeafMask), GapMask};
    }
  }

  if (auto *SVI = dyn_cast<ShuffleVectorInst>(WideMask)) {
    Type *Op1Ty = SVI->getOperand(1)->getType();
    if (!isa<FixedVectorType>(Op1Ty))
      return {nullptr, GapMask};

    // Check that the shuffle mask is: a) an interleave, b) all of the same
    // set of the elements, and c) contained by the first source. (c) could
    // be relaxed if desired.
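    // E.g. (illustrative) "shufflevector <4 x i1> %m, <4 x i1> poison,
    // <0, 0, 1, 1, 2, 2, 3, 3>" is an interleave-by-2 of %m drawn entirely
    // from the first source, so %m itself serves as the leaf mask.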
    unsigned NumSrcElts =
        cast<FixedVectorType>(SVI->getOperand(1)->getType())->getNumElements();
    SmallVector<unsigned> StartIndexes;
    if (ShuffleVectorInst::isInterleaveMask(SVI->getShuffleMask(), Factor,
                                            NumSrcElts * 2, StartIndexes) &&
        llvm::all_of(StartIndexes, equal_to(0)) &&
        llvm::all_of(SVI->getShuffleMask(), [&NumSrcElts](int Idx) {
          return Idx < (int)NumSrcElts;
        })) {
      auto *LeafMaskTy =
          VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC);
      IRBuilder<> Builder(SVI);
      return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0),
                                          uint64_t(0)),
              GapMask};
    }
  }

  return {nullptr, GapMask};
}

bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
    IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
  Instruction *LoadedVal = dyn_cast<Instruction>(DI->getOperand(0));
  if (!LoadedVal || !LoadedVal->hasOneUse())
    return false;

  auto *LI = dyn_cast<LoadInst>(LoadedVal);
  auto *II = dyn_cast<IntrinsicInst>(LoadedVal);
  if (!LI && !II)
    return false;

  const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
  assert(Factor && "unexpected deinterleave intrinsic");

  Value *Mask = nullptr;
  if (LI) {
    if (!LI->isSimple())
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found a load with deinterleave intrinsic " << *DI
                      << " and factor = " << Factor << "\n");
  } else {
    assert(II);
    if (II->getIntrinsicID() != Intrinsic::masked_load &&
        II->getIntrinsicID() != Intrinsic::vp_load)
      return false;

    // Check mask operand. Handle both all-true/false and interleaved mask.
    APInt GapMask(Factor, 0);
    std::tie(Mask, GapMask) =
        getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
    if (!Mask)
      return false;
    // Gap masks are not yet supported when deinterleaving via intrinsics.
    // However, getMask may already have changed the IR, hence return true
    // here.
    if (GapMask.popcount() != Factor)
      return true;

    LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave"
                      << " intrinsic " << *DI << " and factor = "
                      << Factor << "\n");
  }

  // Try and match this with target specific intrinsics.
  if (!TLI->lowerDeinterleaveIntrinsicToLoad(LoadedVal, Mask, DI))
    return false;

  DeadInsts.insert(DI);
  // We now have a target-specific load, so delete the old one.
  DeadInsts.insert(LoadedVal);
  return true;
}

bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
    IntrinsicInst *IntII, SmallSetVector<Instruction *, 32> &DeadInsts) {
  if (!IntII->hasOneUse())
    return false;
  Instruction *StoredBy = dyn_cast<Instruction>(IntII->user_back());
  if (!StoredBy)
    return false;
  auto *SI = dyn_cast<StoreInst>(StoredBy);
  auto *II = dyn_cast<IntrinsicInst>(StoredBy);
  if (!SI && !II)
    return false;

  SmallVector<Value *, 8> InterleaveValues(IntII->args());
  const unsigned Factor = getInterleaveIntrinsicFactor(IntII->getIntrinsicID());
  assert(Factor && "unexpected interleave intrinsic");

  Value *Mask = nullptr;
  if (II) {
    if (II->getIntrinsicID() != Intrinsic::masked_store &&
        II->getIntrinsicID() != Intrinsic::vp_store)
      return false;
    // Check mask operand. Handle both all-true/false and interleaved mask.
    APInt GapMask(Factor, 0);
    std::tie(Mask, GapMask) =
        getMask(getMaskOperand(II), Factor,
                cast<VectorType>(InterleaveValues[0]->getType()));
    if (!Mask)
      return false;
    // Gap masks are not yet supported when interleaving via intrinsics.
    // However, getMask may already have changed the IR, hence return true
    // here.
    if (GapMask.popcount() != Factor)
      return true;

    LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave"
                      << " intrinsic " << *IntII << " and factor = "
                      << Factor << "\n");
  } else {
    if (!SI->isSimple())
      return false;

    LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic "
                      << *IntII << " and factor = " << Factor << "\n");
  }

  // Try and match this with target specific intrinsics.
  if (!TLI->lowerInterleaveIntrinsicToStore(StoredBy, Mask, InterleaveValues))
    return false;

  // We now have a target-specific store, so delete the old one.
  DeadInsts.insert(StoredBy);
  DeadInsts.insert(IntII);
  return true;
}

bool InterleavedAccessImpl::runOnFunction(Function &F) {
  // Holds dead instructions that will be erased later.
  SmallSetVector<Instruction *, 32> DeadInsts;
  bool Changed = false;

  using namespace PatternMatch;
  for (auto &I : instructions(F)) {
    if (match(&I, m_CombineOr(m_Load(m_Value()),
                              m_Intrinsic<Intrinsic::vp_load>())) ||
        match(&I, m_Intrinsic<Intrinsic::masked_load>()))
      Changed |= lowerInterleavedLoad(&I, DeadInsts);

    if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
                              m_Intrinsic<Intrinsic::vp_store>())) ||
        match(&I, m_Intrinsic<Intrinsic::masked_store>()))
      Changed |= lowerInterleavedStore(&I, DeadInsts);

    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
      if (getDeinterleaveIntrinsicFactor(II->getIntrinsicID()))
        Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
      else if (getInterleaveIntrinsicFactor(II->getIntrinsicID()))
        Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
    }
  }

  for (auto *I : DeadInsts)
    I->eraseFromParent();

  return Changed;
}