| 1 | //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // This pass implements the Bottom Up SLP vectorizer. It detects consecutive |
| 10 | // stores that can be put together into vector-stores. Next, it attempts to |
| 11 | // construct vectorizable tree using the use-def chains. If a profitable tree |
| 12 | // was found, the SLP vectorizer performs vectorization on the tree. |
| 13 | // |
| 14 | // The pass is inspired by the work described in the paper: |
| 15 | // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. |
| 16 | // |
| 17 | //===----------------------------------------------------------------------===// |
| 18 | |
| 19 | #include "llvm/Transforms/Vectorize/SLPVectorizer.h" |
| 20 | #include "llvm/ADT/DenseMap.h" |
| 21 | #include "llvm/ADT/DenseSet.h" |
| 22 | #include "llvm/ADT/PriorityQueue.h" |
| 23 | #include "llvm/ADT/STLExtras.h" |
| 24 | #include "llvm/ADT/ScopeExit.h" |
| 25 | #include "llvm/ADT/SetOperations.h" |
| 26 | #include "llvm/ADT/SetVector.h" |
| 27 | #include "llvm/ADT/SmallBitVector.h" |
| 28 | #include "llvm/ADT/SmallPtrSet.h" |
| 29 | #include "llvm/ADT/SmallSet.h" |
| 30 | #include "llvm/ADT/SmallString.h" |
| 31 | #include "llvm/ADT/Statistic.h" |
| 32 | #include "llvm/ADT/iterator.h" |
| 33 | #include "llvm/ADT/iterator_range.h" |
| 34 | #include "llvm/Analysis/AliasAnalysis.h" |
| 35 | #include "llvm/Analysis/AssumptionCache.h" |
| 36 | #include "llvm/Analysis/CodeMetrics.h" |
| 37 | #include "llvm/Analysis/ConstantFolding.h" |
| 38 | #include "llvm/Analysis/DemandedBits.h" |
| 39 | #include "llvm/Analysis/GlobalsModRef.h" |
| 40 | #include "llvm/Analysis/IVDescriptors.h" |
| 41 | #include "llvm/Analysis/Loads.h" |
| 42 | #include "llvm/Analysis/LoopAccessAnalysis.h" |
| 43 | #include "llvm/Analysis/LoopInfo.h" |
| 44 | #include "llvm/Analysis/MemoryLocation.h" |
| 45 | #include "llvm/Analysis/OptimizationRemarkEmitter.h" |
| 46 | #include "llvm/Analysis/ScalarEvolution.h" |
| 47 | #include "llvm/Analysis/ScalarEvolutionExpressions.h" |
| 48 | #include "llvm/Analysis/TargetLibraryInfo.h" |
| 49 | #include "llvm/Analysis/TargetTransformInfo.h" |
| 50 | #include "llvm/Analysis/ValueTracking.h" |
| 51 | #include "llvm/Analysis/VectorUtils.h" |
| 52 | #include "llvm/IR/Attributes.h" |
| 53 | #include "llvm/IR/BasicBlock.h" |
| 54 | #include "llvm/IR/Constant.h" |
| 55 | #include "llvm/IR/Constants.h" |
| 56 | #include "llvm/IR/DataLayout.h" |
| 57 | #include "llvm/IR/DerivedTypes.h" |
| 58 | #include "llvm/IR/Dominators.h" |
| 59 | #include "llvm/IR/Function.h" |
| 60 | #include "llvm/IR/IRBuilder.h" |
| 61 | #include "llvm/IR/InstrTypes.h" |
| 62 | #include "llvm/IR/Instruction.h" |
| 63 | #include "llvm/IR/Instructions.h" |
| 64 | #include "llvm/IR/IntrinsicInst.h" |
| 65 | #include "llvm/IR/Intrinsics.h" |
| 66 | #include "llvm/IR/Module.h" |
| 67 | #include "llvm/IR/Operator.h" |
| 68 | #include "llvm/IR/PatternMatch.h" |
| 69 | #include "llvm/IR/Type.h" |
| 70 | #include "llvm/IR/Use.h" |
| 71 | #include "llvm/IR/User.h" |
| 72 | #include "llvm/IR/Value.h" |
| 73 | #include "llvm/IR/ValueHandle.h" |
| 74 | #ifdef EXPENSIVE_CHECKS |
| 75 | #include "llvm/IR/Verifier.h" |
| 76 | #endif |
| 77 | #include "llvm/Pass.h" |
| 78 | #include "llvm/Support/Casting.h" |
| 79 | #include "llvm/Support/CommandLine.h" |
| 80 | #include "llvm/Support/Compiler.h" |
| 81 | #include "llvm/Support/DOTGraphTraits.h" |
| 82 | #include "llvm/Support/Debug.h" |
| 83 | #include "llvm/Support/DebugCounter.h" |
| 84 | #include "llvm/Support/ErrorHandling.h" |
| 85 | #include "llvm/Support/GraphWriter.h" |
| 86 | #include "llvm/Support/InstructionCost.h" |
| 87 | #include "llvm/Support/KnownBits.h" |
| 88 | #include "llvm/Support/MathExtras.h" |
| 89 | #include "llvm/Support/raw_ostream.h" |
| 90 | #include "llvm/Transforms/Utils/InjectTLIMappings.h" |
| 91 | #include "llvm/Transforms/Utils/Local.h" |
| 92 | #include "llvm/Transforms/Utils/LoopUtils.h" |
| 93 | #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" |
| 94 | #include <algorithm> |
| 95 | #include <cassert> |
| 96 | #include <cstdint> |
| 97 | #include <iterator> |
| 98 | #include <memory> |
| 99 | #include <optional> |
| 100 | #include <set> |
| 101 | #include <string> |
| 102 | #include <tuple> |
| 103 | #include <utility> |
| 104 | |
| 105 | using namespace llvm; |
| 106 | using namespace llvm::PatternMatch; |
| 107 | using namespace slpvectorizer; |
| 108 | using namespace std::placeholders; |
| 109 | |
| 110 | #define SV_NAME "slp-vectorizer" |
| 111 | #define DEBUG_TYPE "SLP" |
| 112 | |
| 113 | STATISTIC(NumVectorInstructions, "Number of vector instructions generated" ); |
| 114 | |
| 115 | DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized" , |
| 116 | "Controls which SLP graphs should be vectorized." ); |
| 117 | |
| 118 | static cl::opt<bool> |
| 119 | RunSLPVectorization("vectorize-slp" , cl::init(Val: true), cl::Hidden, |
| 120 | cl::desc("Run the SLP vectorization passes" )); |
| 121 | |
| 122 | static cl::opt<bool> |
| 123 | SLPReVec("slp-revec" , cl::init(Val: false), cl::Hidden, |
| 124 | cl::desc("Enable vectorization for wider vector utilization" )); |
| 125 | |
| 126 | static cl::opt<int> |
| 127 | SLPCostThreshold("slp-threshold" , cl::init(Val: 0), cl::Hidden, |
| 128 | cl::desc("Only vectorize if you gain more than this " |
| 129 | "number " )); |
| 130 | |
| 131 | static cl::opt<bool> SLPSkipEarlyProfitabilityCheck( |
| 132 | "slp-skip-early-profitability-check" , cl::init(Val: false), cl::Hidden, |
| 133 | cl::desc("When true, SLP vectorizer bypasses profitability checks based on " |
| 134 | "heuristics and makes vectorization decision via cost modeling." )); |
| 135 | |
| 136 | static cl::opt<bool> |
| 137 | ShouldVectorizeHor("slp-vectorize-hor" , cl::init(Val: true), cl::Hidden, |
| 138 | cl::desc("Attempt to vectorize horizontal reductions" )); |
| 139 | |
| 140 | static cl::opt<bool> ShouldStartVectorizeHorAtStore( |
| 141 | "slp-vectorize-hor-store" , cl::init(Val: false), cl::Hidden, |
| 142 | cl::desc( |
| 143 | "Attempt to vectorize horizontal reductions feeding into a store" )); |
| 144 | |
| 145 | static cl::opt<bool> SplitAlternateInstructions( |
| 146 | "slp-split-alternate-instructions" , cl::init(Val: true), cl::Hidden, |
| 147 | cl::desc("Improve the code quality by splitting alternate instructions" )); |
| 148 | |
| 149 | static cl::opt<int> |
| 150 | MaxVectorRegSizeOption("slp-max-reg-size" , cl::init(Val: 128), cl::Hidden, |
| 151 | cl::desc("Attempt to vectorize for this register size in bits" )); |
| 152 | |
| 153 | static cl::opt<unsigned> |
| 154 | MaxVFOption("slp-max-vf" , cl::init(Val: 0), cl::Hidden, |
| 155 | cl::desc("Maximum SLP vectorization factor (0=unlimited)" )); |
| 156 | |
| 157 | /// Limits the size of scheduling regions in a block. |
| 158 | /// It avoids long compile times for _very_ large blocks where vector |
| 159 | /// instructions are spread over a wide range. |
| 160 | /// This limit is way higher than needed by real-world functions. |
| 161 | static cl::opt<int> |
| 162 | ScheduleRegionSizeBudget("slp-schedule-budget" , cl::init(Val: 100000), cl::Hidden, |
| 163 | cl::desc("Limit the size of the SLP scheduling region per block" )); |
| 164 | |
| 165 | static cl::opt<int> MinVectorRegSizeOption( |
| 166 | "slp-min-reg-size" , cl::init(Val: 128), cl::Hidden, |
| 167 | cl::desc("Attempt to vectorize for this register size in bits" )); |
| 168 | |
| 169 | static cl::opt<unsigned> RecursionMaxDepth( |
| 170 | "slp-recursion-max-depth" , cl::init(Val: 12), cl::Hidden, |
| 171 | cl::desc("Limit the recursion depth when building a vectorizable tree" )); |
| 172 | |
| 173 | static cl::opt<unsigned> MinTreeSize( |
| 174 | "slp-min-tree-size" , cl::init(Val: 3), cl::Hidden, |
| 175 | cl::desc("Only vectorize small trees if they are fully vectorizable" )); |
| 176 | |
| 177 | // The maximum depth that the look-ahead score heuristic will explore. |
| 178 | // The higher this value, the higher the compilation time overhead. |
| 179 | static cl::opt<int> LookAheadMaxDepth( |
| 180 | "slp-max-look-ahead-depth" , cl::init(Val: 2), cl::Hidden, |
| 181 | cl::desc("The maximum look-ahead depth for operand reordering scores" )); |
| 182 | |
| 183 | // The maximum depth that the look-ahead score heuristic will explore |
| 184 | // when probing among candidates for vectorization tree roots. |
| 185 | // The higher this value, the higher the compilation time overhead. Unlike the |
| 186 | // similar limit for operand reordering, this one is used less frequently, so |
| 187 | // the impact of a higher value is less noticeable. |
| 188 | static cl::opt<int> RootLookAheadMaxDepth( |
| 189 | "slp-max-root-look-ahead-depth" , cl::init(Val: 2), cl::Hidden, |
| 190 | cl::desc("The maximum look-ahead depth for searching best rooting option" )); |
| 191 | |
| 192 | static cl::opt<unsigned> MinProfitableStridedLoads( |
| 193 | "slp-min-strided-loads" , cl::init(Val: 2), cl::Hidden, |
| 194 | cl::desc("The minimum number of loads, which should be considered strided, " |
| 195 | "if the stride is > 1 or is runtime value" )); |
| 196 | |
| 197 | static cl::opt<unsigned> MaxProfitableLoadStride( |
| 198 | "slp-max-stride" , cl::init(Val: 8), cl::Hidden, |
| 199 | cl::desc("The maximum stride, considered to be profitable." )); |
| 200 | |
| 201 | static cl::opt<bool> |
| 202 | ViewSLPTree("view-slp-tree" , cl::Hidden, |
| 203 | cl::desc("Display the SLP trees with Graphviz" )); |
| 204 | |
| 205 | static cl::opt<bool> VectorizeNonPowerOf2( |
| 206 | "slp-vectorize-non-power-of-2" , cl::init(Val: false), cl::Hidden, |
| 207 | cl::desc("Try to vectorize with non-power-of-2 number of elements." )); |
| 208 | |
| 209 | // Limit the number of alias checks. The limit is chosen so that |
| 210 | // it has no negative effect on the llvm benchmarks. |
| 211 | static const unsigned AliasedCheckLimit = 10; |
| 212 | |
| 213 | // Limit of the number of uses for potentially transformed instructions/values, |
| 214 | // used in checks to avoid compile-time explosion. |
| 215 | static constexpr int UsesLimit = 64; |
| 216 | |
| 217 | // Another limit for the alias checks: The maximum distance between load/store |
| 218 | // instructions where alias checks are done. |
| 219 | // This limit is useful for very large basic blocks. |
| 220 | static const unsigned MaxMemDepDistance = 160; |
| 221 | |
| 222 | /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling |
| 223 | /// regions to be handled. |
| 224 | static const int MinScheduleRegionSize = 16; |
| 225 | |
| 226 | /// Maximum allowed number of operands in the PHI nodes. |
| 227 | static const unsigned MaxPHINumOperands = 128; |
| 228 | |
| 229 | /// Predicate for the element types that the SLP vectorizer supports. |
| 230 | /// |
| 231 | /// The most important things to filter here are types which are invalid in |
| 232 | /// LLVM vectors. We also filter target-specific types which have absolutely no |
| 233 | /// meaningful vectorization path, such as x86_fp80 and ppc_fp128. This just |
| 234 | /// avoids spending time checking the cost model and realizing that they will |
| 235 | /// be inevitably scalarized. |
| 236 | static bool isValidElementType(Type *Ty) { |
| 237 | // TODO: Support ScalableVectorType. |
| 238 | if (SLPReVec && isa<FixedVectorType>(Val: Ty)) |
| 239 | Ty = Ty->getScalarType(); |
| 240 | return VectorType::isValidElementType(ElemTy: Ty) && !Ty->isX86_FP80Ty() && |
| 241 | !Ty->isPPC_FP128Ty(); |
| 242 | } |
| 243 | |
| 244 | /// Returns the type of the given value/instruction \p V. If it is a store, |
| 245 | /// returns the type of its value operand; for Cmp - the type of the compare |
| 246 | /// operands; and for insertelement - the type of the inserted operand. |
| 247 | /// Otherwise, just the type of the value is returned. |
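|  | /// For illustration: for `store i32 %v, ptr %p` this returns i32; for |
|  | /// `icmp eq i32 %a, %b` it returns i32 (the operand type, not i1); and for |
|  | /// `insertelement <4 x float> %vec, float %x, i32 0` it returns float. |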
| 248 | static Type *getValueType(Value *V) { |
| 249 | if (auto *SI = dyn_cast<StoreInst>(Val: V)) |
| 250 | return SI->getValueOperand()->getType(); |
| 251 | if (auto *CI = dyn_cast<CmpInst>(Val: V)) |
| 252 | return CI->getOperand(i_nocapture: 0)->getType(); |
| 253 | if (auto *IE = dyn_cast<InsertElementInst>(Val: V)) |
| 254 | return IE->getOperand(i_nocapture: 1)->getType(); |
| 255 | return V->getType(); |
| 256 | } |
| 257 | |
| 258 | /// \returns the number of elements for Ty. |
| 259 | static unsigned getNumElements(Type *Ty) { |
| 260 | assert(!isa<ScalableVectorType>(Ty) && |
| 261 | "ScalableVectorType is not supported." ); |
| 262 | if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Ty)) |
| 263 | return VecTy->getNumElements(); |
| 264 | return 1; |
| 265 | } |
| 266 | |
| 267 | /// \returns the vector type of ScalarTy based on vectorization factor. |
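|  | /// For illustration: for ScalarTy == i32 and VF == 4 this returns <4 x i32>; |
|  | /// for a vector ScalarTy such as <2 x i16> (the REVEC case) with VF == 4 it |
|  | /// returns <8 x i16>. |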
| 268 | static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) { |
| 269 | return FixedVectorType::get(ElementType: ScalarTy->getScalarType(), |
| 270 | NumElts: VF * getNumElements(Ty: ScalarTy)); |
| 271 | } |
| 272 | |
| 273 | /// Returns the number of elements of the given type \p Ty, not less than \p Sz, |
| 274 | /// which forms a type that \p TTI splits into whole vector types during |
| 275 | /// legalization. |
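|  | /// Worked example (assuming a hypothetical target where \p TTI legalizes |
|  | /// <6 x i32> into NumParts == 2 registers): for Sz == 6 the result is |
|  | /// bit_ceil(divideCeil(6, 2)) * 2 == 4 * 2 == 8, i.e. two whole <4 x i32> parts. |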
| 276 | static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, |
| 277 | Type *Ty, unsigned Sz) { |
| 278 | if (!isValidElementType(Ty)) |
| 279 | return bit_ceil(Value: Sz); |
| 280 | // Find the number of elements, which forms full vectors. |
| 281 | const unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz)); |
| 282 | if (NumParts == 0 || NumParts >= Sz) |
| 283 | return bit_ceil(Value: Sz); |
| 284 | return bit_ceil(Value: divideCeil(Numerator: Sz, Denominator: NumParts)) * NumParts; |
| 285 | } |
| 286 | |
| 287 | /// Returns the number of elements of the given type \p Ty, not greater than \p |
| 288 | /// Sz, which forms a type that \p TTI splits into whole vector types during |
| 289 | /// legalization. |
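|  | /// Worked example (same hypothetical target as above, NumParts == 2 for |
|  | /// <6 x i32>): for Sz == 6, RegVF == bit_ceil(divideCeil(6, 2)) == 4, so the |
|  | /// result is (6 / 4) * 4 == 4, i.e. one whole <4 x i32> part. |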
| 290 | static unsigned |
| 291 | getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, |
| 292 | unsigned Sz) { |
| 293 | if (!isValidElementType(Ty)) |
| 294 | return bit_floor(Value: Sz); |
| 295 | // Find the number of elements, which forms full vectors. |
| 296 | unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz)); |
| 297 | if (NumParts == 0 || NumParts >= Sz) |
| 298 | return bit_floor(Value: Sz); |
| 299 | unsigned RegVF = bit_ceil(Value: divideCeil(Numerator: Sz, Denominator: NumParts)); |
| 300 | if (RegVF > Sz) |
| 301 | return bit_floor(Value: Sz); |
| 302 | return (Sz / RegVF) * RegVF; |
| 303 | } |
| 304 | |
| 305 | static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, |
| 306 | SmallVectorImpl<int> &Mask) { |
| 307 | // The ShuffleBuilder implementation uses shufflevector to splat an "element". |
| 308 | // But the element has a different meaning for SLP (scalar) and REVEC |
| 309 | // (vector). We need to expand Mask into masks which shufflevector can use |
| 310 | // directly. |
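|  | // For illustration: with VecTyNumElements == 2 and Mask == {1, 0}, the |
|  | // expanded mask becomes {2, 3, 0, 1}; a poison element expands to |
|  | // {PoisonMaskElem, PoisonMaskElem}. |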
| 311 | SmallVector<int> NewMask(Mask.size() * VecTyNumElements); |
| 312 | for (unsigned I : seq<unsigned>(Size: Mask.size())) |
| 313 | for (auto [J, MaskV] : enumerate(First: MutableArrayRef(NewMask).slice( |
| 314 | N: I * VecTyNumElements, M: VecTyNumElements))) |
| 315 | MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem |
| 316 | : Mask[I] * VecTyNumElements + J; |
| 317 | Mask.swap(RHS&: NewMask); |
| 318 | } |
| 319 | |
| 320 | /// \returns the number of groups of shufflevectors. |
| 321 | /// A group has the following features: |
| 322 | /// 1. All values in a group are shufflevectors. |
| 323 | /// 2. The mask of each shufflevector is an extract-subvector mask. |
| 324 | /// 3. Together, the masks in a group use all of the elements of the source. |
| 325 | /// e.g., the following is 1 group (source %0): |
| 326 | /// %1 = shufflevector <16 x i8> %0, <16 x i8> poison, |
| 327 | /// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> |
| 328 | /// %2 = shufflevector <16 x i8> %0, <16 x i8> poison, |
| 329 | /// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> |
| 330 | /// the following is 2 groups (sources %3 and %4): |
| 331 | /// %5 = shufflevector <8 x i16> %3, <8 x i16> poison, |
| 332 | /// <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| 333 | /// %6 = shufflevector <8 x i16> %3, <8 x i16> poison, |
| 334 | /// <4 x i32> <i32 4, i32 5, i32 6, i32 7> |
| 335 | /// %7 = shufflevector <8 x i16> %4, <8 x i16> poison, |
| 336 | /// <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| 337 | /// %8 = shufflevector <8 x i16> %4, <8 x i16> poison, |
| 338 | /// <4 x i32> <i32 4, i32 5, i32 6, i32 7> |
| 339 | /// the following is 0 groups: |
| 340 | /// %12 = shufflevector <8 x i16> %10, <8 x i16> poison, |
| 341 | /// <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| 342 | /// %13 = shufflevector <8 x i16> %11, <8 x i16> poison, |
| 343 | /// <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| 344 | static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) { |
| 345 | if (VL.empty()) |
| 346 | return 0; |
| 347 | if (!all_of(Range&: VL, P: IsaPred<ShuffleVectorInst>)) |
| 348 | return 0; |
| 349 | auto *SV = cast<ShuffleVectorInst>(Val: VL.front()); |
| 350 | unsigned SVNumElements = |
| 351 | cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())->getNumElements(); |
| 352 | unsigned ShuffleMaskSize = SV->getShuffleMask().size(); |
| 353 | if (SVNumElements % ShuffleMaskSize != 0) |
| 354 | return 0; |
| 355 | unsigned GroupSize = SVNumElements / ShuffleMaskSize; |
| 356 | if (GroupSize == 0 || (VL.size() % GroupSize) != 0) |
| 357 | return 0; |
| 358 | unsigned NumGroup = 0; |
| 359 | for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) { |
| 360 | auto *SV = cast<ShuffleVectorInst>(Val: VL[I]); |
| 361 | Value *Src = SV->getOperand(i_nocapture: 0); |
| 362 | ArrayRef<Value *> Group = VL.slice(N: I, M: GroupSize); |
| 363 | SmallBitVector ExpectedIndex(GroupSize); |
| 364 | if (!all_of(Range&: Group, P: [&](Value *V) { |
| 365 | auto *SV = cast<ShuffleVectorInst>(Val: V); |
| 366 | // From the same source. |
| 367 | if (SV->getOperand(i_nocapture: 0) != Src) |
| 368 | return false; |
| 369 | int Index; |
| 370 | if (!SV->isExtractSubvectorMask(Index)) |
| 371 | return false; |
| 372 | ExpectedIndex.set(Index / ShuffleMaskSize); |
| 373 | return true; |
| 374 | })) |
| 375 | return 0; |
| 376 | if (!ExpectedIndex.all()) |
| 377 | return 0; |
| 378 | ++NumGroup; |
| 379 | } |
| 380 | assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups" ); |
| 381 | return NumGroup; |
| 382 | } |
| 383 | |
| 384 | /// \returns a shufflevector mask which is used to vectorize shufflevectors |
| 385 | /// e.g., |
| 386 | /// %5 = shufflevector <8 x i16> %3, <8 x i16> poison, |
| 387 | /// <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| 388 | /// %6 = shufflevector <8 x i16> %3, <8 x i16> poison, |
| 389 | /// <4 x i32> <i32 4, i32 5, i32 6, i32 7> |
| 390 | /// %7 = shufflevector <8 x i16> %4, <8 x i16> poison, |
| 391 | /// <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| 392 | /// %8 = shufflevector <8 x i16> %4, <8 x i16> poison, |
| 393 | /// <4 x i32> <i32 4, i32 5, i32 6, i32 7> |
| 394 | /// the result is |
| 395 | /// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31> |
| 396 | static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) { |
| 397 | assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage." ); |
| 398 | auto *SV = cast<ShuffleVectorInst>(Val: VL.front()); |
| 399 | unsigned SVNumElements = |
| 400 | cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())->getNumElements(); |
| 401 | SmallVector<int> Mask; |
| 402 | unsigned AccumulateLength = 0; |
| 403 | for (Value *V : VL) { |
| 404 | auto *SV = cast<ShuffleVectorInst>(Val: V); |
| 405 | for (int M : SV->getShuffleMask()) |
| 406 | Mask.push_back(Elt: M == PoisonMaskElem ? PoisonMaskElem |
| 407 | : AccumulateLength + M); |
| 408 | AccumulateLength += SVNumElements; |
| 409 | } |
| 410 | return Mask; |
| 411 | } |
| 412 | |
| 413 | /// \returns True if the value is a constant (but not globals/constant |
| 414 | /// expressions). |
| 415 | static bool isConstant(Value *V) { |
| 416 | return isa<Constant>(Val: V) && !isa<ConstantExpr, GlobalValue>(Val: V); |
| 417 | } |
| 418 | |
| 419 | /// Checks if \p V is one of vector-like instructions, i.e. undef, |
| 420 | /// insertelement/extractelement with constant indices for fixed vector type or |
| 421 | /// extractvalue instruction. |
| 422 | static bool isVectorLikeInstWithConstOps(Value *V) { |
| 423 | if (!isa<InsertElementInst, ExtractElementInst>(Val: V) && |
| 424 | !isa<ExtractValueInst, UndefValue>(Val: V)) |
| 425 | return false; |
| 426 | auto *I = dyn_cast<Instruction>(Val: V); |
| 427 | if (!I || isa<ExtractValueInst>(Val: I)) |
| 428 | return true; |
| 429 | if (!isa<FixedVectorType>(Val: I->getOperand(i: 0)->getType())) |
| 430 | return false; |
| 431 | if (isa<ExtractElementInst>(Val: I)) |
| 432 | return isConstant(V: I->getOperand(i: 1)); |
| 433 | assert(isa<InsertElementInst>(V) && "Expected only insertelement." ); |
| 434 | return isConstant(V: I->getOperand(i: 2)); |
| 435 | } |
| 436 | |
| 437 | /// Returns the power-of-2 number of elements in a single register (part), given |
| 438 | /// the total number of elements \p Size and the number of registers (parts) \p |
| 439 | /// NumParts. |
| 440 | static unsigned getPartNumElems(unsigned Size, unsigned NumParts) { |
| 441 | return std::min<unsigned>(a: Size, b: bit_ceil(Value: divideCeil(Numerator: Size, Denominator: NumParts))); |
| 442 | } |
| 443 | |
| 444 | /// Returns the correct number of remaining elements, considering the total |
| 445 | /// amount \p Size, the (power-of-2) number of elements in a single register |
| 446 | /// \p PartNumElems and the current register (part) \p Part. |
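|  | /// Worked example: for Size == 13 and NumParts == 4, getPartNumElems returns |
|  | /// min(13, bit_ceil(divideCeil(13, 4))) == 4, and for the last part (Part == 3) |
|  | /// getNumElems returns min(4, 13 - 3 * 4) == 1. |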
| 447 | static unsigned getNumElems(unsigned Size, unsigned PartNumElems, |
| 448 | unsigned Part) { |
| 449 | return std::min<unsigned>(a: PartNumElems, b: Size - Part * PartNumElems); |
| 450 | } |
| 451 | |
| 452 | #if !defined(NDEBUG) |
| 453 | /// Print a short descriptor of the instruction bundle suitable for debug output. |
| 454 | static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) { |
| 455 | std::string Result; |
| 456 | raw_string_ostream OS(Result); |
| 457 | if (Idx >= 0) |
| 458 | OS << "Idx: " << Idx << ", " ; |
| 459 | OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]" ; |
| 460 | return Result; |
| 461 | } |
| 462 | #endif |
| 463 | |
| 464 | /// \returns true if all of the instructions in \p VL are in the same block or |
| 465 | /// false otherwise. |
| 466 | static bool allSameBlock(ArrayRef<Value *> VL) { |
| 467 | auto *It = find_if(Range&: VL, P: IsaPred<Instruction>); |
| 468 | if (It == VL.end()) |
| 469 | return false; |
| 470 | Instruction *I0 = cast<Instruction>(Val: *It); |
| 471 | if (all_of(Range&: VL, P: isVectorLikeInstWithConstOps)) |
| 472 | return true; |
| 473 | |
| 474 | BasicBlock *BB = I0->getParent(); |
| 475 | for (Value *V : iterator_range(It, VL.end())) { |
| 476 | if (isa<PoisonValue>(Val: V)) |
| 477 | continue; |
| 478 | auto *II = dyn_cast<Instruction>(Val: V); |
| 479 | if (!II) |
| 480 | return false; |
| 481 | |
| 482 | if (BB != II->getParent()) |
| 483 | return false; |
| 484 | } |
| 485 | return true; |
| 486 | } |
| 487 | |
| 488 | /// \returns True if all of the values in \p VL are constants (but not |
| 489 | /// globals/constant expressions). |
| 490 | static bool allConstant(ArrayRef<Value *> VL) { |
| 491 | // Constant expressions and globals can't be vectorized like normal integer/FP |
| 492 | // constants. |
| 493 | return all_of(Range&: VL, P: isConstant); |
| 494 | } |
| 495 | |
| 496 | /// \returns True if all of the values in \p VL are identical or some of them |
| 497 | /// are UndefValue. |
| 498 | static bool isSplat(ArrayRef<Value *> VL) { |
| 499 | Value *FirstNonUndef = nullptr; |
| 500 | for (Value *V : VL) { |
| 501 | if (isa<UndefValue>(Val: V)) |
| 502 | continue; |
| 503 | if (!FirstNonUndef) { |
| 504 | FirstNonUndef = V; |
| 505 | continue; |
| 506 | } |
| 507 | if (V != FirstNonUndef) |
| 508 | return false; |
| 509 | } |
| 510 | return FirstNonUndef != nullptr; |
| 511 | } |
| 512 | |
| 513 | /// \returns True if \p I is commutative, handles CmpInst and BinaryOperator. |
| 514 | /// For BinaryOperator, it also checks if \p InstWithUses is used in specific |
| 515 | /// patterns that make it effectively commutative (like equality comparisons |
| 516 | /// with zero). |
| 517 | /// In most cases, users should not call this function directly (since \p I and |
| 518 | /// \p InstWithUses are the same). However, when analyzing interchangeable |
| 519 | /// instructions, we need to use the converted opcode along with the original |
| 520 | /// uses. |
| 521 | /// \param I The instruction to check for commutativity |
| 522 | /// \param InstWithUses The instruction whose uses are analyzed for special |
| 523 | /// patterns |
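|  | /// For illustration, a `sub` whose only use is an equality compare against |
|  | /// zero is treated as commutative, because swapping its operands only flips |
|  | /// the sign of the difference: |
|  | ///   %d = sub i32 %a, %b |
|  | ///   %c = icmp eq i32 %d, 0 |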
| 524 | static bool isCommutative(Instruction *I, Instruction *InstWithUses) { |
| 525 | if (auto *Cmp = dyn_cast<CmpInst>(Val: I)) |
| 526 | return Cmp->isCommutative(); |
| 527 | if (auto *BO = dyn_cast<BinaryOperator>(Val: I)) |
| 528 | return BO->isCommutative() || |
| 529 | (BO->getOpcode() == Instruction::Sub && |
| 530 | !InstWithUses->hasNUsesOrMore(N: UsesLimit) && |
| 531 | all_of( |
| 532 | Range: InstWithUses->uses(), |
| 533 | P: [](const Use &U) { |
| 534 | // Commutative, if icmp eq/ne sub, 0 |
| 535 | CmpPredicate Pred; |
| 536 | if (match(V: U.getUser(), |
| 537 | P: m_ICmp(Pred, L: m_Specific(V: U.get()), R: m_Zero())) && |
| 538 | (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)) |
| 539 | return true; |
| 540 | // Commutative, if abs(sub nsw, true) or abs(sub, false). |
| 541 | ConstantInt *Flag; |
| 542 | return match(V: U.getUser(), |
| 543 | P: m_Intrinsic<Intrinsic::abs>( |
| 544 | Op0: m_Specific(V: U.get()), Op1: m_ConstantInt(CI&: Flag))) && |
| 545 | (!cast<Instruction>(Val: U.get())->hasNoSignedWrap() || |
| 546 | Flag->isOne()); |
| 547 | })) || |
| 548 | (BO->getOpcode() == Instruction::FSub && |
| 549 | !InstWithUses->hasNUsesOrMore(N: UsesLimit) && |
| 550 | all_of(Range: InstWithUses->uses(), P: [](const Use &U) { |
| 551 | return match(V: U.getUser(), |
| 552 | P: m_Intrinsic<Intrinsic::fabs>(Op0: m_Specific(V: U.get()))); |
| 553 | })); |
| 554 | return I->isCommutative(); |
| 555 | } |
| 556 | |
| 557 | /// This is a helper function to check whether \p I is commutative. |
| 558 | /// This is a convenience wrapper that calls the two-parameter version of |
| 559 | /// isCommutative with the same instruction for both parameters. This is |
| 560 | /// the common case where the instruction being checked for commutativity |
| 561 | /// is the same as the instruction whose uses are analyzed for special |
| 562 | /// patterns (see the two-parameter version above for details). |
| 563 | /// \param I The instruction to check for commutativity |
| 564 | /// \returns true if the instruction is commutative, false otherwise |
| 565 | static bool isCommutative(Instruction *I) { return isCommutative(I, InstWithUses: I); } |
| 566 | |
| 567 | template <typename T> |
| 568 | static std::optional<unsigned> getInsertExtractIndex(const Value *Inst, |
| 569 | unsigned Offset) { |
| 570 | static_assert(std::is_same_v<T, InsertElementInst> || |
| 571 | std::is_same_v<T, ExtractElementInst>, |
| 572 | "unsupported T" ); |
| 573 | int Index = Offset; |
| 574 | if (const auto *IE = dyn_cast<T>(Inst)) { |
| 575 | const auto *VT = dyn_cast<FixedVectorType>(IE->getType()); |
| 576 | if (!VT) |
| 577 | return std::nullopt; |
| 578 | const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)); |
| 579 | if (!CI) |
| 580 | return std::nullopt; |
| 581 | if (CI->getValue().uge(VT->getNumElements())) |
| 582 | return std::nullopt; |
| 583 | Index *= VT->getNumElements(); |
| 584 | Index += CI->getZExtValue(); |
| 585 | return Index; |
| 586 | } |
| 587 | return std::nullopt; |
| 588 | } |
| 589 | |
| 590 | /// \returns the inserting or extracting index of an InsertElement, ExtractElement |
| 591 | /// or InsertValue instruction, using \p Offset as the base offset for the index. |
| 592 | /// \returns std::nullopt if the index is not an immediate. |
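|  | /// Worked example: for `%r = insertvalue [2 x [3 x i32]] %agg, i32 %v, 1, 2` |
|  | /// the index is computed as ((0 * 2 + 1) * 3) + 2 == 5 (with the default |
|  | /// Offset of 0). |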
| 593 | static std::optional<unsigned> getElementIndex(const Value *Inst, |
| 594 | unsigned Offset = 0) { |
| 595 | if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset)) |
| 596 | return Index; |
| 597 | if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset)) |
| 598 | return Index; |
| 599 | |
| 600 | int Index = Offset; |
| 601 | |
| 602 | const auto *IV = dyn_cast<InsertValueInst>(Val: Inst); |
| 603 | if (!IV) |
| 604 | return std::nullopt; |
| 605 | |
| 606 | Type *CurrentType = IV->getType(); |
| 607 | for (unsigned I : IV->indices()) { |
| 608 | if (const auto *ST = dyn_cast<StructType>(Val: CurrentType)) { |
| 609 | Index *= ST->getNumElements(); |
| 610 | CurrentType = ST->getElementType(N: I); |
| 611 | } else if (const auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) { |
| 612 | Index *= AT->getNumElements(); |
| 613 | CurrentType = AT->getElementType(); |
| 614 | } else { |
| 615 | return std::nullopt; |
| 616 | } |
| 617 | Index += I; |
| 618 | } |
| 619 | return Index; |
| 620 | } |
| 621 | |
| 622 | /// \returns true if all of the values in \p VL use the same opcode. |
| 623 | /// For comparison instructions, also checks if predicates match. |
| 624 | /// PoisonValues are considered matching. |
| 625 | /// Interchangeable instructions are not considered. |
| 626 | static bool allSameOpcode(ArrayRef<Value *> VL) { |
| 627 | auto *It = find_if(Range&: VL, P: IsaPred<Instruction>); |
| 628 | if (It == VL.end()) |
| 629 | return true; |
| 630 | Instruction *MainOp = cast<Instruction>(Val: *It); |
| 631 | unsigned Opcode = MainOp->getOpcode(); |
| 632 | bool IsCmpOp = isa<CmpInst>(Val: MainOp); |
| 633 | CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(Val: MainOp)->getPredicate() |
| 634 | : CmpInst::BAD_ICMP_PREDICATE; |
| 635 | return std::all_of(first: It, last: VL.end(), pred: [&](Value *V) { |
| 636 | if (auto *CI = dyn_cast<CmpInst>(Val: V)) |
| 637 | return BasePred == CI->getPredicate(); |
| 638 | if (auto *I = dyn_cast<Instruction>(Val: V)) |
| 639 | return I->getOpcode() == Opcode; |
| 640 | return isa<PoisonValue>(Val: V); |
| 641 | }); |
| 642 | } |
| 643 | |
| 644 | namespace { |
| 645 | /// Specifies the way the mask should be analyzed for undefs/poisonous elements |
| 646 | /// in the shuffle mask. |
| 647 | enum class UseMask { |
| 648 | FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors, |
| 649 | ///< check for the mask elements for the first argument (mask |
| 650 | ///< indices are in range [0:VF)). |
| 651 | SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check |
| 652 | ///< for the mask elements for the second argument (mask indices |
| 653 | ///< are in range [VF:2*VF)) |
| 654 | UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for |
| 655 | ///< future shuffle elements and mark them as ones as being used |
| 656 | ///< in future. Non-undef elements are considered as unused since |
| 657 | ///< they're already marked as used in the mask. |
| 658 | }; |
| 659 | } // namespace |
| 660 | |
| 661 | /// Prepares a use bitset for the given mask either for the first argument or |
| 662 | /// for the second. |
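|  | /// For illustration: with VF == 4, Mask == {0, 5, 1, PoisonMaskElem} and |
|  | /// UseMask::FirstArg, bits 0 and 1 are cleared and bits 2 and 3 remain set; |
|  | /// with UseMask::SecondArg, only bit 1 (from mask value 5) is cleared. |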
| 663 | static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask, |
| 664 | UseMask MaskArg) { |
| 665 | SmallBitVector UseMask(VF, true); |
| 666 | for (auto [Idx, Value] : enumerate(First&: Mask)) { |
| 667 | if (Value == PoisonMaskElem) { |
| 668 | if (MaskArg == UseMask::UndefsAsMask) |
| 669 | UseMask.reset(Idx); |
| 670 | continue; |
| 671 | } |
| 672 | if (MaskArg == UseMask::FirstArg && Value < VF) |
| 673 | UseMask.reset(Idx: Value); |
| 674 | else if (MaskArg == UseMask::SecondArg && Value >= VF) |
| 675 | UseMask.reset(Idx: Value - VF); |
| 676 | } |
| 677 | return UseMask; |
| 678 | } |
| 679 | |
| 680 | /// Checks if the given value is actually an undefined constant vector. |
| 681 | /// Also, if the \p UseMask is not empty, tries to check if the non-masked |
| 682 | /// elements actually mask the insertelement buildvector, if any. |
| 683 | template <bool IsPoisonOnly = false> |
| 684 | static SmallBitVector isUndefVector(const Value *V, |
| 685 | const SmallBitVector &UseMask = {}) { |
| 686 | SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true); |
| 687 | using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>; |
| 688 | if (isa<T>(V)) |
| 689 | return Res; |
| 690 | auto *VecTy = dyn_cast<FixedVectorType>(Val: V->getType()); |
| 691 | if (!VecTy) |
| 692 | return Res.reset(); |
| 693 | auto *C = dyn_cast<Constant>(Val: V); |
| 694 | if (!C) { |
| 695 | if (!UseMask.empty()) { |
| 696 | const Value *Base = V; |
| 697 | while (auto *II = dyn_cast<InsertElementInst>(Val: Base)) { |
| 698 | Base = II->getOperand(i_nocapture: 0); |
| 699 | if (isa<T>(II->getOperand(i_nocapture: 1))) |
| 700 | continue; |
| 701 | std::optional<unsigned> Idx = getElementIndex(Inst: II); |
| 702 | if (!Idx) { |
| 703 | Res.reset(); |
| 704 | return Res; |
| 705 | } |
| 706 | if (*Idx < UseMask.size() && !UseMask.test(Idx: *Idx)) |
| 707 | Res.reset(Idx: *Idx); |
| 708 | } |
| 709 | // TODO: Add analysis for shuffles here too. |
| 710 | if (V == Base) { |
| 711 | Res.reset(); |
| 712 | } else { |
| 713 | SmallBitVector SubMask(UseMask.size(), false); |
| 714 | Res &= isUndefVector<IsPoisonOnly>(Base, SubMask); |
| 715 | } |
| 716 | } else { |
| 717 | Res.reset(); |
| 718 | } |
| 719 | return Res; |
| 720 | } |
| 721 | for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) { |
| 722 | if (Constant *Elem = C->getAggregateElement(Elt: I)) |
| 723 | if (!isa<T>(Elem) && |
| 724 | (UseMask.empty() || (I < UseMask.size() && !UseMask.test(Idx: I)))) |
| 725 | Res.reset(Idx: I); |
| 726 | } |
| 727 | return Res; |
| 728 | } |
| 729 | |
| 730 | /// Checks if the vector of instructions can be represented as a shuffle, like: |
| 731 | /// %x0 = extractelement <4 x i8> %x, i32 0 |
| 732 | /// %x3 = extractelement <4 x i8> %x, i32 3 |
| 733 | /// %y1 = extractelement <4 x i8> %y, i32 1 |
| 734 | /// %y2 = extractelement <4 x i8> %y, i32 2 |
| 735 | /// %x0x0 = mul i8 %x0, %x0 |
| 736 | /// %x3x3 = mul i8 %x3, %x3 |
| 737 | /// %y1y1 = mul i8 %y1, %y1 |
| 738 | /// %y2y2 = mul i8 %y2, %y2 |
| 739 | /// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0 |
| 740 | /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 |
| 741 | /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 |
| 742 | /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 |
| 743 | /// ret <4 x i8> %ins4 |
| 744 | /// can be transformed into: |
| 745 | /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5, |
| 746 | /// i32 6> |
| 747 | /// %2 = mul <4 x i8> %1, %1 |
| 748 | /// ret <4 x i8> %2 |
| 749 | /// \p Mask is set to the shuffle mask equivalent to the extracted elements. |
| 750 | /// TODO: Can we split off and reuse the shuffle mask detection from |
| 751 | /// ShuffleVectorInst/getShuffleCost? |
| 752 | static std::optional<TargetTransformInfo::ShuffleKind> |
| 753 | isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, |
| 754 | AssumptionCache *AC) { |
| 755 | const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst>); |
| 756 | if (It == VL.end()) |
| 757 | return std::nullopt; |
| 758 | unsigned Size = |
| 759 | std::accumulate(first: VL.begin(), last: VL.end(), init: 0u, binary_op: [](unsigned S, Value *V) { |
| 760 | auto *EI = dyn_cast<ExtractElementInst>(Val: V); |
| 761 | if (!EI) |
| 762 | return S; |
| 763 | auto *VTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType()); |
| 764 | if (!VTy) |
| 765 | return S; |
| 766 | return std::max(a: S, b: VTy->getNumElements()); |
| 767 | }); |
| 768 | |
| 769 | Value *Vec1 = nullptr; |
| 770 | Value *Vec2 = nullptr; |
| 771 | bool HasNonUndefVec = any_of(Range&: VL, P: [&](Value *V) { |
| 772 | auto *EE = dyn_cast<ExtractElementInst>(Val: V); |
| 773 | if (!EE) |
| 774 | return false; |
| 775 | Value *Vec = EE->getVectorOperand(); |
| 776 | if (isa<UndefValue>(Val: Vec)) |
| 777 | return false; |
| 778 | return isGuaranteedNotToBePoison(V: Vec, AC); |
| 779 | }); |
| 780 | enum ShuffleMode { Unknown, Select, Permute }; |
| 781 | ShuffleMode CommonShuffleMode = Unknown; |
| 782 | Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem); |
| 783 | for (unsigned I = 0, E = VL.size(); I < E; ++I) { |
| 784 | // Undef can be represented as an undef element in a vector. |
| 785 | if (isa<UndefValue>(Val: VL[I])) |
| 786 | continue; |
| 787 | auto *EI = cast<ExtractElementInst>(Val: VL[I]); |
| 788 | if (isa<ScalableVectorType>(Val: EI->getVectorOperandType())) |
| 789 | return std::nullopt; |
| 790 | auto *Vec = EI->getVectorOperand(); |
| 791 | // We can extractelement from undef or poison vector. |
| 792 | if (isUndefVector</*isPoisonOnly=*/true>(V: Vec).all()) |
| 793 | continue; |
| 794 | // All vector operands must have the same number of vector elements. |
| 795 | if (isa<UndefValue>(Val: Vec)) { |
| 796 | Mask[I] = I; |
| 797 | } else { |
| 798 | if (isa<UndefValue>(Val: EI->getIndexOperand())) |
| 799 | continue; |
| 800 | auto *Idx = dyn_cast<ConstantInt>(Val: EI->getIndexOperand()); |
| 801 | if (!Idx) |
| 802 | return std::nullopt; |
| 803 | // Undefined behavior if Idx is negative or >= Size. |
| 804 | if (Idx->getValue().uge(RHS: Size)) |
| 805 | continue; |
| 806 | unsigned IntIdx = Idx->getValue().getZExtValue(); |
| 807 | Mask[I] = IntIdx; |
| 808 | } |
| 809 | if (isUndefVector(V: Vec).all() && HasNonUndefVec) |
| 810 | continue; |
| 811 | // For correct shuffling, we must have at most 2 different vector operands |
| 812 | // across all extractelement instructions. |
| 813 | if (!Vec1 || Vec1 == Vec) { |
| 814 | Vec1 = Vec; |
| 815 | } else if (!Vec2 || Vec2 == Vec) { |
| 816 | Vec2 = Vec; |
| 817 | Mask[I] += Size; |
| 818 | } else { |
| 819 | return std::nullopt; |
| 820 | } |
| 821 | if (CommonShuffleMode == Permute) |
| 822 | continue; |
| 823 | // If the extract index is not the same as the operation number, it is a |
| 824 | // permutation. |
| 825 | if (Mask[I] % Size != I) { |
| 826 | CommonShuffleMode = Permute; |
| 827 | continue; |
| 828 | } |
| 829 | CommonShuffleMode = Select; |
| 830 | } |
| 831 | // If we're not crossing lanes in different vectors, consider it as blending. |
| 832 | if (CommonShuffleMode == Select && Vec2) |
| 833 | return TargetTransformInfo::SK_Select; |
| 834 | // If Vec2 was never used, we have a permutation of a single vector, otherwise |
| 835 | // we have a permutation of 2 vectors. |
| 836 | return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc |
| 837 | : TargetTransformInfo::SK_PermuteSingleSrc; |
| 838 | } |
| 839 | |
| 840 | /// \returns the index extracted by an Extract{Value,Element} instruction \p E. |
| 841 | static std::optional<unsigned> getExtractIndex(const Instruction *E) { |
| 842 | unsigned Opcode = E->getOpcode(); |
| 843 | assert((Opcode == Instruction::ExtractElement || |
| 844 | Opcode == Instruction::ExtractValue) && |
| 845 | "Expected extractelement or extractvalue instruction." ); |
| 846 | if (Opcode == Instruction::ExtractElement) { |
| 847 | auto *CI = dyn_cast<ConstantInt>(Val: E->getOperand(i: 1)); |
| 848 | if (!CI) |
| 849 | return std::nullopt; |
| 850 | return CI->getZExtValue(); |
| 851 | } |
| 852 | auto *EI = cast<ExtractValueInst>(Val: E); |
| 853 | if (EI->getNumIndices() != 1) |
| 854 | return std::nullopt; |
| 855 | return *EI->idx_begin(); |
| 856 | } |
| 857 | |
| 858 | namespace { |
| 859 | /// \returns true if \p Opcode is allowed as part of the main/alternate |
| 860 | /// instruction for SLP vectorization. |
| 861 | /// |
| 862 | /// An example of an unsupported opcode is SDIV, which can potentially cause UB |
| 863 | /// if the "shuffled out" lane would result in division by zero. |
| 864 | bool isValidForAlternation(unsigned Opcode) { |
| 865 | return !Instruction::isIntDivRem(Opcode); |
| 866 | } |
| 867 | |
| 868 | /// Helper class that determines whether VL can use the same opcode. |
| 869 | /// Alternate instructions are supported. In addition, it supports |
| 870 | /// interchangeable instructions. An interchangeable instruction is an |
| 871 | /// instruction that can be converted to another instruction with the same |
| 872 | /// semantics. For example, x << 1 is equal to x * 2, and x * 1 is equal to x | 0. |
| 873 | class BinOpSameOpcodeHelper { |
| 874 | using MaskType = std::uint_fast16_t; |
| 875 | /// Keep SupportedOp sorted, because it is used by binary_search. |
| 876 | constexpr static std::initializer_list<unsigned> SupportedOp = { |
| 877 | Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl, |
| 878 | Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor}; |
| 879 | enum : MaskType { |
| 880 | ShlBIT = 0b1, |
| 881 | AShrBIT = 0b10, |
| 882 | MulBIT = 0b100, |
| 883 | AddBIT = 0b1000, |
| 884 | SubBIT = 0b10000, |
| 885 | AndBIT = 0b100000, |
| 886 | OrBIT = 0b1000000, |
| 887 | XorBIT = 0b10000000, |
| 888 | MainOpBIT = 0b100000000, |
| 889 | LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT) |
| 890 | }; |
| 891 | /// Return a non-null ConstantInt if either operand of I is a ConstantInt. |
| 892 | /// The second return value represents the operand position. We check the |
| 893 | /// right-hand side first (1). If the right-hand side is not a ConstantInt and |
| 894 | /// the instruction is neither Sub, Shl, nor AShr, we then check the left-hand |
| 895 | /// side (0). |
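|  | /// For illustration: for `add i32 7, %x` this returns {7, 0}; for |
|  | /// `sub i32 %x, 7` it returns {7, 1}; and for `sub i32 7, %x` it returns |
|  | /// {nullptr, 0}, since Sub (like Shl and AShr) is not commutative and a |
|  | /// left-hand constant cannot be handled the same way. |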
| 896 | static std::pair<ConstantInt *, unsigned> |
| 897 | isBinOpWithConstantInt(const Instruction *I) { |
| 898 | unsigned Opcode = I->getOpcode(); |
| 899 | assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode." ); |
| 900 | (void)SupportedOp; |
| 901 | auto *BinOp = cast<BinaryOperator>(Val: I); |
| 902 | if (auto *CI = dyn_cast<ConstantInt>(Val: BinOp->getOperand(i_nocapture: 1))) |
| 903 | return {CI, 1}; |
| 904 | if (Opcode == Instruction::Sub || Opcode == Instruction::Shl || |
| 905 | Opcode == Instruction::AShr) |
| 906 | return {nullptr, 0}; |
| 907 | if (auto *CI = dyn_cast<ConstantInt>(Val: BinOp->getOperand(i_nocapture: 0))) |
| 908 | return {CI, 0}; |
| 909 | return {nullptr, 0}; |
| 910 | } |
| 911 | struct InterchangeableInfo { |
| 912 | const Instruction *I = nullptr; |
| 913 | /// Each set bit represents an opcode that MainOp can be converted to. |
| 914 | MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | |
| 915 | MulBIT | AShrBIT | ShlBIT; |
| 916 | /// We cannot create an interchangeable instruction that does not exist in |
| 917 | /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0], |
| 918 | /// but << does not exist in VL. In the end, we convert VL to [x * 1, y * |
| 919 | /// 1]. SeenBefore is used to know what operations have been seen before. |
| 920 | MaskType SeenBefore = 0; |
| 921 | InterchangeableInfo(const Instruction *I) : I(I) {} |
| 922 | /// Returning false allows BinOpSameOpcodeHelper to find an alternate |
| 923 | /// instruction. Directly setting the mask would destroy the mask state, |
| 924 | /// preventing us from determining which instruction it should convert to. |
| 925 | bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) { |
| 926 | if (Mask & InterchangeableMask) { |
| 927 | SeenBefore |= OpcodeInMaskForm; |
| 928 | Mask &= InterchangeableMask; |
| 929 | return true; |
| 930 | } |
| 931 | return false; |
| 932 | } |
| 933 | bool equal(unsigned Opcode) { |
| 934 | if (Opcode == I->getOpcode()) |
| 935 | return trySet(OpcodeInMaskForm: MainOpBIT, InterchangeableMask: MainOpBIT); |
| 936 | return false; |
| 937 | } |
| 938 | unsigned getOpcode() const { |
| 939 | MaskType Candidate = Mask & SeenBefore; |
| 940 | if (Candidate & MainOpBIT) |
| 941 | return I->getOpcode(); |
| 942 | if (Candidate & ShlBIT) |
| 943 | return Instruction::Shl; |
| 944 | if (Candidate & AShrBIT) |
| 945 | return Instruction::AShr; |
| 946 | if (Candidate & MulBIT) |
| 947 | return Instruction::Mul; |
| 948 | if (Candidate & AddBIT) |
| 949 | return Instruction::Add; |
| 950 | if (Candidate & SubBIT) |
| 951 | return Instruction::Sub; |
| 952 | if (Candidate & AndBIT) |
| 953 | return Instruction::And; |
| 954 | if (Candidate & OrBIT) |
| 955 | return Instruction::Or; |
| 956 | if (Candidate & XorBIT) |
| 957 | return Instruction::Xor; |
| 958 | llvm_unreachable("Cannot find interchangeable instruction." ); |
| 959 | } |
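|  | /// Returns the operands of the wrapped instruction I, rewritten so they can |
|  | /// be used with the opcode of \p To. For illustration: converting |
|  | /// `shl i32 %x, 2` to Mul yields operands {%x, 4}, and converting |
|  | /// `add i32 %x, 0` to Or yields {%x, 0}. |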
| 960 | SmallVector<Value *> getOperand(const Instruction *To) const { |
| 961 | unsigned ToOpcode = To->getOpcode(); |
| 962 | unsigned FromOpcode = I->getOpcode(); |
| 963 | if (FromOpcode == ToOpcode) |
| 964 | return SmallVector<Value *>(I->operands()); |
| 965 | assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode." ); |
| 966 | auto [CI, Pos] = isBinOpWithConstantInt(I); |
| 967 | const APInt &FromCIValue = CI->getValue(); |
| 968 | unsigned FromCIValueBitWidth = FromCIValue.getBitWidth(); |
| 969 | APInt ToCIValue; |
| 970 | switch (FromOpcode) { |
| 971 | case Instruction::Shl: |
| 972 | if (ToOpcode == Instruction::Mul) { |
| 973 | ToCIValue = APInt::getOneBitSet(numBits: FromCIValueBitWidth, |
| 974 | BitNo: FromCIValue.getZExtValue()); |
| 975 | } else { |
| 976 | assert(FromCIValue.isZero() && "Cannot convert the instruction." ); |
| 977 | ToCIValue = ToOpcode == Instruction::And |
| 978 | ? APInt::getAllOnes(numBits: FromCIValueBitWidth) |
| 979 | : APInt::getZero(numBits: FromCIValueBitWidth); |
| 980 | } |
| 981 | break; |
| 982 | case Instruction::Mul: |
| 983 | assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction." ); |
| 984 | if (ToOpcode == Instruction::Shl) { |
| 985 | ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2()); |
| 986 | } else { |
| 987 | assert(FromCIValue.isOne() && "Cannot convert the instruction." ); |
| 988 | ToCIValue = ToOpcode == Instruction::And |
| 989 | ? APInt::getAllOnes(numBits: FromCIValueBitWidth) |
| 990 | : APInt::getZero(numBits: FromCIValueBitWidth); |
| 991 | } |
| 992 | break; |
| 993 | case Instruction::Add: |
| 994 | case Instruction::Sub: |
| 995 | if (FromCIValue.isZero()) { |
| 996 | ToCIValue = APInt::getZero(numBits: FromCIValueBitWidth); |
| 997 | } else { |
| 998 | assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) && |
| 999 | "Cannot convert the instruction." ); |
| 1000 | ToCIValue = FromCIValue; |
| 1001 | ToCIValue.negate(); |
| 1002 | } |
| 1003 | break; |
| 1004 | case Instruction::And: |
| 1005 | assert(FromCIValue.isAllOnes() && "Cannot convert the instruction." ); |
| 1006 | ToCIValue = ToOpcode == Instruction::Mul |
| 1007 | ? APInt::getOneBitSet(numBits: FromCIValueBitWidth, BitNo: 0) |
| 1008 | : APInt::getZero(numBits: FromCIValueBitWidth); |
| 1009 | break; |
| 1010 | default: |
| 1011 | assert(FromCIValue.isZero() && "Cannot convert the instruction." ); |
| 1012 | ToCIValue = APInt::getZero(numBits: FromCIValueBitWidth); |
| 1013 | break; |
| 1014 | } |
| 1015 | Value *LHS = I->getOperand(i: 1 - Pos); |
| 1016 | Constant *RHS = |
| 1017 | ConstantInt::get(Ty: I->getOperand(i: Pos)->getType(), V: ToCIValue); |
| 1018 | // constant + x cannot be -constant - x |
| 1019 | // instead, it should be x - -constant |
| 1020 | if (Pos == 1 || |
| 1021 | (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub)) |
| 1022 | return SmallVector<Value *>({LHS, RHS}); |
| 1023 | return SmallVector<Value *>({RHS, LHS}); |
| 1024 | } |
| 1025 | }; |
| 1026 | InterchangeableInfo MainOp; |
| 1027 | InterchangeableInfo AltOp; |
| 1028 | bool isValidForAlternation(const Instruction *I) const { |
| 1029 | return ::isValidForAlternation(Opcode: MainOp.I->getOpcode()) && |
| 1030 | ::isValidForAlternation(Opcode: I->getOpcode()); |
| 1031 | } |
| 1032 | bool initializeAltOp(const Instruction *I) { |
| 1033 | if (AltOp.I) |
| 1034 | return true; |
| 1035 | if (!isValidForAlternation(I)) |
| 1036 | return false; |
| 1037 | AltOp.I = I; |
| 1038 | return true; |
| 1039 | } |
| 1040 | |
| 1041 | public: |
| 1042 | BinOpSameOpcodeHelper(const Instruction *MainOp, |
| 1043 | const Instruction *AltOp = nullptr) |
| 1044 | : MainOp(MainOp), AltOp(AltOp) { |
| 1045 | assert(is_sorted(SupportedOp) && "SupportedOp is not sorted." ); |
| 1046 | } |
| 1047 | bool add(const Instruction *I) { |
| 1048 | assert(isa<BinaryOperator>(I) && |
| 1049 | "BinOpSameOpcodeHelper only accepts BinaryOperator." ); |
| 1050 | unsigned Opcode = I->getOpcode(); |
| 1051 | MaskType OpcodeInMaskForm; |
| 1052 | // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp. |
| 1053 | switch (Opcode) { |
| 1054 | case Instruction::Shl: |
| 1055 | OpcodeInMaskForm = ShlBIT; |
| 1056 | break; |
| 1057 | case Instruction::AShr: |
| 1058 | OpcodeInMaskForm = AShrBIT; |
| 1059 | break; |
| 1060 | case Instruction::Mul: |
| 1061 | OpcodeInMaskForm = MulBIT; |
| 1062 | break; |
| 1063 | case Instruction::Add: |
| 1064 | OpcodeInMaskForm = AddBIT; |
| 1065 | break; |
| 1066 | case Instruction::Sub: |
| 1067 | OpcodeInMaskForm = SubBIT; |
| 1068 | break; |
| 1069 | case Instruction::And: |
| 1070 | OpcodeInMaskForm = AndBIT; |
| 1071 | break; |
| 1072 | case Instruction::Or: |
| 1073 | OpcodeInMaskForm = OrBIT; |
| 1074 | break; |
| 1075 | case Instruction::Xor: |
| 1076 | OpcodeInMaskForm = XorBIT; |
| 1077 | break; |
| 1078 | default: |
| 1079 | return MainOp.equal(Opcode) || |
| 1080 | (initializeAltOp(I) && AltOp.equal(Opcode)); |
| 1081 | } |
| 1082 | MaskType InterchangeableMask = OpcodeInMaskForm; |
| 1083 | ConstantInt *CI = isBinOpWithConstantInt(I).first; |
| 1084 | if (CI) { |
| 1085 | constexpr MaskType CanBeAll = |
| 1086 | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT; |
| 1087 | const APInt &CIValue = CI->getValue(); |
| 1088 | switch (Opcode) { |
| 1089 | case Instruction::Shl: |
| 1090 | if (CIValue.ult(RHS: CIValue.getBitWidth())) |
| 1091 | InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT; |
| 1092 | break; |
| 1093 | case Instruction::Mul: |
| 1094 | if (CIValue.isOne()) { |
| 1095 | InterchangeableMask = CanBeAll; |
| 1096 | break; |
| 1097 | } |
| 1098 | if (CIValue.isPowerOf2()) |
| 1099 | InterchangeableMask = MulBIT | ShlBIT; |
| 1100 | break; |
| 1101 | case Instruction::Add: |
| 1102 | case Instruction::Sub: |
| 1103 | InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT; |
| 1104 | break; |
| 1105 | case Instruction::And: |
| 1106 | if (CIValue.isAllOnes()) |
| 1107 | InterchangeableMask = CanBeAll; |
| 1108 | break; |
| 1109 | default: |
| 1110 | if (CIValue.isZero()) |
| 1111 | InterchangeableMask = CanBeAll; |
| 1112 | break; |
| 1113 | } |
| 1114 | } |
| 1115 | return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) || |
| 1116 | (initializeAltOp(I) && |
| 1117 | AltOp.trySet(OpcodeInMaskForm, InterchangeableMask)); |
| 1118 | } |
| 1119 | unsigned getMainOpcode() const { return MainOp.getOpcode(); } |
| 1120 | bool hasAltOp() const { return AltOp.I; } |
| 1121 | unsigned getAltOpcode() const { |
| 1122 | return hasAltOp() ? AltOp.getOpcode() : getMainOpcode(); |
| 1123 | } |
| 1124 | SmallVector<Value *> getOperand(const Instruction *I) const { |
| 1125 | return MainOp.getOperand(To: I); |
| 1126 | } |
| 1127 | }; |
| 1128 | |
| 1129 | /// Main data required for vectorization of instructions. |
| 1130 | class InstructionsState { |
| 1131 | /// MainOp and AltOp are primarily determined by getSameOpcode. Currently, |
| 1132 | /// only BinaryOperator, CastInst, and CmpInst support alternate instructions |
| 1133 | /// (i.e., AltOp is not equal to MainOp; this can be checked using |
| 1134 | /// isAltShuffle). |
| 1135 | /// A rare exception is TrySplitNode, where the InstructionsState is derived |
| 1136 | /// from getMainAltOpsNoStateVL. |
| 1137 | /// For those InstructionsState that use alternate instructions, the resulting |
| 1138 | /// vectorized output ultimately comes from a shufflevector. For example, |
| 1139 | /// given a vector list (VL): |
| 1140 | /// VL[0] = add i32 a, e |
| 1141 | /// VL[1] = sub i32 b, f |
| 1142 | /// VL[2] = add i32 c, g |
| 1143 | /// VL[3] = sub i32 d, h |
| 1144 | /// The vectorized result would be: |
| 1145 | /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h> |
| 1146 | /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h> |
| 1147 | /// result = shufflevector <4 x i32> intermediated_0, |
| 1148 | /// <4 x i32> intermediated_1, |
| 1149 | /// <4 x i32> <i32 0, i32 5, i32 2, i32 7> |
| 1150 | /// Since shufflevector is used in the final result, when calculating the cost |
| 1151 | /// (getEntryCost), we must account for the usage of shufflevector in |
| 1152 | /// GetVectorCost. |
| 1153 | Instruction *MainOp = nullptr; |
| 1154 | Instruction *AltOp = nullptr; |
| 1155 | |
| 1156 | public: |
| 1157 | Instruction *getMainOp() const { |
| 1158 | assert(valid() && "InstructionsState is invalid." ); |
| 1159 | return MainOp; |
| 1160 | } |
| 1161 | |
| 1162 | Instruction *getAltOp() const { |
| 1163 | assert(valid() && "InstructionsState is invalid." ); |
| 1164 | return AltOp; |
| 1165 | } |
| 1166 | |
| 1167 | /// The main/alternate opcodes for the list of instructions. |
| 1168 | unsigned getOpcode() const { return getMainOp()->getOpcode(); } |
| 1169 | |
| 1170 | unsigned getAltOpcode() const { return getAltOp()->getOpcode(); } |
| 1171 | |
| 1172 | /// Some of the instructions in the list have alternate opcodes. |
| 1173 | bool isAltShuffle() const { return getMainOp() != getAltOp(); } |
| 1174 | |
| 1175 | /// Checks if the instruction matches either the main or alternate opcode. |
| 1176 | /// \returns |
| 1177 | /// - MainOp if \param I matches MainOp's opcode directly or can be converted |
| 1178 | /// to it |
| 1179 | /// - AltOp if \param I matches AltOp's opcode directly or can be converted to |
| 1180 | /// it |
| 1181 | /// - nullptr if \param I cannot be matched or converted to either opcode |
| 1182 | Instruction *getMatchingMainOpOrAltOp(Instruction *I) const { |
| 1183 | assert(MainOp && "MainOp cannot be nullptr." ); |
| 1184 | if (I->getOpcode() == MainOp->getOpcode()) |
| 1185 | return MainOp; |
| 1186 | // Prefer AltOp instead of interchangeable instruction of MainOp. |
| 1187 | assert(AltOp && "AltOp cannot be nullptr." ); |
| 1188 | if (I->getOpcode() == AltOp->getOpcode()) |
| 1189 | return AltOp; |
| 1190 | if (!I->isBinaryOp()) |
| 1191 | return nullptr; |
| 1192 | BinOpSameOpcodeHelper Converter(MainOp); |
| 1193 | if (Converter.add(I) && Converter.add(I: MainOp) && !Converter.hasAltOp()) |
| 1194 | return MainOp; |
| 1195 | return AltOp; |
| 1196 | } |
| 1197 | |
| 1198 | /// Checks if main/alt instructions are shift operations. |
| 1199 | bool isShiftOp() const { |
| 1200 | return getMainOp()->isShift() && getAltOp()->isShift(); |
| 1201 | } |
| 1202 | |
| 1203 | /// Checks if main/alt instructions are bitwise logic operations. |
| 1204 | bool isBitwiseLogicOp() const { |
| 1205 | return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp(); |
| 1206 | } |
| 1207 | |
| 1208 | /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations. |
| 1209 | bool isMulDivLikeOp() const { |
| 1210 | constexpr std::array<unsigned, 8> MulDiv = { |
| 1211 | Instruction::Mul, Instruction::FMul, Instruction::SDiv, |
| 1212 | Instruction::UDiv, Instruction::FDiv, Instruction::SRem, |
| 1213 | Instruction::URem, Instruction::FRem}; |
| 1214 | return is_contained(Range: MulDiv, Element: getOpcode()) && |
| 1215 | is_contained(Range: MulDiv, Element: getAltOpcode()); |
| 1216 | } |
| 1217 | |
| 1218 | /// Checks if main/alt instructions are add/sub/fadd/fsub operations. |
| 1219 | bool isAddSubLikeOp() const { |
| 1220 | constexpr std::array<unsigned, 4> AddSub = { |
| 1221 | Instruction::Add, Instruction::Sub, Instruction::FAdd, |
| 1222 | Instruction::FSub}; |
| 1223 | return is_contained(Range: AddSub, Element: getOpcode()) && |
| 1224 | is_contained(Range: AddSub, Element: getAltOpcode()); |
| 1225 | } |
| 1226 | |
| 1227 | /// Checks if main/alt instructions are cmp operations. |
| 1228 | bool isCmpOp() const { |
| 1229 | return (getOpcode() == Instruction::ICmp || |
| 1230 | getOpcode() == Instruction::FCmp) && |
| 1231 | getAltOpcode() == getOpcode(); |
| 1232 | } |
| 1233 | |
/// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
| 1235 | bool valid() const { return MainOp && AltOp; } |
| 1236 | |
| 1237 | explicit operator bool() const { return valid(); } |
| 1238 | |
| 1239 | InstructionsState() = delete; |
| 1240 | InstructionsState(Instruction *MainOp, Instruction *AltOp) |
| 1241 | : MainOp(MainOp), AltOp(AltOp) {} |
| 1242 | static InstructionsState invalid() { return {nullptr, nullptr}; } |
| 1243 | }; |
| 1244 | |
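/// Converts instruction \p I into the form selected by \p S: returns the
/// matching main/alternate opcode instruction together with the operand list
/// to use for it. For binary operators the operands may be rewritten by
/// BinOpSameOpcodeHelper; this is a best-effort summary of the logic below.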
| 1245 | std::pair<Instruction *, SmallVector<Value *>> |
| 1246 | convertTo(Instruction *I, const InstructionsState &S) { |
| 1247 | Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I); |
| 1248 | assert(SelectedOp && "Cannot convert the instruction." ); |
| 1249 | if (I->isBinaryOp()) { |
| 1250 | BinOpSameOpcodeHelper Converter(I); |
| 1251 | return std::make_pair(x&: SelectedOp, y: Converter.getOperand(I: SelectedOp)); |
| 1252 | } |
| 1253 | return std::make_pair(x&: SelectedOp, y: SmallVector<Value *>(I->operands())); |
| 1254 | } |
| 1255 | |
| 1256 | } // end anonymous namespace |
| 1257 | |
| 1258 | static InstructionsState getSameOpcode(ArrayRef<Value *> VL, |
| 1259 | const TargetLibraryInfo &TLI); |
| 1260 | |
| 1261 | /// Find an instruction with a specific opcode in VL. |
| 1262 | /// \param VL Array of values to search through. Must contain only Instructions |
| 1263 | /// and PoisonValues. |
| 1264 | /// \param Opcode The instruction opcode to search for |
| 1265 | /// \returns |
| 1266 | /// - The first instruction found with matching opcode |
| 1267 | /// - nullptr if no matching instruction is found |
| 1268 | static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL, |
| 1269 | unsigned Opcode) { |
| 1270 | for (Value *V : VL) { |
| 1271 | if (isa<PoisonValue>(Val: V)) |
| 1272 | continue; |
| 1273 | assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction." ); |
| 1274 | auto *Inst = cast<Instruction>(Val: V); |
| 1275 | if (Inst->getOpcode() == Opcode) |
| 1276 | return Inst; |
| 1277 | } |
| 1278 | return nullptr; |
| 1279 | } |
| 1280 | |
| 1281 | /// Checks if the provided operands of 2 cmp instructions are compatible, i.e. |
| 1282 | /// compatible instructions or constants, or just some other regular values. |
| 1283 | static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, |
| 1284 | Value *Op1, const TargetLibraryInfo &TLI) { |
| 1285 | return (isConstant(V: BaseOp0) && isConstant(V: Op0)) || |
| 1286 | (isConstant(V: BaseOp1) && isConstant(V: Op1)) || |
| 1287 | (!isa<Instruction>(Val: BaseOp0) && !isa<Instruction>(Val: Op0) && |
| 1288 | !isa<Instruction>(Val: BaseOp1) && !isa<Instruction>(Val: Op1)) || |
| 1289 | BaseOp0 == Op0 || BaseOp1 == Op1 || |
| 1290 | getSameOpcode(VL: {BaseOp0, Op0}, TLI) || |
| 1291 | getSameOpcode(VL: {BaseOp1, Op1}, TLI); |
| 1292 | } |
| 1293 | |
| 1294 | /// \returns true if a compare instruction \p CI has similar "look" and |
| 1295 | /// same predicate as \p BaseCI, "as is" or with its operands and predicate |
| 1296 | /// swapped, false otherwise. |
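/// For example, (icmp slt %b, %a) is considered the same as a base
/// (icmp sgt %a, %b), since swapping both the operands and the predicate
/// yields an equivalent comparison.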
| 1297 | static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, |
| 1298 | const TargetLibraryInfo &TLI) { |
| 1299 | assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() && |
| 1300 | "Assessing comparisons of different types?" ); |
| 1301 | CmpInst::Predicate BasePred = BaseCI->getPredicate(); |
| 1302 | CmpInst::Predicate Pred = CI->getPredicate(); |
| 1303 | CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(pred: Pred); |
| 1304 | |
| 1305 | Value *BaseOp0 = BaseCI->getOperand(i_nocapture: 0); |
| 1306 | Value *BaseOp1 = BaseCI->getOperand(i_nocapture: 1); |
| 1307 | Value *Op0 = CI->getOperand(i_nocapture: 0); |
| 1308 | Value *Op1 = CI->getOperand(i_nocapture: 1); |
| 1309 | |
| 1310 | return (BasePred == Pred && |
| 1311 | areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) || |
| 1312 | (BasePred == SwappedPred && |
| 1313 | areCompatibleCmpOps(BaseOp0, BaseOp1, Op0: Op1, Op1: Op0, TLI)); |
| 1314 | } |
| 1315 | |
/// \returns an analysis of the instructions in \p VL described as an
/// InstructionsState: the opcode (and alternate opcode, if any) with which we
/// suppose the whole list could be vectorized, even if its structure is
/// diverse.
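/// For example, a list mixing fadd and fsub instructions is typically
/// reported with MainOp/AltOp covering both opcodes (an alternate-opcode
/// node), while a list mixing unrelated opcodes such as load and add yields
/// an invalid state (illustrative behaviour; the exact result depends on the
/// checks below).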
| 1319 | static InstructionsState getSameOpcode(ArrayRef<Value *> VL, |
| 1320 | const TargetLibraryInfo &TLI) { |
| 1321 | // Make sure these are all Instructions. |
| 1322 | if (!all_of(Range&: VL, P: IsaPred<Instruction, PoisonValue>)) |
| 1323 | return InstructionsState::invalid(); |
| 1324 | |
| 1325 | auto *It = find_if(Range&: VL, P: IsaPred<Instruction>); |
| 1326 | if (It == VL.end()) |
| 1327 | return InstructionsState::invalid(); |
| 1328 | |
| 1329 | Instruction *MainOp = cast<Instruction>(Val: *It); |
| 1330 | unsigned InstCnt = std::count_if(first: It, last: VL.end(), pred: IsaPred<Instruction>); |
| 1331 | if ((VL.size() > 2 && !isa<PHINode>(Val: MainOp) && InstCnt < VL.size() / 2) || |
| 1332 | (VL.size() == 2 && InstCnt < 2)) |
| 1333 | return InstructionsState::invalid(); |
| 1334 | |
| 1335 | bool IsCastOp = isa<CastInst>(Val: MainOp); |
| 1336 | bool IsBinOp = isa<BinaryOperator>(Val: MainOp); |
| 1337 | bool IsCmpOp = isa<CmpInst>(Val: MainOp); |
| 1338 | CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(Val: MainOp)->getPredicate() |
| 1339 | : CmpInst::BAD_ICMP_PREDICATE; |
| 1340 | Instruction *AltOp = MainOp; |
| 1341 | unsigned Opcode = MainOp->getOpcode(); |
| 1342 | unsigned AltOpcode = Opcode; |
| 1343 | |
| 1344 | BinOpSameOpcodeHelper BinOpHelper(MainOp); |
| 1345 | bool SwappedPredsCompatible = IsCmpOp && [&]() { |
| 1346 | SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds; |
| 1347 | UniquePreds.insert(X: BasePred); |
| 1348 | UniqueNonSwappedPreds.insert(X: BasePred); |
| 1349 | for (Value *V : VL) { |
| 1350 | auto *I = dyn_cast<CmpInst>(Val: V); |
| 1351 | if (!I) |
| 1352 | return false; |
| 1353 | CmpInst::Predicate CurrentPred = I->getPredicate(); |
| 1354 | CmpInst::Predicate SwappedCurrentPred = |
| 1355 | CmpInst::getSwappedPredicate(pred: CurrentPred); |
| 1356 | UniqueNonSwappedPreds.insert(X: CurrentPred); |
| 1357 | if (!UniquePreds.contains(key: CurrentPred) && |
| 1358 | !UniquePreds.contains(key: SwappedCurrentPred)) |
| 1359 | UniquePreds.insert(X: CurrentPred); |
| 1360 | } |
// The total number of predicates exceeds 2, but only 2 remain once swapped
// predicates are treated as equal. In that case treat swappable predicates
// as compatible opcodes rather than as alternates.
| 1364 | return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2; |
| 1365 | }(); |
| 1366 | // Check for one alternate opcode from another BinaryOperator. |
| 1367 | // TODO - generalize to support all operators (types, calls etc.). |
| 1368 | Intrinsic::ID BaseID = 0; |
| 1369 | SmallVector<VFInfo> BaseMappings; |
| 1370 | if (auto *CallBase = dyn_cast<CallInst>(Val: MainOp)) { |
| 1371 | BaseID = getVectorIntrinsicIDForCall(CI: CallBase, TLI: &TLI); |
| 1372 | BaseMappings = VFDatabase(*CallBase).getMappings(CI: *CallBase); |
| 1373 | if (!isTriviallyVectorizable(ID: BaseID) && BaseMappings.empty()) |
| 1374 | return InstructionsState::invalid(); |
| 1375 | } |
| 1376 | bool AnyPoison = InstCnt != VL.size(); |
| 1377 | // Check MainOp too to be sure that it matches the requirements for the |
| 1378 | // instructions. |
| 1379 | for (Value *V : iterator_range(It, VL.end())) { |
| 1380 | auto *I = dyn_cast<Instruction>(Val: V); |
| 1381 | if (!I) |
| 1382 | continue; |
| 1383 | |
| 1384 | // Cannot combine poison and divisions. |
| 1385 | // TODO: do some smart analysis of the CallInsts to exclude divide-like |
| 1386 | // intrinsics/functions only. |
| 1387 | if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(Val: I))) |
| 1388 | return InstructionsState::invalid(); |
| 1389 | unsigned InstOpcode = I->getOpcode(); |
| 1390 | if (IsBinOp && isa<BinaryOperator>(Val: I)) { |
| 1391 | if (BinOpHelper.add(I)) |
| 1392 | continue; |
| 1393 | } else if (IsCastOp && isa<CastInst>(Val: I)) { |
| 1394 | Value *Op0 = MainOp->getOperand(i: 0); |
| 1395 | Type *Ty0 = Op0->getType(); |
| 1396 | Value *Op1 = I->getOperand(i: 0); |
| 1397 | Type *Ty1 = Op1->getType(); |
| 1398 | if (Ty0 == Ty1) { |
| 1399 | if (InstOpcode == Opcode || InstOpcode == AltOpcode) |
| 1400 | continue; |
| 1401 | if (Opcode == AltOpcode) { |
| 1402 | assert(isValidForAlternation(Opcode) && |
| 1403 | isValidForAlternation(InstOpcode) && |
| 1404 | "Cast isn't safe for alternation, logic needs to be updated!" ); |
| 1405 | AltOpcode = InstOpcode; |
| 1406 | AltOp = I; |
| 1407 | continue; |
| 1408 | } |
| 1409 | } |
| 1410 | } else if (auto *Inst = dyn_cast<CmpInst>(Val: I); Inst && IsCmpOp) { |
| 1411 | auto *BaseInst = cast<CmpInst>(Val: MainOp); |
| 1412 | Type *Ty0 = BaseInst->getOperand(i_nocapture: 0)->getType(); |
| 1413 | Type *Ty1 = Inst->getOperand(i_nocapture: 0)->getType(); |
| 1414 | if (Ty0 == Ty1) { |
| 1415 | assert(InstOpcode == Opcode && "Expected same CmpInst opcode." ); |
| 1416 | assert(InstOpcode == AltOpcode && |
| 1417 | "Alternate instructions are only supported by BinaryOperator " |
| 1418 | "and CastInst." ); |
| 1419 | // Check for compatible operands. If the corresponding operands are not |
| 1420 | // compatible - need to perform alternate vectorization. |
| 1421 | CmpInst::Predicate CurrentPred = Inst->getPredicate(); |
| 1422 | CmpInst::Predicate SwappedCurrentPred = |
| 1423 | CmpInst::getSwappedPredicate(pred: CurrentPred); |
| 1424 | |
| 1425 | if ((VL.size() == 2 || SwappedPredsCompatible) && |
| 1426 | (BasePred == CurrentPred || BasePred == SwappedCurrentPred)) |
| 1427 | continue; |
| 1428 | |
| 1429 | if (isCmpSameOrSwapped(BaseCI: BaseInst, CI: Inst, TLI)) |
| 1430 | continue; |
| 1431 | auto *AltInst = cast<CmpInst>(Val: AltOp); |
| 1432 | if (MainOp != AltOp) { |
| 1433 | if (isCmpSameOrSwapped(BaseCI: AltInst, CI: Inst, TLI)) |
| 1434 | continue; |
| 1435 | } else if (BasePred != CurrentPred) { |
| 1436 | assert( |
| 1437 | isValidForAlternation(InstOpcode) && |
| 1438 | "CmpInst isn't safe for alternation, logic needs to be updated!" ); |
| 1439 | AltOp = I; |
| 1440 | continue; |
| 1441 | } |
| 1442 | CmpInst::Predicate AltPred = AltInst->getPredicate(); |
| 1443 | if (BasePred == CurrentPred || BasePred == SwappedCurrentPred || |
| 1444 | AltPred == CurrentPred || AltPred == SwappedCurrentPred) |
| 1445 | continue; |
| 1446 | } |
| 1447 | } else if (InstOpcode == Opcode) { |
| 1448 | assert(InstOpcode == AltOpcode && |
| 1449 | "Alternate instructions are only supported by BinaryOperator and " |
| 1450 | "CastInst." ); |
| 1451 | if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) { |
| 1452 | if (Gep->getNumOperands() != 2 || |
| 1453 | Gep->getOperand(i_nocapture: 0)->getType() != MainOp->getOperand(i: 0)->getType()) |
| 1454 | return InstructionsState::invalid(); |
| 1455 | } else if (auto *EI = dyn_cast<ExtractElementInst>(Val: I)) { |
| 1456 | if (!isVectorLikeInstWithConstOps(V: EI)) |
| 1457 | return InstructionsState::invalid(); |
| 1458 | } else if (auto *LI = dyn_cast<LoadInst>(Val: I)) { |
| 1459 | auto *BaseLI = cast<LoadInst>(Val: MainOp); |
| 1460 | if (!LI->isSimple() || !BaseLI->isSimple()) |
| 1461 | return InstructionsState::invalid(); |
| 1462 | } else if (auto *Call = dyn_cast<CallInst>(Val: I)) { |
| 1463 | auto *CallBase = cast<CallInst>(Val: MainOp); |
| 1464 | if (Call->getCalledFunction() != CallBase->getCalledFunction()) |
| 1465 | return InstructionsState::invalid(); |
| 1466 | if (Call->hasOperandBundles() && |
| 1467 | (!CallBase->hasOperandBundles() || |
| 1468 | !std::equal(first1: Call->op_begin() + Call->getBundleOperandsStartIndex(), |
| 1469 | last1: Call->op_begin() + Call->getBundleOperandsEndIndex(), |
| 1470 | first2: CallBase->op_begin() + |
| 1471 | CallBase->getBundleOperandsStartIndex()))) |
| 1472 | return InstructionsState::invalid(); |
| 1473 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI: &TLI); |
| 1474 | if (ID != BaseID) |
| 1475 | return InstructionsState::invalid(); |
| 1476 | if (!ID) { |
| 1477 | SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(CI: *Call); |
| 1478 | if (Mappings.size() != BaseMappings.size() || |
| 1479 | Mappings.front().ISA != BaseMappings.front().ISA || |
| 1480 | Mappings.front().ScalarName != BaseMappings.front().ScalarName || |
| 1481 | Mappings.front().VectorName != BaseMappings.front().VectorName || |
| 1482 | Mappings.front().Shape.VF != BaseMappings.front().Shape.VF || |
| 1483 | Mappings.front().Shape.Parameters != |
| 1484 | BaseMappings.front().Shape.Parameters) |
| 1485 | return InstructionsState::invalid(); |
| 1486 | } |
| 1487 | } |
| 1488 | continue; |
| 1489 | } |
| 1490 | return InstructionsState::invalid(); |
| 1491 | } |
| 1492 | |
| 1493 | if (IsBinOp) { |
| 1494 | MainOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getMainOpcode()); |
| 1495 | assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper." ); |
| 1496 | AltOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getAltOpcode()); |
| 1497 | assert(MainOp && "Cannot find AltOp with Opcode from BinOpHelper." ); |
| 1498 | } |
| 1499 | assert((MainOp == AltOp || !allSameOpcode(VL)) && |
| 1500 | "Incorrect implementation of allSameOpcode." ); |
| 1501 | InstructionsState S(MainOp, AltOp); |
| 1502 | assert(all_of(VL, |
| 1503 | [&](Value *V) { |
| 1504 | return isa<PoisonValue>(V) || |
| 1505 | S.getMatchingMainOpOrAltOp(cast<Instruction>(V)); |
| 1506 | }) && |
| 1507 | "Invalid InstructionsState." ); |
| 1508 | return S; |
| 1509 | } |
| 1510 | |
| 1511 | /// \returns true if all of the values in \p VL have the same type or false |
| 1512 | /// otherwise. |
| 1513 | static bool allSameType(ArrayRef<Value *> VL) { |
| 1514 | Type *Ty = VL.front()->getType(); |
| 1515 | return all_of(Range: VL.drop_front(), P: [&](Value *V) { return V->getType() == Ty; }); |
| 1516 | } |
| 1517 | |
| 1518 | /// \returns True if in-tree use also needs extract. This refers to |
| 1519 | /// possible scalar operand in vectorized instruction. |
static bool isInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
| 1521 | TargetLibraryInfo *TLI, |
| 1522 | const TargetTransformInfo *TTI) { |
| 1523 | if (!UserInst) |
| 1524 | return false; |
| 1525 | unsigned Opcode = UserInst->getOpcode(); |
| 1526 | switch (Opcode) { |
| 1527 | case Instruction::Load: { |
| 1528 | LoadInst *LI = cast<LoadInst>(Val: UserInst); |
| 1529 | return (LI->getPointerOperand() == Scalar); |
| 1530 | } |
| 1531 | case Instruction::Store: { |
| 1532 | StoreInst *SI = cast<StoreInst>(Val: UserInst); |
| 1533 | return (SI->getPointerOperand() == Scalar); |
| 1534 | } |
| 1535 | case Instruction::Call: { |
| 1536 | CallInst *CI = cast<CallInst>(Val: UserInst); |
| 1537 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
| 1538 | return any_of(Range: enumerate(First: CI->args()), P: [&](auto &&Arg) { |
| 1539 | return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) && |
| 1540 | Arg.value().get() == Scalar; |
| 1541 | }); |
| 1542 | } |
| 1543 | default: |
| 1544 | return false; |
| 1545 | } |
| 1546 | } |
| 1547 | |
/// \returns the AA location that is being accessed by the instruction.
| 1549 | static MemoryLocation getLocation(Instruction *I) { |
| 1550 | if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) |
| 1551 | return MemoryLocation::get(SI); |
| 1552 | if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) |
| 1553 | return MemoryLocation::get(LI); |
| 1554 | return MemoryLocation(); |
| 1555 | } |
| 1556 | |
| 1557 | /// \returns True if the instruction is not a volatile or atomic load/store. |
| 1558 | static bool isSimple(Instruction *I) { |
| 1559 | if (LoadInst *LI = dyn_cast<LoadInst>(Val: I)) |
| 1560 | return LI->isSimple(); |
| 1561 | if (StoreInst *SI = dyn_cast<StoreInst>(Val: I)) |
| 1562 | return SI->isSimple(); |
| 1563 | if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: I)) |
| 1564 | return !MI->isVolatile(); |
| 1565 | return true; |
| 1566 | } |
| 1567 | |
| 1568 | /// Shuffles \p Mask in accordance with the given \p SubMask. |
/// \param ExtendingManyInputs If true, supports reshuffling of the mask with
/// two input vectors rather than only one.
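/// For example, for Mask = {2, 0, 1} and SubMask = {1, 2, 0} the resulting
/// mask is {Mask[1], Mask[2], Mask[0]} = {0, 1, 2}.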
| 1571 | static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask, |
| 1572 | bool ExtendingManyInputs = false) { |
| 1573 | if (SubMask.empty()) |
| 1574 | return; |
| 1575 | assert( |
| 1576 | (!ExtendingManyInputs || SubMask.size() > Mask.size() || |
| 1577 | // Check if input scalars were extended to match the size of other node. |
| 1578 | (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) && |
| 1579 | "SubMask with many inputs support must be larger than the mask." ); |
| 1580 | if (Mask.empty()) { |
| 1581 | Mask.append(in_start: SubMask.begin(), in_end: SubMask.end()); |
| 1582 | return; |
| 1583 | } |
| 1584 | SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem); |
| 1585 | int TermValue = std::min(a: Mask.size(), b: SubMask.size()); |
| 1586 | for (int I = 0, E = SubMask.size(); I < E; ++I) { |
| 1587 | if (SubMask[I] == PoisonMaskElem || |
| 1588 | (!ExtendingManyInputs && |
| 1589 | (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue))) |
| 1590 | continue; |
| 1591 | NewMask[I] = Mask[SubMask[I]]; |
| 1592 | } |
| 1593 | Mask.swap(RHS&: NewMask); |
| 1594 | } |
| 1595 | |
/// Order may have elements assigned a special value (the size of the order),
/// which is out of bounds. Such indices only appear at positions that
/// correspond to undef values (see canReuseExtract for details) and are used
/// to prevent undef values from affecting the operand ordering.
| 1600 | /// The first loop below simply finds all unused indices and then the next loop |
| 1601 | /// nest assigns these indices for undef values positions. |
| 1602 | /// As an example below Order has two undef positions and they have assigned |
| 1603 | /// values 3 and 7 respectively: |
| 1604 | /// before: 6 9 5 4 9 2 1 0 |
| 1605 | /// after: 6 3 5 4 7 2 1 0 |
| 1606 | static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) { |
| 1607 | const size_t Sz = Order.size(); |
| 1608 | SmallBitVector UnusedIndices(Sz, /*t=*/true); |
| 1609 | SmallBitVector MaskedIndices(Sz); |
| 1610 | for (unsigned I = 0; I < Sz; ++I) { |
| 1611 | if (Order[I] < Sz) |
| 1612 | UnusedIndices.reset(Idx: Order[I]); |
| 1613 | else |
| 1614 | MaskedIndices.set(I); |
| 1615 | } |
| 1616 | if (MaskedIndices.none()) |
| 1617 | return; |
| 1618 | assert(UnusedIndices.count() == MaskedIndices.count() && |
| 1619 | "Non-synced masked/available indices." ); |
| 1620 | int Idx = UnusedIndices.find_first(); |
| 1621 | int MIdx = MaskedIndices.find_first(); |
| 1622 | while (MIdx >= 0) { |
| 1623 | assert(Idx >= 0 && "Indices must be synced." ); |
| 1624 | Order[MIdx] = Idx; |
| 1625 | Idx = UnusedIndices.find_next(Prev: Idx); |
| 1626 | MIdx = MaskedIndices.find_next(Prev: MIdx); |
| 1627 | } |
| 1628 | } |
| 1629 | |
| 1630 | /// \returns a bitset for selecting opcodes. false for Opcode0 and true for |
| 1631 | /// Opcode1. |
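/// For example, for VL = {add, sub, add, sub}, a scalar element type,
/// Opcode0 = Add and Opcode1 = Sub, the resulting mask is {0, 1, 0, 1}.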
| 1632 | static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy, |
| 1633 | unsigned Opcode0, unsigned Opcode1) { |
| 1634 | unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy); |
| 1635 | SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false); |
| 1636 | for (unsigned Lane : seq<unsigned>(Size: VL.size())) { |
| 1637 | if (isa<PoisonValue>(Val: VL[Lane])) |
| 1638 | continue; |
| 1639 | if (cast<Instruction>(Val: VL[Lane])->getOpcode() == Opcode1) |
| 1640 | OpcodeMask.set(I: Lane * ScalarTyNumElements, |
| 1641 | E: Lane * ScalarTyNumElements + ScalarTyNumElements); |
| 1642 | } |
| 1643 | return OpcodeMask; |
| 1644 | } |
| 1645 | |
| 1646 | /// Replicates the given \p Val \p VF times. |
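/// For example, for Val = {C0, C1} and VF = 3 the result is
/// {C0, C0, C0, C1, C1, C1}.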
| 1647 | static SmallVector<Constant *> replicateMask(ArrayRef<Constant *> Val, |
| 1648 | unsigned VF) { |
| 1649 | assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) && |
| 1650 | "Expected scalar constants." ); |
| 1651 | SmallVector<Constant *> NewVal(Val.size() * VF); |
| 1652 | for (auto [I, V] : enumerate(First&: Val)) |
| 1653 | std::fill_n(first: NewVal.begin() + I * VF, n: VF, value: V); |
| 1654 | return NewVal; |
| 1655 | } |
| 1656 | |
| 1657 | namespace llvm { |
| 1658 | |
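/// Builds the mask that inverts the given permutation \p Indices, i.e.
/// Mask[Indices[I]] == I for every I. For example, Indices = {2, 0, 1}
/// produces Mask = {1, 2, 0}.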
| 1659 | static void inversePermutation(ArrayRef<unsigned> Indices, |
| 1660 | SmallVectorImpl<int> &Mask) { |
| 1661 | Mask.clear(); |
| 1662 | const unsigned E = Indices.size(); |
| 1663 | Mask.resize(N: E, NV: PoisonMaskElem); |
| 1664 | for (unsigned I = 0; I < E; ++I) |
| 1665 | Mask[Indices[I]] = I; |
| 1666 | } |
| 1667 | |
| 1668 | /// Reorders the list of scalars in accordance with the given \p Mask. |
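/// For example, for Scalars = {a, b, c} and Mask = {1, 2, 0} the result is
/// {c, a, b}: element I of the previous vector moves to position Mask[I].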
| 1669 | static void reorderScalars(SmallVectorImpl<Value *> &Scalars, |
| 1670 | ArrayRef<int> Mask) { |
| 1671 | assert(!Mask.empty() && "Expected non-empty mask." ); |
| 1672 | SmallVector<Value *> Prev(Scalars.size(), |
| 1673 | PoisonValue::get(T: Scalars.front()->getType())); |
| 1674 | Prev.swap(RHS&: Scalars); |
| 1675 | for (unsigned I = 0, E = Prev.size(); I < E; ++I) |
| 1676 | if (Mask[I] != PoisonMaskElem) |
| 1677 | Scalars[Mask[I]] = Prev[I]; |
| 1678 | } |
| 1679 | |
| 1680 | /// Checks if the provided value does not require scheduling. It does not |
| 1681 | /// require scheduling if this is not an instruction or it is an instruction |
/// that does not read/write memory and each of its operands is either not an
/// instruction, a phi node, or an instruction from a different block.
| 1684 | static bool areAllOperandsNonInsts(Value *V) { |
| 1685 | auto *I = dyn_cast<Instruction>(Val: V); |
| 1686 | if (!I) |
| 1687 | return true; |
| 1688 | return !mayHaveNonDefUseDependency(I: *I) && |
| 1689 | all_of(Range: I->operands(), P: [I](Value *V) { |
| 1690 | auto *IO = dyn_cast<Instruction>(Val: V); |
| 1691 | if (!IO) |
| 1692 | return true; |
| 1693 | return isa<PHINode>(Val: IO) || IO->getParent() != I->getParent(); |
| 1694 | }); |
| 1695 | } |
| 1696 | |
| 1697 | /// Checks if the provided value does not require scheduling. It does not |
| 1698 | /// require scheduling if this is not an instruction or it is an instruction |
| 1699 | /// that does not read/write memory and all users are phi nodes or instructions |
/// from different blocks.
| 1701 | static bool isUsedOutsideBlock(Value *V) { |
| 1702 | auto *I = dyn_cast<Instruction>(Val: V); |
| 1703 | if (!I) |
| 1704 | return true; |
| 1705 | // Limits the number of uses to save compile time. |
| 1706 | return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(N: UsesLimit) && |
| 1707 | all_of(Range: I->users(), P: [I](User *U) { |
| 1708 | auto *IU = dyn_cast<Instruction>(Val: U); |
| 1709 | if (!IU) |
| 1710 | return true; |
| 1711 | return IU->getParent() != I->getParent() || isa<PHINode>(Val: IU); |
| 1712 | }); |
| 1713 | } |
| 1714 | |
| 1715 | /// Checks if the specified value does not require scheduling. It does not |
| 1716 | /// require scheduling if all operands and all users do not need to be scheduled |
| 1717 | /// in the current basic block. |
| 1718 | static bool doesNotNeedToBeScheduled(Value *V) { |
| 1719 | return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V); |
| 1720 | } |
| 1721 | |
| 1722 | /// Checks if the specified array of instructions does not require scheduling. |
/// This is the case if either all instructions have operands that do not
/// require scheduling, or all instructions have users that do not require
/// scheduling because they are phis or live in other basic blocks.
| 1726 | static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) { |
| 1727 | return !VL.empty() && |
| 1728 | (all_of(Range&: VL, P: isUsedOutsideBlock) || all_of(Range&: VL, P: areAllOperandsNonInsts)); |
| 1729 | } |
| 1730 | |
/// Returns true if the widened type of \p Sz elements of type \p Ty
/// represents a full vector type, i.e. adding an extra element results in
/// extra parts upon type legalization.
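/// For example, Sz == 8 is always accepted (power of 2), while Sz == 12 is
/// accepted only if the target legalizes the widened type into parts whose
/// element count is a power of 2 (e.g. 3 parts of 4 elements).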
| 1734 | static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, |
| 1735 | unsigned Sz) { |
| 1736 | if (Sz <= 1) |
| 1737 | return false; |
| 1738 | if (!isValidElementType(Ty) && !isa<FixedVectorType>(Val: Ty)) |
| 1739 | return false; |
| 1740 | if (has_single_bit(Value: Sz)) |
| 1741 | return true; |
| 1742 | const unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz)); |
| 1743 | return NumParts > 0 && NumParts < Sz && has_single_bit(Value: Sz / NumParts) && |
| 1744 | Sz % NumParts == 0; |
| 1745 | } |
| 1746 | |
/// Returns the number of parts the type \p VecTy will be split into at the
/// codegen phase. If the type is going to be scalarized or does not use whole
/// registers, returns 1.
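/// For example, with 128-bit vector registers a <16 x i32> type that
/// legalizes into four <4 x i32> parts returns 4, while a type that is
/// scalarized or does not split evenly returns 1 (target dependent).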
| 1750 | static unsigned |
| 1751 | getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, |
| 1752 | const unsigned Limit = std::numeric_limits<unsigned>::max()) { |
| 1753 | unsigned NumParts = TTI.getNumberOfParts(Tp: VecTy); |
| 1754 | if (NumParts == 0 || NumParts >= Limit) |
| 1755 | return 1; |
| 1756 | unsigned Sz = getNumElements(Ty: VecTy); |
| 1757 | if (NumParts >= Sz || Sz % NumParts != 0 || |
| 1758 | !hasFullVectorsOrPowerOf2(TTI, Ty: VecTy->getElementType(), Sz: Sz / NumParts)) |
| 1759 | return 1; |
| 1760 | return NumParts; |
| 1761 | } |
| 1762 | |
| 1763 | namespace slpvectorizer { |
| 1764 | |
| 1765 | /// Bottom Up SLP Vectorizer. |
| 1766 | class BoUpSLP { |
| 1767 | class TreeEntry; |
| 1768 | class ScheduleEntity; |
| 1769 | class ScheduleData; |
| 1770 | class ScheduleBundle; |
| 1771 | class ShuffleCostEstimator; |
| 1772 | class ShuffleInstructionBuilder; |
| 1773 | |
| 1774 | public: |
| 1775 | /// Tracks the state we can represent the loads in the given sequence. |
| 1776 | enum class LoadsState { |
| 1777 | Gather, |
| 1778 | Vectorize, |
| 1779 | ScatterVectorize, |
| 1780 | StridedVectorize, |
| 1781 | CompressVectorize |
| 1782 | }; |
| 1783 | |
| 1784 | using ValueList = SmallVector<Value *, 8>; |
| 1785 | using InstrList = SmallVector<Instruction *, 16>; |
| 1786 | using ValueSet = SmallPtrSet<Value *, 16>; |
| 1787 | using StoreList = SmallVector<StoreInst *, 8>; |
using ExtraValueToDebugLocsMap = SmallDenseSet<Value *, 4>;
| 1789 | using OrdersType = SmallVector<unsigned, 4>; |
| 1790 | |
| 1791 | BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, |
| 1792 | TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, |
| 1793 | DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, |
| 1794 | const DataLayout *DL, OptimizationRemarkEmitter *ORE) |
| 1795 | : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt), |
| 1796 | AC(AC), DB(DB), DL(DL), ORE(ORE), |
| 1797 | Builder(Se->getContext(), TargetFolder(*DL)) { |
| 1798 | CodeMetrics::collectEphemeralValues(L: F, AC, EphValues); |
| 1799 | // Use the vector register size specified by the target unless overridden |
| 1800 | // by a command-line option. |
| 1801 | // TODO: It would be better to limit the vectorization factor based on |
| 1802 | // data type rather than just register size. For example, x86 AVX has |
| 1803 | // 256-bit registers, but it does not support integer operations |
| 1804 | // at that width (that requires AVX2). |
| 1805 | if (MaxVectorRegSizeOption.getNumOccurrences()) |
| 1806 | MaxVecRegSize = MaxVectorRegSizeOption; |
| 1807 | else |
| 1808 | MaxVecRegSize = |
| 1809 | TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) |
| 1810 | .getFixedValue(); |
| 1811 | |
| 1812 | if (MinVectorRegSizeOption.getNumOccurrences()) |
| 1813 | MinVecRegSize = MinVectorRegSizeOption; |
| 1814 | else |
| 1815 | MinVecRegSize = TTI->getMinVectorRegisterBitWidth(); |
| 1816 | } |
| 1817 | |
| 1818 | /// Vectorize the tree that starts with the elements in \p VL. |
| 1819 | /// Returns the vectorized root. |
| 1820 | Value *vectorizeTree(); |
| 1821 | |
| 1822 | /// Vectorize the tree but with the list of externally used values \p |
/// ExternallyUsedValues. Values in this MapVector can be replaced by the
/// generated extractvalue instructions.
| 1825 | Value *vectorizeTree( |
| 1826 | const ExtraValueToDebugLocsMap &ExternallyUsedValues, |
| 1827 | Instruction *ReductionRoot = nullptr, |
| 1828 | ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {}); |
| 1829 | |
| 1830 | /// \returns the cost incurred by unwanted spills and fills, caused by |
| 1831 | /// holding live values over call sites. |
| 1832 | InstructionCost getSpillCost(); |
| 1833 | |
| 1834 | /// \returns the vectorization cost of the subtree that starts at \p VL. |
| 1835 | /// A negative number means that this is profitable. |
| 1836 | InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {}, |
| 1837 | InstructionCost ReductionCost = TTI::TCC_Free); |
| 1838 | |
| 1839 | /// Construct a vectorizable tree that starts at \p Roots, ignoring users for |
| 1840 | /// the purpose of scheduling and extraction in the \p UserIgnoreLst. |
| 1841 | void buildTree(ArrayRef<Value *> Roots, |
| 1842 | const SmallDenseSet<Value *> &UserIgnoreLst); |
| 1843 | |
| 1844 | /// Construct a vectorizable tree that starts at \p Roots. |
| 1845 | void buildTree(ArrayRef<Value *> Roots); |
| 1846 | |
| 1847 | /// Return the scalars of the root node. |
| 1848 | ArrayRef<Value *> getRootNodeScalars() const { |
| 1849 | assert(!VectorizableTree.empty() && "No graph to get the first node from" ); |
| 1850 | return VectorizableTree.front()->Scalars; |
| 1851 | } |
| 1852 | |
| 1853 | /// Returns the type/is-signed info for the root node in the graph without |
| 1854 | /// casting. |
| 1855 | std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const { |
| 1856 | const TreeEntry &Root = *VectorizableTree.front(); |
| 1857 | if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() || |
| 1858 | !Root.Scalars.front()->getType()->isIntegerTy()) |
| 1859 | return std::nullopt; |
| 1860 | auto It = MinBWs.find(Val: &Root); |
| 1861 | if (It != MinBWs.end()) |
| 1862 | return std::make_pair(x: IntegerType::get(C&: Root.Scalars.front()->getContext(), |
| 1863 | NumBits: It->second.first), |
| 1864 | y: It->second.second); |
| 1865 | if (Root.getOpcode() == Instruction::ZExt || |
| 1866 | Root.getOpcode() == Instruction::SExt) |
| 1867 | return std::make_pair(x: cast<CastInst>(Val: Root.getMainOp())->getSrcTy(), |
| 1868 | y: Root.getOpcode() == Instruction::SExt); |
| 1869 | return std::nullopt; |
| 1870 | } |
| 1871 | |
| 1872 | /// Checks if the root graph node can be emitted with narrower bitwidth at |
| 1873 | /// codegen and returns it signedness, if so. |
| 1874 | bool isSignedMinBitwidthRootNode() const { |
| 1875 | return MinBWs.at(Val: VectorizableTree.front().get()).second; |
| 1876 | } |
| 1877 | |
| 1878 | /// Returns reduction type after minbitdth analysis. |
| 1879 | FixedVectorType *getReductionType() const { |
| 1880 | if (ReductionBitWidth == 0 || |
| 1881 | !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() || |
| 1882 | ReductionBitWidth >= |
| 1883 | DL->getTypeSizeInBits( |
| 1884 | Ty: VectorizableTree.front()->Scalars.front()->getType())) |
| 1885 | return getWidenedType( |
| 1886 | ScalarTy: VectorizableTree.front()->Scalars.front()->getType(), |
| 1887 | VF: VectorizableTree.front()->getVectorFactor()); |
| 1888 | return getWidenedType( |
| 1889 | ScalarTy: IntegerType::get( |
| 1890 | C&: VectorizableTree.front()->Scalars.front()->getContext(), |
| 1891 | NumBits: ReductionBitWidth), |
| 1892 | VF: VectorizableTree.front()->getVectorFactor()); |
| 1893 | } |
| 1894 | |
| 1895 | /// Builds external uses of the vectorized scalars, i.e. the list of |
| 1896 | /// vectorized scalars to be extracted, their lanes and their scalar users. \p |
| 1897 | /// ExternallyUsedValues contains additional list of external uses to handle |
| 1898 | /// vectorization of reductions. |
| 1899 | void |
| 1900 | buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {}); |
| 1901 | |
| 1902 | /// Transforms graph nodes to target specific representations, if profitable. |
| 1903 | void transformNodes(); |
| 1904 | |
| 1905 | /// Clear the internal data structures that are created by 'buildTree'. |
| 1906 | void deleteTree() { |
| 1907 | VectorizableTree.clear(); |
| 1908 | ScalarToTreeEntries.clear(); |
| 1909 | OperandsToTreeEntry.clear(); |
| 1910 | ScalarsInSplitNodes.clear(); |
| 1911 | MustGather.clear(); |
| 1912 | NonScheduledFirst.clear(); |
| 1913 | EntryToLastInstruction.clear(); |
| 1914 | LoadEntriesToVectorize.clear(); |
| 1915 | IsGraphTransformMode = false; |
| 1916 | GatheredLoadsEntriesFirst.reset(); |
| 1917 | CompressEntryToData.clear(); |
| 1918 | ExternalUses.clear(); |
| 1919 | ExternalUsesAsOriginalScalar.clear(); |
| 1920 | for (auto &Iter : BlocksSchedules) { |
| 1921 | BlockScheduling *BS = Iter.second.get(); |
| 1922 | BS->clear(); |
| 1923 | } |
| 1924 | MinBWs.clear(); |
| 1925 | ReductionBitWidth = 0; |
| 1926 | BaseGraphSize = 1; |
| 1927 | CastMaxMinBWSizes.reset(); |
| 1928 | ExtraBitWidthNodes.clear(); |
| 1929 | InstrElementSize.clear(); |
| 1930 | UserIgnoreList = nullptr; |
| 1931 | PostponedGathers.clear(); |
| 1932 | ValueToGatherNodes.clear(); |
| 1933 | } |
| 1934 | |
| 1935 | unsigned getTreeSize() const { return VectorizableTree.size(); } |
| 1936 | |
| 1937 | /// Returns the base graph size, before any transformations. |
| 1938 | unsigned getCanonicalGraphSize() const { return BaseGraphSize; } |
| 1939 | |
| 1940 | /// Perform LICM and CSE on the newly generated gather sequences. |
| 1941 | void optimizeGatherSequence(); |
| 1942 | |
| 1943 | /// Does this non-empty order represent an identity order? Identity |
| 1944 | /// should be represented as an empty order, so this is used to |
| 1945 | /// decide if we can canonicalize a computed order. Undef elements |
| 1946 | /// (represented as size) are ignored. |
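/// For example, for Order = {0, 1, Sz, 3} (where Sz == Order.size()) this
/// returns true, since the out-of-bounds element is ignored.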
| 1947 | static bool isIdentityOrder(ArrayRef<unsigned> Order) { |
| 1948 | assert(!Order.empty() && "expected non-empty order" ); |
| 1949 | const unsigned Sz = Order.size(); |
| 1950 | return all_of(Range: enumerate(First&: Order), P: [&](const auto &P) { |
| 1951 | return P.value() == P.index() || P.value() == Sz; |
| 1952 | }); |
| 1953 | } |
| 1954 | |
| 1955 | /// Checks if the specified gather tree entry \p TE can be represented as a |
| 1956 | /// shuffled vector entry + (possibly) permutation with other gathers. It |
| 1957 | /// implements the checks only for possibly ordered scalars (Loads, |
| 1958 | /// ExtractElement, ExtractValue), which can be part of the graph. |
| 1959 | /// \param TopToBottom If true, used for the whole tree rotation, false - for |
| 1960 | /// sub-tree rotations. \param IgnoreReorder true, if the order of the root |
| 1961 | /// node might be ignored. |
| 1962 | std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE, |
| 1963 | bool TopToBottom, |
| 1964 | bool IgnoreReorder); |
| 1965 | |
| 1966 | /// Sort loads into increasing pointers offsets to allow greater clustering. |
| 1967 | std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE); |
| 1968 | |
| 1969 | /// Gets reordering data for the given tree entry. If the entry is vectorized |
| 1970 | /// - just return ReorderIndices, otherwise check if the scalars can be |
| 1971 | /// reordered and return the most optimal order. |
| 1972 | /// \return std::nullopt if ordering is not important, empty order, if |
| 1973 | /// identity order is important, or the actual order. |
| 1974 | /// \param TopToBottom If true, include the order of vectorized stores and |
| 1975 | /// insertelement nodes, otherwise skip them. |
| 1976 | /// \param IgnoreReorder true, if the root node order can be ignored. |
| 1977 | std::optional<OrdersType> |
| 1978 | getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder); |
| 1979 | |
| 1980 | /// Checks if it is profitable to reorder the current tree. |
| 1981 | /// If the tree does not contain many profitable reordable nodes, better to |
| 1982 | /// skip it to save compile time. |
| 1983 | bool isProfitableToReorder() const; |
| 1984 | |
| 1985 | /// Reorders the current graph to the most profitable order starting from the |
| 1986 | /// root node to the leaf nodes. The best order is chosen only from the nodes |
| 1987 | /// of the same size (vectorization factor). Smaller nodes are considered |
| 1988 | /// parts of subgraph with smaller VF and they are reordered independently. We |
| 1989 | /// can make it because we still need to extend smaller nodes to the wider VF |
| 1990 | /// and we can merge reordering shuffles with the widening shuffles. |
| 1991 | void reorderTopToBottom(); |
| 1992 | |
| 1993 | /// Reorders the current graph to the most profitable order starting from |
| 1994 | /// leaves to the root. It allows to rotate small subgraphs and reduce the |
| 1995 | /// number of reshuffles if the leaf nodes use the same order. In this case we |
| 1996 | /// can merge the orders and just shuffle user node instead of shuffling its |
| 1997 | /// operands. Plus, even the leaf nodes have different orders, it allows to |
| 1998 | /// sink reordering in the graph closer to the root node and merge it later |
| 1999 | /// during analysis. |
| 2000 | void reorderBottomToTop(bool IgnoreReorder = false); |
| 2001 | |
| 2002 | /// \return The vector element size in bits to use when vectorizing the |
| 2003 | /// expression tree ending at \p V. If V is a store, the size is the width of |
| 2004 | /// the stored value. Otherwise, the size is the width of the largest loaded |
| 2005 | /// value reaching V. This method is used by the vectorizer to calculate |
| 2006 | /// vectorization factors. |
| 2007 | unsigned getVectorElementSize(Value *V); |
| 2008 | |
| 2009 | /// Compute the minimum type sizes required to represent the entries in a |
| 2010 | /// vectorizable tree. |
| 2011 | void computeMinimumValueSizes(); |
| 2012 | |
| 2013 | // \returns maximum vector register size as set by TTI or overridden by cl::opt. |
| 2014 | unsigned getMaxVecRegSize() const { |
| 2015 | return MaxVecRegSize; |
| 2016 | } |
| 2017 | |
| 2018 | // \returns minimum vector register size as set by cl::opt. |
| 2019 | unsigned getMinVecRegSize() const { |
| 2020 | return MinVecRegSize; |
| 2021 | } |
| 2022 | |
| 2023 | unsigned getMinVF(unsigned Sz) const { |
| 2024 | return std::max(a: 2U, b: getMinVecRegSize() / Sz); |
| 2025 | } |
| 2026 | |
| 2027 | unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const { |
| 2028 | unsigned MaxVF = MaxVFOption.getNumOccurrences() ? |
| 2029 | MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode); |
| 2030 | return MaxVF ? MaxVF : UINT_MAX; |
| 2031 | } |
| 2032 | |
| 2033 | /// Check if homogeneous aggregate is isomorphic to some VectorType. |
| 2034 | /// Accepts homogeneous multidimensional aggregate of scalars/vectors like |
| 2035 | /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, |
| 2036 | /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. |
| 2037 | /// |
| 2038 | /// \returns number of elements in vector if isomorphism exists, 0 otherwise. |
| 2039 | unsigned canMapToVector(Type *T) const; |
| 2040 | |
| 2041 | /// \returns True if the VectorizableTree is both tiny and not fully |
| 2042 | /// vectorizable. We do not vectorize such trees. |
| 2043 | bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const; |
| 2044 | |
| 2045 | /// Checks if the graph and all its subgraphs cannot be better vectorized. |
| 2046 | /// It may happen, if all gather nodes are loads and they cannot be |
| 2047 | /// "clusterized". In this case even subgraphs cannot be vectorized more |
| 2048 | /// effectively than the base graph. |
| 2049 | bool isTreeNotExtendable() const; |
| 2050 | |
| 2051 | /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values |
| 2052 | /// can be load combined in the backend. Load combining may not be allowed in |
| 2053 | /// the IR optimizer, so we do not want to alter the pattern. For example, |
| 2054 | /// partially transforming a scalar bswap() pattern into vector code is |
| 2055 | /// effectively impossible for the backend to undo. |
| 2056 | /// TODO: If load combining is allowed in the IR optimizer, this analysis |
| 2057 | /// may not be necessary. |
| 2058 | bool isLoadCombineReductionCandidate(RecurKind RdxKind) const; |
| 2059 | |
| 2060 | /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values |
| 2061 | /// can be load combined in the backend. Load combining may not be allowed in |
| 2062 | /// the IR optimizer, so we do not want to alter the pattern. For example, |
| 2063 | /// partially transforming a scalar bswap() pattern into vector code is |
| 2064 | /// effectively impossible for the backend to undo. |
| 2065 | /// TODO: If load combining is allowed in the IR optimizer, this analysis |
| 2066 | /// may not be necessary. |
| 2067 | bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const; |
| 2068 | |
| 2069 | /// Checks if the given array of loads can be represented as a vectorized, |
| 2070 | /// scatter or just simple gather. |
| 2071 | /// \param VL list of loads. |
| 2072 | /// \param VL0 main load value. |
| 2073 | /// \param Order returned order of load instructions. |
| 2074 | /// \param PointerOps returned list of pointer operands. |
| 2075 | /// \param BestVF return best vector factor, if recursive check found better |
| 2076 | /// vectorization sequences rather than masked gather. |
| 2077 | /// \param TryRecursiveCheck used to check if long masked gather can be |
/// represented as a series of loads/insert-subvector operations, if profitable.
| 2079 | LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, |
| 2080 | SmallVectorImpl<unsigned> &Order, |
| 2081 | SmallVectorImpl<Value *> &PointerOps, |
| 2082 | unsigned *BestVF = nullptr, |
| 2083 | bool TryRecursiveCheck = true) const; |
| 2084 | |
| 2085 | /// Registers non-vectorizable sequence of loads |
| 2086 | template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) { |
| 2087 | ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL)); |
| 2088 | } |
| 2089 | |
| 2090 | /// Checks if the given loads sequence is known as not vectorizable |
| 2091 | template <typename T> |
| 2092 | bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const { |
| 2093 | return ListOfKnonwnNonVectorizableLoads.contains(V: hash_value(VL)); |
| 2094 | } |
| 2095 | |
| 2096 | OptimizationRemarkEmitter *getORE() { return ORE; } |
| 2097 | |
| 2098 | /// This structure holds any data we need about the edges being traversed |
| 2099 | /// during buildTreeRec(). We keep track of: |
| 2100 | /// (i) the user TreeEntry index, and |
| 2101 | /// (ii) the index of the edge. |
| 2102 | struct EdgeInfo { |
| 2103 | EdgeInfo() = default; |
| 2104 | EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx) |
| 2105 | : UserTE(UserTE), EdgeIdx(EdgeIdx) {} |
| 2106 | /// The user TreeEntry. |
| 2107 | TreeEntry *UserTE = nullptr; |
| 2108 | /// The operand index of the use. |
| 2109 | unsigned EdgeIdx = UINT_MAX; |
| 2110 | #ifndef NDEBUG |
| 2111 | friend inline raw_ostream &operator<<(raw_ostream &OS, |
| 2112 | const BoUpSLP::EdgeInfo &EI) { |
| 2113 | EI.dump(OS); |
| 2114 | return OS; |
| 2115 | } |
| 2116 | /// Debug print. |
| 2117 | void dump(raw_ostream &OS) const { |
| 2118 | OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null" ) |
| 2119 | << " EdgeIdx:" << EdgeIdx << "}" ; |
| 2120 | } |
| 2121 | LLVM_DUMP_METHOD void dump() const { dump(dbgs()); } |
| 2122 | #endif |
| 2123 | bool operator == (const EdgeInfo &Other) const { |
| 2124 | return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx; |
| 2125 | } |
| 2126 | |
| 2127 | operator bool() const { return UserTE != nullptr; } |
| 2128 | }; |
| 2129 | |
| 2130 | /// A helper class used for scoring candidates for two consecutive lanes. |
| 2131 | class LookAheadHeuristics { |
| 2132 | const TargetLibraryInfo &TLI; |
| 2133 | const DataLayout &DL; |
| 2134 | ScalarEvolution &SE; |
| 2135 | const BoUpSLP &R; |
| 2136 | int NumLanes; // Total number of lanes (aka vectorization factor). |
| 2137 | int MaxLevel; // The maximum recursion depth for accumulating score. |
| 2138 | |
| 2139 | public: |
| 2140 | LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, |
| 2141 | ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, |
| 2142 | int MaxLevel) |
| 2143 | : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes), |
| 2144 | MaxLevel(MaxLevel) {} |
| 2145 | |
| 2146 | // The hard-coded scores listed here are not very important, though it shall |
| 2147 | // be higher for better matches to improve the resulting cost. When |
| 2148 | // computing the scores of matching one sub-tree with another, we are |
| 2149 | // basically counting the number of values that are matching. So even if all |
| 2150 | // scores are set to 1, we would still get a decent matching result. |
| 2151 | // However, sometimes we have to break ties. For example we may have to |
| 2152 | // choose between matching loads vs matching opcodes. This is what these |
| 2153 | // scores are helping us with: they provide the order of preference. Also, |
| 2154 | // this is important if the scalar is externally used or used in another |
| 2155 | // tree entry node in the different lane. |
| 2156 | |
| 2157 | /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). |
| 2158 | static const int ScoreConsecutiveLoads = 4; |
| 2159 | /// The same load multiple times. This should have a better score than |
/// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
| 2161 | /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for |
| 2162 | /// a vector load and 1.0 for a broadcast. |
| 2163 | static const int ScoreSplatLoads = 3; |
| 2164 | /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). |
| 2165 | static const int ScoreReversedLoads = 3; |
| 2166 | /// A load candidate for masked gather. |
| 2167 | static const int ScoreMaskedGatherCandidate = 1; |
| 2168 | /// ExtractElementInst from same vector and consecutive indexes. |
static const int ScoreConsecutiveExtracts = 4;
| 2170 | /// ExtractElementInst from same vector and reversed indices. |
static const int ScoreReversedExtracts = 3;
| 2172 | /// Constants. |
| 2173 | static const int ScoreConstants = 2; |
| 2174 | /// Instructions with the same opcode. |
| 2175 | static const int ScoreSameOpcode = 2; |
| 2176 | /// Instructions with alt opcodes (e.g, add + sub). |
| 2177 | static const int ScoreAltOpcodes = 1; |
| 2178 | /// Identical instructions (a.k.a. splat or broadcast). |
| 2179 | static const int ScoreSplat = 1; |
| 2180 | /// Matching with an undef is preferable to failing. |
| 2181 | static const int ScoreUndef = 1; |
| 2182 | /// Score for failing to find a decent match. |
| 2183 | static const int ScoreFail = 0; |
| 2184 | /// Score if all users are vectorized. |
| 2185 | static const int ScoreAllUserVectorized = 1; |
| 2186 | |
| 2187 | /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. |
| 2188 | /// \p U1 and \p U2 are the users of \p V1 and \p V2. |
| 2189 | /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p |
| 2190 | /// MainAltOps. |
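/// For example, two simple loads from consecutive addresses score
/// ScoreConsecutiveLoads, while pairing a load with an unrelated constant
/// typically scores ScoreFail (assuming neither value already belongs to a
/// shared vectorized tree entry).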
| 2191 | int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, |
| 2192 | ArrayRef<Value *> MainAltOps) const { |
| 2193 | if (!isValidElementType(Ty: V1->getType()) || |
| 2194 | !isValidElementType(Ty: V2->getType())) |
| 2195 | return LookAheadHeuristics::ScoreFail; |
| 2196 | |
| 2197 | if (V1 == V2) { |
| 2198 | if (isa<LoadInst>(Val: V1)) { |
// Returns true if the users of V1 and V2 won't need to be extracted.
| 2200 | auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) { |
| 2201 | // Bail out if we have too many uses to save compilation time. |
| 2202 | if (V1->hasNUsesOrMore(N: UsesLimit) || V2->hasNUsesOrMore(N: UsesLimit)) |
| 2203 | return false; |
| 2204 | |
| 2205 | auto AllUsersVectorized = [U1, U2, this](Value *V) { |
| 2206 | return llvm::all_of(Range: V->users(), P: [U1, U2, this](Value *U) { |
| 2207 | return U == U1 || U == U2 || R.isVectorized(V: U); |
| 2208 | }); |
| 2209 | }; |
| 2210 | return AllUsersVectorized(V1) && AllUsersVectorized(V2); |
| 2211 | }; |
| 2212 | // A broadcast of a load can be cheaper on some targets. |
| 2213 | if (R.TTI->isLegalBroadcastLoad(ElementTy: V1->getType(), |
| 2214 | NumElements: ElementCount::getFixed(MinVal: NumLanes)) && |
| 2215 | ((int)V1->getNumUses() == NumLanes || |
| 2216 | AllUsersAreInternal(V1, V2))) |
| 2217 | return LookAheadHeuristics::ScoreSplatLoads; |
| 2218 | } |
| 2219 | return LookAheadHeuristics::ScoreSplat; |
| 2220 | } |
| 2221 | |
| 2222 | auto CheckSameEntryOrFail = [&]() { |
| 2223 | if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V: V1); !TEs1.empty()) { |
| 2224 | SmallPtrSet<TreeEntry *, 4> Set(llvm::from_range, TEs1); |
| 2225 | if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V: V2); |
| 2226 | !TEs2.empty() && |
| 2227 | any_of(Range&: TEs2, P: [&](TreeEntry *E) { return Set.contains(Ptr: E); })) |
| 2228 | return LookAheadHeuristics::ScoreSplatLoads; |
| 2229 | } |
| 2230 | return LookAheadHeuristics::ScoreFail; |
| 2231 | }; |
| 2232 | |
| 2233 | auto *LI1 = dyn_cast<LoadInst>(Val: V1); |
| 2234 | auto *LI2 = dyn_cast<LoadInst>(Val: V2); |
| 2235 | if (LI1 && LI2) { |
| 2236 | if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() || |
| 2237 | !LI2->isSimple()) |
| 2238 | return CheckSameEntryOrFail(); |
| 2239 | |
| 2240 | std::optional<int64_t> Dist = getPointersDiff( |
| 2241 | ElemTyA: LI1->getType(), PtrA: LI1->getPointerOperand(), ElemTyB: LI2->getType(), |
| 2242 | PtrB: LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); |
| 2243 | if (!Dist || *Dist == 0) { |
| 2244 | if (getUnderlyingObject(V: LI1->getPointerOperand()) == |
| 2245 | getUnderlyingObject(V: LI2->getPointerOperand()) && |
| 2246 | R.TTI->isLegalMaskedGather( |
| 2247 | DataType: getWidenedType(ScalarTy: LI1->getType(), VF: NumLanes), Alignment: LI1->getAlign())) |
| 2248 | return LookAheadHeuristics::ScoreMaskedGatherCandidate; |
| 2249 | return CheckSameEntryOrFail(); |
| 2250 | } |
| 2251 | // The distance is too large - still may be profitable to use masked |
| 2252 | // loads/gathers. |
| 2253 | if (std::abs(i: *Dist) > NumLanes / 2) |
| 2254 | return LookAheadHeuristics::ScoreMaskedGatherCandidate; |
| 2255 | // This still will detect consecutive loads, but we might have "holes" |
| 2256 | // in some cases. It is ok for non-power-2 vectorization and may produce |
| 2257 | // better results. It should not affect current vectorization. |
| 2258 | return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads |
| 2259 | : LookAheadHeuristics::ScoreReversedLoads; |
| 2260 | } |
| 2261 | |
| 2262 | auto *C1 = dyn_cast<Constant>(Val: V1); |
| 2263 | auto *C2 = dyn_cast<Constant>(Val: V2); |
| 2264 | if (C1 && C2) |
| 2265 | return LookAheadHeuristics::ScoreConstants; |
| 2266 | |
| 2267 | // Extracts from consecutive indexes of the same vector better score as |
| 2268 | // the extracts could be optimized away. |
| 2269 | Value *EV1; |
| 2270 | ConstantInt *Ex1Idx; |
| 2271 | if (match(V: V1, P: m_ExtractElt(Val: m_Value(V&: EV1), Idx: m_ConstantInt(CI&: Ex1Idx)))) { |
| 2272 | // Undefs are always profitable for extractelements. |
| 2273 | // Compiler can easily combine poison and extractelement <non-poison> or |
| 2274 | // undef and extractelement <poison>. But combining undef + |
| 2275 | // extractelement <non-poison-but-may-produce-poison> requires some |
| 2276 | // extra operations. |
| 2277 | if (isa<UndefValue>(Val: V2)) |
| 2278 | return (isa<PoisonValue>(Val: V2) || isUndefVector(V: EV1).all()) |
| 2279 | ? LookAheadHeuristics::ScoreConsecutiveExtracts |
| 2280 | : LookAheadHeuristics::ScoreSameOpcode; |
| 2281 | Value *EV2 = nullptr; |
| 2282 | ConstantInt *Ex2Idx = nullptr; |
| 2283 | if (match(V: V2, |
| 2284 | P: m_ExtractElt(Val: m_Value(V&: EV2), Idx: m_CombineOr(L: m_ConstantInt(CI&: Ex2Idx), |
| 2285 | R: m_Undef())))) { |
| 2286 | // Undefs are always profitable for extractelements. |
| 2287 | if (!Ex2Idx) |
| 2288 | return LookAheadHeuristics::ScoreConsecutiveExtracts; |
| 2289 | if (isUndefVector(V: EV2).all() && EV2->getType() == EV1->getType()) |
| 2290 | return LookAheadHeuristics::ScoreConsecutiveExtracts; |
| 2291 | if (EV2 == EV1) { |
| 2292 | int Idx1 = Ex1Idx->getZExtValue(); |
| 2293 | int Idx2 = Ex2Idx->getZExtValue(); |
| 2294 | int Dist = Idx2 - Idx1; |
| 2295 | // The distance is too large - still may be profitable to use |
| 2296 | // shuffles. |
| 2297 | if (std::abs(x: Dist) == 0) |
| 2298 | return LookAheadHeuristics::ScoreSplat; |
| 2299 | if (std::abs(x: Dist) > NumLanes / 2) |
| 2300 | return LookAheadHeuristics::ScoreSameOpcode; |
| 2301 | return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts |
| 2302 | : LookAheadHeuristics::ScoreReversedExtracts; |
| 2303 | } |
| 2304 | return LookAheadHeuristics::ScoreAltOpcodes; |
| 2305 | } |
| 2306 | return CheckSameEntryOrFail(); |
| 2307 | } |
| 2308 | |
| 2309 | auto *I1 = dyn_cast<Instruction>(Val: V1); |
| 2310 | auto *I2 = dyn_cast<Instruction>(Val: V2); |
| 2311 | if (I1 && I2) { |
| 2312 | if (I1->getParent() != I2->getParent()) |
| 2313 | return CheckSameEntryOrFail(); |
| 2314 | SmallVector<Value *, 4> Ops(MainAltOps); |
| 2315 | Ops.push_back(Elt: I1); |
| 2316 | Ops.push_back(Elt: I2); |
| 2317 | InstructionsState S = getSameOpcode(VL: Ops, TLI); |
| 2318 | // Note: Only consider instructions with <= 2 operands to avoid |
| 2319 | // complexity explosion. |
| 2320 | if (S && |
| 2321 | (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() || |
| 2322 | !S.isAltShuffle()) && |
| 2323 | all_of(Range&: Ops, P: [&S](Value *V) { |
| 2324 | return isa<PoisonValue>(Val: V) || |
| 2325 | cast<Instruction>(Val: V)->getNumOperands() == |
| 2326 | S.getMainOp()->getNumOperands(); |
| 2327 | })) |
| 2328 | return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes |
| 2329 | : LookAheadHeuristics::ScoreSameOpcode; |
| 2330 | } |
| 2331 | |
| 2332 | if (I1 && isa<PoisonValue>(Val: V2)) |
| 2333 | return LookAheadHeuristics::ScoreSameOpcode; |
| 2334 | |
| 2335 | if (isa<UndefValue>(Val: V2)) |
| 2336 | return LookAheadHeuristics::ScoreUndef; |
| 2337 | |
| 2338 | return CheckSameEntryOrFail(); |
| 2339 | } |
| 2340 | |
| 2341 | /// Go through the operands of \p LHS and \p RHS recursively until |
/// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
| 2343 | /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands |
| 2344 | /// of \p U1 and \p U2), except at the beginning of the recursion where |
| 2345 | /// these are set to nullptr. |
| 2346 | /// |
| 2347 | /// For example: |
| 2348 | /// \verbatim |
| 2349 | /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] |
| 2350 | /// \ / \ / \ / \ / |
| 2351 | /// + + + + |
| 2352 | /// G1 G2 G3 G4 |
| 2353 | /// \endverbatim |
| 2354 | /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at |
| 2355 | /// each level recursively, accumulating the score. It starts from matching |
| 2356 | /// the additions at level 0, then moves on to the loads (level 1). The |
| 2357 | /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and |
| 2358 | /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while |
| 2359 | /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail. |
| 2360 | /// Please note that the order of the operands does not matter, as we |
| 2361 | /// evaluate the score of all profitable combinations of operands. In |
| 2362 | /// other words the score of G1 and G4 is the same as G1 and G2. This |
| 2363 | /// heuristic is based on ideas described in: |
| 2364 | /// Look-ahead SLP: Auto-vectorization in the presence of commutative |
| 2365 | /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, |
/// Luís F. W. Góes
| 2367 | int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, |
| 2368 | Instruction *U2, int CurrLevel, |
| 2369 | ArrayRef<Value *> MainAltOps) const { |
| 2370 | |
| 2371 | // Get the shallow score of V1 and V2. |
| 2372 | int ShallowScoreAtThisLevel = |
| 2373 | getShallowScore(V1: LHS, V2: RHS, U1, U2, MainAltOps); |
| 2374 | |
| 2375 | // If reached MaxLevel, |
| 2376 | // or if V1 and V2 are not instructions, |
| 2377 | // or if they are SPLAT, |
| 2378 | // or if they are not consecutive, |
| 2379 | // or if it is profitable to vectorize loads or extractelements, return
| 2380 | // the current score early.
| 2381 | auto *I1 = dyn_cast<Instruction>(Val: LHS); |
| 2382 | auto *I2 = dyn_cast<Instruction>(Val: RHS); |
| 2383 | if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 || |
| 2384 | ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail || |
| 2385 | (((isa<LoadInst>(Val: I1) && isa<LoadInst>(Val: I2)) || |
| 2386 | (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) || |
| 2387 | (isa<ExtractElementInst>(Val: I1) && isa<ExtractElementInst>(Val: I2))) && |
| 2388 | ShallowScoreAtThisLevel)) |
| 2389 | return ShallowScoreAtThisLevel; |
| 2390 | assert(I1 && I2 && "Should have early exited." ); |
| 2391 | |
| 2392 | // Contains the I2 operand indexes that got matched with I1 operands. |
| 2393 | SmallSet<unsigned, 4> Op2Used; |
| 2394 | |
| 2395 | // Recursion towards the operands of I1 and I2. We are trying all possible |
| 2396 | // operand pairs, and keeping track of the best score. |
| 2397 | for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands(); |
| 2398 | OpIdx1 != NumOperands1; ++OpIdx1) { |
| 2399 | // Try to pair op1I with the best operand of I2. |
| 2400 | int MaxTmpScore = 0; |
| 2401 | unsigned MaxOpIdx2 = 0; |
| 2402 | bool FoundBest = false; |
| 2403 | // If I2 is commutative try all combinations. |
| 2404 | unsigned FromIdx = isCommutative(I: I2) ? 0 : OpIdx1; |
| 2405 | unsigned ToIdx = isCommutative(I: I2) |
| 2406 | ? I2->getNumOperands() |
| 2407 | : std::min(a: I2->getNumOperands(), b: OpIdx1 + 1); |
| 2408 | assert(FromIdx <= ToIdx && "Bad index" ); |
| 2409 | for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) { |
| 2410 | // Skip operands already paired with OpIdx1. |
| 2411 | if (Op2Used.count(V: OpIdx2)) |
| 2412 | continue; |
| 2413 | // Recursively calculate the cost at each level |
| 2414 | int TmpScore = |
| 2415 | getScoreAtLevelRec(LHS: I1->getOperand(i: OpIdx1), RHS: I2->getOperand(i: OpIdx2), |
| 2416 | U1: I1, U2: I2, CurrLevel: CurrLevel + 1, MainAltOps: {}); |
| 2417 | // Look for the best score. |
| 2418 | if (TmpScore > LookAheadHeuristics::ScoreFail && |
| 2419 | TmpScore > MaxTmpScore) { |
| 2420 | MaxTmpScore = TmpScore; |
| 2421 | MaxOpIdx2 = OpIdx2; |
| 2422 | FoundBest = true; |
| 2423 | } |
| 2424 | } |
| 2425 | if (FoundBest) { |
| 2426 | // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. |
| 2427 | Op2Used.insert(V: MaxOpIdx2); |
| 2428 | ShallowScoreAtThisLevel += MaxTmpScore; |
| 2429 | } |
| 2430 | } |
| 2431 | return ShallowScoreAtThisLevel; |
| 2432 | } |
| 2433 | }; |
| 2434 | /// A helper data structure to hold the operands of a vector of instructions. |
| 2435 | /// This supports a fixed vector length for all operand vectors. |
| 2436 | class VLOperands { |
| 2437 | /// For each operand we need (i) the value, and (ii) the opcode that it |
| 2438 | /// would be attached to if the expression was in a left-linearized form. |
| 2439 | /// This is required to avoid illegal operand reordering. |
| 2440 | /// For example: |
| 2441 | /// \verbatim |
| 2442 | /// 0 Op1 |
| 2443 | /// |/ |
| 2444 | /// Op1 Op2 Linearized + Op2 |
| 2445 | /// \ / ----------> |/ |
| 2446 | /// - - |
| 2447 | /// |
| 2448 | /// Op1 - Op2 (0 + Op1) - Op2 |
| 2449 | /// \endverbatim |
| 2450 | /// |
| 2451 | /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. |
| 2452 | /// |
| 2453 | /// Another way to think of this is to track all the operations across the |
| 2454 | /// path from the operand all the way to the root of the tree and to |
| 2455 | /// calculate the operation that corresponds to this path. For example, the |
| 2456 | /// path from Op2 to the root crosses the RHS of the '-', therefore the |
| 2457 | /// corresponding operation is a '-' (which matches the one in the |
| 2458 | /// linearized tree, as shown above). |
| 2459 | /// |
| 2460 | /// For lack of a better term, we refer to this operation as Accumulated |
| 2461 | /// Path Operation (APO). |
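|      | ///
|      | /// As a further illustrative sketch (reusing the linearization idea above,
|      | /// not code from this pass): for (A - B) - C the left-linearized form is
|      | /// A - B - C, so A lies on an LHS path (APO = false) while B and C both
|      | /// cross the RHS of a '-' on their way to the root (APO = true); B and C
|      | /// may therefore be exchanged with each other, but not with A.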
| 2462 | struct OperandData { |
| 2463 | OperandData() = default; |
| 2464 | OperandData(Value *V, bool APO, bool IsUsed) |
| 2465 | : V(V), APO(APO), IsUsed(IsUsed) {} |
| 2466 | /// The operand value. |
| 2467 | Value *V = nullptr; |
| 2468 | /// TreeEntries only allow a single opcode, or an alternate sequence of |
| 2469 | /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
| 2470 | /// APO. It is set to 'true' if 'V' is attached to an inverse operation |
| 2471 | /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise |
| 2472 | /// (e.g., Add/Mul) |
| 2473 | bool APO = false; |
| 2474 | /// Helper data for the reordering function. |
| 2475 | bool IsUsed = false; |
| 2476 | }; |
| 2477 | |
| 2478 | /// During operand reordering, we are trying to select the operand at lane |
| 2479 | /// that matches best with the operand at the neighboring lane. Our |
| 2480 | /// selection is based on the type of value we are looking for. For example, |
| 2481 | /// if the neighboring lane has a load, we need to look for a load that is |
| 2482 | /// accessing a consecutive address. These strategies are summarized in the |
| 2483 | /// 'ReorderingMode' enumerator. |
| 2484 | enum class ReorderingMode { |
| 2485 | Load, ///< Matching loads to consecutive memory addresses |
| 2486 | Opcode, ///< Matching instructions based on opcode (same or alternate) |
| 2487 | Constant, ///< Matching constants |
| 2488 | Splat, ///< Matching the same instruction multiple times (broadcast) |
| 2489 | Failed, ///< We failed to create a vectorizable group |
| 2490 | }; |
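|      | // Illustrative sketch (hypothetical first-lane values; see the mode
|      | // initialization in reorder() below): a LoadInst in the starting lane
|      | // usually selects Load mode (unless broadcasting looks better), any other
|      | // instruction selects Opcode, a Constant selects Constant, and a plain
|      | // Argument falls back to Splat in the hope that the same value repeats
|      | // across lanes.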
| 2491 | |
| 2492 | using OperandDataVec = SmallVector<OperandData, 2>; |
| 2493 | |
| 2494 | /// A vector of operand vectors. |
| 2495 | SmallVector<OperandDataVec, 4> OpsVec; |
| 2496 | /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0] |
| 2497 | /// is not IntrinsicInst, ArgSize is User::getNumOperands. |
| 2498 | unsigned ArgSize = 0; |
| 2499 | |
| 2500 | const TargetLibraryInfo &TLI; |
| 2501 | const DataLayout &DL; |
| 2502 | ScalarEvolution &SE; |
| 2503 | const BoUpSLP &R; |
| 2504 | const Loop *L = nullptr; |
| 2505 | |
| 2506 | /// \returns the operand data at \p OpIdx and \p Lane. |
| 2507 | OperandData &getData(unsigned OpIdx, unsigned Lane) { |
| 2508 | return OpsVec[OpIdx][Lane]; |
| 2509 | } |
| 2510 | |
| 2511 | /// \returns the operand data at \p OpIdx and \p Lane. Const version. |
| 2512 | const OperandData &getData(unsigned OpIdx, unsigned Lane) const { |
| 2513 | return OpsVec[OpIdx][Lane]; |
| 2514 | } |
| 2515 | |
| 2516 | /// Clears the used flag for all entries. |
| 2517 | void clearUsed() { |
| 2518 | for (unsigned OpIdx = 0, NumOperands = getNumOperands(); |
| 2519 | OpIdx != NumOperands; ++OpIdx) |
| 2520 | for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes; |
| 2521 | ++Lane) |
| 2522 | OpsVec[OpIdx][Lane].IsUsed = false; |
| 2523 | } |
| 2524 | |
| 2525 | /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. |
| 2526 | void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) { |
| 2527 | std::swap(a&: OpsVec[OpIdx1][Lane], b&: OpsVec[OpIdx2][Lane]); |
| 2528 | } |
| 2529 | |
| 2530 | /// \param Lane lane of the operands under analysis. |
| 2531 | /// \param OpIdx operand index in lane \p Lane for which we're looking for
| 2532 | /// the best candidate.
| 2533 | /// \param Idx operand index of the current candidate value. |
| 2534 | /// \returns The additional score due to possible broadcasting of the |
| 2535 | /// elements in the lane. It is more profitable to have power-of-2 unique |
| 2536 | /// elements in the lane, since it will be vectorized with higher probability
| 2537 | /// after removing duplicates. Currently the SLP vectorizer supports only
| 2538 | /// vectorization of a power-of-2 number of unique scalars.
| 2539 | int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx, |
| 2540 | const SmallBitVector &UsedLanes) const { |
| 2541 | Value *IdxLaneV = getData(OpIdx: Idx, Lane).V; |
| 2542 | if (!isa<Instruction>(Val: IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V || |
| 2543 | isa<ExtractElementInst>(Val: IdxLaneV)) |
| 2544 | return 0; |
| 2545 | SmallDenseMap<Value *, unsigned, 4> Uniques; |
| 2546 | for (unsigned Ln : seq<unsigned>(Size: getNumLanes())) { |
| 2547 | if (Ln == Lane) |
| 2548 | continue; |
| 2549 | Value *OpIdxLnV = getData(OpIdx, Lane: Ln).V; |
| 2550 | if (!isa<Instruction>(Val: OpIdxLnV)) |
| 2551 | return 0; |
| 2552 | Uniques.try_emplace(Key: OpIdxLnV, Args&: Ln); |
| 2553 | } |
| 2554 | unsigned UniquesCount = Uniques.size(); |
| 2555 | auto IdxIt = Uniques.find(Val: IdxLaneV); |
| 2556 | unsigned UniquesCntWithIdxLaneV = |
| 2557 | IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1; |
| 2558 | Value *OpIdxLaneV = getData(OpIdx, Lane).V; |
| 2559 | auto OpIdxIt = Uniques.find(Val: OpIdxLaneV); |
| 2560 | unsigned UniquesCntWithOpIdxLaneV = |
| 2561 | OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1; |
| 2562 | if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV) |
| 2563 | return 0; |
| 2564 | return std::min(a: bit_ceil(Value: UniquesCntWithOpIdxLaneV) - |
| 2565 | UniquesCntWithOpIdxLaneV, |
| 2566 | b: UniquesCntWithOpIdxLaneV - |
| 2567 | bit_floor(Value: UniquesCntWithOpIdxLaneV)) - |
| 2568 | ((IdxIt != Uniques.end() && UsedLanes.test(Idx: IdxIt->second)) |
| 2569 | ? UniquesCntWithIdxLaneV - bit_floor(Value: UniquesCntWithIdxLaneV) |
| 2570 | : bit_ceil(Value: UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV); |
| 2571 | } |
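|      | // A hedged numeric sketch of the heuristic above (the counts are made up):
|      | // with 4 lanes, a candidate that keeps 4 unique scalars in the operand sits
|      | // exactly on a power of 2 (distance 0), while one that leaves 3 unique
|      | // scalars is distance 1 from both 2 and 4, so the returned delta nudges the
|      | // selection toward the power-of-2 count.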
| 2572 | |
| 2573 | /// \param Lane lane of the operands under analysis. |
| 2574 | /// \param OpIdx operand index in lane \p Lane for which we're looking for
| 2575 | /// the best candidate.
| 2576 | /// \param Idx operand index of the current candidate value. |
| 2577 | /// \returns The additional score for the scalar whose users are all
| 2578 | /// vectorized. |
| 2579 | int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const { |
| 2580 | Value *IdxLaneV = getData(OpIdx: Idx, Lane).V; |
| 2581 | Value *OpIdxLaneV = getData(OpIdx, Lane).V; |
| 2582 | // Do not care about the number of uses for vector-like instructions
| 2583 | // (extractelement/extractvalue with constant indices), since they are
| 2584 | // extracts themselves and already externally used. Vectorization of such
| 2585 | // instructions does not add an extra extractelement instruction, it may
| 2586 | // only remove it.
| 2587 | if (isVectorLikeInstWithConstOps(V: IdxLaneV) && |
| 2588 | isVectorLikeInstWithConstOps(V: OpIdxLaneV)) |
| 2589 | return LookAheadHeuristics::ScoreAllUserVectorized; |
| 2590 | auto *IdxLaneI = dyn_cast<Instruction>(Val: IdxLaneV); |
| 2591 | if (!IdxLaneI || !isa<Instruction>(Val: OpIdxLaneV)) |
| 2592 | return 0; |
| 2593 | return R.areAllUsersVectorized(I: IdxLaneI) |
| 2594 | ? LookAheadHeuristics::ScoreAllUserVectorized |
| 2595 | : 0; |
| 2596 | } |
| 2597 | |
| 2598 | /// Score scaling factor for fully compatible instructions but with |
| 2599 | /// different numbers of external uses. Allows better selection of the
| 2600 | /// instructions with fewer external uses.
| 2601 | static const int ScoreScaleFactor = 10; |
| 2602 | |
| 2603 | /// \Returns the look-ahead score, which tells us how much the sub-trees |
| 2604 | /// rooted at \p LHS and \p RHS match; the more they match, the higher the
| 2605 | /// score. This helps break ties in an informed way when we cannot decide on |
| 2606 | /// the order of the operands by just considering the immediate |
| 2607 | /// predecessors. |
| 2608 | int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps, |
| 2609 | int Lane, unsigned OpIdx, unsigned Idx, |
| 2610 | bool &IsUsed, const SmallBitVector &UsedLanes) { |
| 2611 | LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(), |
| 2612 | LookAheadMaxDepth); |
| 2613 | // Keep track of the instruction stack as we recurse into the operands |
| 2614 | // during the look-ahead score exploration. |
| 2615 | int Score = |
| 2616 | LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr, |
| 2617 | /*CurrLevel=*/1, MainAltOps); |
| 2618 | if (Score) { |
| 2619 | int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes); |
| 2620 | if (Score <= -SplatScore) { |
| 2621 | // Failed score. |
| 2622 | Score = 0; |
| 2623 | } else { |
| 2624 | Score += SplatScore; |
| 2625 | // Scale the score to distinguish between different operands and
| 2626 | // between similar operands whose uses are all vectorized or not
| 2627 | // all vectorized. It does not affect the actual selection of the
| 2628 | // best compatible operand in general, it just allows selecting the
| 2629 | // operand with all vectorized uses.
| 2630 | Score *= ScoreScaleFactor; |
| 2631 | Score += getExternalUseScore(Lane, OpIdx, Idx); |
| 2632 | IsUsed = true; |
| 2633 | } |
| 2634 | } |
| 2635 | return Score; |
| 2636 | } |
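|      | // Worked sketch of the combination above (S and B are placeholders, not the
|      | // real score constants): a raw look-ahead score S with splat bonus B becomes
|      | // (S + B) * ScoreScaleFactor + getExternalUseScore(...), unless S <= -B, in
|      | // which case the score collapses to 0; two otherwise identical candidates
|      | // are thus separated only by their external-use bonus.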
| 2637 | |
| 2638 | /// Best defined scores per lanes between the passes. Used to choose the |
| 2639 | /// best operand (with the highest score) between the passes. |
| 2640 | /// The key - {Operand Index, Lane}. |
| 2641 | /// The value - the best score between the passes for the lane and the |
| 2642 | /// operand. |
| 2643 | SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8> |
| 2644 | BestScoresPerLanes; |
| 2645 | |
| 2646 | // Search all operands in Ops[*][Lane] for the one that matches best |
| 2647 | // Ops[OpIdx][LastLane] and return its operand index.
| 2648 | // If no good match can be found, return std::nullopt. |
| 2649 | std::optional<unsigned> |
| 2650 | getBestOperand(unsigned OpIdx, int Lane, int LastLane, |
| 2651 | ArrayRef<ReorderingMode> ReorderingModes, |
| 2652 | ArrayRef<Value *> MainAltOps, |
| 2653 | const SmallBitVector &UsedLanes) { |
| 2654 | unsigned NumOperands = getNumOperands(); |
| 2655 | |
| 2656 | // The operand of the previous lane at OpIdx. |
| 2657 | Value *OpLastLane = getData(OpIdx, Lane: LastLane).V; |
| 2658 | |
| 2659 | // Our strategy mode for OpIdx. |
| 2660 | ReorderingMode RMode = ReorderingModes[OpIdx]; |
| 2661 | if (RMode == ReorderingMode::Failed) |
| 2662 | return std::nullopt; |
| 2663 | |
| 2664 | // The linearized opcode of the operand at OpIdx, Lane. |
| 2665 | bool OpIdxAPO = getData(OpIdx, Lane).APO; |
| 2666 | |
| 2667 | // The best operand index and its score. |
| 2668 | // Sometimes we have more than one option (e.g., Opcode and Undefs), so we |
| 2669 | // are using the score to differentiate between the two. |
| 2670 | struct BestOpData { |
| 2671 | std::optional<unsigned> Idx; |
| 2672 | unsigned Score = 0; |
| 2673 | } BestOp; |
| 2674 | BestOp.Score = |
| 2675 | BestScoresPerLanes.try_emplace(Key: std::make_pair(x&: OpIdx, y&: Lane), Args: 0) |
| 2676 | .first->second; |
| 2677 | |
| 2678 | // Track if the operand must be marked as used. If the operand is set to
| 2679 | // Score 1 explicitly (because of non-power-of-2 unique scalars), we may
| 2680 | // want to reestimate the operands again on the following iterations.
| 2681 | bool IsUsed = RMode == ReorderingMode::Splat || |
| 2682 | RMode == ReorderingMode::Constant || |
| 2683 | RMode == ReorderingMode::Load; |
| 2684 | // Iterate through all unused operands and look for the best. |
| 2685 | for (unsigned Idx = 0; Idx != NumOperands; ++Idx) { |
| 2686 | // Get the operand at Idx and Lane. |
| 2687 | OperandData &OpData = getData(OpIdx: Idx, Lane); |
| 2688 | Value *Op = OpData.V; |
| 2689 | bool OpAPO = OpData.APO; |
| 2690 | |
| 2691 | // Skip already selected operands. |
| 2692 | if (OpData.IsUsed) |
| 2693 | continue; |
| 2694 | |
| 2695 | // Skip if we are trying to move the operand to a position with a |
| 2696 | // different opcode in the linearized tree form. This would break the |
| 2697 | // semantics. |
| 2698 | if (OpAPO != OpIdxAPO) |
| 2699 | continue; |
| 2700 | |
| 2701 | // Look for an operand that matches the current mode. |
| 2702 | switch (RMode) { |
| 2703 | case ReorderingMode::Load: |
| 2704 | case ReorderingMode::Opcode: { |
| 2705 | bool LeftToRight = Lane > LastLane; |
| 2706 | Value *OpLeft = (LeftToRight) ? OpLastLane : Op; |
| 2707 | Value *OpRight = (LeftToRight) ? Op : OpLastLane; |
| 2708 | int Score = getLookAheadScore(LHS: OpLeft, RHS: OpRight, MainAltOps, Lane, |
| 2709 | OpIdx, Idx, IsUsed, UsedLanes); |
| 2710 | if (Score > static_cast<int>(BestOp.Score) || |
| 2711 | (Score > 0 && Score == static_cast<int>(BestOp.Score) && |
| 2712 | Idx == OpIdx)) { |
| 2713 | BestOp.Idx = Idx; |
| 2714 | BestOp.Score = Score; |
| 2715 | BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] = Score; |
| 2716 | } |
| 2717 | break; |
| 2718 | } |
| 2719 | case ReorderingMode::Constant: |
| 2720 | if (isa<Constant>(Val: Op) || |
| 2721 | (!BestOp.Score && L && L->isLoopInvariant(V: Op))) { |
| 2722 | BestOp.Idx = Idx; |
| 2723 | if (isa<Constant>(Val: Op)) { |
| 2724 | BestOp.Score = LookAheadHeuristics::ScoreConstants; |
| 2725 | BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] = |
| 2726 | LookAheadHeuristics::ScoreConstants; |
| 2727 | } |
| 2728 | if (isa<UndefValue>(Val: Op) || !isa<Constant>(Val: Op)) |
| 2729 | IsUsed = false; |
| 2730 | } |
| 2731 | break; |
| 2732 | case ReorderingMode::Splat: |
| 2733 | if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Val: Op))) { |
| 2734 | IsUsed = Op == OpLastLane; |
| 2735 | if (Op == OpLastLane) { |
| 2736 | BestOp.Score = LookAheadHeuristics::ScoreSplat; |
| 2737 | BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] = |
| 2738 | LookAheadHeuristics::ScoreSplat; |
| 2739 | } |
| 2740 | BestOp.Idx = Idx; |
| 2741 | } |
| 2742 | break; |
| 2743 | case ReorderingMode::Failed: |
| 2744 | llvm_unreachable("Not expected Failed reordering mode." ); |
| 2745 | } |
| 2746 | } |
| 2747 | |
| 2748 | if (BestOp.Idx) { |
| 2749 | getData(OpIdx: *BestOp.Idx, Lane).IsUsed = IsUsed; |
| 2750 | return BestOp.Idx; |
| 2751 | } |
| 2752 | // If we could not find a good match return std::nullopt. |
| 2753 | return std::nullopt; |
| 2754 | } |
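|      | // Selection sketch (IR names invented): if the previous lane settled on
|      | // "load %A0" for this operand index and the current lane offers
|      | // {"load %A1", "constant 5"}, Load mode runs the look-ahead score of each
|      | // unused candidate against %A0, the consecutive load wins and its index is
|      | // returned; under Constant or Splat modes a constant or a repeat of the
|      | // previous value would be preferred instead.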
| 2755 | |
| 2756 | /// Helper for reorderOperandVecs. |
| 2757 | /// \returns the lane that we should start reordering from. This is the one |
| 2758 | /// which has the fewest operands that can freely move about, or which is less
| 2759 | /// profitable to reorder because it already has the most optimal set of operands.
| 2760 | unsigned getBestLaneToStartReordering() const { |
| 2761 | unsigned Min = UINT_MAX; |
| 2762 | unsigned SameOpNumber = 0; |
| 2763 | // std::pair<unsigned, unsigned> is used to implement a simple voting |
| 2764 | // algorithm and choose the lane with the fewest operands that can freely
| 2765 | // move about, or that is less profitable to reorder because it already has
| 2766 | // the most optimal set of operands. The first unsigned is a counter for
| 2767 | // voting, the second unsigned is the counter of lanes with instructions |
| 2768 | // with same/alternate opcodes and same parent basic block. |
| 2769 | MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap; |
| 2770 | // Try to be closer to the original results, if we have multiple lanes |
| 2771 | // with same cost. If 2 lanes have the same cost, use the one with the |
| 2772 | // highest index. |
| 2773 | for (int I = getNumLanes(); I > 0; --I) { |
| 2774 | unsigned Lane = I - 1; |
| 2775 | OperandsOrderData NumFreeOpsHash = |
| 2776 | getMaxNumOperandsThatCanBeReordered(Lane); |
| 2777 | // Compare the number of operands that can move and choose the one with |
| 2778 | // the least number. |
| 2779 | if (NumFreeOpsHash.NumOfAPOs < Min) { |
| 2780 | Min = NumFreeOpsHash.NumOfAPOs; |
| 2781 | SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; |
| 2782 | HashMap.clear(); |
| 2783 | HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane); |
| 2784 | } else if (NumFreeOpsHash.NumOfAPOs == Min && |
| 2785 | NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) { |
| 2786 | // Select the most optimal lane in terms of number of operands that |
| 2787 | // should be moved around. |
| 2788 | SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent; |
| 2789 | HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane); |
| 2790 | } else if (NumFreeOpsHash.NumOfAPOs == Min && |
| 2791 | NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) { |
| 2792 | auto [It, Inserted] = |
| 2793 | HashMap.try_emplace(Key: NumFreeOpsHash.Hash, Args: 1, Args&: Lane); |
| 2794 | if (!Inserted) |
| 2795 | ++It->second.first; |
| 2796 | } |
| 2797 | } |
| 2798 | // Select the lane with the minimum counter. |
| 2799 | unsigned BestLane = 0; |
| 2800 | unsigned CntMin = UINT_MAX; |
| 2801 | for (const auto &Data : reverse(C&: HashMap)) { |
| 2802 | if (Data.second.first < CntMin) { |
| 2803 | CntMin = Data.second.first; |
| 2804 | BestLane = Data.second.second; |
| 2805 | } |
| 2806 | } |
| 2807 | return BestLane; |
| 2808 | } |
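|      | // Selection sketch (lane contents invented): a lane whose operation is a
|      | // subtraction has one true-APO and one false-APO operand, so its NumOfAPOs
|      | // is smaller than that of a lane with a commutative addition; such a lane
|      | // therefore tends to win the vote and becomes the starting point, matching
|      | // the example in reorder() below.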
| 2809 | |
| 2810 | /// Data structure that helps to reorder operands. |
| 2811 | struct OperandsOrderData { |
| 2812 | /// The best number of operands with the same APOs, which can be |
| 2813 | /// reordered. |
| 2814 | unsigned NumOfAPOs = UINT_MAX; |
| 2815 | /// Number of operands with the same/alternate instruction opcode and |
| 2816 | /// parent. |
| 2817 | unsigned NumOpsWithSameOpcodeParent = 0; |
| 2818 | /// Hash for the actual operands ordering. |
| 2819 | /// Used to encode the operands, actually their position ids and opcode
| 2820 | /// values. It is used in the voting mechanism to find the lane with the
| 2821 | /// fewest operands that can freely move about, or that is less profitable to
| 2822 | /// reorder because it already has the most optimal set of operands. Could be
| 2823 | /// replaced with a SmallVector<unsigned> instead, but a hash code is faster
| 2824 | /// and requires less memory.
| 2825 | unsigned Hash = 0; |
| 2826 | }; |
| 2827 | /// \returns the maximum number of operands that are allowed to be reordered |
| 2828 | /// for \p Lane and the number of compatible instructions (with the same
| 2829 | /// parent/opcode). This is used as a heuristic for selecting the first lane |
| 2830 | /// to start operand reordering. |
| 2831 | OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const { |
| 2832 | unsigned CntTrue = 0; |
| 2833 | unsigned NumOperands = getNumOperands(); |
| 2834 | // Operands with the same APO can be reordered. We therefore need to count |
| 2835 | // how many of them we have for each APO, like this: Cnt[APO] = x. |
| 2836 | // Since we only have two APOs, namely true and false, we can avoid using |
| 2837 | // a map. Instead we can simply count the number of operands that |
| 2838 | // correspond to one of them (in this case the 'true' APO), and calculate |
| 2839 | // the other by subtracting it from the total number of operands. |
| 2840 | // Operands with the same instruction opcode and parent are more |
| 2841 | // profitable since we don't need to move them in many cases; with high
| 2842 | // probability such a lane can already be vectorized effectively.
| 2843 | bool AllUndefs = true; |
| 2844 | unsigned NumOpsWithSameOpcodeParent = 0; |
| 2845 | Instruction *OpcodeI = nullptr; |
| 2846 | BasicBlock *Parent = nullptr; |
| 2847 | unsigned Hash = 0; |
| 2848 | for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { |
| 2849 | const OperandData &OpData = getData(OpIdx, Lane); |
| 2850 | if (OpData.APO) |
| 2851 | ++CntTrue; |
| 2852 | // Use Boyer-Moore majority voting for finding the majority opcode and |
| 2853 | // the number of times it occurs. |
| 2854 | if (auto *I = dyn_cast<Instruction>(Val: OpData.V)) { |
| 2855 | if (!OpcodeI || !getSameOpcode(VL: {OpcodeI, I}, TLI) || |
| 2856 | I->getParent() != Parent) { |
| 2857 | if (NumOpsWithSameOpcodeParent == 0) { |
| 2858 | NumOpsWithSameOpcodeParent = 1; |
| 2859 | OpcodeI = I; |
| 2860 | Parent = I->getParent(); |
| 2861 | } else { |
| 2862 | --NumOpsWithSameOpcodeParent; |
| 2863 | } |
| 2864 | } else { |
| 2865 | ++NumOpsWithSameOpcodeParent; |
| 2866 | } |
| 2867 | } |
| 2868 | Hash = hash_combine( |
| 2869 | args: Hash, args: hash_value(value: (OpIdx + 1) * (OpData.V->getValueID() + 1))); |
| 2870 | AllUndefs = AllUndefs && isa<UndefValue>(Val: OpData.V); |
| 2871 | } |
| 2872 | if (AllUndefs) |
| 2873 | return {}; |
| 2874 | OperandsOrderData Data; |
| 2875 | Data.NumOfAPOs = std::max(a: CntTrue, b: NumOperands - CntTrue); |
| 2876 | Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent; |
| 2877 | Data.Hash = Hash; |
| 2878 | return Data; |
| 2879 | } |
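|      | // Boyer-Moore sketch (a hypothetical lane with three operands {add, add,
|      | // load} from one block): the first add becomes the candidate, the second
|      | // add raises the counter to 2, and the load drops it back to 1, so
|      | // NumOpsWithSameOpcodeParent ends up as "majority minus the rest"; CntTrue
|      | // independently counts the true-APO operands and NumOfAPOs keeps the larger
|      | // of the two APO groups.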
| 2880 | |
| 2881 | /// Go through the instructions in VL and append their operands. |
| 2882 | void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands, |
| 2883 | const InstructionsState &S) { |
| 2884 | assert(!Operands.empty() && !VL.empty() && "Bad list of operands" ); |
| 2885 | assert((empty() || all_of(Operands, |
| 2886 | [this](const ValueList &VL) { |
| 2887 | return VL.size() == getNumLanes(); |
| 2888 | })) && |
| 2889 | "Expected same number of lanes" ); |
| 2890 | assert(S.valid() && "InstructionsState is invalid." ); |
| 2891 | // IntrinsicInst::isCommutative returns true if swapping the first "two" |
| 2892 | // arguments to the intrinsic produces the same result. |
| 2893 | constexpr unsigned IntrinsicNumOperands = 2; |
| 2894 | Instruction *MainOp = S.getMainOp(); |
| 2895 | unsigned NumOperands = MainOp->getNumOperands(); |
| 2896 | ArgSize = isa<IntrinsicInst>(Val: MainOp) ? IntrinsicNumOperands : NumOperands; |
| 2897 | OpsVec.resize(N: ArgSize); |
| 2898 | unsigned NumLanes = VL.size(); |
| 2899 | for (OperandDataVec &Ops : OpsVec) |
| 2900 | Ops.resize(N: NumLanes); |
| 2901 | for (unsigned Lane : seq<unsigned>(Size: NumLanes)) { |
| 2902 | Value *V = VL[Lane]; |
| 2903 | assert((isa<Instruction>(V) || isa<PoisonValue>(V)) && |
| 2904 | "Expected instruction or poison value" ); |
| 2905 | // Our tree has just 3 nodes: the root and two operands. |
| 2906 | // It is therefore trivial to get the APO. We only need to check the |
| 2907 | // opcode of V and whether the operand at OpIdx is the LHS or RHS |
| 2908 | // operand. The LHS operand of both add and sub is never attached to an |
| 2909 | // inverse operation in the linearized form, therefore its APO is
| 2910 | // false. The RHS's APO is true only if V is an inverse operation.
| 2911 | |
| 2912 | // Since operand reordering is performed on groups of commutative |
| 2913 | // operations or alternating sequences (e.g., +, -), we can safely tell |
| 2914 | // the inverse operations by checking commutativity. |
| 2915 | if (isa<PoisonValue>(Val: V)) { |
| 2916 | for (unsigned OpIdx : seq<unsigned>(Size: NumOperands)) |
| 2917 | OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false}; |
| 2918 | continue; |
| 2919 | } |
| 2920 | auto [SelectedOp, Ops] = convertTo(I: cast<Instruction>(Val: V), S); |
| 2921 | // We cannot check commutativity by the converted instruction |
| 2922 | // (SelectedOp) because isCommutative also examines def-use |
| 2923 | // relationships. |
| 2924 | bool IsInverseOperation = |
| 2925 | !isCommutative(I: SelectedOp, InstWithUses: cast<Instruction>(Val: V)); |
| 2926 | for (unsigned OpIdx : seq<unsigned>(Size: ArgSize)) { |
| 2927 | bool APO = (OpIdx == 0) ? false : IsInverseOperation; |
| 2928 | OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false}; |
| 2929 | } |
| 2930 | } |
| 2931 | } |
| 2932 | |
| 2933 | /// \returns the number of operands. |
| 2934 | unsigned getNumOperands() const { return ArgSize; } |
| 2935 | |
| 2936 | /// \returns the number of lanes. |
| 2937 | unsigned getNumLanes() const { return OpsVec[0].size(); } |
| 2938 | |
| 2939 | /// \returns the operand value at \p OpIdx and \p Lane. |
| 2940 | Value *getValue(unsigned OpIdx, unsigned Lane) const { |
| 2941 | return getData(OpIdx, Lane).V; |
| 2942 | } |
| 2943 | |
| 2944 | /// \returns true if the data structure is empty. |
| 2945 | bool empty() const { return OpsVec.empty(); } |
| 2946 | |
| 2947 | /// Clears the data. |
| 2948 | void clear() { OpsVec.clear(); } |
| 2949 | |
| 2950 | /// \Returns true if there are enough operands identical to \p Op to fill |
| 2951 | /// the whole vector (possibly mixed with constants or loop-invariant values).
| 2952 | /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
| 2953 | bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) { |
| 2954 | assert(Op == getValue(OpIdx, Lane) && |
| 2955 | "Op is expected to be getValue(OpIdx, Lane)." ); |
| 2956 | // Small number of loads - try load matching. |
| 2957 | if (isa<LoadInst>(Val: Op) && getNumLanes() == 2 && getNumOperands() == 2) |
| 2958 | return false; |
| 2959 | bool OpAPO = getData(OpIdx, Lane).APO; |
| 2960 | bool IsInvariant = L && L->isLoopInvariant(V: Op); |
| 2961 | unsigned Cnt = 0; |
| 2962 | for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) { |
| 2963 | if (Ln == Lane) |
| 2964 | continue; |
| 2965 | // This is set to true if we found a candidate for broadcast at Lane. |
| 2966 | bool FoundCandidate = false; |
| 2967 | for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) { |
| 2968 | OperandData &Data = getData(OpIdx: OpI, Lane: Ln); |
| 2969 | if (Data.APO != OpAPO || Data.IsUsed) |
| 2970 | continue; |
| 2971 | Value *OpILane = getValue(OpIdx: OpI, Lane); |
| 2972 | bool IsConstantOp = isa<Constant>(Val: OpILane); |
| 2973 | // Consider the broadcast candidate if: |
| 2974 | // 1. Same value is found in one of the operands. |
| 2975 | if (Data.V == Op || |
| 2976 | // 2. The operand in the given lane is not constant but there is a |
| 2977 | // constant operand in another lane (which can be moved to the |
| 2978 | // given lane). In this case we can represent it as a simple |
| 2979 | // permutation of constant and broadcast. |
| 2980 | (!IsConstantOp && |
| 2981 | ((Lns > 2 && isa<Constant>(Val: Data.V)) || |
| 2982 | // 2.1. If we have only 2 lanes, need to check that value in the |
| 2983 | // next lane does not build same opcode sequence. |
| 2984 | (Lns == 2 && |
| 2985 | !getSameOpcode(VL: {Op, getValue(OpIdx: (OpI + 1) % OpE, Lane: Ln)}, TLI) && |
| 2986 | isa<Constant>(Val: Data.V)))) || |
| 2987 | // 3. The operand in the current lane is loop invariant (can be |
| 2988 | // hoisted out) and another operand is also a loop invariant |
| 2989 | // (though not a constant). In this case the whole vector can be |
| 2990 | // hoisted out. |
| 2991 | // FIXME: need to teach the cost model about this case for better |
| 2992 | // estimation. |
| 2993 | (IsInvariant && !isa<Constant>(Val: Data.V) && |
| 2994 | !getSameOpcode(VL: {Op, Data.V}, TLI) && |
| 2995 | L->isLoopInvariant(V: Data.V))) { |
| 2996 | FoundCandidate = true; |
| 2997 | Data.IsUsed = Data.V == Op; |
| 2998 | if (Data.V == Op) |
| 2999 | ++Cnt; |
| 3000 | break; |
| 3001 | } |
| 3002 | } |
| 3003 | if (!FoundCandidate) |
| 3004 | return false; |
| 3005 | } |
| 3006 | return getNumLanes() == 2 || Cnt > 1; |
| 3007 | } |
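|      | // Broadcast sketch (values invented): with 4 lanes and one operand holding
|      | // {%x, %x, 7, %x}, the %x from lane 0 is found again in two other lanes and
|      | // a movable constant in the third, so the operand can be emitted as a
|      | // broadcast of %x plus a small permutation with the constant and
|      | // shouldBroadcast() returns true; for a 2-lane, 2-operand bundle whose
|      | // operand is a load it bails out early so load matching can be tried
|      | // instead.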
| 3008 | |
| 3009 | /// Checks if there is at least one operand in a lane other than \p Lane that
| 3010 | /// is compatible with the operand \p Op.
| 3011 | bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const { |
| 3012 | assert(Op == getValue(OpIdx, Lane) && |
| 3013 | "Op is expected to be getValue(OpIdx, Lane)." ); |
| 3014 | bool OpAPO = getData(OpIdx, Lane).APO; |
| 3015 | for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) { |
| 3016 | if (Ln == Lane) |
| 3017 | continue; |
| 3018 | if (any_of(Range: seq<unsigned>(Size: getNumOperands()), P: [&](unsigned OpI) { |
| 3019 | const OperandData &Data = getData(OpIdx: OpI, Lane: Ln); |
| 3020 | if (Data.APO != OpAPO || Data.IsUsed) |
| 3021 | return true; |
| 3022 | Value *OpILn = getValue(OpIdx: OpI, Lane: Ln); |
| 3023 | return (L && L->isLoopInvariant(V: OpILn)) || |
| 3024 | (getSameOpcode(VL: {Op, OpILn}, TLI) && |
| 3025 | allSameBlock(VL: {Op, OpILn})); |
| 3026 | })) |
| 3027 | return true; |
| 3028 | } |
| 3029 | return false; |
| 3030 | } |
| 3031 | |
| 3032 | public: |
| 3033 | /// Initialize with all the operands of the instruction vector \p RootVL. |
| 3034 | VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands, |
| 3035 | const InstructionsState &S, const BoUpSLP &R) |
| 3036 | : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R), |
| 3037 | L(R.LI->getLoopFor(BB: S.getMainOp()->getParent())) { |
| 3038 | // Append all the operands of RootVL. |
| 3039 | appendOperands(VL: RootVL, Operands, S); |
| 3040 | } |
| 3041 | |
| 3042 | /// \Returns a value vector with the operands across all lanes for the |
| 3043 | /// operand at \p OpIdx.
| 3044 | ValueList getVL(unsigned OpIdx) const { |
| 3045 | ValueList OpVL(OpsVec[OpIdx].size()); |
| 3046 | assert(OpsVec[OpIdx].size() == getNumLanes() && |
| 3047 | "Expected same num of lanes across all operands" ); |
| 3048 | for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane) |
| 3049 | OpVL[Lane] = OpsVec[OpIdx][Lane].V; |
| 3050 | return OpVL; |
| 3051 | } |
| 3052 | |
| 3053 | // Performs operand reordering for 2 or more operands. |
| 3054 | // The original operands are in OrigOps[OpIdx][Lane]. |
| 3055 | // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'. |
| 3056 | void reorder() { |
| 3057 | unsigned NumOperands = getNumOperands(); |
| 3058 | unsigned NumLanes = getNumLanes(); |
| 3059 | // Each operand has its own mode. We are using this mode to help us select |
| 3060 | // the instructions for each lane, so that they match best with the ones |
| 3061 | // we have selected so far. |
| 3062 | SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands); |
| 3063 | |
| 3064 | // This is a greedy single-pass algorithm. We are going over each lane |
| 3065 | // once and deciding on the best order right away with no back-tracking. |
| 3066 | // However, in order to increase its effectiveness, we start with the lane |
| 3067 | // that has operands that can move the least. For example, given the |
| 3068 | // following lanes: |
| 3069 | // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd |
| 3070 | // Lane 1 : A[1] = C[1] - B[1] // Visited 1st |
| 3071 | // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd |
| 3072 | // Lane 3 : A[3] = C[3] - B[3] // Visited 4th |
| 3073 | // we will start at Lane 1, since the operands of the subtraction cannot |
| 3074 | // be reordered. Then we will visit the rest of the lanes in a circular |
| 3075 | // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3. |
| 3076 | |
| 3077 | // Find the first lane that we will start our search from. |
| 3078 | unsigned FirstLane = getBestLaneToStartReordering(); |
| 3079 | |
| 3080 | // Initialize the modes. |
| 3081 | for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { |
| 3082 | Value *OpLane0 = getValue(OpIdx, Lane: FirstLane); |
| 3083 | // Keep track if we have instructions with all the same opcode on one |
| 3084 | // side. |
| 3085 | if (auto *OpILane0 = dyn_cast<Instruction>(Val: OpLane0)) { |
| 3086 | // Check if OpLane0 should be broadcast. |
| 3087 | if (shouldBroadcast(Op: OpLane0, OpIdx, Lane: FirstLane) || |
| 3088 | !canBeVectorized(Op: OpILane0, OpIdx, Lane: FirstLane)) |
| 3089 | ReorderingModes[OpIdx] = ReorderingMode::Splat; |
| 3090 | else if (isa<LoadInst>(Val: OpILane0)) |
| 3091 | ReorderingModes[OpIdx] = ReorderingMode::Load; |
| 3092 | else |
| 3093 | ReorderingModes[OpIdx] = ReorderingMode::Opcode; |
| 3094 | } else if (isa<Constant>(Val: OpLane0)) { |
| 3095 | ReorderingModes[OpIdx] = ReorderingMode::Constant; |
| 3096 | } else if (isa<Argument>(Val: OpLane0)) { |
| 3097 | // Our best hope is a Splat. It may save some cost in some cases. |
| 3098 | ReorderingModes[OpIdx] = ReorderingMode::Splat; |
| 3099 | } else { |
| 3100 | llvm_unreachable("Unexpected value kind." ); |
| 3101 | } |
| 3102 | } |
| 3103 | |
| 3104 | // Check that we don't have the same operands. No need to reorder if the
| 3105 | // operands are just a perfect or shuffled diamond match. Skip this only
| 3106 | // for possible broadcasts or a non-power-of-2 number of scalars (just for
| 3107 | // now).
| 3108 | auto &&SkipReordering = [this]() { |
| 3109 | SmallPtrSet<Value *, 4> UniqueValues; |
| 3110 | ArrayRef<OperandData> Op0 = OpsVec.front(); |
| 3111 | for (const OperandData &Data : Op0) |
| 3112 | UniqueValues.insert(Ptr: Data.V); |
| 3113 | for (ArrayRef<OperandData> Op : |
| 3114 | ArrayRef(OpsVec).slice(N: 1, M: getNumOperands() - 1)) { |
| 3115 | if (any_of(Range&: Op, P: [&UniqueValues](const OperandData &Data) { |
| 3116 | return !UniqueValues.contains(Ptr: Data.V); |
| 3117 | })) |
| 3118 | return false; |
| 3119 | } |
| 3120 | // TODO: Check if we can remove a check for non-power-2 number of |
| 3121 | // scalars after full support of non-power-2 vectorization. |
| 3122 | return UniqueValues.size() != 2 && |
| 3123 | hasFullVectorsOrPowerOf2(TTI: *R.TTI, Ty: Op0.front().V->getType(), |
| 3124 | Sz: UniqueValues.size()); |
| 3125 | }; |
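|      | // Diamond-match sketch (names invented): with 4 lanes, operand 0 of
|      | // {%a, %b, %c, %d} and operand 1 of {%d, %c, %b, %a} reuse exactly the same
|      | // values, the unique count (4) is a power of 2 and not the special
|      | // two-value broadcast-like case, so SkipReordering() fires and the lanes
|      | // are left as a shuffled diamond match.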
| 3126 | |
| 3127 | // If the initial strategy fails for any of the operand indexes, then we |
| 3128 | // perform reordering again in a second pass. This helps avoid assigning |
| 3129 | // high priority to the failed strategy, and should improve reordering for |
| 3130 | // the non-failed operand indexes. |
| 3131 | for (int Pass = 0; Pass != 2; ++Pass) { |
| 3132 | // Check if there is no need to reorder the operands, since they are a
| 3133 | // perfect or shuffled diamond match.
| 3134 | // Need to do it to avoid extra external use cost counting for |
| 3135 | // shuffled matches, which may cause regressions. |
| 3136 | if (SkipReordering()) |
| 3137 | break; |
| 3138 | // Skip the second pass if the first pass did not fail. |
| 3139 | bool StrategyFailed = false; |
| 3140 | // Mark all operand data as free to use. |
| 3141 | clearUsed(); |
| 3142 | // We keep the original operand order for the FirstLane, so reorder the |
| 3143 | // rest of the lanes. We are visiting the nodes in a circular fashion, |
| 3144 | // using FirstLane as the center point and increasing the radius |
| 3145 | // distance. |
| 3146 | SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands); |
| 3147 | for (unsigned I = 0; I < NumOperands; ++I) |
| 3148 | MainAltOps[I].push_back(Elt: getData(OpIdx: I, Lane: FirstLane).V); |
| 3149 | |
| 3150 | SmallBitVector UsedLanes(NumLanes); |
| 3151 | UsedLanes.set(FirstLane); |
| 3152 | for (unsigned Distance = 1; Distance != NumLanes; ++Distance) { |
| 3153 | // Visit the lane on the right and then the lane on the left. |
| 3154 | for (int Direction : {+1, -1}) { |
| 3155 | int Lane = FirstLane + Direction * Distance; |
| 3156 | if (Lane < 0 || Lane >= (int)NumLanes) |
| 3157 | continue; |
| 3158 | UsedLanes.set(Lane); |
| 3159 | int LastLane = Lane - Direction; |
| 3160 | assert(LastLane >= 0 && LastLane < (int)NumLanes && |
| 3161 | "Out of bounds" ); |
| 3162 | // Look for a good match for each operand. |
| 3163 | for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { |
| 3164 | // Search for the operand that matches SortedOps[OpIdx][Lane-1]. |
| 3165 | std::optional<unsigned> BestIdx = |
| 3166 | getBestOperand(OpIdx, Lane, LastLane, ReorderingModes, |
| 3167 | MainAltOps: MainAltOps[OpIdx], UsedLanes); |
| 3168 | // By not selecting a value, we allow the operands that follow to |
| 3169 | // select a better matching value. We will get a non-null value in |
| 3170 | // the next run of getBestOperand(). |
| 3171 | if (BestIdx) { |
| 3172 | // Swap the current operand with the one returned by |
| 3173 | // getBestOperand(). |
| 3174 | swap(OpIdx1: OpIdx, OpIdx2: *BestIdx, Lane); |
| 3175 | } else { |
| 3176 | // Enable the second pass. |
| 3177 | StrategyFailed = true; |
| 3178 | } |
| 3179 | // Try to get the alternate opcode and follow it during analysis. |
| 3180 | if (MainAltOps[OpIdx].size() != 2) { |
| 3181 | OperandData &AltOp = getData(OpIdx, Lane); |
| 3182 | InstructionsState OpS = |
| 3183 | getSameOpcode(VL: {MainAltOps[OpIdx].front(), AltOp.V}, TLI); |
| 3184 | if (OpS && OpS.isAltShuffle()) |
| 3185 | MainAltOps[OpIdx].push_back(Elt: AltOp.V); |
| 3186 | } |
| 3187 | } |
| 3188 | } |
| 3189 | } |
| 3190 | // Skip second pass if the strategy did not fail. |
| 3191 | if (!StrategyFailed) |
| 3192 | break; |
| 3193 | } |
| 3194 | } |
| 3195 | |
| 3196 | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
| 3197 | LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) { |
| 3198 | switch (RMode) { |
| 3199 | case ReorderingMode::Load: |
| 3200 | return "Load" ; |
| 3201 | case ReorderingMode::Opcode: |
| 3202 | return "Opcode" ; |
| 3203 | case ReorderingMode::Constant: |
| 3204 | return "Constant" ; |
| 3205 | case ReorderingMode::Splat: |
| 3206 | return "Splat" ; |
| 3207 | case ReorderingMode::Failed: |
| 3208 | return "Failed" ; |
| 3209 | } |
| 3210 | llvm_unreachable("Unimplemented Reordering Type" ); |
| 3211 | } |
| 3212 | |
| 3213 | LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode, |
| 3214 | raw_ostream &OS) { |
| 3215 | return OS << getModeStr(RMode); |
| 3216 | } |
| 3217 | |
| 3218 | /// Debug print. |
| 3219 | LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) { |
| 3220 | printMode(RMode, dbgs()); |
| 3221 | } |
| 3222 | |
| 3223 | friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) { |
| 3224 | return printMode(RMode, OS); |
| 3225 | } |
| 3226 | |
| 3227 | LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const { |
| 3228 | const unsigned Indent = 2; |
| 3229 | unsigned Cnt = 0; |
| 3230 | for (const OperandDataVec &OpDataVec : OpsVec) { |
| 3231 | OS << "Operand " << Cnt++ << "\n" ; |
| 3232 | for (const OperandData &OpData : OpDataVec) { |
| 3233 | OS.indent(Indent) << "{" ; |
| 3234 | if (Value *V = OpData.V) |
| 3235 | OS << *V; |
| 3236 | else |
| 3237 | OS << "null" ; |
| 3238 | OS << ", APO:" << OpData.APO << "}\n" ; |
| 3239 | } |
| 3240 | OS << "\n" ; |
| 3241 | } |
| 3242 | return OS; |
| 3243 | } |
| 3244 | |
| 3245 | /// Debug print. |
| 3246 | LLVM_DUMP_METHOD void dump() const { print(dbgs()); } |
| 3247 | #endif |
| 3248 | }; |
| 3249 | |
| 3250 | /// Evaluate each pair in \p Candidates and return the index into \p Candidates
| 3251 | /// for the pair with the highest score, deemed to have the best chance to form
| 3252 | /// the root of a profitable tree to vectorize. Return std::nullopt if no candidate
| 3253 | /// scored above LookAheadHeuristics::ScoreFail. \param Limit Lower limit of
| 3254 | /// the score considered to be good enough.
| 3255 | std::optional<int> |
| 3256 | findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates, |
| 3257 | int Limit = LookAheadHeuristics::ScoreFail) const { |
| 3258 | LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2, |
| 3259 | RootLookAheadMaxDepth); |
| 3260 | int BestScore = Limit; |
| 3261 | std::optional<int> Index; |
| 3262 | for (int I : seq<int>(Begin: 0, End: Candidates.size())) { |
| 3263 | int Score = LookAhead.getScoreAtLevelRec(LHS: Candidates[I].first, |
| 3264 | RHS: Candidates[I].second, |
| 3265 | /*U1=*/nullptr, /*U2=*/nullptr, |
| 3266 | /*CurrLevel=*/1, MainAltOps: {}); |
| 3267 | if (Score > BestScore) { |
| 3268 | BestScore = Score; |
| 3269 | Index = I; |
| 3270 | } |
| 3271 | } |
| 3272 | return Index; |
| 3273 | } |
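|      | // Usage sketch (candidate pairs invented): given root candidates
|      | // {(%p0, %q0), (%p0, %q1)}, each pair is scored with a 2-lane look-ahead up
|      | // to RootLookAheadMaxDepth and the index of the highest-scoring pair strictly
|      | // above \p Limit is returned, or std::nullopt if none qualifies.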
| 3274 | |
| 3275 | /// Checks if the instruction is marked for deletion. |
| 3276 | bool isDeleted(Instruction *I) const { return DeletedInstructions.count(V: I); } |
| 3277 | |
| 3278 | /// Removes an instruction from its block and eventually deletes it. |
| 3279 | /// It's like Instruction::eraseFromParent() except that the actual deletion |
| 3280 | /// is delayed until BoUpSLP is destructed. |
| 3281 | void eraseInstruction(Instruction *I) { |
| 3282 | DeletedInstructions.insert(V: I); |
| 3283 | } |
| 3284 | |
| 3285 | /// Remove instructions from the parent function and clear the operands of \p |
| 3286 | /// DeadVals instructions, marking trivially dead operands for deletion.
| 3287 | template <typename T> |
| 3288 | void removeInstructionsAndOperands( |
| 3289 | ArrayRef<T *> DeadVals, |
| 3290 | ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) { |
| 3291 | SmallVector<WeakTrackingVH> DeadInsts; |
| 3292 | for (T *V : DeadVals) { |
| 3293 | auto *I = cast<Instruction>(V); |
| 3294 | eraseInstruction(I); |
| 3295 | } |
| 3296 | DenseSet<Value *> Processed; |
| 3297 | for (T *V : DeadVals) { |
| 3298 | if (!V || !Processed.insert(V).second) |
| 3299 | continue; |
| 3300 | auto *I = cast<Instruction>(V); |
| 3301 | salvageDebugInfo(*I); |
| 3302 | ArrayRef<TreeEntry *> Entries = getTreeEntries(V: I); |
| 3303 | for (Use &U : I->operands()) { |
| 3304 | if (auto *OpI = dyn_cast_if_present<Instruction>(Val: U.get()); |
| 3305 | OpI && !DeletedInstructions.contains(V: OpI) && OpI->hasOneUser() && |
| 3306 | wouldInstructionBeTriviallyDead(I: OpI, TLI) && |
| 3307 | (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) { |
| 3308 | return Entry->VectorizedValue == OpI; |
| 3309 | }))) |
| 3310 | DeadInsts.push_back(Elt: OpI); |
| 3311 | } |
| 3312 | I->dropAllReferences(); |
| 3313 | } |
| 3314 | for (T *V : DeadVals) { |
| 3315 | auto *I = cast<Instruction>(V); |
| 3316 | if (!I->getParent()) |
| 3317 | continue; |
| 3318 | assert((I->use_empty() || all_of(I->uses(), |
| 3319 | [&](Use &U) { |
| 3320 | return isDeleted( |
| 3321 | cast<Instruction>(U.getUser())); |
| 3322 | })) && |
| 3323 | "trying to erase instruction with users." ); |
| 3324 | I->removeFromParent(); |
| 3325 | SE->forgetValue(V: I); |
| 3326 | } |
| 3327 | // Process the dead instruction list until empty. |
| 3328 | while (!DeadInsts.empty()) { |
| 3329 | Value *V = DeadInsts.pop_back_val(); |
| 3330 | Instruction *VI = cast_or_null<Instruction>(Val: V); |
| 3331 | if (!VI || !VI->getParent()) |
| 3332 | continue; |
| 3333 | assert(isInstructionTriviallyDead(VI, TLI) && |
| 3334 | "Live instruction found in dead worklist!" ); |
| 3335 | assert(VI->use_empty() && "Instructions with uses are not dead." ); |
| 3336 | |
| 3337 | // Don't lose the debug info while deleting the instructions. |
| 3338 | salvageDebugInfo(I&: *VI); |
| 3339 | |
| 3340 | // Null out all of the instruction's operands to see if any operand |
| 3341 | // becomes dead as we go. |
| 3342 | for (Use &OpU : VI->operands()) { |
| 3343 | Value *OpV = OpU.get(); |
| 3344 | if (!OpV) |
| 3345 | continue; |
| 3346 | OpU.set(nullptr); |
| 3347 | |
| 3348 | if (!OpV->use_empty()) |
| 3349 | continue; |
| 3350 | |
| 3351 | // If the operand is an instruction that became dead as we nulled out |
| 3352 | // the operand, and if it is 'trivially' dead, delete it in a future |
| 3353 | // loop iteration. |
| 3354 | if (auto *OpI = dyn_cast<Instruction>(Val: OpV)) |
| 3355 | if (!DeletedInstructions.contains(V: OpI) && |
| 3356 | (!OpI->getType()->isVectorTy() || |
| 3357 | none_of(VectorValuesAndScales, |
| 3358 | [&](const std::tuple<Value *, unsigned, bool> &V) { |
| 3359 | return std::get<0>(t: V) == OpI; |
| 3360 | })) && |
| 3361 | isInstructionTriviallyDead(I: OpI, TLI)) |
| 3362 | DeadInsts.push_back(Elt: OpI); |
| 3363 | } |
| 3364 | |
| 3365 | VI->removeFromParent(); |
| 3366 | eraseInstruction(I: VI); |
| 3367 | SE->forgetValue(V: VI); |
| 3368 | } |
| 3369 | } |
| 3370 | |
| 3371 | /// Checks if the instruction was already analyzed for being possible |
| 3372 | /// reduction root. |
| 3373 | bool isAnalyzedReductionRoot(Instruction *I) const { |
| 3374 | return AnalyzedReductionsRoots.count(Ptr: I); |
| 3375 | } |
| 3376 | /// Register given instruction as already analyzed for being possible |
| 3377 | /// reduction root. |
| 3378 | void analyzedReductionRoot(Instruction *I) { |
| 3379 | AnalyzedReductionsRoots.insert(Ptr: I); |
| 3380 | } |
| 3381 | /// Checks if the provided list of reduced values was checked already for |
| 3382 | /// vectorization. |
| 3383 | bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const { |
| 3384 | return AnalyzedReductionVals.contains(V: hash_value(S: VL)); |
| 3385 | } |
| 3386 | /// Adds the list of reduced values to list of already checked values for the |
| 3387 | /// vectorization. |
| 3388 | void analyzedReductionVals(ArrayRef<Value *> VL) { |
| 3389 | AnalyzedReductionVals.insert(V: hash_value(S: VL)); |
| 3390 | } |
| 3391 | /// Clear the list of the analyzed reduction root instructions. |
| 3392 | void clearReductionData() { |
| 3393 | AnalyzedReductionsRoots.clear(); |
| 3394 | AnalyzedReductionVals.clear(); |
| 3395 | AnalyzedMinBWVals.clear(); |
| 3396 | } |
| 3397 | /// Checks if any of the given values is gathered in one of the nodes.
| 3398 | bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const { |
| 3399 | return any_of(Range: MustGather, P: [&](Value *V) { return Vals.contains(V); }); |
| 3400 | } |
| 3401 | /// Checks if the given value is gathered in one of the nodes. |
| 3402 | bool isGathered(const Value *V) const { |
| 3403 | return MustGather.contains(Ptr: V); |
| 3404 | } |
| 3405 | /// Checks if the specified value was not scheduled.
| 3406 | bool isNotScheduled(const Value *V) const { |
| 3407 | return NonScheduledFirst.contains(Ptr: V); |
| 3408 | } |
| 3409 | |
| 3410 | /// Check if the value is vectorized in the tree. |
| 3411 | bool isVectorized(const Value *V) const { |
| 3412 | assert(V && "V cannot be nullptr." ); |
| 3413 | return ScalarToTreeEntries.contains(Val: V); |
| 3414 | } |
| 3415 | |
| 3416 | ~BoUpSLP(); |
| 3417 | |
| 3418 | private: |
| 3419 | /// Determine if a node \p E can be demoted to a smaller type with a
| 3420 | /// truncation. We collect the entries that will be demoted in ToDemote. |
| 3421 | /// \param E Node for analysis |
| 3422 | /// \param ToDemote indices of the nodes to be demoted. |
| 3423 | bool collectValuesToDemote( |
| 3424 | const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth, |
| 3425 | SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited, |
| 3426 | const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel, |
| 3427 | bool &IsProfitableToDemote, bool IsTruncRoot) const; |
| 3428 | |
| 3429 | /// Builds the list of reorderable operands on the edges \p Edges of the \p |
| 3430 | /// UserTE, which allow reordering (i.e. the operands can be reordered because |
| 3431 | /// they have only one user and are reorderable).
| 3432 | /// \param ReorderableGathers List of all gather nodes that require reordering |
| 3433 | /// (e.g., gather of extractelements or partially vectorizable loads).
| 3434 | /// \param GatherOps List of gather operand nodes for \p UserTE that require |
| 3435 | /// reordering, subset of \p NonVectorized. |
| 3436 | void buildReorderableOperands( |
| 3437 | TreeEntry *UserTE, |
| 3438 | SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges, |
| 3439 | const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers, |
| 3440 | SmallVectorImpl<TreeEntry *> &GatherOps); |
| 3441 | |
| 3442 | /// Checks if the given \p TE is a gather node with clustered reused scalars |
| 3443 | /// and reorders it per given \p Mask. |
| 3444 | void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const; |
| 3445 | |
| 3446 | /// Checks if all users of \p I are the part of the vectorization tree. |
| 3447 | bool areAllUsersVectorized( |
| 3448 | Instruction *I, |
| 3449 | const SmallDenseSet<Value *> *VectorizedVals = nullptr) const; |
| 3450 | |
| 3451 | /// Return information about the vector formed for the specified index |
| 3452 | /// of a vector of (the same) instruction. |
| 3453 | TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops); |
| 3454 | |
| 3455 | /// \returns the graph entry for the \p Idx operand of the \p E entry. |
| 3456 | const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const; |
| 3457 | TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) { |
| 3458 | return const_cast<TreeEntry *>( |
| 3459 | getOperandEntry(E: const_cast<const TreeEntry *>(E), Idx)); |
| 3460 | } |
| 3461 | |
| 3462 | /// Gets the root instruction for the given node. If the node is a strided |
| 3463 | /// load/store node with the reverse order, the root instruction is the last |
| 3464 | /// one. |
| 3465 | Instruction *getRootEntryInstruction(const TreeEntry &Entry) const; |
| 3466 | |
| 3467 | /// \returns Cast context for the given graph node. |
| 3468 | TargetTransformInfo::CastContextHint |
| 3469 | getCastContextHint(const TreeEntry &TE) const; |
| 3470 | |
| 3471 | /// \returns the cost of the vectorizable entry. |
| 3472 | InstructionCost getEntryCost(const TreeEntry *E, |
| 3473 | ArrayRef<Value *> VectorizedVals, |
| 3474 | SmallPtrSetImpl<Value *> &); |
| 3475 | |
| 3476 | /// Checks if it is legal and profitable to build SplitVectorize node for the |
| 3477 | /// given \p VL. |
| 3478 | /// \param Op1 first homogeneous scalars. |
| 3479 | /// \param Op2 second homogeneous scalars. |
| 3480 | /// \param ReorderIndices indices to reorder the scalars. |
| 3481 | /// \returns true if the node was successfully built. |
| 3482 | bool canBuildSplitNode(ArrayRef<Value *> VL, |
| 3483 | const InstructionsState &LocalState, |
| 3484 | SmallVectorImpl<Value *> &Op1, |
| 3485 | SmallVectorImpl<Value *> &Op2, |
| 3486 | OrdersType &ReorderIndices) const; |
| 3487 | |
| 3488 | /// This is the recursive part of buildTree. |
| 3489 | void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI, |
| 3490 | unsigned InterleaveFactor = 0); |
| 3491 | |
| 3492 | /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can |
| 3493 | /// be vectorized to use the original vector (or aggregate "bitcast" to a |
| 3494 | /// vector) and sets \p CurrentOrder to the identity permutation; otherwise |
| 3495 | /// returns false, setting \p CurrentOrder to either an empty vector or a |
| 3496 | /// non-identity permutation that allows to reuse extract instructions. |
| 3497 | /// \param ResizeAllowed indicates whether it is allowed to handle subvector |
| 3498 | /// extract order. |
| 3499 | bool canReuseExtract(ArrayRef<Value *> VL, |
| 3500 | SmallVectorImpl<unsigned> &CurrentOrder, |
| 3501 | bool ResizeAllowed = false) const; |
| 3502 | |
| 3503 | /// Vectorize a single entry in the tree. |
| 3504 | Value *vectorizeTree(TreeEntry *E); |
| 3505 | |
| 3506 | /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry |
| 3507 | /// \p E. |
| 3508 | Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx); |
| 3509 | |
| 3510 | /// Create a new vector from a list of scalar values. Produces a sequence |
| 3511 | /// which exploits values reused across lanes, and arranges the inserts |
| 3512 | /// for ease of later optimization. |
| 3513 | template <typename BVTy, typename ResTy, typename... Args> |
| 3514 | ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params); |
| 3515 | |
| 3516 | /// Create a new vector from a list of scalar values. Produces a sequence |
| 3517 | /// which exploits values reused across lanes, and arranges the inserts |
| 3518 | /// for ease of later optimization. |
| 3519 | Value *createBuildVector(const TreeEntry *E, Type *ScalarTy); |
| 3520 | |
| 3521 | /// Returns the instruction in the bundle, which can be used as a base point |
| 3522 | /// for scheduling. Usually it is the last instruction in the bundle, except |
| 3523 | /// for the case when all operands are external (in this case, it is the first |
| 3524 | /// instruction in the list). |
| 3525 | Instruction &getLastInstructionInBundle(const TreeEntry *E); |
| 3526 | |
| 3527 | /// Tries to find extractelement instructions with constant indices from fixed |
| 3528 | /// vector type and gather such instructions into a bunch, which most likely
| 3529 | /// will be detected as a shuffle of 1 or 2 input vectors. If this attempt
| 3530 | /// was successful, the matched scalars are replaced by poison values in \p VL |
| 3531 | /// for future analysis. |
| 3532 | std::optional<TargetTransformInfo::ShuffleKind> |
| 3533 | tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL, |
| 3534 | SmallVectorImpl<int> &Mask) const; |
| 3535 | |
| 3536 | /// Tries to find extractelement instructions with constant indices from fixed |
| 3537 | /// vector type and gather such instructions into a bunch, which most likely
| 3538 | /// will be detected as a shuffle of 1 or 2 input vectors. If this attempt
| 3539 | /// was successful, the matched scalars are replaced by poison values in \p VL |
| 3540 | /// for future analysis. |
| 3541 | SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> |
| 3542 | tryToGatherExtractElements(SmallVectorImpl<Value *> &VL, |
| 3543 | SmallVectorImpl<int> &Mask, |
| 3544 | unsigned NumParts) const; |
| 3545 | |
| 3546 | /// Checks if the gathered \p VL can be represented as a single register |
| 3547 | /// shuffle(s) of previous tree entries. |
| 3548 | /// \param TE Tree entry checked for permutation. |
| 3549 | /// \param VL List of scalars (a subset of the TE scalars), checked for |
| 3550 | /// permutations. Must form a single-register vector. |
| 3551 | /// \param ForOrder Tries to fetch the best candidates for the ordering info. |
| 3552 | /// Also requests building the mask from the original vector value, without |
| 3553 | /// relying on the potential reordering. |
| 3554 | /// \returns ShuffleKind, if gathered values can be represented as shuffles of |
| 3555 | /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. |
| 3556 | std::optional<TargetTransformInfo::ShuffleKind> |
| 3557 | isGatherShuffledSingleRegisterEntry( |
| 3558 | const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, |
| 3559 | SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, |
| 3560 | bool ForOrder); |
| 3561 | |
| 3562 | /// Checks if the gathered \p VL can be represented as multi-register |
| 3563 | /// shuffle(s) of previous tree entries. |
| 3564 | /// \param TE Tree entry checked for permutation. |
| 3565 | /// \param VL List of scalars (a subset of the TE scalars), checked for |
| 3566 | /// permutations. |
| 3567 | /// \param ForOrder Tries to fetch the best candidates for the ordering info. |
| 3568 | /// Also requests building the mask from the original vector value, without |
| 3569 | /// relying on the potential reordering. |
| 3570 | /// \returns per-register series of ShuffleKind, if gathered values can be |
| 3571 | /// represented as shuffles of previous tree entries. \p Mask is filled with |
| 3572 | /// the shuffle mask (also on a per-register basis). |
| 3573 | SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> |
| 3574 | isGatherShuffledEntry( |
| 3575 | const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, |
| 3576 | SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, |
| 3577 | unsigned NumParts, bool ForOrder = false); |
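|  | // Illustrative example (hypothetical lanes/entries): if the gathered scalars |
|  | // {a, b, c, d} live in two previous tree entries E1 = {x, y, a, b} and |
|  | // E2 = {c, z, w, d}, the gather can be expressed as a two-source shuffle of |
|  | // E1 and E2 with Mask = {2, 3, 4, 7} (second-source lanes offset by the |
|  | // vector factor, here 4), instead of inserting the scalars one by one. |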
| 3578 | |
| 3579 | /// \returns the cost of gathering (inserting) the values in \p VL into a |
| 3580 | /// vector. |
| 3581 | /// \param ForPoisonSrc true if initial vector is poison, false otherwise. |
| 3582 | InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc, |
| 3583 | Type *ScalarTy) const; |
| 3584 | |
| 3585 | /// Set the Builder insert point to one after the last instruction in |
| 3586 | /// the bundle |
| 3587 | void setInsertPointAfterBundle(const TreeEntry *E); |
| 3588 | |
| 3589 | /// \returns a vector from a collection of scalars in \p VL. If \p Root is not |
| 3590 | /// specified, the starting vector value is poison. |
| 3591 | Value * |
| 3592 | gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy, |
| 3593 | function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle); |
| 3594 | |
| 3595 | /// \returns whether the VectorizableTree is fully vectorizable and will |
| 3596 | /// be beneficial even if the tree height is tiny. |
| 3597 | bool isFullyVectorizableTinyTree(bool ForReduction) const; |
| 3598 | |
| 3599 | /// Run through the list of all gathered loads in the graph and try to find |
| 3600 | /// vector loads/masked gathers instead of regular gathers. Later these loads |
| 3601 | /// are reshuffled to build the final gathered nodes. |
| 3602 | void tryToVectorizeGatheredLoads( |
| 3603 | const SmallMapVector< |
| 3604 | std::tuple<BasicBlock *, Value *, Type *>, |
| 3605 | SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8> |
| 3606 | &GatheredLoads); |
| 3607 | |
| 3608 | /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the |
| 3609 | /// users of \p TE and collects the stores, grouped by their pointer operands; |
| 3610 | /// the grouped stores are returned. |
| 3611 | SmallVector<SmallVector<StoreInst *>> |
| 3612 | collectUserStores(const BoUpSLP::TreeEntry *TE) const; |
| 3613 | |
| 3614 | /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the |
| 3615 | /// stores in \p StoresVec can form a vector instruction. If so it returns |
| 3616 | /// true and populates \p ReorderIndices with the shuffle indices of the |
| 3617 | /// stores when compared to the sorted vector. |
| 3618 | bool canFormVector(ArrayRef<StoreInst *> StoresVec, |
| 3619 | OrdersType &ReorderIndices) const; |
| 3620 | |
| 3621 | /// Iterates through the users of \p TE, looking for scalar stores that can be |
| 3622 | /// potentially vectorized in a future SLP-tree. If found, it keeps track of |
| 3623 | /// their order and builds an order index vector for each store bundle. It |
| 3624 | /// returns all these order vectors found. |
| 3625 | /// We run this after the tree has formed, otherwise we may come across user |
| 3626 | /// instructions that are not yet in the tree. |
| 3627 | SmallVector<OrdersType, 1> |
| 3628 | findExternalStoreUsersReorderIndices(TreeEntry *TE) const; |
| 3629 | |
| 3630 | /// Tries to reorder the gathering node for better vectorization |
| 3631 | /// opportunities. |
| 3632 | void reorderGatherNode(TreeEntry &TE); |
| 3633 | |
| 3634 | class TreeEntry { |
| 3635 | public: |
| 3636 | using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>; |
| 3637 | TreeEntry(VecTreeTy &Container) : Container(Container) {} |
| 3638 | |
| 3639 | /// \returns Common mask for reorder indices and reused scalars. |
| 3640 | SmallVector<int> getCommonMask() const { |
| 3641 | if (State == TreeEntry::SplitVectorize) |
| 3642 | return {}; |
| 3643 | SmallVector<int> Mask; |
| 3644 | inversePermutation(Indices: ReorderIndices, Mask); |
| 3645 | ::addMask(Mask, SubMask: ReuseShuffleIndices); |
| 3646 | return Mask; |
| 3647 | } |
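|  | // Worked example (hypothetical indices): with ReorderIndices = {2, 0, 1, 3}, |
|  | // inversePermutation() produces Mask = {1, 2, 0, 3} (Mask[ReorderIndices[i]] |
|  | // == i). Applying ReuseShuffleIndices = {0, 1, 2, 3, 0, 1, 2, 3} on top via |
|  | // ::addMask() yields the common mask {1, 2, 0, 3, 1, 2, 0, 3}. |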
| 3648 | |
| 3649 | /// \returns The mask for split nodes. |
| 3650 | SmallVector<int> getSplitMask() const { |
| 3651 | assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() && |
| 3652 | "Expected only split vectorize node." ); |
| 3653 | SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem); |
| 3654 | unsigned CommonVF = std::max<unsigned>( |
| 3655 | a: CombinedEntriesWithIndices.back().second, |
| 3656 | b: Scalars.size() - CombinedEntriesWithIndices.back().second); |
| 3657 | for (auto [Idx, I] : enumerate(First: ReorderIndices)) |
| 3658 | Mask[I] = |
| 3659 | Idx + (Idx >= CombinedEntriesWithIndices.back().second |
| 3660 | ? CommonVF - CombinedEntriesWithIndices.back().second |
| 3661 | : 0); |
| 3662 | return Mask; |
| 3663 | } |
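|  | // Worked example (hypothetical sizes): with 6 scalars split at offset 2 |
|  | // (CombinedEntriesWithIndices.back().second == 2), CommonVF = max(2, 4) = 4. |
|  | // For ReorderIndices = {0, 1, 2, 3, 4, 5} the resulting mask is |
|  | // {0, 1, 4, 5, 6, 7}: the first sub-node is padded up to CommonVF lanes and |
|  | // the second sub-node is addressed as the second shuffle source. |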
| 3664 | |
| 3665 | /// Updates (reorders) SplitVectorize node according to the given mask \p |
| 3666 | /// Mask and order \p MaskOrder. |
| 3667 | void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask, |
| 3668 | ArrayRef<int> MaskOrder); |
| 3669 | |
| 3670 | /// \returns true if the scalars in VL are equal to this entry. |
| 3671 | bool isSame(ArrayRef<Value *> VL) const { |
| 3672 | auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) { |
| 3673 | if (Mask.size() != VL.size() && VL.size() == Scalars.size()) |
| 3674 | return std::equal(first1: VL.begin(), last1: VL.end(), first2: Scalars.begin()); |
| 3675 | return VL.size() == Mask.size() && |
| 3676 | std::equal(first1: VL.begin(), last1: VL.end(), first2: Mask.begin(), |
| 3677 | binary_pred: [Scalars](Value *V, int Idx) { |
| 3678 | return (isa<UndefValue>(Val: V) && |
| 3679 | Idx == PoisonMaskElem) || |
| 3680 | (Idx != PoisonMaskElem && V == Scalars[Idx]); |
| 3681 | }); |
| 3682 | }; |
| 3683 | if (!ReorderIndices.empty()) { |
| 3684 | // TODO: implement matching if the nodes are just reordered, still can |
| 3685 | // treat the vector as the same if the list of scalars matches VL |
| 3686 | // directly, without reordering. |
| 3687 | SmallVector<int> Mask; |
| 3688 | inversePermutation(Indices: ReorderIndices, Mask); |
| 3689 | if (VL.size() == Scalars.size()) |
| 3690 | return IsSame(Scalars, Mask); |
| 3691 | if (VL.size() == ReuseShuffleIndices.size()) { |
| 3692 | ::addMask(Mask, SubMask: ReuseShuffleIndices); |
| 3693 | return IsSame(Scalars, Mask); |
| 3694 | } |
| 3695 | return false; |
| 3696 | } |
| 3697 | return IsSame(Scalars, ReuseShuffleIndices); |
| 3698 | } |
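|  | // Worked example (hypothetical scalars): for Scalars = {a, b, c, d} and |
|  | // ReorderIndices = {2, 3, 0, 1}, inversePermutation() gives Mask = |
|  | // {2, 3, 0, 1}, so a list matches iff VL[i] == Scalars[Mask[i]], i.e. |
|  | // VL = {c, d, a, b}. |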
| 3699 | |
| 3700 | /// \returns true if current entry has same operands as \p TE. |
| 3701 | bool hasEqualOperands(const TreeEntry &TE) const { |
| 3702 | if (TE.getNumOperands() != getNumOperands()) |
| 3703 | return false; |
| 3704 | SmallBitVector Used(getNumOperands()); |
| 3705 | for (unsigned I = 0, E = getNumOperands(); I < E; ++I) { |
| 3706 | unsigned PrevCount = Used.count(); |
| 3707 | for (unsigned K = 0; K < E; ++K) { |
| 3708 | if (Used.test(Idx: K)) |
| 3709 | continue; |
| 3710 | if (getOperand(OpIdx: K) == TE.getOperand(OpIdx: I)) { |
| 3711 | Used.set(K); |
| 3712 | break; |
| 3713 | } |
| 3714 | } |
| 3715 | // Check if we actually found the matching operand. |
| 3716 | if (PrevCount == Used.count()) |
| 3717 | return false; |
| 3718 | } |
| 3719 | return true; |
| 3720 | } |
| 3721 | |
| 3722 | /// \return Final vectorization factor for the node. Defined by the total |
| 3723 | /// number of vectorized scalars, including those used several times in the |
| 3724 | /// entry and counted in the \a ReuseShuffleIndices, if any. |
| 3725 | unsigned getVectorFactor() const { |
| 3726 | if (!ReuseShuffleIndices.empty()) |
| 3727 | return ReuseShuffleIndices.size(); |
| 3728 | return Scalars.size(); |
| 3729 | }; |
| 3730 | |
| 3731 | /// Checks if the current node is a gather node. |
| 3732 | bool isGather() const { return State == NeedToGather; } |
| 3733 | |
| 3734 | /// A vector of scalars. |
| 3735 | ValueList Scalars; |
| 3736 | |
| 3737 | /// The Scalars are vectorized into this value. It is initialized to Null. |
| 3738 | WeakTrackingVH VectorizedValue = nullptr; |
| 3739 | |
| 3740 | /// Do we need to gather this sequence or vectorize it |
| 3741 | /// (either with vector instruction or with scatter/gather |
| 3742 | /// intrinsics for store/load)? |
| 3743 | enum EntryState { |
| 3744 | Vectorize, ///< The node is regularly vectorized. |
| 3745 | ScatterVectorize, ///< Masked scatter/gather node. |
| 3746 | StridedVectorize, ///< Strided loads (and stores) |
| 3747 | CompressVectorize, ///< (Masked) load with compress. |
| 3748 | NeedToGather, ///< Gather/buildvector node. |
| 3749 | CombinedVectorize, ///< Vectorized node, combined with its user into more |
| 3750 | ///< complex node like select/cmp to minmax, mul/add to |
| 3751 | ///< fma, etc. Must be used for the following nodes in |
| 3752 | ///< the pattern, not the very first one. |
| 3753 | SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them |
| 3754 | ///< independently and then combines back. |
| 3755 | }; |
| 3756 | EntryState State; |
| 3757 | |
| 3758 | /// List of combined opcodes supported by the vectorizer. |
| 3759 | enum CombinedOpcode { |
| 3760 | NotCombinedOp = -1, |
| 3761 | MinMax = Instruction::OtherOpsEnd + 1, |
| 3762 | }; |
| 3763 | CombinedOpcode CombinedOp = NotCombinedOp; |
| 3764 | |
| 3765 | /// Does this sequence require some shuffling? |
| 3766 | SmallVector<int, 4> ReuseShuffleIndices; |
| 3767 | |
| 3768 | /// Does this entry require reordering? |
| 3769 | SmallVector<unsigned, 4> ReorderIndices; |
| 3770 | |
| 3771 | /// Points back to the VectorizableTree. |
| 3772 | /// |
| 3773 | /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has |
| 3774 | /// to be a pointer and needs to be able to initialize the child iterator. |
| 3775 | /// Thus we need a reference back to the container to translate the indices |
| 3776 | /// to entries. |
| 3777 | VecTreeTy &Container; |
| 3778 | |
| 3779 | /// The TreeEntry index containing the user of this entry. |
| 3780 | EdgeInfo UserTreeIndex; |
| 3781 | |
| 3782 | /// The index of this treeEntry in VectorizableTree. |
| 3783 | unsigned Idx = 0; |
| 3784 | |
| 3785 | /// For gather/buildvector/alt opcode nodes: (entry index, offset) pairs for the |
| 3786 | /// other nodes combined into this one as a series of insertvector instructions. |
| 3787 | SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices; |
| 3788 | |
| 3789 | private: |
| 3790 | /// The operands of each instruction in each lane Operands[op_index][lane]. |
| 3791 | /// Note: This helps avoid the replication of the code that performs the |
| 3792 | /// reordering of operands during buildTreeRec() and vectorizeTree(). |
| 3793 | SmallVector<ValueList, 2> Operands; |
| 3794 | |
| 3795 | /// MainOp and AltOp are recorded inside. S should be obtained from |
| 3796 | /// newTreeEntry. |
| 3797 | InstructionsState S = InstructionsState::invalid(); |
| 3798 | |
| 3799 | /// Interleaving factor for interleaved loads Vectorize nodes. |
| 3800 | unsigned InterleaveFactor = 0; |
| 3801 | |
| 3802 | /// True if the node does not require scheduling. |
| 3803 | bool DoesNotNeedToSchedule = false; |
| 3804 | |
| 3805 | /// Set this bundle's \p OpIdx'th operand to \p OpVL. |
| 3806 | void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) { |
| 3807 | if (Operands.size() < OpIdx + 1) |
| 3808 | Operands.resize(N: OpIdx + 1); |
| 3809 | assert(Operands[OpIdx].empty() && "Already resized?" ); |
| 3810 | assert(OpVL.size() <= Scalars.size() && |
| 3811 | "Number of operands is greater than the number of scalars." ); |
| 3812 | Operands[OpIdx].resize(N: OpVL.size()); |
| 3813 | copy(Range&: OpVL, Out: Operands[OpIdx].begin()); |
| 3814 | } |
| 3815 | |
| 3816 | public: |
| 3817 | /// Returns interleave factor for interleave nodes. |
| 3818 | unsigned getInterleaveFactor() const { return InterleaveFactor; } |
| 3819 | /// Sets interleaving factor for the interleaving nodes. |
| 3820 | void setInterleave(unsigned Factor) { InterleaveFactor = Factor; } |
| 3821 | |
| 3822 | /// Marks the node as one that does not require scheduling. |
| 3823 | void setDoesNotNeedToSchedule() { |
| 3824 | assert(::doesNotNeedToSchedule(Scalars) && |
| 3825 | "Expected to not need scheduling" ); |
| 3826 | DoesNotNeedToSchedule = true; |
| 3827 | } |
| 3828 | /// Returns true if the node is marked as one that does not require |
| 3829 | /// scheduling. |
| 3830 | bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; } |
| 3831 | |
| 3832 | /// Set this bundle's operands from \p Operands. |
| 3833 | void setOperands(ArrayRef<ValueList> Operands) { |
| 3834 | for (unsigned I : seq<unsigned>(Size: Operands.size())) |
| 3835 | setOperand(OpIdx: I, OpVL: Operands[I]); |
| 3836 | } |
| 3837 | |
| 3838 | /// Reorders operands of the node to the given mask \p Mask. |
| 3839 | void reorderOperands(ArrayRef<int> Mask) { |
| 3840 | for (ValueList &Operand : Operands) |
| 3841 | reorderScalars(Scalars&: Operand, Mask); |
| 3842 | } |
| 3843 | |
| 3844 | /// \returns the \p OpIdx operand of this TreeEntry. |
| 3845 | ValueList &getOperand(unsigned OpIdx) { |
| 3846 | assert(OpIdx < Operands.size() && "Off bounds" ); |
| 3847 | return Operands[OpIdx]; |
| 3848 | } |
| 3849 | |
| 3850 | /// \returns the \p OpIdx operand of this TreeEntry. |
| 3851 | ArrayRef<Value *> getOperand(unsigned OpIdx) const { |
| 3852 | assert(OpIdx < Operands.size() && "Off bounds" ); |
| 3853 | return Operands[OpIdx]; |
| 3854 | } |
| 3855 | |
| 3856 | /// \returns the number of operands. |
| 3857 | unsigned getNumOperands() const { return Operands.size(); } |
| 3858 | |
| 3859 | /// \return the single \p OpIdx operand. |
| 3860 | Value *getSingleOperand(unsigned OpIdx) const { |
| 3861 | assert(OpIdx < Operands.size() && "Off bounds" ); |
| 3862 | assert(!Operands[OpIdx].empty() && "No operand available" ); |
| 3863 | return Operands[OpIdx][0]; |
| 3864 | } |
| 3865 | |
| 3866 | /// Some of the instructions in the list have alternate opcodes. |
| 3867 | bool isAltShuffle() const { return S.isAltShuffle(); } |
| 3868 | |
| 3869 | Instruction *getMatchingMainOpOrAltOp(Instruction *I) const { |
| 3870 | return S.getMatchingMainOpOrAltOp(I); |
| 3871 | } |
| 3872 | |
| 3873 | /// Chooses the correct key for scheduling data. If \p Op has the same (or |
| 3874 | /// alternate) opcode as the main operation of this entry, the key is \p Op. |
| 3875 | /// Otherwise the key is the main operation. |
| 3876 | Value *isOneOf(Value *Op) const { |
| 3877 | auto *I = dyn_cast<Instruction>(Val: Op); |
| 3878 | if (I && getMatchingMainOpOrAltOp(I)) |
| 3879 | return Op; |
| 3880 | return S.getMainOp(); |
| 3881 | } |
| 3882 | |
| 3883 | void setOperations(const InstructionsState &S) { |
| 3884 | assert(S && "InstructionsState is invalid." ); |
| 3885 | this->S = S; |
| 3886 | } |
| 3887 | |
| 3888 | Instruction *getMainOp() const { return S.getMainOp(); } |
| 3889 | |
| 3890 | Instruction *getAltOp() const { return S.getAltOp(); } |
| 3891 | |
| 3892 | /// The main/alternate opcodes for the list of instructions. |
| 3893 | unsigned getOpcode() const { return S.getOpcode(); } |
| 3894 | |
| 3895 | unsigned getAltOpcode() const { return S.getAltOpcode(); } |
| 3896 | |
| 3897 | bool hasState() const { return S.valid(); } |
| 3898 | |
| 3899 | /// Returns the lane of \p V: its position within the vector of Scalars, |
| 3900 | /// remapped through ReorderIndices and ReuseShuffleIndices if those are set. |
| 3901 | int findLaneForValue(Value *V) const { |
| 3902 | unsigned FoundLane = getVectorFactor(); |
| 3903 | for (auto *It = find(Range: Scalars, Val: V), *End = Scalars.end(); It != End; |
| 3904 | std::advance(i&: It, n: 1)) { |
| 3905 | if (*It != V) |
| 3906 | continue; |
| 3907 | FoundLane = std::distance(first: Scalars.begin(), last: It); |
| 3908 | assert(FoundLane < Scalars.size() && "Couldn't find extract lane" ); |
| 3909 | if (!ReorderIndices.empty()) |
| 3910 | FoundLane = ReorderIndices[FoundLane]; |
| 3911 | assert(FoundLane < Scalars.size() && "Couldn't find extract lane" ); |
| 3912 | if (ReuseShuffleIndices.empty()) |
| 3913 | break; |
| 3914 | if (auto *RIt = find(Range: ReuseShuffleIndices, Val: FoundLane); |
| 3915 | RIt != ReuseShuffleIndices.end()) { |
| 3916 | FoundLane = std::distance(first: ReuseShuffleIndices.begin(), last: RIt); |
| 3917 | break; |
| 3918 | } |
| 3919 | } |
| 3920 | assert(FoundLane < getVectorFactor() && "Unable to find given value." ); |
| 3921 | return FoundLane; |
| 3922 | } |
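|  | // Worked example (hypothetical scalars): for Scalars = {a, b, c, d} and |
|  | // V == c the initial lane is 2. With ReorderIndices = {1, 0, 3, 2} it is |
|  | // remapped to ReorderIndices[2] == 3, and with ReuseShuffleIndices = |
|  | // {3, 2, 1, 0, 3, 2, 1, 0} the first position holding 3 is returned, i.e. |
|  | // lane 0. |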
| 3923 | |
| 3924 | /// Build a shuffle mask for graph entry which represents a merge of main |
| 3925 | /// and alternate operations. |
| 3926 | void |
| 3927 | buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp, |
| 3928 | SmallVectorImpl<int> &Mask, |
| 3929 | SmallVectorImpl<Value *> *OpScalars = nullptr, |
| 3930 | SmallVectorImpl<Value *> *AltScalars = nullptr) const; |
| 3931 | |
| 3932 | /// Return true if this is a non-power-of-2 node. |
| 3933 | bool isNonPowOf2Vec() const { |
| 3934 | bool IsNonPowerOf2 = !has_single_bit(Value: Scalars.size()); |
| 3935 | return IsNonPowerOf2; |
| 3936 | } |
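|  | // E.g. a node with 6 scalars is a non-power-of-2 node (has_single_bit(6) is |
|  | // false), while a node with 8 scalars is not. |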
| 3937 | |
| 3938 | /// Return true if the number of elements in this node neither fills whole |
| 3939 | /// vector registers nor is a power of 2. |
| 3940 | bool |
| 3941 | hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const { |
| 3942 | bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2( |
| 3943 | TTI, Ty: getValueType(V: Scalars.front()), Sz: Scalars.size()); |
| 3944 | assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) && |
| 3945 | "Reshuffling not supported with non-power-of-2 vectors yet." ); |
| 3946 | return IsNonPowerOf2; |
| 3947 | } |
| 3948 | |
| 3949 | Value *getOrdered(unsigned Idx) const { |
| 3950 | assert(isGather() && "Must be used only for buildvectors/gathers." ); |
| 3951 | if (ReorderIndices.empty()) |
| 3952 | return Scalars[Idx]; |
| 3953 | SmallVector<int> Mask; |
| 3954 | inversePermutation(Indices: ReorderIndices, Mask); |
| 3955 | return Scalars[Mask[Idx]]; |
| 3956 | } |
| 3957 | |
| 3958 | #ifndef NDEBUG |
| 3959 | /// Debug printer. |
| 3960 | LLVM_DUMP_METHOD void dump() const { |
| 3961 | dbgs() << Idx << ".\n" ; |
| 3962 | for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) { |
| 3963 | dbgs() << "Operand " << OpI << ":\n" ; |
| 3964 | for (const Value *V : Operands[OpI]) |
| 3965 | dbgs().indent(2) << *V << "\n" ; |
| 3966 | } |
| 3967 | dbgs() << "Scalars: \n" ; |
| 3968 | for (Value *V : Scalars) |
| 3969 | dbgs().indent(2) << *V << "\n" ; |
| 3970 | dbgs() << "State: " ; |
| 3971 | switch (State) { |
| 3972 | case Vectorize: |
| 3973 | if (InterleaveFactor > 0) { |
| 3974 | dbgs() << "Vectorize with interleave factor " << InterleaveFactor |
| 3975 | << "\n" ; |
| 3976 | } else { |
| 3977 | dbgs() << "Vectorize\n" ; |
| 3978 | } |
| 3979 | break; |
| 3980 | case ScatterVectorize: |
| 3981 | dbgs() << "ScatterVectorize\n" ; |
| 3982 | break; |
| 3983 | case StridedVectorize: |
| 3984 | dbgs() << "StridedVectorize\n" ; |
| 3985 | break; |
| 3986 | case CompressVectorize: |
| 3987 | dbgs() << "CompressVectorize\n" ; |
| 3988 | break; |
| 3989 | case NeedToGather: |
| 3990 | dbgs() << "NeedToGather\n" ; |
| 3991 | break; |
| 3992 | case CombinedVectorize: |
| 3993 | dbgs() << "CombinedVectorize\n" ; |
| 3994 | break; |
| 3995 | case SplitVectorize: |
| 3996 | dbgs() << "SplitVectorize\n" ; |
| 3997 | break; |
| 3998 | } |
| 3999 | if (S) { |
| 4000 | dbgs() << "MainOp: " << *S.getMainOp() << "\n" ; |
| 4001 | dbgs() << "AltOp: " << *S.getAltOp() << "\n" ; |
| 4002 | } else { |
| 4003 | dbgs() << "MainOp: NULL\n" ; |
| 4004 | dbgs() << "AltOp: NULL\n" ; |
| 4005 | } |
| 4006 | dbgs() << "VectorizedValue: " ; |
| 4007 | if (VectorizedValue) |
| 4008 | dbgs() << *VectorizedValue << "\n" ; |
| 4009 | else |
| 4010 | dbgs() << "NULL\n" ; |
| 4011 | dbgs() << "ReuseShuffleIndices: " ; |
| 4012 | if (ReuseShuffleIndices.empty()) |
| 4013 | dbgs() << "Empty" ; |
| 4014 | else |
| 4015 | for (int ReuseIdx : ReuseShuffleIndices) |
| 4016 | dbgs() << ReuseIdx << ", " ; |
| 4017 | dbgs() << "\n" ; |
| 4018 | dbgs() << "ReorderIndices: " ; |
| 4019 | for (unsigned ReorderIdx : ReorderIndices) |
| 4020 | dbgs() << ReorderIdx << ", " ; |
| 4021 | dbgs() << "\n" ; |
| 4022 | dbgs() << "UserTreeIndex: " ; |
| 4023 | if (UserTreeIndex) |
| 4024 | dbgs() << UserTreeIndex; |
| 4025 | else |
| 4026 | dbgs() << "<invalid>" ; |
| 4027 | dbgs() << "\n" ; |
| 4028 | if (!CombinedEntriesWithIndices.empty()) { |
| 4029 | dbgs() << "Combined entries: " ; |
| 4030 | interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) { |
| 4031 | dbgs() << "Entry index " << P.first << " with offset " << P.second; |
| 4032 | }); |
| 4033 | dbgs() << "\n" ; |
| 4034 | } |
| 4035 | } |
| 4036 | #endif |
| 4037 | }; |
| 4038 | |
| 4039 | #ifndef NDEBUG |
| 4040 | void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost, |
| 4041 | InstructionCost VecCost, InstructionCost ScalarCost, |
| 4042 | StringRef Banner) const { |
| 4043 | dbgs() << "SLP: " << Banner << ":\n" ; |
| 4044 | E->dump(); |
| 4045 | dbgs() << "SLP: Costs:\n" ; |
| 4046 | dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n" ; |
| 4047 | dbgs() << "SLP: VectorCost = " << VecCost << "\n" ; |
| 4048 | dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n" ; |
| 4049 | dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " |
| 4050 | << ReuseShuffleCost + VecCost - ScalarCost << "\n" ; |
| 4051 | } |
| 4052 | #endif |
| 4053 | |
| 4054 | /// Create a new gather TreeEntry |
| 4055 | TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL, |
| 4056 | const InstructionsState &S, |
| 4057 | const EdgeInfo &UserTreeIdx, |
| 4058 | ArrayRef<int> ReuseShuffleIndices = {}) { |
| 4059 | auto Invalid = ScheduleBundle::invalid(); |
| 4060 | return newTreeEntry(VL, Bundle&: Invalid, S, UserTreeIdx, ReuseShuffleIndices); |
| 4061 | } |
| 4062 | |
| 4063 | /// Create a new VectorizableTree entry. |
| 4064 | TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle, |
| 4065 | const InstructionsState &S, |
| 4066 | const EdgeInfo &UserTreeIdx, |
| 4067 | ArrayRef<int> ReuseShuffleIndices = {}, |
| 4068 | ArrayRef<unsigned> ReorderIndices = {}, |
| 4069 | unsigned InterleaveFactor = 0) { |
| 4070 | TreeEntry::EntryState EntryState = |
| 4071 | Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; |
| 4072 | TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, |
| 4073 | ReuseShuffleIndices, ReorderIndices); |
| 4074 | if (E && InterleaveFactor > 0) |
| 4075 | E->setInterleave(InterleaveFactor); |
| 4076 | return E; |
| 4077 | } |
| 4078 | |
| 4079 | TreeEntry *newTreeEntry(ArrayRef<Value *> VL, |
| 4080 | TreeEntry::EntryState EntryState, |
| 4081 | ScheduleBundle &Bundle, const InstructionsState &S, |
| 4082 | const EdgeInfo &UserTreeIdx, |
| 4083 | ArrayRef<int> ReuseShuffleIndices = {}, |
| 4084 | ArrayRef<unsigned> ReorderIndices = {}) { |
| 4085 | assert(((!Bundle && (EntryState == TreeEntry::NeedToGather || |
| 4086 | EntryState == TreeEntry::SplitVectorize)) || |
| 4087 | (Bundle && EntryState != TreeEntry::NeedToGather && |
| 4088 | EntryState != TreeEntry::SplitVectorize)) && |
| 4089 | "Need to vectorize gather entry?" ); |
| 4090 | // Gathered loads still gathered? Do not create entry, use the original one. |
| 4091 | if (GatheredLoadsEntriesFirst.has_value() && |
| 4092 | EntryState == TreeEntry::NeedToGather && S && |
| 4093 | S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX && |
| 4094 | !UserTreeIdx.UserTE) |
| 4095 | return nullptr; |
| 4096 | VectorizableTree.push_back(Elt: std::make_unique<TreeEntry>(args&: VectorizableTree)); |
| 4097 | TreeEntry *Last = VectorizableTree.back().get(); |
| 4098 | Last->Idx = VectorizableTree.size() - 1; |
| 4099 | Last->State = EntryState; |
| 4100 | if (UserTreeIdx.UserTE) |
| 4101 | OperandsToTreeEntry.try_emplace( |
| 4102 | Key: std::make_pair(x: UserTreeIdx.UserTE, y: UserTreeIdx.EdgeIdx), Args&: Last); |
| 4103 | // FIXME: Remove once support for ReuseShuffleIndices has been implemented |
| 4104 | // for non-power-of-two vectors. |
| 4105 | assert( |
| 4106 | (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) || |
| 4107 | ReuseShuffleIndices.empty()) && |
| 4108 | "Reshuffling scalars not yet supported for nodes with padding" ); |
| 4109 | Last->ReuseShuffleIndices.append(in_start: ReuseShuffleIndices.begin(), |
| 4110 | in_end: ReuseShuffleIndices.end()); |
| 4111 | if (ReorderIndices.empty()) { |
| 4112 | Last->Scalars.assign(in_start: VL.begin(), in_end: VL.end()); |
| 4113 | if (S) |
| 4114 | Last->setOperations(S); |
| 4115 | } else { |
| 4116 | // Reorder scalars and build final mask. |
| 4117 | Last->Scalars.assign(NumElts: VL.size(), Elt: nullptr); |
| 4118 | transform(Range&: ReorderIndices, d_first: Last->Scalars.begin(), |
| 4119 | F: [VL](unsigned Idx) -> Value * { |
| 4120 | if (Idx >= VL.size()) |
| 4121 | return UndefValue::get(T: VL.front()->getType()); |
| 4122 | return VL[Idx]; |
| 4123 | }); |
| 4124 | InstructionsState S = getSameOpcode(VL: Last->Scalars, TLI: *TLI); |
| 4125 | if (S) |
| 4126 | Last->setOperations(S); |
| 4127 | Last->ReorderIndices.append(in_start: ReorderIndices.begin(), in_end: ReorderIndices.end()); |
| 4128 | } |
| 4129 | if (EntryState == TreeEntry::SplitVectorize) { |
| 4130 | assert(S && "Split nodes must have operations." ); |
| 4131 | Last->setOperations(S); |
| 4132 | SmallPtrSet<Value *, 4> Processed; |
| 4133 | for (Value *V : VL) { |
| 4134 | auto *I = dyn_cast<Instruction>(Val: V); |
| 4135 | if (!I) |
| 4136 | continue; |
| 4137 | auto It = ScalarsInSplitNodes.find(Val: V); |
| 4138 | if (It == ScalarsInSplitNodes.end()) { |
| 4139 | ScalarsInSplitNodes.try_emplace(Key: V).first->getSecond().push_back(Elt: Last); |
| 4140 | (void)Processed.insert(Ptr: V); |
| 4141 | } else if (Processed.insert(Ptr: V).second) { |
| 4142 | assert(!is_contained(It->getSecond(), Last) && |
| 4143 | "Value already associated with the node." ); |
| 4144 | It->getSecond().push_back(Elt: Last); |
| 4145 | } |
| 4146 | } |
| 4147 | } else if (!Last->isGather()) { |
| 4148 | if (doesNotNeedToSchedule(VL)) |
| 4149 | Last->setDoesNotNeedToSchedule(); |
| 4150 | SmallPtrSet<Value *, 4> Processed; |
| 4151 | for (Value *V : VL) { |
| 4152 | if (isa<PoisonValue>(Val: V)) |
| 4153 | continue; |
| 4154 | auto It = ScalarToTreeEntries.find(Val: V); |
| 4155 | if (It == ScalarToTreeEntries.end()) { |
| 4156 | ScalarToTreeEntries.try_emplace(Key: V).first->getSecond().push_back(Elt: Last); |
| 4157 | (void)Processed.insert(Ptr: V); |
| 4158 | } else if (Processed.insert(Ptr: V).second) { |
| 4159 | assert(!is_contained(It->getSecond(), Last) && |
| 4160 | "Value already associated with the node." ); |
| 4161 | It->getSecond().push_back(Elt: Last); |
| 4162 | } |
| 4163 | } |
| 4164 | // Update the scheduler bundle to point to this TreeEntry. |
| 4165 | assert((!Bundle.getBundle().empty() || isa<PHINode>(S.getMainOp()) || |
| 4166 | isVectorLikeInstWithConstOps(S.getMainOp()) || |
| 4167 | Last->doesNotNeedToSchedule()) && |
| 4168 | "Bundle and VL out of sync" ); |
| 4169 | if (!Bundle.getBundle().empty()) { |
| 4170 | #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) |
| 4171 | auto *BundleMember = Bundle.getBundle().begin(); |
| 4172 | SmallPtrSet<Value *, 4> Processed; |
| 4173 | for (Value *V : VL) { |
| 4174 | if (doesNotNeedToBeScheduled(V) || !Processed.insert(V).second) |
| 4175 | continue; |
| 4176 | ++BundleMember; |
| 4177 | } |
| 4178 | assert(BundleMember == Bundle.getBundle().end() && |
| 4179 | "Bundle and VL out of sync" ); |
| 4180 | #endif |
| 4181 | Bundle.setTreeEntry(Last); |
| 4182 | } |
| 4183 | } else { |
| 4184 | // Build a map for gathered scalars to the nodes where they are used. |
| 4185 | bool AllConstsOrCasts = true; |
| 4186 | for (Value *V : VL) |
| 4187 | if (!isConstant(V)) { |
| 4188 | auto *I = dyn_cast<CastInst>(Val: V); |
| 4189 | AllConstsOrCasts &= I && I->getType()->isIntegerTy(); |
| 4190 | if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE || |
| 4191 | !UserTreeIdx.UserTE->isGather()) |
| 4192 | ValueToGatherNodes.try_emplace(Key: V).first->getSecond().insert(X: Last); |
| 4193 | } |
| 4194 | if (AllConstsOrCasts) |
| 4195 | CastMaxMinBWSizes = |
| 4196 | std::make_pair(x: std::numeric_limits<unsigned>::max(), y: 1); |
| 4197 | MustGather.insert_range(R&: VL); |
| 4198 | } |
| 4199 | |
| 4200 | if (UserTreeIdx.UserTE) |
| 4201 | Last->UserTreeIndex = UserTreeIdx; |
| 4202 | return Last; |
| 4203 | } |
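|  | // Worked example for the ReorderIndices branch above (hypothetical values): |
|  | // with VL = {a, b, c, d} and ReorderIndices = {2, 3, 0, 1} the new entry |
|  | // stores Scalars = {VL[2], VL[3], VL[0], VL[1]} = {c, d, a, b}; any reorder |
|  | // index >= VL.size() would produce an undef placeholder instead. |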
| 4204 | |
| 4205 | /// -- Vectorization State -- |
| 4206 | /// Holds all of the tree entries. |
| 4207 | TreeEntry::VecTreeTy VectorizableTree; |
| 4208 | |
| 4209 | #ifndef NDEBUG |
| 4210 | /// Debug printer. |
| 4211 | LLVM_DUMP_METHOD void dumpVectorizableTree() const { |
| 4212 | for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) { |
| 4213 | VectorizableTree[Id]->dump(); |
| 4214 | dbgs() << "\n" ; |
| 4215 | } |
| 4216 | } |
| 4217 | #endif |
| 4218 | |
| 4219 | /// Get list of vector entries, associated with the value \p V. |
| 4220 | ArrayRef<TreeEntry *> getTreeEntries(Value *V) const { |
| 4221 | assert(V && "V cannot be nullptr." ); |
| 4222 | auto It = ScalarToTreeEntries.find(Val: V); |
| 4223 | if (It == ScalarToTreeEntries.end()) |
| 4224 | return {}; |
| 4225 | return It->getSecond(); |
| 4226 | } |
| 4227 | |
| 4228 | /// Get list of split vector entries, associated with the value \p V. |
| 4229 | ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const { |
| 4230 | assert(V && "V cannot be nullptr." ); |
| 4231 | auto It = ScalarsInSplitNodes.find(Val: V); |
| 4232 | if (It == ScalarsInSplitNodes.end()) |
| 4233 | return {}; |
| 4234 | return It->getSecond(); |
| 4235 | } |
| 4236 | |
| 4237 | /// Returns first vector node for value \p V, matching values \p VL. |
| 4238 | TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL, |
| 4239 | bool SameVF = false) const { |
| 4240 | assert(V && "V cannot be nullptr." ); |
| 4241 | for (TreeEntry *TE : ScalarToTreeEntries.lookup(Val: V)) |
| 4242 | if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL)) |
| 4243 | return TE; |
| 4244 | return nullptr; |
| 4245 | } |
| 4246 | |
| 4247 | /// Check that the operand node of an alternate node does not generate a |
| 4248 | /// buildvector sequence. If it does, it is probably not worth building an |
| 4249 | /// alternate shuffle when the number of buildvector operands plus the |
| 4250 | /// alternate instruction exceeds the number of buildvector instructions. |
| 4251 | /// \param S the instructions state of the analyzed values. |
| 4252 | /// \param VL list of the instructions with alternate opcodes. |
| 4253 | bool areAltOperandsProfitable(const InstructionsState &S, |
| 4254 | ArrayRef<Value *> VL) const; |
| 4255 | |
| 4256 | /// Contains all the outputs of legality analysis for a list of values to |
| 4257 | /// vectorize. |
| 4258 | class ScalarsVectorizationLegality { |
| 4259 | InstructionsState S; |
| 4260 | bool IsLegal; |
| 4261 | bool TryToFindDuplicates; |
| 4262 | bool TrySplitVectorize; |
| 4263 | |
| 4264 | public: |
| 4265 | ScalarsVectorizationLegality(InstructionsState S, bool IsLegal, |
| 4266 | bool TryToFindDuplicates = true, |
| 4267 | bool TrySplitVectorize = false) |
| 4268 | : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates), |
| 4269 | TrySplitVectorize(TrySplitVectorize) { |
| 4270 | assert((!IsLegal || (S.valid() && TryToFindDuplicates)) && |
| 4271 | "Inconsistent state" ); |
| 4272 | } |
| 4273 | const InstructionsState &getInstructionsState() const { return S; }; |
| 4274 | bool isLegal() const { return IsLegal; } |
| 4275 | bool tryToFindDuplicates() const { return TryToFindDuplicates; } |
| 4276 | bool trySplitVectorize() const { return TrySplitVectorize; } |
| 4277 | }; |
| 4278 | |
| 4279 | /// Checks if the specified list of the instructions/values can be vectorized |
| 4280 | /// in general. |
| 4281 | ScalarsVectorizationLegality |
| 4282 | getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth, |
| 4283 | const EdgeInfo &UserTreeIdx) const; |
| 4284 | |
| 4285 | /// Checks if the specified list of the instructions/values can be vectorized |
| 4286 | /// and fills required data before actual scheduling of the instructions. |
| 4287 | TreeEntry::EntryState |
| 4288 | getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL, |
| 4289 | bool IsScatterVectorizeUserTE, |
| 4290 | OrdersType &CurrentOrder, |
| 4291 | SmallVectorImpl<Value *> &PointerOps); |
| 4292 | |
| 4293 | /// Maps a specific scalar to its tree entry(ies). |
| 4294 | SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries; |
| 4295 | |
| 4296 | /// Maps the operand index and entry to the corresponding tree entry. |
| 4297 | SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *> |
| 4298 | OperandsToTreeEntry; |
| 4299 | |
| 4300 | /// Scalars, used in split vectorize nodes. |
| 4301 | SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes; |
| 4302 | |
| 4303 | /// Maps a value to the proposed vectorizable size. |
| 4304 | SmallDenseMap<Value *, unsigned> InstrElementSize; |
| 4305 | |
| 4306 | /// A list of scalars that we found that we need to keep as scalars. |
| 4307 | ValueSet MustGather; |
| 4308 | |
| 4309 | /// A set of first non-schedulable values. |
| 4310 | ValueSet NonScheduledFirst; |
| 4311 | |
| 4312 | /// A map between the vectorized entries and the last instructions in the |
| 4313 | /// bundles. The bundles are built in use order, not in the def order of the |
| 4314 | /// instructions. So, we cannot rely directly on the last instruction in the |
| 4315 | /// bundle being the last instruction in program order during the |
| 4316 | /// vectorization process, since the basic blocks are modified; these |
| 4317 | /// instructions need to be pre-gathered beforehand. |
| 4318 | SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction; |
| 4319 | |
| 4320 | /// List of gather nodes which depend on other gather/vector nodes and should |
| 4321 | /// be emitted after the vector instruction emission process, to correctly |
| 4322 | /// handle the order of the vector instructions and shuffles. |
| 4323 | SetVector<const TreeEntry *> PostponedGathers; |
| 4324 | |
| 4325 | using ValueToGatherNodesMap = |
| 4326 | DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>; |
| 4327 | ValueToGatherNodesMap ValueToGatherNodes; |
| 4328 | |
| 4329 | /// A list of the load entries (node indices) which can be vectorized using a |
| 4330 | /// strided or masked gather approach, but are first attempted to be |
| 4331 | /// represented as contiguous loads. |
| 4332 | SetVector<unsigned> LoadEntriesToVectorize; |
| 4333 | |
| 4334 | /// true if graph nodes transforming mode is on. |
| 4335 | bool IsGraphTransformMode = false; |
| 4336 | |
| 4337 | /// The index of the first gathered load entry in the VectorizeTree. |
| 4338 | std::optional<unsigned> GatheredLoadsEntriesFirst; |
| 4339 | |
| 4340 | /// Maps compress entries to their mask data for the final codegen. |
| 4341 | SmallDenseMap<const TreeEntry *, |
| 4342 | std::tuple<SmallVector<int>, VectorType *, unsigned, bool>> |
| 4343 | CompressEntryToData; |
| 4344 | |
| 4345 | /// This POD struct describes one external user in the vectorized tree. |
| 4346 | struct ExternalUser { |
| 4347 | ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L) |
| 4348 | : Scalar(S), User(U), E(E), Lane(L) {} |
| 4349 | |
| 4350 | /// Which scalar in our function. |
| 4351 | Value *Scalar = nullptr; |
| 4352 | |
| 4353 | /// Which user that uses the scalar. |
| 4354 | llvm::User *User = nullptr; |
| 4355 | |
| 4356 | /// Vector node, the value is part of. |
| 4357 | const TreeEntry &E; |
| 4358 | |
| 4359 | /// Which lane does the scalar belong to. |
| 4360 | int Lane; |
| 4361 | }; |
| 4362 | using UserList = SmallVector<ExternalUser, 16>; |
| 4363 | |
| 4364 | /// Checks if two instructions may access the same memory. |
| 4365 | /// |
| 4366 | /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it |
| 4367 | /// is invariant in the calling loop. |
| 4368 | bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, |
| 4369 | Instruction *Inst2) { |
| 4370 | assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction." ); |
| 4371 | if (!isSimple(I: Inst2)) |
| 4372 | return true; |
| 4373 | // First check if the result is already in the cache. |
| 4374 | AliasCacheKey Key = std::make_pair(x&: Inst1, y&: Inst2); |
| 4375 | auto Res = AliasCache.try_emplace(Key); |
| 4376 | if (!Res.second) |
| 4377 | return Res.first->second; |
| 4378 | bool Aliased = isModOrRefSet(MRI: BatchAA.getModRefInfo(I: Inst2, OptLoc: Loc1)); |
| 4379 | // Store the result in the cache. |
| 4380 | Res.first->getSecond() = Aliased; |
| 4381 | AliasCache.try_emplace(Key: std::make_pair(x&: Inst2, y&: Inst1), Args&: Aliased); |
| 4382 | return Aliased; |
| 4383 | } |
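|  | // A minimal standalone sketch of the symmetric caching pattern used above |
|  | // (stand-in types and helper names, not the pass's own API): |
|  | //   SmallDenseMap<std::pair<Instruction *, Instruction *>, bool> Cache; |
|  | //   bool isAliasedCached(Instruction *A, Instruction *B) { |
|  | //     auto Res = Cache.try_emplace(std::make_pair(A, B), false); |
|  | //     if (!Res.second) |
|  | //       return Res.first->second;                   // Cache hit. |
|  | //     bool Aliased = computeAliased(A, B);          // The expensive AA query. |
|  | //     Res.first->second = Aliased; |
|  | //     Cache.try_emplace(std::make_pair(B, A), Aliased); // Symmetric entry. |
|  | //     return Aliased; |
|  | //   } |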
| 4384 | |
| 4385 | using AliasCacheKey = std::pair<Instruction *, Instruction *>; |
| 4386 | |
| 4387 | /// Cache for alias results. |
| 4388 | /// TODO: consider moving this to the AliasAnalysis itself. |
| 4389 | SmallDenseMap<AliasCacheKey, bool> AliasCache; |
| 4390 | |
| 4391 | // Cache for pointerMayBeCaptured calls inside AA. This is preserved |
| 4392 | // globally through SLP because we don't perform any action which |
| 4393 | // invalidates capture results. |
| 4394 | BatchAAResults BatchAA; |
| 4395 | |
| 4396 | /// Temporary store for deleted instructions. Instructions will be deleted |
| 4397 | /// eventually when the BoUpSLP is destructed. The deferral is required to |
| 4398 | /// ensure that there are no incorrect collisions in the AliasCache, which |
| 4399 | /// can happen if a new instruction is allocated at the same address as a |
| 4400 | /// previously deleted instruction. |
| 4401 | DenseSet<Instruction *> DeletedInstructions; |
| 4402 | |
| 4403 | /// Set of the instructions (reduction roots) already analyzed for reductions. |
| 4404 | SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots; |
| 4405 | |
| 4406 | /// Set of hashes for the list of reduction values already being analyzed. |
| 4407 | DenseSet<size_t> AnalyzedReductionVals; |
| 4408 | |
| 4409 | /// Values already analyzed for minimal bitwidth and found to be |
| 4410 | /// non-profitable. |
| 4411 | DenseSet<Value *> AnalyzedMinBWVals; |
| 4412 | |
| 4413 | /// A list of values that need to extracted out of the tree. |
| 4414 | /// This list holds pairs of (Internal Scalar : External User). External User |
| 4415 | /// can be nullptr, it means that this Internal Scalar will be used later, |
| 4416 | /// after vectorization. |
| 4417 | UserList ExternalUses; |
| 4418 | |
| 4419 | /// A list of GEPs which can be replaced by scalar GEPs instead of |
| 4420 | /// extractelement instructions. |
| 4421 | SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar; |
| 4422 | |
| 4423 | /// Values used only by @llvm.assume calls. |
| 4424 | SmallPtrSet<const Value *, 32> EphValues; |
| 4425 | |
| 4426 | /// Holds all of the instructions that we gathered, shuffle instructions and |
| 4427 | /// extractelements. |
| 4428 | SetVector<Instruction *> GatherShuffleExtractSeq; |
| 4429 | |
| 4430 | /// A list of blocks that we are going to CSE. |
| 4431 | DenseSet<BasicBlock *> CSEBlocks; |
| 4432 | |
| 4433 | /// List of hashes of vectors of loads which are known to be non-vectorizable. |
| 4434 | DenseSet<size_t> ListOfKnonwnNonVectorizableLoads; |
| 4435 | |
| 4436 | /// Represents a scheduling entity, either ScheduleData or ScheduleBundle. |
| 4437 | /// ScheduleData is used to gather dependencies for a single instruction, |
| 4438 | /// while ScheduleBundle represents a batch of instructions that are going to |
| 4439 | /// be grouped together. |
| 4440 | class ScheduleEntity { |
| 4441 | friend class ScheduleBundle; |
| 4442 | friend class ScheduleData; |
| 4443 | |
| 4444 | protected: |
| 4445 | enum class Kind { ScheduleData, ScheduleBundle }; |
| 4446 | Kind getKind() const { return K; } |
| 4447 | ScheduleEntity(Kind K) : K(K) {} |
| 4448 | |
| 4449 | private: |
| 4450 | /// Used for getting a "good" final ordering of instructions. |
| 4451 | int SchedulingPriority = 0; |
| 4452 | /// True if this instruction (or bundle) is scheduled (or considered as |
| 4453 | /// scheduled in the dry-run). |
| 4454 | bool IsScheduled = false; |
| 4455 | /// The kind of the ScheduleEntity. |
| 4456 | const Kind K = Kind::ScheduleData; |
| 4457 | |
| 4458 | public: |
| 4459 | ScheduleEntity() = delete; |
| 4460 | /// Gets/sets the scheduling priority. |
| 4461 | void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; } |
| 4462 | int getSchedulingPriority() const { return SchedulingPriority; } |
| 4463 | bool isReady() const { |
| 4464 | if (auto *SD = dyn_cast<ScheduleData>(Val: this)) |
| 4465 | return SD->isReady(); |
| 4466 | return cast<ScheduleBundle>(Val: this)->isReady(); |
| 4467 | } |
| 4468 | /// Gets/sets if the bundle is scheduled. |
| 4469 | bool isScheduled() const { return IsScheduled; } |
| 4470 | void setScheduled(bool Scheduled) { IsScheduled = Scheduled; } |
| 4471 | |
| 4472 | static bool classof(const ScheduleEntity *) { return true; } |
| 4473 | }; |
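|  | // The Kind tag and the classof() hooks in the derived classes enable |
|  | // LLVM-style RTTI over ScheduleEntity, so clients can write, for example: |
|  | //   if (auto *SD = dyn_cast<ScheduleData>(Entity)) |
|  | //     ... handle a single instruction ...; |
|  | //   else |
|  | //     ... handle cast<ScheduleBundle>(Entity) ...; |
|  | // exactly as isReady() does above. |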
| 4474 | |
| 4475 | /// Contains all scheduling relevant data for an instruction. |
| 4476 | /// A ScheduleData either represents a single instruction or a member of an |
| 4477 | /// instruction bundle (= a group of instructions which is combined into a |
| 4478 | /// vector instruction). |
| 4479 | class ScheduleData final : public ScheduleEntity { |
| 4480 | public: |
| 4481 | // The initial value for the dependency counters. It means that the |
| 4482 | // dependencies are not calculated yet. |
| 4483 | enum { InvalidDeps = -1 }; |
| 4484 | |
| 4485 | ScheduleData() : ScheduleEntity(Kind::ScheduleData) {} |
| 4486 | static bool classof(const ScheduleEntity *Entity) { |
| 4487 | return Entity->getKind() == Kind::ScheduleData; |
| 4488 | } |
| 4489 | |
| 4490 | void init(int BlockSchedulingRegionID, Instruction *I) { |
| 4491 | NextLoadStore = nullptr; |
| 4492 | IsScheduled = false; |
| 4493 | SchedulingRegionID = BlockSchedulingRegionID; |
| 4494 | clearDependencies(); |
| 4495 | Inst = I; |
| 4496 | } |
| 4497 | |
| 4498 | /// Verify basic self consistency properties |
| 4499 | void verify() { |
| 4500 | if (hasValidDependencies()) { |
| 4501 | assert(UnscheduledDeps <= Dependencies && "invariant" ); |
| 4502 | } else { |
| 4503 | assert(UnscheduledDeps == Dependencies && "invariant" ); |
| 4504 | } |
| 4505 | |
| 4506 | if (IsScheduled) { |
| 4507 | assert(hasValidDependencies() && UnscheduledDeps == 0 && |
| 4508 | "unexpected scheduled state" ); |
| 4509 | } |
| 4510 | } |
| 4511 | |
| 4512 | /// Returns true if the dependency information has been calculated. |
| 4513 | /// Note that dependency validity can vary between instructions within |
| 4514 | /// a single bundle. |
| 4515 | bool hasValidDependencies() const { return Dependencies != InvalidDeps; } |
| 4516 | |
| 4517 | /// Returns true if it is ready for scheduling, i.e. it has no more |
| 4518 | /// unscheduled depending instructions/bundles. |
| 4519 | bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; } |
| 4520 | |
| 4521 | /// Modifies the number of unscheduled dependencies for this instruction, |
| 4522 | /// and returns the number of remaining dependencies for the containing |
| 4523 | /// bundle. |
| 4524 | int incrementUnscheduledDeps(int Incr) { |
| 4525 | assert(hasValidDependencies() && |
| 4526 | "increment of unscheduled deps would be meaningless" ); |
| 4527 | UnscheduledDeps += Incr; |
| 4528 | return UnscheduledDeps; |
| 4529 | } |
| 4530 | |
| 4531 | /// Sets the number of unscheduled dependencies to the number of |
| 4532 | /// dependencies. |
| 4533 | void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; } |
| 4534 | |
| 4535 | /// Clears all dependency information. |
| 4536 | void clearDependencies() { |
| 4537 | Dependencies = InvalidDeps; |
| 4538 | resetUnscheduledDeps(); |
| 4539 | MemoryDependencies.clear(); |
| 4540 | ControlDependencies.clear(); |
| 4541 | IsScheduled = false; |
| 4542 | } |
| 4543 | |
| 4544 | /// Gets the number of unscheduled dependencies. |
| 4545 | int getUnscheduledDeps() const { return UnscheduledDeps; } |
| 4546 | /// Gets the number of dependencies. |
| 4547 | int getDependencies() const { return Dependencies; } |
| 4548 | /// Initializes the number of dependencies. |
| 4549 | void initDependencies() { Dependencies = 0; } |
| 4550 | /// Increments the number of dependencies. |
| 4551 | void incDependencies() { Dependencies++; } |
| 4552 | |
| 4553 | /// Gets scheduling region ID. |
| 4554 | int getSchedulingRegionID() const { return SchedulingRegionID; } |
| 4555 | |
| 4556 | /// Gets the instruction. |
| 4557 | Instruction *getInst() const { return Inst; } |
| 4558 | |
| 4559 | /// Gets the list of memory dependencies. |
| 4560 | ArrayRef<ScheduleData *> getMemoryDependencies() const { |
| 4561 | return MemoryDependencies; |
| 4562 | } |
| 4563 | /// Adds a memory dependency. |
| 4564 | void addMemoryDependency(ScheduleData *Dep) { |
| 4565 | MemoryDependencies.push_back(Elt: Dep); |
| 4566 | } |
| 4567 | /// Gets the list of control dependencies. |
| 4568 | ArrayRef<ScheduleData *> getControlDependencies() const { |
| 4569 | return ControlDependencies; |
| 4570 | } |
| 4571 | /// Adds a control dependency. |
| 4572 | void addControlDependency(ScheduleData *Dep) { |
| 4573 | ControlDependencies.push_back(Elt: Dep); |
| 4574 | } |
| 4575 | /// Gets/sets the next load/store instruction in the block. |
| 4576 | ScheduleData *getNextLoadStore() const { return NextLoadStore; } |
| 4577 | void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; } |
| 4578 | |
| 4579 | void dump(raw_ostream &OS) const { OS << *Inst; } |
| 4580 | |
| 4581 | LLVM_DUMP_METHOD void dump() const { |
| 4582 | dump(OS&: dbgs()); |
| 4583 | dbgs() << '\n'; |
| 4584 | } |
| 4585 | |
| 4586 | private: |
| 4587 | Instruction *Inst = nullptr; |
| 4588 | |
| 4589 | /// Single linked list of all memory instructions (e.g. load, store, call) |
| 4590 | /// in the block - until the end of the scheduling region. |
| 4591 | ScheduleData *NextLoadStore = nullptr; |
| 4592 | |
| 4593 | /// The dependent memory instructions. |
| 4594 | /// This list is derived on demand in calculateDependencies(). |
| 4595 | SmallVector<ScheduleData *> MemoryDependencies; |
| 4596 | |
| 4597 | /// List of instructions which this instruction could be control dependent |
| 4598 | /// on. Allowing such nodes to be scheduled below this one could introduce |
| 4599 | /// a runtime fault which didn't exist in the original program. |
| 4600 | /// e.g. this is a load or udiv following a readonly call which infinitely loops. |
| 4601 | SmallVector<ScheduleData *> ControlDependencies; |
| 4602 | |
| 4603 | /// This ScheduleData is in the current scheduling region if this matches |
| 4604 | /// the current SchedulingRegionID of BlockScheduling. |
| 4605 | int SchedulingRegionID = 0; |
| 4606 | |
| 4607 | /// The number of dependencies. Consists of the number of users of the |
| 4608 | /// instruction plus the number of dependent memory instructions (if any). |
| 4609 | /// This value is calculated on demand. |
| 4610 | /// If InvalidDeps, the number of dependencies is not calculated yet. |
| 4611 | int Dependencies = InvalidDeps; |
| 4612 | |
| 4613 | /// The number of dependencies minus the number of dependencies of scheduled |
| 4614 | /// instructions. As soon as this is zero, the instruction/bundle gets ready |
| 4615 | /// for scheduling. |
| 4616 | /// Note that this is negative as long as Dependencies is not calculated. |
| 4617 | int UnscheduledDeps = InvalidDeps; |
| 4618 | }; |
| 4619 | |
| 4620 | #ifndef NDEBUG |
| 4621 | friend inline raw_ostream &operator<<(raw_ostream &OS, |
| 4622 | const BoUpSLP::ScheduleData &SD) { |
| 4623 | SD.dump(OS); |
| 4624 | return OS; |
| 4625 | } |
| 4626 | #endif |
| 4627 | |
| 4628 | class ScheduleBundle final : public ScheduleEntity { |
| 4629 | /// The schedule data for the instructions in the bundle. |
| 4630 | SmallVector<ScheduleData *> Bundle; |
| 4631 | /// True if this bundle is valid. |
| 4632 | bool IsValid = true; |
| 4633 | /// The TreeEntry that this instruction corresponds to. |
| 4634 | TreeEntry *TE = nullptr; |
| 4635 | ScheduleBundle(bool IsValid) |
| 4636 | : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {} |
| 4637 | |
| 4638 | public: |
| 4639 | ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {} |
| 4640 | static bool classof(const ScheduleEntity *Entity) { |
| 4641 | return Entity->getKind() == Kind::ScheduleBundle; |
| 4642 | } |
| 4643 | |
| 4644 | /// Verify basic self consistency properties |
| 4645 | void verify() const { |
| 4646 | for (const ScheduleData *SD : Bundle) { |
| 4647 | if (SD->hasValidDependencies()) { |
| 4648 | assert(SD->getUnscheduledDeps() <= SD->getDependencies() && |
| 4649 | "invariant" ); |
| 4650 | } else { |
| 4651 | assert(SD->getUnscheduledDeps() == SD->getDependencies() && |
| 4652 | "invariant" ); |
| 4653 | } |
| 4654 | |
| 4655 | if (isScheduled()) { |
| 4656 | assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 && |
| 4657 | "unexpected scheduled state" ); |
| 4658 | } |
| 4659 | } |
| 4660 | } |
| 4661 | |
| 4662 | /// Returns the number of unscheduled dependencies in the bundle. |
| 4663 | int unscheduledDepsInBundle() const { |
| 4664 | assert(*this && "bundle must not be empty" ); |
| 4665 | int Sum = 0; |
| 4666 | for (const ScheduleData *BundleMember : Bundle) { |
| 4667 | if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps) |
| 4668 | return ScheduleData::InvalidDeps; |
| 4669 | Sum += BundleMember->getUnscheduledDeps(); |
| 4670 | } |
| 4671 | return Sum; |
| 4672 | } |
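|  | // E.g. (hypothetical counters): a bundle whose members have UnscheduledDeps |
|  | // of {0, 2, 1} reports 3 and is not ready yet; once every member reaches 0 |
|  | // the bundle becomes ready. If any member still has InvalidDeps, the whole |
|  | // bundle reports InvalidDeps (its dependencies are not calculated yet). |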
| 4673 | |
| 4674 | /// Returns true if the dependency information has been calculated. |
| 4675 | /// Note that dependency validity can vary between instructions within |
| 4676 | /// a single bundle. |
| 4677 | bool hasValidDependencies() const { |
| 4678 | return all_of(Range: Bundle, P: [](const ScheduleData *SD) { |
| 4679 | return SD->hasValidDependencies(); |
| 4680 | }); |
| 4681 | } |
| 4682 | |
| 4683 | /// Returns true if it is ready for scheduling, i.e. it has no more |
| 4684 | /// unscheduled depending instructions/bundles. |
| 4685 | bool isReady() const { |
| 4686 | assert(*this && "bundle must not be empty" ); |
| 4687 | return unscheduledDepsInBundle() == 0 && !isScheduled(); |
| 4688 | } |
| 4689 | |
| 4690 | /// Returns the bundle of scheduling data, associated with the current |
| 4691 | /// instruction. |
| 4692 | ArrayRef<ScheduleData *> getBundle() { return Bundle; } |
| 4693 | ArrayRef<const ScheduleData *> getBundle() const { return Bundle; } |
| 4694 | /// Adds an instruction to the bundle. |
| 4695 | void add(ScheduleData *SD) { Bundle.push_back(Elt: SD); } |
| 4696 | |
| 4697 | /// Gets/sets the associated tree entry. |
| 4698 | void setTreeEntry(TreeEntry *TE) { this->TE = TE; } |
| 4699 | TreeEntry *getTreeEntry() const { return TE; } |
| 4700 | |
| 4701 | static ScheduleBundle invalid() { return {false}; } |
| 4702 | |
| 4703 | operator bool() const { return IsValid; } |
| 4704 | |
| 4705 | #ifndef NDEBUG |
| 4706 | void dump(raw_ostream &OS) const { |
| 4707 | if (!*this) { |
| 4708 | OS << "[]" ; |
| 4709 | return; |
| 4710 | } |
| 4711 | OS << '['; |
| 4712 | interleaveComma(Bundle, OS, |
| 4713 | [&](const ScheduleData *SD) { OS << *SD->getInst(); }); |
| 4714 | OS << ']'; |
| 4715 | } |
| 4716 | |
| 4717 | LLVM_DUMP_METHOD void dump() const { |
| 4718 | dump(dbgs()); |
| 4719 | dbgs() << '\n'; |
| 4720 | } |
| 4721 | #endif // NDEBUG |
| 4722 | }; |
| 4723 | |
| 4724 | #ifndef NDEBUG |
| 4725 | friend inline raw_ostream &operator<<(raw_ostream &OS, |
| 4726 | const BoUpSLP::ScheduleBundle &Bundle) { |
| 4727 | Bundle.dump(OS); |
| 4728 | return OS; |
| 4729 | } |
| 4730 | #endif |
| 4731 | |
| 4732 | friend struct GraphTraits<BoUpSLP *>; |
| 4733 | friend struct DOTGraphTraits<BoUpSLP *>; |
| 4734 | |
| 4735 | /// Contains all scheduling data for a basic block. |
| 4736 | /// It does not schedule instructions that are not memory read/write |
| 4737 | /// instructions and whose operands are either constants, arguments, phis, or |
| 4738 | /// instructions from other blocks, or whose users are phis or reside in |
| 4739 | /// other blocks. The resulting vector instructions can be placed at the |
| 4740 | /// beginning of the basic block without scheduling (if the operands do not |
| 4741 | /// need to be scheduled) or at the end of the block (if the users are outside |
| 4742 | /// of the block). This saves some compile time and memory used by the |
| 4743 | /// compiler. |
| 4744 | /// ScheduleData is assigned to each instruction between the boundaries of |
| 4745 | /// the tree entry, even to those which are not part of the graph. It is |
| 4746 | /// required to correctly follow the dependencies between the instructions |
| 4747 | /// and to schedule them correctly. ScheduleData is not allocated for |
| 4748 | /// instructions which do not require scheduling, like phis, nodes consisting |
| 4749 | /// only of extractelements/insertelements, or nodes whose instructions have |
| 4750 | /// uses/operands outside of the block. |
| 4751 | struct BlockScheduling { |
| 4752 | BlockScheduling(BasicBlock *BB) |
| 4753 | : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} |
| 4754 | |
| 4755 | void clear() { |
| 4756 | ScheduledBundles.clear(); |
| 4757 | ScheduledBundlesList.clear(); |
| 4758 | ReadyInsts.clear(); |
| 4759 | ScheduleStart = nullptr; |
| 4760 | ScheduleEnd = nullptr; |
| 4761 | FirstLoadStoreInRegion = nullptr; |
| 4762 | LastLoadStoreInRegion = nullptr; |
| 4763 | RegionHasStackSave = false; |
| 4764 | |
| 4765 | // Reduce the maximum schedule region size by the size of the |
| 4766 | // previous scheduling run. |
| 4767 | ScheduleRegionSizeLimit -= ScheduleRegionSize; |
| 4768 | if (ScheduleRegionSizeLimit < MinScheduleRegionSize) |
| 4769 | ScheduleRegionSizeLimit = MinScheduleRegionSize; |
| 4770 | ScheduleRegionSize = 0; |
| 4771 | |
| 4772 | // Make a new scheduling region, i.e. all existing ScheduleData is not |
| 4773 | // in the new region yet. |
| 4774 | ++SchedulingRegionID; |
| 4775 | } |
| 4776 | |
| 4777 | ScheduleData *getScheduleData(Instruction *I) { |
| 4778 | if (!I) |
| 4779 | return nullptr; |
| 4780 | if (BB != I->getParent()) |
| 4781 | // Avoid lookup if can't possibly be in map. |
| 4782 | return nullptr; |
| 4783 | ScheduleData *SD = ScheduleDataMap.lookup(Val: I); |
| 4784 | if (SD && isInSchedulingRegion(SD)) |
| 4785 | return SD; |
| 4786 | return nullptr; |
| 4787 | } |
| 4788 | |
| 4789 | ScheduleData *getScheduleData(Value *V) { |
| 4790 | return getScheduleData(I: dyn_cast<Instruction>(Val: V)); |
| 4791 | } |
| 4792 | |
| 4793 | ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const { |
| 4794 | auto *I = dyn_cast<Instruction>(Val: V); |
| 4795 | if (!I) |
| 4796 | return {}; |
| 4797 | auto It = ScheduledBundles.find(Val: I); |
| 4798 | if (It == ScheduledBundles.end()) |
| 4799 | return {}; |
| 4800 | return It->getSecond(); |
| 4801 | } |
| 4802 | |
| 4803 | bool isInSchedulingRegion(ScheduleData *SD) const { |
| 4804 | return SD->getSchedulingRegionID() == SchedulingRegionID; |
| 4805 | } |
| 4806 | |
| 4807 | bool isInSchedulingRegion(const ScheduleBundle &Bundle) const { |
| 4808 | return all_of(Range: Bundle.getBundle(), P: [&](const ScheduleData *BundleMember) { |
| 4809 | return BundleMember->getSchedulingRegionID() == SchedulingRegionID; |
| 4810 | }); |
| 4811 | } |
| 4812 | |
| 4813 | /// Marks an instruction as scheduled and puts all dependent ready |
| 4814 | /// instructions into the ready-list. |
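| | /// |
| | /// Rough flow (illustrative summary of the code below, not an addition to |
| | /// its behavior): for each member of the scheduled bundle, the |
| | /// unscheduled-deps counters of the ScheduleData for its operands, its |
| | /// memory dependencies, and its control dependencies are decremented; |
| | /// whatever reaches zero is inserted into \p ReadyList, either as a plain |
| | /// ScheduleData or as the bundles that contain it. |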
| 4815 | template <typename ReadyListType> |
| 4816 | void schedule(ScheduleEntity *Data, ReadyListType &ReadyList) { |
| 4817 | auto ProcessBundleMember = [&](ScheduleData *BundleMember, |
| 4818 | ScheduleBundle *Bundle) { |
| 4819 | // Handle the def-use chain dependencies. |
| 4820 | |
| 4821 | // Decrement the unscheduled counter and insert to ready list if ready. |
| 4822 | auto DecrUnsched = [&](ScheduleData *Data, bool IsControl = false) { |
| 4823 | if ((IsControl || Data->hasValidDependencies()) && |
| 4824 | Data->incrementUnscheduledDeps(Incr: -1) == 0) { |
| 4825 | // There are no more unscheduled dependencies after |
| 4826 | // decrementing, so we can put the dependent instruction |
| 4827 | // into the ready list. |
| 4828 | if (ArrayRef<ScheduleBundle *> Bundles = |
| 4829 | getScheduleBundles(V: Data->getInst()); |
| 4830 | !Bundles.empty()) { |
| 4831 | for (ScheduleBundle *Bundle : Bundles) { |
| 4832 | if (Bundle->unscheduledDepsInBundle() == 0) { |
| 4833 | assert(!Bundle->isScheduled() && |
| 4834 | "already scheduled bundle gets ready" ); |
| 4835 | ReadyList.insert(Bundle); |
| 4836 | LLVM_DEBUG(dbgs() |
| 4837 | << "SLP: gets ready: " << *Bundle << "\n" ); |
| 4838 | } |
| 4839 | } |
| 4840 | return; |
| 4841 | } |
| 4842 | assert(!Data->isScheduled() && |
| 4843 | "already scheduled bundle gets ready" ); |
| 4844 | ReadyList.insert(Data); |
| 4845 | LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n" ); |
| 4846 | } |
| 4847 | }; |
| 4848 | |
| 4849 | auto DecrUnschedForInst = [&](Instruction *I) { |
| 4850 | if (ScheduleData *OpSD = getScheduleData(I)) |
| 4851 | DecrUnsched(OpSD, /*IsControl=*/false); |
| 4852 | }; |
| 4853 | |
| 4854 | // If BundleMember is a vector bundle, its operands may have been |
| 4855 | // reordered during buildTree(). We therefore need to get its operands |
| 4856 | // through the TreeEntry. |
| 4857 | if (Bundle) { |
| 4858 | // Need to search for the lane since the tree entry can be reordered. |
| 4859 | auto *In = BundleMember->getInst(); |
| 4860 | int Lane = std::distance(first: Bundle->getTreeEntry()->Scalars.begin(), |
| 4861 | last: find(Range&: Bundle->getTreeEntry()->Scalars, Val: In)); |
| 4862 | assert(Lane >= 0 && "Lane not set" ); |
| 4863 | |
| 4864 | // Since the vectorization tree is built recursively, this assertion |
| 4865 | // ensures that the tree entry has all operands set before reaching |
| 4866 | // this code. A couple of exceptions known at the moment are extracts, |
| 4867 | // whose second (immediate) operand is not added. Since |
| 4868 | // immediates do not affect scheduler behavior, this is considered |
| 4869 | // okay. |
| 4870 | assert(In && |
| 4871 | (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) || |
| 4872 | In->getNumOperands() == |
| 4873 | Bundle->getTreeEntry()->getNumOperands()) && |
| 4874 | "Missed TreeEntry operands?" ); |
| 4875 | |
| 4876 | for (unsigned OpIdx : |
| 4877 | seq<unsigned>(Size: Bundle->getTreeEntry()->getNumOperands())) |
| 4878 | if (auto *I = dyn_cast<Instruction>( |
| 4879 | Val: Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { |
| 4880 | LLVM_DEBUG(dbgs() |
| 4881 | << "SLP: check for readiness (def): " << *I << "\n" ); |
| 4882 | DecrUnschedForInst(I); |
| 4883 | } |
| 4884 | } else { |
| 4885 | // If BundleMember is a stand-alone instruction, no operand reordering |
| 4886 | // has taken place, so we directly access its operands. |
| 4887 | for (Use &U : BundleMember->getInst()->operands()) |
| 4888 | if (auto *I = dyn_cast<Instruction>(Val: U.get())) { |
| 4889 | LLVM_DEBUG(dbgs() |
| 4890 | << "SLP: check for readiness (def): " << *I << "\n" ); |
| 4891 | DecrUnschedForInst(I); |
| 4892 | } |
| 4893 | } |
| 4894 | // Handle the memory dependencies. |
| 4895 | for (ScheduleData *MemoryDep : BundleMember->getMemoryDependencies()) { |
| 4896 | // There are no more unscheduled dependencies after decrementing, |
| 4897 | // so we can put the dependent instruction into the ready list. |
| 4898 | LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): " |
| 4899 | << *MemoryDep << "\n" ); |
| 4900 | DecrUnsched(MemoryDep); |
| 4901 | } |
| 4902 | // Handle the control dependencies. |
| 4903 | for (ScheduleData *Dep : BundleMember->getControlDependencies()) { |
| 4904 | // There are no more unscheduled dependencies after decrementing, |
| 4905 | // so we can put the dependent instruction into the ready list. |
| 4906 | LLVM_DEBUG(dbgs() |
| 4907 | << "SLP: check for readiness (ctrl): " << *Dep << "\n" ); |
| 4908 | DecrUnsched(Dep, /*IsControl=*/true); |
| 4909 | } |
| 4910 | }; |
| 4911 | if (auto *SD = dyn_cast<ScheduleData>(Val: Data)) { |
| 4912 | SD->setScheduled(/*Scheduled=*/true); |
| 4913 | LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n" ); |
| 4914 | ProcessBundleMember(SD, nullptr); |
| 4915 | } else { |
| 4916 | ScheduleBundle &Bundle = *cast<ScheduleBundle>(Val: Data); |
| 4917 | Bundle.setScheduled(/*Scheduled=*/true); |
| 4918 | LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n" ); |
| 4919 | auto AreAllBundlesScheduled = [&](const ScheduleData *SD) { |
| 4920 | ArrayRef<ScheduleBundle *> SDBundles = |
| 4921 | getScheduleBundles(V: SD->getInst()); |
| 4922 | return !SDBundles.empty() && |
| 4923 | all_of(SDBundles, [&](const ScheduleBundle *SDBundle) { |
| 4924 | return SDBundle->isScheduled(); |
| 4925 | }); |
| 4926 | }; |
| 4927 | for (ScheduleData *SD : Bundle.getBundle()) { |
| 4928 | if (AreAllBundlesScheduled(SD)) { |
| 4929 | SD->setScheduled(/*Scheduled=*/true); |
| 4930 | ProcessBundleMember(SD, &Bundle); |
| 4931 | } |
| 4932 | } |
| 4933 | } |
| 4934 | } |
| 4935 | |
| 4936 | /// Verify basic self consistency properties of the data structure. |
| 4937 | void verify() { |
| 4938 | if (!ScheduleStart) |
| 4939 | return; |
| 4940 | |
| 4941 | assert(ScheduleStart->getParent() == ScheduleEnd->getParent() && |
| 4942 | ScheduleStart->comesBefore(ScheduleEnd) && |
| 4943 | "Not a valid scheduling region?" ); |
| 4944 | |
| 4945 | for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { |
| 4946 | ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: I); |
| 4947 | if (!Bundles.empty()) { |
| 4948 | for (ScheduleBundle *Bundle : Bundles) { |
| 4949 | assert(isInSchedulingRegion(*Bundle) && |
| 4950 | "primary schedule data not in window?" ); |
| 4951 | Bundle->verify(); |
| 4952 | } |
| 4953 | continue; |
| 4954 | } |
| 4955 | auto *SD = getScheduleData(I); |
| 4956 | if (!SD) |
| 4957 | continue; |
| 4958 | assert(isInSchedulingRegion(SD) && |
| 4959 | "primary schedule data not in window?" ); |
| 4960 | SD->verify(); |
| 4961 | } |
| 4962 | |
| 4963 | assert(all_of(ReadyInsts, |
| 4964 | [](const ScheduleEntity *Bundle) { |
| 4965 | return Bundle->isReady(); |
| 4966 | }) && |
| 4967 | "item in ready list not ready?" ); |
| 4968 | } |
| 4969 | |
| 4970 | /// Put all instructions that are ready for scheduling into the ReadyList. |
| 4971 | template <typename ReadyListType> |
| 4972 | void initialFillReadyList(ReadyListType &ReadyList) { |
| 4973 | SmallPtrSet<ScheduleBundle *, 16> Visited; |
| 4974 | for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { |
| 4975 | ScheduleData *SD = getScheduleData(I); |
| 4976 | if (SD && SD->hasValidDependencies() && SD->isReady()) { |
| 4977 | if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: I); |
| 4978 | !Bundles.empty()) { |
| 4979 | for (ScheduleBundle *Bundle : Bundles) { |
| 4980 | if (!Visited.insert(Ptr: Bundle).second) |
| 4981 | continue; |
| 4982 | if (Bundle->hasValidDependencies() && Bundle->isReady()) { |
| 4983 | ReadyList.insert(Bundle); |
| 4984 | LLVM_DEBUG(dbgs() << "SLP: initially in ready list: " |
| 4985 | << *Bundle << "\n" ); |
| 4986 | } |
| 4987 | } |
| 4988 | continue; |
| 4989 | } |
| 4990 | ReadyList.insert(SD); |
| 4991 | LLVM_DEBUG(dbgs() |
| 4992 | << "SLP: initially in ready list: " << *SD << "\n" ); |
| 4993 | } |
| 4994 | } |
| 4995 | } |
| 4996 | |
| 4997 | /// Build a bundle from the ScheduleData nodes corresponding to the |
| 4998 | /// scalar instruction for each lane. |
| 4999 | ScheduleBundle &buildBundle(ArrayRef<Value *> VL); |
| 5000 | |
| 5001 | /// Checks if a bundle of instructions can be scheduled, i.e. has no |
| 5002 | /// cyclic dependencies. This is only a dry run; no instructions are |
| 5003 | /// actually moved at this stage. |
| 5004 | /// \returns the scheduling bundle. The returned optional value is not |
| 5005 | /// std::nullopt if \p VL is allowed to be scheduled. |
| 5006 | std::optional<ScheduleBundle *> |
| 5007 | tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, |
| 5008 | const InstructionsState &S); |
| 5009 | |
| 5010 | /// Allocates schedule data chunk. |
| 5011 | ScheduleData *allocateScheduleDataChunks(); |
| 5012 | |
| 5013 | /// Extends the scheduling region so that V is inside the region. |
| 5014 | /// \returns true if the region size is within the limit. |
| 5015 | bool extendSchedulingRegion(Value *V, const InstructionsState &S); |
| 5016 | |
| 5017 | /// Initialize the ScheduleData structures for new instructions in the |
| 5018 | /// scheduling region. |
| 5019 | void initScheduleData(Instruction *FromI, Instruction *ToI, |
| 5020 | ScheduleData *PrevLoadStore, |
| 5021 | ScheduleData *NextLoadStore); |
| 5022 | |
| 5023 | /// Updates the dependency information of a bundle and of all instructions/ |
| 5024 | /// bundles which depend on the original bundle. |
| 5025 | void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList, |
| 5026 | BoUpSLP *SLP); |
| 5027 | |
| 5028 | /// Sets all instructions in the scheduling region to un-scheduled. |
| 5029 | void resetSchedule(); |
| 5030 | |
| 5031 | BasicBlock *BB; |
| 5032 | |
| 5033 | /// Simple memory allocation for ScheduleData. |
| 5034 | SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks; |
| 5035 | |
| 5036 | /// The size of a ScheduleData array in ScheduleDataChunks. |
| 5037 | int ChunkSize; |
| 5038 | |
| 5039 | /// The allocator position in the current chunk, which is the last entry |
| 5040 | /// of ScheduleDataChunks. |
| 5041 | int ChunkPos; |
| 5042 | |
| 5043 | /// Attaches ScheduleData to Instruction. |
| 5044 | /// Note that the mapping survives during all vectorization iterations, i.e. |
| 5045 | /// ScheduleData structures are recycled. |
| 5046 | SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap; |
| 5047 | |
| 5048 | /// Attaches ScheduleBundle to Instruction. |
| 5049 | SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>> |
| 5050 | ScheduledBundles; |
| 5051 | /// The list of ScheduleBundles. |
| 5052 | SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList; |
| 5053 | |
| 5054 | /// The ready-list for scheduling (only used for the dry-run). |
| 5055 | SetVector<ScheduleEntity *> ReadyInsts; |
| 5056 | |
| 5057 | /// The first instruction of the scheduling region. |
| 5058 | Instruction *ScheduleStart = nullptr; |
| 5059 | |
| 5060 | /// The first instruction _after_ the scheduling region. |
| 5061 | Instruction *ScheduleEnd = nullptr; |
| 5062 | |
| 5063 | /// The first memory accessing instruction in the scheduling region |
| 5064 | /// (can be null). |
| 5065 | ScheduleData *FirstLoadStoreInRegion = nullptr; |
| 5066 | |
| 5067 | /// The last memory accessing instruction in the scheduling region |
| 5068 | /// (can be null). |
| 5069 | ScheduleData *LastLoadStoreInRegion = nullptr; |
| 5070 | |
| 5071 | /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling |
| 5072 | /// region? Used to optimize the dependence calculation for the |
| 5073 | /// common case where there isn't. |
| 5074 | bool RegionHasStackSave = false; |
| 5075 | |
| 5076 | /// The current size of the scheduling region. |
| 5077 | int ScheduleRegionSize = 0; |
| 5078 | |
| 5079 | /// The maximum size allowed for the scheduling region. |
| 5080 | int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; |
| 5081 | |
| 5082 | /// The ID of the scheduling region. For a new vectorization iteration this |
| 5083 | /// is incremented which "removes" all ScheduleData from the region. |
| 5084 | /// Make sure that the initial SchedulingRegionID is greater than the |
| 5085 | /// initial SchedulingRegionID in ScheduleData (which is 0). |
| 5086 | int SchedulingRegionID = 1; |
| 5087 | }; |
| 5088 | |
| 5089 | /// Attaches the BlockScheduling structures to basic blocks. |
| 5090 | MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules; |
| 5091 | |
| 5092 | /// Performs the "real" scheduling. Done before vectorization is actually |
| 5093 | /// performed in a basic block. |
| 5094 | void scheduleBlock(BlockScheduling *BS); |
| 5095 | |
| 5096 | /// List of users to ignore during scheduling and that don't need extracting. |
| 5097 | const SmallDenseSet<Value *> *UserIgnoreList = nullptr; |
| 5098 | |
| 5099 | /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of |
| 5100 | /// sorted SmallVectors of unsigned. |
| 5101 | struct OrdersTypeDenseMapInfo { |
| 5102 | static OrdersType getEmptyKey() { |
| 5103 | OrdersType V; |
| 5104 | V.push_back(Elt: ~1U); |
| 5105 | return V; |
| 5106 | } |
| 5107 | |
| 5108 | static OrdersType getTombstoneKey() { |
| 5109 | OrdersType V; |
| 5110 | V.push_back(Elt: ~2U); |
| 5111 | return V; |
| 5112 | } |
| 5113 | |
| 5114 | static unsigned getHashValue(const OrdersType &V) { |
| 5115 | return static_cast<unsigned>(hash_combine_range(R: V)); |
| 5116 | } |
| 5117 | |
| 5118 | static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) { |
| 5119 | return LHS == RHS; |
| 5120 | } |
| 5121 | }; |
| 5122 | |
| 5123 | // Analysis and block reference. |
| 5124 | Function *F; |
| 5125 | ScalarEvolution *SE; |
| 5126 | TargetTransformInfo *TTI; |
| 5127 | TargetLibraryInfo *TLI; |
| 5128 | LoopInfo *LI; |
| 5129 | DominatorTree *DT; |
| 5130 | AssumptionCache *AC; |
| 5131 | DemandedBits *DB; |
| 5132 | const DataLayout *DL; |
| 5133 | OptimizationRemarkEmitter *ORE; |
| 5134 | |
| 5135 | unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. |
| 5136 | unsigned MinVecRegSize; // Set by cl::opt (default: 128). |
| 5137 | |
| 5138 | /// Instruction builder to construct the vectorized tree. |
| 5139 | IRBuilder<TargetFolder> Builder; |
| 5140 | |
| 5141 | /// A map of scalar integer values to the smallest bit width with which they |
| 5142 | /// can legally be represented. The values map to (width, signed) pairs, |
| 5143 | /// where "width" indicates the minimum bit width and "signed" is True if the |
| 5144 | /// value must be signed-extended, rather than zero-extended, back to its |
| 5145 | /// original width. |
| 5146 | DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs; |
| 5147 | |
| 5148 | /// Final size of the reduced vector, if the current graph represents the |
| 5149 | /// input for the reduction and it was possible to narrow the size of the |
| 5150 | /// reduction. |
| 5151 | unsigned ReductionBitWidth = 0; |
| 5152 | |
| 5153 | /// Canonical graph size before the transformations. |
| 5154 | unsigned BaseGraphSize = 1; |
| 5155 | |
| 5156 | /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of |
| 5157 | /// type sizes, used in the tree. |
| 5158 | std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes; |
| 5159 | |
| 5160 | /// Indices of the vectorized nodes, which are supposed to be the roots of the |
| 5161 | /// new bitwidth analysis attempt, like trunc, IToFP or ICmp. |
| 5162 | DenseSet<unsigned> ExtraBitWidthNodes; |
| 5163 | }; |
| 5164 | |
| 5165 | } // end namespace slpvectorizer |
| 5166 | |
| 5167 | template <> struct GraphTraits<BoUpSLP *> { |
| 5168 | using TreeEntry = BoUpSLP::TreeEntry; |
| 5169 | |
| 5170 | /// NodeRef has to be a pointer per the GraphWriter. |
| 5171 | using NodeRef = TreeEntry *; |
| 5172 | |
| 5173 | using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy; |
| 5174 | |
| 5175 | /// Add the VectorizableTree to the index iterator to be able to return |
| 5176 | /// TreeEntry pointers. |
| 5177 | struct ChildIteratorType |
| 5178 | : public iterator_adaptor_base< |
| 5179 | ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> { |
| 5180 | ContainerTy &VectorizableTree; |
| 5181 | |
| 5182 | ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W, |
| 5183 | ContainerTy &VT) |
| 5184 | : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {} |
| 5185 | |
| 5186 | NodeRef operator*() { return I->UserTE; } |
| 5187 | }; |
| 5188 | |
| 5189 | static NodeRef getEntryNode(BoUpSLP &R) { |
| 5190 | return R.VectorizableTree[0].get(); |
| 5191 | } |
| 5192 | |
| 5193 | static ChildIteratorType child_begin(NodeRef N) { |
| 5194 | return {&N->UserTreeIndex, N->Container}; |
| 5195 | } |
| 5196 | |
| 5197 | static ChildIteratorType child_end(NodeRef N) { |
| 5198 | return {&N->UserTreeIndex + 1, N->Container}; |
| 5199 | } |
| 5200 | |
| 5201 | /// For the node iterator we just need to turn the TreeEntry iterator into a |
| 5202 | /// TreeEntry* iterator so that it dereferences to NodeRef. |
| 5203 | class nodes_iterator { |
| 5204 | using ItTy = ContainerTy::iterator; |
| 5205 | ItTy It; |
| 5206 | |
| 5207 | public: |
| 5208 | nodes_iterator(const ItTy &It2) : It(It2) {} |
| 5209 | NodeRef operator*() { return It->get(); } |
| 5210 | nodes_iterator operator++() { |
| 5211 | ++It; |
| 5212 | return *this; |
| 5213 | } |
| 5214 | bool operator!=(const nodes_iterator &N2) const { return N2.It != It; } |
| 5215 | }; |
| 5216 | |
| 5217 | static nodes_iterator nodes_begin(BoUpSLP *R) { |
| 5218 | return nodes_iterator(R->VectorizableTree.begin()); |
| 5219 | } |
| 5220 | |
| 5221 | static nodes_iterator nodes_end(BoUpSLP *R) { |
| 5222 | return nodes_iterator(R->VectorizableTree.end()); |
| 5223 | } |
| 5224 | |
| 5225 | static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); } |
| 5226 | }; |
| 5227 | |
| 5228 | template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits { |
| 5229 | using TreeEntry = BoUpSLP::TreeEntry; |
| 5230 | |
| 5231 | DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} |
| 5232 | |
| 5233 | std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) { |
| 5234 | std::string Str; |
| 5235 | raw_string_ostream OS(Str); |
| 5236 | OS << Entry->Idx << ".\n" ; |
| 5237 | if (isSplat(VL: Entry->Scalars)) |
| 5238 | OS << "<splat> " ; |
| 5239 | for (auto *V : Entry->Scalars) { |
| 5240 | OS << *V; |
| 5241 | if (llvm::any_of(Range: R->ExternalUses, P: [&](const BoUpSLP::ExternalUser &EU) { |
| 5242 | return EU.Scalar == V; |
| 5243 | })) |
| 5244 | OS << " <extract>" ; |
| 5245 | OS << "\n" ; |
| 5246 | } |
| 5247 | return Str; |
| 5248 | } |
| 5249 | |
| 5250 | static std::string getNodeAttributes(const TreeEntry *Entry, |
| 5251 | const BoUpSLP *) { |
| 5252 | if (Entry->isGather()) |
| 5253 | return "color=red" ; |
| 5254 | if (Entry->State == TreeEntry::ScatterVectorize || |
| 5255 | Entry->State == TreeEntry::StridedVectorize || |
| 5256 | Entry->State == TreeEntry::CompressVectorize) |
| 5257 | return "color=blue" ; |
| 5258 | return "" ; |
| 5259 | } |
| 5260 | }; |
| 5261 | |
| 5262 | } // end namespace llvm |
| 5263 | |
| 5264 | BoUpSLP::~BoUpSLP() { |
| 5265 | SmallVector<WeakTrackingVH> DeadInsts; |
| 5266 | for (auto *I : DeletedInstructions) { |
| 5267 | if (!I->getParent()) { |
| 5268 | // Temporarily insert the instruction back so it can be erased from its |
| 5269 | // parent and from memory later. |
| 5270 | if (isa<PHINode>(Val: I)) |
| 5271 | // Phi nodes must be the very first instructions in the block. |
| 5272 | I->insertBefore(BB&: F->getEntryBlock(), |
| 5273 | InsertPos: F->getEntryBlock().getFirstNonPHIIt()); |
| 5274 | else |
| 5275 | I->insertBefore(InsertPos: F->getEntryBlock().getTerminator()->getIterator()); |
| 5276 | continue; |
| 5277 | } |
| 5278 | for (Use &U : I->operands()) { |
| 5279 | auto *Op = dyn_cast<Instruction>(Val: U.get()); |
| 5280 | if (Op && !DeletedInstructions.count(V: Op) && Op->hasOneUser() && |
| 5281 | wouldInstructionBeTriviallyDead(I: Op, TLI)) |
| 5282 | DeadInsts.emplace_back(Args&: Op); |
| 5283 | } |
| 5284 | I->dropAllReferences(); |
| 5285 | } |
| 5286 | for (auto *I : DeletedInstructions) { |
| 5287 | assert(I->use_empty() && |
| 5288 | "trying to erase instruction with users." ); |
| 5289 | I->eraseFromParent(); |
| 5290 | } |
| 5291 | |
| 5292 | // Cleanup any dead scalar code feeding the vectorized instructions |
| 5293 | RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI); |
| 5294 | |
| 5295 | #ifdef EXPENSIVE_CHECKS |
| 5296 | // If we could guarantee that this call is not extremely slow, we could |
| 5297 | // remove the ifdef limitation (see PR47712). |
| 5298 | assert(!verifyFunction(*F, &dbgs())); |
| 5299 | #endif |
| 5300 | } |
| 5301 | |
| 5302 | /// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses |
| 5303 | /// contains the original mask for the scalars reused in the node. The procedure |
| 5304 | /// transforms this mask in accordance with the given \p Mask. |
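| | /// |
| | /// Illustrative example (not from the original source): with |
| | /// Reuses = {0, 1, 1, 0} and Mask = {1, 0, 3, 2}, the old element at |
| | /// position I is moved to position Mask[I], giving Reuses = {1, 0, 0, 1}. |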
| 5305 | static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) { |
| 5306 | assert(!Mask.empty() && Reuses.size() == Mask.size() && |
| 5307 | "Expected non-empty mask." ); |
| 5308 | SmallVector<int> Prev(Reuses.begin(), Reuses.end()); |
| 5309 | Prev.swap(RHS&: Reuses); |
| 5310 | for (unsigned I = 0, E = Prev.size(); I < E; ++I) |
| 5311 | if (Mask[I] != PoisonMaskElem) |
| 5312 | Reuses[Mask[I]] = Prev[I]; |
| 5313 | } |
| 5314 | |
| 5315 | /// Reorders the given \p Order according to the given \p Mask. \p Order is |
| 5316 | /// the original order of the scalars. The procedure transforms the provided |
| 5317 | /// order in accordance with the given \p Mask. If the resulting \p Order is |
| 5318 | /// just an identity order, \p Order is cleared. |
| 5319 | static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask, |
| 5320 | bool BottomOrder = false) { |
| 5321 | assert(!Mask.empty() && "Expected non-empty mask." ); |
| 5322 | unsigned Sz = Mask.size(); |
| 5323 | if (BottomOrder) { |
| 5324 | SmallVector<unsigned> PrevOrder; |
| 5325 | if (Order.empty()) { |
| 5326 | PrevOrder.resize(N: Sz); |
| 5327 | std::iota(first: PrevOrder.begin(), last: PrevOrder.end(), value: 0); |
| 5328 | } else { |
| 5329 | PrevOrder.swap(RHS&: Order); |
| 5330 | } |
| 5331 | Order.assign(NumElts: Sz, Elt: Sz); |
| 5332 | for (unsigned I = 0; I < Sz; ++I) |
| 5333 | if (Mask[I] != PoisonMaskElem) |
| 5334 | Order[I] = PrevOrder[Mask[I]]; |
| 5335 | if (all_of(Range: enumerate(First&: Order), P: [&](const auto &Data) { |
| 5336 | return Data.value() == Sz || Data.index() == Data.value(); |
| 5337 | })) { |
| 5338 | Order.clear(); |
| 5339 | return; |
| 5340 | } |
| 5341 | fixupOrderingIndices(Order); |
| 5342 | return; |
| 5343 | } |
| 5344 | SmallVector<int> MaskOrder; |
| 5345 | if (Order.empty()) { |
| 5346 | MaskOrder.resize(N: Sz); |
| 5347 | std::iota(first: MaskOrder.begin(), last: MaskOrder.end(), value: 0); |
| 5348 | } else { |
| 5349 | inversePermutation(Indices: Order, Mask&: MaskOrder); |
| 5350 | } |
| 5351 | reorderReuses(Reuses&: MaskOrder, Mask); |
| 5352 | if (ShuffleVectorInst::isIdentityMask(Mask: MaskOrder, NumSrcElts: Sz)) { |
| 5353 | Order.clear(); |
| 5354 | return; |
| 5355 | } |
| 5356 | Order.assign(NumElts: Sz, Elt: Sz); |
| 5357 | for (unsigned I = 0; I < Sz; ++I) |
| 5358 | if (MaskOrder[I] != PoisonMaskElem) |
| 5359 | Order[MaskOrder[I]] = I; |
| 5360 | fixupOrderingIndices(Order); |
| 5361 | } |
| 5362 | |
| 5363 | std::optional<BoUpSLP::OrdersType> |
| 5364 | BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE, |
| 5365 | bool TopToBottom, bool IgnoreReorder) { |
| 5366 | assert(TE.isGather() && "Expected gather node only." ); |
| 5367 | // Try to find subvector extract/insert patterns and reorder only such |
| 5368 | // patterns. |
| 5369 | SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end()); |
| 5370 | Type *ScalarTy = GatheredScalars.front()->getType(); |
| 5371 | size_t NumScalars = GatheredScalars.size(); |
| 5372 | if (!isValidElementType(Ty: ScalarTy)) |
| 5373 | return std::nullopt; |
| 5374 | auto *VecTy = getWidenedType(ScalarTy, VF: NumScalars); |
| 5375 | unsigned NumParts = ::getNumberOfParts(TTI: *TTI, VecTy, Limit: NumScalars); |
| 5376 | SmallVector<int> ExtractMask; |
| 5377 | SmallVector<int> Mask; |
| 5378 | SmallVector<SmallVector<const TreeEntry *>> Entries; |
| 5379 | SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles = |
| 5380 | tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts); |
| 5381 | SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles = |
| 5382 | isGatherShuffledEntry(TE: &TE, VL: GatheredScalars, Mask, Entries, NumParts, |
| 5383 | /*ForOrder=*/true); |
| 5384 | // No shuffled operands - ignore. |
| 5385 | if (GatherShuffles.empty() && ExtractShuffles.empty()) |
| 5386 | return std::nullopt; |
| 5387 | OrdersType CurrentOrder(NumScalars, NumScalars); |
| 5388 | if (GatherShuffles.size() == 1 && |
| 5389 | *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && |
| 5390 | Entries.front().front()->isSame(VL: TE.Scalars)) { |
| 5391 | // If the node fully matches a previously vectorized node during whole-tree |
| 5392 | // rotation, there is no need to consider the matching order; the whole tree is rotated. |
| 5393 | if (TopToBottom) |
| 5394 | return std::nullopt; |
| 5395 | // No need to keep the order for the same user node. |
| 5396 | if (Entries.front().front()->UserTreeIndex.UserTE == |
| 5397 | TE.UserTreeIndex.UserTE) |
| 5398 | return std::nullopt; |
| 5399 | // No need to keep the order for the matched root node, if it can be freely |
| 5400 | // reordered. |
| 5401 | if (!IgnoreReorder && Entries.front().front()->Idx == 0) |
| 5402 | return std::nullopt; |
| 5403 | // If shuffling only 2 elements and the matching node has reversed reuses, |
| 5404 | // there is no need to track the order; both orders work equally well. |
| 5405 | if (!Entries.front().front()->ReuseShuffleIndices.empty() && |
| 5406 | TE.getVectorFactor() == 2 && Mask.size() == 2 && |
| 5407 | any_of(Range: enumerate(First: Entries.front().front()->ReuseShuffleIndices), |
| 5408 | P: [](const auto &P) { |
| 5409 | return P.value() % 2 != static_cast<int>(P.index()) % 2; |
| 5410 | })) |
| 5411 | return std::nullopt; |
| 5412 | |
| 5413 | // Perfect match in the graph, will reuse the previously vectorized |
| 5414 | // node. Cost is 0. |
| 5415 | std::iota(first: CurrentOrder.begin(), last: CurrentOrder.end(), value: 0); |
| 5416 | return CurrentOrder; |
| 5417 | } |
| 5418 | auto IsSplatMask = [](ArrayRef<int> Mask) { |
| 5419 | int SingleElt = PoisonMaskElem; |
| 5420 | return all_of(Range&: Mask, P: [&](int I) { |
| 5421 | if (SingleElt == PoisonMaskElem && I != PoisonMaskElem) |
| 5422 | SingleElt = I; |
| 5423 | return I == PoisonMaskElem || I == SingleElt; |
| 5424 | }); |
| 5425 | }; |
| 5426 | // Exclusive broadcast mask - ignore. |
| 5427 | if ((ExtractShuffles.empty() && IsSplatMask(Mask) && |
| 5428 | (Entries.size() != 1 || |
| 5429 | Entries.front().front()->ReorderIndices.empty())) || |
| 5430 | (GatherShuffles.empty() && IsSplatMask(ExtractMask))) |
| 5431 | return std::nullopt; |
| 5432 | SmallBitVector ShuffledSubMasks(NumParts); |
| 5433 | auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder, |
| 5434 | ArrayRef<int> Mask, int PartSz, int NumParts, |
| 5435 | function_ref<unsigned(unsigned)> GetVF) { |
| 5436 | for (int I : seq<int>(Begin: 0, End: NumParts)) { |
| 5437 | if (ShuffledSubMasks.test(Idx: I)) |
| 5438 | continue; |
| 5439 | const int VF = GetVF(I); |
| 5440 | if (VF == 0) |
| 5441 | continue; |
| 5442 | unsigned Limit = getNumElems(Size: CurrentOrder.size(), PartNumElems: PartSz, Part: I); |
| 5443 | MutableArrayRef<unsigned> Slice = CurrentOrder.slice(N: I * PartSz, M: Limit); |
| 5444 | // Shuffle of at least 2 vectors - ignore. |
| 5445 | if (any_of(Range&: Slice, P: [&](unsigned I) { return I != NumScalars; })) { |
| 5446 | llvm::fill(Range&: Slice, Value&: NumScalars); |
| 5447 | ShuffledSubMasks.set(I); |
| 5448 | continue; |
| 5449 | } |
| 5450 | // Try to include as many elements from the mask as possible. |
| 5451 | int FirstMin = INT_MAX; |
| 5452 | bool SecondVecFound = false; |
| 5453 | for (int K : seq<int>(Size: Limit)) { |
| 5454 | int Idx = Mask[I * PartSz + K]; |
| 5455 | if (Idx == PoisonMaskElem) { |
| 5456 | Value *V = GatheredScalars[I * PartSz + K]; |
| 5457 | if (isConstant(V) && !isa<PoisonValue>(Val: V)) { |
| 5458 | SecondVecFound = true; |
| 5459 | break; |
| 5460 | } |
| 5461 | continue; |
| 5462 | } |
| 5463 | if (Idx < VF) { |
| 5464 | if (FirstMin > Idx) |
| 5465 | FirstMin = Idx; |
| 5466 | } else { |
| 5467 | SecondVecFound = true; |
| 5468 | break; |
| 5469 | } |
| 5470 | } |
| 5471 | FirstMin = (FirstMin / PartSz) * PartSz; |
| 5472 | // Shuffle of at least 2 vectors - ignore. |
| 5473 | if (SecondVecFound) { |
| 5474 | llvm::fill(Range&: Slice, Value&: NumScalars); |
| 5475 | ShuffledSubMasks.set(I); |
| 5476 | continue; |
| 5477 | } |
| 5478 | for (int K : seq<int>(Size: Limit)) { |
| 5479 | int Idx = Mask[I * PartSz + K]; |
| 5480 | if (Idx == PoisonMaskElem) |
| 5481 | continue; |
| 5482 | Idx -= FirstMin; |
| 5483 | if (Idx >= PartSz) { |
| 5484 | SecondVecFound = true; |
| 5485 | break; |
| 5486 | } |
| 5487 | if (CurrentOrder[I * PartSz + Idx] > |
| 5488 | static_cast<unsigned>(I * PartSz + K) && |
| 5489 | CurrentOrder[I * PartSz + Idx] != |
| 5490 | static_cast<unsigned>(I * PartSz + Idx)) |
| 5491 | CurrentOrder[I * PartSz + Idx] = I * PartSz + K; |
| 5492 | } |
| 5493 | // Shuffle of at least 2 vectors - ignore. |
| 5494 | if (SecondVecFound) { |
| 5495 | llvm::fill(Range&: Slice, Value&: NumScalars); |
| 5496 | ShuffledSubMasks.set(I); |
| 5497 | continue; |
| 5498 | } |
| 5499 | } |
| 5500 | }; |
| 5501 | int PartSz = getPartNumElems(Size: NumScalars, NumParts); |
| 5502 | if (!ExtractShuffles.empty()) |
| 5503 | TransformMaskToOrder( |
| 5504 | CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) { |
| 5505 | if (!ExtractShuffles[I]) |
| 5506 | return 0U; |
| 5507 | unsigned VF = 0; |
| 5508 | unsigned Sz = getNumElems(Size: TE.getVectorFactor(), PartNumElems: PartSz, Part: I); |
| 5509 | for (unsigned Idx : seq<unsigned>(Size: Sz)) { |
| 5510 | int K = I * PartSz + Idx; |
| 5511 | if (ExtractMask[K] == PoisonMaskElem) |
| 5512 | continue; |
| 5513 | if (!TE.ReuseShuffleIndices.empty()) |
| 5514 | K = TE.ReuseShuffleIndices[K]; |
| 5515 | if (K == PoisonMaskElem) |
| 5516 | continue; |
| 5517 | if (!TE.ReorderIndices.empty()) |
| 5518 | K = std::distance(first: TE.ReorderIndices.begin(), |
| 5519 | last: find(Range: TE.ReorderIndices, Val: K)); |
| 5520 | auto *EI = dyn_cast<ExtractElementInst>(Val: TE.Scalars[K]); |
| 5521 | if (!EI) |
| 5522 | continue; |
| 5523 | VF = std::max(a: VF, b: cast<VectorType>(Val: EI->getVectorOperandType()) |
| 5524 | ->getElementCount() |
| 5525 | .getKnownMinValue()); |
| 5526 | } |
| 5527 | return VF; |
| 5528 | }); |
| 5529 | // Check special corner case - single shuffle of the same entry. |
| 5530 | if (GatherShuffles.size() == 1 && NumParts != 1) { |
| 5531 | if (ShuffledSubMasks.any()) |
| 5532 | return std::nullopt; |
| 5533 | PartSz = NumScalars; |
| 5534 | NumParts = 1; |
| 5535 | } |
| 5536 | if (!Entries.empty()) |
| 5537 | TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) { |
| 5538 | if (!GatherShuffles[I]) |
| 5539 | return 0U; |
| 5540 | return std::max(a: Entries[I].front()->getVectorFactor(), |
| 5541 | b: Entries[I].back()->getVectorFactor()); |
| 5542 | }); |
| 5543 | unsigned NumUndefs = |
| 5544 | count_if(Range&: CurrentOrder, P: [&](unsigned Idx) { return Idx == NumScalars; }); |
| 5545 | if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2)) |
| 5546 | return std::nullopt; |
| 5547 | return std::move(CurrentOrder); |
| 5548 | } |
| 5549 | |
| 5550 | static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, |
| 5551 | const TargetLibraryInfo &TLI, |
| 5552 | bool CompareOpcodes = true) { |
| 5553 | if (getUnderlyingObject(V: Ptr1, MaxLookup: RecursionMaxDepth) != |
| 5554 | getUnderlyingObject(V: Ptr2, MaxLookup: RecursionMaxDepth)) |
| 5555 | return false; |
| 5556 | auto *GEP1 = dyn_cast<GetElementPtrInst>(Val: Ptr1); |
| 5557 | auto *GEP2 = dyn_cast<GetElementPtrInst>(Val: Ptr2); |
| 5558 | return (!GEP1 || GEP1->getNumOperands() == 2) && |
| 5559 | (!GEP2 || GEP2->getNumOperands() == 2) && |
| 5560 | (((!GEP1 || isConstant(V: GEP1->getOperand(i_nocapture: 1))) && |
| 5561 | (!GEP2 || isConstant(V: GEP2->getOperand(i_nocapture: 1)))) || |
| 5562 | !CompareOpcodes || |
| 5563 | (GEP1 && GEP2 && |
| 5564 | getSameOpcode(VL: {GEP1->getOperand(i_nocapture: 1), GEP2->getOperand(i_nocapture: 1)}, TLI))); |
| 5565 | } |
| 5566 | |
| 5567 | /// Calculates minimal alignment as a common alignment. |
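| | /// For example, loads/stores aligned to {16, 8, 4} bytes yield a common |
| | /// alignment of 4. |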
| 5568 | template <typename T> |
| 5569 | static Align computeCommonAlignment(ArrayRef<Value *> VL) { |
| 5570 | Align CommonAlignment = cast<T>(VL.front())->getAlign(); |
| 5571 | for (Value *V : VL.drop_front()) |
| 5572 | CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign()); |
| 5573 | return CommonAlignment; |
| 5574 | } |
| 5575 | |
| 5576 | /// Check if \p Order represents reverse order. |
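| | /// An order of {3, 2, 1, 0} (optionally with Sz used as an "undefined" |
| | /// placeholder, e.g. {4, 2, 1, 0} for Sz == 4) is treated as reversed. |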
| 5577 | static bool isReverseOrder(ArrayRef<unsigned> Order) { |
| 5578 | assert(!Order.empty() && |
| 5579 | "Order is empty. Please check it before using isReverseOrder." ); |
| 5580 | unsigned Sz = Order.size(); |
| 5581 | return all_of(Range: enumerate(First&: Order), P: [&](const auto &Pair) { |
| 5582 | return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value(); |
| 5583 | }); |
| 5584 | } |
| 5585 | |
| 5586 | /// Checks if the provided list of pointers \p PointerOps represents strided |
| 5587 | /// pointers for type ElemTy. If they are not, std::nullopt is returned. |
| 5588 | /// Otherwise, if \p Inst is not specified, just an initialized optional value |
| 5589 | /// is returned to show that the pointers represent strided pointers. If \p Inst |
| 5590 | /// is specified, the runtime stride is materialized before the given \p Inst. |
| 5591 | /// \returns std::nullopt if the pointers do not have a runtime stride; |
| 5592 | /// otherwise nullptr (no \p Inst) or the actual stride value. |
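| | /// |
| | /// Illustrative sketch (not from the original source): for pointers |
| | ///   p, p + n*sizeof(ElemTy), p + 2*n*sizeof(ElemTy), p + 3*n*sizeof(ElemTy) |
| | /// with a runtime value n, the detected stride is n (in elements); with |
| | /// \p Inst provided, an instruction computing n is emitted before \p Inst. |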
| 5593 | static std::optional<Value *> |
| 5594 | calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy, |
| 5595 | const DataLayout &DL, ScalarEvolution &SE, |
| 5596 | SmallVectorImpl<unsigned> &SortedIndices, |
| 5597 | Instruction *Inst = nullptr) { |
| 5598 | SmallVector<const SCEV *> SCEVs; |
| 5599 | const SCEV *PtrSCEVLowest = nullptr; |
| 5600 | const SCEV *PtrSCEVHighest = nullptr; |
| 5601 | // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest |
| 5602 | // addresses). |
| 5603 | for (Value *Ptr : PointerOps) { |
| 5604 | const SCEV *PtrSCEV = SE.getSCEV(V: Ptr); |
| 5605 | if (!PtrSCEV) |
| 5606 | return std::nullopt; |
| 5607 | SCEVs.push_back(Elt: PtrSCEV); |
| 5608 | if (!PtrSCEVLowest && !PtrSCEVHighest) { |
| 5609 | PtrSCEVLowest = PtrSCEVHighest = PtrSCEV; |
| 5610 | continue; |
| 5611 | } |
| 5612 | const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest); |
| 5613 | if (isa<SCEVCouldNotCompute>(Val: Diff)) |
| 5614 | return std::nullopt; |
| 5615 | if (Diff->isNonConstantNegative()) { |
| 5616 | PtrSCEVLowest = PtrSCEV; |
| 5617 | continue; |
| 5618 | } |
| 5619 | const SCEV *Diff1 = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEV); |
| 5620 | if (isa<SCEVCouldNotCompute>(Val: Diff1)) |
| 5621 | return std::nullopt; |
| 5622 | if (Diff1->isNonConstantNegative()) { |
| 5623 | PtrSCEVHighest = PtrSCEV; |
| 5624 | continue; |
| 5625 | } |
| 5626 | } |
| 5627 | // Dist = PtrSCEVHighest - PtrSCEVLowest; |
| 5628 | const SCEV *Dist = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEVLowest); |
| 5629 | if (isa<SCEVCouldNotCompute>(Val: Dist)) |
| 5630 | return std::nullopt; |
| 5631 | int Size = DL.getTypeStoreSize(Ty: ElemTy); |
| 5632 | auto TryGetStride = [&](const SCEV *Dist, |
| 5633 | const SCEV *Multiplier) -> const SCEV * { |
| 5634 | if (const auto *M = dyn_cast<SCEVMulExpr>(Val: Dist)) { |
| 5635 | if (M->getOperand(i: 0) == Multiplier) |
| 5636 | return M->getOperand(i: 1); |
| 5637 | if (M->getOperand(i: 1) == Multiplier) |
| 5638 | return M->getOperand(i: 0); |
| 5639 | return nullptr; |
| 5640 | } |
| 5641 | if (Multiplier == Dist) |
| 5642 | return SE.getConstant(Ty: Dist->getType(), V: 1); |
| 5643 | return SE.getUDivExactExpr(LHS: Dist, RHS: Multiplier); |
| 5644 | }; |
| 5645 | // Stride_in_elements = Dist / (element_size * (num_elems - 1)). |
| 5646 | const SCEV *Stride = nullptr; |
| 5647 | if (Size != 1 || SCEVs.size() > 2) { |
| 5648 | const SCEV *Sz = SE.getConstant(Ty: Dist->getType(), V: Size * (SCEVs.size() - 1)); |
| 5649 | Stride = TryGetStride(Dist, Sz); |
| 5650 | if (!Stride) |
| 5651 | return std::nullopt; |
| 5652 | } |
| 5653 | if (!Stride || isa<SCEVConstant>(Val: Stride)) |
| 5654 | return std::nullopt; |
| 5655 | // Iterate through all pointers and check that each distance is a |
| 5656 | // unique multiple of Stride. |
| 5657 | using DistOrdPair = std::pair<int64_t, int>; |
| 5658 | auto Compare = llvm::less_first(); |
| 5659 | std::set<DistOrdPair, decltype(Compare)> Offsets(Compare); |
| 5660 | int Cnt = 0; |
| 5661 | bool IsConsecutive = true; |
| 5662 | for (const SCEV *PtrSCEV : SCEVs) { |
| 5663 | unsigned Dist = 0; |
| 5664 | if (PtrSCEV != PtrSCEVLowest) { |
| 5665 | const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest); |
| 5666 | const SCEV *Coeff = TryGetStride(Diff, Stride); |
| 5667 | if (!Coeff) |
| 5668 | return std::nullopt; |
| 5669 | const auto *SC = dyn_cast<SCEVConstant>(Val: Coeff); |
| 5670 | if (!SC || isa<SCEVCouldNotCompute>(Val: SC)) |
| 5671 | return std::nullopt; |
| 5672 | if (!SE.getMinusSCEV(LHS: PtrSCEV, RHS: SE.getAddExpr(LHS: PtrSCEVLowest, |
| 5673 | RHS: SE.getMulExpr(LHS: Stride, RHS: SC))) |
| 5674 | ->isZero()) |
| 5675 | return std::nullopt; |
| 5676 | Dist = SC->getAPInt().getZExtValue(); |
| 5677 | } |
| 5678 | // If the strides are not the same or repeated, we can't vectorize. |
| 5679 | if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size()) |
| 5680 | return std::nullopt; |
| 5681 | auto Res = Offsets.emplace(args&: Dist, args&: Cnt); |
| 5682 | if (!Res.second) |
| 5683 | return std::nullopt; |
| 5684 | // Consecutive order if the inserted element is the last one. |
| 5685 | IsConsecutive = IsConsecutive && std::next(x: Res.first) == Offsets.end(); |
| 5686 | ++Cnt; |
| 5687 | } |
| 5688 | if (Offsets.size() != SCEVs.size()) |
| 5689 | return std::nullopt; |
| 5690 | SortedIndices.clear(); |
| 5691 | if (!IsConsecutive) { |
| 5692 | // Fill SortedIndices array only if it is non-consecutive. |
| 5693 | SortedIndices.resize(N: PointerOps.size()); |
| 5694 | Cnt = 0; |
| 5695 | for (const std::pair<int64_t, int> &Pair : Offsets) { |
| 5696 | SortedIndices[Cnt] = Pair.second; |
| 5697 | ++Cnt; |
| 5698 | } |
| 5699 | } |
| 5700 | if (!Inst) |
| 5701 | return nullptr; |
| 5702 | SCEVExpander Expander(SE, DL, "strided-load-vec" ); |
| 5703 | return Expander.expandCodeFor(SH: Stride, Ty: Stride->getType(), I: Inst); |
| 5704 | } |
| 5705 | |
| 5706 | static std::pair<InstructionCost, InstructionCost> |
| 5707 | getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs, |
| 5708 | Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, |
| 5709 | Type *ScalarTy, VectorType *VecTy); |
| 5710 | |
| 5711 | /// Returns the cost of the shuffle instructions with the given \p Kind, vector |
| 5712 | /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the |
| 5713 | /// insert-subvector pattern. |
| 5714 | static InstructionCost |
| 5715 | getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, |
| 5716 | VectorType *Tp, ArrayRef<int> Mask = {}, |
| 5717 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, |
| 5718 | int Index = 0, VectorType *SubTp = nullptr, |
| 5719 | ArrayRef<const Value *> Args = {}) { |
| 5720 | VectorType *DstTy = Tp; |
| 5721 | if (!Mask.empty()) |
| 5722 | DstTy = FixedVectorType::get(ElementType: Tp->getScalarType(), NumElts: Mask.size()); |
| 5723 | |
| 5724 | if (Kind != TTI::SK_PermuteTwoSrc) |
| 5725 | return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp, |
| 5726 | Args); |
| 5727 | int NumSrcElts = Tp->getElementCount().getKnownMinValue(); |
| 5728 | int NumSubElts; |
| 5729 | if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask( |
| 5730 | Mask, NumSrcElts, NumSubElts, Index)) { |
| 5731 | if (Index + NumSubElts > NumSrcElts && |
| 5732 | Index + NumSrcElts <= static_cast<int>(Mask.size())) |
| 5733 | return TTI.getShuffleCost(Kind: TTI::SK_InsertSubvector, DstTy, SrcTy: Tp, Mask, |
| 5734 | CostKind: TTI::TCK_RecipThroughput, Index, SubTp: Tp); |
| 5735 | } |
| 5736 | return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp, |
| 5737 | Args); |
| 5738 | } |
| 5739 | |
| 5740 | /// This is similar to TargetTransformInfo::getScalarizationOverhead, but if |
| 5741 | /// ScalarTy is a FixedVectorType, a vector will be inserted or extracted |
| 5742 | /// instead of a scalar. |
| 5743 | static InstructionCost |
| 5744 | getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy, |
| 5745 | VectorType *Ty, const APInt &DemandedElts, bool Insert, |
| 5746 | bool Extract, TTI::TargetCostKind CostKind, |
| 5747 | bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) { |
| 5748 | assert(!isa<ScalableVectorType>(Ty) && |
| 5749 | "ScalableVectorType is not supported." ); |
| 5750 | assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() == |
| 5751 | getNumElements(Ty) && |
| 5752 | "Incorrect usage." ); |
| 5753 | if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) { |
| 5754 | assert(SLPReVec && "Only supported by REVEC." ); |
| 5755 | // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead |
| 5756 | // of CreateInsertElement. |
| 5757 | unsigned ScalarTyNumElements = VecTy->getNumElements(); |
| 5758 | InstructionCost Cost = 0; |
| 5759 | for (unsigned I : seq(Size: DemandedElts.getBitWidth())) { |
| 5760 | if (!DemandedElts[I]) |
| 5761 | continue; |
| 5762 | if (Insert) |
| 5763 | Cost += getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: Ty, Mask: {}, CostKind, |
| 5764 | Index: I * ScalarTyNumElements, SubTp: VecTy); |
| 5765 | if (Extract) |
| 5766 | Cost += getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: Ty, Mask: {}, CostKind, |
| 5767 | Index: I * ScalarTyNumElements, SubTp: VecTy); |
| 5768 | } |
| 5769 | return Cost; |
| 5770 | } |
| 5771 | return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract, |
| 5772 | CostKind, ForPoisonSrc, VL); |
| 5773 | } |
| 5774 | |
| 5775 | /// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy |
| 5776 | /// is a FixedVectorType, a vector will be extracted instead of a scalar. |
| 5777 | static InstructionCost getVectorInstrCost( |
| 5778 | const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val, |
| 5779 | TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, |
| 5780 | ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) { |
| 5781 | if (Opcode == Instruction::ExtractElement) { |
| 5782 | if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) { |
| 5783 | assert(SLPReVec && "Only supported by REVEC." ); |
| 5784 | assert(isa<VectorType>(Val) && "Val must be a vector type." ); |
| 5785 | return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, |
| 5786 | Tp: cast<VectorType>(Val), Mask: {}, CostKind, |
| 5787 | Index: Index * VecTy->getNumElements(), SubTp: VecTy); |
| 5788 | } |
| 5789 | } |
| 5790 | return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar, |
| 5791 | ScalarUserAndIdx); |
| 5792 | } |
| 5793 | |
| 5794 | /// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst |
| 5795 | /// is a FixedVectorType, a vector will be extracted instead of a scalar. |
| 5796 | static InstructionCost getExtractWithExtendCost( |
| 5797 | const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst, |
| 5798 | VectorType *VecTy, unsigned Index, |
| 5799 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) { |
| 5800 | if (auto *ScalarTy = dyn_cast<FixedVectorType>(Val: Dst)) { |
| 5801 | assert(SLPReVec && "Only supported by REVEC." ); |
| 5802 | auto *SubTp = |
| 5803 | getWidenedType(ScalarTy: VecTy->getElementType(), VF: ScalarTy->getNumElements()); |
| 5804 | return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: VecTy, Mask: {}, CostKind, |
| 5805 | Index: Index * ScalarTy->getNumElements(), SubTp) + |
| 5806 | TTI.getCastInstrCost(Opcode, Dst, Src: SubTp, CCH: TTI::CastContextHint::None, |
| 5807 | CostKind); |
| 5808 | } |
| 5809 | return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind); |
| 5810 | } |
| 5811 | |
| 5812 | /// Correctly creates insert_subvector, checking that the index is a multiple |
| 5813 | /// of the subvector's length. Otherwise, generates a shuffle using \p Generator |
| 5814 | /// or using a default shuffle. |
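| | /// |
| | /// Illustrative example (not from the original source): inserting a |
| | /// 2-element \p V into an 8-element \p Vec at Index 3 (not a multiple of 2) |
| | /// falls back to a shuffle with mask {0, 1, 2, 8, 9, 5, 6, 7}. |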
| 5815 | static Value *createInsertVector( |
| 5816 | IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, |
| 5817 | function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) { |
| 5818 | const unsigned SubVecVF = getNumElements(Ty: V->getType()); |
| 5819 | if (Index % SubVecVF == 0) { |
| 5820 | Vec = Builder.CreateInsertVector(DstType: Vec->getType(), SrcVec: Vec, SubVec: V, Idx: Index); |
| 5821 | } else { |
| 5822 | // Create a shuffle; insertvector requires that the index is a multiple of |
| 5823 | // the subvector length. |
| 5824 | const unsigned VecVF = getNumElements(Ty: Vec->getType()); |
| 5825 | SmallVector<int> Mask(VecVF, PoisonMaskElem); |
| 5826 | std::iota(first: Mask.begin(), last: Mask.end(), value: 0); |
| 5827 | for (unsigned I : seq<unsigned>(Size: SubVecVF)) |
| 5828 | Mask[I + Index] = I + VecVF; |
| 5829 | if (Generator) { |
| 5830 | Vec = Generator(Vec, V, Mask); |
| 5831 | } else { |
| 5832 | // 1. Resize V to the size of Vec. |
| 5833 | SmallVector<int> ResizeMask(VecVF, PoisonMaskElem); |
| 5834 | std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: SubVecVF), value: 0); |
| 5835 | V = Builder.CreateShuffleVector(V, Mask: ResizeMask); |
| 5836 | Vec = Builder.CreateShuffleVector(V1: Vec, V2: V, Mask); |
| 5837 | } |
| 5838 | } |
| 5839 | return Vec; |
| 5840 | } |
| 5841 | |
| 5842 | /// Correctly creates extract_subvector, checking that the index is a multiple |
| 5843 | /// of the subvector's length. Otherwise, generates a shuffle with the |
| 5844 | /// appropriate extraction mask. |
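| | /// |
| | /// Illustrative example (not from the original source): extracting a |
| | /// 4-element subvector at Index 2 (not a multiple of 4) uses a shuffle with |
| | /// mask {2, 3, 4, 5}. |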
| 5845 | static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec, |
| 5846 | unsigned SubVecVF, unsigned Index) { |
| 5847 | if (Index % SubVecVF == 0) { |
| 5848 | VectorType *SubVecTy = |
| 5849 | getWidenedType(ScalarTy: Vec->getType()->getScalarType(), VF: SubVecVF); |
| 5850 | return Builder.CreateExtractVector(DstType: SubVecTy, SrcVec: Vec, Idx: Index); |
| 5851 | } |
| 5852 | // Create a shuffle; extract_subvector requires that the index is a multiple |
| 5853 | // of the subvector length. |
| 5854 | SmallVector<int> Mask(SubVecVF, PoisonMaskElem); |
| 5855 | std::iota(first: Mask.begin(), last: Mask.end(), value: Index); |
| 5856 | return Builder.CreateShuffleVector(V: Vec, Mask); |
| 5857 | } |
| 5858 | |
| 5859 | /// Builds a compress-like mask for shuffles for the given \p PointerOps, |
| 5860 | /// ordered with \p Order. |
| 5861 | /// \return true if the mask represents strided access, false otherwise. |
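| | /// |
| | /// Illustrative example (not from the original source): pointers at element |
| | /// offsets {0, 2, 4, 6} from the first pointer give CompressMask = {0, 2, 4, 6} |
| | /// and a detected stride of 2, so true is returned; offsets {0, 1, 3, 7} give |
| | /// CompressMask = {0, 1, 3, 7} and false (no common stride). |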
| 5862 | static bool buildCompressMask(ArrayRef<Value *> PointerOps, |
| 5863 | ArrayRef<unsigned> Order, Type *ScalarTy, |
| 5864 | const DataLayout &DL, ScalarEvolution &SE, |
| 5865 | SmallVectorImpl<int> &CompressMask) { |
| 5866 | const unsigned Sz = PointerOps.size(); |
| 5867 | CompressMask.assign(NumElts: Sz, Elt: PoisonMaskElem); |
| 5868 | // The first element is always set. |
| 5869 | CompressMask[0] = 0; |
| 5870 | // Check if the mask represents strided access. |
| 5871 | std::optional<unsigned> Stride = 0; |
| 5872 | Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()]; |
| 5873 | for (unsigned I : seq<unsigned>(Begin: 1, End: Sz)) { |
| 5874 | Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]]; |
| 5875 | std::optional<int64_t> OptPos = |
| 5876 | getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL, SE); |
| 5877 | if (!OptPos || OptPos > std::numeric_limits<unsigned>::max()) |
| 5878 | return false; |
| 5879 | unsigned Pos = static_cast<unsigned>(*OptPos); |
| 5880 | CompressMask[I] = Pos; |
| 5881 | if (!Stride) |
| 5882 | continue; |
| 5883 | if (*Stride == 0) { |
| 5884 | *Stride = Pos; |
| 5885 | continue; |
| 5886 | } |
| 5887 | if (Pos != *Stride * I) |
| 5888 | Stride.reset(); |
| 5889 | } |
| 5890 | return Stride.has_value(); |
| 5891 | } |
| 5892 | |
| 5893 | /// Checks if \p VL can be transformed into a (masked) load + compress or a |
| 5894 | /// (masked) interleaved load. |
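| | /// |
| | /// Illustrative sketch (not from the original source): for loads of elements |
| | /// at offsets {0, 2, 3, 7} from a common base, the candidate transform loads |
| | /// the whole 8-element range as one (possibly masked) vector load and then, |
| | /// if deemed profitable, applies a shuffle with compress mask {0, 2, 3, 7} to |
| | /// pick out the requested lanes. |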
| 5895 | static bool isMaskedLoadCompress( |
| 5896 | ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps, |
| 5897 | ArrayRef<unsigned> Order, const TargetTransformInfo &TTI, |
| 5898 | const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC, |
| 5899 | const DominatorTree &DT, const TargetLibraryInfo &TLI, |
| 5900 | const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked, |
| 5901 | unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask, |
| 5902 | VectorType *&LoadVecTy) { |
| 5903 | InterleaveFactor = 0; |
| 5904 | Type *ScalarTy = VL.front()->getType(); |
| 5905 | const size_t Sz = VL.size(); |
| 5906 | auto *VecTy = getWidenedType(ScalarTy, VF: Sz); |
| 5907 | constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 5908 | SmallVector<int> Mask; |
| 5909 | if (!Order.empty()) |
| 5910 | inversePermutation(Indices: Order, Mask); |
| 5911 | // Check external uses. |
| 5912 | for (const auto [I, V] : enumerate(First&: VL)) { |
| 5913 | if (AreAllUsersVectorized(V)) |
| 5914 | continue; |
| 5915 | InstructionCost ExtractCost = |
| 5916 | TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, CostKind, |
| 5917 | Index: Mask.empty() ? I : Mask[I]); |
| 5918 | InstructionCost ScalarCost = |
| 5919 | TTI.getInstructionCost(U: cast<Instruction>(Val: V), CostKind); |
| 5920 | if (ExtractCost <= ScalarCost) |
| 5921 | return false; |
| 5922 | } |
| 5923 | Value *Ptr0; |
| 5924 | Value *PtrN; |
| 5925 | if (Order.empty()) { |
| 5926 | Ptr0 = PointerOps.front(); |
| 5927 | PtrN = PointerOps.back(); |
| 5928 | } else { |
| 5929 | Ptr0 = PointerOps[Order.front()]; |
| 5930 | PtrN = PointerOps[Order.back()]; |
| 5931 | } |
| 5932 | std::optional<int64_t> Diff = |
| 5933 | getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL, SE); |
| 5934 | if (!Diff) |
| 5935 | return false; |
| 5936 | const size_t MaxRegSize = |
| 5937 | TTI.getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector) |
| 5938 | .getFixedValue(); |
| 5939 | // Check for very large distances between elements. |
| 5940 | if (*Diff / Sz >= MaxRegSize / 8) |
| 5941 | return false; |
| 5942 | LoadVecTy = getWidenedType(ScalarTy, VF: *Diff + 1); |
| 5943 | auto *LI = cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()]); |
| 5944 | Align CommonAlignment = LI->getAlign(); |
| 5945 | IsMasked = !isSafeToLoadUnconditionally( |
| 5946 | V: Ptr0, Ty: LoadVecTy, Alignment: CommonAlignment, DL, |
| 5947 | ScanFrom: cast<LoadInst>(Val: Order.empty() ? VL.back() : VL[Order.back()]), AC: &AC, DT: &DT, |
| 5948 | TLI: &TLI); |
| 5949 | if (IsMasked && !TTI.isLegalMaskedLoad(DataType: LoadVecTy, Alignment: CommonAlignment, |
| 5950 | AddressSpace: LI->getPointerAddressSpace())) |
| 5951 | return false; |
| 5952 | // TODO: perform the analysis of each scalar load for better |
| 5953 | // safe-load-unconditionally analysis. |
| 5954 | bool IsStrided = |
| 5955 | buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask); |
| 5956 | assert(CompressMask.size() >= 2 && "At least two elements are required" ); |
| 5957 | SmallVector<Value *> OrderedPointerOps(PointerOps); |
| 5958 | if (!Order.empty()) |
| 5959 | reorderScalars(Scalars&: OrderedPointerOps, Mask); |
| 5960 | auto [ScalarGEPCost, VectorGEPCost] = |
| 5961 | getGEPCosts(TTI, Ptrs: OrderedPointerOps, BasePtr: OrderedPointerOps.front(), |
| 5962 | Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy: LoadVecTy); |
| 5963 | // The cost of scalar loads. |
| 5964 | InstructionCost ScalarLoadsCost = |
| 5965 | std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(), |
| 5966 | binary_op: [&](InstructionCost C, Value *V) { |
| 5967 | return C + TTI.getInstructionCost(U: cast<Instruction>(Val: V), |
| 5968 | CostKind); |
| 5969 | }) + |
| 5970 | ScalarGEPCost; |
| 5971 | APInt DemandedElts = APInt::getAllOnes(numBits: Sz); |
| 5972 | InstructionCost GatherCost = |
| 5973 | getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts, |
| 5974 | /*Insert=*/true, |
| 5975 | /*Extract=*/false, CostKind) + |
| 5976 | ScalarLoadsCost; |
| 5977 | InstructionCost LoadCost = 0; |
| 5978 | if (IsMasked) { |
| 5979 | LoadCost = |
| 5980 | TTI.getMaskedMemoryOpCost(Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment, |
| 5981 | AddressSpace: LI->getPointerAddressSpace(), CostKind); |
| 5982 | } else { |
| 5983 | LoadCost = |
| 5984 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment, |
| 5985 | AddressSpace: LI->getPointerAddressSpace(), CostKind); |
| 5986 | } |
| 5987 | if (IsStrided && !IsMasked && Order.empty()) { |
| 5988 | // Check for potential segmented (interleaved) loads. |
| 5989 | VectorType *AlignedLoadVecTy = getWidenedType( |
| 5990 | ScalarTy, VF: getFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: *Diff + 1)); |
| 5991 | if (!isSafeToLoadUnconditionally(V: Ptr0, Ty: AlignedLoadVecTy, Alignment: CommonAlignment, |
| 5992 | DL, ScanFrom: cast<LoadInst>(Val: VL.back()), AC: &AC, DT: &DT, |
| 5993 | TLI: &TLI)) |
| 5994 | AlignedLoadVecTy = LoadVecTy; |
| 5995 | if (TTI.isLegalInterleavedAccessType(VTy: AlignedLoadVecTy, Factor: CompressMask[1], |
| 5996 | Alignment: CommonAlignment, |
| 5997 | AddrSpace: LI->getPointerAddressSpace())) { |
| 5998 | InstructionCost InterleavedCost = |
| 5999 | VectorGEPCost + TTI.getInterleavedMemoryOpCost( |
| 6000 | Opcode: Instruction::Load, VecTy: AlignedLoadVecTy, |
| 6001 | Factor: CompressMask[1], Indices: {}, Alignment: CommonAlignment, |
| 6002 | AddressSpace: LI->getPointerAddressSpace(), CostKind, UseMaskForCond: IsMasked); |
| 6003 | if (InterleavedCost < GatherCost) { |
| 6004 | InterleaveFactor = CompressMask[1]; |
| 6005 | LoadVecTy = AlignedLoadVecTy; |
| 6006 | return true; |
| 6007 | } |
| 6008 | } |
| 6009 | } |
| 6010 | InstructionCost CompressCost = ::getShuffleCost( |
| 6011 | TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: LoadVecTy, Mask: CompressMask, CostKind); |
| 6012 | if (!Order.empty()) { |
| 6013 | SmallVector<int> NewMask(Sz, PoisonMaskElem); |
| 6014 | for (unsigned I : seq<unsigned>(Size: Sz)) { |
| 6015 | NewMask[I] = CompressMask[Mask[I]]; |
| 6016 | } |
| 6017 | CompressMask.swap(RHS&: NewMask); |
| 6018 | } |
| 6019 | InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost; |
| 6020 | return TotalVecCost < GatherCost; |
| 6021 | } |
| 6022 | |
| 6023 | /// Checks if the \p VL can be transformed to a (masked) load + compress or |
| 6024 | /// (masked) interleaved load. |
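| | /// E.g., loads at element offsets {0, 1, 3, 4} may be represented as a |
| | /// (masked) load of 5 consecutive elements followed by a compressing |
| | /// shuffle that selects elements {0, 1, 3, 4}. |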
| 6025 | static bool |
| 6026 | isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps, |
| 6027 | ArrayRef<unsigned> Order, const TargetTransformInfo &TTI, |
| 6028 | const DataLayout &DL, ScalarEvolution &SE, |
| 6029 | AssumptionCache &AC, const DominatorTree &DT, |
| 6030 | const TargetLibraryInfo &TLI, |
| 6031 | const function_ref<bool(Value *)> AreAllUsersVectorized) { |
| 6032 | bool IsMasked; |
| 6033 | unsigned InterleaveFactor; |
| 6034 | SmallVector<int> CompressMask; |
| 6035 | VectorType *LoadVecTy; |
| 6036 | return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI, |
| 6037 | AreAllUsersVectorized, IsMasked, InterleaveFactor, |
| 6038 | CompressMask, LoadVecTy); |
| 6039 | } |
| 6040 | |
| 6041 | /// Checks if strided loads can be generated out of \p VL loads with pointers \p |
| 6042 | /// PointerOps: |
| 6043 | /// 1. Target with strided load support is detected. |
| 6044 | /// 2. The number of loads is greater than MinProfitableStridedLoads, or the |
| 6045 | /// potential stride <= MaxProfitableLoadStride and the potential stride is a |
| 6046 | /// power-of-2 (to avoid perf regressions for the very small number of loads) |
| 6047 | /// and the max distance > the number of loads, or the potential stride is -1. |
| 6048 | /// 3. The loads are ordered, or the number of unordered loads <= |
| 6049 | /// MaxProfitableUnorderedLoads, or the loads are in reversed order (this |
| 6050 | /// check avoids extra costs for very expensive shuffles). |
| 6051 | /// 4. Any pointer operand is an instruction with users outside of the |
| 6052 | /// current graph (for masked gathers extra extractelement instructions |
| 6053 | /// might be required). |
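| | /// For example, 4 loads at element offsets {0, 3, 6, 9} from a common base |
| | /// give a maximum distance Diff of 9 and a potential stride of |
| | /// 9 / (4 - 1) == 3. |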
| 6054 | static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps, |
| 6055 | ArrayRef<unsigned> Order, |
| 6056 | const TargetTransformInfo &TTI, const DataLayout &DL, |
| 6057 | ScalarEvolution &SE, |
| 6058 | const bool IsAnyPointerUsedOutGraph, |
| 6059 | const int64_t Diff) { |
| 6060 | const size_t Sz = VL.size(); |
| 6061 | const uint64_t AbsoluteDiff = std::abs(i: Diff); |
| 6062 | Type *ScalarTy = VL.front()->getType(); |
| 6063 | auto *VecTy = getWidenedType(ScalarTy, VF: Sz); |
| 6064 | if (IsAnyPointerUsedOutGraph || |
| 6065 | (AbsoluteDiff > Sz && |
| 6066 | (Sz > MinProfitableStridedLoads || |
| 6067 | (AbsoluteDiff <= MaxProfitableLoadStride * Sz && |
| 6068 | AbsoluteDiff % Sz == 0 && has_single_bit(Value: AbsoluteDiff / Sz)))) || |
| 6069 | Diff == -(static_cast<int64_t>(Sz) - 1)) { |
| 6070 | int64_t Stride = Diff / static_cast<int64_t>(Sz - 1); |
| 6071 | if (Diff != Stride * static_cast<int64_t>(Sz - 1)) |
| 6072 | return false; |
| 6073 | Align Alignment = |
| 6074 | cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()]) |
| 6075 | ->getAlign(); |
| 6076 | if (!TTI.isLegalStridedLoadStore(DataType: VecTy, Alignment)) |
| 6077 | return false; |
| 6078 | Value *Ptr0; |
| 6079 | Value *PtrN; |
| 6080 | if (Order.empty()) { |
| 6081 | Ptr0 = PointerOps.front(); |
| 6082 | PtrN = PointerOps.back(); |
| 6083 | } else { |
| 6084 | Ptr0 = PointerOps[Order.front()]; |
| 6085 | PtrN = PointerOps[Order.back()]; |
| 6086 | } |
| 6087 | // Iterate through all pointers and check if all distances are |
| 6088 | // unique multiples of Stride. |
| 6089 | SmallSet<int64_t, 4> Dists; |
| 6090 | for (Value *Ptr : PointerOps) { |
| 6091 | int64_t Dist = 0; |
| 6092 | if (Ptr == PtrN) |
| 6093 | Dist = Diff; |
| 6094 | else if (Ptr != Ptr0) |
| 6095 | Dist = *getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL, SE); |
| 6096 | // If the strides are not the same or repeated, we can't |
| 6097 | // vectorize. |
| 6098 | if (((Dist / Stride) * Stride) != Dist || !Dists.insert(V: Dist).second) |
| 6099 | break; |
| 6100 | } |
| 6101 | if (Dists.size() == Sz) |
| 6102 | return true; |
| 6103 | } |
| 6104 | return false; |
| 6105 | } |
| 6106 | |
| 6107 | BoUpSLP::LoadsState |
| 6108 | BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0, |
| 6109 | SmallVectorImpl<unsigned> &Order, |
| 6110 | SmallVectorImpl<Value *> &PointerOps, |
| 6111 | unsigned *BestVF, bool TryRecursiveCheck) const { |
| 6112 | // Check that a vectorized load would load the same memory as a scalar |
| 6113 | // load. For example, we don't want to vectorize loads that are smaller |
| 6114 | // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM |
| 6115 | // treats loading/storing it as an i8 struct. If we vectorize loads/stores |
| 6116 | // from such a struct, we read/write packed bits disagreeing with the |
| 6117 | // unvectorized version. |
| 6118 | if (BestVF) |
| 6119 | *BestVF = 0; |
| 6120 | if (areKnownNonVectorizableLoads(VL)) |
| 6121 | return LoadsState::Gather; |
| 6122 | Type *ScalarTy = VL0->getType(); |
| 6123 | |
| 6124 | if (DL->getTypeSizeInBits(Ty: ScalarTy) != DL->getTypeAllocSizeInBits(Ty: ScalarTy)) |
| 6125 | return LoadsState::Gather; |
| 6126 | |
| 6127 | // Make sure all loads in the bundle are simple - we can't vectorize |
| 6128 | // atomic or volatile loads. |
| 6129 | PointerOps.clear(); |
| 6130 | const size_t Sz = VL.size(); |
| 6131 | PointerOps.resize(N: Sz); |
| 6132 | auto *POIter = PointerOps.begin(); |
| 6133 | for (Value *V : VL) { |
| 6134 | auto *L = dyn_cast<LoadInst>(Val: V); |
| 6135 | if (!L || !L->isSimple()) |
| 6136 | return LoadsState::Gather; |
| 6137 | *POIter = L->getPointerOperand(); |
| 6138 | ++POIter; |
| 6139 | } |
| 6140 | |
| 6141 | Order.clear(); |
| 6142 | // Check the order of pointer operands or that all pointers are the same. |
| 6143 | bool IsSorted = sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order); |
| 6144 | |
| 6145 | auto *VecTy = getWidenedType(ScalarTy, VF: Sz); |
| 6146 | Align CommonAlignment = computeCommonAlignment<LoadInst>(VL); |
| 6147 | if (!IsSorted) { |
| 6148 | if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(Ty: VecTy)) { |
| 6149 | if (TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment) && |
| 6150 | calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order)) |
| 6151 | return LoadsState::StridedVectorize; |
| 6152 | } |
| 6153 | |
| 6154 | if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) || |
| 6155 | TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment)) |
| 6156 | return LoadsState::Gather; |
| 6157 | |
| 6158 | if (!all_of(Range&: PointerOps, P: [&](Value *P) { |
| 6159 | return arePointersCompatible(Ptr1: P, Ptr2: PointerOps.front(), TLI: *TLI); |
| 6160 | })) |
| 6161 | return LoadsState::Gather; |
| 6162 | |
| 6163 | } else { |
| 6164 | Value *Ptr0; |
| 6165 | Value *PtrN; |
| 6166 | if (Order.empty()) { |
| 6167 | Ptr0 = PointerOps.front(); |
| 6168 | PtrN = PointerOps.back(); |
| 6169 | } else { |
| 6170 | Ptr0 = PointerOps[Order.front()]; |
| 6171 | PtrN = PointerOps[Order.back()]; |
| 6172 | } |
| 6173 | std::optional<int64_t> Diff = |
| 6174 | getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE); |
| 6175 | // Check that the sorted loads are consecutive. |
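| | // (E.g., 4 sorted loads whose first-to-last pointer distance is 3 elements.) |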
| 6176 | if (static_cast<uint64_t>(*Diff) == Sz - 1) |
| 6177 | return LoadsState::Vectorize; |
| 6178 | if (isMaskedLoadCompress(VL, PointerOps, Order, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT, |
| 6179 | TLI: *TLI, AreAllUsersVectorized: [&](Value *V) { |
| 6180 | return areAllUsersVectorized( |
| 6181 | I: cast<Instruction>(Val: V), VectorizedVals: UserIgnoreList); |
| 6182 | })) |
| 6183 | return LoadsState::CompressVectorize; |
| 6184 | // Simple check if the access is possibly strided (distance divisible by Sz - 1). |
| 6185 | bool IsPossibleStrided = *Diff % (Sz - 1) == 0; |
| 6186 | // Try to generate strided load node. |
| 6187 | auto IsAnyPointerUsedOutGraph = |
| 6188 | IsPossibleStrided && any_of(Range&: PointerOps, P: [&](Value *V) { |
| 6189 | return isa<Instruction>(Val: V) && any_of(Range: V->users(), P: [&](User *U) { |
| 6190 | return !isVectorized(V: U) && !MustGather.contains(Ptr: U); |
| 6191 | }); |
| 6192 | }); |
| 6193 | if (IsPossibleStrided && |
| 6194 | isStridedLoad(VL, PointerOps, Order, TTI: *TTI, DL: *DL, SE&: *SE, |
| 6195 | IsAnyPointerUsedOutGraph, Diff: *Diff)) |
| 6196 | return LoadsState::StridedVectorize; |
| 6197 | } |
| 6198 | if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) || |
| 6199 | TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment)) |
| 6200 | return LoadsState::Gather; |
| 6201 | // Compare the cost of loads + shuffles against strided/masked gather |
| 6202 | // loads. Returns true if the vectorized + shuffles representation is |
| 6203 | // better than just gather. |
| 6204 | auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment, |
| 6205 | unsigned *BestVF, |
| 6206 | bool ProfitableGatherPointers) { |
| 6207 | if (BestVF) |
| 6208 | *BestVF = 0; |
| 6209 | // Compare masked gather cost and loads + insert subvector costs. |
| 6210 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 6211 | auto [ScalarGEPCost, VectorGEPCost] = |
| 6212 | getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: PointerOps.front(), |
| 6213 | Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy); |
| 6214 | // Estimate the cost of masked gather GEP. If not a splat, roughly |
| 6215 | // estimate as a buildvector, otherwise estimate as splat. |
| 6216 | APInt DemandedElts = APInt::getAllOnes(numBits: Sz); |
| 6217 | Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType(); |
| 6218 | VectorType *PtrVecTy = getWidenedType(ScalarTy: PtrScalarTy, VF: Sz); |
| 6219 | if (static_cast<unsigned>(count_if( |
| 6220 | Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 || |
| 6221 | any_of(Range&: PointerOps, P: [&](Value *V) { |
| 6222 | return getUnderlyingObject(V) != |
| 6223 | getUnderlyingObject(V: PointerOps.front()); |
| 6224 | })) |
| 6225 | VectorGEPCost += getScalarizationOverhead(TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy, |
| 6226 | DemandedElts, /*Insert=*/true, |
| 6227 | /*Extract=*/false, CostKind); |
| 6228 | else |
| 6229 | VectorGEPCost += |
| 6230 | getScalarizationOverhead( |
| 6231 | TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy, DemandedElts: APInt::getOneBitSet(numBits: Sz, BitNo: 0), |
| 6232 | /*Insert=*/true, /*Extract=*/false, CostKind) + |
| 6233 | ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: PtrVecTy, Mask: {}, CostKind); |
| 6234 | // The cost of scalar loads. |
| 6235 | InstructionCost ScalarLoadsCost = |
| 6236 | std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(), |
| 6237 | binary_op: [&](InstructionCost C, Value *V) { |
| 6238 | return C + TTI.getInstructionCost( |
| 6239 | U: cast<Instruction>(Val: V), CostKind); |
| 6240 | }) + |
| 6241 | ScalarGEPCost; |
| 6242 | // The cost of masked gather. |
| 6243 | InstructionCost MaskedGatherCost = |
| 6244 | TTI.getGatherScatterOpCost( |
| 6245 | Opcode: Instruction::Load, DataTy: VecTy, Ptr: cast<LoadInst>(Val: VL0)->getPointerOperand(), |
| 6246 | /*VariableMask=*/false, Alignment: CommonAlignment, CostKind) + |
| 6247 | (ProfitableGatherPointers ? 0 : VectorGEPCost); |
| 6248 | InstructionCost GatherCost = |
| 6249 | getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts, |
| 6250 | /*Insert=*/true, |
| 6251 | /*Extract=*/false, CostKind) + |
| 6252 | ScalarLoadsCost; |
| 6253 | // The list of loads is small, or we already performed a partial check - |
| 6254 | // directly compare the masked gather cost and the gather cost. |
| 6255 | constexpr unsigned ListLimit = 4; |
| 6256 | if (!TryRecursiveCheck || VL.size() < ListLimit) |
| 6257 | return MaskedGatherCost - GatherCost >= -SLPCostThreshold; |
| 6258 | |
| 6259 | // FIXME: The following code has not been updated for non-power-of-2 |
| 6260 | // vectors (and non-whole-register vectors). The splitting logic here does |
| 6261 | // not cover the original vector if the vector factor is not a power of two. |
| 6262 | if (!hasFullVectorsOrPowerOf2(TTI, Ty: ScalarTy, Sz: VL.size())) |
| 6263 | return false; |
| 6264 | |
| 6265 | unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy); |
| 6266 | unsigned MinVF = getMinVF(Sz: 2 * Sz); |
| 6267 | DemandedElts.clearAllBits(); |
| 6268 | // Iterate through possible vectorization factors and check if vectorized + |
| 6269 | // shuffles is better than just gather. |
| 6270 | for (unsigned VF = |
| 6271 | getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VL.size() - 1); |
| 6272 | VF >= MinVF; |
| 6273 | VF = getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VF - 1)) { |
| 6274 | SmallVector<LoadsState> States; |
| 6275 | for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) { |
| 6276 | ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF); |
| 6277 | SmallVector<unsigned> Order; |
| 6278 | SmallVector<Value *> PointerOps; |
| 6279 | LoadsState LS = |
| 6280 | canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps, BestVF, |
| 6281 | /*TryRecursiveCheck=*/false); |
| 6282 | // Check that the sorted loads are consecutive. |
| 6283 | if (LS == LoadsState::Gather) { |
| 6284 | if (BestVF) { |
| 6285 | DemandedElts.setAllBits(); |
| 6286 | break; |
| 6287 | } |
| 6288 | DemandedElts.setBits(loBit: Cnt, hiBit: Cnt + VF); |
| 6289 | continue; |
| 6290 | } |
| 6291 | // If reordering is needed - consider it a high-cost masked gather for now. |
| 6292 | if ((LS == LoadsState::Vectorize || |
| 6293 | LS == LoadsState::StridedVectorize || |
| 6294 | LS == LoadsState::CompressVectorize) && |
| 6295 | !Order.empty() && !isReverseOrder(Order)) |
| 6296 | LS = LoadsState::ScatterVectorize; |
| 6297 | States.push_back(Elt: LS); |
| 6298 | } |
| 6299 | if (DemandedElts.isAllOnes()) |
| 6300 | // All loads gathered - try smaller VF. |
| 6301 | continue; |
| 6302 | // Can be vectorized later as a series of loads/insertelements. |
| 6303 | InstructionCost VecLdCost = 0; |
| 6304 | if (!DemandedElts.isZero()) { |
| 6305 | VecLdCost = getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts, |
| 6306 | /*Insert=*/true, |
| 6307 | /*Extract=*/false, CostKind) + |
| 6308 | ScalarGEPCost; |
| 6309 | for (unsigned Idx : seq<unsigned>(Size: VL.size())) |
| 6310 | if (DemandedElts[Idx]) |
| 6311 | VecLdCost += |
| 6312 | TTI.getInstructionCost(U: cast<Instruction>(Val: VL[Idx]), CostKind); |
| 6313 | } |
| 6314 | auto *SubVecTy = getWidenedType(ScalarTy, VF); |
| 6315 | for (auto [I, LS] : enumerate(First&: States)) { |
| 6316 | auto *LI0 = cast<LoadInst>(Val: VL[I * VF]); |
| 6317 | InstructionCost VectorGEPCost = |
| 6318 | (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers) |
| 6319 | ? 0 |
| 6320 | : getGEPCosts(TTI, Ptrs: ArrayRef(PointerOps).slice(N: I * VF, M: VF), |
| 6321 | BasePtr: LI0->getPointerOperand(), |
| 6322 | Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, |
| 6323 | VecTy: SubVecTy) |
| 6324 | .second; |
| 6325 | if (LS == LoadsState::ScatterVectorize) { |
| 6326 | if (static_cast<unsigned>( |
| 6327 | count_if(Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) < |
| 6328 | PointerOps.size() - 1 || |
| 6329 | any_of(Range&: PointerOps, P: [&](Value *V) { |
| 6330 | return getUnderlyingObject(V) != |
| 6331 | getUnderlyingObject(V: PointerOps.front()); |
| 6332 | })) |
| 6333 | VectorGEPCost += getScalarizationOverhead( |
| 6334 | TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getAllOnes(numBits: VF), |
| 6335 | /*Insert=*/true, /*Extract=*/false, CostKind); |
| 6336 | else |
| 6337 | VectorGEPCost += |
| 6338 | getScalarizationOverhead( |
| 6339 | TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getOneBitSet(numBits: VF, BitNo: 0), |
| 6340 | /*Insert=*/true, /*Extract=*/false, CostKind) + |
| 6341 | ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: SubVecTy, Mask: {}, |
| 6342 | CostKind); |
| 6343 | } |
| 6344 | switch (LS) { |
| 6345 | case LoadsState::Vectorize: |
| 6346 | VecLdCost += |
| 6347 | TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: SubVecTy, Alignment: LI0->getAlign(), |
| 6348 | AddressSpace: LI0->getPointerAddressSpace(), CostKind, |
| 6349 | OpdInfo: TTI::OperandValueInfo()) + |
| 6350 | VectorGEPCost; |
| 6351 | break; |
| 6352 | case LoadsState::StridedVectorize: |
| 6353 | VecLdCost += TTI.getStridedMemoryOpCost(Opcode: Instruction::Load, DataTy: SubVecTy, |
| 6354 | Ptr: LI0->getPointerOperand(), |
| 6355 | /*VariableMask=*/false, |
| 6356 | Alignment: CommonAlignment, CostKind) + |
| 6357 | VectorGEPCost; |
| 6358 | break; |
| 6359 | case LoadsState::CompressVectorize: |
| 6360 | VecLdCost += TTI.getMaskedMemoryOpCost( |
| 6361 | Opcode: Instruction::Load, Src: SubVecTy, Alignment: CommonAlignment, |
| 6362 | AddressSpace: LI0->getPointerAddressSpace(), CostKind) + |
| 6363 | VectorGEPCost + |
| 6364 | ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: SubVecTy, |
| 6365 | Mask: {}, CostKind); |
| 6366 | break; |
| 6367 | case LoadsState::ScatterVectorize: |
| 6368 | VecLdCost += TTI.getGatherScatterOpCost(Opcode: Instruction::Load, DataTy: SubVecTy, |
| 6369 | Ptr: LI0->getPointerOperand(), |
| 6370 | /*VariableMask=*/false, |
| 6371 | Alignment: CommonAlignment, CostKind) + |
| 6372 | VectorGEPCost; |
| 6373 | break; |
| 6374 | case LoadsState::Gather: |
| 6375 | // Gathers are already calculated - ignore. |
| 6376 | continue; |
| 6377 | } |
| 6378 | SmallVector<int> ShuffleMask(VL.size()); |
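| | // E.g., with VL.size() == 8, VF == 4 and I == 1 the loop below builds the |
| | // insert-subvector mask {0, 1, 2, 3, 8, 9, 10, 11}. |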
| 6379 | for (int Idx : seq<int>(Begin: 0, End: VL.size())) |
| 6380 | ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx; |
| 6381 | if (I > 0) |
| 6382 | VecLdCost += |
| 6383 | ::getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: ShuffleMask, |
| 6384 | CostKind, Index: I * VF, SubTp: SubVecTy); |
| 6385 | } |
| 6386 | // If the masked gather cost is higher, it is better to vectorize, so |
| 6387 | // consider this as a gather node. It will be better estimated |
| 6388 | // later. |
| 6389 | if (MaskedGatherCost >= VecLdCost && |
| 6390 | VecLdCost - GatherCost < -SLPCostThreshold) { |
| 6391 | if (BestVF) |
| 6392 | *BestVF = VF; |
| 6393 | return true; |
| 6394 | } |
| 6395 | } |
| 6396 | return MaskedGatherCost - GatherCost >= -SLPCostThreshold; |
| 6397 | }; |
| 6398 | // TODO: need to improve analysis of the pointers; if not all of them are |
| 6399 | // GEPs, or they have > 2 operands, we end up with a gather node, which just |
| 6400 | // increases the cost. |
| 6401 | Loop *L = LI->getLoopFor(BB: cast<LoadInst>(Val: VL0)->getParent()); |
| 6402 | bool ProfitableGatherPointers = |
| 6403 | L && Sz > 2 && static_cast<unsigned>(count_if(Range&: PointerOps, P: [L](Value *V) { |
| 6404 | return L->isLoopInvariant(V); |
| 6405 | })) <= Sz / 2; |
| 6406 | if (ProfitableGatherPointers || all_of(Range&: PointerOps, P: [](Value *P) { |
| 6407 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: P); |
| 6408 | return (!GEP && doesNotNeedToBeScheduled(V: P)) || |
| 6409 | (GEP && GEP->getNumOperands() == 2 && |
| 6410 | isa<Constant, Instruction>(Val: GEP->getOperand(i_nocapture: 1))); |
| 6411 | })) { |
| 6412 | // Check if the potential masked gather can be represented as a series |
| 6413 | // of loads + insertsubvectors. |
| 6414 | // If the masked gather cost is higher, it is better to vectorize, so |
| 6415 | // consider this as a gather node. It will be better estimated |
| 6416 | // later. |
| 6417 | if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF, |
| 6418 | ProfitableGatherPointers)) |
| 6419 | return LoadsState::ScatterVectorize; |
| 6420 | } |
| 6421 | |
| 6422 | return LoadsState::Gather; |
| 6423 | } |
| 6424 | |
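| | /// Groups the pointer operands in \p VL by basic block and underlying base |
| | /// object, sorts each group by constant offset and checks that the sorted |
| | /// offsets within each cluster are consecutive. On success, fills |
| | /// \p SortedIndices with the order that places related pointers next to one |
| | /// another; returns false if the pointers do not cluster usefully. |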
| 6425 | static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, |
| 6426 | ArrayRef<BasicBlock *> BBs, Type *ElemTy, |
| 6427 | const DataLayout &DL, ScalarEvolution &SE, |
| 6428 | SmallVectorImpl<unsigned> &SortedIndices) { |
| 6429 | assert( |
| 6430 | all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) && |
| 6431 | "Expected list of pointer operands." ); |
| 6432 | // Map each base to vectors of (Ptr, Offset, OrigIdx) tuples: insert each Ptr |
| 6433 | // into the cluster for its base, sort the clusters, and return the sorted |
| 6434 | // indices with related values next to one another. |
| 6435 | SmallMapVector< |
| 6436 | std::pair<BasicBlock *, Value *>, |
| 6437 | SmallVector<SmallVector<std::tuple<Value *, int64_t, unsigned>>>, 8> |
| 6438 | Bases; |
| 6439 | Bases |
| 6440 | .try_emplace(Key: std::make_pair( |
| 6441 | x: BBs.front(), y: getUnderlyingObject(V: VL.front(), MaxLookup: RecursionMaxDepth))) |
| 6442 | .first->second.emplace_back().emplace_back(Args: VL.front(), Args: 0U, Args: 0U); |
| 6443 | |
| 6444 | SortedIndices.clear(); |
| 6445 | for (auto [Cnt, Ptr] : enumerate(First: VL.drop_front())) { |
| 6446 | auto Key = std::make_pair(x: BBs[Cnt + 1], |
| 6447 | y: getUnderlyingObject(V: Ptr, MaxLookup: RecursionMaxDepth)); |
| 6448 | bool Found = any_of(Range&: Bases.try_emplace(Key).first->second, |
| 6449 | P: [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) { |
| 6450 | std::optional<int64_t> Diff = |
| 6451 | getPointersDiff(ElemTy, std::get<0>(Base.front()), |
| 6452 | ElemTy, Ptr, DL, SE, |
| 6453 | /*StrictCheck=*/true); |
| 6454 | if (!Diff) |
| 6455 | return false; |
| 6456 | |
| 6457 | Base.emplace_back(Ptr, *Diff, Cnt + 1); |
| 6458 | return true; |
| 6459 | }); |
| 6460 | |
| 6461 | if (!Found) { |
| 6462 | // If we haven't found enough to usefully cluster, return early. |
| 6463 | if (Bases.size() > VL.size() / 2 - 1) |
| 6464 | return false; |
| 6465 | |
| 6466 | // Not found already - add a new Base |
| 6467 | Bases.find(Key)->second.emplace_back().emplace_back(Args: Ptr, Args: 0, Args: Cnt + 1); |
| 6468 | } |
| 6469 | } |
| 6470 | |
| 6471 | if (Bases.size() == VL.size()) |
| 6472 | return false; |
| 6473 | |
| 6474 | if (Bases.size() == 1 && (Bases.front().second.size() == 1 || |
| 6475 | Bases.front().second.size() == VL.size())) |
| 6476 | return false; |
| 6477 | |
| 6478 | // For each of the bases, sort the pointers by Offset and check if any of |
| 6479 | // the bases become consecutive. |
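| | // Heuristically order two pointers by walking their underlying-object chains |
| | // in lockstep: Ptr1 sorts first when Ptr2's chain reaches a value already |
| | // seen on Ptr1's chain and not vice versa. |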
| 6480 | auto ComparePointers = [](Value *Ptr1, Value *Ptr2) { |
| 6481 | SmallPtrSet<Value *, 13> FirstPointers; |
| 6482 | SmallPtrSet<Value *, 13> SecondPointers; |
| 6483 | Value *P1 = Ptr1; |
| 6484 | Value *P2 = Ptr2; |
| 6485 | unsigned Depth = 0; |
| 6486 | while (!FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1)) { |
| 6487 | if (P1 == P2 || Depth > RecursionMaxDepth) |
| 6488 | return false; |
| 6489 | FirstPointers.insert(Ptr: P1); |
| 6490 | SecondPointers.insert(Ptr: P2); |
| 6491 | P1 = getUnderlyingObject(V: P1, /*MaxLookup=*/1); |
| 6492 | P2 = getUnderlyingObject(V: P2, /*MaxLookup=*/1); |
| 6493 | ++Depth; |
| 6494 | } |
| 6495 | assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) && |
| 6496 | "Unable to find matching root." ); |
| 6497 | return FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1); |
| 6498 | }; |
| 6499 | for (auto &Base : Bases) { |
| 6500 | for (auto &Vec : Base.second) { |
| 6501 | if (Vec.size() > 1) { |
| 6502 | stable_sort(Range&: Vec, C: llvm::less_second()); |
| 6503 | int64_t InitialOffset = std::get<1>(t&: Vec[0]); |
| 6504 | bool AnyConsecutive = |
| 6505 | all_of(Range: enumerate(First&: Vec), P: [InitialOffset](const auto &P) { |
| 6506 | return std::get<1>(P.value()) == |
| 6507 | int64_t(P.index()) + InitialOffset; |
| 6508 | }); |
| 6509 | // Fill the SortedIndices array only if it looks worthwhile to sort the |
| 6510 | // pointers. |
| 6511 | if (!AnyConsecutive) |
| 6512 | return false; |
| 6513 | } |
| 6514 | } |
| 6515 | stable_sort(Range&: Base.second, C: [&](const auto &V1, const auto &V2) { |
| 6516 | return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front())); |
| 6517 | }); |
| 6518 | } |
| 6519 | |
| 6520 | for (auto &T : Bases) |
| 6521 | for (const auto &Vec : T.second) |
| 6522 | for (const auto &P : Vec) |
| 6523 | SortedIndices.push_back(Elt: std::get<2>(t: P)); |
| 6524 | |
| 6525 | assert(SortedIndices.size() == VL.size() && |
| 6526 | "Expected SortedIndices to be the size of VL" ); |
| 6527 | return true; |
| 6528 | } |
| 6529 | |
| 6530 | std::optional<BoUpSLP::OrdersType> |
| 6531 | BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) { |
| 6532 | assert(TE.isGather() && "Expected gather node only." ); |
| 6533 | Type *ScalarTy = TE.Scalars[0]->getType(); |
| 6534 | |
| 6535 | SmallVector<Value *> Ptrs; |
| 6536 | Ptrs.reserve(N: TE.Scalars.size()); |
| 6537 | SmallVector<BasicBlock *> BBs; |
| 6538 | BBs.reserve(N: TE.Scalars.size()); |
| 6539 | for (Value *V : TE.Scalars) { |
| 6540 | auto *L = dyn_cast<LoadInst>(Val: V); |
| 6541 | if (!L || !L->isSimple()) |
| 6542 | return std::nullopt; |
| 6543 | Ptrs.push_back(Elt: L->getPointerOperand()); |
| 6544 | BBs.push_back(Elt: L->getParent()); |
| 6545 | } |
| 6546 | |
| 6547 | BoUpSLP::OrdersType Order; |
| 6548 | if (!LoadEntriesToVectorize.contains(key: TE.Idx) && |
| 6549 | clusterSortPtrAccesses(VL: Ptrs, BBs, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order)) |
| 6550 | return std::move(Order); |
| 6551 | return std::nullopt; |
| 6552 | } |
| 6553 | |
| 6554 | /// Check if two insertelement instructions are from the same buildvector. |
| 6555 | static bool areTwoInsertFromSameBuildVector( |
| 6556 | InsertElementInst *VU, InsertElementInst *V, |
| 6557 | function_ref<Value *(InsertElementInst *)> GetBaseOperand) { |
| 6558 | // Instructions must be from the same basic block. |
| 6559 | if (VU->getParent() != V->getParent()) |
| 6560 | return false; |
| 6561 | // Checks if 2 insertelements are from the same buildvector. |
| 6562 | if (VU->getType() != V->getType()) |
| 6563 | return false; |
| 6564 | // Inserts with multiple uses are separate nodes. |
| 6565 | if (!VU->hasOneUse() && !V->hasOneUse()) |
| 6566 | return false; |
| 6567 | auto *IE1 = VU; |
| 6568 | auto *IE2 = V; |
| 6569 | std::optional<unsigned> Idx1 = getElementIndex(Inst: IE1); |
| 6570 | std::optional<unsigned> Idx2 = getElementIndex(Inst: IE2); |
| 6571 | if (Idx1 == std::nullopt || Idx2 == std::nullopt) |
| 6572 | return false; |
| 6573 | // Go through the vector operand of insertelement instructions trying to find |
| 6574 | // either VU as the original vector for IE2 or V as the original vector for |
| 6575 | // IE1. |
| 6576 | SmallBitVector ReusedIdx( |
| 6577 | cast<VectorType>(Val: VU->getType())->getElementCount().getKnownMinValue()); |
| 6578 | bool IsReusedIdx = false; |
| 6579 | do { |
| 6580 | if (IE2 == VU && !IE1) |
| 6581 | return VU->hasOneUse(); |
| 6582 | if (IE1 == V && !IE2) |
| 6583 | return V->hasOneUse(); |
| 6584 | if (IE1 && IE1 != V) { |
| 6585 | unsigned Idx1 = getElementIndex(Inst: IE1).value_or(u&: *Idx2); |
| 6586 | IsReusedIdx |= ReusedIdx.test(Idx: Idx1); |
| 6587 | ReusedIdx.set(Idx1); |
| 6588 | if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx) |
| 6589 | IE1 = nullptr; |
| 6590 | else |
| 6591 | IE1 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE1)); |
| 6592 | } |
| 6593 | if (IE2 && IE2 != VU) { |
| 6594 | unsigned Idx2 = getElementIndex(Inst: IE2).value_or(u&: *Idx1); |
| 6595 | IsReusedIdx |= ReusedIdx.test(Idx: Idx2); |
| 6596 | ReusedIdx.set(Idx2); |
| 6597 | if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx) |
| 6598 | IE2 = nullptr; |
| 6599 | else |
| 6600 | IE2 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE2)); |
| 6601 | } |
| 6602 | } while (!IsReusedIdx && (IE1 || IE2)); |
| 6603 | return false; |
| 6604 | } |
| 6605 | |
| 6606 | /// Checks if the specified instruction \p I is an alternate operation for |
| 6607 | /// the given \p MainOp and \p AltOp instructions. |
| 6608 | static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, |
| 6609 | Instruction *AltOp, |
| 6610 | const TargetLibraryInfo &TLI); |
| 6611 | |
| 6612 | std::optional<BoUpSLP::OrdersType> |
| 6613 | BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, |
| 6614 | bool IgnoreReorder) { |
| 6615 | // No need to reorder if we have to shuffle reuses anyway - the node still |
| 6616 | // needs to be shuffled. |
| 6617 | if (!TE.ReuseShuffleIndices.empty()) { |
| 6618 | // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors. |
| 6619 | assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) && |
| 6620 | "Reshuffling scalars not yet supported for nodes with padding" ); |
| 6621 | |
| 6622 | if (isSplat(VL: TE.Scalars)) |
| 6623 | return std::nullopt; |
| 6624 | // Check if reuse shuffle indices can be improved by reordering. |
| 6625 | // For this, check that reuse mask is "clustered", i.e. each scalar values |
| 6626 | // is used once in each submask of size <number_of_scalars>. |
| 6627 | // Example: 4 scalar values. |
| 6628 | // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered. |
| 6629 | // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because |
| 6630 | // element 3 is used twice in the second submask. |
| 6631 | unsigned Sz = TE.Scalars.size(); |
| 6632 | if (TE.isGather()) { |
| 6633 | if (std::optional<OrdersType> CurrentOrder = |
| 6634 | findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) { |
| 6635 | SmallVector<int> Mask; |
| 6636 | fixupOrderingIndices(Order: *CurrentOrder); |
| 6637 | inversePermutation(Indices: *CurrentOrder, Mask); |
| 6638 | ::addMask(Mask, SubMask: TE.ReuseShuffleIndices); |
| 6639 | OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor()); |
| 6640 | unsigned Sz = TE.Scalars.size(); |
| 6641 | for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) { |
| 6642 | for (auto [I, Idx] : enumerate(First: ArrayRef(Mask).slice(N: K * Sz, M: Sz))) |
| 6643 | if (Idx != PoisonMaskElem) |
| 6644 | Res[Idx + K * Sz] = I + K * Sz; |
| 6645 | } |
| 6646 | return std::move(Res); |
| 6647 | } |
| 6648 | } |
| 6649 | if (Sz == 2 && TE.getVectorFactor() == 4 && |
| 6650 | ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: TE.Scalars.front()->getType(), |
| 6651 | VF: 2 * TE.getVectorFactor())) == 1) |
| 6652 | return std::nullopt; |
| 6653 | if (TE.ReuseShuffleIndices.size() % Sz != 0) |
| 6654 | return std::nullopt; |
| 6655 | if (!ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices, |
| 6656 | VF: Sz)) { |
| 6657 | SmallVector<int> ReorderMask(Sz, PoisonMaskElem); |
| 6658 | if (TE.ReorderIndices.empty()) |
| 6659 | std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0); |
| 6660 | else |
| 6661 | inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask); |
| 6662 | ::addMask(Mask&: ReorderMask, SubMask: TE.ReuseShuffleIndices); |
| 6663 | unsigned VF = ReorderMask.size(); |
| 6664 | OrdersType ResOrder(VF, VF); |
| 6665 | unsigned NumParts = divideCeil(Numerator: VF, Denominator: Sz); |
| 6666 | SmallBitVector UsedVals(NumParts); |
| 6667 | for (unsigned I = 0; I < VF; I += Sz) { |
| 6668 | int Val = PoisonMaskElem; |
| 6669 | unsigned UndefCnt = 0; |
| 6670 | unsigned Limit = std::min(a: Sz, b: VF - I); |
| 6671 | if (any_of(Range: ArrayRef(ReorderMask).slice(N: I, M: Limit), |
| 6672 | P: [&](int Idx) { |
| 6673 | if (Val == PoisonMaskElem && Idx != PoisonMaskElem) |
| 6674 | Val = Idx; |
| 6675 | if (Idx == PoisonMaskElem) |
| 6676 | ++UndefCnt; |
| 6677 | return Idx != PoisonMaskElem && Idx != Val; |
| 6678 | }) || |
| 6679 | Val >= static_cast<int>(NumParts) || UsedVals.test(Idx: Val) || |
| 6680 | UndefCnt > Sz / 2) |
| 6681 | return std::nullopt; |
| 6682 | UsedVals.set(Val); |
| 6683 | for (unsigned K = 0; K < NumParts; ++K) { |
| 6684 | unsigned Idx = Val + Sz * K; |
| 6685 | if (Idx < VF && I + K < VF) |
| 6686 | ResOrder[Idx] = I + K; |
| 6687 | } |
| 6688 | } |
| 6689 | return std::move(ResOrder); |
| 6690 | } |
| 6691 | unsigned VF = TE.getVectorFactor(); |
| 6692 | // Try build correct order for extractelement instructions. |
| 6693 | SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(), |
| 6694 | TE.ReuseShuffleIndices.end()); |
| 6695 | if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement && |
| 6696 | all_of(Range: TE.Scalars, P: [Sz](Value *V) { |
| 6697 | if (isa<PoisonValue>(Val: V)) |
| 6698 | return true; |
| 6699 | std::optional<unsigned> Idx = getExtractIndex(E: cast<Instruction>(Val: V)); |
| 6700 | return Idx && *Idx < Sz; |
| 6701 | })) { |
| 6702 | assert(!TE.isAltShuffle() && "Alternate instructions are only supported " |
| 6703 | "by BinaryOperator and CastInst." ); |
| 6704 | SmallVector<int> ReorderMask(Sz, PoisonMaskElem); |
| 6705 | if (TE.ReorderIndices.empty()) |
| 6706 | std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0); |
| 6707 | else |
| 6708 | inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask); |
| 6709 | for (unsigned I = 0; I < VF; ++I) { |
| 6710 | int &Idx = ReusedMask[I]; |
| 6711 | if (Idx == PoisonMaskElem) |
| 6712 | continue; |
| 6713 | Value *V = TE.Scalars[ReorderMask[Idx]]; |
| 6714 | std::optional<unsigned> EI = getExtractIndex(E: cast<Instruction>(Val: V)); |
| 6715 | Idx = std::distance(first: ReorderMask.begin(), last: find(Range&: ReorderMask, Val: *EI)); |
| 6716 | } |
| 6717 | } |
| 6718 | // Build the order of VF size; the reuse shuffles need to be reordered, as |
| 6719 | // they are always of VF size. |
| 6720 | OrdersType ResOrder(VF); |
| 6721 | std::iota(first: ResOrder.begin(), last: ResOrder.end(), value: 0); |
| 6722 | auto *It = ResOrder.begin(); |
| 6723 | for (unsigned K = 0; K < VF; K += Sz) { |
| 6724 | OrdersType CurrentOrder(TE.ReorderIndices); |
| 6725 | SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(N: K, M: Sz)}; |
| 6726 | if (SubMask.front() == PoisonMaskElem) |
| 6727 | std::iota(first: SubMask.begin(), last: SubMask.end(), value: 0); |
| 6728 | reorderOrder(Order&: CurrentOrder, Mask: SubMask); |
| 6729 | transform(Range&: CurrentOrder, d_first: It, F: [K](unsigned Pos) { return Pos + K; }); |
| 6730 | std::advance(i&: It, n: Sz); |
| 6731 | } |
| 6732 | if (TE.isGather() && all_of(Range: enumerate(First&: ResOrder), P: [](const auto &Data) { |
| 6733 | return Data.index() == Data.value(); |
| 6734 | })) |
| 6735 | return std::nullopt; // No need to reorder. |
| 6736 | return std::move(ResOrder); |
| 6737 | } |
| 6738 | if (TE.State == TreeEntry::StridedVectorize && !TopToBottom && |
| 6739 | (!TE.UserTreeIndex || |
| 6740 | !Instruction::isBinaryOp(Opcode: TE.UserTreeIndex.UserTE->getOpcode())) && |
| 6741 | (TE.ReorderIndices.empty() || isReverseOrder(Order: TE.ReorderIndices))) |
| 6742 | return std::nullopt; |
| 6743 | if (TE.State == TreeEntry::SplitVectorize || |
| 6744 | ((TE.State == TreeEntry::Vectorize || |
| 6745 | TE.State == TreeEntry::StridedVectorize || |
| 6746 | TE.State == TreeEntry::CompressVectorize) && |
| 6747 | (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: TE.getMainOp()) || |
| 6748 | (TopToBottom && isa<StoreInst, InsertElementInst>(Val: TE.getMainOp()))))) { |
| 6749 | assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) && |
| 6750 | "Alternate instructions are only supported by " |
| 6751 | "BinaryOperator and CastInst." ); |
| 6752 | return TE.ReorderIndices; |
| 6753 | } |
| 6754 | if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize && |
| 6755 | TE.isAltShuffle()) { |
| 6756 | assert(TE.ReuseShuffleIndices.empty() && |
| 6757 | "ReuseShuffleIndices should be " |
| 6758 | "empty for alternate instructions." ); |
| 6759 | SmallVector<int> Mask; |
| 6760 | TE.buildAltOpShuffleMask( |
| 6761 | IsAltOp: [&](Instruction *I) { |
| 6762 | assert(TE.getMatchingMainOpOrAltOp(I) && |
| 6763 | "Unexpected main/alternate opcode" ); |
| 6764 | return isAlternateInstruction(I, MainOp: TE.getMainOp(), AltOp: TE.getAltOp(), TLI: *TLI); |
| 6765 | }, |
| 6766 | Mask); |
| 6767 | const int VF = TE.getVectorFactor(); |
| 6768 | OrdersType ResOrder(VF, VF); |
| 6769 | for (unsigned I : seq<unsigned>(Size: VF)) { |
| 6770 | if (Mask[I] == PoisonMaskElem) |
| 6771 | continue; |
| 6772 | ResOrder[Mask[I] % VF] = I; |
| 6773 | } |
| 6774 | return std::move(ResOrder); |
| 6775 | } |
| 6776 | if (!TE.ReorderIndices.empty()) |
| 6777 | return TE.ReorderIndices; |
| 6778 | if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) { |
| 6779 | if (!TE.ReorderIndices.empty()) |
| 6780 | return TE.ReorderIndices; |
| 6781 | |
| 6782 | SmallVector<Instruction *> UserBVHead(TE.Scalars.size()); |
| 6783 | for (auto [I, V] : zip(t&: UserBVHead, u: TE.Scalars)) { |
| 6784 | if (isa<Constant>(Val: V) || !V->hasNUsesOrMore(N: 1)) |
| 6785 | continue; |
| 6786 | auto *II = dyn_cast<InsertElementInst>(Val: *V->user_begin()); |
| 6787 | if (!II) |
| 6788 | continue; |
| 6789 | Instruction *BVHead = nullptr; |
| 6790 | BasicBlock *BB = II->getParent(); |
| 6791 | while (II && II->hasOneUse() && II->getParent() == BB) { |
| 6792 | BVHead = II; |
| 6793 | II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0)); |
| 6794 | } |
| 6795 | I = BVHead; |
| 6796 | } |
| 6797 | |
| 6798 | auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) { |
| 6799 | assert(BB1 != BB2 && "Expected different basic blocks." ); |
| 6800 | if (!DT->isReachableFromEntry(A: BB1)) |
| 6801 | return false; |
| 6802 | if (!DT->isReachableFromEntry(A: BB2)) |
| 6803 | return true; |
| 6804 | auto *NodeA = DT->getNode(BB: BB1); |
| 6805 | auto *NodeB = DT->getNode(BB: BB2); |
| 6806 | assert(NodeA && "Should only process reachable instructions" ); |
| 6807 | assert(NodeB && "Should only process reachable instructions" ); |
| 6808 | assert((NodeA == NodeB) == |
| 6809 | (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && |
| 6810 | "Different nodes should have different DFS numbers" ); |
| 6811 | return NodeA->getDFSNumIn() < NodeB->getDFSNumIn(); |
| 6812 | }; |
| 6813 | auto PHICompare = [&](unsigned I1, unsigned I2) { |
| 6814 | Value *V1 = TE.Scalars[I1]; |
| 6815 | Value *V2 = TE.Scalars[I2]; |
| 6816 | if (V1 == V2 || (V1->use_empty() && V2->use_empty())) |
| 6817 | return false; |
| 6818 | if (isa<PoisonValue>(Val: V1)) |
| 6819 | return true; |
| 6820 | if (isa<PoisonValue>(Val: V2)) |
| 6821 | return false; |
| 6822 | if (V1->getNumUses() < V2->getNumUses()) |
| 6823 | return true; |
| 6824 | if (V1->getNumUses() > V2->getNumUses()) |
| 6825 | return false; |
| 6826 | auto *FirstUserOfPhi1 = cast<Instruction>(Val: *V1->user_begin()); |
| 6827 | auto *FirstUserOfPhi2 = cast<Instruction>(Val: *V2->user_begin()); |
| 6828 | if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent()) |
| 6829 | return CompareByBasicBlocks(FirstUserOfPhi1->getParent(), |
| 6830 | FirstUserOfPhi2->getParent()); |
| 6831 | auto *IE1 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi1); |
| 6832 | auto *IE2 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi2); |
| 6833 | auto *EE1 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi1); |
| 6834 | auto *EE2 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi2); |
| 6835 | if (IE1 && !IE2) |
| 6836 | return true; |
| 6837 | if (!IE1 && IE2) |
| 6838 | return false; |
| 6839 | if (IE1 && IE2) { |
| 6840 | if (UserBVHead[I1] && !UserBVHead[I2]) |
| 6841 | return true; |
| 6842 | if (!UserBVHead[I1]) |
| 6843 | return false; |
| 6844 | if (UserBVHead[I1] == UserBVHead[I2]) |
| 6845 | return getElementIndex(Inst: IE1) < getElementIndex(Inst: IE2); |
| 6846 | if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent()) |
| 6847 | return CompareByBasicBlocks(UserBVHead[I1]->getParent(), |
| 6848 | UserBVHead[I2]->getParent()); |
| 6849 | return UserBVHead[I1]->comesBefore(Other: UserBVHead[I2]); |
| 6850 | } |
| 6851 | if (EE1 && !EE2) |
| 6852 | return true; |
| 6853 | if (!EE1 && EE2) |
| 6854 | return false; |
| 6855 | if (EE1 && EE2) { |
| 6856 | auto *Inst1 = dyn_cast<Instruction>(Val: EE1->getOperand(i_nocapture: 0)); |
| 6857 | auto *Inst2 = dyn_cast<Instruction>(Val: EE2->getOperand(i_nocapture: 0)); |
| 6858 | auto *P1 = dyn_cast<Argument>(Val: EE1->getOperand(i_nocapture: 0)); |
| 6859 | auto *P2 = dyn_cast<Argument>(Val: EE2->getOperand(i_nocapture: 0)); |
| 6860 | if (!Inst2 && !P2) |
| 6861 | return Inst1 || P1; |
| 6862 | if (EE1->getOperand(i_nocapture: 0) == EE2->getOperand(i_nocapture: 0)) |
| 6863 | return getElementIndex(Inst: EE1) < getElementIndex(Inst: EE2); |
| 6864 | if (!Inst1 && Inst2) |
| 6865 | return false; |
| 6866 | if (Inst1 && Inst2) { |
| 6867 | if (Inst1->getParent() != Inst2->getParent()) |
| 6868 | return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent()); |
| 6869 | return Inst1->comesBefore(Other: Inst2); |
| 6870 | } |
| 6871 | if (!P1 && P2) |
| 6872 | return false; |
| 6873 | assert(P1 && P2 && |
| 6874 | "Expected either instructions or arguments vector operands." ); |
| 6875 | return P1->getArgNo() < P2->getArgNo(); |
| 6876 | } |
| 6877 | return false; |
| 6878 | }; |
| 6879 | OrdersType Phis(TE.Scalars.size()); |
| 6880 | std::iota(first: Phis.begin(), last: Phis.end(), value: 0); |
| 6881 | stable_sort(Range&: Phis, C: PHICompare); |
| 6882 | if (isIdentityOrder(Order: Phis)) |
| 6883 | return std::nullopt; // No need to reorder. |
| 6884 | return std::move(Phis); |
| 6885 | } |
| 6886 | if (TE.isGather() && |
| 6887 | (!TE.hasState() || !TE.isAltShuffle() || |
| 6888 | ScalarsInSplitNodes.contains(Val: TE.getMainOp())) && |
| 6889 | allSameType(VL: TE.Scalars)) { |
| 6890 | // TODO: add analysis of other gather nodes with extractelement |
| 6891 | // instructions and other values/instructions, not only undefs. |
| 6892 | if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) || |
| 6893 | (all_of(Range: TE.Scalars, P: IsaPred<UndefValue, ExtractElementInst>) && |
| 6894 | any_of(Range: TE.Scalars, P: IsaPred<ExtractElementInst>))) && |
| 6895 | all_of(Range: TE.Scalars, P: [](Value *V) { |
| 6896 | auto *EE = dyn_cast<ExtractElementInst>(Val: V); |
| 6897 | return !EE || isa<FixedVectorType>(Val: EE->getVectorOperandType()); |
| 6898 | })) { |
| 6899 | // Check that gather of extractelements can be represented as |
| 6900 | // just a shuffle of a single vector. |
| 6901 | OrdersType CurrentOrder; |
| 6902 | bool Reuse = |
| 6903 | canReuseExtract(VL: TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true); |
| 6904 | if (Reuse || !CurrentOrder.empty()) |
| 6905 | return std::move(CurrentOrder); |
| 6906 | } |
| 6907 | // If the gather node is <undef, v, .., poison> and |
| 6908 | // insertelement poison, v, 0 [+ permute] |
| 6909 | // is cheaper than |
| 6910 | // insertelement poison, v, n - try to reorder. |
| 6911 | // If rotating the whole graph, exclude the permute cost, since the whole |
| 6912 | // graph might be transformed. |
| 6913 | int Sz = TE.Scalars.size(); |
| 6914 | if (isSplat(VL: TE.Scalars) && !allConstant(VL: TE.Scalars) && |
| 6915 | count_if(Range: TE.Scalars, P: IsaPred<UndefValue>) == Sz - 1) { |
| 6916 | const auto *It = find_if_not(Range: TE.Scalars, P: isConstant); |
| 6917 | if (It == TE.Scalars.begin()) |
| 6918 | return OrdersType(); |
| 6919 | auto *Ty = getWidenedType(ScalarTy: TE.Scalars.front()->getType(), VF: Sz); |
| 6920 | if (It != TE.Scalars.end()) { |
| 6921 | OrdersType Order(Sz, Sz); |
| 6922 | unsigned Idx = std::distance(first: TE.Scalars.begin(), last: It); |
| 6923 | Order[Idx] = 0; |
| 6924 | fixupOrderingIndices(Order); |
| 6925 | SmallVector<int> Mask; |
| 6926 | inversePermutation(Indices: Order, Mask); |
| 6927 | InstructionCost PermuteCost = |
| 6928 | TopToBottom |
| 6929 | ? 0 |
| 6930 | : ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: Ty, Mask); |
| 6931 | InstructionCost InsertFirstCost = TTI->getVectorInstrCost( |
| 6932 | Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: 0, |
| 6933 | Op0: PoisonValue::get(T: Ty), Op1: *It); |
| 6934 | InstructionCost InsertIdxCost = TTI->getVectorInstrCost( |
| 6935 | Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: Idx, |
| 6936 | Op0: PoisonValue::get(T: Ty), Op1: *It); |
| 6937 | if (InsertFirstCost + PermuteCost < InsertIdxCost) { |
| 6938 | OrdersType Order(Sz, Sz); |
| 6939 | Order[Idx] = 0; |
| 6940 | return std::move(Order); |
| 6941 | } |
| 6942 | } |
| 6943 | } |
| 6944 | if (isSplat(VL: TE.Scalars)) |
| 6945 | return std::nullopt; |
| 6946 | if (TE.Scalars.size() >= 3) |
| 6947 | if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE)) |
| 6948 | return Order; |
| 6949 | // Check if we can include the order of vectorized loads. For masked gathers |
| 6950 | // do extra analysis later, so include such nodes into a special list. |
| 6951 | if (TE.hasState() && TE.getOpcode() == Instruction::Load) { |
| 6952 | SmallVector<Value *> PointerOps; |
| 6953 | OrdersType CurrentOrder; |
| 6954 | LoadsState Res = canVectorizeLoads(VL: TE.Scalars, VL0: TE.Scalars.front(), |
| 6955 | Order&: CurrentOrder, PointerOps); |
| 6956 | if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize || |
| 6957 | Res == LoadsState::CompressVectorize) |
| 6958 | return std::move(CurrentOrder); |
| 6959 | } |
| 6960 | // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars |
| 6961 | // has been audited for correctness with non-power-of-two vectors. |
| 6962 | if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI)) |
| 6963 | if (std::optional<OrdersType> CurrentOrder = |
| 6964 | findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) |
| 6965 | return CurrentOrder; |
| 6966 | } |
| 6967 | return std::nullopt; |
| 6968 | } |
| 6969 | |
| 6970 | /// Checks if the given mask is a "clustered" mask with the same clusters of |
| 6971 | /// size \p Sz, which are not identity submasks. |
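| | /// E.g., with \p Sz == 4 the mask {1, 0, 3, 2, 1, 0, 3, 2} is a repeated |
| | /// non-identity clustered mask, while {0, 1, 2, 3, 0, 1, 2, 3} is not (its |
| | /// first cluster is an identity submask). |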
| 6972 | static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask, |
| 6973 | unsigned Sz) { |
| 6974 | ArrayRef<int> FirstCluster = Mask.slice(N: 0, M: Sz); |
| 6975 | if (ShuffleVectorInst::isIdentityMask(Mask: FirstCluster, NumSrcElts: Sz)) |
| 6976 | return false; |
| 6977 | for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) { |
| 6978 | ArrayRef<int> Cluster = Mask.slice(N: I, M: Sz); |
| 6979 | if (Cluster != FirstCluster) |
| 6980 | return false; |
| 6981 | } |
| 6982 | return true; |
| 6983 | } |
| 6984 | |
| 6985 | void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const { |
| 6986 | // Reorder reuses mask. |
| 6987 | reorderReuses(Reuses&: TE.ReuseShuffleIndices, Mask); |
| 6988 | const unsigned Sz = TE.Scalars.size(); |
| 6989 | // For vectorized nodes and non-clustered reuses no need to do anything else. |
| 6990 | if (!TE.isGather() || |
| 6991 | !ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices, |
| 6992 | VF: Sz) || |
| 6993 | !isRepeatedNonIdentityClusteredMask(Mask: TE.ReuseShuffleIndices, Sz)) |
| 6994 | return; |
| 6995 | SmallVector<int> NewMask; |
| 6996 | inversePermutation(Indices: TE.ReorderIndices, Mask&: NewMask); |
| 6997 | addMask(Mask&: NewMask, SubMask: TE.ReuseShuffleIndices); |
| 6998 | // Clear reorder since it is going to be applied to the new mask. |
| 6999 | TE.ReorderIndices.clear(); |
| 7000 | // Try to improve gathered nodes with clustered reuses, if possible. |
| 7001 | ArrayRef<int> Slice = ArrayRef(NewMask).slice(N: 0, M: Sz); |
| 7002 | SmallVector<unsigned> NewOrder(Slice); |
| 7003 | inversePermutation(Indices: NewOrder, Mask&: NewMask); |
| 7004 | reorderScalars(Scalars&: TE.Scalars, Mask: NewMask); |
| 7005 | // Fill the reuses mask with the identity submasks. |
| 7006 | for (auto *It = TE.ReuseShuffleIndices.begin(), |
| 7007 | *End = TE.ReuseShuffleIndices.end(); |
| 7008 | It != End; std::advance(i&: It, n: Sz)) |
| 7009 | std::iota(first: It, last: std::next(x: It, n: Sz), value: 0); |
| 7010 | } |
| 7011 | |
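| | /// Combines \p Order with \p SecondaryOrder: entries in \p Order equal to its |
| | /// size are treated as unset and are filled, when the candidate value is not |
| | /// already used, either with their own index (if \p SecondaryOrder is empty) |
| | /// or with the corresponding \p SecondaryOrder entry. E.g., combining |
| | /// {2, 4, 0, 4} with {2, 1, 0, 3} yields {2, 1, 0, 3}. |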
| 7012 | static void combineOrders(MutableArrayRef<unsigned> Order, |
| 7013 | ArrayRef<unsigned> SecondaryOrder) { |
| 7014 | assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) && |
| 7015 | "Expected same size of orders" ); |
| 7016 | size_t Sz = Order.size(); |
| 7017 | SmallBitVector UsedIndices(Sz); |
| 7018 | for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz)) { |
| 7019 | if (Order[Idx] != Sz) |
| 7020 | UsedIndices.set(Order[Idx]); |
| 7021 | } |
| 7022 | if (SecondaryOrder.empty()) { |
| 7023 | for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz)) |
| 7024 | if (Order[Idx] == Sz && !UsedIndices.test(Idx)) |
| 7025 | Order[Idx] = Idx; |
| 7026 | } else { |
| 7027 | for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz)) |
| 7028 | if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz && |
| 7029 | !UsedIndices.test(Idx: SecondaryOrder[Idx])) |
| 7030 | Order[Idx] = SecondaryOrder[Idx]; |
| 7031 | } |
| 7032 | } |
| 7033 | |
| 7034 | bool BoUpSLP::isProfitableToReorder() const { |
| 7035 | constexpr unsigned TinyVF = 2; |
| 7036 | constexpr unsigned TinyTree = 10; |
| 7037 | constexpr unsigned PhiOpsLimit = 12; |
| 7038 | constexpr unsigned GatherLoadsLimit = 2; |
| 7039 | if (VectorizableTree.size() <= TinyTree) |
| 7040 | return true; |
| 7041 | if (VectorizableTree.front()->hasState() && |
| 7042 | !VectorizableTree.front()->isGather() && |
| 7043 | (VectorizableTree.front()->getOpcode() == Instruction::Store || |
| 7044 | VectorizableTree.front()->getOpcode() == Instruction::PHI || |
| 7045 | (VectorizableTree.front()->getVectorFactor() <= TinyVF && |
| 7046 | (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt || |
| 7047 | VectorizableTree.front()->getOpcode() == Instruction::ICmp))) && |
| 7048 | VectorizableTree.front()->ReorderIndices.empty()) { |
| 7049 | // Check if the tree has only a single store and a single (unordered) load |
| 7050 | // node, and the other nodes are phis or geps/binops combined with phis, |
| 7051 | // and/or a single gather load node. |
| 7052 | bool HasPhis = false; |
| 7053 | if (VectorizableTree.front()->getOpcode() == Instruction::PHI && |
| 7054 | VectorizableTree.front()->Scalars.size() == TinyVF && |
| 7055 | VectorizableTree.front()->getNumOperands() > PhiOpsLimit) |
| 7056 | return false; |
| 7057 | bool HasLoad = true; |
| 7058 | unsigned GatherLoads = 0; |
| 7059 | for (const std::unique_ptr<TreeEntry> &TE : |
| 7060 | ArrayRef(VectorizableTree).drop_front()) { |
| 7061 | if (!TE->hasState()) { |
| 7062 | if (all_of(Range&: TE->Scalars, P: IsaPred<Constant, PHINode>) || |
| 7063 | all_of(Range&: TE->Scalars, P: IsaPred<BinaryOperator, PHINode>)) |
| 7064 | continue; |
| 7065 | if (VectorizableTree.front()->Scalars.size() == TinyVF && |
| 7066 | any_of(Range&: TE->Scalars, P: IsaPred<PHINode, GEPOperator>)) |
| 7067 | continue; |
| 7068 | return true; |
| 7069 | } |
| 7070 | if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) { |
| 7071 | if (!TE->isGather()) { |
| 7072 | HasLoad = false; |
| 7073 | continue; |
| 7074 | } |
| 7075 | if (HasLoad) |
| 7076 | return true; |
| 7077 | ++GatherLoads; |
| 7078 | if (GatherLoads >= GatherLoadsLimit) |
| 7079 | return true; |
| 7080 | } |
| 7081 | if (TE->getOpcode() == Instruction::GetElementPtr || |
| 7082 | Instruction::isBinaryOp(Opcode: TE->getOpcode())) |
| 7083 | continue; |
| 7084 | if (TE->getOpcode() != Instruction::PHI) |
| 7085 | return true; |
| 7086 | if (VectorizableTree.front()->Scalars.size() == TinyVF && |
| 7087 | TE->getNumOperands() > PhiOpsLimit) |
| 7088 | return false; |
| 7089 | HasPhis = true; |
| 7090 | } |
| 7091 | return !HasPhis; |
| 7092 | } |
| 7093 | return true; |
| 7094 | } |
| 7095 | |
| 7096 | void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask, |
| 7097 | ArrayRef<int> MaskOrder) { |
| 7098 | assert(State == TreeEntry::SplitVectorize && "Expected split user node." ); |
| 7099 | SmallVector<int> NewMask(getVectorFactor()); |
| 7100 | SmallVector<int> NewMaskOrder(getVectorFactor()); |
| 7101 | std::iota(first: NewMask.begin(), last: NewMask.end(), value: 0); |
| 7102 | std::iota(first: NewMaskOrder.begin(), last: NewMaskOrder.end(), value: 0); |
| 7103 | if (Idx == 0) { |
| 7104 | copy(Range&: Mask, Out: NewMask.begin()); |
| 7105 | copy(Range&: MaskOrder, Out: NewMaskOrder.begin()); |
| 7106 | } else { |
| 7107 | assert(Idx == 1 && "Expected either 0 or 1 index." ); |
| 7108 | unsigned Offset = CombinedEntriesWithIndices.back().second; |
| 7109 | for (unsigned I : seq<unsigned>(Size: Mask.size())) { |
| 7110 | NewMask[I + Offset] = Mask[I] + Offset; |
| 7111 | NewMaskOrder[I + Offset] = MaskOrder[I] + Offset; |
| 7112 | } |
| 7113 | } |
| 7114 | reorderScalars(Scalars, Mask: NewMask); |
| 7115 | reorderOrder(Order&: ReorderIndices, Mask: NewMaskOrder, /*BottomOrder=*/true); |
| 7116 | if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(Order: ReorderIndices)) |
| 7117 | ReorderIndices.clear(); |
| 7118 | } |
| 7119 | |
| 7120 | void BoUpSLP::reorderTopToBottom() { |
| 7121 | // Maps VF to the graph nodes. |
| 7122 | DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries; |
| 7123 | // ExtractElement gather nodes which can be vectorized and need to handle |
| 7124 | // their ordering. |
| 7125 | DenseMap<const TreeEntry *, OrdersType> GathersToOrders; |
| 7126 | |
| 7127 | // Phi nodes can have a preferred ordering based on their result users. |
| 7128 | DenseMap<const TreeEntry *, OrdersType> PhisToOrders; |
| 7129 | |
| 7130 | // AltShuffles can also have a preferred ordering that leads to fewer |
| 7131 | // instructions, e.g., the addsub instruction in x86. |
| 7132 | DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders; |
| 7133 | |
| 7134 | // Maps a TreeEntry to the reorder indices of external users. |
| 7135 | DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>> |
| 7136 | ExternalUserReorderMap; |
| 7137 | // Find all reorderable nodes with the given VF. |
| 7138 | // Currently these are vectorized stores, loads, extracts + some gathering of |
| 7139 | // extracts. |
| 7140 | for_each(Range&: VectorizableTree, F: [&, &TTIRef = *TTI]( |
| 7141 | const std::unique_ptr<TreeEntry> &TE) { |
| 7142 | // Look for external users that will probably be vectorized. |
| 7143 | SmallVector<OrdersType, 1> ExternalUserReorderIndices = |
| 7144 | findExternalStoreUsersReorderIndices(TE: TE.get()); |
| 7145 | if (!ExternalUserReorderIndices.empty()) { |
| 7146 | VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get()); |
| 7147 | ExternalUserReorderMap.try_emplace(Key: TE.get(), |
| 7148 | Args: std::move(ExternalUserReorderIndices)); |
| 7149 | } |
| 7150 | |
| 7151 | // Patterns like [fadd,fsub] can be combined into a single instruction in |
| 7152 | // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need |
| 7153 | // to take into account their order when looking for the most used order. |
| 7154 | if (TE->hasState() && TE->isAltShuffle() && |
| 7155 | TE->State != TreeEntry::SplitVectorize) { |
| 7156 | Type *ScalarTy = TE->Scalars[0]->getType(); |
| 7157 | VectorType *VecTy = getWidenedType(ScalarTy, VF: TE->Scalars.size()); |
| 7158 | unsigned Opcode0 = TE->getOpcode(); |
| 7159 | unsigned Opcode1 = TE->getAltOpcode(); |
| 7160 | SmallBitVector OpcodeMask( |
| 7161 | getAltInstrMask(VL: TE->Scalars, ScalarTy, Opcode0, Opcode1)); |
| 7162 | // If this pattern is supported by the target then we consider the order. |
| 7163 | if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { |
| 7164 | VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get()); |
| 7165 | AltShufflesToOrders.try_emplace(Key: TE.get(), Args: OrdersType()); |
| 7166 | } |
| 7167 | // TODO: Check the reverse order too. |
| 7168 | } |
| 7169 | |
| 7170 | bool IgnoreReorder = |
| 7171 | !UserIgnoreList && VectorizableTree.front()->hasState() && |
| 7172 | (VectorizableTree.front()->getOpcode() == Instruction::InsertElement || |
| 7173 | VectorizableTree.front()->getOpcode() == Instruction::Store); |
| 7174 | if (std::optional<OrdersType> CurrentOrder = |
| 7175 | getReorderingData(TE: *TE, /*TopToBottom=*/true, IgnoreReorder)) { |
| 7176 | // Do not include ordering for nodes used in the alt opcode vectorization; |
| 7177 | // it is better to reorder them during the bottom-to-top stage. Following |
| 7178 | // the order here causes reordering of the whole graph, though actually it |
| 7179 | // is profitable just to reorder the subgraph that starts from the alternate |
| 7180 | // opcode vectorization node. Such nodes already end up with the shuffle |
| 7181 | // instruction, and it is enough to change this shuffle rather than |
| 7182 | // rotate the scalars for the whole graph. |
| 7183 | unsigned Cnt = 0; |
| 7184 | const TreeEntry *UserTE = TE.get(); |
| 7185 | while (UserTE && Cnt < RecursionMaxDepth) { |
| 7186 | if (!UserTE->UserTreeIndex) |
| 7187 | break; |
| 7188 | if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize && |
| 7189 | UserTE->UserTreeIndex.UserTE->isAltShuffle() && |
| 7190 | UserTE->UserTreeIndex.UserTE->Idx != 0) |
| 7191 | return; |
| 7192 | UserTE = UserTE->UserTreeIndex.UserTE; |
| 7193 | ++Cnt; |
| 7194 | } |
| 7195 | VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get()); |
| 7196 | if (!(TE->State == TreeEntry::Vectorize || |
| 7197 | TE->State == TreeEntry::StridedVectorize || |
| 7198 | TE->State == TreeEntry::SplitVectorize || |
| 7199 | TE->State == TreeEntry::CompressVectorize) || |
| 7200 | !TE->ReuseShuffleIndices.empty()) |
| 7201 | GathersToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder); |
| 7202 | if (TE->State == TreeEntry::Vectorize && |
| 7203 | TE->getOpcode() == Instruction::PHI) |
| 7204 | PhisToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder); |
| 7205 | } |
| 7206 | }); |
| 7207 | |
| 7208 | // Reorder the graph nodes according to their vectorization factor. |
| 7209 | for (unsigned VF = VectorizableTree.front()->getVectorFactor(); |
| 7210 | !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) { |
| 7211 | auto It = VFToOrderedEntries.find(Val: VF); |
| 7212 | if (It == VFToOrderedEntries.end()) |
| 7213 | continue; |
| 7214 | // Try to find the most profitable order. We are just looking for the most
| 7215 | // used order and reorder the scalar elements in the nodes according to this
| 7216 | // most used order.
| 7217 | ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef(); |
| 7218 | // Delete VF entry upon exit. |
| 7219 | auto Cleanup = make_scope_exit(F: [&]() { VFToOrderedEntries.erase(I: It); }); |
| 7220 | |
| 7221 | // All operands are reordered and used only in this node - propagate the |
| 7222 | // most used order to the user node. |
| 7223 | MapVector<OrdersType, unsigned, |
| 7224 | DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> |
| 7225 | OrdersUses; |
| 7226 | for (const TreeEntry *OpTE : OrderedEntries) { |
| 7227 | // No need to reorder these nodes; we still need to extend and to use a
| 7228 | // shuffle, just merging the reordering shuffle and the reuse shuffle.
| 7229 | if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(Val: OpTE) && |
| 7230 | OpTE->State != TreeEntry::SplitVectorize) |
| 7231 | continue; |
| 7232 | // Count number of orders uses. |
| 7233 | const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders, |
| 7234 | &PhisToOrders]() -> const OrdersType & { |
| 7235 | if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) { |
| 7236 | auto It = GathersToOrders.find(Val: OpTE); |
| 7237 | if (It != GathersToOrders.end()) |
| 7238 | return It->second; |
| 7239 | } |
| 7240 | if (OpTE->hasState() && OpTE->isAltShuffle()) { |
| 7241 | auto It = AltShufflesToOrders.find(Val: OpTE); |
| 7242 | if (It != AltShufflesToOrders.end()) |
| 7243 | return It->second; |
| 7244 | } |
| 7245 | if (OpTE->State == TreeEntry::Vectorize && |
| 7246 | OpTE->getOpcode() == Instruction::PHI) { |
| 7247 | auto It = PhisToOrders.find(Val: OpTE); |
| 7248 | if (It != PhisToOrders.end()) |
| 7249 | return It->second; |
| 7250 | } |
| 7251 | return OpTE->ReorderIndices; |
| 7252 | }(); |
| 7253 | // First consider the order of the external scalar users. |
| 7254 | auto It = ExternalUserReorderMap.find(Val: OpTE); |
| 7255 | if (It != ExternalUserReorderMap.end()) { |
| 7256 | const auto &ExternalUserReorderIndices = It->second; |
| 7257 | // If the OpTE vector factor != number of scalars, use the natural order;
| 7258 | // this is an attempt to reorder a node with reused scalars but with
| 7259 | // external uses.
| 7260 | if (OpTE->getVectorFactor() != OpTE->Scalars.size()) { |
| 7261 | OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0)).first->second += |
| 7262 | ExternalUserReorderIndices.size(); |
| 7263 | } else { |
| 7264 | for (const OrdersType &ExtOrder : ExternalUserReorderIndices) |
| 7265 | ++OrdersUses.insert(KV: std::make_pair(x: ExtOrder, y: 0)).first->second; |
| 7266 | } |
| 7267 | // No other useful reorder data in this entry. |
| 7268 | if (Order.empty()) |
| 7269 | continue; |
| 7270 | } |
| 7271 | // Stores actually store the mask, not the order; we need to invert it.
| 7272 | if (OpTE->State == TreeEntry::Vectorize && |
| 7273 | OpTE->getOpcode() == Instruction::Store && !Order.empty()) { |
| 7274 | assert(!OpTE->isAltShuffle() && |
| 7275 | "Alternate instructions are only supported by BinaryOperator " |
| 7276 | "and CastInst." ); |
| 7277 | SmallVector<int> Mask; |
| 7278 | inversePermutation(Indices: Order, Mask); |
| 7279 | unsigned E = Order.size(); |
| 7280 | OrdersType CurrentOrder(E, E); |
| 7281 | transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) { |
| 7282 | return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx); |
| 7283 | }); |
| 7284 | fixupOrderingIndices(Order: CurrentOrder); |
| 7285 | ++OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second; |
| 7286 | } else { |
| 7287 | ++OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second; |
| 7288 | } |
| 7289 | } |
| 7290 | if (OrdersUses.empty()) |
| 7291 | continue; |
| 7292 | // Choose the most used order. |
| 7293 | unsigned IdentityCnt = 0; |
| 7294 | unsigned FilledIdentityCnt = 0; |
| 7295 | OrdersType IdentityOrder(VF, VF); |
| 7296 | for (auto &Pair : OrdersUses) { |
| 7297 | if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) { |
| 7298 | if (!Pair.first.empty()) |
| 7299 | FilledIdentityCnt += Pair.second; |
| 7300 | IdentityCnt += Pair.second; |
| 7301 | combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first); |
| 7302 | } |
| 7303 | } |
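| | // Example with hypothetical use counts: OrdersUses = { {0,1}:2, {1,0}:2 }
| | // gives IdentityCnt == FilledIdentityCnt == 2, so the tie-break below lets
| | // the non-identity candidate {1,0} win; if the identity uses came only from
| | // empty orders ({}:2 instead of {0,1}:2), FilledIdentityCnt would be 0 and
| | // the identity order would be kept.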
| 7304 | MutableArrayRef<unsigned> BestOrder = IdentityOrder; |
| 7305 | unsigned Cnt = IdentityCnt; |
| 7306 | for (auto &Pair : OrdersUses) { |
| 7307 | // Prefer the identity order. But if a filled identity (non-empty identity
| 7308 | // order) was found with the same number of uses as the new candidate
| 7309 | // order, we can choose this candidate order.
| 7310 | if (Cnt < Pair.second || |
| 7311 | (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt && |
| 7312 | Cnt == Pair.second && !BestOrder.empty() && |
| 7313 | isIdentityOrder(Order: BestOrder))) { |
| 7314 | combineOrders(Order: Pair.first, SecondaryOrder: BestOrder); |
| 7315 | BestOrder = Pair.first; |
| 7316 | Cnt = Pair.second; |
| 7317 | } else { |
| 7318 | combineOrders(Order: BestOrder, SecondaryOrder: Pair.first); |
| 7319 | } |
| 7320 | } |
| 7321 | // Set order of the user node. |
| 7322 | if (isIdentityOrder(Order: BestOrder)) |
| 7323 | continue; |
| 7324 | fixupOrderingIndices(Order: BestOrder); |
| 7325 | SmallVector<int> Mask; |
| 7326 | inversePermutation(Indices: BestOrder, Mask); |
| 7327 | SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem); |
| 7328 | unsigned E = BestOrder.size(); |
| 7329 | transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) { |
| 7330 | return I < E ? static_cast<int>(I) : PoisonMaskElem; |
| 7331 | }); |
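| | // Example (hypothetical order): BestOrder = {2, 0, 1} yields Mask = {1, 2, 0}
| | // (its inverse, applied to scalars and operands below) and
| | // MaskOrder = {2, 0, 1} (the order itself in shuffle-mask form, with
| | // PoisonMaskElem standing in for any undefined slot).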
| 7332 | // Do an actual reordering, if profitable. |
| 7333 | for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) { |
| 7334 | // Just do the reordering for the nodes with the given VF. |
| 7335 | if (TE->Scalars.size() != VF) { |
| 7336 | if (TE->ReuseShuffleIndices.size() == VF) { |
| 7337 | assert(TE->State != TreeEntry::SplitVectorize && |
| 7338 | "Split vectorized not expected." ); |
| 7339 | // Need to reorder the reuse masks of the operands with a smaller VF to
| 7340 | // be able to find the match between the graph nodes and the scalar
| 7341 | // operands of the given node during vectorization/cost estimation.
| 7342 | assert( |
| 7343 | (!TE->UserTreeIndex || |
| 7344 | TE->UserTreeIndex.UserTE->Scalars.size() == VF || |
| 7345 | TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() || |
| 7346 | TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) && |
| 7347 | "All users must be of VF size." ); |
| 7348 | if (SLPReVec) { |
| 7349 | assert(SLPReVec && "Only supported by REVEC." ); |
| 7350 | // ShuffleVectorInst does not do reorderOperands (and it should not |
| 7351 | // because ShuffleVectorInst supports only a limited set of |
| 7352 | // patterns). Only do reorderNodeWithReuses if the user is not |
| 7353 | // ShuffleVectorInst. |
| 7354 | if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() && |
| 7355 | isa<ShuffleVectorInst>(Val: TE->UserTreeIndex.UserTE->getMainOp())) |
| 7356 | continue; |
| 7357 | } |
| 7358 | // Update the ordering of the operands with a smaller VF than the given
| 7359 | // one.
| 7360 | reorderNodeWithReuses(TE&: *TE, Mask); |
| 7361 | // Update orders in user split vectorize nodes. |
| 7362 | if (TE->UserTreeIndex && |
| 7363 | TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) |
| 7364 | TE->UserTreeIndex.UserTE->reorderSplitNode( |
| 7365 | Idx: TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder); |
| 7366 | } |
| 7367 | continue; |
| 7368 | } |
| 7369 | if ((TE->State == TreeEntry::SplitVectorize && |
| 7370 | TE->ReuseShuffleIndices.empty()) || |
| 7371 | ((TE->State == TreeEntry::Vectorize || |
| 7372 | TE->State == TreeEntry::StridedVectorize || |
| 7373 | TE->State == TreeEntry::CompressVectorize) && |
| 7374 | (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst, |
| 7375 | InsertElementInst>(Val: TE->getMainOp()) || |
| 7376 | (SLPReVec && isa<ShuffleVectorInst>(Val: TE->getMainOp()))))) { |
| 7377 | assert( |
| 7378 | (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize && |
| 7379 | TE->ReuseShuffleIndices.empty())) && |
| 7380 | "Alternate instructions are only supported by BinaryOperator " |
| 7381 | "and CastInst." ); |
| 7382 | // Build correct orders for extract{element,value}, loads, |
| 7383 | // stores and alternate (split) nodes. |
| 7384 | reorderOrder(Order&: TE->ReorderIndices, Mask); |
| 7385 | if (isa<InsertElementInst, StoreInst>(Val: TE->getMainOp())) |
| 7386 | TE->reorderOperands(Mask); |
| 7387 | } else { |
| 7388 | // Reorder the node and its operands. |
| 7389 | TE->reorderOperands(Mask); |
| 7390 | assert(TE->ReorderIndices.empty() && |
| 7391 | "Expected empty reorder sequence." ); |
| 7392 | reorderScalars(Scalars&: TE->Scalars, Mask); |
| 7393 | } |
| 7394 | if (!TE->ReuseShuffleIndices.empty()) { |
| 7395 | // Apply the reversed order to keep the original ordering of the reused
| 7396 | // elements and avoid extra shuffling of the reorder indices.
| 7397 | OrdersType CurrentOrder; |
| 7398 | reorderOrder(Order&: CurrentOrder, Mask: MaskOrder); |
| 7399 | SmallVector<int> NewReuses; |
| 7400 | inversePermutation(Indices: CurrentOrder, Mask&: NewReuses); |
| 7401 | addMask(Mask&: NewReuses, SubMask: TE->ReuseShuffleIndices); |
| 7402 | TE->ReuseShuffleIndices.swap(RHS&: NewReuses); |
| 7403 | } else if (TE->UserTreeIndex && |
| 7404 | TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) |
| 7405 | // Update orders in user split vectorize nodes. |
| 7406 | TE->UserTreeIndex.UserTE->reorderSplitNode(Idx: TE->UserTreeIndex.EdgeIdx, |
| 7407 | Mask, MaskOrder); |
| 7408 | } |
| 7409 | } |
| 7410 | } |
| 7411 | |
| 7412 | void BoUpSLP::buildReorderableOperands( |
| 7413 | TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges, |
| 7414 | const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers, |
| 7415 | SmallVectorImpl<TreeEntry *> &GatherOps) { |
| 7416 | for (unsigned I : seq<unsigned>(Size: UserTE->getNumOperands())) { |
| 7417 | if (any_of(Range&: Edges, P: [I](const std::pair<unsigned, TreeEntry *> &OpData) { |
| 7418 | return OpData.first == I && |
| 7419 | (OpData.second->State == TreeEntry::Vectorize || |
| 7420 | OpData.second->State == TreeEntry::StridedVectorize || |
| 7421 | OpData.second->State == TreeEntry::CompressVectorize || |
| 7422 | OpData.second->State == TreeEntry::SplitVectorize); |
| 7423 | })) |
| 7424 | continue; |
| 7425 | // Do not request operands if they do not exist.
| 7426 | if (UserTE->hasState()) { |
| 7427 | if (UserTE->getOpcode() == Instruction::ExtractElement || |
| 7428 | UserTE->getOpcode() == Instruction::ExtractValue) |
| 7429 | continue; |
| 7430 | if (UserTE->getOpcode() == Instruction::InsertElement && I == 0) |
| 7431 | continue; |
| 7432 | if (UserTE->getOpcode() == Instruction::Store && |
| 7433 | UserTE->State == TreeEntry::Vectorize && I == 1) |
| 7434 | continue; |
| 7435 | if (UserTE->getOpcode() == Instruction::Load && |
| 7436 | (UserTE->State == TreeEntry::Vectorize || |
| 7437 | UserTE->State == TreeEntry::StridedVectorize || |
| 7438 | UserTE->State == TreeEntry::CompressVectorize)) |
| 7439 | continue; |
| 7440 | } |
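| | // Restating the checks above in terms of the IR operand layout: for a
| | // StoreInst the stored value is operand 0 and the pointer is operand 1,
| | // so only the value operand remains a reorder candidate; for an
| | // InsertElementInst the aggregate operand 0 is likewise skipped.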
| 7441 | TreeEntry *TE = getOperandEntry(E: UserTE, Idx: I); |
| 7442 | assert(TE && "Expected operand entry." ); |
| 7443 | if (!TE->isGather()) { |
| 7444 | // Add the node to the list of the ordered nodes with the identity |
| 7445 | // order. |
| 7446 | Edges.emplace_back(Args&: I, Args&: TE); |
| 7447 | // Add ScatterVectorize nodes to the list of operands, where just |
| 7448 | // reordering of the scalars is required. Similar to the gathers, so |
| 7449 | // simply add to the list of gathered ops. |
| 7450 | // If there are reused scalars, process this node as a regular vectorize |
| 7451 | // node, just reorder reuses mask. |
| 7452 | if (TE->State == TreeEntry::ScatterVectorize && |
| 7453 | TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty()) |
| 7454 | GatherOps.push_back(Elt: TE); |
| 7455 | continue; |
| 7456 | } |
| 7457 | if (ReorderableGathers.contains(Ptr: TE)) |
| 7458 | GatherOps.push_back(Elt: TE); |
| 7459 | } |
| 7460 | } |
| 7461 | |
| 7462 | void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) { |
| 7463 | struct TreeEntryCompare { |
| 7464 | bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const { |
| 7465 | if (LHS->UserTreeIndex && RHS->UserTreeIndex) |
| 7466 | return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx; |
| 7467 | return LHS->Idx < RHS->Idx; |
| 7468 | } |
| 7469 | }; |
| 7470 | PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue; |
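| | // Note: PriorityQueue is a max-heap, so with the comparator above the
| | // entry whose user node has the largest Idx (a node created later and
| | // typically deeper in the graph) is popped first, which drives the walk
| | // bottom-up.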
| 7471 | DenseSet<const TreeEntry *> GathersToOrders; |
| 7472 | // Find all reorderable leaf nodes with the given VF. |
| 7473 | // Currently these are vectorized loads and extracts without alternate
| 7474 | // operands, plus some gathering of extracts.
| 7475 | SmallPtrSet<const TreeEntry *, 4> NonVectorized; |
| 7476 | for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { |
| 7477 | if (TE->State != TreeEntry::Vectorize && |
| 7478 | TE->State != TreeEntry::StridedVectorize && |
| 7479 | TE->State != TreeEntry::CompressVectorize && |
| 7480 | TE->State != TreeEntry::SplitVectorize) |
| 7481 | NonVectorized.insert(Ptr: TE.get()); |
| 7482 | if (std::optional<OrdersType> CurrentOrder = |
| 7483 | getReorderingData(TE: *TE, /*TopToBottom=*/false, IgnoreReorder)) { |
| 7484 | Queue.push(x: TE.get()); |
| 7485 | if (!(TE->State == TreeEntry::Vectorize || |
| 7486 | TE->State == TreeEntry::StridedVectorize || |
| 7487 | TE->State == TreeEntry::CompressVectorize || |
| 7488 | TE->State == TreeEntry::SplitVectorize) || |
| 7489 | !TE->ReuseShuffleIndices.empty()) |
| 7490 | GathersToOrders.insert(V: TE.get()); |
| 7491 | } |
| 7492 | } |
| 7493 | |
| 7494 | // 1. Propagate the order to the graph nodes which use only reordered nodes.
| 7495 | // I.e., if the node has operands that are reordered, try to bring at least
| 7496 | // one operand into the natural order, reorder the others, and reorder the
| 7497 | // user node itself.
| 7498 | SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps; |
| 7499 | while (!Queue.empty()) { |
| 7500 | // 1. Filter out only reordered nodes. |
| 7501 | std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users; |
| 7502 | TreeEntry *TE = Queue.top(); |
| 7503 | const TreeEntry *UserTE = TE->UserTreeIndex.UserTE; |
| 7504 | Queue.pop(); |
| 7505 | SmallVector<TreeEntry *> OrderedOps(1, TE); |
| 7506 | while (!Queue.empty()) { |
| 7507 | TE = Queue.top(); |
| 7508 | if (!UserTE || UserTE != TE->UserTreeIndex.UserTE) |
| 7509 | break; |
| 7510 | Queue.pop(); |
| 7511 | OrderedOps.push_back(Elt: TE); |
| 7512 | } |
| 7513 | for (TreeEntry *TE : OrderedOps) { |
| 7514 | if (!(TE->State == TreeEntry::Vectorize || |
| 7515 | TE->State == TreeEntry::StridedVectorize || |
| 7516 | TE->State == TreeEntry::CompressVectorize || |
| 7517 | TE->State == TreeEntry::SplitVectorize || |
| 7518 | (TE->isGather() && GathersToOrders.contains(V: TE))) || |
| 7519 | !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() || |
| 7520 | !Visited.insert(Ptr: TE).second) |
| 7521 | continue; |
| 7522 | // Build a map between user nodes and their operand order to speed up the
| 7523 | // search. The graph currently does not provide this dependency directly.
| 7524 | Users.first = TE->UserTreeIndex.UserTE; |
| 7525 | Users.second.emplace_back(Args&: TE->UserTreeIndex.EdgeIdx, Args&: TE); |
| 7526 | } |
| 7527 | if (Users.first) { |
| 7528 | auto &Data = Users; |
| 7529 | if (Data.first->State == TreeEntry::SplitVectorize) { |
| 7530 | assert( |
| 7531 | Data.second.size() <= 2 && |
| 7532 | "Expected not greater than 2 operands for split vectorize node." ); |
| 7533 | if (any_of(Range&: Data.second, |
| 7534 | P: [](const auto &Op) { return !Op.second->UserTreeIndex; })) |
| 7535 | continue; |
| 7536 | // Update orders in user split vectorize nodes. |
| 7537 | assert(Data.first->CombinedEntriesWithIndices.size() == 2 && |
| 7538 | "Expected exactly 2 entries." ); |
| 7539 | for (const auto &P : Data.first->CombinedEntriesWithIndices) { |
| 7540 | TreeEntry &OpTE = *VectorizableTree[P.first]; |
| 7541 | OrdersType Order = OpTE.ReorderIndices; |
| 7542 | if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) { |
| 7543 | if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty()) |
| 7544 | continue; |
| 7545 | const auto BestOrder = |
| 7546 | getReorderingData(TE: OpTE, /*TopToBottom=*/false, IgnoreReorder); |
| 7547 | if (!BestOrder || BestOrder->empty() || isIdentityOrder(Order: *BestOrder)) |
| 7548 | continue; |
| 7549 | Order = *BestOrder; |
| 7550 | } |
| 7551 | fixupOrderingIndices(Order); |
| 7552 | SmallVector<int> Mask; |
| 7553 | inversePermutation(Indices: Order, Mask); |
| 7554 | const unsigned E = Order.size(); |
| 7555 | SmallVector<int> MaskOrder(E, PoisonMaskElem); |
| 7556 | transform(Range&: Order, d_first: MaskOrder.begin(), F: [E](unsigned I) { |
| 7557 | return I < E ? static_cast<int>(I) : PoisonMaskElem; |
| 7558 | }); |
| 7559 | Data.first->reorderSplitNode(Idx: P.second ? 1 : 0, Mask, MaskOrder); |
| 7560 | // Clear ordering of the operand. |
| 7561 | if (!OpTE.ReorderIndices.empty()) { |
| 7562 | OpTE.ReorderIndices.clear(); |
| 7563 | } else if (!OpTE.ReuseShuffleIndices.empty()) { |
| 7564 | reorderReuses(Reuses&: OpTE.ReuseShuffleIndices, Mask); |
| 7565 | } else { |
| 7566 | assert(OpTE.isGather() && "Expected only gather/buildvector node." ); |
| 7567 | reorderScalars(Scalars&: OpTE.Scalars, Mask); |
| 7568 | } |
| 7569 | } |
| 7570 | if (Data.first->ReuseShuffleIndices.empty() && |
| 7571 | !Data.first->ReorderIndices.empty()) { |
| 7572 | // Insert user node to the list to try to sink reordering deeper in |
| 7573 | // the graph. |
| 7574 | Queue.push(x: Data.first); |
| 7575 | } |
| 7576 | continue; |
| 7577 | } |
| 7578 | // Check that operands are used only in the User node. |
| 7579 | SmallVector<TreeEntry *> GatherOps; |
| 7580 | buildReorderableOperands(UserTE: Data.first, Edges&: Data.second, ReorderableGathers: NonVectorized, |
| 7581 | GatherOps); |
| 7582 | // All operands are reordered and used only in this node - propagate the |
| 7583 | // most used order to the user node. |
| 7584 | MapVector<OrdersType, unsigned, |
| 7585 | DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>> |
| 7586 | OrdersUses; |
| 7587 | // Do the analysis for each tree entry only once, otherwise the order of |
| 7588 | // the same node may be considered several times, though it might not be
| 7589 | // profitable.
| 7590 | SmallPtrSet<const TreeEntry *, 4> VisitedOps; |
| 7591 | SmallPtrSet<const TreeEntry *, 4> VisitedUsers; |
| 7592 | for (const auto &Op : Data.second) { |
| 7593 | TreeEntry *OpTE = Op.second; |
| 7594 | if (!VisitedOps.insert(Ptr: OpTE).second) |
| 7595 | continue; |
| 7596 | if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(V: OpTE)) |
| 7597 | continue; |
| 7598 | const auto Order = [&]() -> const OrdersType { |
| 7599 | if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) |
| 7600 | return getReorderingData(TE: *OpTE, /*TopToBottom=*/false, |
| 7601 | IgnoreReorder) |
| 7602 | .value_or(u: OrdersType(1)); |
| 7603 | return OpTE->ReorderIndices; |
| 7604 | }(); |
| 7605 | // The order is partially ordered, skip it in favor of fully non-ordered |
| 7606 | // orders. |
| 7607 | if (Order.size() == 1) |
| 7608 | continue; |
| 7609 | |
| 7610 | // Check that the reordering does not increase the number of shuffles, i.e.
| 7611 | // same-values nodes share a parent or their parents share a parent.
| 7612 | if (!Order.empty() && !isIdentityOrder(Order)) { |
| 7613 | Value *Root = OpTE->hasState() |
| 7614 | ? OpTE->getMainOp() |
| 7615 | : *find_if_not(Range&: OpTE->Scalars, P: isConstant); |
| 7616 | auto GetSameNodesUsers = [&](Value *Root) { |
| 7617 | SmallSetVector<TreeEntry *, 4> Res; |
| 7618 | for (const TreeEntry *TE : ValueToGatherNodes.lookup(Val: Root)) { |
| 7619 | if (TE != OpTE && TE->UserTreeIndex && |
| 7620 | TE->getVectorFactor() == OpTE->getVectorFactor() && |
| 7621 | TE->Scalars.size() == OpTE->Scalars.size() && |
| 7622 | ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) || |
| 7623 | (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars)))) |
| 7624 | Res.insert(X: TE->UserTreeIndex.UserTE); |
| 7625 | } |
| 7626 | for (const TreeEntry *TE : getTreeEntries(V: Root)) { |
| 7627 | if (TE != OpTE && TE->UserTreeIndex && |
| 7628 | TE->getVectorFactor() == OpTE->getVectorFactor() && |
| 7629 | TE->Scalars.size() == OpTE->Scalars.size() && |
| 7630 | ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) || |
| 7631 | (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars)))) |
| 7632 | Res.insert(X: TE->UserTreeIndex.UserTE); |
| 7633 | } |
| 7634 | return Res.takeVector(); |
| 7635 | }; |
| 7636 | auto GetNumOperands = [](const TreeEntry *TE) { |
| 7637 | if (TE->State == TreeEntry::SplitVectorize) |
| 7638 | return TE->getNumOperands(); |
| 7639 | if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI) |
| 7640 | return CI->arg_size(); |
| 7641 | return TE->getNumOperands(); |
| 7642 | }; |
| 7643 | auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI]( |
| 7644 | const TreeEntry *TE) { |
| 7645 | Intrinsic::ID ID = Intrinsic::not_intrinsic; |
| 7646 | if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI) |
| 7647 | ID = getVectorIntrinsicIDForCall(CI, TLI); |
| 7648 | for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(TE))) { |
| 7649 | if (ID != Intrinsic::not_intrinsic && |
| 7650 | isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI)) |
| 7651 | continue; |
| 7652 | const TreeEntry *Op = getOperandEntry(E: TE, Idx); |
| 7653 | if (Op->isGather() && Op->hasState()) { |
| 7654 | const TreeEntry *VecOp = |
| 7655 | getSameValuesTreeEntry(V: Op->getMainOp(), VL: Op->Scalars); |
| 7656 | if (VecOp) |
| 7657 | Op = VecOp; |
| 7658 | } |
| 7659 | if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty()) |
| 7660 | return false; |
| 7661 | } |
| 7662 | return true; |
| 7663 | }; |
| 7664 | SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root); |
| 7665 | if (!Users.empty() && !all_of(Range&: Users, P: [&](TreeEntry *UTE) { |
| 7666 | if (!RevisitedOps.insert(Ptr: UTE).second) |
| 7667 | return false; |
| 7668 | return UTE == Data.first || !UTE->ReorderIndices.empty() || |
| 7669 | !UTE->ReuseShuffleIndices.empty() || |
| 7670 | (UTE->UserTreeIndex && |
| 7671 | UTE->UserTreeIndex.UserTE == Data.first) || |
| 7672 | (Data.first->UserTreeIndex && |
| 7673 | Data.first->UserTreeIndex.UserTE == UTE) || |
| 7674 | (IgnoreReorder && UTE->UserTreeIndex && |
| 7675 | UTE->UserTreeIndex.UserTE->Idx == 0) || |
| 7676 | NodeShouldBeReorderedWithOperands(UTE); |
| 7677 | })) |
| 7678 | continue; |
| 7679 | for (TreeEntry *UTE : Users) { |
| 7680 | Intrinsic::ID ID = Intrinsic::not_intrinsic; |
| 7681 | if (auto *CI = dyn_cast<CallInst>(Val: UTE->getMainOp()); CI) |
| 7682 | ID = getVectorIntrinsicIDForCall(CI, TLI); |
| 7683 | for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(UTE))) { |
| 7684 | if (ID != Intrinsic::not_intrinsic && |
| 7685 | isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI)) |
| 7686 | continue; |
| 7687 | const TreeEntry *Op = getOperandEntry(E: UTE, Idx); |
| 7688 | Visited.erase(Ptr: Op); |
| 7689 | Queue.push(x: const_cast<TreeEntry *>(Op)); |
| 7690 | } |
| 7691 | } |
| 7692 | } |
| 7693 | unsigned NumOps = count_if( |
| 7694 | Range&: Data.second, P: [OpTE](const std::pair<unsigned, TreeEntry *> &P) { |
| 7695 | return P.second == OpTE; |
| 7696 | }); |
| 7697 | // Stores actually store the mask, not the order; we need to invert it.
| 7698 | if (OpTE->State == TreeEntry::Vectorize && |
| 7699 | OpTE->getOpcode() == Instruction::Store && !Order.empty()) { |
| 7700 | assert(!OpTE->isAltShuffle() && |
| 7701 | "Alternate instructions are only supported by BinaryOperator " |
| 7702 | "and CastInst." ); |
| 7703 | SmallVector<int> Mask; |
| 7704 | inversePermutation(Indices: Order, Mask); |
| 7705 | unsigned E = Order.size(); |
| 7706 | OrdersType CurrentOrder(E, E); |
| 7707 | transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) { |
| 7708 | return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx); |
| 7709 | }); |
| 7710 | fixupOrderingIndices(Order: CurrentOrder); |
| 7711 | OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second += |
| 7712 | NumOps; |
| 7713 | } else { |
| 7714 | OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second += NumOps; |
| 7715 | } |
| 7716 | auto Res = OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0)); |
| 7717 | const auto AllowsReordering = [&](const TreeEntry *TE) { |
| 7718 | if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() || |
| 7719 | (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) || |
| 7720 | (IgnoreReorder && TE->Idx == 0)) |
| 7721 | return true; |
| 7722 | if (TE->isGather()) { |
| 7723 | if (GathersToOrders.contains(V: TE)) |
| 7724 | return !getReorderingData(TE: *TE, /*TopToBottom=*/false, |
| 7725 | IgnoreReorder) |
| 7726 | .value_or(u: OrdersType(1)) |
| 7727 | .empty(); |
| 7728 | return true; |
| 7729 | } |
| 7730 | return false; |
| 7731 | }; |
| 7732 | if (OpTE->UserTreeIndex) { |
| 7733 | TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE; |
| 7734 | if (!VisitedUsers.insert(Ptr: UserTE).second) |
| 7735 | continue; |
| 7736 | // We may reorder the user node if it requires reordering, has reused
| 7737 | // scalars, is an alternate-opcode vectorize node, or its operand nodes
| 7738 | // require reordering.
| 7739 | if (AllowsReordering(UserTE)) |
| 7740 | continue; |
| 7741 | // Check if users allow reordering. |
| 7742 | // Currently look up just 1 level of operands to avoid increase of |
| 7743 | // the compile time. |
| 7744 | // It is profitable to reorder if definitely more operands allow
| 7745 | // reordering than those that keep the natural order.
| 7746 | ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users.second; |
| 7747 | if (static_cast<unsigned>(count_if( |
| 7748 | Range&: Ops, P: [UserTE, &AllowsReordering]( |
| 7749 | const std::pair<unsigned, TreeEntry *> &Op) { |
| 7750 | return AllowsReordering(Op.second) && |
| 7751 | Op.second->UserTreeIndex.UserTE == UserTE; |
| 7752 | })) <= Ops.size() / 2) |
| 7753 | ++Res.first->second; |
| 7754 | } |
| 7755 | } |
| 7756 | if (OrdersUses.empty()) { |
| 7757 | Visited.insert_range(R: llvm::make_second_range(c&: Data.second)); |
| 7758 | continue; |
| 7759 | } |
| 7760 | // Choose the most used order. |
| 7761 | unsigned IdentityCnt = 0; |
| 7762 | unsigned VF = Data.second.front().second->getVectorFactor(); |
| 7763 | OrdersType IdentityOrder(VF, VF); |
| 7764 | for (auto &Pair : OrdersUses) { |
| 7765 | if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) { |
| 7766 | IdentityCnt += Pair.second; |
| 7767 | combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first); |
| 7768 | } |
| 7769 | } |
| 7770 | MutableArrayRef<unsigned> BestOrder = IdentityOrder; |
| 7771 | unsigned Cnt = IdentityCnt; |
| 7772 | for (auto &Pair : OrdersUses) { |
| 7773 | // Prefer the identity order. But if a filled identity (non-empty identity
| 7774 | // order) was found with the same number of uses as the new candidate
| 7775 | // order, we can choose this candidate order.
| 7776 | if (Cnt < Pair.second) { |
| 7777 | combineOrders(Order: Pair.first, SecondaryOrder: BestOrder); |
| 7778 | BestOrder = Pair.first; |
| 7779 | Cnt = Pair.second; |
| 7780 | } else { |
| 7781 | combineOrders(Order: BestOrder, SecondaryOrder: Pair.first); |
| 7782 | } |
| 7783 | } |
| 7784 | // Set order of the user node. |
| 7785 | if (isIdentityOrder(Order: BestOrder)) { |
| 7786 | Visited.insert_range(R: llvm::make_second_range(c&: Data.second)); |
| 7787 | continue; |
| 7788 | } |
| 7789 | fixupOrderingIndices(Order: BestOrder); |
| 7790 | // Erase operands from OrderedEntries list and adjust their orders. |
| 7791 | VisitedOps.clear(); |
| 7792 | SmallVector<int> Mask; |
| 7793 | inversePermutation(Indices: BestOrder, Mask); |
| 7794 | SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem); |
| 7795 | unsigned E = BestOrder.size(); |
| 7796 | transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) { |
| 7797 | return I < E ? static_cast<int>(I) : PoisonMaskElem; |
| 7798 | }); |
| 7799 | for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) { |
| 7800 | TreeEntry *TE = Op.second; |
| 7801 | if (!VisitedOps.insert(Ptr: TE).second) |
| 7802 | continue; |
| 7803 | if (TE->ReuseShuffleIndices.size() == BestOrder.size()) { |
| 7804 | reorderNodeWithReuses(TE&: *TE, Mask); |
| 7805 | continue; |
| 7806 | } |
| 7807 | // Gathers are processed separately. |
| 7808 | if (TE->State != TreeEntry::Vectorize && |
| 7809 | TE->State != TreeEntry::StridedVectorize && |
| 7810 | TE->State != TreeEntry::CompressVectorize && |
| 7811 | TE->State != TreeEntry::SplitVectorize && |
| 7812 | (TE->State != TreeEntry::ScatterVectorize || |
| 7813 | TE->ReorderIndices.empty())) |
| 7814 | continue; |
| 7815 | assert((BestOrder.size() == TE->ReorderIndices.size() || |
| 7816 | TE->ReorderIndices.empty()) && |
| 7817 | "Non-matching sizes of user/operand entries." ); |
| 7818 | reorderOrder(Order&: TE->ReorderIndices, Mask); |
| 7819 | if (IgnoreReorder && TE == VectorizableTree.front().get()) |
| 7820 | IgnoreReorder = false; |
| 7821 | } |
| 7822 | // For gathers we just need to reorder their scalars.
| 7823 | for (TreeEntry *Gather : GatherOps) { |
| 7824 | assert(Gather->ReorderIndices.empty() && |
| 7825 | "Unexpected reordering of gathers." ); |
| 7826 | if (!Gather->ReuseShuffleIndices.empty()) { |
| 7827 | // Just reorder reuses indices. |
| 7828 | reorderReuses(Reuses&: Gather->ReuseShuffleIndices, Mask); |
| 7829 | continue; |
| 7830 | } |
| 7831 | reorderScalars(Scalars&: Gather->Scalars, Mask); |
| 7832 | Visited.insert(Ptr: Gather); |
| 7833 | } |
| 7834 | // Reorder operands of the user node and set the ordering for the user |
| 7835 | // node itself. |
| 7836 | auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) { |
| 7837 | return TE.isAltShuffle() && |
| 7838 | (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 || |
| 7839 | TE.ReorderIndices.empty()); |
| 7840 | }; |
| 7841 | if (Data.first->State != TreeEntry::Vectorize || |
| 7842 | !isa<ExtractElementInst, ExtractValueInst, LoadInst>( |
| 7843 | Val: Data.first->getMainOp()) || |
| 7844 | IsNotProfitableAltCodeNode(*Data.first)) |
| 7845 | Data.first->reorderOperands(Mask); |
| 7846 | if (!isa<InsertElementInst, StoreInst>(Val: Data.first->getMainOp()) || |
| 7847 | IsNotProfitableAltCodeNode(*Data.first) || |
| 7848 | Data.first->State == TreeEntry::StridedVectorize || |
| 7849 | Data.first->State == TreeEntry::CompressVectorize) { |
| 7850 | reorderScalars(Scalars&: Data.first->Scalars, Mask); |
| 7851 | reorderOrder(Order&: Data.first->ReorderIndices, Mask: MaskOrder, |
| 7852 | /*BottomOrder=*/true); |
| 7853 | if (Data.first->ReuseShuffleIndices.empty() && |
| 7854 | !Data.first->ReorderIndices.empty() && |
| 7855 | !IsNotProfitableAltCodeNode(*Data.first)) { |
| 7856 | // Insert user node to the list to try to sink reordering deeper in |
| 7857 | // the graph. |
| 7858 | Queue.push(x: Data.first); |
| 7859 | } |
| 7860 | } else { |
| 7861 | reorderOrder(Order&: Data.first->ReorderIndices, Mask); |
| 7862 | } |
| 7863 | } |
| 7864 | } |
| 7865 | // If the reordering is unnecessary, just remove the reorder. |
| 7866 | if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() && |
| 7867 | VectorizableTree.front()->ReuseShuffleIndices.empty()) |
| 7868 | VectorizableTree.front()->ReorderIndices.clear(); |
| 7869 | } |
| 7870 | |
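| | // For reversed strided loads/stores (e.g. ReorderIndices = {3, 2, 1, 0})
| | // the root instruction is the scalar at ReorderIndices.front(), i.e.
| | // Scalars[3]; otherwise it is simply the first scalar.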
| 7871 | Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const { |
| 7872 | if ((Entry.getOpcode() == Instruction::Store || |
| 7873 | Entry.getOpcode() == Instruction::Load) && |
| 7874 | Entry.State == TreeEntry::StridedVectorize && |
| 7875 | !Entry.ReorderIndices.empty() && isReverseOrder(Order: Entry.ReorderIndices)) |
| 7876 | return dyn_cast<Instruction>(Val: Entry.Scalars[Entry.ReorderIndices.front()]); |
| 7877 | return dyn_cast<Instruction>(Val: Entry.Scalars.front()); |
| 7878 | } |
| 7879 | |
| 7880 | void BoUpSLP::buildExternalUses( |
| 7881 | const ExtraValueToDebugLocsMap &ExternallyUsedValues) { |
| 7882 | DenseMap<Value *, unsigned> ScalarToExtUses; |
| 7883 | // Collect the values that we need to extract from the tree. |
| 7884 | for (auto &TEPtr : VectorizableTree) { |
| 7885 | TreeEntry *Entry = TEPtr.get(); |
| 7886 | |
| 7887 | // No need to handle users of gathered values. |
| 7888 | if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize) |
| 7889 | continue; |
| 7890 | |
| 7891 | // For each lane: |
| 7892 | for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { |
| 7893 | Value *Scalar = Entry->Scalars[Lane]; |
| 7894 | if (!isa<Instruction>(Val: Scalar)) |
| 7895 | continue; |
| 7896 | // All uses must be replaced already? No need to do it again. |
| 7897 | auto It = ScalarToExtUses.find(Val: Scalar); |
| 7898 | if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User) |
| 7899 | continue; |
| 7900 | |
| 7901 | // Check if the scalar is externally used as an extra arg. |
| 7902 | const auto ExtI = ExternallyUsedValues.find(V: Scalar); |
| 7903 | if (ExtI != ExternallyUsedValues.end()) { |
| 7904 | int FoundLane = Entry->findLaneForValue(V: Scalar); |
| 7905 | LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " |
| 7906 | << FoundLane << " from " << *Scalar << ".\n" ); |
| 7907 | ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()); |
| 7908 | ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: *Entry, Args&: FoundLane); |
| 7909 | continue; |
| 7910 | } |
| 7911 | for (User *U : Scalar->users()) { |
| 7912 | LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n" ); |
| 7913 | |
| 7914 | Instruction *UserInst = dyn_cast<Instruction>(Val: U); |
| 7915 | if (!UserInst || isDeleted(I: UserInst)) |
| 7916 | continue; |
| 7917 | |
| 7918 | // Ignore users in the user ignore list. |
| 7919 | if (UserIgnoreList && UserIgnoreList->contains(V: UserInst)) |
| 7920 | continue; |
| 7921 | |
| 7922 | // Skip in-tree scalars that become vectors |
| 7923 | if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(V: U); |
| 7924 | !UseEntries.empty()) { |
| 7925 | // Some in-tree scalars will remain as scalar in vectorized |
| 7926 | // instructions. If that is the case, the one in FoundLane will |
| 7927 | // be used. |
| 7928 | if (all_of(Range&: UseEntries, P: [&](TreeEntry *UseEntry) { |
| 7929 | return UseEntry->State == TreeEntry::ScatterVectorize || |
| 7930 | !doesInTreeUserNeedToExtract( |
| 7931 | Scalar, UserInst: getRootEntryInstruction(Entry: *UseEntry), TLI, |
| 7932 | TTI); |
| 7933 | })) { |
| 7934 | LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U |
| 7935 | << ".\n" ); |
| 7936 | assert(none_of(UseEntries, |
| 7937 | [](TreeEntry *UseEntry) { |
| 7938 | return UseEntry->isGather(); |
| 7939 | }) && |
| 7940 | "Bad state" ); |
| 7941 | continue; |
| 7942 | } |
| 7943 | U = nullptr; |
| 7944 | if (It != ScalarToExtUses.end()) { |
| 7945 | ExternalUses[It->second].User = nullptr; |
| 7946 | break; |
| 7947 | } |
| 7948 | } |
| 7949 | |
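| | // Note: once a scalar has at least UsesLimit users, U is dropped to
| | // nullptr below, so a single external use with an unknown user is
| | // recorded (presumably letting one generic extract cover all users) and
| | // the scan of the remaining users stops.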
| 7950 | if (U && Scalar->hasNUsesOrMore(N: UsesLimit)) |
| 7951 | U = nullptr; |
| 7952 | int FoundLane = Entry->findLaneForValue(V: Scalar); |
| 7953 | LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst |
| 7954 | << " from lane " << FoundLane << " from " << *Scalar |
| 7955 | << ".\n" ); |
| 7956 | It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first; |
| 7957 | ExternalUses.emplace_back(Args&: Scalar, Args&: U, Args&: *Entry, Args&: FoundLane); |
| 7958 | if (!U) |
| 7959 | break; |
| 7960 | } |
| 7961 | } |
| 7962 | } |
| 7963 | } |
| 7964 | |
| 7965 | SmallVector<SmallVector<StoreInst *>> |
| 7966 | BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const { |
| 7967 | SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>, |
| 7968 | SmallVector<StoreInst *>, 8> |
| 7969 | PtrToStoresMap; |
| 7970 | for (unsigned Lane : seq<unsigned>(Begin: 0, End: TE->Scalars.size())) { |
| 7971 | Value *V = TE->Scalars[Lane]; |
| 7972 | // Don't iterate over the users of constant data. |
| 7973 | if (!isa<Instruction>(Val: V)) |
| 7974 | continue; |
| 7975 | // To save compilation time we don't visit if we have too many users. |
| 7976 | if (V->hasNUsesOrMore(N: UsesLimit)) |
| 7977 | break; |
| 7978 | |
| 7979 | // Collect stores per pointer object. |
| 7980 | for (User *U : V->users()) { |
| 7981 | auto *SI = dyn_cast<StoreInst>(Val: U); |
| 7982 | // Test whether we can handle the store. V might be a global, which could |
| 7983 | // be used in a different function. |
| 7984 | if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F || |
| 7985 | !isValidElementType(Ty: SI->getValueOperand()->getType())) |
| 7986 | continue; |
| 7987 | // Skip the entry if it is already vectorized.
| 7988 | if (isVectorized(V: U)) |
| 7989 | continue; |
| 7990 | |
| 7991 | Value *Ptr = |
| 7992 | getUnderlyingObject(V: SI->getPointerOperand(), MaxLookup: RecursionMaxDepth); |
| 7993 | auto &StoresVec = PtrToStoresMap[{SI->getParent(), |
| 7994 | SI->getValueOperand()->getType(), Ptr}]; |
| 7995 | // For now just keep one store per pointer object per lane. |
| 7996 | // TODO: Extend this to support multiple stores per pointer per lane |
| 7997 | if (StoresVec.size() > Lane) |
| 7998 | continue; |
| 7999 | if (!StoresVec.empty()) { |
| 8000 | std::optional<int64_t> Diff = getPointersDiff( |
| 8001 | ElemTyA: SI->getValueOperand()->getType(), PtrA: SI->getPointerOperand(), |
| 8002 | ElemTyB: SI->getValueOperand()->getType(), |
| 8003 | PtrB: StoresVec.front()->getPointerOperand(), DL: *DL, SE&: *SE, |
| 8004 | /*StrictCheck=*/true); |
| 8005 | // We failed to compare the pointers so just abandon this store. |
| 8006 | if (!Diff) |
| 8007 | continue; |
| 8008 | } |
| 8009 | StoresVec.push_back(Elt: SI); |
| 8010 | } |
| 8011 | } |
| 8012 | SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size()); |
| 8013 | unsigned I = 0; |
| 8014 | for (auto &P : PtrToStoresMap) { |
| 8015 | Res[I].swap(RHS&: P.second); |
| 8016 | ++I; |
| 8017 | } |
| 8018 | return Res; |
| 8019 | } |
| 8020 | |
| 8021 | bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec, |
| 8022 | OrdersType &ReorderIndices) const { |
| 8023 | // We check whether the stores in StoreVec can form a vector by sorting them |
| 8024 | // and checking whether they are consecutive. |
| 8025 | |
| 8026 | // To avoid calling getPointersDiff() while sorting, we create a vector of
| 8027 | // pairs {offset from the first store, index in StoresVec} and sort this instead.
| 8028 | SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec; |
| 8029 | StoreInst *S0 = StoresVec[0]; |
| 8030 | StoreOffsetVec.emplace_back(Args: 0, Args: 0); |
| 8031 | Type *S0Ty = S0->getValueOperand()->getType(); |
| 8032 | Value *S0Ptr = S0->getPointerOperand(); |
| 8033 | for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoresVec.size())) { |
| 8034 | StoreInst *SI = StoresVec[Idx]; |
| 8035 | std::optional<int64_t> Diff = |
| 8036 | getPointersDiff(ElemTyA: S0Ty, PtrA: S0Ptr, ElemTyB: SI->getValueOperand()->getType(), |
| 8037 | PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE, |
| 8038 | /*StrictCheck=*/true); |
| 8039 | StoreOffsetVec.emplace_back(Args&: *Diff, Args&: Idx); |
| 8040 | } |
| 8041 | |
| 8042 | // Check if the stores are consecutive by checking if their difference is 1. |
| 8043 | if (StoreOffsetVec.size() != StoresVec.size()) |
| 8044 | return false; |
| 8045 | sort(C&: StoreOffsetVec, Comp: llvm::less_first()); |
| 8046 | unsigned Idx = 0; |
| 8047 | int64_t PrevDist = 0; |
| 8048 | for (const auto &P : StoreOffsetVec) { |
| 8049 | if (Idx > 0 && P.first != PrevDist + 1) |
| 8050 | return false; |
| 8051 | PrevDist = P.first; |
| 8052 | ++Idx; |
| 8053 | } |
| 8054 | |
| 8055 | // Calculate the shuffle indices according to their offset against the sorted |
| 8056 | // StoreOffsetVec. |
| 8057 | ReorderIndices.assign(NumElts: StoresVec.size(), Elt: 0); |
| 8058 | bool IsIdentity = true; |
| 8059 | for (auto [I, P] : enumerate(First&: StoreOffsetVec)) { |
| 8060 | ReorderIndices[P.second] = I; |
| 8061 | IsIdentity &= P.second == I; |
| 8062 | } |
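| | // Worked example (hypothetical offsets): stores at element offsets
| | // {0, 2, 1, 3} from S0 sort to {0, 1, 2, 3}, which is consecutive, and
| | // the loop above yields ReorderIndices = {0, 2, 1, 3}: store #1 (offset 2)
| | // lands in sorted position 2 and store #2 (offset 1) in position 1.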
| 8063 | // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in |
| 8064 | // reorderTopToBottom() and reorderBottomToTop(), so we are following the |
| 8065 | // same convention here. |
| 8066 | if (IsIdentity) |
| 8067 | ReorderIndices.clear(); |
| 8068 | |
| 8069 | return true; |
| 8070 | } |
| 8071 | |
| 8072 | #ifndef NDEBUG |
| 8073 | LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) { |
| 8074 | for (unsigned Idx : Order) |
| 8075 | dbgs() << Idx << ", " ; |
| 8076 | dbgs() << "\n" ; |
| 8077 | } |
| 8078 | #endif |
| 8079 | |
| 8080 | SmallVector<BoUpSLP::OrdersType, 1> |
| 8081 | BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const { |
| 8082 | unsigned NumLanes = TE->Scalars.size(); |
| 8083 | |
| 8084 | SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE); |
| 8085 | |
| 8086 | // Holds the reorder indices for each candidate store vector that is a user of |
| 8087 | // the current TreeEntry. |
| 8088 | SmallVector<OrdersType, 1> ExternalReorderIndices; |
| 8089 | |
| 8090 | // Now inspect the stores collected per pointer and look for vectorization |
| 8091 | // candidates. For each candidate calculate the reorder index vector and push |
| 8092 | // it into `ExternalReorderIndices` |
| 8093 | for (ArrayRef<StoreInst *> StoresVec : Stores) { |
| 8094 | // If we have fewer than NumLanes stores, then we can't form a vector. |
| 8095 | if (StoresVec.size() != NumLanes) |
| 8096 | continue; |
| 8097 | |
| 8098 | // If the stores are not consecutive then abandon this StoresVec. |
| 8099 | OrdersType ReorderIndices; |
| 8100 | if (!canFormVector(StoresVec, ReorderIndices)) |
| 8101 | continue; |
| 8102 | |
| 8103 | // We now know that the scalars in StoresVec can form a vector instruction, |
| 8104 | // so set the reorder indices. |
| 8105 | ExternalReorderIndices.push_back(Elt: ReorderIndices); |
| 8106 | } |
| 8107 | return ExternalReorderIndices; |
| 8108 | } |
| 8109 | |
| 8110 | void BoUpSLP::buildTree(ArrayRef<Value *> Roots, |
| 8111 | const SmallDenseSet<Value *> &UserIgnoreLst) { |
| 8112 | deleteTree(); |
| 8113 | UserIgnoreList = &UserIgnoreLst; |
| 8114 | if (!allSameType(VL: Roots)) |
| 8115 | return; |
| 8116 | buildTreeRec(Roots, Depth: 0, EI: EdgeInfo()); |
| 8117 | } |
| 8118 | |
| 8119 | void BoUpSLP::buildTree(ArrayRef<Value *> Roots) { |
| 8120 | deleteTree(); |
| 8121 | if (!allSameType(VL: Roots)) |
| 8122 | return; |
| 8123 | buildTreeRec(Roots, Depth: 0, EI: EdgeInfo()); |
| 8124 | } |
| 8125 | |
| 8126 | /// Tries to find a subvector of loads and builds a new vector of loads only,
| 8127 | /// if that can be profitable.
| 8128 | static void gatherPossiblyVectorizableLoads( |
| 8129 | const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL, |
| 8130 | ScalarEvolution &SE, const TargetTransformInfo &TTI, |
| 8131 | SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads, |
| 8132 | bool AddNew = true) { |
| 8133 | if (VL.empty()) |
| 8134 | return; |
| 8135 | Type *ScalarTy = getValueType(V: VL.front()); |
| 8136 | if (!isValidElementType(Ty: ScalarTy)) |
| 8137 | return; |
| 8138 | SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads; |
| 8139 | SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad; |
| 8140 | for (Value *V : VL) { |
| 8141 | auto *LI = dyn_cast<LoadInst>(Val: V); |
| 8142 | if (!LI) |
| 8143 | continue; |
| 8144 | if (R.isDeleted(I: LI) || R.isVectorized(V: LI) || !LI->isSimple()) |
| 8145 | continue; |
| 8146 | bool IsFound = false; |
| 8147 | for (auto [Map, Data] : zip(t&: ClusteredDistToLoad, u&: ClusteredLoads)) { |
| 8148 | assert(LI->getParent() == Data.front().first->getParent() && |
| 8149 | LI->getType() == Data.front().first->getType() && |
| 8150 | getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) == |
| 8151 | getUnderlyingObject(Data.front().first->getPointerOperand(), |
| 8152 | RecursionMaxDepth) && |
| 8153 | "Expected loads with the same type, same parent and same " |
| 8154 | "underlying pointer." ); |
| 8155 | std::optional<int64_t> Dist = getPointersDiff( |
| 8156 | ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(), ElemTyB: Data.front().first->getType(), |
| 8157 | PtrB: Data.front().first->getPointerOperand(), DL, SE, |
| 8158 | /*StrictCheck=*/true); |
| 8159 | if (!Dist) |
| 8160 | continue; |
| 8161 | auto It = Map.find(Val: *Dist); |
| 8162 | if (It != Map.end() && It->second != LI) |
| 8163 | continue; |
| 8164 | if (It == Map.end()) { |
| 8165 | Data.emplace_back(Args&: LI, Args&: *Dist); |
| 8166 | Map.try_emplace(Key: *Dist, Args&: LI); |
| 8167 | } |
| 8168 | IsFound = true; |
| 8169 | break; |
| 8170 | } |
| 8171 | if (!IsFound) { |
| 8172 | ClusteredLoads.emplace_back().emplace_back(Args&: LI, Args: 0); |
| 8173 | ClusteredDistToLoad.emplace_back().try_emplace(Key: 0, Args&: LI); |
| 8174 | } |
| 8175 | } |
| 8176 | auto FindMatchingLoads = |
| 8177 | [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads, |
| 8178 | SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> |
| 8179 | &GatheredLoads, |
| 8180 | SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated, |
| 8181 | int64_t &Offset, unsigned &Start) { |
| 8182 | if (Loads.empty()) |
| 8183 | return GatheredLoads.end(); |
| 8184 | LoadInst *LI = Loads.front().first; |
| 8185 | for (auto [Idx, Data] : enumerate(First&: GatheredLoads)) { |
| 8186 | if (Idx < Start) |
| 8187 | continue; |
| 8188 | ToAdd.clear(); |
| 8189 | if (LI->getParent() != Data.front().first->getParent() || |
| 8190 | LI->getType() != Data.front().first->getType()) |
| 8191 | continue; |
| 8192 | std::optional<int64_t> Dist = |
| 8193 | getPointersDiff(ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(), |
| 8194 | ElemTyB: Data.front().first->getType(), |
| 8195 | PtrB: Data.front().first->getPointerOperand(), DL, SE, |
| 8196 | /*StrictCheck=*/true); |
| 8197 | if (!Dist) |
| 8198 | continue; |
| 8199 | SmallSet<int64_t, 4> DataDists; |
| 8200 | SmallPtrSet<LoadInst *, 4> DataLoads; |
| 8201 | for (std::pair<LoadInst *, int64_t> P : Data) { |
| 8202 | DataDists.insert(V: P.second); |
| 8203 | DataLoads.insert(Ptr: P.first); |
| 8204 | } |
| 8205 | // Found matching gathered loads - check if all loads are unique or |
| 8206 | // can be effectively vectorized. |
| 8207 | unsigned NumUniques = 0; |
| 8208 | for (auto [Cnt, Pair] : enumerate(First&: Loads)) { |
| 8209 | bool Used = DataLoads.contains(Ptr: Pair.first); |
| 8210 | if (!Used && !DataDists.contains(V: *Dist + Pair.second)) { |
| 8211 | ++NumUniques; |
| 8212 | ToAdd.insert(X: Cnt); |
| 8213 | } else if (Used) { |
| 8214 | Repeated.insert(X: Cnt); |
| 8215 | } |
| 8216 | } |
| 8217 | if (NumUniques > 0 && |
| 8218 | (Loads.size() == NumUniques || |
| 8219 | (Loads.size() - NumUniques >= 2 && |
| 8220 | Loads.size() - NumUniques >= Loads.size() / 2 && |
| 8221 | (has_single_bit(Value: Data.size() + NumUniques) || |
| 8222 | bit_ceil(Value: Data.size()) < |
| 8223 | bit_ceil(Value: Data.size() + NumUniques))))) { |
| 8224 | Offset = *Dist; |
| 8225 | Start = Idx + 1; |
| 8226 | return std::next(x: GatheredLoads.begin(), n: Idx); |
| 8227 | } |
| 8228 | } |
| 8229 | ToAdd.clear(); |
| 8230 | return GatheredLoads.end(); |
| 8231 | }; |
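| | // Worked example for the acceptance test above (all sizes hypothetical):
| | // a candidate cluster Loads of 4 loads matched against an existing group
| | // Data of 6 loads, with 2 loads unique and 2 already present, is accepted
| | // because 4 - 2 >= 2, 4 - 2 >= 4 / 2 and has_single_bit(6 + 2) all hold;
| | // the caller then appends the two unique loads at the computed Offset.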
| 8232 | for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) { |
| 8233 | unsigned Start = 0; |
| 8234 | SetVector<unsigned> ToAdd, LocalToAdd, Repeated; |
| 8235 | int64_t Offset = 0; |
| 8236 | auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, |
| 8237 | Offset, Start); |
| 8238 | while (It != GatheredLoads.end()) { |
| 8239 | assert(!LocalToAdd.empty() && "Expected some elements to add." ); |
| 8240 | for (unsigned Idx : LocalToAdd) |
| 8241 | It->emplace_back(Args: Data[Idx].first, Args: Data[Idx].second + Offset); |
| 8242 | ToAdd.insert_range(R&: LocalToAdd); |
| 8243 | It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset, |
| 8244 | Start); |
| 8245 | } |
| 8246 | if (any_of(Range: seq<unsigned>(Size: Data.size()), P: [&](unsigned Idx) { |
| 8247 | return !ToAdd.contains(key: Idx) && !Repeated.contains(key: Idx); |
| 8248 | })) { |
| 8249 | auto AddNewLoads = |
| 8250 | [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) { |
| 8251 | for (unsigned Idx : seq<unsigned>(Size: Data.size())) { |
| 8252 | if (ToAdd.contains(key: Idx) || Repeated.contains(key: Idx)) |
| 8253 | continue; |
| 8254 | Loads.push_back(Elt: Data[Idx]); |
| 8255 | } |
| 8256 | }; |
| 8257 | if (!AddNew) { |
| 8258 | LoadInst *LI = Data.front().first; |
| 8259 | It = find_if( |
| 8260 | Range&: GatheredLoads, P: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) { |
| 8261 | return PD.front().first->getParent() == LI->getParent() && |
| 8262 | PD.front().first->getType() == LI->getType(); |
| 8263 | }); |
| 8264 | while (It != GatheredLoads.end()) { |
| 8265 | AddNewLoads(*It); |
| 8266 | It = std::find_if( |
| 8267 | first: std::next(x: It), last: GatheredLoads.end(), |
| 8268 | pred: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) { |
| 8269 | return PD.front().first->getParent() == LI->getParent() && |
| 8270 | PD.front().first->getType() == LI->getType(); |
| 8271 | }); |
| 8272 | } |
| 8273 | } |
| 8274 | GatheredLoads.emplace_back().append(in_start: Data.begin(), in_end: Data.end()); |
| 8275 | AddNewLoads(GatheredLoads.emplace_back()); |
| 8276 | } |
| 8277 | } |
| 8278 | } |
| 8279 | |
| 8280 | void BoUpSLP::tryToVectorizeGatheredLoads( |
| 8281 | const SmallMapVector< |
| 8282 | std::tuple<BasicBlock *, Value *, Type *>, |
| 8283 | SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8> |
| 8284 | &GatheredLoads) { |
| 8285 | GatheredLoadsEntriesFirst = VectorizableTree.size(); |
| 8286 | |
| 8287 | SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize( |
| 8288 | LoadEntriesToVectorize.size()); |
| 8289 | for (auto [Idx, Set] : zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize)) |
| 8290 | Set.insert_range(R&: VectorizableTree[Idx]->Scalars); |
| 8291 | |
| 8292 | // Sort loads by distance. |
| 8293 | auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1, |
| 8294 | const std::pair<LoadInst *, int64_t> &L2) { |
| 8295 | return L1.second > L2.second; |
| 8296 | }; |
| 8297 | |
| 8298 | auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) { |
| 8299 | ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()), |
| 8300 | Loads.size()); |
| 8301 | Align Alignment = computeCommonAlignment<LoadInst>(VL: Values); |
| 8302 | auto *Ty = getWidenedType(ScalarTy: Loads.front()->getType(), VF: Loads.size()); |
| 8303 | return TTI->isLegalMaskedGather(DataType: Ty, Alignment) && |
| 8304 | !TTI->forceScalarizeMaskedGather(Type: Ty, Alignment); |
| 8305 | }; |
| 8306 | |
| 8307 | auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads, |
| 8308 | BoUpSLP::ValueSet &VectorizedLoads, |
| 8309 | SmallVectorImpl<LoadInst *> &NonVectorized, |
| 8310 | bool Final, unsigned MaxVF) { |
| 8311 | SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results; |
| 8312 | unsigned StartIdx = 0; |
| 8313 | SmallVector<int> CandidateVFs; |
| 8314 | if (VectorizeNonPowerOf2 && has_single_bit(Value: MaxVF + 1)) |
| 8315 | CandidateVFs.push_back(Elt: MaxVF); |
| 8316 | for (int NumElts = getFloorFullVectorNumberOfElements( |
| 8317 | TTI: *TTI, Ty: Loads.front()->getType(), Sz: MaxVF); |
| 8318 | NumElts > 1; NumElts = getFloorFullVectorNumberOfElements( |
| 8319 | TTI: *TTI, Ty: Loads.front()->getType(), Sz: NumElts - 1)) { |
| 8320 | CandidateVFs.push_back(Elt: NumElts); |
| 8321 | if (VectorizeNonPowerOf2 && NumElts > 2) |
| 8322 | CandidateVFs.push_back(Elt: NumElts - 1); |
| 8323 | } |
| 8324 | |
| 8325 | if (Final && CandidateVFs.empty()) |
| 8326 | return Results; |
| 8327 | |
| 8328 | unsigned BestVF = Final ? CandidateVFs.back() : 0; |
| 8329 | for (unsigned NumElts : CandidateVFs) { |
| 8330 | if (Final && NumElts > BestVF) |
| 8331 | continue; |
| 8332 | SmallVector<unsigned> MaskedGatherVectorized; |
| 8333 | for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; |
| 8334 | ++Cnt) { |
| 8335 | ArrayRef<LoadInst *> Slice = |
| 8336 | ArrayRef(Loads).slice(N: Cnt, M: std::min(a: NumElts, b: E - Cnt)); |
| 8337 | if (VectorizedLoads.count(Ptr: Slice.front()) || |
| 8338 | VectorizedLoads.count(Ptr: Slice.back()) || |
| 8339 | areKnownNonVectorizableLoads(VL: Slice)) |
| 8340 | continue; |
| 8341 | // Check if it is profitable to try vectorizing gathered loads. It is |
| 8342 | // profitable if we have at least 3 consecutive loads or if we have
| 8343 | // fewer but all their users are vectorized or deleted.
| 8344 | bool AllowToVectorize = false; |
| 8345 | // Check if it is profitable to vectorize 2-element loads.
| 8346 | if (NumElts == 2) { |
| 8347 | bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad( |
| 8348 | ElementTy: Slice.front()->getType(), NumElements: ElementCount::getFixed(MinVal: NumElts)); |
| 8349 | auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) { |
| 8350 | for (LoadInst *LI : Slice) { |
| 8351 | // If single use/user - allow to vectorize. |
| 8352 | if (LI->hasOneUse()) |
| 8353 | continue; |
| 8354 | // 1. Check if number of uses equals number of users. |
| 8355 | // 2. All users are deleted. |
| 8356 | // 3. The load broadcasts are not allowed or the load is not |
| 8357 | // broadcasted. |
| 8358 | if (static_cast<unsigned int>(std::distance( |
| 8359 | first: LI->user_begin(), last: LI->user_end())) != LI->getNumUses()) |
| 8360 | return false; |
| 8361 | if (!IsLegalBroadcastLoad) |
| 8362 | continue; |
| 8363 | if (LI->hasNUsesOrMore(N: UsesLimit)) |
| 8364 | return false; |
| 8365 | for (User *U : LI->users()) { |
| 8366 | if (auto *UI = dyn_cast<Instruction>(Val: U); UI && isDeleted(I: UI)) |
| 8367 | continue; |
| 8368 | for (const TreeEntry *UTE : getTreeEntries(V: U)) { |
| 8369 | for (int I : seq<int>(Size: UTE->getNumOperands())) { |
| 8370 | if (all_of(Range: UTE->getOperand(OpIdx: I), P: [LI](Value *V) { |
| 8371 | return V == LI || isa<PoisonValue>(Val: V); |
| 8372 | })) |
| 8373 | // Found legal broadcast - do not vectorize. |
| 8374 | return false; |
| 8375 | } |
| 8376 | } |
| 8377 | } |
| 8378 | } |
| 8379 | return true; |
| 8380 | }; |
| 8381 | AllowToVectorize = CheckIfAllowed(Slice); |
| 8382 | } else { |
| 8383 | AllowToVectorize = |
| 8384 | (NumElts >= 3 || |
| 8385 | any_of(Range: ValueToGatherNodes.at(Val: Slice.front()), |
| 8386 | P: [=](const TreeEntry *TE) { |
| 8387 | return TE->Scalars.size() == 2 && |
| 8388 | ((TE->Scalars.front() == Slice.front() && |
| 8389 | TE->Scalars.back() == Slice.back()) || |
| 8390 | (TE->Scalars.front() == Slice.back() && |
| 8391 | TE->Scalars.back() == Slice.front())); |
| 8392 | })) && |
| 8393 | hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(), |
| 8394 | Sz: Slice.size()); |
| 8395 | } |
| 8396 | if (AllowToVectorize) { |
| 8397 | SmallVector<Value *> PointerOps; |
| 8398 | OrdersType CurrentOrder; |
| 8399 | // Try to build vector load. |
| 8400 | ArrayRef<Value *> Values( |
| 8401 | reinterpret_cast<Value *const *>(Slice.begin()), Slice.size()); |
| 8402 | LoadsState LS = canVectorizeLoads(VL: Values, VL0: Slice.front(), Order&: CurrentOrder, |
| 8403 | PointerOps, BestVF: &BestVF); |
| 8404 | if (LS != LoadsState::Gather || |
| 8405 | (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) { |
| 8406 | if (LS == LoadsState::ScatterVectorize) { |
| 8407 | if (MaskedGatherVectorized.empty() || |
| 8408 | Cnt >= MaskedGatherVectorized.back() + NumElts) |
| 8409 | MaskedGatherVectorized.push_back(Elt: Cnt); |
| 8410 | continue; |
| 8411 | } |
| 8412 | if (LS != LoadsState::Gather) { |
| 8413 | Results.emplace_back(Args&: Values, Args&: LS); |
| 8414 | VectorizedLoads.insert_range(R&: Slice); |
| 8415 | // If we vectorized the initial block, there is no need to try to vectorize
| 8416 | // it again.
| 8417 | if (Cnt == StartIdx) |
| 8418 | StartIdx += NumElts; |
| 8419 | } |
| 8420 | // Check if the whole array was vectorized already - exit. |
| 8421 | if (StartIdx >= Loads.size()) |
| 8422 | break; |
| 8423 | // Erase last masked gather candidate, if another candidate within |
| 8424 | // the range is found to be better. |
| 8425 | if (!MaskedGatherVectorized.empty() && |
| 8426 | Cnt < MaskedGatherVectorized.back() + NumElts) |
| 8427 | MaskedGatherVectorized.pop_back(); |
| 8428 | Cnt += NumElts - 1; |
| 8429 | continue; |
| 8430 | } |
| 8431 | } |
| 8432 | if (!AllowToVectorize || BestVF == 0) |
| 8433 | registerNonVectorizableLoads(VL: Slice); |
| 8434 | } |
| 8435 | // Mark masked gather candidates as vectorized, if any.
| 8436 | for (unsigned Cnt : MaskedGatherVectorized) { |
| 8437 | ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice( |
| 8438 | N: Cnt, M: std::min<unsigned>(a: NumElts, b: Loads.size() - Cnt)); |
| 8439 | ArrayRef<Value *> Values( |
| 8440 | reinterpret_cast<Value *const *>(Slice.begin()), Slice.size()); |
| 8441 | Results.emplace_back(Args&: Values, Args: LoadsState::ScatterVectorize); |
| 8442 | VectorizedLoads.insert_range(R&: Slice); |
| 8443 | // If we vectorized the initial block, there is no need to try to vectorize it again.
| 8444 | if (Cnt == StartIdx) |
| 8445 | StartIdx += NumElts; |
| 8446 | } |
| 8447 | } |
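| | // Collect the loads that were not covered by any vectorized range.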
| 8448 | for (LoadInst *LI : Loads) { |
| 8449 | if (!VectorizedLoads.contains(Ptr: LI)) |
| 8450 | NonVectorized.push_back(Elt: LI); |
| 8451 | } |
| 8452 | return Results; |
| 8453 | }; |
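| | // Processes each group of gathered loads (collected per base pointer): sorts
| | // them by pointer distance, extracts runs of consecutive loads and tries to
| | // vectorize each run via GetVectorizedRanges above.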
| 8454 | auto ProcessGatheredLoads = |
| 8455 | [&, &TTI = *TTI]( |
| 8456 | ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads, |
| 8457 | bool Final = false) { |
| 8458 | SmallVector<LoadInst *> NonVectorized; |
| 8459 | for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists : |
| 8460 | GatheredLoads) { |
| 8461 | if (LoadsDists.size() <= 1) { |
| 8462 | NonVectorized.push_back(Elt: LoadsDists.back().first); |
| 8463 | continue; |
| 8464 | } |
| 8465 | SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists( |
| 8466 | LoadsDists); |
| 8467 | SmallVector<LoadInst *> OriginalLoads(make_first_range(c&: LoadsDists)); |
| 8468 | stable_sort(Range&: LocalLoadsDists, C: LoadSorter); |
| 8469 | SmallVector<LoadInst *> Loads; |
| 8470 | unsigned MaxConsecutiveDistance = 0; |
| 8471 | unsigned CurrentConsecutiveDist = 1; |
| 8472 | int64_t LastDist = LocalLoadsDists.front().second; |
| 8473 | bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads); |
| 8474 | for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) { |
| 8475 | if (isVectorized(V: L.first)) |
| 8476 | continue; |
| 8477 | assert(LastDist >= L.second && |
| 8478 | "Expected first distance always not less than second" ); |
| 8479 | if (static_cast<uint64_t>(LastDist - L.second) == |
| 8480 | CurrentConsecutiveDist) { |
| 8481 | ++CurrentConsecutiveDist; |
| 8482 | MaxConsecutiveDistance = |
| 8483 | std::max(a: MaxConsecutiveDistance, b: CurrentConsecutiveDist); |
| 8484 | Loads.push_back(Elt: L.first); |
| 8485 | continue; |
| 8486 | } |
| 8487 | if (!AllowMaskedGather && CurrentConsecutiveDist == 1 && |
| 8488 | !Loads.empty()) |
| 8489 | Loads.pop_back(); |
| 8490 | CurrentConsecutiveDist = 1; |
| 8491 | LastDist = L.second; |
| 8492 | Loads.push_back(Elt: L.first); |
| 8493 | } |
| 8494 | if (Loads.size() <= 1) |
| 8495 | continue; |
| 8496 | if (AllowMaskedGather) |
| 8497 | MaxConsecutiveDistance = Loads.size(); |
| 8498 | else if (MaxConsecutiveDistance < 2) |
| 8499 | continue; |
| 8500 | BoUpSLP::ValueSet VectorizedLoads; |
| 8501 | SmallVector<LoadInst *> SortedNonVectorized; |
| 8502 | SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results = |
| 8503 | GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized, |
| 8504 | Final, MaxConsecutiveDistance); |
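| | // If the sorted loads produced only masked-gather (scatter) ranges, retry
| | // with the original (unsorted) load order and keep the variant that leaves
| | // fewer loads non-vectorized.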
| 8505 | if (!Results.empty() && !SortedNonVectorized.empty() && |
| 8506 | OriginalLoads.size() == Loads.size() && |
| 8507 | MaxConsecutiveDistance == Loads.size() && |
| 8508 | all_of(Range&: Results, |
| 8509 | P: [](const std::pair<ArrayRef<Value *>, LoadsState> &P) { |
| 8510 | return P.second == LoadsState::ScatterVectorize; |
| 8511 | })) { |
| 8512 | VectorizedLoads.clear(); |
| 8513 | SmallVector<LoadInst *> UnsortedNonVectorized; |
| 8514 | SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> |
| 8515 | UnsortedResults = |
| 8516 | GetVectorizedRanges(OriginalLoads, VectorizedLoads, |
| 8517 | UnsortedNonVectorized, Final, |
| 8518 | OriginalLoads.size()); |
| 8519 | if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) { |
| 8520 | SortedNonVectorized.swap(RHS&: UnsortedNonVectorized); |
| 8521 | Results.swap(RHS&: UnsortedResults); |
| 8522 | } |
| 8523 | } |
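| | // For each detected range: if some of its loads were already vectorized,
| | // record the remaining ones as non-vectorized; otherwise pick a vector
| | // factor and build new tree entries for the range.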
| 8524 | for (auto [Slice, _] : Results) { |
| 8525 | LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads (" |
| 8526 | << Slice.size() << ")\n" ); |
| 8527 | if (any_of(Range&: Slice, P: [&](Value *V) { return isVectorized(V); })) { |
| 8528 | for (Value *L : Slice) |
| 8529 | if (!isVectorized(V: L)) |
| 8530 | SortedNonVectorized.push_back(Elt: cast<LoadInst>(Val: L)); |
| 8531 | continue; |
| 8532 | } |
| 8533 | |
| 8534 | // Select the maximum VF as the maximum of the user gather nodes' sizes and
| 8535 | // the distance between scalar loads in these nodes.
| 8536 | unsigned MaxVF = Slice.size(); |
| 8537 | unsigned UserMaxVF = 0; |
| 8538 | unsigned InterleaveFactor = 0; |
| 8539 | if (MaxVF == 2) { |
| 8540 | UserMaxVF = MaxVF; |
| 8541 | } else { |
| 8542 | // Find the distance between segments of the interleaved loads.
| 8543 | std::optional<unsigned> InterleavedLoadsDistance = 0; |
| 8544 | unsigned Order = 0; |
| 8545 | std::optional<unsigned> CommonVF = 0; |
| 8546 | DenseMap<const TreeEntry *, unsigned> EntryToPosition; |
| 8547 | SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes; |
| 8548 | for (auto [Idx, V] : enumerate(First&: Slice)) { |
| 8549 | for (const TreeEntry *E : ValueToGatherNodes.at(Val: V)) { |
| 8550 | UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: E->Scalars.size()); |
| 8551 | unsigned Pos = |
| 8552 | EntryToPosition.try_emplace(Key: E, Args&: Idx).first->second; |
| 8553 | UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: Idx - Pos + 1); |
| 8554 | if (CommonVF) { |
| 8555 | if (*CommonVF == 0) { |
| 8556 | CommonVF = E->Scalars.size(); |
| 8557 | continue; |
| 8558 | } |
| 8559 | if (*CommonVF != E->Scalars.size()) |
| 8560 | CommonVF.reset(); |
| 8561 | } |
| 8562 | // Check if the load is part of an interleaved load.
| 8563 | if (Pos != Idx && InterleavedLoadsDistance) { |
| 8564 | if (!DeinterleavedNodes.contains(Ptr: E) && |
| 8565 | any_of(Range: E->Scalars, P: [&, Slice = Slice](Value *V) { |
| 8566 | if (isa<Constant>(Val: V)) |
| 8567 | return false; |
| 8568 | if (isVectorized(V)) |
| 8569 | return true; |
| 8570 | const auto &Nodes = ValueToGatherNodes.at(Val: V); |
| 8571 | return (Nodes.size() != 1 || !Nodes.contains(key: E)) && |
| 8572 | !is_contained(Range: Slice, Element: V); |
| 8573 | })) { |
| 8574 | InterleavedLoadsDistance.reset(); |
| 8575 | continue; |
| 8576 | } |
| 8577 | DeinterleavedNodes.insert(Ptr: E); |
| 8578 | if (*InterleavedLoadsDistance == 0) { |
| 8579 | InterleavedLoadsDistance = Idx - Pos; |
| 8580 | continue; |
| 8581 | } |
| 8582 | if ((Idx - Pos) % *InterleavedLoadsDistance != 0 || |
| 8583 | (Idx - Pos) / *InterleavedLoadsDistance < Order) |
| 8584 | InterleavedLoadsDistance.reset(); |
| 8585 | Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(u: 1); |
| 8586 | } |
| 8587 | } |
| 8588 | } |
| 8589 | DeinterleavedNodes.clear(); |
| 8590 | // Check if the large load represents an interleaved load operation.
| 8591 | if (InterleavedLoadsDistance.value_or(u: 0) > 1 && |
| 8592 | CommonVF.value_or(u: 0) != 0) { |
| 8593 | InterleaveFactor = bit_ceil(Value: *InterleavedLoadsDistance); |
| 8594 | unsigned VF = *CommonVF; |
| 8595 | OrdersType Order; |
| 8596 | SmallVector<Value *> PointerOps; |
| 8597 | // Segmented load detected - vectorize at maximum vector factor. |
| 8598 | if (InterleaveFactor <= Slice.size() && |
| 8599 | TTI.isLegalInterleavedAccessType( |
| 8600 | VTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF), |
| 8601 | Factor: InterleaveFactor, |
| 8602 | Alignment: cast<LoadInst>(Val: Slice.front())->getAlign(), |
| 8603 | AddrSpace: cast<LoadInst>(Val: Slice.front()) |
| 8604 | ->getPointerAddressSpace()) && |
| 8605 | canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, |
| 8606 | PointerOps) == LoadsState::Vectorize) { |
| 8607 | UserMaxVF = InterleaveFactor * VF; |
| 8608 | } else { |
| 8609 | InterleaveFactor = 0; |
| 8610 | } |
| 8611 | } |
| 8612 | // If the loads cannot be represented as consecutive vectorizable nodes,
| 8613 | // just exit.
| 8614 | unsigned ConsecutiveNodesSize = 0; |
| 8615 | if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 && |
| 8616 | any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize), |
| 8617 | P: [&, Slice = Slice](const auto &P) { |
| 8618 | const auto *It = find_if(Slice, [&](Value *V) { |
| 8619 | return std::get<1>(P).contains(V); |
| 8620 | }); |
| 8621 | if (It == Slice.end()) |
| 8622 | return false; |
| 8623 | const TreeEntry &TE = |
| 8624 | *VectorizableTree[std::get<0>(P)]; |
| 8625 | ArrayRef<Value *> VL = TE.Scalars; |
| 8626 | OrdersType Order; |
| 8627 | SmallVector<Value *> PointerOps; |
| 8628 | LoadsState State = canVectorizeLoads( |
| 8629 | VL, VL0: VL.front(), Order, PointerOps); |
| 8630 | if (State == LoadsState::ScatterVectorize || |
| 8631 | State == LoadsState::CompressVectorize) |
| 8632 | return false; |
| 8633 | ConsecutiveNodesSize += VL.size(); |
| 8634 | unsigned Start = std::distance(Slice.begin(), It); |
| 8635 | unsigned Sz = Slice.size() - Start; |
| 8636 | return Sz < VL.size() || |
| 8637 | Slice.slice(std::distance(Slice.begin(), It), |
| 8638 | VL.size()) != VL; |
| 8639 | })) |
| 8640 | continue; |
| 8641 | // Try to build long masked gather loads. |
| 8642 | UserMaxVF = bit_ceil(Value: UserMaxVF); |
| 8643 | if (InterleaveFactor == 0 && |
| 8644 | any_of(Range: seq<unsigned>(Size: Slice.size() / UserMaxVF), |
| 8645 | P: [&, Slice = Slice](unsigned Idx) { |
| 8646 | OrdersType Order; |
| 8647 | SmallVector<Value *> PointerOps; |
| 8648 | return canVectorizeLoads( |
| 8649 | VL: Slice.slice(N: Idx * UserMaxVF, M: UserMaxVF), |
| 8650 | VL0: Slice[Idx * UserMaxVF], Order, |
| 8651 | PointerOps) == |
| 8652 | LoadsState::ScatterVectorize; |
| 8653 | })) |
| 8654 | UserMaxVF = MaxVF; |
| 8655 | if (Slice.size() != ConsecutiveNodesSize) |
| 8656 | MaxVF = std::min<unsigned>(a: MaxVF, b: UserMaxVF); |
| 8657 | } |
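| | // Build tree entries for the slice, halving the vector factor on failure
| | // until nodes are created for all sub-slices or the VF drops below 2.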
| 8658 | for (unsigned VF = MaxVF; VF >= 2; VF /= 2) { |
| 8659 | bool IsVectorized = true; |
| 8660 | for (unsigned I = 0, E = Slice.size(); I < E; I += VF) { |
| 8661 | ArrayRef<Value *> SubSlice = |
| 8662 | Slice.slice(N: I, M: std::min(a: VF, b: E - I)); |
| 8663 | if (isVectorized(V: SubSlice.front())) |
| 8664 | continue; |
| 8665 | // Check if the subslice is covered by a to-be-vectorized entry while not
| 8666 | // being equal to that entry - skip it in that case.
| 8667 | if (any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize), |
| 8668 | P: [&](const auto &P) { |
| 8669 | return !SubSlice.equals( |
| 8670 | RHS: VectorizableTree[std::get<0>(P)] |
| 8671 | ->Scalars) && |
| 8672 | set_is_subset(SubSlice, std::get<1>(P)); |
| 8673 | })) |
| 8674 | continue; |
| 8675 | unsigned Sz = VectorizableTree.size(); |
| 8676 | buildTreeRec(Roots: SubSlice, Depth: 0, EI: EdgeInfo(), InterleaveFactor); |
| 8677 | if (Sz == VectorizableTree.size()) { |
| 8678 | IsVectorized = false; |
| 8679 | // Try non-interleaved vectorization with smaller vector |
| 8680 | // factor. |
| 8681 | if (InterleaveFactor > 0) { |
| 8682 | VF = 2 * (MaxVF / InterleaveFactor); |
| 8683 | InterleaveFactor = 0; |
| 8684 | } |
| 8685 | continue; |
| 8686 | } |
| 8687 | } |
| 8688 | if (IsVectorized) |
| 8689 | break; |
| 8690 | } |
| 8691 | } |
| 8692 | NonVectorized.append(RHS: SortedNonVectorized); |
| 8693 | } |
| 8694 | return NonVectorized; |
| 8695 | }; |
| 8696 | for (const auto &GLs : GatheredLoads) { |
| 8697 | const auto &Ref = GLs.second; |
| 8698 | SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref); |
| 8699 | if (!Ref.empty() && !NonVectorized.empty() && |
| 8700 | std::accumulate( |
| 8701 | first: Ref.begin(), last: Ref.end(), init: 0u, |
| 8702 | binary_op: [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists) |
| 8703 | -> unsigned { return S + LoadsDists.size(); }) != |
| 8704 | NonVectorized.size() && |
| 8705 | IsMaskedGatherSupported(NonVectorized)) { |
| 8706 | SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> |
| 8707 | FinalGatheredLoads; |
| 8708 | for (LoadInst *LI : NonVectorized) { |
| 8709 | // Reinsert non-vectorized loads into the lists of loads with the same
| 8710 | // base pointers.
| 8711 | gatherPossiblyVectorizableLoads(R: *this, VL: LI, DL: *DL, SE&: *SE, TTI: *TTI, |
| 8712 | GatheredLoads&: FinalGatheredLoads, |
| 8713 | /*AddNew=*/false); |
| 8714 | } |
| 8715 | // Final attempt to vectorize non-vectorized loads. |
| 8716 | (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true); |
| 8717 | } |
| 8718 | } |
| 8719 | // Try to vectorize postponed load entries, previously marked as gathered. |
| 8720 | for (unsigned Idx : LoadEntriesToVectorize) { |
| 8721 | const TreeEntry &E = *VectorizableTree[Idx]; |
| 8722 | SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end()); |
| 8723 | // Avoid reordering, if possible. |
| 8724 | if (!E.ReorderIndices.empty()) { |
| 8725 | // Build a mask out of the reorder indices and reorder scalars per this |
| 8726 | // mask. |
| 8727 | SmallVector<int> ReorderMask; |
| 8728 | inversePermutation(Indices: E.ReorderIndices, Mask&: ReorderMask); |
| 8729 | reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask); |
| 8730 | } |
| 8731 | buildTreeRec(Roots: GatheredScalars, Depth: 0, EI: EdgeInfo()); |
| 8732 | } |
| 8733 | // If no new entries were created, there are no gathered-load entries to be
| 8734 | // handled.
| 8735 | if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) == |
| 8736 | VectorizableTree.size()) |
| 8737 | GatheredLoadsEntriesFirst.reset(); |
| 8738 | } |
| 8739 | |
| 8740 | /// Generates a key/subkey pair for the given value to provide effective
| 8741 | /// sorting of the values and better detection of vectorizable value sequences.
| 8742 | /// The keys are used to sort the values themselves and the subkeys to sort the
| 8743 | /// values within the resulting subgroups.
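| | /// For example, two simple loads of the same type receive the same key, while
| | /// their subkeys are produced by \p LoadsSubkeyGenerator (e.g. based on the
| | /// distance between their pointer operands).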
| 8744 | static std::pair<size_t, size_t> generateKeySubkey( |
| 8745 | Value *V, const TargetLibraryInfo *TLI, |
| 8746 | function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, |
| 8747 | bool AllowAlternate) { |
| 8748 | hash_code Key = hash_value(value: V->getValueID() + 2); |
| 8749 | hash_code SubKey = hash_value(value: 0); |
| 8750 | // Sort the loads by the distance between the pointers. |
| 8751 | if (auto *LI = dyn_cast<LoadInst>(Val: V)) { |
| 8752 | Key = hash_combine(args: LI->getType(), args: hash_value(value: Instruction::Load), args: Key); |
| 8753 | if (LI->isSimple()) |
| 8754 | SubKey = hash_value(code: LoadsSubkeyGenerator(Key, LI)); |
| 8755 | else |
| 8756 | Key = SubKey = hash_value(ptr: LI); |
| 8757 | } else if (isVectorLikeInstWithConstOps(V)) { |
| 8758 | // Sort extracts by the vector operands. |
| 8759 | if (isa<ExtractElementInst, UndefValue>(Val: V)) |
| 8760 | Key = hash_value(value: Value::UndefValueVal + 1); |
| 8761 | if (auto *EI = dyn_cast<ExtractElementInst>(Val: V)) { |
| 8762 | if (!isUndefVector(V: EI->getVectorOperand()).all() && |
| 8763 | !isa<UndefValue>(Val: EI->getIndexOperand())) |
| 8764 | SubKey = hash_value(ptr: EI->getVectorOperand()); |
| 8765 | } |
| 8766 | } else if (auto *I = dyn_cast<Instruction>(Val: V)) { |
| 8767 | // Sort other instructions just by the opcodes except for CMPInst. |
| 8768 | // For CMP also sort by the predicate kind. |
| 8769 | if ((isa<BinaryOperator, CastInst>(Val: I)) && |
| 8770 | isValidForAlternation(Opcode: I->getOpcode())) { |
| 8771 | if (AllowAlternate) |
| 8772 | Key = hash_value(value: isa<BinaryOperator>(Val: I) ? 1 : 0); |
| 8773 | else |
| 8774 | Key = hash_combine(args: hash_value(value: I->getOpcode()), args: Key); |
| 8775 | SubKey = hash_combine( |
| 8776 | args: hash_value(value: I->getOpcode()), args: hash_value(ptr: I->getType()), |
| 8777 | args: hash_value(ptr: isa<BinaryOperator>(Val: I) |
| 8778 | ? I->getType() |
| 8779 | : cast<CastInst>(Val: I)->getOperand(i_nocapture: 0)->getType())); |
| 8780 | // For casts, look through the only operand to improve compile time. |
| 8781 | if (isa<CastInst>(Val: I)) { |
| 8782 | std::pair<size_t, size_t> OpVals = |
| 8783 | generateKeySubkey(V: I->getOperand(i: 0), TLI, LoadsSubkeyGenerator, |
| 8784 | /*AllowAlternate=*/true); |
| 8785 | Key = hash_combine(args: OpVals.first, args: Key); |
| 8786 | SubKey = hash_combine(args: OpVals.first, args: SubKey); |
| 8787 | } |
| 8788 | } else if (auto *CI = dyn_cast<CmpInst>(Val: I)) { |
| 8789 | CmpInst::Predicate Pred = CI->getPredicate(); |
| 8790 | if (CI->isCommutative()) |
| 8791 | Pred = std::min(a: Pred, b: CmpInst::getInversePredicate(pred: Pred)); |
| 8792 | CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(pred: Pred); |
| 8793 | SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: Pred), |
| 8794 | args: hash_value(value: SwapPred), |
| 8795 | args: hash_value(ptr: CI->getOperand(i_nocapture: 0)->getType())); |
| 8796 | } else if (auto *Call = dyn_cast<CallInst>(Val: I)) { |
| 8797 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI); |
| 8798 | if (isTriviallyVectorizable(ID)) { |
| 8799 | SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: ID)); |
| 8800 | } else if (!VFDatabase(*Call).getMappings(CI: *Call).empty()) { |
| 8801 | SubKey = hash_combine(args: hash_value(value: I->getOpcode()), |
| 8802 | args: hash_value(ptr: Call->getCalledFunction())); |
| 8803 | } else { |
| 8804 | Key = hash_combine(args: hash_value(ptr: Call), args: Key); |
| 8805 | SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(ptr: Call)); |
| 8806 | } |
| 8807 | for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos()) |
| 8808 | SubKey = hash_combine(args: hash_value(value: Op.Begin), args: hash_value(value: Op.End), |
| 8809 | args: hash_value(ptr: Op.Tag), args: SubKey); |
| 8810 | } else if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) { |
| 8811 | if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Val: Gep->getOperand(i_nocapture: 1))) |
| 8812 | SubKey = hash_value(ptr: Gep->getPointerOperand()); |
| 8813 | else |
| 8814 | SubKey = hash_value(ptr: Gep); |
| 8815 | } else if (BinaryOperator::isIntDivRem(Opcode: I->getOpcode()) && |
| 8816 | !isa<ConstantInt>(Val: I->getOperand(i: 1))) { |
| 8817 | // Do not try to vectorize instructions with potentially high cost. |
| 8818 | SubKey = hash_value(ptr: I); |
| 8819 | } else { |
| 8820 | SubKey = hash_value(value: I->getOpcode()); |
| 8821 | } |
| 8822 | Key = hash_combine(args: hash_value(ptr: I->getParent()), args: Key); |
| 8823 | } |
| 8824 | return std::make_pair(x&: Key, y&: SubKey); |
| 8825 | } |
| 8826 | |
| 8827 | /// Checks if the specified instruction \p I is a main operation for the given
| 8828 | /// \p MainOp and \p AltOp instructions.
| 8829 | static bool isMainInstruction(Instruction *I, Instruction *MainOp, |
| 8830 | Instruction *AltOp, const TargetLibraryInfo &TLI); |
| 8831 | |
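| | // Checks whether vectorizing a bundle with alternating main/alt opcodes is
| | // likely profitable, comparing an estimate of the number of vector
| | // instructions against the number of buildvector instructions for its
| | // operands.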
| 8832 | bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, |
| 8833 | ArrayRef<Value *> VL) const { |
| 8834 | Type *ScalarTy = S.getMainOp()->getType(); |
| 8835 | unsigned Opcode0 = S.getOpcode(); |
| 8836 | unsigned Opcode1 = S.getAltOpcode(); |
| 8837 | SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1)); |
| 8838 | // If this pattern is supported by the target then consider it profitable. |
| 8839 | if (TTI->isLegalAltInstr(VecTy: getWidenedType(ScalarTy, VF: VL.size()), Opcode0, |
| 8840 | Opcode1, OpcodeMask)) |
| 8841 | return true; |
| 8842 | SmallVector<ValueList> Operands; |
| 8843 | for (unsigned I : seq<unsigned>(Size: S.getMainOp()->getNumOperands())) { |
| 8844 | Operands.emplace_back(); |
| 8845 | // Prepare the operand vector. |
| 8846 | for (Value *V : VL) { |
| 8847 | if (isa<PoisonValue>(Val: V)) { |
| 8848 | Operands.back().push_back( |
| 8849 | Elt: PoisonValue::get(T: S.getMainOp()->getOperand(i: I)->getType())); |
| 8850 | continue; |
| 8851 | } |
| 8852 | Operands.back().push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I)); |
| 8853 | } |
| 8854 | } |
| 8855 | if (Operands.size() == 2) { |
| 8856 | // Try to find the best operand candidates.
| 8857 | for (unsigned I : seq<unsigned>(Begin: 0, End: VL.size() - 1)) { |
| 8858 | SmallVector<std::pair<Value *, Value *>> Candidates(3); |
| 8859 | Candidates[0] = std::make_pair(x&: Operands[0][I], y&: Operands[0][I + 1]); |
| 8860 | Candidates[1] = std::make_pair(x&: Operands[0][I], y&: Operands[1][I + 1]); |
| 8861 | Candidates[2] = std::make_pair(x&: Operands[1][I], y&: Operands[0][I + 1]); |
| 8862 | std::optional<int> Res = findBestRootPair(Candidates); |
| 8863 | switch (Res.value_or(u: 0)) { |
| 8864 | case 0: |
| 8865 | break; |
| 8866 | case 1: |
| 8867 | std::swap(a&: Operands[0][I + 1], b&: Operands[1][I + 1]); |
| 8868 | break; |
| 8869 | case 2: |
| 8870 | std::swap(a&: Operands[0][I], b&: Operands[1][I]); |
| 8871 | break; |
| 8872 | default: |
| 8873 | llvm_unreachable("Unexpected index." ); |
| 8874 | } |
| 8875 | } |
| 8876 | } |
| 8877 | DenseSet<unsigned> UniqueOpcodes; |
| 8878 | constexpr unsigned NumAltInsts = 3; // main + alt + shuffle. |
| 8879 | unsigned NonInstCnt = 0; |
| 8880 | // Estimate the number of instructions required for the vectorized node and
| 8881 | // for the buildvector node.
| 8882 | unsigned UndefCnt = 0; |
| 8883 | // Count the number of extra shuffles required for the vector nodes.
| 8884 | unsigned ExtraShuffleInsts = 0;
| 8885 | // Check that operands do not contain same values and create either perfect |
| 8886 | // diamond match or shuffled match. |
| 8887 | if (Operands.size() == 2) { |
| 8888 | // Do not count same operands twice. |
| 8889 | if (Operands.front() == Operands.back()) { |
| 8890 | Operands.erase(CI: Operands.begin()); |
| 8891 | } else if (!allConstant(VL: Operands.front()) && |
| 8892 | all_of(Range&: Operands.front(), P: [&](Value *V) { |
| 8893 | return is_contained(Range&: Operands.back(), Element: V); |
| 8894 | })) { |
| 8895 | Operands.erase(CI: Operands.begin()); |
| 8896 | ++ExtraShuffleInsts; |
| 8897 | } |
| 8898 | } |
| 8899 | const Loop *L = LI->getLoopFor(BB: S.getMainOp()->getParent()); |
| 8900 | // Vectorize the node if:
| 8901 | // 1. At least a single operand is constant or splat.
| 8902 | // 2. Operands have many loop invariants (the instructions are not loop
| 8903 | // invariants).
| 8904 | // 3. At least a single unique operand is supposed to be vectorized.
| 8905 | return none_of(Range&: Operands, |
| 8906 | P: [&](ArrayRef<Value *> Op) { |
| 8907 | if (allConstant(VL: Op) || |
| 8908 | (!isSplat(VL: Op) && allSameBlock(VL: Op) && allSameType(VL: Op) && |
| 8909 | getSameOpcode(VL: Op, TLI: *TLI))) |
| 8910 | return false; |
| 8911 | DenseMap<Value *, unsigned> Uniques; |
| 8912 | for (Value *V : Op) { |
| 8913 | if (isa<Constant, ExtractElementInst>(Val: V) || |
| 8914 | isVectorized(V) || (L && L->isLoopInvariant(V))) { |
| 8915 | if (isa<UndefValue>(Val: V)) |
| 8916 | ++UndefCnt; |
| 8917 | continue; |
| 8918 | } |
| 8919 | auto Res = Uniques.try_emplace(Key: V, Args: 0); |
| 8920 | // Found first duplicate - need to add shuffle. |
| 8921 | if (!Res.second && Res.first->second == 1) |
| 8922 | ++ExtraShuffleInsts; |
| 8923 | ++Res.first->getSecond(); |
| 8924 | if (auto *I = dyn_cast<Instruction>(Val: V)) |
| 8925 | UniqueOpcodes.insert(V: I->getOpcode()); |
| 8926 | else if (Res.second) |
| 8927 | ++NonInstCnt; |
| 8928 | } |
| 8929 | return none_of(Range&: Uniques, P: [&](const auto &P) { |
| 8930 | return P.first->hasNUsesOrMore(P.second + 1) && |
| 8931 | none_of(P.first->users(), [&](User *U) { |
| 8932 | return isVectorized(V: U) || Uniques.contains(Val: U); |
| 8933 | }); |
| 8934 | }); |
| 8935 | }) || |
| 8936 | // Do not vectorize node, if estimated number of vector instructions is |
| 8937 | // more than estimated number of buildvector instructions. Number of |
| 8938 | // vector operands is number of vector instructions + number of vector |
| 8939 | // instructions for operands (buildvectors). Number of buildvector |
| 8940 | // instructions is just number_of_operands * number_of_scalars. |
| 8941 | (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() && |
| 8942 | (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts + |
| 8943 | NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size()); |
| 8944 | } |
| 8945 | |
| 8946 | /// Builds the argument types vector for the given call instruction with the
| 8947 | /// given \p ID for the specified vector factor.
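| | /// For a recognized intrinsic, scalar operands (as reported by
| | /// isVectorIntrinsicWithScalarOpAtArg) keep their scalar type, and the
| | /// remaining arguments are widened to \p MinBW integers if a minimum bitwidth
| | /// is set; all other arguments are widened to the given vector factor.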
| 8948 | static SmallVector<Type *> |
| 8949 | buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, |
| 8950 | const unsigned VF, unsigned MinBW, |
| 8951 | const TargetTransformInfo *TTI) { |
| 8952 | SmallVector<Type *> ArgTys; |
| 8953 | for (auto [Idx, Arg] : enumerate(First: CI->args())) { |
| 8954 | if (ID != Intrinsic::not_intrinsic) { |
| 8955 | if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI)) { |
| 8956 | ArgTys.push_back(Elt: Arg->getType()); |
| 8957 | continue; |
| 8958 | } |
| 8959 | if (MinBW > 0) { |
| 8960 | ArgTys.push_back( |
| 8961 | Elt: getWidenedType(ScalarTy: IntegerType::get(C&: CI->getContext(), NumBits: MinBW), VF)); |
| 8962 | continue; |
| 8963 | } |
| 8964 | } |
| 8965 | ArgTys.push_back(Elt: getWidenedType(ScalarTy: Arg->getType(), VF)); |
| 8966 | } |
| 8967 | return ArgTys; |
| 8968 | } |
| 8969 | |
| 8970 | /// Calculates the costs of the vectorized intrinsic call (if possible) and the
| 8971 | /// vectorized library function call (if possible). Returns an invalid cost for
| 8972 | /// the corresponding call if it cannot be vectorized / will be scalarized.
| 8973 | static std::pair<InstructionCost, InstructionCost> |
| 8974 | getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, |
| 8975 | TargetTransformInfo *TTI, TargetLibraryInfo *TLI, |
| 8976 | ArrayRef<Type *> ArgTys) { |
| 8977 | auto Shape = VFShape::get(FTy: CI->getFunctionType(), |
| 8978 | EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()), |
| 8979 | HasGlobalPred: false /*HasGlobalPred*/); |
| 8980 | Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); |
| 8981 | auto LibCost = InstructionCost::getInvalid(); |
| 8982 | if (!CI->isNoBuiltin() && VecFunc) { |
| 8983 | // Calculate the cost of the vector library call. |
| 8984 | // If the corresponding vector call is cheaper, return its cost. |
| 8985 | LibCost = |
| 8986 | TTI->getCallInstrCost(F: nullptr, RetTy: VecTy, Tys: ArgTys, CostKind: TTI::TCK_RecipThroughput); |
| 8987 | } |
| 8988 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
| 8989 | |
| 8990 | // Calculate the cost of the vector intrinsic call. |
| 8991 | FastMathFlags FMF; |
| 8992 | if (auto *FPCI = dyn_cast<FPMathOperator>(Val: CI)) |
| 8993 | FMF = FPCI->getFastMathFlags(); |
| 8994 | const InstructionCost ScalarLimit = 10000; |
| 8995 | IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr, |
| 8996 | LibCost.isValid() ? LibCost : ScalarLimit); |
| 8997 | auto IntrinsicCost = |
| 8998 | TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind: TTI::TCK_RecipThroughput); |
| 8999 | if ((LibCost.isValid() && IntrinsicCost > LibCost) || |
| 9000 | (!LibCost.isValid() && IntrinsicCost > ScalarLimit)) |
| 9001 | IntrinsicCost = InstructionCost::getInvalid(); |
| 9002 | |
| 9003 | return {IntrinsicCost, LibCost}; |
| 9004 | } |
| 9005 | |
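| | // Decides how a bundle of scalars with the same/alternate opcode should be
| | // represented: as a vectorized node (possibly strided/scatter/compressed for
| | // loads) or as a gather node.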
| 9006 | BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( |
| 9007 | const InstructionsState &S, ArrayRef<Value *> VL, |
| 9008 | bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder, |
| 9009 | SmallVectorImpl<Value *> &PointerOps) { |
| 9010 | assert(S.getMainOp() && |
| 9011 | "Expected instructions with same/alternate opcodes only." ); |
| 9012 | |
| 9013 | unsigned ShuffleOrOp = |
| 9014 | S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); |
| 9015 | Instruction *VL0 = S.getMainOp(); |
| 9016 | switch (ShuffleOrOp) { |
| 9017 | case Instruction::PHI: { |
| 9018 | // Too many operands - gather, most probably won't be vectorized. |
| 9019 | if (VL0->getNumOperands() > MaxPHINumOperands) |
| 9020 | return TreeEntry::NeedToGather; |
| 9021 | // Check for terminator values (e.g. invoke). |
| 9022 | for (Value *V : VL) { |
| 9023 | auto *PHI = dyn_cast<PHINode>(Val: V); |
| 9024 | if (!PHI) |
| 9025 | continue; |
| 9026 | for (Value *Incoming : PHI->incoming_values()) { |
| 9027 | Instruction *Term = dyn_cast<Instruction>(Val: Incoming); |
| 9028 | if (Term && Term->isTerminator()) { |
| 9029 | LLVM_DEBUG(dbgs() |
| 9030 | << "SLP: Need to swizzle PHINodes (terminator use).\n" ); |
| 9031 | return TreeEntry::NeedToGather; |
| 9032 | } |
| 9033 | } |
| 9034 | } |
| 9035 | |
| 9036 | return TreeEntry::Vectorize; |
| 9037 | } |
| 9038 | case Instruction::ExtractElement: |
| 9039 | if (any_of(Range&: VL, P: [&](Value *V) { |
| 9040 | auto *EI = dyn_cast<ExtractElementInst>(Val: V); |
| 9041 | if (!EI) |
| 9042 | return true; |
| 9043 | return isVectorized(V: EI->getOperand(i_nocapture: 0)); |
| 9044 | })) |
| 9045 | return TreeEntry::NeedToGather; |
| 9046 | [[fallthrough]]; |
| 9047 | case Instruction::ExtractValue: { |
| 9048 | bool Reuse = canReuseExtract(VL, CurrentOrder); |
| 9049 | // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and |
| 9050 | // non-full registers). |
| 9051 | if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: VL0->getType(), Sz: VL.size())) |
| 9052 | return TreeEntry::NeedToGather; |
| 9053 | if (Reuse || !CurrentOrder.empty()) |
| 9054 | return TreeEntry::Vectorize; |
| 9055 | LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n" ); |
| 9056 | return TreeEntry::NeedToGather; |
| 9057 | } |
| 9058 | case Instruction::InsertElement: { |
| 9059 | // Check that we have a buildvector and not a shuffle of 2 or more |
| 9060 | // different vectors. |
| 9061 | ValueSet SourceVectors; |
| 9062 | for (Value *V : VL) { |
| 9063 | SourceVectors.insert(Ptr: cast<Instruction>(Val: V)->getOperand(i: 0)); |
| 9064 | assert(getElementIndex(V) != std::nullopt && |
| 9065 | "Non-constant or undef index?" ); |
| 9066 | } |
| 9067 | |
| 9068 | if (count_if(Range&: VL, P: [&SourceVectors](Value *V) { |
| 9069 | return !SourceVectors.contains(Ptr: V); |
| 9070 | }) >= 2) { |
| 9071 | // Found 2nd source vector - cancel. |
| 9072 | LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " |
| 9073 | "different source vectors.\n" ); |
| 9074 | return TreeEntry::NeedToGather; |
| 9075 | } |
| 9076 | |
| 9077 | if (any_of(Range&: VL, P: [&SourceVectors](Value *V) { |
| 9078 | // The last InsertElement can have multiple uses. |
| 9079 | return SourceVectors.contains(Ptr: V) && !V->hasOneUse(); |
| 9080 | })) { |
| 9081 | assert(SLPReVec && "Only supported by REVEC." ); |
| 9082 | LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " |
| 9083 | "multiple uses.\n" ); |
| 9084 | return TreeEntry::NeedToGather; |
| 9085 | } |
| 9086 | |
| 9087 | return TreeEntry::Vectorize; |
| 9088 | } |
| 9089 | case Instruction::Load: { |
| 9090 | // Check that a vectorized load would load the same memory as a scalar |
| 9091 | // load. For example, we don't want to vectorize loads that are smaller |
| 9092 | // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM |
| 9093 | // treats loading/storing it as an i8 struct. If we vectorize loads/stores |
| 9094 | // from such a struct, we read/write packed bits disagreeing with the |
| 9095 | // unvectorized version. |
| 9096 | auto IsGatheredNode = [&]() { |
| 9097 | if (!GatheredLoadsEntriesFirst) |
| 9098 | return false; |
| 9099 | return all_of(Range&: VL, P: [&](Value *V) { |
| 9100 | if (isa<PoisonValue>(Val: V)) |
| 9101 | return true; |
| 9102 | return any_of(Range: getTreeEntries(V), P: [&](const TreeEntry *TE) { |
| 9103 | return TE->Idx >= *GatheredLoadsEntriesFirst; |
| 9104 | }); |
| 9105 | }); |
| 9106 | }; |
| 9107 | switch (canVectorizeLoads(VL, VL0, Order&: CurrentOrder, PointerOps)) { |
| 9108 | case LoadsState::Vectorize: |
| 9109 | return TreeEntry::Vectorize; |
| 9110 | case LoadsState::CompressVectorize: |
| 9111 | if (!IsGraphTransformMode && !VectorizableTree.empty()) { |
| 9112 | // Delay slow vectorized nodes for better vectorization attempts. |
| 9113 | LoadEntriesToVectorize.insert(X: VectorizableTree.size()); |
| 9114 | return TreeEntry::NeedToGather; |
| 9115 | } |
| 9116 | return IsGatheredNode() ? TreeEntry::NeedToGather |
| 9117 | : TreeEntry::CompressVectorize; |
| 9118 | case LoadsState::ScatterVectorize: |
| 9119 | if (!IsGraphTransformMode && !VectorizableTree.empty()) { |
| 9120 | // Delay slow vectorized nodes for better vectorization attempts. |
| 9121 | LoadEntriesToVectorize.insert(X: VectorizableTree.size()); |
| 9122 | return TreeEntry::NeedToGather; |
| 9123 | } |
| 9124 | return IsGatheredNode() ? TreeEntry::NeedToGather |
| 9125 | : TreeEntry::ScatterVectorize; |
| 9126 | case LoadsState::StridedVectorize: |
| 9127 | if (!IsGraphTransformMode && VectorizableTree.size() > 1) { |
| 9128 | // Delay slow vectorized nodes for better vectorization attempts. |
| 9129 | LoadEntriesToVectorize.insert(X: VectorizableTree.size()); |
| 9130 | return TreeEntry::NeedToGather; |
| 9131 | } |
| 9132 | return IsGatheredNode() ? TreeEntry::NeedToGather |
| 9133 | : TreeEntry::StridedVectorize; |
| 9134 | case LoadsState::Gather: |
| 9135 | #ifndef NDEBUG |
| 9136 | Type *ScalarTy = VL0->getType(); |
| 9137 | if (DL->getTypeSizeInBits(ScalarTy) != |
| 9138 | DL->getTypeAllocSizeInBits(ScalarTy)) |
| 9139 | LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n" ); |
| 9140 | else if (any_of(VL, [](Value *V) { |
| 9141 | auto *LI = dyn_cast<LoadInst>(V); |
| 9142 | return !LI || !LI->isSimple(); |
| 9143 | })) |
| 9144 | LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n" ); |
| 9145 | else |
| 9146 | LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n" ); |
| 9147 | #endif // NDEBUG |
| 9148 | registerNonVectorizableLoads(VL); |
| 9149 | return TreeEntry::NeedToGather; |
| 9150 | } |
| 9151 | llvm_unreachable("Unexpected state of loads" ); |
| 9152 | } |
| 9153 | case Instruction::ZExt: |
| 9154 | case Instruction::SExt: |
| 9155 | case Instruction::FPToUI: |
| 9156 | case Instruction::FPToSI: |
| 9157 | case Instruction::FPExt: |
| 9158 | case Instruction::PtrToInt: |
| 9159 | case Instruction::IntToPtr: |
| 9160 | case Instruction::SIToFP: |
| 9161 | case Instruction::UIToFP: |
| 9162 | case Instruction::Trunc: |
| 9163 | case Instruction::FPTrunc: |
| 9164 | case Instruction::BitCast: { |
| 9165 | Type *SrcTy = VL0->getOperand(i: 0)->getType(); |
| 9166 | for (Value *V : VL) { |
| 9167 | if (isa<PoisonValue>(Val: V)) |
| 9168 | continue; |
| 9169 | Type *Ty = cast<Instruction>(Val: V)->getOperand(i: 0)->getType(); |
| 9170 | if (Ty != SrcTy || !isValidElementType(Ty)) { |
| 9171 | LLVM_DEBUG( |
| 9172 | dbgs() << "SLP: Gathering casts with different src types.\n" ); |
| 9173 | return TreeEntry::NeedToGather; |
| 9174 | } |
| 9175 | } |
| 9176 | return TreeEntry::Vectorize; |
| 9177 | } |
| 9178 | case Instruction::ICmp: |
| 9179 | case Instruction::FCmp: { |
| 9180 | // Check that all of the compares have the same predicate. |
| 9181 | CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate(); |
| 9182 | CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(pred: P0); |
| 9183 | Type *ComparedTy = VL0->getOperand(i: 0)->getType(); |
| 9184 | for (Value *V : VL) { |
| 9185 | if (isa<PoisonValue>(Val: V)) |
| 9186 | continue; |
| 9187 | auto *Cmp = cast<CmpInst>(Val: V); |
| 9188 | if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || |
| 9189 | Cmp->getOperand(i_nocapture: 0)->getType() != ComparedTy) { |
| 9190 | LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n" ); |
| 9191 | return TreeEntry::NeedToGather; |
| 9192 | } |
| 9193 | } |
| 9194 | return TreeEntry::Vectorize; |
| 9195 | } |
| 9196 | case Instruction::Select: |
| 9197 | case Instruction::FNeg: |
| 9198 | case Instruction::Add: |
| 9199 | case Instruction::FAdd: |
| 9200 | case Instruction::Sub: |
| 9201 | case Instruction::FSub: |
| 9202 | case Instruction::Mul: |
| 9203 | case Instruction::FMul: |
| 9204 | case Instruction::UDiv: |
| 9205 | case Instruction::SDiv: |
| 9206 | case Instruction::FDiv: |
| 9207 | case Instruction::URem: |
| 9208 | case Instruction::SRem: |
| 9209 | case Instruction::FRem: |
| 9210 | case Instruction::Shl: |
| 9211 | case Instruction::LShr: |
| 9212 | case Instruction::AShr: |
| 9213 | case Instruction::And: |
| 9214 | case Instruction::Or: |
| 9215 | case Instruction::Xor: |
| 9216 | case Instruction::Freeze: |
| 9217 | if (S.getMainOp()->getType()->isFloatingPointTy() && |
| 9218 | TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) { |
| 9219 | auto *I = dyn_cast<Instruction>(Val: V); |
| 9220 | return I && I->isBinaryOp() && !I->isFast(); |
| 9221 | })) |
| 9222 | return TreeEntry::NeedToGather; |
| 9223 | return TreeEntry::Vectorize; |
| 9224 | case Instruction::GetElementPtr: { |
| 9225 | // We don't combine GEPs with complicated (nested) indexing. |
| 9226 | for (Value *V : VL) { |
| 9227 | auto *I = dyn_cast<GetElementPtrInst>(Val: V); |
| 9228 | if (!I) |
| 9229 | continue; |
| 9230 | if (I->getNumOperands() != 2) { |
| 9231 | LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n" ); |
| 9232 | return TreeEntry::NeedToGather; |
| 9233 | } |
| 9234 | } |
| 9235 | |
| 9236 | // We can't combine several GEPs into one vector if they operate on |
| 9237 | // different types. |
| 9238 | Type *Ty0 = cast<GEPOperator>(Val: VL0)->getSourceElementType(); |
| 9239 | for (Value *V : VL) { |
| 9240 | auto *GEP = dyn_cast<GEPOperator>(Val: V); |
| 9241 | if (!GEP) |
| 9242 | continue; |
| 9243 | Type *CurTy = GEP->getSourceElementType(); |
| 9244 | if (Ty0 != CurTy) { |
| 9245 | LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n" ); |
| 9246 | return TreeEntry::NeedToGather; |
| 9247 | } |
| 9248 | } |
| 9249 | |
| 9250 | // We don't combine GEPs with non-constant indexes. |
| 9251 | Type *Ty1 = VL0->getOperand(i: 1)->getType(); |
| 9252 | for (Value *V : VL) { |
| 9253 | auto *I = dyn_cast<GetElementPtrInst>(Val: V); |
| 9254 | if (!I) |
| 9255 | continue; |
| 9256 | auto *Op = I->getOperand(i_nocapture: 1); |
| 9257 | if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) || |
| 9258 | (Op->getType() != Ty1 && |
| 9259 | ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) || |
| 9260 | Op->getType()->getScalarSizeInBits() > |
| 9261 | DL->getIndexSizeInBits( |
| 9262 | AS: V->getType()->getPointerAddressSpace())))) { |
| 9263 | LLVM_DEBUG( |
| 9264 | dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n" ); |
| 9265 | return TreeEntry::NeedToGather; |
| 9266 | } |
| 9267 | } |
| 9268 | |
| 9269 | return TreeEntry::Vectorize; |
| 9270 | } |
| 9271 | case Instruction::Store: { |
| 9272 | // Check if the stores are consecutive or if we need to swizzle them. |
| 9273 | llvm::Type *ScalarTy = cast<StoreInst>(Val: VL0)->getValueOperand()->getType(); |
| 9274 | // Avoid types that are padded when being allocated as scalars, while |
| 9275 | // being packed together in a vector (such as i1). |
| 9276 | if (DL->getTypeSizeInBits(Ty: ScalarTy) != |
| 9277 | DL->getTypeAllocSizeInBits(Ty: ScalarTy)) { |
| 9278 | LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n" ); |
| 9279 | return TreeEntry::NeedToGather; |
| 9280 | } |
| 9281 | // Make sure all stores in the bundle are simple - we can't vectorize |
| 9282 | // atomic or volatile stores. |
| 9283 | for (Value *V : VL) { |
| 9284 | auto *SI = cast<StoreInst>(Val: V); |
| 9285 | if (!SI->isSimple()) { |
| 9286 | LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n" ); |
| 9287 | return TreeEntry::NeedToGather; |
| 9288 | } |
| 9289 | PointerOps.push_back(Elt: SI->getPointerOperand()); |
| 9290 | } |
| 9291 | |
| 9292 | // Check the order of pointer operands. |
| 9293 | if (llvm::sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: CurrentOrder)) { |
| 9294 | Value *Ptr0; |
| 9295 | Value *PtrN; |
| 9296 | if (CurrentOrder.empty()) { |
| 9297 | Ptr0 = PointerOps.front(); |
| 9298 | PtrN = PointerOps.back(); |
| 9299 | } else { |
| 9300 | Ptr0 = PointerOps[CurrentOrder.front()]; |
| 9301 | PtrN = PointerOps[CurrentOrder.back()]; |
| 9302 | } |
| 9303 | std::optional<int64_t> Dist = |
| 9304 | getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE); |
| 9305 | // Check that the sorted pointer operands are consecutive. |
| 9306 | if (static_cast<uint64_t>(*Dist) == VL.size() - 1) |
| 9307 | return TreeEntry::Vectorize; |
| 9308 | } |
| 9309 | |
| 9310 | LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n" ); |
| 9311 | return TreeEntry::NeedToGather; |
| 9312 | } |
| 9313 | case Instruction::Call: { |
| 9314 | if (S.getMainOp()->getType()->isFloatingPointTy() && |
| 9315 | TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) { |
| 9316 | auto *I = dyn_cast<Instruction>(Val: V); |
| 9317 | return I && !I->isFast(); |
| 9318 | })) |
| 9319 | return TreeEntry::NeedToGather; |
| 9320 | // Check if the calls are all to the same vectorizable intrinsic or |
| 9321 | // library function. |
| 9322 | CallInst *CI = cast<CallInst>(Val: VL0); |
| 9323 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
| 9324 | |
| 9325 | VFShape Shape = VFShape::get( |
| 9326 | FTy: CI->getFunctionType(), |
| 9327 | EC: ElementCount::getFixed(MinVal: static_cast<unsigned int>(VL.size())), |
| 9328 | HasGlobalPred: false /*HasGlobalPred*/); |
| 9329 | Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape); |
| 9330 | |
| 9331 | if (!VecFunc && !isTriviallyVectorizable(ID)) { |
| 9332 | LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n" ); |
| 9333 | return TreeEntry::NeedToGather; |
| 9334 | } |
| 9335 | Function *F = CI->getCalledFunction(); |
| 9336 | unsigned NumArgs = CI->arg_size(); |
| 9337 | SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr); |
| 9338 | for (unsigned J = 0; J != NumArgs; ++J) |
| 9339 | if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI)) |
| 9340 | ScalarArgs[J] = CI->getArgOperand(i: J); |
| 9341 | for (Value *V : VL) { |
| 9342 | CallInst *CI2 = dyn_cast<CallInst>(Val: V); |
| 9343 | if (!CI2 || CI2->getCalledFunction() != F || |
| 9344 | getVectorIntrinsicIDForCall(CI: CI2, TLI) != ID || |
| 9345 | (VecFunc && |
| 9346 | VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) || |
| 9347 | !CI->hasIdenticalOperandBundleSchema(Other: *CI2)) { |
| 9348 | LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V |
| 9349 | << "\n" ); |
| 9350 | return TreeEntry::NeedToGather; |
| 9351 | } |
| 9352 | // Some intrinsics have scalar arguments, which must be the same for the
| 9353 | // calls to be vectorized.
| 9354 | for (unsigned J = 0; J != NumArgs; ++J) { |
| 9355 | if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI)) { |
| 9356 | Value *A1J = CI2->getArgOperand(i: J); |
| 9357 | if (ScalarArgs[J] != A1J) { |
| 9358 | LLVM_DEBUG(dbgs() |
| 9359 | << "SLP: mismatched arguments in call:" << *CI |
| 9360 | << " argument " << ScalarArgs[J] << "!=" << A1J << "\n" ); |
| 9361 | return TreeEntry::NeedToGather; |
| 9362 | } |
| 9363 | } |
| 9364 | } |
| 9365 | // Verify that the bundle operands are identical between the two calls. |
| 9366 | if (CI->hasOperandBundles() && |
| 9367 | !std::equal(first1: CI->op_begin() + CI->getBundleOperandsStartIndex(), |
| 9368 | last1: CI->op_begin() + CI->getBundleOperandsEndIndex(), |
| 9369 | first2: CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { |
| 9370 | LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI |
| 9371 | << "!=" << *V << '\n'); |
| 9372 | return TreeEntry::NeedToGather; |
| 9373 | } |
| 9374 | } |
| 9375 | SmallVector<Type *> ArgTys = |
| 9376 | buildIntrinsicArgTypes(CI, ID, VF: VL.size(), MinBW: 0, TTI); |
| 9377 | auto *VecTy = getWidenedType(ScalarTy: S.getMainOp()->getType(), VF: VL.size()); |
| 9378 | auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys); |
| 9379 | if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid()) |
| 9380 | return TreeEntry::NeedToGather; |
| 9381 | |
| 9382 | return TreeEntry::Vectorize; |
| 9383 | } |
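| | // Bundles with alternating main/alt opcodes (e.g. an add/sub sequence that is
| | // later combined through a shufflevector) are handled as ShuffleVector here;
| | // real shufflevector instructions are only supported with REVEC.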
| 9384 | case Instruction::ShuffleVector: { |
| 9385 | if (!S.isAltShuffle()) { |
| 9386 | // REVEC can support non-alternate shuffles.
| 9387 | if (SLPReVec && getShufflevectorNumGroups(VL)) |
| 9388 | return TreeEntry::Vectorize; |
| 9389 | // If this is not an alternate sequence of opcodes like add-sub,
| 9390 | // then do not vectorize this instruction.
| 9391 | LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n" ); |
| 9392 | return TreeEntry::NeedToGather; |
| 9393 | } |
| 9394 | if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) { |
| 9395 | LLVM_DEBUG( |
| 9396 | dbgs() |
| 9397 | << "SLP: ShuffleVector not vectorized, operands are buildvector and " |
| 9398 | "the whole alt sequence is not profitable.\n" ); |
| 9399 | return TreeEntry::NeedToGather; |
| 9400 | } |
| 9401 | |
| 9402 | return TreeEntry::Vectorize; |
| 9403 | } |
| 9404 | default: |
| 9405 | LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n" ); |
| 9406 | return TreeEntry::NeedToGather; |
| 9407 | } |
| 9408 | } |
| 9409 | |
| 9410 | namespace { |
| 9411 | /// Helps to correctly handle the operands of phi nodes, based on the \p Main
| 9412 | /// PHINode's order of incoming basic blocks/values.
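| | /// For each incoming block of \p Main, buildOperands() collects the matching
| | /// incoming values of all \p Phis, looking them up by block rather than by
| | /// index, since different phis may list the same predecessors in a different
| | /// order.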
| 9413 | class PHIHandler { |
| 9414 | DominatorTree &DT; |
| 9415 | PHINode *Main = nullptr; |
| 9416 | SmallVector<Value *> Phis; |
| 9417 | SmallVector<SmallVector<Value *>> Operands; |
| 9418 | |
| 9419 | public: |
| 9420 | PHIHandler() = delete; |
| 9421 | PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis) |
| 9422 | : DT(DT), Main(Main), Phis(Phis), |
| 9423 | Operands(Main->getNumIncomingValues(), |
| 9424 | SmallVector<Value *>(Phis.size(), nullptr)) {} |
| 9425 | void buildOperands() { |
| 9426 | constexpr unsigned FastLimit = 4; |
| 9427 | if (Main->getNumIncomingValues() <= FastLimit) { |
| 9428 | for (unsigned I : seq<unsigned>(Begin: 0, End: Main->getNumIncomingValues())) { |
| 9429 | BasicBlock *InBB = Main->getIncomingBlock(i: I); |
| 9430 | if (!DT.isReachableFromEntry(A: InBB)) { |
| 9431 | Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType())); |
| 9432 | continue; |
| 9433 | } |
| 9434 | // Prepare the operand vector. |
| 9435 | for (auto [Idx, V] : enumerate(First&: Phis)) { |
| 9436 | auto *P = dyn_cast<PHINode>(Val: V); |
| 9437 | if (!P) { |
| 9438 | assert(isa<PoisonValue>(V) && |
| 9439 | "Expected isa instruction or poison value." ); |
| 9440 | Operands[I][Idx] = V; |
| 9441 | continue; |
| 9442 | } |
| 9443 | if (P->getIncomingBlock(i: I) == InBB) |
| 9444 | Operands[I][Idx] = P->getIncomingValue(i: I); |
| 9445 | else |
| 9446 | Operands[I][Idx] = P->getIncomingValueForBlock(BB: InBB); |
| 9447 | } |
| 9448 | } |
| 9449 | return; |
| 9450 | } |
| 9451 | SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4> |
| 9452 | Blocks; |
| 9453 | for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues())) { |
| 9454 | BasicBlock *InBB = Main->getIncomingBlock(i: I); |
| 9455 | if (!DT.isReachableFromEntry(A: InBB)) { |
| 9456 | Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType())); |
| 9457 | continue; |
| 9458 | } |
| 9459 | Blocks.try_emplace(Key: InBB).first->second.push_back(Elt: I); |
| 9460 | } |
| 9461 | for (auto [Idx, V] : enumerate(First&: Phis)) { |
| 9462 | if (isa<PoisonValue>(Val: V)) { |
| 9463 | for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues())) |
| 9464 | Operands[I][Idx] = V; |
| 9465 | continue; |
| 9466 | } |
| 9467 | auto *P = cast<PHINode>(Val: V); |
| 9468 | for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) { |
| 9469 | BasicBlock *InBB = P->getIncomingBlock(i: I); |
| 9470 | if (InBB == Main->getIncomingBlock(i: I)) { |
| 9471 | if (isa_and_nonnull<PoisonValue>(Val: Operands[I][Idx])) |
| 9472 | continue; |
| 9473 | Operands[I][Idx] = P->getIncomingValue(i: I); |
| 9474 | continue; |
| 9475 | } |
| 9476 | auto *It = Blocks.find(Key: InBB); |
| 9477 | if (It == Blocks.end()) |
| 9478 | continue; |
| 9479 | Operands[It->second.front()][Idx] = P->getIncomingValue(i: I); |
| 9480 | } |
| 9481 | } |
| 9482 | for (const auto &P : Blocks) { |
| 9483 | ArrayRef<unsigned> IncomingValues = P.second; |
| 9484 | if (IncomingValues.size() <= 1) |
| 9485 | continue; |
| 9486 | unsigned BasicI = IncomingValues.front(); |
| 9487 | for (unsigned I : IncomingValues.drop_front()) { |
| 9488 | assert(all_of(enumerate(Operands[I]), |
| 9489 | [&](const auto &Data) { |
| 9490 | return !Data.value() || |
| 9491 | Data.value() == Operands[BasicI][Data.index()]; |
| 9492 | }) && |
| 9493 | "Expected empty operands list." ); |
| 9494 | Operands[I] = Operands[BasicI]; |
| 9495 | } |
| 9496 | } |
| 9497 | } |
| 9498 | ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; } |
| 9499 | }; |
| 9500 | } // namespace |
| 9501 | |
| 9502 | /// Returns the main/alternate instructions for the given \p VL. Unlike
| 9503 | /// getSameOpcode, this supports non-compatible instructions for better
| 9504 | /// SplitVectorize node support.
| 9505 | /// \returns the first main/alt instructions if only poison values and
| 9506 | /// instructions with exactly 2 opcodes exist; a pair of nullptrs otherwise.
| 9507 | static std::pair<Instruction *, Instruction *> |
| 9508 | getMainAltOpsNoStateVL(ArrayRef<Value *> VL) { |
| 9509 | Instruction *MainOp = nullptr; |
| 9510 | Instruction *AltOp = nullptr; |
| 9511 | for (Value *V : VL) { |
| 9512 | if (isa<PoisonValue>(Val: V)) |
| 9513 | continue; |
| 9514 | auto *I = dyn_cast<Instruction>(Val: V); |
| 9515 | if (!I) |
| 9516 | return {}; |
| 9517 | if (!MainOp) { |
| 9518 | MainOp = I; |
| 9519 | continue; |
| 9520 | } |
| 9521 | if (MainOp->getOpcode() == I->getOpcode()) { |
| 9522 | if (I->getParent() != MainOp->getParent()) |
| 9523 | return {}; |
| 9524 | continue; |
| 9525 | } |
| 9526 | if (!AltOp) { |
| 9527 | AltOp = I; |
| 9528 | continue; |
| 9529 | } |
| 9530 | if (AltOp->getOpcode() == I->getOpcode()) { |
| 9531 | if (I->getParent() != AltOp->getParent()) |
| 9532 | return {}; |
| 9533 | continue; |
| 9534 | } |
| 9535 | return {}; |
| 9536 | } |
| 9537 | if (!AltOp) |
| 9538 | return {}; |
| 9539 | assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() && |
| 9540 | "Expected different main and alt instructions." ); |
| 9541 | return std::make_pair(x&: MainOp, y&: AltOp); |
| 9542 | } |
| 9543 | |
| 9544 | /// Checks that every instruction appears once in the list; if not, the list is
| 9545 | /// de-duplicated, building the \p ReuseShuffleIndices mask and mutating \p VL.
| 9546 | /// The list of unique scalars is extended by poison values to the whole register size.
| 9547 | /// |
| 9548 | /// \returns false if \p VL could not be uniquified, in which case \p VL is |
| 9549 | /// unchanged and \p ReuseShuffleIndices is empty. |
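| | /// For example, for VL = {a, b, a, b} with non-constant scalars a and b, the
| | /// bundle is shrunk to {a, b} with the reuse mask {0, 1, 0, 1}.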
| 9550 | static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL, |
| 9551 | SmallVectorImpl<int> &ReuseShuffleIndices, |
| 9552 | const TargetTransformInfo &TTI, |
| 9553 | const TargetLibraryInfo &TLI, |
| 9554 | const InstructionsState &S, |
| 9555 | const BoUpSLP::EdgeInfo &UserTreeIdx, |
| 9556 | bool TryPad = false) { |
| 9557 | // Check that every instruction appears once in this bundle. |
| 9558 | SmallVector<Value *> UniqueValues; |
| 9559 | SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size()); |
| 9560 | for (Value *V : VL) { |
| 9561 | if (isConstant(V)) { |
| 9562 | // Constants are always considered distinct, even if the same constant |
| 9563 | // appears multiple times in VL. |
| 9564 | ReuseShuffleIndices.emplace_back( |
| 9565 | Args: isa<PoisonValue>(Val: V) ? PoisonMaskElem : UniqueValues.size()); |
| 9566 | UniqueValues.emplace_back(Args&: V); |
| 9567 | continue; |
| 9568 | } |
| 9569 | auto Res = UniquePositions.try_emplace(Key: V, Args: UniqueValues.size()); |
| 9570 | ReuseShuffleIndices.emplace_back(Args&: Res.first->second); |
| 9571 | if (Res.second) |
| 9572 | UniqueValues.emplace_back(Args&: V); |
| 9573 | } |
| 9574 | |
| 9575 | // Easy case: VL has unique values and a "natural" size |
| 9576 | size_t NumUniqueScalarValues = UniqueValues.size(); |
| 9577 | bool IsFullVectors = hasFullVectorsOrPowerOf2( |
| 9578 | TTI, Ty: getValueType(V: UniqueValues.front()), Sz: NumUniqueScalarValues); |
| 9579 | if (NumUniqueScalarValues == VL.size() && |
| 9580 | (VectorizeNonPowerOf2 || IsFullVectors)) { |
| 9581 | ReuseShuffleIndices.clear(); |
| 9582 | return true; |
| 9583 | } |
| 9584 | |
| 9585 | // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
| 9586 | if ((UserTreeIdx.UserTE && |
| 9587 | UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) || |
| 9588 | !hasFullVectorsOrPowerOf2(TTI, Ty: getValueType(V: VL.front()), Sz: VL.size())) { |
| 9589 | LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " |
| 9590 | "for nodes with padding.\n" ); |
| 9591 | ReuseShuffleIndices.clear(); |
| 9592 | return false; |
| 9593 | } |
| 9594 | |
| 9595 | LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n" ); |
| 9596 | if (NumUniqueScalarValues <= 1 || !IsFullVectors || |
| 9597 | (UniquePositions.size() == 1 && all_of(Range&: UniqueValues, P: [](Value *V) { |
| 9598 | return isa<UndefValue>(Val: V) || !isConstant(V); |
| 9599 | }))) { |
| 9600 | if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 && |
| 9601 | S.getMainOp()->isSafeToRemove() && |
| 9602 | all_of(Range&: UniqueValues, P: IsaPred<Instruction, PoisonValue>)) { |
| 9603 | // Find the number of elements, which forms full vectors. |
| 9604 | unsigned PWSz = getFullVectorNumberOfElements( |
| 9605 | TTI, Ty: UniqueValues.front()->getType(), Sz: UniqueValues.size()); |
| 9606 | PWSz = std::min<unsigned>(a: PWSz, b: VL.size()); |
| 9607 | if (PWSz == VL.size()) { |
| 9608 | // We ended up with the same size after removing duplicates and |
| 9609 | // upgrading the resulting vector size to a "nice size". Just keep |
| 9610 | // the initial VL then. |
| 9611 | ReuseShuffleIndices.clear(); |
| 9612 | } else { |
| 9613 | // Pad unique values with poison to grow the vector to a "nice" size |
| 9614 | SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(), |
| 9615 | UniqueValues.end()); |
| 9616 | PaddedUniqueValues.append( |
| 9617 | NumInputs: PWSz - UniqueValues.size(), |
| 9618 | Elt: PoisonValue::get(T: UniqueValues.front()->getType())); |
| 9619 | // Check that the operations, when extended with poison values, are still
| 9620 | // valid for vectorization (div/rem are not allowed).
| 9621 | if (!getSameOpcode(VL: PaddedUniqueValues, TLI).valid()) { |
| 9622 | LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n" ); |
| 9623 | ReuseShuffleIndices.clear(); |
| 9624 | return false; |
| 9625 | } |
| 9626 | VL = std::move(PaddedUniqueValues); |
| 9627 | } |
| 9628 | return true; |
| 9629 | } |
| 9630 | LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n" ); |
| 9631 | ReuseShuffleIndices.clear(); |
| 9632 | return false; |
| 9633 | } |
| 9634 | VL = std::move(UniqueValues); |
| 9635 | return true; |
| 9636 | } |
| 9637 | |
| 9638 | bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL, |
| 9639 | const InstructionsState &LocalState, |
| 9640 | SmallVectorImpl<Value *> &Op1, |
| 9641 | SmallVectorImpl<Value *> &Op2, |
| 9642 | OrdersType &ReorderIndices) const { |
| 9643 | constexpr unsigned SmallNodeSize = 4; |
| 9644 | if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() || |
| 9645 | !SplitAlternateInstructions) |
| 9646 | return false; |
| 9647 | |
| 9648 | // Check if this is a duplicate of another split entry. |
| 9649 | LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp() |
| 9650 | << ".\n" ); |
| 9651 | for (TreeEntry *E : getSplitTreeEntries(V: LocalState.getMainOp())) { |
| 9652 | if (E->isSame(VL)) { |
| 9653 | LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " |
| 9654 | << *LocalState.getMainOp() << ".\n" ); |
| 9655 | return false; |
| 9656 | } |
| 9657 | SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars); |
| 9658 | if (all_of(Range&: VL, P: [&](Value *V) { |
| 9659 | return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V); |
| 9660 | })) { |
| 9661 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n" ); |
| 9662 | return false; |
| 9663 | } |
| 9664 | } |
| 9665 | |
| 9666 | ReorderIndices.assign(NumElts: VL.size(), Elt: VL.size()); |
| 9667 | SmallBitVector Op1Indices(VL.size()); |
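|  | // Partition the bundle: scalars matching the main opcode (and any |
|  | // non-instruction values) go to Op1, alternate-opcode scalars go to Op2; |
|  | // Op1Indices remembers which original lanes landed in Op1. |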
| 9668 | for (auto [Idx, V] : enumerate(First&: VL)) { |
| 9669 | auto *I = dyn_cast<Instruction>(Val: V); |
| 9670 | if (!I) { |
| 9671 | Op1.push_back(Elt: V); |
| 9672 | Op1Indices.set(Idx); |
| 9673 | continue; |
| 9674 | } |
| 9675 | if ((LocalState.getAltOpcode() != LocalState.getOpcode() && |
| 9676 | isMainInstruction(I, MainOp: LocalState.getMainOp(), AltOp: LocalState.getAltOp(), |
| 9677 | TLI: *TLI)) || |
| 9678 | (LocalState.getAltOpcode() == LocalState.getOpcode() && |
| 9679 | !isAlternateInstruction(I, MainOp: LocalState.getMainOp(), |
| 9680 | AltOp: LocalState.getAltOp(), TLI: *TLI))) { |
| 9681 | Op1.push_back(Elt: V); |
| 9682 | Op1Indices.set(Idx); |
| 9683 | continue; |
| 9684 | } |
| 9685 | Op2.push_back(Elt: V); |
| 9686 | } |
| 9687 | Type *ScalarTy = getValueType(V: VL.front()); |
| 9688 | VectorType *VecTy = getWidenedType(ScalarTy, VF: VL.size()); |
| 9689 | unsigned Opcode0 = LocalState.getOpcode(); |
| 9690 | unsigned Opcode1 = LocalState.getAltOpcode(); |
| 9691 | SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1)); |
| 9692 | // Only enable the split node if the scalars do not form a legal alternate |
| 9693 | // instruction (like X86 addsub). |
| 9694 | SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1); |
| 9695 | SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2); |
| 9696 | if (UOp1.size() <= 1 || UOp2.size() <= 1 || |
| 9697 | TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) || |
| 9698 | !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op1.front()->getType(), Sz: Op1.size()) || |
| 9699 | !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op2.front()->getType(), Sz: Op2.size())) |
| 9700 | return false; |
| 9701 | // Build ReorderIndices placing all Op1 lanes first, then all Op2 lanes. |
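|  | // E.g. for lanes {main, alt, main, alt} this yields |
|  | // ReorderIndices = {0, 2, 1, 3}. |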
| 9702 | unsigned Op1Cnt = 0, Op2Cnt = Op1.size(); |
| 9703 | for (unsigned Idx : seq<unsigned>(Size: VL.size())) { |
| 9704 | if (Op1Indices.test(Idx)) { |
| 9705 | ReorderIndices[Op1Cnt] = Idx; |
| 9706 | ++Op1Cnt; |
| 9707 | } else { |
| 9708 | ReorderIndices[Op2Cnt] = Idx; |
| 9709 | ++Op2Cnt; |
| 9710 | } |
| 9711 | } |
| 9712 | if (isIdentityOrder(Order: ReorderIndices)) |
| 9713 | ReorderIndices.clear(); |
| 9714 | SmallVector<int> Mask; |
| 9715 | if (!ReorderIndices.empty()) |
| 9716 | inversePermutation(Indices: ReorderIndices, Mask); |
| 9717 | unsigned NumParts = TTI->getNumberOfParts(Tp: VecTy); |
| 9718 | VectorType *Op1VecTy = getWidenedType(ScalarTy, VF: Op1.size()); |
| 9719 | VectorType *Op2VecTy = getWidenedType(ScalarTy, VF: Op2.size()); |
| 9720 | // Check for non-profitable single-register ops, which are better represented |
| 9721 | // as alternate ops. |
| 9722 | if (NumParts >= VL.size()) |
| 9723 | return false; |
| 9724 | constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput; |
| 9725 | InstructionCost InsertCost = ::getShuffleCost( |
| 9726 | TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind: Kind, Index: Op1.size(), SubTp: Op2VecTy); |
| 9727 | FixedVectorType *SubVecTy = |
| 9728 | getWidenedType(ScalarTy, VF: std::max(a: Op1.size(), b: Op2.size())); |
| 9729 | InstructionCost NewShuffleCost = |
| 9730 | ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: SubVecTy, Mask, CostKind: Kind); |
| 9731 | if (!LocalState.isCmpOp() && NumParts <= 1 && |
| 9732 | (Mask.empty() || InsertCost >= NewShuffleCost)) |
| 9733 | return false; |
| 9734 | if ((LocalState.getMainOp()->isBinaryOp() && |
| 9735 | LocalState.getAltOp()->isBinaryOp() && |
| 9736 | (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() || |
| 9737 | LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) || |
| 9738 | (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) || |
| 9739 | (LocalState.getMainOp()->isUnaryOp() && |
| 9740 | LocalState.getAltOp()->isUnaryOp())) { |
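|  | // Compare the cost of keeping one full-width alternate node (two wide ops |
|  | // plus a two-source blend) against splitting it (two narrower ops plus a |
|  | // subvector insert, and an extra shuffle when the tree root is a store). |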
| 9741 | InstructionCost OriginalVecOpsCost = |
| 9742 | TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: VecTy, CostKind: Kind) + |
| 9743 | TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: VecTy, CostKind: Kind); |
| 9744 | SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem); |
| 9745 | for (unsigned Idx : seq<unsigned>(Size: VL.size())) { |
| 9746 | if (isa<PoisonValue>(Val: VL[Idx])) |
| 9747 | continue; |
| 9748 | OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size()); |
| 9749 | } |
| 9750 | InstructionCost OriginalCost = |
| 9751 | OriginalVecOpsCost + ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, |
| 9752 | Tp: VecTy, Mask: OriginalMask, CostKind: Kind); |
| 9753 | InstructionCost NewVecOpsCost = |
| 9754 | TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: Op1VecTy, CostKind: Kind) + |
| 9755 | TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: Op2VecTy, CostKind: Kind); |
| 9756 | InstructionCost NewCost = |
| 9757 | NewVecOpsCost + InsertCost + |
| 9758 | (!VectorizableTree.empty() && VectorizableTree.front()->hasState() && |
| 9759 | VectorizableTree.front()->getOpcode() == Instruction::Store |
| 9760 | ? NewShuffleCost |
| 9761 | : 0); |
| 9762 | // If not profitable to split - exit. |
| 9763 | if (NewCost >= OriginalCost) |
| 9764 | return false; |
| 9765 | } |
| 9766 | return true; |
| 9767 | } |
| 9768 | |
| 9769 | namespace { |
| 9770 | /// Accepts an incoming list of values and generates the list of values for |
| 9771 | /// scheduling and the lists of operands for the new nodes. |
| 9772 | class InstructionsCompatibilityAnalysis { |
| 9773 | DominatorTree &DT; |
| 9774 | const DataLayout &DL; |
| 9775 | const TargetTransformInfo &TTI; |
| 9776 | const TargetLibraryInfo &TLI; |
| 9777 | |
| 9778 | /// Builds operands for the original instructions. |
| 9779 | void |
| 9780 | buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL, |
| 9781 | SmallVectorImpl<BoUpSLP::ValueList> &Operands) const { |
| 9782 | |
| 9783 | unsigned ShuffleOrOp = |
| 9784 | S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); |
| 9785 | Instruction *VL0 = S.getMainOp(); |
| 9786 | |
| 9787 | switch (ShuffleOrOp) { |
| 9788 | case Instruction::PHI: { |
| 9789 | auto *PH = cast<PHINode>(Val: VL0); |
| 9790 | |
| 9791 | // Keeps the reordered operands to avoid code duplication. |
| 9792 | PHIHandler Handler(DT, PH, VL); |
| 9793 | Handler.buildOperands(); |
| 9794 | Operands.assign(NumElts: PH->getNumOperands(), Elt: {}); |
| 9795 | for (unsigned I : seq<unsigned>(Size: PH->getNumOperands())) |
| 9796 | Operands[I].assign(in_start: Handler.getOperands(I).begin(), |
| 9797 | in_end: Handler.getOperands(I).end()); |
| 9798 | return; |
| 9799 | } |
| 9800 | case Instruction::ExtractValue: |
| 9801 | case Instruction::ExtractElement: |
| 9802 | // This is a special case, as it does not gather, but at the same time |
| 9803 | // we are not extending buildTreeRec() towards the operands. |
| 9804 | Operands.assign(NumElts: 1, Elt: {VL.size(), VL0->getOperand(i: 0)}); |
| 9805 | return; |
| 9806 | case Instruction::InsertElement: |
| 9807 | Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr}); |
| 9808 | for (auto [Idx, V] : enumerate(First&: VL)) { |
| 9809 | auto *IE = cast<InsertElementInst>(Val: V); |
| 9810 | for (auto [OpIdx, Ops] : enumerate(First&: Operands)) |
| 9811 | Ops[Idx] = IE->getOperand(i_nocapture: OpIdx); |
| 9812 | } |
| 9813 | return; |
| 9814 | case Instruction::Load: |
| 9815 | Operands.assign( |
| 9816 | NumElts: 1, Elt: {VL.size(), |
| 9817 | PoisonValue::get(T: cast<LoadInst>(Val: VL0)->getPointerOperandType())}); |
| 9818 | for (auto [V, Op] : zip(t&: VL, u&: Operands.back())) { |
| 9819 | auto *LI = dyn_cast<LoadInst>(Val: V); |
| 9820 | if (!LI) |
| 9821 | continue; |
| 9822 | Op = LI->getPointerOperand(); |
| 9823 | } |
| 9824 | return; |
| 9825 | case Instruction::ZExt: |
| 9826 | case Instruction::SExt: |
| 9827 | case Instruction::FPToUI: |
| 9828 | case Instruction::FPToSI: |
| 9829 | case Instruction::FPExt: |
| 9830 | case Instruction::PtrToInt: |
| 9831 | case Instruction::IntToPtr: |
| 9832 | case Instruction::SIToFP: |
| 9833 | case Instruction::UIToFP: |
| 9834 | case Instruction::Trunc: |
| 9835 | case Instruction::FPTrunc: |
| 9836 | case Instruction::BitCast: |
| 9837 | case Instruction::ICmp: |
| 9838 | case Instruction::FCmp: |
| 9839 | case Instruction::Select: |
| 9840 | case Instruction::FNeg: |
| 9841 | case Instruction::Add: |
| 9842 | case Instruction::FAdd: |
| 9843 | case Instruction::Sub: |
| 9844 | case Instruction::FSub: |
| 9845 | case Instruction::Mul: |
| 9846 | case Instruction::FMul: |
| 9847 | case Instruction::UDiv: |
| 9848 | case Instruction::SDiv: |
| 9849 | case Instruction::FDiv: |
| 9850 | case Instruction::URem: |
| 9851 | case Instruction::SRem: |
| 9852 | case Instruction::FRem: |
| 9853 | case Instruction::Shl: |
| 9854 | case Instruction::LShr: |
| 9855 | case Instruction::AShr: |
| 9856 | case Instruction::And: |
| 9857 | case Instruction::Or: |
| 9858 | case Instruction::Xor: |
| 9859 | case Instruction::Freeze: |
| 9860 | case Instruction::Store: |
| 9861 | case Instruction::ShuffleVector: |
| 9862 | Operands.assign(NumElts: VL0->getNumOperands(), Elt: {VL.size(), nullptr}); |
| 9863 | for (auto [Idx, V] : enumerate(First&: VL)) { |
| 9864 | auto *I = dyn_cast<Instruction>(Val: V); |
| 9865 | if (!I) { |
| 9866 | for (auto [OpIdx, Ops] : enumerate(First&: Operands)) |
| 9867 | Ops[Idx] = PoisonValue::get(T: VL0->getOperand(i: OpIdx)->getType()); |
| 9868 | continue; |
| 9869 | } |
| 9870 | auto [Op, ConvertedOps] = convertTo(I, S); |
| 9871 | for (auto [OpIdx, Ops] : enumerate(First&: Operands)) |
| 9872 | Ops[Idx] = ConvertedOps[OpIdx]; |
| 9873 | } |
| 9874 | return; |
| 9875 | case Instruction::GetElementPtr: { |
| 9876 | Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr}); |
| 9877 | // Need to cast all indices to the same type before vectorization to |
| 9878 | // avoid a crash. |
| 9879 | // This is also required to find correct matches between different gather |
| 9880 | // nodes and to reuse the vectorized values rather than trying to gather |
| 9881 | // them again. |
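|  | // Illustrative example: if one GEP indexes with i32 and another with i64, |
|  | // the common index type is taken from the data layout and all constant |
|  | // indices are re-folded to it, keeping the operand lists type-consistent. |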
| 9882 | const unsigned IndexIdx = 1; |
| 9883 | Type *VL0Ty = VL0->getOperand(i: IndexIdx)->getType(); |
| 9884 | Type *Ty = |
| 9885 | all_of(Range&: VL, |
| 9886 | P: [&](Value *V) { |
| 9887 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: V); |
| 9888 | return !GEP || VL0Ty == GEP->getOperand(i_nocapture: IndexIdx)->getType(); |
| 9889 | }) |
| 9890 | ? VL0Ty |
| 9891 | : DL.getIndexType(PtrTy: cast<GetElementPtrInst>(Val: VL0) |
| 9892 | ->getPointerOperandType() |
| 9893 | ->getScalarType()); |
| 9894 | for (auto [Idx, V] : enumerate(First&: VL)) { |
| 9895 | auto *GEP = dyn_cast<GetElementPtrInst>(Val: V); |
| 9896 | if (!GEP) { |
| 9897 | Operands[0][Idx] = V; |
| 9898 | Operands[1][Idx] = ConstantInt::getNullValue(Ty); |
| 9899 | continue; |
| 9900 | } |
| 9901 | Operands[0][Idx] = GEP->getPointerOperand(); |
| 9902 | auto *Op = GEP->getOperand(i_nocapture: IndexIdx); |
| 9903 | auto *CI = dyn_cast<ConstantInt>(Val: Op); |
| 9904 | Operands[1][Idx] = CI ? ConstantFoldIntegerCast( |
| 9905 | C: CI, DestTy: Ty, IsSigned: CI->getValue().isSignBitSet(), DL) |
| 9906 | : Op; |
| 9907 | } |
| 9908 | return; |
| 9909 | } |
| 9910 | case Instruction::Call: { |
| 9911 | auto *CI = cast<CallInst>(Val: VL0); |
| 9912 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI: &TLI); |
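|  | // Arguments that the intrinsic requires to stay scalar are skipped; the |
|  | // remaining arguments are collected per lane, with poison standing in for |
|  | // non-instruction lanes. |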
| 9913 | for (unsigned Idx : seq<unsigned>(Size: CI->arg_size())) { |
| 9914 | if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI: &TTI)) |
| 9915 | continue; |
| 9916 | auto &Ops = Operands.emplace_back(); |
| 9917 | for (Value *V : VL) { |
| 9918 | auto *I = dyn_cast<Instruction>(Val: V); |
| 9919 | Ops.push_back(Elt: I ? I->getOperand(i: Idx) |
| 9920 | : PoisonValue::get(T: VL0->getOperand(i: Idx)->getType())); |
| 9921 | } |
| 9922 | } |
| 9923 | return; |
| 9924 | } |
| 9925 | default: |
| 9926 | break; |
| 9927 | } |
| 9928 | llvm_unreachable("Unexpected vectorization of the instructions." ); |
| 9929 | } |
| 9930 | |
| 9931 | public: |
| 9932 | InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL, |
| 9933 | const TargetTransformInfo &TTI, |
| 9934 | const TargetLibraryInfo &TLI) |
| 9935 | : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {} |
| 9936 | |
| 9937 | SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S, |
| 9938 | ArrayRef<Value *> VL) { |
| 9939 | assert(S && "Invalid state!" ); |
| 9940 | SmallVector<BoUpSLP::ValueList> Operands; |
| 9941 | buildOriginalOperands(S, VL, Operands); |
| 9942 | return Operands; |
| 9943 | } |
| 9944 | }; |
| 9945 | } // namespace |
| 9946 | |
| 9947 | BoUpSLP::ScalarsVectorizationLegality |
| 9948 | BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth, |
| 9949 | const EdgeInfo &UserTreeIdx) const { |
| 9950 | assert((allConstant(VL) || allSameType(VL)) && "Invalid types!" ); |
| 9951 | |
| 9952 | InstructionsState S = getSameOpcode(VL, TLI: *TLI); |
| 9953 | |
| 9954 | // Don't go into catchswitch blocks, which can happen with PHIs. |
| 9955 | // Such blocks can only have PHIs and the catchswitch. There is no |
| 9956 | // place to insert a shuffle if we need to, so just avoid that issue. |
| 9957 | if (S && isa<CatchSwitchInst>(Val: S.getMainOp()->getParent()->getTerminator())) { |
| 9958 | LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n" ); |
| 9959 | // Do not try to pack to avoid extra instructions here. |
| 9960 | return ScalarsVectorizationLegality(S, /*IsLegal=*/false, |
| 9961 | /*TryToFindDuplicates=*/false); |
| 9962 | } |
| 9963 | |
| 9964 | // Check if this is a duplicate of another entry. |
| 9965 | if (S) { |
| 9966 | LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n" ); |
| 9967 | for (TreeEntry *E : getTreeEntries(V: S.getMainOp())) { |
| 9968 | if (E->isSame(VL)) { |
| 9969 | LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp() |
| 9970 | << ".\n" ); |
| 9971 | return ScalarsVectorizationLegality(S, /*IsLegal=*/false); |
| 9972 | } |
| 9973 | SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars); |
| 9974 | if (all_of(Range&: VL, P: [&](Value *V) { |
| 9975 | return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V); |
| 9976 | })) { |
| 9977 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n" ); |
| 9978 | return ScalarsVectorizationLegality(S, /*IsLegal=*/false); |
| 9979 | } |
| 9980 | } |
| 9981 | } |
| 9982 | |
| 9983 | // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of |
| 9984 | // a load), in which case peek through to include it in the tree, without |
| 9985 | // ballooning over-budget. |
| 9986 | if (Depth >= RecursionMaxDepth && |
| 9987 | !(S && !S.isAltShuffle() && VL.size() >= 4 && |
| 9988 | (match(V: S.getMainOp(), P: m_Load(Op: m_Value())) || |
| 9989 | all_of(Range&: VL, P: [&S](const Value *I) { |
| 9990 | return match(V: I, |
| 9991 | P: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_OneUse(SubPattern: m_Load(Op: m_Value()))))) && |
| 9992 | cast<Instruction>(Val: I)->getOpcode() == S.getOpcode(); |
| 9993 | })))) { |
| 9994 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n" ); |
| 9995 | return ScalarsVectorizationLegality(S, /*IsLegal=*/false); |
| 9996 | } |
| 9997 | |
| 9998 | // Don't handle scalable vectors |
| 9999 | if (S && S.getOpcode() == Instruction::ExtractElement && |
| 10000 | isa<ScalableVectorType>( |
| 10001 | Val: cast<ExtractElementInst>(Val: S.getMainOp())->getVectorOperandType())) { |
| 10002 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n" ); |
| 10003 | return ScalarsVectorizationLegality(S, /*IsLegal=*/false); |
| 10004 | } |
| 10005 | |
| 10006 | // Don't handle vectors. |
| 10007 | if (!SLPReVec && getValueType(V: VL.front())->isVectorTy()) { |
| 10008 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n" ); |
| 10009 | // Do not try to pack to avoid extra instructions here. |
| 10010 | return ScalarsVectorizationLegality(S, /*IsLegal=*/false, |
| 10011 | /*TryToFindDuplicates=*/false); |
| 10012 | } |
| 10013 | |
| 10014 | // If all of the operands are identical or constant we have a simple solution. |
| 10015 | // If we deal with insert/extract instructions, they all must have constant |
| 10016 | // indices, otherwise we should gather them, not try to vectorize. |
| 10017 | // If this is an alternate-op node with 2 elements whose operands would be |
| 10018 | // gathered, do not vectorize. |
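|  | // The lambda below (a heuristic) rejects 2-element alternate-opcode |
|  | // bundles whose operands neither feed other vectorizable code nor pair up |
|  | // well under the splat look-ahead score, since such nodes rarely pay off. |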
| 10019 | auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) { |
| 10020 | if (!S || !S.isAltShuffle() || VL.size() > 2) |
| 10021 | return false; |
| 10022 | if (VectorizableTree.size() < MinTreeSize) |
| 10023 | return false; |
| 10024 | if (Depth >= RecursionMaxDepth - 1) |
| 10025 | return true; |
| 10026 | // Check if all operands are extracts, part of vector node or can build a |
| 10027 | // regular vectorize node. |
| 10028 | SmallVector<unsigned, 8> InstsCount; |
| 10029 | for (Value *V : VL) { |
| 10030 | auto *I = cast<Instruction>(Val: V); |
| 10031 | InstsCount.push_back(Elt: count_if(Range: I->operand_values(), P: [](Value *Op) { |
| 10032 | return isa<Instruction>(Val: Op) || isVectorLikeInstWithConstOps(V: Op); |
| 10033 | })); |
| 10034 | } |
| 10035 | bool IsCommutative = |
| 10036 | isCommutative(I: S.getMainOp()) || isCommutative(I: S.getAltOp()); |
| 10037 | if ((IsCommutative && |
| 10038 | std::accumulate(first: InstsCount.begin(), last: InstsCount.end(), init: 0) < 2) || |
| 10039 | (!IsCommutative && |
| 10040 | all_of(Range&: InstsCount, P: [](unsigned ICnt) { return ICnt < 2; }))) |
| 10041 | return true; |
| 10042 | assert(VL.size() == 2 && "Expected only 2 alternate op instructions." ); |
| 10043 | SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates; |
| 10044 | auto *I1 = cast<Instruction>(Val: VL.front()); |
| 10045 | auto *I2 = cast<Instruction>(Val: VL.back()); |
| 10046 | for (int Op : seq<int>(Size: S.getMainOp()->getNumOperands())) |
| 10047 | Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op), |
| 10048 | Args: I2->getOperand(i: Op)); |
| 10049 | if (static_cast<unsigned>(count_if( |
| 10050 | Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) { |
| 10051 | return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat); |
| 10052 | })) >= S.getMainOp()->getNumOperands() / 2) |
| 10053 | return false; |
| 10054 | if (S.getMainOp()->getNumOperands() > 2) |
| 10055 | return true; |
| 10056 | if (IsCommutative) { |
| 10057 | // Check permuted operands. |
| 10058 | Candidates.clear(); |
| 10059 | for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op) |
| 10060 | Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op), |
| 10061 | Args: I2->getOperand(i: (Op + 1) % E)); |
| 10062 | if (any_of( |
| 10063 | Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) { |
| 10064 | return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat); |
| 10065 | })) |
| 10066 | return false; |
| 10067 | } |
| 10068 | return true; |
| 10069 | }; |
| 10070 | SmallVector<unsigned> SortedIndices; |
| 10071 | BasicBlock *BB = nullptr; |
| 10072 | bool IsScatterVectorizeUserTE = |
| 10073 | UserTreeIdx.UserTE && |
| 10074 | UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; |
| 10075 | bool AreAllSameBlock = S && allSameBlock(VL); |
| 10076 | bool AreScatterAllGEPSameBlock = |
| 10077 | (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() && |
| 10078 | VL.size() > 2 && |
| 10079 | all_of(Range&: VL, |
| 10080 | P: [&BB](Value *V) { |
| 10081 | auto *I = dyn_cast<GetElementPtrInst>(Val: V); |
| 10082 | if (!I) |
| 10083 | return doesNotNeedToBeScheduled(V); |
| 10084 | if (!BB) |
| 10085 | BB = I->getParent(); |
| 10086 | return BB == I->getParent() && I->getNumOperands() == 2; |
| 10087 | }) && |
| 10088 | BB && |
| 10089 | sortPtrAccesses(VL, ElemTy: UserTreeIdx.UserTE->getMainOp()->getType(), DL: *DL, SE&: *SE, |
| 10090 | SortedIndices)); |
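|  | // AreScatterAllGEPSameBlock: for a masked-gather (ScatterVectorize) user |
|  | // node, a bundle of GEP pointers from a single block that can be sorted by |
|  | // address is still acceptable even though the opcodes alone do not match. |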
| 10091 | bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock; |
| 10092 | if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) || |
| 10093 | (S && |
| 10094 | isa<InsertElementInst, ExtractValueInst, ExtractElementInst>( |
| 10095 | Val: S.getMainOp()) && |
| 10096 | !all_of(Range&: VL, P: isVectorLikeInstWithConstOps)) || |
| 10097 | NotProfitableForVectorization(VL)) { |
| 10098 | if (!S) { |
| 10099 | LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to " |
| 10100 | "C,S,B,O, small shuffle. \n" ); |
| 10101 | return ScalarsVectorizationLegality(S, /*IsLegal=*/false, |
| 10102 | /*TryToFindDuplicates=*/true, |
| 10103 | /*TrySplitVectorize=*/true); |
| 10104 | } |
| 10105 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n" ); |
| 10106 | return ScalarsVectorizationLegality(S, /*IsLegal=*/false); |
| 10107 | } |
| 10108 | |
| 10109 | // Don't vectorize ephemeral values. |
| 10110 | if (S && !EphValues.empty()) { |
| 10111 | for (Value *V : VL) { |
| 10112 | if (EphValues.count(Ptr: V)) { |
| 10113 | LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V |
| 10114 | << ") is ephemeral.\n" ); |
| 10115 | // Do not try to pack to avoid extra instructions here. |
| 10116 | return ScalarsVectorizationLegality(S, /*IsLegal=*/false, |
| 10117 | /*TryToFindDuplicates=*/false); |
| 10118 | } |
| 10119 | } |
| 10120 | } |
| 10121 | |
| 10122 | // We now know that this is a vector of instructions of the same type from |
| 10123 | // the same block. |
| 10124 | |
| 10125 | // Check whether any of the instructions in the bundle are already in the |
| 10126 | // tree and whether the node would not be profitable to vectorize as a |
| 10127 | // small alternate node. |
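|  | // For each lane, the helper below clears the lane's bit in Vectorized if |
|  | // the scalar is already vectorized in another node, and sets its bit in |
|  | // Extracted if it has extra users that are not all vectorized and would |
|  | // therefore need an extract. |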
| 10128 | if (S && S.isAltShuffle()) { |
| 10129 | auto GetNumVectorizedExtracted = [&]() { |
| 10130 | APInt Extracted = APInt::getZero(numBits: VL.size()); |
| 10131 | APInt Vectorized = APInt::getAllOnes(numBits: VL.size()); |
| 10132 | for (auto [Idx, V] : enumerate(First&: VL)) { |
| 10133 | auto *I = dyn_cast<Instruction>(Val: V); |
| 10134 | if (!I || doesNotNeedToBeScheduled(V: I) || |
| 10135 | all_of(Range: I->operands(), P: [&](const Use &U) { |
| 10136 | return isa<ExtractElementInst>(Val: U.get()); |
| 10137 | })) |
| 10138 | continue; |
| 10139 | if (isVectorized(V: I)) |
| 10140 | Vectorized.clearBit(BitPosition: Idx); |
| 10141 | else if (!I->hasOneUser() && !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList)) |
| 10142 | Extracted.setBit(Idx); |
| 10143 | } |
| 10144 | return std::make_pair(x&: Vectorized, y&: Extracted); |
| 10145 | }; |
| 10146 | auto [Vectorized, Extracted] = GetNumVectorizedExtracted(); |
| 10147 | constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput; |
| 10148 | bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2; |
| 10149 | if (!Vectorized.isAllOnes() && !PreferScalarize) { |
| 10150 | // Rough cost estimate to check whether the vector code (+ potential |
| 10151 | // extracts) is more profitable than the scalar code + buildvector. |
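|  | // The vector estimate pays a two-source shuffle plus extracts for the |
|  | // lanes recorded in Extracted; the scalar estimate pays inserts for the |
|  | // lanes still set in Vectorized. If scalarization looks cheaper, the |
|  | // bundle is rejected and will end up gathered. |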
| 10152 | Type *ScalarTy = VL.front()->getType(); |
| 10153 | auto *VecTy = getWidenedType(ScalarTy, VF: VL.size()); |
| 10154 | InstructionCost VectorizeCostEstimate = |
| 10155 | ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: {}, CostKind: Kind) + |
| 10156 | ::getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Extracted, |
| 10157 | /*Insert=*/false, /*Extract=*/true, CostKind: Kind); |
| 10158 | InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead( |
| 10159 | TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Vectorized, |
| 10160 | /*Insert=*/true, /*Extract=*/false, CostKind: Kind, /*ForPoisonSrc=*/false); |
| 10161 | PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate; |
| 10162 | } |
| 10163 | if (PreferScalarize) { |
| 10164 | LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate " |
| 10165 | "node is not profitable.\n" ); |
| 10166 | return ScalarsVectorizationLegality(S, /*IsLegal=*/false); |
| 10167 | } |
| 10168 | } |
| 10169 | |
| 10170 | // The reduction nodes (stored in UserIgnoreList) also should stay scalar. |
| 10171 | if (UserIgnoreList && !UserIgnoreList->empty()) { |
| 10172 | for (Value *V : VL) { |
| 10173 | if (UserIgnoreList->contains(V)) { |
| 10174 | LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n" ); |
| 10175 | return ScalarsVectorizationLegality(S, /*IsLegal=*/false); |
| 10176 | } |
| 10177 | } |
| 10178 | } |
| 10179 | |
| 10180 | // Special processing for sorted pointers for ScatterVectorize node with |
| 10181 | // constant indices only. |
| 10182 | if (!AreAllSameBlock && AreScatterAllGEPSameBlock) { |
| 10183 | assert(VL.front()->getType()->isPointerTy() && |
| 10184 | count_if(VL, IsaPred<GetElementPtrInst>) >= 2 && |
| 10185 | "Expected pointers only." ); |
| 10186 | // Reset S to make it GetElementPtr kind of node. |
| 10187 | const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>); |
| 10188 | assert(It != VL.end() && "Expected at least one GEP." ); |
| 10189 | S = getSameOpcode(VL: *It, TLI: *TLI); |
| 10190 | } |
| 10191 | |
| 10192 | // Check that all of the users of the scalars that we want to vectorize are |
| 10193 | // schedulable. |
| 10194 | Instruction *VL0 = S.getMainOp(); |
| 10195 | BB = VL0->getParent(); |
| 10196 | |
| 10197 | if (S && |
| 10198 | (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator()) || |
| 10199 | !DT->isReachableFromEntry(A: BB))) { |
| 10200 | // Don't go into unreachable blocks. They may contain instructions with |
| 10201 | // dependency cycles which confuse the final scheduling. |
| 10202 | // Do not vectorize EH and non-returning blocks, not profitable in most |
| 10203 | // cases. |
| 10204 | LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n" ); |
| 10205 | return ScalarsVectorizationLegality(S, /*IsLegal=*/false); |
| 10206 | } |
| 10207 | return ScalarsVectorizationLegality(S, /*IsLegal=*/true); |
| 10208 | } |
| 10209 | |
| 10210 | void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth, |
| 10211 | const EdgeInfo &UserTreeIdx, |
| 10212 | unsigned InterleaveFactor) { |
| 10213 | assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!" ); |
| 10214 | |
| 10215 | SmallVector<int> ReuseShuffleIndices; |
| 10216 | SmallVector<Value *> VL(VLRef); |
| 10217 | |
| 10218 | // Tries to build split node. |
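|  | // On success the bundle is represented as Op1 followed by Op2: a |
|  | // SplitVectorize entry records the lane reordering and each half becomes |
|  | // its own child node (loads and scalars already covered by an existing |
|  | // entry become gather nodes instead). |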
| 10219 | auto TrySplitNode = [&](const InstructionsState &LocalState) { |
| 10220 | SmallVector<Value *> Op1, Op2; |
| 10221 | OrdersType ReorderIndices; |
| 10222 | if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices)) |
| 10223 | return false; |
| 10224 | |
| 10225 | SmallVector<Value *> NewVL(VL.size()); |
| 10226 | copy(Range&: Op1, Out: NewVL.begin()); |
| 10227 | copy(Range&: Op2, Out: std::next(x: NewVL.begin(), n: Op1.size())); |
| 10228 | auto Invalid = ScheduleBundle::invalid(); |
| 10229 | auto *TE = newTreeEntry(VL, EntryState: TreeEntry::SplitVectorize, Bundle&: Invalid, S: LocalState, |
| 10230 | UserTreeIdx, ReuseShuffleIndices: {}, ReorderIndices); |
| 10231 | LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n" ; TE->dump()); |
| 10232 | auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) { |
| 10233 | InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI); |
| 10234 | if (S && (isa<LoadInst>(Val: S.getMainOp()) || |
| 10235 | getSameValuesTreeEntry(V: S.getMainOp(), VL: Op, /*SameVF=*/true))) { |
| 10236 | // Build gather node for loads, they will be gathered later. |
| 10237 | TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(), |
| 10238 | Args: Idx == 0 ? 0 : Op1.size()); |
| 10239 | (void)newTreeEntry(VL: Op, EntryState: TreeEntry::NeedToGather, Bundle&: Invalid, S, UserTreeIdx: {TE, Idx}); |
| 10240 | } else { |
| 10241 | TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(), |
| 10242 | Args: Idx == 0 ? 0 : Op1.size()); |
| 10243 | buildTreeRec(VLRef: Op, Depth, UserTreeIdx: {TE, Idx}); |
| 10244 | } |
| 10245 | }; |
| 10246 | AddNode(Op1, 0); |
| 10247 | AddNode(Op2, 1); |
| 10248 | return true; |
| 10249 | }; |
| 10250 | |
| 10251 | ScalarsVectorizationLegality Legality = |
| 10252 | getScalarsVectorizationLegality(VL, Depth, UserTreeIdx); |
| 10253 | const InstructionsState &S = Legality.getInstructionsState(); |
| 10254 | if (!Legality.isLegal()) { |
| 10255 | if (Legality.trySplitVectorize()) { |
| 10256 | auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL); |
| 10257 | // Last chance to try to vectorize alternate node. |
| 10258 | if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp))) |
| 10259 | return; |
| 10260 | } |
| 10261 | if (Legality.tryToFindDuplicates()) |
| 10262 | tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S, UserTreeIdx); |
| 10263 | |
| 10264 | newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); |
| 10265 | return; |
| 10266 | } |
| 10267 | |
| 10268 | // FIXME: investigate if there are profitable cases for VL.size() <= 4. |
| 10269 | if (S.isAltShuffle() && TrySplitNode(S)) |
| 10270 | return; |
| 10271 | |
| 10272 | // Check that every instruction appears once in this bundle. |
| 10273 | if (!tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S, UserTreeIdx, |
| 10274 | /*TryPad=*/true)) { |
| 10275 | newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); |
| 10276 | return; |
| 10277 | } |
| 10278 | |
| 10279 | // Perform specific checks for each particular instruction kind. |
| 10280 | bool IsScatterVectorizeUserTE = |
| 10281 | UserTreeIdx.UserTE && |
| 10282 | UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; |
| 10283 | OrdersType CurrentOrder; |
| 10284 | SmallVector<Value *> PointerOps; |
| 10285 | TreeEntry::EntryState State = getScalarsVectorizationState( |
| 10286 | S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps); |
| 10287 | if (State == TreeEntry::NeedToGather) { |
| 10288 | newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); |
| 10289 | return; |
| 10290 | } |
| 10291 | |
| 10292 | Instruction *VL0 = S.getMainOp(); |
| 10293 | BasicBlock *BB = VL0->getParent(); |
| 10294 | auto &BSRef = BlocksSchedules[BB]; |
| 10295 | if (!BSRef) |
| 10296 | BSRef = std::make_unique<BlockScheduling>(args&: BB); |
| 10297 | |
| 10298 | BlockScheduling &BS = *BSRef; |
| 10299 | |
| 10300 | SetVector<Value *> UniqueValues(llvm::from_range, VL); |
| 10301 | std::optional<ScheduleBundle *> BundlePtr = |
| 10302 | BS.tryScheduleBundle(VL: UniqueValues.getArrayRef(), SLP: this, S); |
| 10303 | #ifdef EXPENSIVE_CHECKS |
| 10304 | // Make sure we didn't break any internal invariants |
| 10305 | BS.verify(); |
| 10306 | #endif |
| 10307 | if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) { |
| 10308 | LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n" ); |
| 10309 | // Last chance to try to vectorize alternate node. |
| 10310 | if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S)) |
| 10311 | return; |
| 10312 | newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); |
| 10313 | NonScheduledFirst.insert(Ptr: VL.front()); |
| 10314 | if (S.getOpcode() == Instruction::Load && |
| 10315 | BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit) |
| 10316 | registerNonVectorizableLoads(VL: ArrayRef(VL)); |
| 10317 | return; |
| 10318 | } |
| 10319 | InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); |
| 10320 | SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL); |
| 10321 | ScheduleBundle Empty; |
| 10322 | ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty; |
| 10323 | LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n" ); |
| 10324 | |
| 10325 | unsigned ShuffleOrOp = |
| 10326 | S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); |
| 10327 | auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) { |
| 10328 | // Postpone PHI node creation. |
| 10329 | SmallVector<unsigned> PHIOps; |
| 10330 | for (unsigned I : seq<unsigned>(Operands.size())) { |
| 10331 | ArrayRef<Value *> Op = Operands[I]; |
| 10332 | if (Op.empty()) |
| 10333 | continue; |
| 10334 | InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI); |
| 10335 | if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle()) |
| 10336 | buildTreeRec(VLRef: Op, Depth: Depth + 1, UserTreeIdx: {TE, I}); |
| 10337 | else |
| 10338 | PHIOps.push_back(Elt: I); |
| 10339 | } |
| 10340 | for (unsigned I : PHIOps) |
| 10341 | buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I}); |
| 10342 | }; |
| 10343 | switch (ShuffleOrOp) { |
| 10344 | case Instruction::PHI: { |
| 10345 | TreeEntry *TE = |
| 10346 | newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices); |
| 10347 | LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n" ; |
| 10348 | TE->dump()); |
| 10349 | |
| 10350 | TE->setOperands(Operands); |
| 10351 | CreateOperandNodes(TE, Operands); |
| 10352 | return; |
| 10353 | } |
| 10354 | case Instruction::ExtractValue: |
| 10355 | case Instruction::ExtractElement: { |
| 10356 | if (CurrentOrder.empty()) { |
| 10357 | LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n" ); |
| 10358 | } else { |
| 10359 | LLVM_DEBUG({ |
| 10360 | dbgs() << "SLP: Reusing or shuffling of reordered extract sequence " |
| 10361 | "with order" ; |
| 10362 | for (unsigned Idx : CurrentOrder) |
| 10363 | dbgs() << " " << Idx; |
| 10364 | dbgs() << "\n" ; |
| 10365 | }); |
| 10366 | fixupOrderingIndices(Order: CurrentOrder); |
| 10367 | } |
| 10368 | // Create the tree entry with the (possibly fixed-up) extract order as its |
| 10369 | // reorder indices. |
| 10370 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, |
| 10371 | ReuseShuffleIndices, ReorderIndices: CurrentOrder); |
| 10372 | LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry " |
| 10373 | "(ExtractValueInst/ExtractElementInst).\n" ; |
| 10374 | TE->dump()); |
| 10375 | // This is a special case, as it does not gather, but at the same time |
| 10376 | // we are not extending buildTreeRec() towards the operands. |
| 10377 | TE->setOperands(Operands); |
| 10378 | return; |
| 10379 | } |
| 10380 | case Instruction::InsertElement: { |
| 10381 | assert(ReuseShuffleIndices.empty() && "All inserts should be unique" ); |
| 10382 | |
| 10383 | auto OrdCompare = [](const std::pair<int, int> &P1, |
| 10384 | const std::pair<int, int> &P2) { |
| 10385 | return P1.first > P2.first; |
| 10386 | }; |
| 10387 | PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>, |
| 10388 | decltype(OrdCompare)> |
| 10389 | Indices(OrdCompare); |
| 10390 | for (int I = 0, E = VL.size(); I < E; ++I) { |
| 10391 | unsigned Idx = *getElementIndex(Inst: VL[I]); |
| 10392 | Indices.emplace(args&: Idx, args&: I); |
| 10393 | } |
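|  | // CurrentOrder[lane] becomes the rank of that lane's insert position; e.g. |
|  | // lanes inserting at positions {2, 0, 1} produce CurrentOrder = {2, 0, 1}, |
|  | // which is dropped below if it turns out to be the identity. |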
| 10394 | OrdersType CurrentOrder(VL.size(), VL.size()); |
| 10395 | bool IsIdentity = true; |
| 10396 | for (int I = 0, E = VL.size(); I < E; ++I) { |
| 10397 | CurrentOrder[Indices.top().second] = I; |
| 10398 | IsIdentity &= Indices.top().second == I; |
| 10399 | Indices.pop(); |
| 10400 | } |
| 10401 | if (IsIdentity) |
| 10402 | CurrentOrder.clear(); |
| 10403 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, |
| 10404 | ReuseShuffleIndices: {}, ReorderIndices: CurrentOrder); |
| 10405 | LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n" ; |
| 10406 | TE->dump()); |
| 10407 | |
| 10408 | TE->setOperands(Operands); |
| 10409 | buildTreeRec(VLRef: TE->getOperand(OpIdx: 1), Depth: Depth + 1, UserTreeIdx: {TE, 1}); |
| 10410 | return; |
| 10411 | } |
| 10412 | case Instruction::Load: { |
| 10413 | // Check that a vectorized load would load the same memory as a scalar |
| 10414 | // load. For example, we don't want to vectorize loads that are smaller |
| 10415 | // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM |
| 10416 | // treats loading/storing it as an i8 struct. If we vectorize loads/stores |
| 10417 | // from such a struct, we read/write packed bits disagreeing with the |
| 10418 | // unvectorized version. |
| 10419 | TreeEntry *TE = nullptr; |
| 10420 | fixupOrderingIndices(Order: CurrentOrder); |
| 10421 | switch (State) { |
| 10422 | case TreeEntry::Vectorize: |
| 10423 | TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, |
| 10424 | ReuseShuffleIndices, ReorderIndices: CurrentOrder, InterleaveFactor); |
| 10425 | if (CurrentOrder.empty()) |
| 10426 | LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n" ; |
| 10427 | TE->dump()); |
| 10428 | else |
| 10429 | LLVM_DEBUG(dbgs() |
| 10430 | << "SLP: added a new TreeEntry (jumbled LoadInst).\n" ; |
| 10431 | TE->dump()); |
| 10432 | break; |
| 10433 | case TreeEntry::CompressVectorize: |
| 10434 | // Vectorizing non-consecutive loads with (masked)load + compress. |
| 10435 | TE = newTreeEntry(VL, EntryState: TreeEntry::CompressVectorize, Bundle, S, |
| 10436 | UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder); |
| 10437 | LLVM_DEBUG( |
| 10438 | dbgs() |
| 10439 | << "SLP: added a new TreeEntry (masked LoadInst + compress).\n" ; |
| 10440 | TE->dump()); |
| 10441 | break; |
| 10442 | case TreeEntry::StridedVectorize: |
| 10443 | // Vectorizing non-consecutive loads with a strided load intrinsic. |
| 10444 | TE = newTreeEntry(VL, EntryState: TreeEntry::StridedVectorize, Bundle, S, |
| 10445 | UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder); |
| 10446 | LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n" ; |
| 10447 | TE->dump()); |
| 10448 | break; |
| 10449 | case TreeEntry::ScatterVectorize: |
| 10450 | // Vectorizing non-consecutive loads with `llvm.masked.gather`. |
| 10451 | TE = newTreeEntry(VL, EntryState: TreeEntry::ScatterVectorize, Bundle, S, |
| 10452 | UserTreeIdx, ReuseShuffleIndices); |
| 10453 | LLVM_DEBUG( |
| 10454 | dbgs() |
| 10455 | << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n" ; |
| 10456 | TE->dump()); |
| 10457 | break; |
| 10458 | case TreeEntry::CombinedVectorize: |
| 10459 | case TreeEntry::SplitVectorize: |
| 10460 | case TreeEntry::NeedToGather: |
| 10461 | llvm_unreachable("Unexpected loads state." ); |
| 10462 | } |
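|  | // For reordered (jumbled) loads the single pointer-operand list is |
|  | // permuted to match CurrentOrder; ScatterVectorize keeps the original |
|  | // order and instead vectorizes the pointers as a separate child node. |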
| 10463 | if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) { |
| 10464 | assert(Operands.size() == 1 && "Expected a single operand only" ); |
| 10465 | SmallVector<int> Mask; |
| 10466 | inversePermutation(Indices: CurrentOrder, Mask); |
| 10467 | reorderScalars(Scalars&: Operands.front(), Mask); |
| 10468 | } |
| 10469 | TE->setOperands(Operands); |
| 10470 | if (State == TreeEntry::ScatterVectorize) |
| 10471 | buildTreeRec(VLRef: PointerOps, Depth: Depth + 1, UserTreeIdx: {TE, 0}); |
| 10472 | return; |
| 10473 | } |
| 10474 | case Instruction::ZExt: |
| 10475 | case Instruction::SExt: |
| 10476 | case Instruction::FPToUI: |
| 10477 | case Instruction::FPToSI: |
| 10478 | case Instruction::FPExt: |
| 10479 | case Instruction::PtrToInt: |
| 10480 | case Instruction::IntToPtr: |
| 10481 | case Instruction::SIToFP: |
| 10482 | case Instruction::UIToFP: |
| 10483 | case Instruction::Trunc: |
| 10484 | case Instruction::FPTrunc: |
| 10485 | case Instruction::BitCast: { |
| 10486 | auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or( |
| 10487 | u: std::make_pair(x: std::numeric_limits<unsigned>::min(), |
| 10488 | y: std::numeric_limits<unsigned>::max())); |
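|  | // Track the widest and narrowest integer widths seen on either side of |
|  | // the ext/trunc casts in the tree; this feeds the later bit-width |
|  | // minimization analysis. |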
| 10489 | if (ShuffleOrOp == Instruction::ZExt || |
| 10490 | ShuffleOrOp == Instruction::SExt) { |
| 10491 | CastMaxMinBWSizes = std::make_pair( |
| 10492 | x: std::max<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()), |
| 10493 | b: PrevMaxBW), |
| 10494 | y: std::min<unsigned>( |
| 10495 | a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()), |
| 10496 | b: PrevMinBW)); |
| 10497 | } else if (ShuffleOrOp == Instruction::Trunc) { |
| 10498 | CastMaxMinBWSizes = std::make_pair( |
| 10499 | x: std::max<unsigned>( |
| 10500 | a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()), |
| 10501 | b: PrevMaxBW), |
| 10502 | y: std::min<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()), |
| 10503 | b: PrevMinBW)); |
| 10504 | } |
| 10505 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, |
| 10506 | ReuseShuffleIndices); |
| 10507 | LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n" ; |
| 10508 | TE->dump()); |
| 10509 | |
| 10510 | TE->setOperands(Operands); |
| 10511 | for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands())) |
| 10512 | buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I}); |
| 10513 | if (ShuffleOrOp == Instruction::Trunc) { |
| 10514 | ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx); |
| 10515 | } else if (ShuffleOrOp == Instruction::SIToFP || |
| 10516 | ShuffleOrOp == Instruction::UIToFP) { |
| 10517 | unsigned NumSignBits = |
| 10518 | ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT); |
| 10519 | if (auto *OpI = dyn_cast<Instruction>(Val: VL0->getOperand(i: 0))) { |
| 10520 | APInt Mask = DB->getDemandedBits(I: OpI); |
| 10521 | NumSignBits = std::max(a: NumSignBits, b: Mask.countl_zero()); |
| 10522 | } |
| 10523 | if (NumSignBits * 2 >= |
| 10524 | DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType())) |
| 10525 | ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx); |
| 10526 | } |
| 10527 | return; |
| 10528 | } |
| 10529 | case Instruction::ICmp: |
| 10530 | case Instruction::FCmp: { |
| 10531 | // Check that all of the compares have the same predicate. |
| 10532 | CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate(); |
| 10533 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, |
| 10534 | ReuseShuffleIndices); |
| 10535 | LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n" ; |
| 10536 | TE->dump()); |
| 10537 | |
| 10538 | VLOperands Ops(VL, Operands, S, *this); |
| 10539 | if (cast<CmpInst>(Val: VL0)->isCommutative()) { |
| 10540 | // Commutative predicate - collect + sort operands of the instructions |
| 10541 | // so that each side is more likely to have the same opcode. |
| 10542 | assert(P0 == CmpInst::getSwappedPredicate(P0) && |
| 10543 | "Commutative Predicate mismatch" ); |
| 10544 | Ops.reorder(); |
| 10545 | Operands.front() = Ops.getVL(OpIdx: 0); |
| 10546 | Operands.back() = Ops.getVL(OpIdx: 1); |
| 10547 | } else { |
| 10548 | // Collect operands - commute if it uses the swapped predicate. |
| 10549 | for (auto [Idx, V] : enumerate(First&: VL)) { |
| 10550 | if (isa<PoisonValue>(Val: V)) |
| 10551 | continue; |
| 10552 | auto *Cmp = cast<CmpInst>(Val: V); |
| 10553 | if (Cmp->getPredicate() != P0) |
| 10554 | std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]); |
| 10555 | } |
| 10556 | } |
| 10557 | TE->setOperands(Operands); |
| 10558 | buildTreeRec(VLRef: Operands.front(), Depth: Depth + 1, UserTreeIdx: {TE, 0}); |
| 10559 | buildTreeRec(VLRef: Operands.back(), Depth: Depth + 1, UserTreeIdx: {TE, 1}); |
| 10560 | if (ShuffleOrOp == Instruction::ICmp) { |
| 10561 | unsigned NumSignBits0 = |
| 10562 | ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT); |
| 10563 | if (NumSignBits0 * 2 >= |
| 10564 | DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType())) |
| 10565 | ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx); |
| 10566 | unsigned NumSignBits1 = |
| 10567 | ComputeNumSignBits(Op: VL0->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT); |
| 10568 | if (NumSignBits1 * 2 >= |
| 10569 | DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 1)->getType())) |
| 10570 | ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 1)->Idx); |
| 10571 | } |
| 10572 | return; |
| 10573 | } |
| 10574 | case Instruction::Select: |
| 10575 | case Instruction::FNeg: |
| 10576 | case Instruction::Add: |
| 10577 | case Instruction::FAdd: |
| 10578 | case Instruction::Sub: |
| 10579 | case Instruction::FSub: |
| 10580 | case Instruction::Mul: |
| 10581 | case Instruction::FMul: |
| 10582 | case Instruction::UDiv: |
| 10583 | case Instruction::SDiv: |
| 10584 | case Instruction::FDiv: |
| 10585 | case Instruction::URem: |
| 10586 | case Instruction::SRem: |
| 10587 | case Instruction::FRem: |
| 10588 | case Instruction::Shl: |
| 10589 | case Instruction::LShr: |
| 10590 | case Instruction::AShr: |
| 10591 | case Instruction::And: |
| 10592 | case Instruction::Or: |
| 10593 | case Instruction::Xor: |
| 10594 | case Instruction::Freeze: { |
| 10595 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, |
| 10596 | ReuseShuffleIndices); |
| 10597 | LLVM_DEBUG( |
| 10598 | dbgs() << "SLP: added a new TreeEntry " |
| 10599 | "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n" ; |
| 10600 | TE->dump()); |
| 10601 | |
| 10602 | if (isa<BinaryOperator>(Val: VL0) && isCommutative(I: VL0)) { |
| 10603 | VLOperands Ops(VL, Operands, S, *this); |
| 10604 | Ops.reorder(); |
| 10605 | Operands[0] = Ops.getVL(OpIdx: 0); |
| 10606 | Operands[1] = Ops.getVL(OpIdx: 1); |
| 10607 | } |
| 10608 | TE->setOperands(Operands); |
| 10609 | for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands())) |
| 10610 | buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I}); |
| 10611 | return; |
| 10612 | } |
| 10613 | case Instruction::GetElementPtr: { |
| 10614 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, |
| 10615 | ReuseShuffleIndices); |
| 10616 | LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n" ; |
| 10617 | TE->dump()); |
| 10618 | TE->setOperands(Operands); |
| 10619 | |
| 10620 | for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I) |
| 10621 | buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I}); |
| 10622 | return; |
| 10623 | } |
| 10624 | case Instruction::Store: { |
| 10625 | bool Consecutive = CurrentOrder.empty(); |
| 10626 | if (!Consecutive) |
| 10627 | fixupOrderingIndices(Order: CurrentOrder); |
| 10628 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, |
| 10629 | ReuseShuffleIndices, ReorderIndices: CurrentOrder); |
| 10630 | if (Consecutive) |
| 10631 | LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n" ; |
| 10632 | TE->dump()); |
| 10633 | else |
| 10634 | LLVM_DEBUG( |
| 10635 | dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n" ; |
| 10636 | TE->dump()); |
| 10637 | TE->setOperands(Operands); |
| 10638 | buildTreeRec(VLRef: TE->getOperand(OpIdx: 0), Depth: Depth + 1, UserTreeIdx: {TE, 0}); |
| 10639 | return; |
| 10640 | } |
| 10641 | case Instruction::Call: { |
| 10642 | // Check if the calls are all to the same vectorizable intrinsic or |
| 10643 | // library function. |
| 10644 | CallInst *CI = cast<CallInst>(Val: VL0); |
| 10645 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
| 10646 | |
| 10647 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, |
| 10648 | ReuseShuffleIndices); |
| 10649 | LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n" ; |
| 10650 | TE->dump()); |
| 10651 | if (isCommutative(I: VL0)) { |
| 10652 | VLOperands Ops(VL, Operands, S, *this); |
| 10653 | Ops.reorder(); |
| 10654 | Operands[0] = Ops.getVL(OpIdx: 0); |
| 10655 | Operands[1] = Ops.getVL(OpIdx: 1); |
| 10656 | } |
| 10657 | TE->setOperands(Operands); |
| 10658 | for (unsigned I : seq<unsigned>(Size: CI->arg_size())) { |
| 10659 | // For scalar operands there is no need to create an entry since there is |
| 10660 | // nothing to vectorize. |
| 10661 | if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI)) |
| 10662 | continue; |
| 10663 | buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I}); |
| 10664 | } |
| 10665 | return; |
| 10666 | } |
| 10667 | case Instruction::ShuffleVector: { |
| 10668 | TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, |
| 10669 | ReuseShuffleIndices); |
| 10670 | if (S.isAltShuffle()) { |
| 10671 | LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n" ; |
| 10672 | TE->dump()); |
| 10673 | } else { |
| 10674 | assert(SLPReVec && "Only supported by REVEC." ); |
| 10675 | LLVM_DEBUG( |
| 10676 | dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n" ; |
| 10677 | TE->dump()); |
| 10678 | } |
| 10679 | |
| 10680 | // Reorder operands if reordering would enable vectorization. |
| 10681 | auto *CI = dyn_cast<CmpInst>(Val: VL0); |
| 10682 | if (CI && any_of(Range&: VL, P: [](Value *V) { |
| 10683 | return !isa<PoisonValue>(Val: V) && !cast<CmpInst>(Val: V)->isCommutative(); |
| 10684 | })) { |
| 10685 | auto *MainCI = cast<CmpInst>(Val: S.getMainOp()); |
| 10686 | auto *AltCI = cast<CmpInst>(Val: S.getAltOp()); |
| 10687 | CmpInst::Predicate MainP = MainCI->getPredicate(); |
| 10688 | CmpInst::Predicate AltP = AltCI->getPredicate(); |
| 10689 | assert(MainP != AltP && |
| 10690 | "Expected different main/alternate predicates." ); |
| 10691 | // Collect operands - commute if it uses the swapped predicate or |
| 10692 | // alternate operation. |
| 10693 | for (auto [Idx, V] : enumerate(First&: VL)) { |
| 10694 | if (isa<PoisonValue>(Val: V)) |
| 10695 | continue; |
| 10696 | auto *Cmp = cast<CmpInst>(Val: V); |
| 10697 | |
| 10698 | if (isAlternateInstruction(I: Cmp, MainOp: MainCI, AltOp: AltCI, TLI: *TLI)) { |
| 10699 | if (AltP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate())) |
| 10700 | std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]); |
| 10701 | } else { |
| 10702 | if (MainP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate())) |
| 10703 | std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]); |
| 10704 | } |
| 10705 | } |
| 10706 | TE->setOperands(Operands); |
| 10707 | buildTreeRec(VLRef: Operands.front(), Depth: Depth + 1, UserTreeIdx: {TE, 0}); |
| 10708 | buildTreeRec(VLRef: Operands.back(), Depth: Depth + 1, UserTreeIdx: {TE, 1}); |
| 10709 | return; |
| 10710 | } |
| 10711 | |
| 10712 | if (isa<BinaryOperator>(Val: VL0) || CI) { |
| 10713 | VLOperands Ops(VL, Operands, S, *this); |
| 10714 | Ops.reorder(); |
| 10715 | Operands[0] = Ops.getVL(OpIdx: 0); |
| 10716 | Operands[1] = Ops.getVL(OpIdx: 1); |
| 10717 | } |
| 10718 | TE->setOperands(Operands); |
| 10719 | for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands())) |
| 10720 | buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I}); |
| 10721 | return; |
| 10722 | } |
| 10723 | default: |
| 10724 | break; |
| 10725 | } |
| 10726 | llvm_unreachable("Unexpected vectorization of the instructions." ); |
| 10727 | } |
| 10728 | |
| 10729 | unsigned BoUpSLP::canMapToVector(Type *T) const { |
| 10730 | unsigned N = 1; |
| 10731 | Type *EltTy = T; |
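|  | // Illustrative example: a homogeneous aggregate such as [4 x <2 x float>] |
|  | // flattens to N = 8 float lanes, provided the widened vector fits the |
|  | // register-size limits and its store size matches the aggregate exactly. |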
| 10732 | |
| 10733 | while (isa<StructType, ArrayType, FixedVectorType>(Val: EltTy)) { |
| 10734 | if (EltTy->isEmptyTy()) |
| 10735 | return 0; |
| 10736 | if (auto *ST = dyn_cast<StructType>(Val: EltTy)) { |
| 10737 | // Check that struct is homogeneous. |
| 10738 | for (const auto *Ty : ST->elements()) |
| 10739 | if (Ty != *ST->element_begin()) |
| 10740 | return 0; |
| 10741 | N *= ST->getNumElements(); |
| 10742 | EltTy = *ST->element_begin(); |
| 10743 | } else if (auto *AT = dyn_cast<ArrayType>(Val: EltTy)) { |
| 10744 | N *= AT->getNumElements(); |
| 10745 | EltTy = AT->getElementType(); |
| 10746 | } else { |
| 10747 | auto *VT = cast<FixedVectorType>(Val: EltTy); |
| 10748 | N *= VT->getNumElements(); |
| 10749 | EltTy = VT->getElementType(); |
| 10750 | } |
| 10751 | } |
| 10752 | |
| 10753 | if (!isValidElementType(Ty: EltTy)) |
| 10754 | return 0; |
| 10755 | size_t VTSize = DL->getTypeStoreSizeInBits(Ty: getWidenedType(ScalarTy: EltTy, VF: N)); |
| 10756 | if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || |
| 10757 | VTSize != DL->getTypeStoreSizeInBits(Ty: T)) |
| 10758 | return 0; |
| 10759 | return N; |
| 10760 | } |
| 10761 | |
| 10762 | bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, |
| 10763 | SmallVectorImpl<unsigned> &CurrentOrder, |
| 10764 | bool ResizeAllowed) const { |
| 10765 | const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst, ExtractValueInst>); |
| 10766 | assert(It != VL.end() && "Expected at least one extract instruction." ); |
| 10767 | auto *E0 = cast<Instruction>(Val: *It); |
| 10768 | assert( |
| 10769 | all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) && |
| 10770 | "Invalid opcode" ); |
| 10771 | // Check if all of the extracts come from the same vector and from the |
| 10772 | // correct offset. |
| 10773 | Value *Vec = E0->getOperand(i: 0); |
| 10774 | |
| 10775 | CurrentOrder.clear(); |
| 10776 | |
| 10777 | // We have to extract from a vector/aggregate with the same number of elements. |
| 10778 | unsigned NElts; |
| 10779 | if (E0->getOpcode() == Instruction::ExtractValue) { |
| 10780 | NElts = canMapToVector(T: Vec->getType()); |
| 10781 | if (!NElts) |
| 10782 | return false; |
| 10783 | // Check if load can be rewritten as load of vector. |
| 10784 | LoadInst *LI = dyn_cast<LoadInst>(Val: Vec); |
| 10785 | if (!LI || !LI->isSimple() || !LI->hasNUses(N: VL.size())) |
| 10786 | return false; |
| 10787 | } else { |
| 10788 | NElts = cast<FixedVectorType>(Val: Vec->getType())->getNumElements(); |
| 10789 | } |
| 10790 | |
| 10791 | unsigned E = VL.size(); |
| 10792 | if (!ResizeAllowed && NElts != E) |
| 10793 | return false; |
| 10794 | SmallVector<int> Indices(E, PoisonMaskElem); |
| 10795 | unsigned MinIdx = NElts, MaxIdx = 0; |
| 10796 | for (auto [I, V] : enumerate(First&: VL)) { |
| 10797 | auto *Inst = dyn_cast<Instruction>(Val: V); |
| 10798 | if (!Inst) |
| 10799 | continue; |
| 10800 | if (Inst->getOperand(i: 0) != Vec) |
| 10801 | return false; |
| 10802 | if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst)) |
| 10803 | if (isa<UndefValue>(Val: EE->getIndexOperand())) |
| 10804 | continue; |
| 10805 | std::optional<unsigned> Idx = getExtractIndex(E: Inst); |
| 10806 | if (!Idx) |
| 10807 | return false; |
| 10808 | const unsigned ExtIdx = *Idx; |
| 10809 | if (ExtIdx >= NElts) |
| 10810 | continue; |
| 10811 | Indices[I] = ExtIdx; |
| 10812 | if (MinIdx > ExtIdx) |
| 10813 | MinIdx = ExtIdx; |
| 10814 | if (MaxIdx < ExtIdx) |
| 10815 | MaxIdx = ExtIdx; |
| 10816 | } |
| 10817 | if (MaxIdx - MinIdx + 1 > E) |
| 10818 | return false; |
| 10819 | if (MaxIdx + 1 <= E) |
| 10820 | MinIdx = 0; |
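|  | // Example: extracts at indices {4, 5, 6, 7} from an 8-wide source with |
|  | // E = 4 keep MinIdx = 4, so the indices are rebased to {0, 1, 2, 3} below, |
|  | // while extracts at {0, 1, 2, 3} keep absolute indexing (MaxIdx + 1 <= E). |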
| 10821 | |
| 10822 | // Check that all of the indices extract from the correct offset. |
| 10823 | bool ShouldKeepOrder = true; |
| 10824 | // Assign to all items the initial value E so we can check whether an extract |
| 10825 | // instruction index was used already. |
| 10826 | // Also, later we can check that all the indices are used and that we have |
| 10827 | // consecutive accesses in the extract instructions, by checking that no |
| 10828 | // element of CurrentOrder still has the value E. |
| 10829 | CurrentOrder.assign(NumElts: E, Elt: E); |
| 10830 | for (unsigned I = 0; I < E; ++I) { |
| 10831 | if (Indices[I] == PoisonMaskElem) |
| 10832 | continue; |
| 10833 | const unsigned ExtIdx = Indices[I] - MinIdx; |
| 10834 | if (CurrentOrder[ExtIdx] != E) { |
| 10835 | CurrentOrder.clear(); |
| 10836 | return false; |
| 10837 | } |
| 10838 | ShouldKeepOrder &= ExtIdx == I; |
| 10839 | CurrentOrder[ExtIdx] = I; |
| 10840 | } |
| 10841 | if (ShouldKeepOrder) |
| 10842 | CurrentOrder.clear(); |
| 10843 | |
| 10844 | return ShouldKeepOrder; |
| 10845 | } |
| 10846 | |
| 10847 | bool BoUpSLP::areAllUsersVectorized( |
| 10848 | Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const { |
| 10849 | return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(V: I))) || |
| 10850 | all_of(Range: I->users(), P: [this](User *U) { |
| 10851 | return isVectorized(V: U) || isVectorLikeInstWithConstOps(V: U) || |
| 10852 | (isa<ExtractElementInst>(Val: U) && MustGather.contains(Ptr: U)); |
| 10853 | }); |
| 10854 | } |
| 10855 | |
| 10856 | void BoUpSLP::TreeEntry::buildAltOpShuffleMask( |
| 10857 | const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask, |
| 10858 | SmallVectorImpl<Value *> *OpScalars, |
| 10859 | SmallVectorImpl<Value *> *AltScalars) const { |
| 10860 | unsigned Sz = Scalars.size(); |
| 10861 | Mask.assign(NumElts: Sz, Elt: PoisonMaskElem); |
| 10862 | SmallVector<int> OrderMask; |
| 10863 | if (!ReorderIndices.empty()) |
| 10864 | inversePermutation(Indices: ReorderIndices, Mask&: OrderMask); |
| 10865 | for (unsigned I = 0; I < Sz; ++I) { |
| 10866 | unsigned Idx = I; |
| 10867 | if (!ReorderIndices.empty()) |
| 10868 | Idx = OrderMask[I]; |
| 10869 | if (isa<PoisonValue>(Val: Scalars[Idx])) |
| 10870 | continue; |
| 10871 | auto *OpInst = cast<Instruction>(Val: Scalars[Idx]); |
| 10872 | if (IsAltOp(OpInst)) { |
| 10873 | Mask[I] = Sz + Idx; |
| 10874 | if (AltScalars) |
| 10875 | AltScalars->push_back(Elt: OpInst); |
| 10876 | } else { |
| 10877 | Mask[I] = Idx; |
| 10878 | if (OpScalars) |
| 10879 | OpScalars->push_back(Elt: OpInst); |
| 10880 | } |
| 10881 | } |
| 10882 | if (!ReuseShuffleIndices.empty()) { |
| 10883 | SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem); |
| 10884 | transform(Range: ReuseShuffleIndices, d_first: NewMask.begin(), F: [&Mask](int Idx) { |
| 10885 | return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem; |
| 10886 | }); |
| 10887 | Mask.swap(RHS&: NewMask); |
| 10888 | } |
| 10889 | } |
| 10890 | |
| 10891 | static bool isMainInstruction(Instruction *I, Instruction *MainOp, |
| 10892 | Instruction *AltOp, |
| 10893 | const TargetLibraryInfo &TLI) { |
| 10894 | return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp; |
| 10895 | } |
| 10896 | |
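| | /// A brief illustration (not part of the original source): for an alternate |
| | /// compare group with MainOp = icmp slt and AltOp = icmp sgt, an instruction |
| | /// whose predicate matches AltOp (directly or as the swapped predicate) is |
| | /// classified below as the alternate operation, while one matching MainOp is |
| | /// classified as the main operation. |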
| 10897 | static bool isAlternateInstruction(Instruction *I, Instruction *MainOp, |
| 10898 | Instruction *AltOp, |
| 10899 | const TargetLibraryInfo &TLI) { |
| 10900 | if (auto *MainCI = dyn_cast<CmpInst>(Val: MainOp)) { |
| 10901 | auto *AltCI = cast<CmpInst>(Val: AltOp); |
| 10902 | CmpInst::Predicate MainP = MainCI->getPredicate(); |
| 10903 | [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate(); |
| 10904 | assert(MainP != AltP && "Expected different main/alternate predicates." ); |
| 10905 | auto *CI = cast<CmpInst>(Val: I); |
| 10906 | if (isCmpSameOrSwapped(BaseCI: MainCI, CI, TLI)) |
| 10907 | return false; |
| 10908 | if (isCmpSameOrSwapped(BaseCI: AltCI, CI, TLI)) |
| 10909 | return true; |
| 10910 | CmpInst::Predicate P = CI->getPredicate(); |
| 10911 | CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(pred: P); |
| 10912 | |
| 10913 | assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) && |
| 10914 | "CmpInst expected to match either main or alternate predicate or " |
| 10915 | "their swap." ); |
| 10916 | return MainP != P && MainP != SwappedP; |
| 10917 | } |
| 10918 | return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp; |
| 10919 | } |
| 10920 | |
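| | /// Illustrative note (not in the original source): for Ops = {8, 8, 8, 8} the |
| | /// classification below yields OK_UniformConstantValue with OP_PowerOf2; for |
| | /// Ops = {1, 2, 4, 8} it yields OK_NonUniformConstantValue with OP_PowerOf2; |
| | /// a mix of arbitrary non-constant values falls back to OK_AnyValue / OP_None. |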
| 10921 | TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) { |
| 10922 | assert(!Ops.empty()); |
| 10923 | const auto *Op0 = Ops.front(); |
| 10924 | |
| 10925 | const bool IsConstant = all_of(Range&: Ops, P: [](Value *V) { |
| 10926 | // TODO: We should allow undef elements here |
| 10927 | return isConstant(V) && !isa<UndefValue>(Val: V); |
| 10928 | }); |
| 10929 | const bool IsUniform = all_of(Range&: Ops, P: [=](Value *V) { |
| 10930 | // TODO: We should allow undef elements here |
| 10931 | return V == Op0; |
| 10932 | }); |
| 10933 | const bool IsPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) { |
| 10934 | // TODO: We should allow undef elements here |
| 10935 | if (auto *CI = dyn_cast<ConstantInt>(Val: V)) |
| 10936 | return CI->getValue().isPowerOf2(); |
| 10937 | return false; |
| 10938 | }); |
| 10939 | const bool IsNegatedPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) { |
| 10940 | // TODO: We should allow undef elements here |
| 10941 | if (auto *CI = dyn_cast<ConstantInt>(Val: V)) |
| 10942 | return CI->getValue().isNegatedPowerOf2(); |
| 10943 | return false; |
| 10944 | }); |
| 10945 | |
| 10946 | TTI::OperandValueKind VK = TTI::OK_AnyValue; |
| 10947 | if (IsConstant && IsUniform) |
| 10948 | VK = TTI::OK_UniformConstantValue; |
| 10949 | else if (IsConstant) |
| 10950 | VK = TTI::OK_NonUniformConstantValue; |
| 10951 | else if (IsUniform) |
| 10952 | VK = TTI::OK_UniformValue; |
| 10953 | |
| 10954 | TTI::OperandValueProperties VP = TTI::OP_None; |
| 10955 | VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP; |
| 10956 | VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP; |
| 10957 | |
| 10958 | return {.Kind: VK, .Properties: VP}; |
| 10959 | } |
| 10960 | |
| 10961 | namespace { |
| 10962 | /// The base class for shuffle instruction emission and shuffle cost estimation. |
| 10963 | class BaseShuffleAnalysis { |
| 10964 | protected: |
| 10965 | Type *ScalarTy = nullptr; |
| 10966 | |
| 10967 | BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {} |
| 10968 | |
| 10969 | /// V is expected to be a vectorized value. |
| 10970 | /// When REVEC is disabled, there is no difference between VF and |
| 10971 | /// VNumElements. |
| 10972 | /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements. |
| 10973 | /// e.g., if ScalarTy is <4 x Ty> and V is <8 x Ty>, 2 is returned instead |
| 10974 | /// of 8. |
| 10975 | unsigned getVF(Value *V) const { |
| 10976 | assert(V && "V cannot be nullptr" ); |
| 10977 | assert(isa<FixedVectorType>(V->getType()) && |
| 10978 | "V does not have FixedVectorType" ); |
| 10979 | assert(ScalarTy && "ScalarTy cannot be nullptr" ); |
| 10980 | unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy); |
| 10981 | unsigned VNumElements = |
| 10982 | cast<FixedVectorType>(Val: V->getType())->getNumElements(); |
| 10983 | assert(VNumElements > ScalarTyNumElements && |
| 10984 | "the number of elements of V is not large enough" ); |
| 10985 | assert(VNumElements % ScalarTyNumElements == 0 && |
| 10986 | "the number of elements of V is not a vectorized value" ); |
| 10987 | return VNumElements / ScalarTyNumElements; |
| 10988 | } |
| 10989 | |
| 10990 | /// Checks if the mask is an identity mask. |
| 10991 | /// \param IsStrict if true, the function returns false if the mask size does |
| 10992 | /// not match the vector size. |
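| | /// For example (illustrative, not from the original comment): for a <4 x Ty> |
| | /// vector, the mask <0, 1, 2, 3> is an identity mask in both modes, while |
| | /// <0, 1> (an extract of the low subvector starting at index 0) or |
| | /// <poison, poison, poison, poison, 0, 1, 2, 3> (identity submasks per VF) |
| | /// are accepted only when \p IsStrict is false. |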
| 10993 | static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy, |
| 10994 | bool IsStrict) { |
| 10995 | int Limit = Mask.size(); |
| 10996 | int VF = VecTy->getNumElements(); |
| 10997 | int Index = -1; |
| 10998 | if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Limit)) |
| 10999 | return true; |
| 11000 | if (!IsStrict) { |
| 11001 | // Consider extract subvector starting from index 0. |
| 11002 | if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) && |
| 11003 | Index == 0) |
| 11004 | return true; |
| 11005 | // All VF-size submasks are identity (e.g. |
| 11006 | // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4). |
| 11007 | if (Limit % VF == 0 && all_of(Range: seq<int>(Begin: 0, End: Limit / VF), P: [=](int Idx) { |
| 11008 | ArrayRef<int> Slice = Mask.slice(N: Idx * VF, M: VF); |
| 11009 | return all_of(Range&: Slice, P: [](int I) { return I == PoisonMaskElem; }) || |
| 11010 | ShuffleVectorInst::isIdentityMask(Mask: Slice, NumSrcElts: VF); |
| 11011 | })) |
| 11012 | return true; |
| 11013 | } |
| 11014 | return false; |
| 11015 | } |
| 11016 | |
| 11017 | /// Tries to combine 2 different masks into a single one. |
| 11018 | /// \param LocalVF Vector length of the permuted input vector. \p Mask may |
| 11019 | /// change the size of the vector; \p LocalVF is the original size of the |
| 11020 | /// shuffled vector. |
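| | /// Illustrative example (assumed, not from the original comment): with |
| | /// LocalVF = 4, Mask = <2, 3, 0, 1> and ExtMask = <1, 0, poison, poison>, |
| | /// the combined result is Mask = <3, 2, poison, poison>, i.e. \p ExtMask |
| | /// applied on top of the original permutation. |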
| 11021 | static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask, |
| 11022 | ArrayRef<int> ExtMask) { |
| 11023 | unsigned VF = Mask.size(); |
| 11024 | SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem); |
| 11025 | for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { |
| 11026 | if (ExtMask[I] == PoisonMaskElem) |
| 11027 | continue; |
| 11028 | int MaskedIdx = Mask[ExtMask[I] % VF]; |
| 11029 | NewMask[I] = |
| 11030 | MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF; |
| 11031 | } |
| 11032 | Mask.swap(RHS&: NewMask); |
| 11033 | } |
| 11034 | |
| 11035 | /// Looks through shuffles trying to reduce final number of shuffles in the |
| 11036 | /// code. The function looks through the previously emitted shuffle |
| 11037 | /// instructions and properly marks indices in the mask as undef. |
| 11038 | /// For example, given the code |
| 11039 | /// \code |
| 11040 | /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0> |
| 11041 | /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0> |
| 11042 | /// \endcode |
| 11043 | /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will |
| 11044 | /// look through %s1 and %s2 and select vectors %0 and %1 with mask |
| 11045 | /// <0, 1, 2, 3> for the shuffle. |
| 11046 | /// If 2 operands are of different size, the smallest one will be resized and |
| 11047 | /// the mask recalculated properly. |
| 11048 | /// For example, given the code |
| 11049 | /// \code |
| 11050 | /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0> |
| 11051 | /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0> |
| 11052 | /// \endcode |
| 11053 | /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will |
| 11054 | /// look through %s1 and %s2 and select vectors %0 and %1 with mask |
| 11055 | /// <0, 1, 2, 3> for the shuffle. |
| 11056 | /// So, it tries to transform a permutation into a simple vector merge, if |
| 11057 | /// possible. |
| 11058 | /// \param V The input vector which must be shuffled using the given \p Mask. |
| 11059 | /// If the better candidate is found, \p V is set to this best candidate |
| 11060 | /// vector. |
| 11061 | /// \param Mask The input mask for the shuffle. If the best candidate is found |
| 11062 | /// during looking-through-shuffles attempt, it is updated accordingly. |
| 11063 | /// \param SinglePermute true if the shuffle operation is originally a |
| 11064 | /// single-value-permutation. In this case the look-through-shuffles procedure |
| 11065 | /// may look for resizing shuffles as the best candidates. |
| 11066 | /// \return true if the shuffle results in the non-resizing identity shuffle |
| 11067 | /// (and thus can be ignored), false - otherwise. |
| 11068 | static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask, |
| 11069 | bool SinglePermute) { |
| 11070 | Value *Op = V; |
| 11071 | ShuffleVectorInst *IdentityOp = nullptr; |
| 11072 | SmallVector<int> IdentityMask; |
| 11073 | while (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Op)) { |
| 11074 | // Exit if not a fixed vector type or a size-changing shuffle. |
| 11075 | auto *SVTy = dyn_cast<FixedVectorType>(Val: SV->getType()); |
| 11076 | if (!SVTy) |
| 11077 | break; |
| 11078 | // Remember the identity or broadcast mask, if it is not a resizing |
| 11079 | // shuffle. If no better candidates are found, this Op and Mask will be |
| 11080 | // used in the final shuffle. |
| 11081 | if (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/false)) { |
| 11082 | if (!IdentityOp || !SinglePermute || |
| 11083 | (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/true) && |
| 11084 | !ShuffleVectorInst::isZeroEltSplatMask(Mask: IdentityMask, |
| 11085 | NumSrcElts: IdentityMask.size()))) { |
| 11086 | IdentityOp = SV; |
| 11087 | // Store the current mask in IdentityMask so that we do not lose this |
| 11088 | // info later if IdentityOp is selected as the best candidate for the |
| 11089 | // permutation. |
| 11090 | IdentityMask.assign(RHS: Mask); |
| 11091 | } |
| 11092 | } |
| 11093 | // Remember the broadcast mask. If no better candidates are found, this Op |
| 11094 | // and Mask will be used in the final shuffle. |
| 11095 | // Zero splat can be used as identity too, since it might be used with |
| 11096 | // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling. |
| 11097 | // E.g., if we need to shuffle the vector with the mask <3, 1, 2, 0>, which |
| 11098 | // is expensive, and the analysis finds out that the source vector is just |
| 11099 | // a broadcast, the original mask can be transformed to the identity mask |
| 11100 | // <0, 1, 2, 3>. |
| 11101 | // \code |
| 11102 | // %0 = shuffle %v, poison, zeroinitializer |
| 11103 | // %res = shuffle %0, poison, <3, 1, 2, 0> |
| 11104 | // \endcode |
| 11105 | // may be transformed to |
| 11106 | // \code |
| 11107 | // %0 = shuffle %v, poison, zeroinitializer |
| 11108 | // %res = shuffle %0, poison, <0, 1, 2, 3> |
| 11109 | // \endcode |
| 11110 | if (SV->isZeroEltSplat()) { |
| 11111 | IdentityOp = SV; |
| 11112 | IdentityMask.assign(RHS: Mask); |
| 11113 | } |
| 11114 | int LocalVF = Mask.size(); |
| 11115 | if (auto *SVOpTy = |
| 11116 | dyn_cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())) |
| 11117 | LocalVF = SVOpTy->getNumElements(); |
| 11118 | SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem); |
| 11119 | for (auto [Idx, I] : enumerate(First&: Mask)) { |
| 11120 | if (I == PoisonMaskElem || |
| 11121 | static_cast<unsigned>(I) >= SV->getShuffleMask().size()) |
| 11122 | continue; |
| 11123 | ExtMask[Idx] = SV->getMaskValue(Elt: I); |
| 11124 | } |
| 11125 | bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>( |
| 11126 | V: SV->getOperand(i_nocapture: 0), |
| 11127 | UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::FirstArg)) |
| 11128 | .all(); |
| 11129 | bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>( |
| 11130 | V: SV->getOperand(i_nocapture: 1), |
| 11131 | UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::SecondArg)) |
| 11132 | .all(); |
| 11133 | if (!IsOp1Undef && !IsOp2Undef) { |
| 11134 | // Update mask and mark undef elems. |
| 11135 | for (int &I : Mask) { |
| 11136 | if (I == PoisonMaskElem) |
| 11137 | continue; |
| 11138 | if (SV->getMaskValue(Elt: I % SV->getShuffleMask().size()) == |
| 11139 | PoisonMaskElem) |
| 11140 | I = PoisonMaskElem; |
| 11141 | } |
| 11142 | break; |
| 11143 | } |
| 11144 | SmallVector<int> ShuffleMask(SV->getShuffleMask()); |
| 11145 | combineMasks(LocalVF, Mask&: ShuffleMask, ExtMask: Mask); |
| 11146 | Mask.swap(RHS&: ShuffleMask); |
| 11147 | if (IsOp2Undef) |
| 11148 | Op = SV->getOperand(i_nocapture: 0); |
| 11149 | else |
| 11150 | Op = SV->getOperand(i_nocapture: 1); |
| 11151 | } |
| 11152 | if (auto *OpTy = dyn_cast<FixedVectorType>(Val: Op->getType()); |
| 11153 | !OpTy || !isIdentityMask(Mask, VecTy: OpTy, IsStrict: SinglePermute) || |
| 11154 | ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())) { |
| 11155 | if (IdentityOp) { |
| 11156 | V = IdentityOp; |
| 11157 | assert(Mask.size() == IdentityMask.size() && |
| 11158 | "Expected masks of same sizes." ); |
| 11159 | // Clear known poison elements. |
| 11160 | for (auto [I, Idx] : enumerate(First&: Mask)) |
| 11161 | if (Idx == PoisonMaskElem) |
| 11162 | IdentityMask[I] = PoisonMaskElem; |
| 11163 | Mask.swap(RHS&: IdentityMask); |
| 11164 | auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: V); |
| 11165 | return SinglePermute && |
| 11166 | (isIdentityMask(Mask, VecTy: cast<FixedVectorType>(Val: V->getType()), |
| 11167 | /*IsStrict=*/true) || |
| 11168 | (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() && |
| 11169 | Shuffle->isZeroEltSplat() && |
| 11170 | ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size()) && |
| 11171 | all_of(Range: enumerate(First&: Mask), P: [&](const auto &P) { |
| 11172 | return P.value() == PoisonMaskElem || |
| 11173 | Shuffle->getShuffleMask()[P.index()] == 0; |
| 11174 | }))); |
| 11175 | } |
| 11176 | V = Op; |
| 11177 | return false; |
| 11178 | } |
| 11179 | V = Op; |
| 11180 | return true; |
| 11181 | } |
| 11182 | |
| 11183 | /// Smart shuffle instruction emission, walks through shuffle trees and |
| 11184 | /// tries to find the best matching vector for the actual shuffle |
| 11185 | /// instruction. |
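| | /// For instance (an illustrative sketch, not from the original comment), if |
| | /// both inputs are size-widening shuffles of two small vectors |
| | /// \code |
| | /// %V1 = shufflevector <2 x ty> %a, poison, <0, 1, poison, poison> |
| | /// %V2 = shufflevector <2 x ty> %b, poison, <0, 1, poison, poison> |
| | /// \endcode |
| | /// then a request to shuffle %V1 and %V2 with mask <0, 1, 4, 5> can be emitted |
| | /// as a single shuffle of %a and %b with mask <0, 1, 2, 3>. |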
| 11186 | template <typename T, typename ShuffleBuilderTy> |
| 11187 | static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask, |
| 11188 | ShuffleBuilderTy &Builder, Type *ScalarTy) { |
| 11189 | assert(V1 && "Expected at least one vector value." ); |
| 11190 | unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy); |
| 11191 | SmallVector<int> NewMask(Mask); |
| 11192 | if (ScalarTyNumElements != 1) { |
| 11193 | assert(SLPReVec && "FixedVectorType is not expected." ); |
| 11194 | transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask&: NewMask); |
| 11195 | Mask = NewMask; |
| 11196 | } |
| 11197 | if (V2) |
| 11198 | Builder.resizeToMatch(V1, V2); |
| 11199 | int VF = Mask.size(); |
| 11200 | if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType())) |
| 11201 | VF = FTy->getNumElements(); |
| 11202 | if (V2 && !isUndefVector</*IsPoisonOnly=*/true>( |
| 11203 | V: V2, UseMask: buildUseMask(VF, Mask, MaskArg: UseMask::SecondArg)) |
| 11204 | .all()) { |
| 11205 | // Peek through shuffles. |
| 11206 | Value *Op1 = V1; |
| 11207 | Value *Op2 = V2; |
| 11208 | int VF = |
| 11209 | cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue(); |
| 11210 | SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem); |
| 11211 | SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem); |
| 11212 | for (int I = 0, E = Mask.size(); I < E; ++I) { |
| 11213 | if (Mask[I] < VF) |
| 11214 | CombinedMask1[I] = Mask[I]; |
| 11215 | else |
| 11216 | CombinedMask2[I] = Mask[I] - VF; |
| 11217 | } |
| 11218 | Value *PrevOp1; |
| 11219 | Value *PrevOp2; |
| 11220 | do { |
| 11221 | PrevOp1 = Op1; |
| 11222 | PrevOp2 = Op2; |
| 11223 | (void)peekThroughShuffles(V&: Op1, Mask&: CombinedMask1, /*SinglePermute=*/false); |
| 11224 | (void)peekThroughShuffles(V&: Op2, Mask&: CombinedMask2, /*SinglePermute=*/false); |
| 11225 | // Check if we have 2 resizing shuffles - need to peek through operands |
| 11226 | // again. |
| 11227 | if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Val: Op1)) |
| 11228 | if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Val: Op2)) { |
| 11229 | SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem); |
| 11230 | for (auto [Idx, I] : enumerate(First&: CombinedMask1)) { |
| 11231 | if (I == PoisonMaskElem) |
| 11232 | continue; |
| 11233 | ExtMask1[Idx] = SV1->getMaskValue(Elt: I); |
| 11234 | } |
| 11235 | SmallBitVector UseMask1 = buildUseMask( |
| 11236 | VF: cast<FixedVectorType>(Val: SV1->getOperand(i_nocapture: 1)->getType()) |
| 11237 | ->getNumElements(), |
| 11238 | Mask: ExtMask1, MaskArg: UseMask::SecondArg); |
| 11239 | SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem); |
| 11240 | for (auto [Idx, I] : enumerate(First&: CombinedMask2)) { |
| 11241 | if (I == PoisonMaskElem) |
| 11242 | continue; |
| 11243 | ExtMask2[Idx] = SV2->getMaskValue(Elt: I); |
| 11244 | } |
| 11245 | SmallBitVector UseMask2 = buildUseMask( |
| 11246 | VF: cast<FixedVectorType>(Val: SV2->getOperand(i_nocapture: 1)->getType()) |
| 11247 | ->getNumElements(), |
| 11248 | Mask: ExtMask2, MaskArg: UseMask::SecondArg); |
| 11249 | if (SV1->getOperand(i_nocapture: 0)->getType() == |
| 11250 | SV2->getOperand(i_nocapture: 0)->getType() && |
| 11251 | SV1->getOperand(i_nocapture: 0)->getType() != SV1->getType() && |
| 11252 | isUndefVector(V: SV1->getOperand(i_nocapture: 1), UseMask: UseMask1).all() && |
| 11253 | isUndefVector(V: SV2->getOperand(i_nocapture: 1), UseMask: UseMask2).all()) { |
| 11254 | Op1 = SV1->getOperand(i_nocapture: 0); |
| 11255 | Op2 = SV2->getOperand(i_nocapture: 0); |
| 11256 | SmallVector<int> ShuffleMask1(SV1->getShuffleMask()); |
| 11257 | int LocalVF = ShuffleMask1.size(); |
| 11258 | if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op1->getType())) |
| 11259 | LocalVF = FTy->getNumElements(); |
| 11260 | combineMasks(LocalVF, Mask&: ShuffleMask1, ExtMask: CombinedMask1); |
| 11261 | CombinedMask1.swap(RHS&: ShuffleMask1); |
| 11262 | SmallVector<int> ShuffleMask2(SV2->getShuffleMask()); |
| 11263 | LocalVF = ShuffleMask2.size(); |
| 11264 | if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op2->getType())) |
| 11265 | LocalVF = FTy->getNumElements(); |
| 11266 | combineMasks(LocalVF, Mask&: ShuffleMask2, ExtMask: CombinedMask2); |
| 11267 | CombinedMask2.swap(RHS&: ShuffleMask2); |
| 11268 | } |
| 11269 | } |
| 11270 | } while (PrevOp1 != Op1 || PrevOp2 != Op2); |
| 11271 | Builder.resizeToMatch(Op1, Op2); |
| 11272 | VF = std::max(a: cast<VectorType>(Val: Op1->getType()) |
| 11273 | ->getElementCount() |
| 11274 | .getKnownMinValue(), |
| 11275 | b: cast<VectorType>(Val: Op2->getType()) |
| 11276 | ->getElementCount() |
| 11277 | .getKnownMinValue()); |
| 11278 | for (int I = 0, E = Mask.size(); I < E; ++I) { |
| 11279 | if (CombinedMask2[I] != PoisonMaskElem) { |
| 11280 | assert(CombinedMask1[I] == PoisonMaskElem && |
| 11281 | "Expected undefined mask element" ); |
| 11282 | CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF); |
| 11283 | } |
| 11284 | } |
| 11285 | if (Op1 == Op2 && |
| 11286 | (ShuffleVectorInst::isIdentityMask(Mask: CombinedMask1, NumSrcElts: VF) || |
| 11287 | (ShuffleVectorInst::isZeroEltSplatMask(Mask: CombinedMask1, NumSrcElts: VF) && |
| 11288 | isa<ShuffleVectorInst>(Val: Op1) && |
| 11289 | cast<ShuffleVectorInst>(Val: Op1)->getShuffleMask() == |
| 11290 | ArrayRef(CombinedMask1)))) |
| 11291 | return Builder.createIdentity(Op1); |
| 11292 | return Builder.createShuffleVector( |
| 11293 | Op1, Op1 == Op2 ? PoisonValue::get(T: Op1->getType()) : Op2, |
| 11294 | CombinedMask1); |
| 11295 | } |
| 11296 | if (isa<PoisonValue>(Val: V1)) |
| 11297 | return Builder.createPoison( |
| 11298 | cast<VectorType>(Val: V1->getType())->getElementType(), Mask.size()); |
| 11299 | bool IsIdentity = peekThroughShuffles(V&: V1, Mask&: NewMask, /*SinglePermute=*/true); |
| 11300 | assert(V1 && "Expected non-null value after looking through shuffles." ); |
| 11301 | |
| 11302 | if (!IsIdentity) |
| 11303 | return Builder.createShuffleVector(V1, NewMask); |
| 11304 | return Builder.createIdentity(V1); |
| 11305 | } |
| 11306 | |
| 11307 | /// Transforms mask \p CommonMask per given \p Mask to make proper set after |
| 11308 | /// shuffle emission. |
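| | /// Example (illustrative, not from the original comment): if CommonMask is |
| | /// <0, 1, poison, poison> and the just emitted shuffle used |
| | /// Mask = <poison, poison, 5, 4>, the elements selected by \p Mask now live at |
| | /// their own positions in the emitted vector, so CommonMask becomes |
| | /// <0, 1, 2, 3>. |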
| 11309 | static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask, |
| 11310 | ArrayRef<int> Mask) { |
| 11311 | for (unsigned I : seq<unsigned>(Size: CommonMask.size())) |
| 11312 | if (Mask[I] != PoisonMaskElem) |
| 11313 | CommonMask[I] = I; |
| 11314 | } |
| 11315 | }; |
| 11316 | } // namespace |
| 11317 | |
| 11318 | /// Calculate the scalar and the vector costs from vectorizing a set of GEPs. |
| 11319 | static std::pair<InstructionCost, InstructionCost> |
| 11320 | getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs, |
| 11321 | Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, |
| 11322 | Type *ScalarTy, VectorType *VecTy) { |
| 11323 | InstructionCost ScalarCost = 0; |
| 11324 | InstructionCost VecCost = 0; |
| 11325 | // Here we differentiate two cases: (1) when Ptrs represent a regular |
| 11326 | // vectorization tree node (as they are pointer arguments of scattered |
| 11327 | // loads) or (2) when Ptrs are the arguments of loads or stores being |
| 11328 | // vectorized as a plain wide unit-stride load/store since all the |
| 11329 | // loads/stores are known to be from/to adjacent locations. |
| 11330 | if (Opcode == Instruction::Load || Opcode == Instruction::Store) { |
| 11331 | // Case 2: estimate costs for pointer related costs when vectorizing to |
| 11332 | // a wide load/store. |
| 11333 | // Scalar cost is estimated as a set of pointers with known relationship |
| 11334 | // between them. |
| 11335 | // For vector code we will use BasePtr as argument for the wide load/store |
| 11336 | // but we also need to account all the instructions which are going to |
| 11337 | // stay in vectorized code due to uses outside of these scalar |
| 11338 | // loads/stores. |
| 11339 | ScalarCost = TTI.getPointersChainCost( |
| 11340 | Ptrs, Base: BasePtr, Info: TTI::PointersChainInfo::getUnitStride(), AccessTy: ScalarTy, |
| 11341 | CostKind); |
| 11342 | |
| 11343 | SmallVector<const Value *> PtrsRetainedInVecCode; |
| 11344 | for (Value *V : Ptrs) { |
| 11345 | if (V == BasePtr) { |
| 11346 | PtrsRetainedInVecCode.push_back(Elt: V); |
| 11347 | continue; |
| 11348 | } |
| 11349 | auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V); |
| 11350 | // For simplicity assume Ptr to stay in vectorized code if it's not a |
| 11351 | // GEP instruction. We don't care since its cost is considered free. |
| 11352 | // TODO: We should check for any uses outside of vectorizable tree |
| 11353 | // rather than just single use. |
| 11354 | if (!Ptr || !Ptr->hasOneUse()) |
| 11355 | PtrsRetainedInVecCode.push_back(Elt: V); |
| 11356 | } |
| 11357 | |
| 11358 | if (PtrsRetainedInVecCode.size() == Ptrs.size()) { |
| 11359 | // If all pointers stay in vectorized code then we don't have |
| 11360 | // any savings on that. |
| 11361 | return std::make_pair(x: TTI::TCC_Free, y: TTI::TCC_Free); |
| 11362 | } |
| 11363 | VecCost = TTI.getPointersChainCost(Ptrs: PtrsRetainedInVecCode, Base: BasePtr, |
| 11364 | Info: TTI::PointersChainInfo::getKnownStride(), |
| 11365 | AccessTy: VecTy, CostKind); |
| 11366 | } else { |
| 11367 | // Case 1: Ptrs are the arguments of loads that we are going to transform |
| 11368 | // into masked gather load intrinsic. |
| 11369 | // All the scalar GEPs will be removed as a result of vectorization. |
| 11370 | // For any external uses of some lanes extract element instructions will |
| 11371 | // be generated (which cost is estimated separately). |
| 11372 | TTI::PointersChainInfo PtrsInfo = |
| 11373 | all_of(Range&: Ptrs, |
| 11374 | P: [](const Value *V) { |
| 11375 | auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V); |
| 11376 | return Ptr && !Ptr->hasAllConstantIndices(); |
| 11377 | }) |
| 11378 | ? TTI::PointersChainInfo::getUnknownStride() |
| 11379 | : TTI::PointersChainInfo::getKnownStride(); |
| 11380 | |
| 11381 | ScalarCost = |
| 11382 | TTI.getPointersChainCost(Ptrs, Base: BasePtr, Info: PtrsInfo, AccessTy: ScalarTy, CostKind); |
| 11383 | auto *BaseGEP = dyn_cast<GEPOperator>(Val: BasePtr); |
| 11384 | if (!BaseGEP) { |
| 11385 | auto *It = find_if(Range&: Ptrs, P: IsaPred<GEPOperator>); |
| 11386 | if (It != Ptrs.end()) |
| 11387 | BaseGEP = cast<GEPOperator>(Val: *It); |
| 11388 | } |
| 11389 | if (BaseGEP) { |
| 11390 | SmallVector<const Value *> Indices(BaseGEP->indices()); |
| 11391 | VecCost = TTI.getGEPCost(PointeeType: BaseGEP->getSourceElementType(), |
| 11392 | Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: VecTy, |
| 11393 | CostKind); |
| 11394 | } |
| 11395 | } |
| 11396 | |
| 11397 | return std::make_pair(x&: ScalarCost, y&: VecCost); |
| 11398 | } |
| 11399 | |
| 11400 | void BoUpSLP::reorderGatherNode(TreeEntry &TE) { |
| 11401 | assert(TE.isGather() && TE.ReorderIndices.empty() && |
| 11402 | "Expected gather node without reordering." ); |
| 11403 | DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap; |
| 11404 | SmallSet<size_t, 2> LoadKeyUsed; |
| 11405 | |
| 11406 | // Do not reorder a node if it is small (just 2 elements), all-constant, or |
| 11407 | // all its instructions already have the same opcode. |
| 11408 | if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) || |
| 11409 | all_of(Range&: TE.Scalars, P: isConstant)) |
| 11410 | return; |
| 11411 | |
| 11412 | if (any_of(Range: seq<unsigned>(Size: TE.Idx), P: [&](unsigned Idx) { |
| 11413 | return VectorizableTree[Idx]->isSame(VL: TE.Scalars); |
| 11414 | })) |
| 11415 | return; |
| 11416 | |
| 11417 | auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) { |
| 11418 | Key = hash_combine(args: hash_value(ptr: LI->getParent()), args: Key); |
| 11419 | Value *Ptr = |
| 11420 | getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth); |
| 11421 | if (LoadKeyUsed.contains(V: Key)) { |
| 11422 | auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr)); |
| 11423 | if (LIt != LoadsMap.end()) { |
| 11424 | for (LoadInst *RLI : LIt->second) { |
| 11425 | if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(), |
| 11426 | ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL: *DL, SE&: *SE, |
| 11427 | /*StrictCheck=*/true)) |
| 11428 | return hash_value(ptr: RLI->getPointerOperand()); |
| 11429 | } |
| 11430 | for (LoadInst *RLI : LIt->second) { |
| 11431 | if (arePointersCompatible(Ptr1: RLI->getPointerOperand(), |
| 11432 | Ptr2: LI->getPointerOperand(), TLI: *TLI)) { |
| 11433 | hash_code SubKey = hash_value(ptr: RLI->getPointerOperand()); |
| 11434 | return SubKey; |
| 11435 | } |
| 11436 | } |
| 11437 | if (LIt->second.size() > 2) { |
| 11438 | hash_code SubKey = |
| 11439 | hash_value(ptr: LIt->second.back()->getPointerOperand()); |
| 11440 | return SubKey; |
| 11441 | } |
| 11442 | } |
| 11443 | } |
| 11444 | LoadKeyUsed.insert(V: Key); |
| 11445 | LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr)).first->second.push_back(Elt: LI); |
| 11446 | return hash_value(ptr: LI->getPointerOperand()); |
| 11447 | }; |
| 11448 | MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues; |
| 11449 | SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex; |
| 11450 | bool IsOrdered = true; |
| 11451 | unsigned NumInstructions = 0; |
| 11452 | // Try to "cluster" scalar instructions, to be able to build extra vectorized |
| 11453 | // nodes. |
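| | // Illustration (hypothetical values, not from the original source): a gather |
| | // node <a0, b0, a1, b1> built from two unrelated instruction groups may be |
| | // reordered to <a0, a1, b0, b1>, so that each half can later form its own |
| | // vectorizable sub-node (recorded in SubVectors below). |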
| 11454 | for (auto [I, V] : enumerate(First&: TE.Scalars)) { |
| 11455 | size_t Key = 1, Idx = 1; |
| 11456 | if (auto *Inst = dyn_cast<Instruction>(Val: V); |
| 11457 | Inst && !isa<ExtractElementInst, LoadInst, CastInst>(Val: V) && |
| 11458 | !isDeleted(I: Inst) && !isVectorized(V)) { |
| 11459 | std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey, |
| 11460 | /*AllowAlternate=*/false); |
| 11461 | ++NumInstructions; |
| 11462 | } |
| 11463 | auto &Container = SortedValues[Key]; |
| 11464 | if (IsOrdered && !KeyToIndex.contains(Val: V) && |
| 11465 | !(isa<Constant, ExtractElementInst>(Val: V) || |
| 11466 | isVectorLikeInstWithConstOps(V)) && |
| 11467 | ((Container.contains(Key: Idx) && |
| 11468 | KeyToIndex.at(Val: Container[Idx].back()).back() != I - 1) || |
| 11469 | (!Container.empty() && !Container.contains(Key: Idx) && |
| 11470 | KeyToIndex.at(Val: Container.back().second.back()).back() != I - 1))) |
| 11471 | IsOrdered = false; |
| 11472 | auto &KTI = KeyToIndex[V]; |
| 11473 | if (KTI.empty()) |
| 11474 | Container[Idx].push_back(Elt: V); |
| 11475 | KTI.push_back(Elt: I); |
| 11476 | } |
| 11477 | SmallVector<std::pair<unsigned, unsigned>> SubVectors; |
| 11478 | APInt DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size()); |
| 11479 | if (!IsOrdered && NumInstructions > 1) { |
| 11480 | unsigned Cnt = 0; |
| 11481 | TE.ReorderIndices.resize(N: TE.Scalars.size(), NV: TE.Scalars.size()); |
| 11482 | for (const auto &D : SortedValues) { |
| 11483 | for (const auto &P : D.second) { |
| 11484 | unsigned Sz = 0; |
| 11485 | for (Value *V : P.second) { |
| 11486 | ArrayRef<unsigned> Indices = KeyToIndex.at(Val: V); |
| 11487 | for (auto [K, Idx] : enumerate(First&: Indices)) { |
| 11488 | TE.ReorderIndices[Cnt + K] = Idx; |
| 11489 | TE.Scalars[Cnt + K] = V; |
| 11490 | } |
| 11491 | Sz += Indices.size(); |
| 11492 | Cnt += Indices.size(); |
| 11493 | } |
| 11494 | if (Sz > 1 && isa<Instruction>(Val: P.second.front())) { |
| 11495 | const unsigned SubVF = getFloorFullVectorNumberOfElements( |
| 11496 | TTI: *TTI, Ty: TE.Scalars.front()->getType(), Sz); |
| 11497 | SubVectors.emplace_back(Args: Cnt - Sz, Args: SubVF); |
| 11498 | for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt - Sz + SubVF)) |
| 11499 | DemandedElts.clearBit(BitPosition: I); |
| 11500 | } else if (!P.second.empty() && isConstant(V: P.second.front())) { |
| 11501 | for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt)) |
| 11502 | DemandedElts.clearBit(BitPosition: I); |
| 11503 | } |
| 11504 | } |
| 11505 | } |
| 11506 | } |
| 11507 | // Reuses always require shuffles, so consider it as profitable. |
| 11508 | if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty()) |
| 11509 | return; |
| 11510 | // Do simple cost estimation. |
| 11511 | constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 11512 | InstructionCost Cost = 0; |
| 11513 | auto *ScalarTy = TE.Scalars.front()->getType(); |
| 11514 | auto *VecTy = getWidenedType(ScalarTy, VF: TE.Scalars.size()); |
| 11515 | for (auto [Idx, Sz] : SubVectors) { |
| 11516 | Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind, |
| 11517 | Index: Idx, SubTp: getWidenedType(ScalarTy, VF: Sz)); |
| 11518 | } |
| 11519 | Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts, |
| 11520 | /*Insert=*/true, |
| 11521 | /*Extract=*/false, CostKind); |
| 11522 | int Sz = TE.Scalars.size(); |
| 11523 | SmallVector<int> ReorderMask(TE.ReorderIndices.begin(), |
| 11524 | TE.ReorderIndices.end()); |
| 11525 | for (unsigned I : seq<unsigned>(Size: Sz)) { |
| 11526 | Value *V = TE.getOrdered(Idx: I); |
| 11527 | if (isa<PoisonValue>(Val: V)) { |
| 11528 | ReorderMask[I] = PoisonMaskElem; |
| 11529 | } else if (isConstant(V) || DemandedElts[I]) { |
| 11530 | ReorderMask[I] = I + TE.ReorderIndices.size(); |
| 11531 | } |
| 11532 | } |
| 11533 | Cost += ::getShuffleCost(TTI: *TTI, |
| 11534 | Kind: any_of(Range&: ReorderMask, P: [&](int I) { return I >= Sz; }) |
| 11535 | ? TTI::SK_PermuteTwoSrc |
| 11536 | : TTI::SK_PermuteSingleSrc, |
| 11537 | Tp: VecTy, Mask: ReorderMask); |
| 11538 | DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size()); |
| 11539 | ReorderMask.assign(NumElts: Sz, Elt: PoisonMaskElem); |
| 11540 | for (unsigned I : seq<unsigned>(Size: Sz)) { |
| 11541 | Value *V = TE.getOrdered(Idx: I); |
| 11542 | if (isConstant(V)) { |
| 11543 | DemandedElts.clearBit(BitPosition: I); |
| 11544 | if (!isa<PoisonValue>(Val: V)) |
| 11545 | ReorderMask[I] = I; |
| 11546 | } else { |
| 11547 | ReorderMask[I] = I + Sz; |
| 11548 | } |
| 11549 | } |
| 11550 | InstructionCost BVCost = |
| 11551 | getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts, |
| 11552 | /*Insert=*/true, /*Extract=*/false, CostKind); |
| 11553 | if (!DemandedElts.isAllOnes()) |
| 11554 | BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: ReorderMask); |
| 11555 | if (Cost >= BVCost) { |
| 11556 | SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end()); |
| 11557 | reorderScalars(Scalars&: TE.Scalars, Mask); |
| 11558 | TE.ReorderIndices.clear(); |
| 11559 | } |
| 11560 | } |
| 11561 | |
| 11562 | void BoUpSLP::transformNodes() { |
| 11563 | constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 11564 | BaseGraphSize = VectorizableTree.size(); |
| 11565 | // Turn graph transforming mode on and off, when done. |
| 11566 | class GraphTransformModeRAAI { |
| 11567 | bool &SavedIsGraphTransformMode; |
| 11568 | |
| 11569 | public: |
| 11570 | GraphTransformModeRAAI(bool &IsGraphTransformMode) |
| 11571 | : SavedIsGraphTransformMode(IsGraphTransformMode) { |
| 11572 | IsGraphTransformMode = true; |
| 11573 | } |
| 11574 | ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; } |
| 11575 | } TransformContext(IsGraphTransformMode); |
| 11576 | // Operands are profitable if they are: |
| 11577 | // 1. At least one constant |
| 11578 | // or |
| 11579 | // 2. Splats |
| 11580 | // or |
| 11581 | // 3. Results in good vectorization opportunity, i.e. may generate vector |
| 11582 | // nodes and reduce cost of the graph. |
| 11583 | auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2, |
| 11584 | const InstructionsState &S) { |
| 11585 | SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates; |
| 11586 | for (unsigned Op : seq<unsigned>(Size: S.getMainOp()->getNumOperands())) |
| 11587 | Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op), |
| 11588 | Args: I2->getOperand(i: Op)); |
| 11589 | return all_of( |
| 11590 | Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) { |
| 11591 | return all_of(Range&: Cand, |
| 11592 | P: [](const std::pair<Value *, Value *> &P) { |
| 11593 | return isa<Constant>(Val: P.first) || |
| 11594 | isa<Constant>(Val: P.second) || P.first == P.second; |
| 11595 | }) || |
| 11596 | findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplatLoads); |
| 11597 | }); |
| 11598 | }; |
| 11599 | |
| 11600 | // Try to reorder gather nodes for better vectorization opportunities. |
| 11601 | for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) { |
| 11602 | TreeEntry &E = *VectorizableTree[Idx]; |
| 11603 | if (E.isGather()) |
| 11604 | reorderGatherNode(TE&: E); |
| 11605 | } |
| 11606 | |
| 11607 | // Better to use the full gathered-loads analysis if there are only 2 gathered |
| 11608 | // load nodes, each having fewer than 16 elements. |
| 11609 | constexpr unsigned VFLimit = 16; |
| 11610 | bool ForceLoadGather = |
| 11611 | count_if(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) { |
| 11612 | return TE->isGather() && TE->hasState() && |
| 11613 | TE->getOpcode() == Instruction::Load && |
| 11614 | TE->getVectorFactor() < VFLimit; |
| 11615 | }) == 2; |
| 11616 | |
| 11617 | // Checks if the scalars are used in another node. |
| 11618 | auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL, |
| 11619 | function_ref<bool(Value *)> CheckContainer) { |
| 11620 | return TE->isSame(VL) || all_of(Range&: VL, P: [&](Value *V) { |
| 11621 | if (isa<PoisonValue>(Val: V)) |
| 11622 | return true; |
| 11623 | auto *I = dyn_cast<Instruction>(Val: V); |
| 11624 | if (!I) |
| 11625 | return false; |
| 11626 | return is_contained(Range: TE->Scalars, Element: I) || CheckContainer(I); |
| 11627 | }); |
| 11628 | }; |
| 11629 | auto CheckForSameVectorNodes = [&](const TreeEntry &E) { |
| 11630 | if (E.hasState()) { |
| 11631 | if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: E.getMainOp()); |
| 11632 | !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) { |
| 11633 | return AreReusedScalars(TE, E.Scalars, [&](Value *V) { |
| 11634 | ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); |
| 11635 | return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) { |
| 11636 | return is_contained(Range&: TEs, Element: TE); |
| 11637 | }); |
| 11638 | }); |
| 11639 | })) |
| 11640 | return true; |
| 11642 | if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: E.getMainOp()); |
| 11643 | !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) { |
| 11644 | return AreReusedScalars(TE, E.Scalars, [&](Value *V) { |
| 11645 | ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); |
| 11646 | return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) { |
| 11647 | return is_contained(Range&: TEs, Element: TE); |
| 11648 | }); |
| 11649 | }); |
| 11650 | })) |
| 11651 | return true; |
| 11652 | } else { |
| 11653 | // Check if the gather node is a full copy of a split node. |
| 11654 | auto *It = find_if(Range: E.Scalars, P: IsaPred<Instruction>); |
| 11655 | if (It != E.Scalars.end()) { |
| 11656 | if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: *It); |
| 11657 | !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) { |
| 11658 | return AreReusedScalars(TE, E.Scalars, [&](Value *V) { |
| 11659 | ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); |
| 11660 | return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) { |
| 11661 | return is_contained(Range&: TEs, Element: TE); |
| 11662 | }); |
| 11663 | }); |
| 11664 | })) |
| 11665 | return true; |
| 11666 | } |
| 11667 | } |
| 11668 | return false; |
| 11669 | }; |
| 11670 | // The tree may grow here, so iterate over nodes, built before. |
| 11671 | for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) { |
| 11672 | TreeEntry &E = *VectorizableTree[Idx]; |
| 11673 | if (E.isGather()) { |
| 11674 | ArrayRef<Value *> VL = E.Scalars; |
| 11675 | const unsigned Sz = getVectorElementSize(V: VL.front()); |
| 11676 | unsigned MinVF = getMinVF(Sz: 2 * Sz); |
| 11677 | // Do not try partial vectorization for small nodes (<= 2 elements), nodes |
| 11678 | // with the same opcode and same parent block, or all-constant nodes. |
| 11679 | if (VL.size() <= 2 || LoadEntriesToVectorize.contains(key: Idx) || |
| 11680 | !(!E.hasState() || E.getOpcode() == Instruction::Load || |
| 11681 | // We use allSameOpcode instead of isAltShuffle because we don't |
| 11682 | // want to use interchangeable instructions here. |
| 11683 | !allSameOpcode(VL) || !allSameBlock(VL)) || |
| 11684 | allConstant(VL) || isSplat(VL)) |
| 11685 | continue; |
| 11686 | if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load) |
| 11687 | continue; |
| 11688 | // Check if the node is a copy of other vector nodes. |
| 11689 | if (CheckForSameVectorNodes(E)) |
| 11690 | continue; |
| 11691 | // Try to find vectorizable sequences and transform them into a series of |
| 11692 | // insertvector instructions. |
| 11693 | unsigned StartIdx = 0; |
| 11694 | unsigned End = VL.size(); |
| 11695 | for (unsigned VF = getFloorFullVectorNumberOfElements( |
| 11696 | TTI: *TTI, Ty: VL.front()->getType(), Sz: VL.size() - 1); |
| 11697 | VF >= MinVF; VF = getFloorFullVectorNumberOfElements( |
| 11698 | TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) { |
| 11699 | if (StartIdx + VF > End) |
| 11700 | continue; |
| 11701 | SmallVector<std::pair<unsigned, unsigned>> Slices; |
| 11702 | for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) { |
| 11703 | ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF); |
| 11704 | // If any instruction is vectorized already - do not try again. |
| 11705 | // Reuse the existing node, if it fully matches the slice. |
| 11706 | if (isVectorized(V: Slice.front()) && |
| 11707 | !getSameValuesTreeEntry(V: Slice.front(), VL: Slice, /*SameVF=*/true)) |
| 11708 | continue; |
| 11709 | // Constant already handled effectively - skip. |
| 11710 | if (allConstant(VL: Slice)) |
| 11711 | continue; |
| 11712 | // Do not try to vectorize small splats (smaller than a vector register and |
| 11713 | // with only a single non-undef element). |
| 11714 | bool IsSplat = isSplat(VL: Slice); |
| 11715 | bool IsTwoRegisterSplat = true; |
| 11716 | if (IsSplat && VF == 2) { |
| 11717 | unsigned NumRegs2VF = ::getNumberOfParts( |
| 11718 | TTI: *TTI, VecTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF: 2 * VF)); |
| 11719 | IsTwoRegisterSplat = NumRegs2VF == 2; |
| 11720 | } |
| 11721 | if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat || |
| 11722 | count(Range&: Slice, Element: Slice.front()) == |
| 11723 | static_cast<long>(isa<UndefValue>(Val: Slice.front()) ? VF - 1 |
| 11724 | : 1)) { |
| 11725 | if (IsSplat) |
| 11726 | continue; |
| 11727 | InstructionsState S = getSameOpcode(VL: Slice, TLI: *TLI); |
| 11728 | if (!S || !allSameOpcode(VL: Slice) || !allSameBlock(VL: Slice) || |
| 11729 | (S.getOpcode() == Instruction::Load && |
| 11730 | areKnownNonVectorizableLoads(VL: Slice)) || |
| 11731 | (S.getOpcode() != Instruction::Load && |
| 11732 | !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(), Sz: VF))) |
| 11733 | continue; |
| 11734 | if (VF == 2) { |
| 11735 | // Try to vectorize reduced values or if all users are vectorized. |
| 11736 | // For expensive instructions extra extracts might be profitable. |
| 11737 | if ((!UserIgnoreList || E.Idx != 0) && |
| 11738 | TTI->getInstructionCost(U: S.getMainOp(), CostKind) < |
| 11739 | TTI::TCC_Expensive && |
| 11740 | !all_of(Range&: Slice, P: [&](Value *V) { |
| 11741 | if (isa<PoisonValue>(Val: V)) |
| 11742 | return true; |
| 11743 | return areAllUsersVectorized(I: cast<Instruction>(Val: V), |
| 11744 | VectorizedVals: UserIgnoreList); |
| 11745 | })) |
| 11746 | continue; |
| 11747 | if (S.getOpcode() == Instruction::Load) { |
| 11748 | OrdersType Order; |
| 11749 | SmallVector<Value *> PointerOps; |
| 11750 | LoadsState Res = |
| 11751 | canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps); |
| 11752 | // Do not vectorize gathers. |
| 11753 | if (Res == LoadsState::ScatterVectorize || |
| 11754 | Res == LoadsState::Gather) { |
| 11755 | if (Res == LoadsState::Gather) { |
| 11756 | registerNonVectorizableLoads(VL: Slice); |
| 11757 | // If reductions and the scalars from the root node are |
| 11758 | // analyzed - mark as non-vectorizable reduction. |
| 11759 | if (UserIgnoreList && E.Idx == 0) |
| 11760 | analyzedReductionVals(VL: Slice); |
| 11761 | } |
| 11762 | continue; |
| 11763 | } |
| 11764 | } else if (S.getOpcode() == Instruction::ExtractElement || |
| 11765 | (TTI->getInstructionCost(U: S.getMainOp(), CostKind) < |
| 11766 | TTI::TCC_Expensive && |
| 11767 | !CheckOperandsProfitability( |
| 11768 | S.getMainOp(), |
| 11769 | cast<Instruction>(Val: *find_if(Range: reverse(C&: Slice), |
| 11770 | P: IsaPred<Instruction>)), |
| 11771 | S))) { |
| 11772 | // Do not vectorize extractelements (handled effectively |
| 11773 | // already). Do not vectorize non-profitable instructions (with |
| 11774 | // low cost and non-vectorizable operands). |
| 11775 | continue; |
| 11776 | } |
| 11777 | } |
| 11778 | } |
| 11779 | Slices.emplace_back(Args&: Cnt, Args: Slice.size()); |
| 11780 | } |
| 11781 | auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) { |
| 11782 | E.CombinedEntriesWithIndices.emplace_back(Args&: Idx, Args&: Cnt); |
| 11783 | if (StartIdx == Cnt) |
| 11784 | StartIdx = Cnt + Sz; |
| 11785 | if (End == Cnt + Sz) |
| 11786 | End = Cnt; |
| 11787 | }; |
| 11788 | for (auto [Cnt, Sz] : Slices) { |
| 11789 | ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: Sz); |
| 11790 | const TreeEntry *SameTE = nullptr; |
| 11791 | if (const auto *It = find_if(Range&: Slice, P: IsaPred<Instruction>); |
| 11792 | It != Slice.end()) { |
| 11793 | // If any instruction is vectorized already - do not try again. |
| 11794 | SameTE = getSameValuesTreeEntry(V: *It, VL: Slice); |
| 11795 | } |
| 11796 | unsigned PrevSize = VectorizableTree.size(); |
| 11797 | [[maybe_unused]] unsigned PrevEntriesSize = |
| 11798 | LoadEntriesToVectorize.size(); |
| 11799 | buildTreeRec(VLRef: Slice, Depth: 0, UserTreeIdx: EdgeInfo(&E, UINT_MAX)); |
| 11800 | if (PrevSize + 1 == VectorizableTree.size() && !SameTE && |
| 11801 | VectorizableTree[PrevSize]->isGather() && |
| 11802 | VectorizableTree[PrevSize]->hasState() && |
| 11803 | VectorizableTree[PrevSize]->getOpcode() != |
| 11804 | Instruction::ExtractElement && |
| 11805 | !isSplat(VL: Slice)) { |
| 11806 | if (UserIgnoreList && E.Idx == 0 && VF == 2) |
| 11807 | analyzedReductionVals(VL: Slice); |
| 11808 | VectorizableTree.pop_back(); |
| 11809 | assert(PrevEntriesSize == LoadEntriesToVectorize.size() && |
| 11810 | "LoadEntriesToVectorize expected to remain the same" ); |
| 11811 | continue; |
| 11812 | } |
| 11813 | AddCombinedNode(PrevSize, Cnt, Sz); |
| 11814 | } |
| 11815 | } |
| 11816 | // Restore ordering, if no extra vectorization happened. |
| 11817 | if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) { |
| 11818 | SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end()); |
| 11819 | reorderScalars(Scalars&: E.Scalars, Mask); |
| 11820 | E.ReorderIndices.clear(); |
| 11821 | } |
| 11822 | } |
| 11823 | if (!E.hasState()) |
| 11824 | continue; |
| 11825 | switch (E.getOpcode()) { |
| 11826 | case Instruction::Load: { |
| 11827 | // No need to reorder masked gather loads, just reorder the scalar |
| 11828 | // operands. |
| 11829 | if (E.State != TreeEntry::Vectorize) |
| 11830 | break; |
| 11831 | Type *ScalarTy = E.getMainOp()->getType(); |
| 11832 | auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size()); |
| 11833 | Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E.Scalars); |
| 11834 | // Check if profitable to represent consecutive load + reverse as strided |
| 11835 | // load with stride -1. |
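| | // For example (illustrative): loads of p[3], p[2], p[1], p[0] vectorized as a |
| | // consecutive load of p[0..3] plus a reverse shuffle may instead be emitted as |
| | // a single strided load starting at the last element with a negative stride, |
| | // when the target reports that as cheaper. |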
| 11836 | if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) && |
| 11837 | TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) { |
| 11838 | SmallVector<int> Mask; |
| 11839 | inversePermutation(Indices: E.ReorderIndices, Mask); |
| 11840 | auto *BaseLI = cast<LoadInst>(Val: E.Scalars.back()); |
| 11841 | InstructionCost OriginalVecCost = |
| 11842 | TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: BaseLI->getAlign(), |
| 11843 | AddressSpace: BaseLI->getPointerAddressSpace(), CostKind, |
| 11844 | OpdInfo: TTI::OperandValueInfo()) + |
| 11845 | ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind); |
| 11846 | InstructionCost StridedCost = TTI->getStridedMemoryOpCost( |
| 11847 | Opcode: Instruction::Load, DataTy: VecTy, Ptr: BaseLI->getPointerOperand(), |
| 11848 | /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: BaseLI); |
| 11849 | if (StridedCost < OriginalVecCost) |
| 11850 | // Strided load is more profitable than consecutive load + reverse - |
| 11851 | // transform the node to strided load. |
| 11852 | E.State = TreeEntry::StridedVectorize; |
| 11853 | } |
| 11854 | break; |
| 11855 | } |
| 11856 | case Instruction::Store: { |
| 11857 | Type *ScalarTy = |
| 11858 | cast<StoreInst>(Val: E.getMainOp())->getValueOperand()->getType(); |
| 11859 | auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size()); |
| 11860 | Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E.Scalars); |
| 11861 | // Check if profitable to represent consecutive store + reverse as strided |
| 11862 | // store with stride -1. |
| 11863 | if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) && |
| 11864 | TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) { |
| 11865 | SmallVector<int> Mask; |
| 11866 | inversePermutation(Indices: E.ReorderIndices, Mask); |
| 11867 | auto *BaseSI = cast<StoreInst>(Val: E.Scalars.back()); |
| 11868 | InstructionCost OriginalVecCost = |
| 11869 | TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(), |
| 11870 | AddressSpace: BaseSI->getPointerAddressSpace(), CostKind, |
| 11871 | OpdInfo: TTI::OperandValueInfo()) + |
| 11872 | ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind); |
| 11873 | InstructionCost StridedCost = TTI->getStridedMemoryOpCost( |
| 11874 | Opcode: Instruction::Store, DataTy: VecTy, Ptr: BaseSI->getPointerOperand(), |
| 11875 | /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: BaseSI); |
| 11876 | if (StridedCost < OriginalVecCost) |
| 11877 | // Strided store is more profitable than reverse + consecutive store - |
| 11878 | // transform the node to strided store. |
| 11879 | E.State = TreeEntry::StridedVectorize; |
| 11880 | } else if (!E.ReorderIndices.empty()) { |
| 11881 | // Check for interleaved stores. |
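| | // E.g. (illustrative), a reorder mask such as <0, 4, 1, 5, 2, 6, 3, 7> over 8 |
| | // elements interleaves two groups of 4 with factor 2 and may be lowered as an |
| | // interleaved store, provided the target reports the access type as legal. |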
| 11882 | auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) { |
| 11883 | auto *BaseSI = cast<StoreInst>(Val: E.Scalars.front()); |
| 11884 | assert(Mask.size() > 1 && "Expected mask greater than 1 element." ); |
| 11885 | if (Mask.size() < 4) |
| 11886 | return 0u; |
| 11887 | for (unsigned Factor : seq<unsigned>(Begin: 2, End: Mask.size() / 2 + 1)) { |
| 11888 | if (ShuffleVectorInst::isInterleaveMask( |
| 11889 | Mask, Factor, NumInputElts: VecTy->getElementCount().getFixedValue()) && |
| 11890 | TTI.isLegalInterleavedAccessType( |
| 11891 | VTy: VecTy, Factor, Alignment: BaseSI->getAlign(), |
| 11892 | AddrSpace: BaseSI->getPointerAddressSpace())) |
| 11893 | return Factor; |
| 11894 | } |
| 11895 | |
| 11896 | return 0u; |
| 11897 | }; |
| 11898 | SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end()); |
| 11899 | unsigned InterleaveFactor = IsInterleaveMask(Mask); |
| 11900 | if (InterleaveFactor != 0) |
| 11901 | E.setInterleave(InterleaveFactor); |
| 11902 | } |
| 11903 | break; |
| 11904 | } |
| 11905 | case Instruction::Select: { |
| 11906 | if (E.State != TreeEntry::Vectorize) |
| 11907 | break; |
| 11908 | auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: E.Scalars); |
| 11909 | if (MinMaxID == Intrinsic::not_intrinsic) |
| 11910 | break; |
| 11911 | // This node is a minmax node. |
| 11912 | E.CombinedOp = TreeEntry::MinMax; |
| 11913 | TreeEntry *CondEntry = getOperandEntry(E: &E, Idx: 0); |
| 11914 | if (SelectOnly && CondEntry->UserTreeIndex && |
| 11915 | CondEntry->State == TreeEntry::Vectorize) { |
| 11916 | // The condition node is part of the combined minmax node. |
| 11917 | CondEntry->State = TreeEntry::CombinedVectorize; |
| 11918 | } |
| 11919 | break; |
| 11920 | } |
| 11921 | default: |
| 11922 | break; |
| 11923 | } |
| 11924 | } |
| 11925 | |
| 11926 | if (LoadEntriesToVectorize.empty()) { |
| 11927 | // Single load node - exit. |
| 11928 | if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() && |
| 11929 | VectorizableTree.front()->getOpcode() == Instruction::Load) |
| 11930 | return; |
| 11931 | // Small graph with small VF - exit. |
| 11932 | constexpr unsigned SmallTree = 3; |
| 11933 | constexpr unsigned SmallVF = 2; |
| 11934 | if ((VectorizableTree.size() <= SmallTree && |
| 11935 | VectorizableTree.front()->Scalars.size() == SmallVF) || |
| 11936 | (VectorizableTree.size() <= 2 && UserIgnoreList)) |
| 11937 | return; |
| 11938 | |
| 11939 | if (VectorizableTree.front()->isNonPowOf2Vec() && |
| 11940 | getCanonicalGraphSize() != getTreeSize() && UserIgnoreList && |
| 11941 | getCanonicalGraphSize() <= SmallTree && |
| 11942 | count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()), |
| 11943 | P: [](const std::unique_ptr<TreeEntry> &TE) { |
| 11944 | return TE->isGather() && TE->hasState() && |
| 11945 | TE->getOpcode() == Instruction::Load && |
| 11946 | !allSameBlock(VL: TE->Scalars); |
| 11947 | }) == 1) |
| 11948 | return; |
| 11949 | } |
| 11950 | |
| 11951 | // A list of loads to be gathered during the vectorization process. We can |
| 11952 | // try to vectorize them at the end, if profitable. |
| 11953 | SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>, |
| 11954 | SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8> |
| 11955 | GatheredLoads; |
| 11956 | |
| 11957 | for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) { |
| 11958 | TreeEntry &E = *TE; |
| 11959 | if (E.isGather() && |
| 11960 | ((E.hasState() && E.getOpcode() == Instruction::Load) || |
| 11961 | (!E.hasState() && any_of(Range&: E.Scalars, |
| 11962 | P: [&](Value *V) { |
| 11963 | return isa<LoadInst>(Val: V) && |
| 11964 | !isVectorized(V) && |
| 11965 | !isDeleted(I: cast<Instruction>(Val: V)); |
| 11966 | }))) && |
| 11967 | !isSplat(VL: E.Scalars)) { |
| 11968 | for (Value *V : E.Scalars) { |
| 11969 | auto *LI = dyn_cast<LoadInst>(Val: V); |
| 11970 | if (!LI) |
| 11971 | continue; |
| 11972 | if (isDeleted(I: LI) || isVectorized(V: LI) || !LI->isSimple()) |
| 11973 | continue; |
| 11974 | gatherPossiblyVectorizableLoads( |
| 11975 | R: *this, VL: V, DL: *DL, SE&: *SE, TTI: *TTI, |
| 11976 | GatheredLoads&: GatheredLoads[std::make_tuple( |
| 11977 | args: LI->getParent(), |
| 11978 | args: getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth), |
| 11979 | args: LI->getType())]); |
| 11980 | } |
| 11981 | } |
| 11982 | } |
| 11983 | // Try to vectorize gathered loads if this is not just a gather of loads. |
| 11984 | if (!GatheredLoads.empty()) |
| 11985 | tryToVectorizeGatheredLoads(GatheredLoads); |
| 11986 | } |
| 11987 | |
| 11988 | /// Merges shuffle masks and emits the final shuffle instruction, if required. |
| 11989 | /// It supports shuffling of 2 input vectors. Shuffle emission is lazy: the |
| 11990 | /// actual shuffle instruction is generated only when it is really required; |
| 11991 | /// otherwise its emission is delayed until the end of the process to reduce |
| 11992 | /// the number of emitted instructions and to simplify further analysis and |
| 11993 | /// transformations. |
| 11994 | class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { |
| 11995 | bool IsFinalized = false; |
| 11996 | SmallVector<int> CommonMask; |
| 11997 | SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors; |
| 11998 | const TargetTransformInfo &TTI; |
| 11999 | InstructionCost Cost = 0; |
| 12000 | SmallDenseSet<Value *> VectorizedVals; |
| 12001 | BoUpSLP &R; |
| 12002 | SmallPtrSetImpl<Value *> &CheckedExtracts; |
| 12003 | constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 12004 | /// While set, still trying to estimate the cost for the same nodes and we |
| 12005 | /// can delay actual cost estimation (virtual shuffle instruction emission). |
| 12006 | /// May help better estimate the cost if same nodes must be permuted + allows |
| 12007 | /// to move most of the long shuffles cost estimation to TTI. |
| 12008 | bool SameNodesEstimated = true; |
| 12009 | |
| 12010 | static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) { |
| 12011 | if (Ty->getScalarType()->isPointerTy()) { |
| 12012 | Constant *Res = ConstantExpr::getIntToPtr( |
| 12013 | C: ConstantInt::getAllOnesValue( |
| 12014 | Ty: IntegerType::get(C&: Ty->getContext(), |
| 12015 | NumBits: DL.getTypeStoreSizeInBits(Ty: Ty->getScalarType()))), |
| 12016 | Ty: Ty->getScalarType()); |
| 12017 | if (auto *VTy = dyn_cast<VectorType>(Val: Ty)) |
| 12018 | Res = ConstantVector::getSplat(EC: VTy->getElementCount(), Elt: Res); |
| 12019 | return Res; |
| 12020 | } |
| 12021 | return Constant::getAllOnesValue(Ty); |
| 12022 | } |
| 12023 | |
| 12024 | InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) { |
| 12025 | if ((!Root && allConstant(VL)) || all_of(Range&: VL, P: IsaPred<UndefValue>)) |
| 12026 | return TTI::TCC_Free; |
| 12027 | auto *VecTy = getWidenedType(ScalarTy, VF: VL.size()); |
| 12028 | InstructionCost GatherCost = 0; |
| 12029 | SmallVector<Value *> Gathers(VL); |
| 12030 | if (!Root && isSplat(VL)) { |
| 12031 | // Found the broadcasting of the single scalar, calculate the cost as |
| 12032 | // the broadcast. |
| 12033 | const auto *It = find_if_not(Range&: VL, P: IsaPred<UndefValue>); |
| 12034 | assert(It != VL.end() && "Expected at least one non-undef value." ); |
| 12035 | // Add broadcast for non-identity shuffle only. |
| 12036 | bool NeedShuffle = |
| 12037 | count(Range&: VL, Element: *It) > 1 && |
| 12038 | (VL.front() != *It || !all_of(Range: VL.drop_front(), P: IsaPred<UndefValue>)); |
| 12039 | if (!NeedShuffle) { |
| 12040 | if (isa<FixedVectorType>(Val: ScalarTy)) { |
| 12041 | assert(SLPReVec && "FixedVectorType is not expected." ); |
| 12042 | return TTI.getShuffleCost( |
| 12043 | Kind: TTI::SK_InsertSubvector, DstTy: VecTy, SrcTy: VecTy, Mask: {}, CostKind, |
| 12044 | Index: std::distance(first: VL.begin(), last: It) * getNumElements(Ty: ScalarTy), |
| 12045 | SubTp: cast<FixedVectorType>(Val: ScalarTy)); |
| 12046 | } |
| 12047 | return TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, |
| 12048 | CostKind, Index: std::distance(first: VL.begin(), last: It), |
| 12049 | Op0: PoisonValue::get(T: VecTy), Op1: *It); |
| 12050 | } |
| 12051 | |
| 12052 | SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem); |
| 12053 | transform(Range&: VL, d_first: ShuffleMask.begin(), F: [](Value *V) { |
| 12054 | return isa<PoisonValue>(Val: V) ? PoisonMaskElem : 0; |
| 12055 | }); |
| 12056 | InstructionCost InsertCost = |
| 12057 | TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: 0, |
| 12058 | Op0: PoisonValue::get(T: VecTy), Op1: *It); |
| 12059 | return InsertCost + ::getShuffleCost(TTI, |
| 12060 | Kind: TargetTransformInfo::SK_Broadcast, |
| 12061 | Tp: VecTy, Mask: ShuffleMask, CostKind, |
| 12062 | /*Index=*/0, /*SubTp=*/nullptr, |
| 12063 | /*Args=*/*It); |
| 12064 | } |
| 12065 | return GatherCost + |
| 12066 | (all_of(Range&: Gathers, P: IsaPred<UndefValue>) |
| 12067 | ? TTI::TCC_Free |
| 12068 | : R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root && VL.equals(RHS: Gathers), |
| 12069 | ScalarTy)); |
| 12070 | }; |
| 12071 | |
| 12072 | /// Compute the cost of creating a vector containing the extracted values from |
| 12073 | /// \p VL. |
| 12074 | InstructionCost |
| 12075 | computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask, |
| 12076 | ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, |
| 12077 | unsigned NumParts) { |
| 12078 | assert(VL.size() > NumParts && "Unexpected scalarized shuffle." ); |
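| | // NumElts is the number of elements of the widest source vector among the |
| | // extractelement operands in VL. |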
| 12079 | unsigned NumElts = |
| 12080 | std::accumulate(first: VL.begin(), last: VL.end(), init: 0, binary_op: [](unsigned Sz, Value *V) { |
| 12081 | auto *EE = dyn_cast<ExtractElementInst>(Val: V); |
| 12082 | if (!EE) |
| 12083 | return Sz; |
| 12084 | auto *VecTy = dyn_cast<FixedVectorType>(Val: EE->getVectorOperandType()); |
| 12085 | if (!VecTy) |
| 12086 | return Sz; |
| 12087 | return std::max(a: Sz, b: VecTy->getNumElements()); |
| 12088 | }); |
| 12089 | // FIXME: this must be moved to TTI for better estimation. |
| 12090 | unsigned EltsPerVector = getPartNumElems(Size: VL.size(), NumParts); |
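| | // Checks whether the sub-mask for a single destination register can be |
| | // modeled as a shuffle of at most 2 source registers. On success returns |
| | // the shuffle kind, rewrites the mask into register-local indices and |
| | // records the source offsets (and the second register's subvector size) in |
| | // Indices/SubVecSizes; returns std::nullopt otherwise. |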
| 12091 | auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask, |
| 12092 | SmallVectorImpl<unsigned> &Indices, |
| 12093 | SmallVectorImpl<unsigned> &SubVecSizes) |
| 12094 | -> std::optional<TTI::ShuffleKind> { |
| 12095 | if (NumElts <= EltsPerVector) |
| 12096 | return std::nullopt; |
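| | // Base offset of the first source register: the smallest used mask index, |
| | // aligned down to the register boundary. |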
| 12097 | int OffsetReg0 = |
| 12098 | alignDown(Value: std::accumulate(first: Mask.begin(), last: Mask.end(), INT_MAX, |
| 12099 | binary_op: [](int S, int I) { |
| 12100 | if (I == PoisonMaskElem) |
| 12101 | return S; |
| 12102 | return std::min(a: S, b: I); |
| 12103 | }), |
| 12104 | Align: EltsPerVector); |
| 12105 | int OffsetReg1 = OffsetReg0; |
| 12106 | DenseSet<int> RegIndices; |
| 12107 | // Check whether we are permuting elements of one or of two source registers. |
| 12108 | TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc; |
| 12109 | int FirstRegId = -1; |
| 12110 | Indices.assign(NumElts: 1, Elt: OffsetReg0); |
| 12111 | for (auto [Pos, I] : enumerate(First&: Mask)) { |
| 12112 | if (I == PoisonMaskElem) |
| 12113 | continue; |
| 12114 | int Idx = I - OffsetReg0; |
| 12115 | int RegId = |
| 12116 | (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector; |
| 12117 | if (FirstRegId < 0) |
| 12118 | FirstRegId = RegId; |
| 12119 | RegIndices.insert(V: RegId); |
| 12120 | if (RegIndices.size() > 2) |
| 12121 | return std::nullopt; |
| 12122 | if (RegIndices.size() == 2) { |
| 12123 | ShuffleKind = TTI::SK_PermuteTwoSrc; |
| 12124 | if (Indices.size() == 1) { |
| 12125 | OffsetReg1 = alignDown( |
| 12126 | Value: std::accumulate( |
| 12127 | first: std::next(x: Mask.begin(), n: Pos), last: Mask.end(), INT_MAX, |
| 12128 | binary_op: [&](int S, int I) { |
| 12129 | if (I == PoisonMaskElem) |
| 12130 | return S; |
| 12131 | int RegId = ((I - OffsetReg0) / NumElts) * NumParts + |
| 12132 | ((I - OffsetReg0) % NumElts) / EltsPerVector; |
| 12133 | if (RegId == FirstRegId) |
| 12134 | return S; |
| 12135 | return std::min(a: S, b: I); |
| 12136 | }), |
| 12137 | Align: EltsPerVector); |
| 12138 | unsigned Index = OffsetReg1 % NumElts; |
| 12139 | Indices.push_back(Elt: Index); |
| 12140 | SubVecSizes.push_back(Elt: std::min(a: NumElts - Index, b: EltsPerVector)); |
| 12141 | } |
| 12142 | Idx = I - OffsetReg1; |
| 12143 | } |
| 12144 | I = (Idx % NumElts) % EltsPerVector + |
| 12145 | (RegId == FirstRegId ? 0 : EltsPerVector); |
| 12146 | } |
| 12147 | return ShuffleKind; |
| 12148 | }; |
| 12149 | InstructionCost Cost = 0; |
| 12150 | |
| 12151 | // Process extracts in blocks of EltsPerVector to check if the source vector |
| 12152 | // operand can be re-used directly. If not, add the cost of creating a |
| 12153 | // shuffle to extract the values into a vector register. |
| 12154 | for (unsigned Part : seq<unsigned>(Size: NumParts)) { |
| 12155 | if (!ShuffleKinds[Part]) |
| 12156 | continue; |
| 12157 | ArrayRef<int> MaskSlice = Mask.slice( |
| 12158 | N: Part * EltsPerVector, M: getNumElems(Size: Mask.size(), PartNumElems: EltsPerVector, Part)); |
| 12159 | SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem); |
| 12160 | copy(Range&: MaskSlice, Out: SubMask.begin()); |
| 12161 | SmallVector<unsigned, 2> Indices; |
| 12162 | SmallVector<unsigned, 2> SubVecSizes; |
| 12163 | std::optional<TTI::ShuffleKind> RegShuffleKind = |
| 12164 | CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes); |
| 12165 | if (!RegShuffleKind) { |
| 12166 | if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc || |
| 12167 | !ShuffleVectorInst::isIdentityMask( |
| 12168 | Mask: MaskSlice, NumSrcElts: std::max<unsigned>(a: NumElts, b: MaskSlice.size()))) |
| 12169 | Cost += |
| 12170 | ::getShuffleCost(TTI, Kind: *ShuffleKinds[Part], |
| 12171 | Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: MaskSlice); |
| 12172 | continue; |
| 12173 | } |
| 12174 | if (*RegShuffleKind != TTI::SK_PermuteSingleSrc || |
| 12175 | !ShuffleVectorInst::isIdentityMask(Mask: SubMask, NumSrcElts: EltsPerVector)) { |
| 12176 | Cost += |
| 12177 | ::getShuffleCost(TTI, Kind: *RegShuffleKind, |
| 12178 | Tp: getWidenedType(ScalarTy, VF: EltsPerVector), Mask: SubMask); |
| 12179 | } |
| 12180 | const unsigned BaseVF = getFullVectorNumberOfElements( |
| 12181 | TTI: *R.TTI, Ty: VL.front()->getType(), Sz: alignTo(Value: NumElts, Align: EltsPerVector)); |
| 12182 | for (const auto [Idx, SubVecSize] : zip(t&: Indices, u&: SubVecSizes)) { |
| 12183 | assert((Idx + SubVecSize) <= BaseVF && |
| 12184 | "SK_ExtractSubvector index out of range" ); |
| 12185 | Cost += ::getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, |
| 12186 | Tp: getWidenedType(ScalarTy, VF: BaseVF), Mask: {}, CostKind, |
| 12187 | Index: Idx, SubTp: getWidenedType(ScalarTy, VF: SubVecSize)); |
| 12188 | } |
| 12189 | // Second check: a plain permute of the whole slice may be estimated |
| 12190 | // cheaper than the per-register shuffle plus subvector extracts above. |
| 12191 | SubMask.assign(NumElts, Elt: PoisonMaskElem); |
| 12192 | copy(Range&: MaskSlice, Out: SubMask.begin()); |
| 12193 | InstructionCost OriginalCost = ::getShuffleCost( |
| 12194 | TTI, Kind: *ShuffleKinds[Part], Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: SubMask); |
| 12195 | if (OriginalCost < Cost) |
| 12196 | Cost = OriginalCost; |
| 12197 | } |
| 12198 | return Cost; |
| 12199 | } |
| 12200 | /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given |
| 12201 | /// mask \p Mask, register number \p Part, that includes \p SliceSize |
| 12202 | /// elements. |
| 12203 | void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2, |
| 12204 | ArrayRef<int> Mask, unsigned Part, |
| 12205 | unsigned SliceSize) { |
| 12206 | if (SameNodesEstimated) { |
| 12207 | // Delay the cost estimation if the same nodes are being reshuffled. |
| 12208 | // If we already requested the cost of reshuffling of E1 and E2 before, no |
| 12209 | // need to estimate another cost with the sub-Mask, instead include this |
| 12210 | // sub-Mask into the CommonMask to estimate it later and avoid double cost |
| 12211 | // estimation. |
| 12212 | if ((InVectors.size() == 2 && |
| 12213 | cast<const TreeEntry *>(Val&: InVectors.front()) == &E1 && |
| 12214 | cast<const TreeEntry *>(Val&: InVectors.back()) == E2) || |
| 12215 | (!E2 && cast<const TreeEntry *>(Val&: InVectors.front()) == &E1)) { |
| 12216 | unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part); |
| 12217 | assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit), |
| 12218 | [](int Idx) { return Idx == PoisonMaskElem; }) && |
| 12219 | "Expected all poisoned elements." ); |
| 12220 | ArrayRef<int> SubMask = ArrayRef(Mask).slice(N: Part * SliceSize, M: Limit); |
| 12221 | copy(Range&: SubMask, Out: std::next(x: CommonMask.begin(), n: SliceSize * Part)); |
| 12222 | return; |
| 12223 | } |
| 12224 | // Found non-matching nodes - need to estimate the cost for the matched |
| 12225 | // and transform mask. |
| 12226 | Cost += createShuffle(P1: InVectors.front(), |
| 12227 | P2: InVectors.size() == 1 ? nullptr : InVectors.back(), |
| 12228 | Mask: CommonMask); |
| 12229 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 12230 | } else if (InVectors.size() == 2) { |
| 12231 | Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask); |
| 12232 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 12233 | } |
| 12234 | SameNodesEstimated = false; |
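| | // Single input so far and no second entry: append E1 as the second shuffle |
| | // source, offsetting its mask elements by the current vector factor. |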
| 12235 | if (!E2 && InVectors.size() == 1) { |
| 12236 | unsigned VF = E1.getVectorFactor(); |
| 12237 | if (Value *V1 = dyn_cast<Value *>(Val&: InVectors.front())) { |
| 12238 | VF = std::max(a: VF, b: getVF(V: V1)); |
| 12239 | } else { |
| 12240 | const auto *E = cast<const TreeEntry *>(Val&: InVectors.front()); |
| 12241 | VF = std::max(a: VF, b: E->getVectorFactor()); |
| 12242 | } |
| 12243 | for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) |
| 12244 | if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) |
| 12245 | CommonMask[Idx] = Mask[Idx] + VF; |
| 12246 | Cost += createShuffle(P1: InVectors.front(), P2: &E1, Mask: CommonMask); |
| 12247 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 12248 | } else { |
| 12249 | auto P = InVectors.front(); |
| 12250 | Cost += createShuffle(P1: &E1, P2: E2, Mask); |
| 12251 | unsigned VF = Mask.size(); |
| 12252 | if (Value *V1 = dyn_cast<Value *>(Val&: P)) { |
| 12253 | VF = std::max(a: VF, |
| 12254 | b: getNumElements(Ty: V1->getType())); |
| 12255 | } else { |
| 12256 | const auto *E = cast<const TreeEntry *>(Val&: P); |
| 12257 | VF = std::max(a: VF, b: E->getVectorFactor()); |
| 12258 | } |
| 12259 | for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) |
| 12260 | if (Mask[Idx] != PoisonMaskElem) |
| 12261 | CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF); |
| 12262 | Cost += createShuffle(P1: P, P2: InVectors.front(), Mask: CommonMask); |
| 12263 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 12264 | } |
| 12265 | } |
| 12266 | |
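| | /// Cost-modeling counterpart of the IR shuffle builder used by |
| | /// BaseShuffleAnalysis::createShuffle: instead of emitting shuffles it |
| | /// returns their TTI cost; empty and identity masks are free. |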
| 12267 | class ShuffleCostBuilder { |
| 12268 | const TargetTransformInfo &TTI; |
| 12269 | |
| 12270 | static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) { |
| 12271 | int Index = -1; |
| 12272 | return Mask.empty() || |
| 12273 | (VF == Mask.size() && |
| 12274 | ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) || |
| 12275 | (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) && |
| 12276 | Index == 0); |
| 12277 | } |
| 12278 | |
| 12279 | public: |
| 12280 | ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {} |
| 12281 | ~ShuffleCostBuilder() = default; |
| 12282 | InstructionCost createShuffleVector(Value *V1, Value *, |
| 12283 | ArrayRef<int> Mask) const { |
| 12284 | // Empty mask or identity mask are free. |
| 12285 | unsigned VF = |
| 12286 | cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue(); |
| 12287 | if (isEmptyOrIdentity(Mask, VF)) |
| 12288 | return TTI::TCC_Free; |
| 12289 | return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc, |
| 12290 | Tp: cast<VectorType>(Val: V1->getType()), Mask); |
| 12291 | } |
| 12292 | InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const { |
| 12293 | // Empty mask or identity mask are free. |
| 12294 | unsigned VF = |
| 12295 | cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue(); |
| 12296 | if (isEmptyOrIdentity(Mask, VF)) |
| 12297 | return TTI::TCC_Free; |
| 12298 | return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc, |
| 12299 | Tp: cast<VectorType>(Val: V1->getType()), Mask); |
| 12300 | } |
| 12301 | InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; } |
| 12302 | InstructionCost createPoison(Type *Ty, unsigned VF) const { |
| 12303 | return TTI::TCC_Free; |
| 12304 | } |
| 12305 | void resizeToMatch(Value *&, Value *&) const {} |
| 12306 | }; |
| 12307 | |
| 12308 | /// Smart shuffle instruction emission, walks through shuffles trees and |
| 12309 | /// tries to find the best matching vector for the actual shuffle |
| 12310 | /// instruction. |
| 12311 | InstructionCost |
| 12312 | createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1, |
| 12313 | const PointerUnion<Value *, const TreeEntry *> &P2, |
| 12314 | ArrayRef<int> Mask) { |
| 12315 | ShuffleCostBuilder Builder(TTI); |
| 12316 | SmallVector<int> CommonMask(Mask); |
| 12317 | Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>(); |
| 12318 | unsigned CommonVF = Mask.size(); |
| 12319 | InstructionCost ExtraCost = 0; |
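| | // Extra cost of casting a node, possibly narrowed by MinBWs, to the common |
| | // scalar type of this shuffle. |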
| 12320 | auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E, |
| 12321 | unsigned VF) -> InstructionCost { |
| 12322 | if (E.isGather() && allConstant(VL: E.Scalars)) |
| 12323 | return TTI::TCC_Free; |
| 12324 | Type *EScalarTy = E.Scalars.front()->getType(); |
| 12325 | bool IsSigned = true; |
| 12326 | if (auto It = R.MinBWs.find(Val: &E); It != R.MinBWs.end()) { |
| 12327 | EScalarTy = IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first); |
| 12328 | IsSigned = It->second.second; |
| 12329 | } |
| 12330 | if (EScalarTy != ScalarTy) { |
| 12331 | unsigned CastOpcode = Instruction::Trunc; |
| 12332 | unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy); |
| 12333 | unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy); |
| 12334 | if (DstSz > SrcSz) |
| 12335 | CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt; |
| 12336 | return TTI.getCastInstrCost(Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF), |
| 12337 | Src: getWidenedType(ScalarTy: EScalarTy, VF), |
| 12338 | CCH: TTI::CastContextHint::None, CostKind); |
| 12339 | } |
| 12340 | return TTI::TCC_Free; |
| 12341 | }; |
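| | // Same as above, but for an already materialized vector value. |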
| 12342 | auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost { |
| 12343 | if (isa<Constant>(Val: V)) |
| 12344 | return TTI::TCC_Free; |
| 12345 | auto *VecTy = cast<VectorType>(Val: V->getType()); |
| 12346 | Type *EScalarTy = VecTy->getElementType(); |
| 12347 | if (EScalarTy != ScalarTy) { |
| 12348 | bool IsSigned = !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL)); |
| 12349 | unsigned CastOpcode = Instruction::Trunc; |
| 12350 | unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy); |
| 12351 | unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy); |
| 12352 | if (DstSz > SrcSz) |
| 12353 | CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt; |
| 12354 | return TTI.getCastInstrCost( |
| 12355 | Opcode: CastOpcode, Dst: VectorType::get(ElementType: ScalarTy, EC: VecTy->getElementCount()), |
| 12356 | Src: VecTy, CCH: TTI::CastContextHint::None, CostKind); |
| 12357 | } |
| 12358 | return TTI::TCC_Free; |
| 12359 | }; |
| 12360 | if (!V1 && !V2 && !P2.isNull()) { |
| 12361 | // Shuffle 2 entry nodes. |
| 12362 | const TreeEntry *E = cast<const TreeEntry *>(Val: P1); |
| 12363 | unsigned VF = E->getVectorFactor(); |
| 12364 | const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2); |
| 12365 | CommonVF = std::max(a: VF, b: E2->getVectorFactor()); |
| 12366 | assert(all_of(Mask, |
| 12367 | [=](int Idx) { |
| 12368 | return Idx < 2 * static_cast<int>(CommonVF); |
| 12369 | }) && |
| 12370 | "All elements in mask must be less than 2 * CommonVF." ); |
| 12371 | if (E->Scalars.size() == E2->Scalars.size()) { |
| 12372 | SmallVector<int> EMask = E->getCommonMask(); |
| 12373 | SmallVector<int> E2Mask = E2->getCommonMask(); |
| 12374 | if (!EMask.empty() || !E2Mask.empty()) { |
| 12375 | for (int &Idx : CommonMask) { |
| 12376 | if (Idx == PoisonMaskElem) |
| 12377 | continue; |
| 12378 | if (Idx < static_cast<int>(CommonVF) && !EMask.empty()) |
| 12379 | Idx = EMask[Idx]; |
| 12380 | else if (Idx >= static_cast<int>(CommonVF)) |
| 12381 | Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) + |
| 12382 | E->Scalars.size(); |
| 12383 | } |
| 12384 | } |
| 12385 | CommonVF = E->Scalars.size(); |
| 12386 | ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) + |
| 12387 | GetNodeMinBWAffectedCost(*E2, CommonVF); |
| 12388 | } else { |
| 12389 | ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) + |
| 12390 | GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor()); |
| 12391 | } |
| 12392 | V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF)); |
| 12393 | V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF)); |
| 12394 | } else if (!V1 && P2.isNull()) { |
| 12395 | // Shuffle single entry node. |
| 12396 | const TreeEntry *E = cast<const TreeEntry *>(Val: P1); |
| 12397 | unsigned VF = E->getVectorFactor(); |
| 12398 | CommonVF = VF; |
| 12399 | assert( |
| 12400 | all_of(Mask, |
| 12401 | [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) && |
| 12402 | "All elements in mask must be less than CommonVF." ); |
| 12403 | if (E->Scalars.size() == Mask.size() && VF != Mask.size()) { |
| 12404 | SmallVector<int> EMask = E->getCommonMask(); |
| 12405 | assert(!EMask.empty() && "Expected non-empty common mask." ); |
| 12406 | for (int &Idx : CommonMask) { |
| 12407 | if (Idx != PoisonMaskElem) |
| 12408 | Idx = EMask[Idx]; |
| 12409 | } |
| 12410 | CommonVF = E->Scalars.size(); |
| 12411 | } else if (unsigned Factor = E->getInterleaveFactor(); |
| 12412 | Factor > 0 && E->Scalars.size() != Mask.size() && |
| 12413 | ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask: CommonMask, |
| 12414 | Factor)) { |
| 12415 | // Deinterleaved nodes are free. |
| 12416 | std::iota(first: CommonMask.begin(), last: CommonMask.end(), value: 0); |
| 12417 | } |
| 12418 | ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF); |
| 12419 | V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF)); |
| 12420 | // Not identity/broadcast? Try to see if the original vector is better. |
| 12421 | if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() && |
| 12422 | CommonVF == CommonMask.size() && |
| 12423 | any_of(Range: enumerate(First&: CommonMask), |
| 12424 | P: [](const auto &&P) { |
| 12425 | return P.value() != PoisonMaskElem && |
| 12426 | static_cast<unsigned>(P.value()) != P.index(); |
| 12427 | }) && |
| 12428 | any_of(Range&: CommonMask, |
| 12429 | P: [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) { |
| 12430 | SmallVector<int> ReorderMask; |
| 12431 | inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask); |
| 12432 | ::addMask(Mask&: CommonMask, SubMask: ReorderMask); |
| 12433 | } |
| 12434 | } else if (V1 && P2.isNull()) { |
| 12435 | // Shuffle single vector. |
| 12436 | ExtraCost += GetValueMinBWAffectedCost(V1); |
| 12437 | CommonVF = getVF(V: V1); |
| 12438 | assert( |
| 12439 | all_of(Mask, |
| 12440 | [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) && |
| 12441 | "All elements in mask must be less than CommonVF." ); |
| 12442 | } else if (V1 && !V2) { |
| 12443 | // Shuffle vector and tree node. |
| 12444 | unsigned VF = getVF(V: V1); |
| 12445 | const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2); |
| 12446 | CommonVF = std::max(a: VF, b: E2->getVectorFactor()); |
| 12447 | assert(all_of(Mask, |
| 12448 | [=](int Idx) { |
| 12449 | return Idx < 2 * static_cast<int>(CommonVF); |
| 12450 | }) && |
| 12451 | "All elements in mask must be less than 2 * CommonVF." ); |
| 12452 | if (E2->Scalars.size() == VF && VF != CommonVF) { |
| 12453 | SmallVector<int> E2Mask = E2->getCommonMask(); |
| 12454 | assert(!E2Mask.empty() && "Expected non-empty common mask." ); |
| 12455 | for (int &Idx : CommonMask) { |
| 12456 | if (Idx == PoisonMaskElem) |
| 12457 | continue; |
| 12458 | if (Idx >= static_cast<int>(CommonVF)) |
| 12459 | Idx = E2Mask[Idx - CommonVF] + VF; |
| 12460 | } |
| 12461 | CommonVF = VF; |
| 12462 | } |
| 12463 | ExtraCost += GetValueMinBWAffectedCost(V1); |
| 12464 | V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF)); |
| 12465 | ExtraCost += GetNodeMinBWAffectedCost( |
| 12466 | *E2, std::min(a: CommonVF, b: E2->getVectorFactor())); |
| 12467 | V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF)); |
| 12468 | } else if (!V1 && V2) { |
| 12469 | // Shuffle vector and tree node. |
| 12470 | unsigned VF = getVF(V: V2); |
| 12471 | const TreeEntry *E1 = cast<const TreeEntry *>(Val: P1); |
| 12472 | CommonVF = std::max(a: VF, b: E1->getVectorFactor()); |
| 12473 | assert(all_of(Mask, |
| 12474 | [=](int Idx) { |
| 12475 | return Idx < 2 * static_cast<int>(CommonVF); |
| 12476 | }) && |
| 12477 | "All elements in mask must be less than 2 * CommonVF." ); |
| 12478 | if (E1->Scalars.size() == VF && VF != CommonVF) { |
| 12479 | SmallVector<int> E1Mask = E1->getCommonMask(); |
| 12480 | assert(!E1Mask.empty() && "Expected non-empty common mask." ); |
| 12481 | for (int &Idx : CommonMask) { |
| 12482 | if (Idx == PoisonMaskElem) |
| 12483 | continue; |
| 12484 | if (Idx >= static_cast<int>(CommonVF)) |
| 12485 | Idx = E1Mask[Idx - CommonVF] + VF; |
| 12486 | else |
| 12487 | Idx = E1Mask[Idx]; |
| 12488 | } |
| 12489 | CommonVF = VF; |
| 12490 | } |
| 12491 | ExtraCost += GetNodeMinBWAffectedCost( |
| 12492 | *E1, std::min(a: CommonVF, b: E1->getVectorFactor())); |
| 12493 | V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF)); |
| 12494 | ExtraCost += GetValueMinBWAffectedCost(V2); |
| 12495 | V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF)); |
| 12496 | } else { |
| 12497 | assert(V1 && V2 && "Expected both vectors." ); |
| 12498 | unsigned VF = getVF(V: V1); |
| 12499 | CommonVF = std::max(a: VF, b: getVF(V: V2)); |
| 12500 | assert(all_of(Mask, |
| 12501 | [=](int Idx) { |
| 12502 | return Idx < 2 * static_cast<int>(CommonVF); |
| 12503 | }) && |
| 12504 | "All elements in mask must be less than 2 * CommonVF." ); |
| 12505 | ExtraCost += |
| 12506 | GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2); |
| 12507 | if (V1->getType() != V2->getType()) { |
| 12508 | V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF)); |
| 12509 | V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF)); |
| 12510 | } else { |
| 12511 | if (cast<VectorType>(Val: V1->getType())->getElementType() != ScalarTy) |
| 12512 | V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF)); |
| 12513 | if (cast<VectorType>(Val: V2->getType())->getElementType() != ScalarTy) |
| 12514 | V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF)); |
| 12515 | } |
| 12516 | } |
| 12517 | InVectors.front() = |
| 12518 | Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size())); |
| 12519 | if (InVectors.size() == 2) |
| 12520 | InVectors.pop_back(); |
| 12521 | return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>( |
| 12522 | V1, V2, Mask: CommonMask, Builder, ScalarTy); |
| 12523 | } |
| 12524 | |
| 12525 | public: |
| 12526 | ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, |
| 12527 | ArrayRef<Value *> VectorizedVals, BoUpSLP &R, |
| 12528 | SmallPtrSetImpl<Value *> &CheckedExtracts) |
| 12529 | : BaseShuffleAnalysis(ScalarTy), TTI(TTI), |
| 12530 | VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R), |
| 12531 | CheckedExtracts(CheckedExtracts) {} |
| 12532 | Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask, |
| 12533 | ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, |
| 12534 | unsigned NumParts, bool &UseVecBaseAsInput) { |
| 12535 | UseVecBaseAsInput = false; |
| 12536 | if (Mask.empty()) |
| 12537 | return nullptr; |
| 12538 | Value *VecBase = nullptr; |
| 12539 | SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end()); |
| 12540 | if (!E->ReorderIndices.empty()) { |
| 12541 | SmallVector<int> ReorderMask(E->ReorderIndices.begin(), |
| 12542 | E->ReorderIndices.end()); |
| 12543 | reorderScalars(Scalars&: VL, Mask: ReorderMask); |
| 12544 | } |
| 12545 | // Check if the extracts can be considered reused, i.e. the same |
| 12546 | // extractelements were already vectorized in an earlier tree entry. |
| 12547 | bool PrevNodeFound = any_of( |
| 12548 | Range: ArrayRef(R.VectorizableTree).take_front(N: E->Idx), |
| 12549 | P: [&](const std::unique_ptr<TreeEntry> &TE) { |
| 12550 | return ((TE->hasState() && !TE->isAltShuffle() && |
| 12551 | TE->getOpcode() == Instruction::ExtractElement) || |
| 12552 | TE->isGather()) && |
| 12553 | all_of(Range: enumerate(First&: TE->Scalars), P: [&](auto &&Data) { |
| 12554 | return VL.size() > Data.index() && |
| 12555 | (Mask[Data.index()] == PoisonMaskElem || |
| 12556 | isa<UndefValue>(VL[Data.index()]) || |
| 12557 | Data.value() == VL[Data.index()]); |
| 12558 | }); |
| 12559 | }); |
| 12560 | SmallPtrSet<Value *, 4> UniqueBases; |
| 12561 | unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts); |
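| | // Per source vector, the set of lanes whose extracts become dead after |
| | // vectorization; their scalarization overhead is credited back below. |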
| 12562 | SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts; |
| 12563 | for (unsigned Part : seq<unsigned>(Size: NumParts)) { |
| 12564 | unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part); |
| 12565 | ArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit); |
| 12566 | for (auto [I, V] : |
| 12567 | enumerate(First: ArrayRef(VL).slice(N: Part * SliceSize, M: Limit))) { |
| 12568 | // Ignore non-extractelement scalars. |
| 12569 | if (isa<UndefValue>(Val: V) || |
| 12570 | (!SubMask.empty() && SubMask[I] == PoisonMaskElem)) |
| 12571 | continue; |
| 12572 | // If all users of instruction are going to be vectorized and this |
| 12573 | // instruction itself is not going to be vectorized, consider this |
| 12574 | // instruction as dead and remove its cost from the final cost of the |
| 12575 | // vectorized tree. |
| 12576 | // Also, avoid adjusting the cost for extractelements with multiple uses |
| 12577 | // in different graph entries. |
| 12578 | auto *EE = cast<ExtractElementInst>(Val: V); |
| 12579 | VecBase = EE->getVectorOperand(); |
| 12580 | UniqueBases.insert(Ptr: VecBase); |
| 12581 | ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V); |
| 12582 | if (!CheckedExtracts.insert(Ptr: V).second || |
| 12583 | !R.areAllUsersVectorized(I: cast<Instruction>(Val: V), VectorizedVals: &VectorizedVals) || |
| 12584 | any_of(Range: EE->users(), |
| 12585 | P: [&](User *U) { |
| 12586 | return isa<GetElementPtrInst>(Val: U) && |
| 12587 | !R.areAllUsersVectorized(I: cast<Instruction>(Val: U), |
| 12588 | VectorizedVals: &VectorizedVals); |
| 12589 | }) || |
| 12590 | (!VEs.empty() && !is_contained(Range&: VEs, Element: E))) |
| 12591 | continue; |
| 12592 | std::optional<unsigned> EEIdx = getExtractIndex(E: EE); |
| 12593 | if (!EEIdx) |
| 12594 | continue; |
| 12595 | unsigned Idx = *EEIdx; |
| 12596 | // Take credit for instruction that will become dead. |
| 12597 | if (EE->hasOneUse() || !PrevNodeFound) { |
| 12598 | Instruction *Ext = EE->user_back(); |
| 12599 | if (isa<SExtInst, ZExtInst>(Val: Ext) && |
| 12600 | all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) { |
| 12601 | // Use getExtractWithExtendCost() to calculate the cost of |
| 12602 | // extractelement/ext pair. |
| 12603 | Cost -= TTI.getExtractWithExtendCost( |
| 12604 | Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: EE->getVectorOperandType(), |
| 12605 | Index: Idx, CostKind); |
| 12606 | // Add back the cost of s|zext which is subtracted separately. |
| 12607 | Cost += TTI.getCastInstrCost( |
| 12608 | Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: EE->getType(), |
| 12609 | CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext); |
| 12610 | continue; |
| 12611 | } |
| 12612 | } |
| 12613 | APInt &DemandedElts = |
| 12614 | VectorOpsToExtracts |
| 12615 | .try_emplace(Key: VecBase, |
| 12616 | Args: APInt::getZero(numBits: getNumElements(Ty: VecBase->getType()))) |
| 12617 | .first->getSecond(); |
| 12618 | DemandedElts.setBit(Idx); |
| 12619 | } |
| 12620 | } |
| 12621 | for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts) |
| 12622 | Cost -= TTI.getScalarizationOverhead(Ty: cast<VectorType>(Val: Vec->getType()), |
| 12623 | DemandedElts, /*Insert=*/false, |
| 12624 | /*Extract=*/true, CostKind); |
| 12625 | // Check that the gather of extractelements can be represented as just a |
| 12626 | // shuffle of one or two vectors the scalars are extracted from, i.e. the |
| 12627 | // bunch of extractelement instructions to be gathered forms a permutation |
| 12628 | // of the elements of at most 2 input vectors. |
| 12629 | // Skipped if the same extractelements were already vectorized (reused) by |
| 12630 | // an earlier node. |
| 12631 | if (!PrevNodeFound) |
| 12632 | Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts); |
| 12633 | InVectors.assign(NumElts: 1, Elt: E); |
| 12634 | CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end()); |
| 12635 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 12636 | SameNodesEstimated = false; |
| 12637 | if (NumParts != 1 && UniqueBases.size() != 1) { |
| 12638 | UseVecBaseAsInput = true; |
| 12639 | VecBase = |
| 12640 | Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size())); |
| 12641 | } |
| 12642 | return VecBase; |
| 12643 | } |
| 12644 | /// Checks if the specified entry \p E needs to be delayed because of its |
| 12645 | /// dependency nodes. |
| 12646 | std::optional<InstructionCost> |
| 12647 | needToDelay(const TreeEntry *, |
| 12648 | ArrayRef<SmallVector<const TreeEntry *>>) const { |
| 12649 | // No need to delay the cost estimation during analysis. |
| 12650 | return std::nullopt; |
| 12651 | } |
| 12652 | /// Reset the builder to handle perfect diamond match. |
| 12653 | void resetForSameNode() { |
| 12654 | IsFinalized = false; |
| 12655 | CommonMask.clear(); |
| 12656 | InVectors.clear(); |
| 12657 | Cost = 0; |
| 12658 | VectorizedVals.clear(); |
| 12659 | SameNodesEstimated = true; |
| 12660 | } |
| 12661 | void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { |
| 12662 | if (&E1 == &E2) { |
| 12663 | assert(all_of(Mask, |
| 12664 | [&](int Idx) { |
| 12665 | return Idx < static_cast<int>(E1.getVectorFactor()); |
| 12666 | }) && |
| 12667 | "Expected single vector shuffle mask." ); |
| 12668 | add(E1, Mask); |
| 12669 | return; |
| 12670 | } |
| 12671 | if (InVectors.empty()) { |
| 12672 | CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end()); |
| 12673 | InVectors.assign(IL: {&E1, &E2}); |
| 12674 | return; |
| 12675 | } |
| 12676 | assert(!CommonMask.empty() && "Expected non-empty common mask." ); |
| 12677 | auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size()); |
| 12678 | unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size()); |
| 12679 | unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts); |
| 12680 | const auto *It = |
| 12681 | find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; }); |
| 12682 | unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize; |
| 12683 | estimateNodesPermuteCost(E1, E2: &E2, Mask, Part, SliceSize); |
| 12684 | } |
| 12685 | void add(const TreeEntry &E1, ArrayRef<int> Mask) { |
| 12686 | if (InVectors.empty()) { |
| 12687 | CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end()); |
| 12688 | InVectors.assign(NumElts: 1, Elt: &E1); |
| 12689 | return; |
| 12690 | } |
| 12691 | assert(!CommonMask.empty() && "Expected non-empty common mask." ); |
| 12692 | auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size()); |
| 12693 | unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size()); |
| 12694 | unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts); |
| 12695 | const auto *It = |
| 12696 | find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; }); |
| 12697 | unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize; |
| 12698 | estimateNodesPermuteCost(E1, E2: nullptr, Mask, Part, SliceSize); |
| 12699 | if (!SameNodesEstimated && InVectors.size() == 1) |
| 12700 | InVectors.emplace_back(Args: &E1); |
| 12701 | } |
| 12702 | /// Adds 2 input vectors and the mask for their shuffling. |
| 12703 | void add(Value *V1, Value *V2, ArrayRef<int> Mask) { |
| 12704 | // May come only for shuffling of 2 vectors with extractelements, already |
| 12705 | // handled in adjustExtracts. |
| 12706 | assert(InVectors.size() == 1 && |
| 12707 | all_of(enumerate(CommonMask), |
| 12708 | [&](auto P) { |
| 12709 | if (P.value() == PoisonMaskElem) |
| 12710 | return Mask[P.index()] == PoisonMaskElem; |
| 12711 | auto *EI = cast<ExtractElementInst>( |
| 12712 | cast<const TreeEntry *>(InVectors.front()) |
| 12713 | ->getOrdered(P.index())); |
| 12714 | return EI->getVectorOperand() == V1 || |
| 12715 | EI->getVectorOperand() == V2; |
| 12716 | }) && |
| 12717 | "Expected extractelement vectors." ); |
| 12718 | } |
| 12719 | /// Adds another one input vector and the mask for the shuffling. |
| 12720 | void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) { |
| 12721 | if (InVectors.empty()) { |
| 12722 | assert(CommonMask.empty() && !ForExtracts && |
| 12723 | "Expected empty input mask/vectors." ); |
| 12724 | CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end()); |
| 12725 | InVectors.assign(NumElts: 1, Elt: V1); |
| 12726 | return; |
| 12727 | } |
| 12728 | if (ForExtracts) { |
| 12729 | // No need to add vectors here, already handled them in adjustExtracts. |
| 12730 | assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) && |
| 12731 | !CommonMask.empty() && |
| 12732 | all_of(enumerate(CommonMask), |
| 12733 | [&](auto P) { |
| 12734 | Value *Scalar = cast<const TreeEntry *>(InVectors[0]) |
| 12735 | ->getOrdered(P.index()); |
| 12736 | if (P.value() == PoisonMaskElem) |
| 12737 | return P.value() == Mask[P.index()] || |
| 12738 | isa<UndefValue>(Scalar); |
| 12739 | if (isa<Constant>(V1)) |
| 12740 | return true; |
| 12741 | auto *EI = cast<ExtractElementInst>(Scalar); |
| 12742 | return EI->getVectorOperand() == V1; |
| 12743 | }) && |
| 12744 | "Expected only tree entry for extractelement vectors." ); |
| 12745 | return; |
| 12746 | } |
| 12747 | assert(!InVectors.empty() && !CommonMask.empty() && |
| 12748 | "Expected only tree entries from extracts/reused buildvectors." ); |
| 12749 | unsigned VF = getVF(V: V1); |
| 12750 | if (InVectors.size() == 2) { |
| 12751 | Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask); |
| 12752 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 12753 | VF = std::max<unsigned>(a: VF, b: CommonMask.size()); |
| 12754 | } else if (const auto *InTE = |
| 12755 | InVectors.front().dyn_cast<const TreeEntry *>()) { |
| 12756 | VF = std::max(a: VF, b: InTE->getVectorFactor()); |
| 12757 | } else { |
| 12758 | VF = std::max( |
| 12759 | a: VF, b: cast<FixedVectorType>(Val: cast<Value *>(Val&: InVectors.front())->getType()) |
| 12760 | ->getNumElements()); |
| 12761 | } |
| 12762 | InVectors.push_back(Elt: V1); |
| 12763 | for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) |
| 12764 | if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) |
| 12765 | CommonMask[Idx] = Mask[Idx] + VF; |
| 12766 | } |
| 12767 | Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0, |
| 12768 | Value *Root = nullptr) { |
| 12769 | Cost += getBuildVectorCost(VL, Root); |
| 12770 | if (!Root) { |
| 12771 | // FIXME: Need to find a way to avoid use of getNullValue here. |
| 12772 | SmallVector<Constant *> Vals; |
| 12773 | unsigned VF = VL.size(); |
| 12774 | if (MaskVF != 0) |
| 12775 | VF = std::min(a: VF, b: MaskVF); |
| 12776 | Type *VLScalarTy = VL.front()->getType(); |
| 12777 | for (Value *V : VL.take_front(N: VF)) { |
| 12778 | Type *ScalarTy = VLScalarTy->getScalarType(); |
| 12779 | if (isa<PoisonValue>(Val: V)) { |
| 12780 | Vals.push_back(Elt: PoisonValue::get(T: ScalarTy)); |
| 12781 | continue; |
| 12782 | } |
| 12783 | if (isa<UndefValue>(Val: V)) { |
| 12784 | Vals.push_back(Elt: UndefValue::get(T: ScalarTy)); |
| 12785 | continue; |
| 12786 | } |
| 12787 | Vals.push_back(Elt: Constant::getNullValue(Ty: ScalarTy)); |
| 12788 | } |
| 12789 | if (auto *VecTy = dyn_cast<FixedVectorType>(Val: VLScalarTy)) { |
| 12790 | assert(SLPReVec && "FixedVectorType is not expected." ); |
| 12791 | // When REVEC is enabled, we need to expand vector types into scalar |
| 12792 | // types. |
| 12793 | Vals = replicateMask(Val: Vals, VF: VecTy->getNumElements()); |
| 12794 | } |
| 12795 | return ConstantVector::get(V: Vals); |
| 12796 | } |
| 12797 | return ConstantVector::getSplat( |
| 12798 | EC: ElementCount::getFixed( |
| 12799 | MinVal: cast<FixedVectorType>(Val: Root->getType())->getNumElements()), |
| 12800 | Elt: getAllOnesValue(DL: *R.DL, Ty: ScalarTy->getScalarType())); |
| 12801 | } |
| 12802 | InstructionCost createFreeze(InstructionCost Cost) { return Cost; } |
| 12803 | /// Finalize emission of the shuffles. |
| 12804 | InstructionCost finalize( |
| 12805 | ArrayRef<int> ExtMask, |
| 12806 | ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors, |
| 12807 | ArrayRef<int> SubVectorsMask, unsigned VF = 0, |
| 12808 | function_ref<void(Value *&, SmallVectorImpl<int> &, |
| 12809 | function_ref<Value *(Value *, Value *, ArrayRef<int>)>)> |
| 12810 | Action = {}) { |
| 12811 | IsFinalized = true; |
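| | // If a post-processing action is provided, account for the pending shuffle |
| | // of the accumulated inputs first and let the action adjust the resulting |
| | // vector and the common mask. |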
| 12812 | if (Action) { |
| 12813 | const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front(); |
| 12814 | if (InVectors.size() == 2) |
| 12815 | Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask); |
| 12816 | else |
| 12817 | Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask); |
| 12818 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 12819 | assert(VF > 0 && |
| 12820 | "Expected vector length for the final value before action." ); |
| 12821 | Value *V = cast<Value *>(Val: Vec); |
| 12822 | Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) { |
| 12823 | Cost += createShuffle(P1: V1, P2: V2, Mask); |
| 12824 | return V1; |
| 12825 | }); |
| 12826 | InVectors.front() = V; |
| 12827 | } |
| 12828 | if (!SubVectors.empty()) { |
| 12829 | const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front(); |
| 12830 | if (InVectors.size() == 2) |
| 12831 | Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask); |
| 12832 | else |
| 12833 | Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask); |
| 12834 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 12835 | // Add subvectors permutation cost. |
| 12836 | if (!SubVectorsMask.empty()) { |
| 12837 | assert(SubVectorsMask.size() <= CommonMask.size() && |
| 12838 | "Expected same size of masks for subvectors and common mask." ); |
| 12839 | SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem); |
| 12840 | copy(Range&: SubVectorsMask, Out: SVMask.begin()); |
| 12841 | for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) { |
| 12842 | if (I2 != PoisonMaskElem) { |
| 12843 | assert(I1 == PoisonMaskElem && "Expected unused subvectors mask" ); |
| 12844 | I1 = I2 + CommonMask.size(); |
| 12845 | } |
| 12846 | } |
| 12847 | Cost += ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc, |
| 12848 | Tp: getWidenedType(ScalarTy, VF: CommonMask.size()), |
| 12849 | Mask: SVMask, CostKind); |
| 12850 | } |
| 12851 | for (auto [E, Idx] : SubVectors) { |
| 12852 | Type *EScalarTy = E->Scalars.front()->getType(); |
| 12853 | bool IsSigned = true; |
| 12854 | if (auto It = R.MinBWs.find(Val: E); It != R.MinBWs.end()) { |
| 12855 | EScalarTy = |
| 12856 | IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first); |
| 12857 | IsSigned = It->second.second; |
| 12858 | } |
| 12859 | if (ScalarTy != EScalarTy) { |
| 12860 | unsigned CastOpcode = Instruction::Trunc; |
| 12861 | unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy); |
| 12862 | unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy); |
| 12863 | if (DstSz > SrcSz) |
| 12864 | CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt; |
| 12865 | Cost += TTI.getCastInstrCost( |
| 12866 | Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF: E->getVectorFactor()), |
| 12867 | Src: getWidenedType(ScalarTy: EScalarTy, VF: E->getVectorFactor()), |
| 12868 | CCH: TTI::CastContextHint::Normal, CostKind); |
| 12869 | } |
| 12870 | Cost += ::getShuffleCost( |
| 12871 | TTI, Kind: TTI::SK_InsertSubvector, |
| 12872 | Tp: getWidenedType(ScalarTy, VF: CommonMask.size()), Mask: {}, CostKind, Index: Idx, |
| 12873 | SubTp: getWidenedType(ScalarTy, VF: E->getVectorFactor())); |
| 12874 | if (!CommonMask.empty()) { |
| 12875 | std::iota(first: std::next(x: CommonMask.begin(), n: Idx), |
| 12876 | last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()), |
| 12877 | value: Idx); |
| 12878 | } |
| 12879 | } |
| 12880 | } |
| 12881 | |
| 12882 | if (!ExtMask.empty()) { |
| 12883 | if (CommonMask.empty()) { |
| 12884 | CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end()); |
| 12885 | } else { |
| 12886 | SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem); |
| 12887 | for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { |
| 12888 | if (ExtMask[I] == PoisonMaskElem) |
| 12889 | continue; |
| 12890 | NewMask[I] = CommonMask[ExtMask[I]]; |
| 12891 | } |
| 12892 | CommonMask.swap(RHS&: NewMask); |
| 12893 | } |
| 12894 | } |
| 12895 | if (CommonMask.empty()) { |
| 12896 | assert(InVectors.size() == 1 && "Expected only one vector with no mask" ); |
| 12897 | return Cost; |
| 12898 | } |
| 12899 | return Cost + |
| 12900 | createShuffle(P1: InVectors.front(), |
| 12901 | P2: InVectors.size() == 2 ? InVectors.back() : nullptr, |
| 12902 | Mask: CommonMask); |
| 12903 | } |
| 12904 | |
| 12905 | ~ShuffleCostEstimator() { |
| 12906 | assert((IsFinalized || CommonMask.empty()) && |
| 12907 | "Shuffle construction must be finalized." ); |
| 12908 | } |
| 12909 | }; |
| 12910 | |
| 12911 | const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E, |
| 12912 | unsigned Idx) const { |
| 12913 | TreeEntry *Op = OperandsToTreeEntry.at(Val: {E, Idx}); |
| 12914 | assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!" ); |
| 12915 | return Op; |
| 12916 | } |
| 12917 | |
| 12918 | TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const { |
| 12919 | if (TE.State == TreeEntry::ScatterVectorize || |
| 12920 | TE.State == TreeEntry::StridedVectorize) |
| 12921 | return TTI::CastContextHint::GatherScatter; |
| 12922 | if (TE.State == TreeEntry::CompressVectorize) |
| 12923 | return TTI::CastContextHint::Masked; |
| 12924 | if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load && |
| 12925 | !TE.isAltShuffle()) { |
| 12926 | if (TE.ReorderIndices.empty()) |
| 12927 | return TTI::CastContextHint::Normal; |
| 12928 | SmallVector<int> Mask; |
| 12929 | inversePermutation(Indices: TE.ReorderIndices, Mask); |
| 12930 | if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts: Mask.size())) |
| 12931 | return TTI::CastContextHint::Reversed; |
| 12932 | } |
| 12933 | return TTI::CastContextHint::None; |
| 12934 | } |
| 12935 | |
| 12936 | InstructionCost |
| 12937 | BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals, |
| 12938 | SmallPtrSetImpl<Value *> &CheckedExtracts) { |
| 12939 | ArrayRef<Value *> VL = E->Scalars; |
| 12940 | |
| 12941 | Type *ScalarTy = getValueType(V: VL[0]); |
| 12942 | if (!isValidElementType(Ty: ScalarTy)) |
| 12943 | return InstructionCost::getInvalid(); |
| 12944 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 12945 | |
| 12946 | // If we have computed a smaller type for the expression, update VecTy so |
| 12947 | // that the costs will be accurate. |
| 12948 | auto It = MinBWs.find(Val: E); |
| 12949 | Type *OrigScalarTy = ScalarTy; |
| 12950 | if (It != MinBWs.end()) { |
| 12951 | auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy); |
| 12952 | ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first); |
| 12953 | if (VecTy) |
| 12954 | ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements()); |
| 12955 | } |
| 12956 | auto *VecTy = getWidenedType(ScalarTy, VF: VL.size()); |
| 12957 | unsigned EntryVF = E->getVectorFactor(); |
| 12958 | auto *FinalVecTy = getWidenedType(ScalarTy, VF: EntryVF); |
| 12959 | |
| 12960 | if (E->isGather()) { |
| 12961 | if (allConstant(VL)) |
| 12962 | return 0; |
| 12963 | if (isa<InsertElementInst>(Val: VL[0])) |
| 12964 | return InstructionCost::getInvalid(); |
| 12965 | if (isa<CmpInst>(Val: VL.front())) |
| 12966 | ScalarTy = VL.front()->getType(); |
| 12967 | return processBuildVector<ShuffleCostEstimator, InstructionCost>( |
| 12968 | E, ScalarTy, Params&: *TTI, Params&: VectorizedVals, Params&: *this, Params&: CheckedExtracts); |
| 12969 | } |
| 12970 | if (E->State == TreeEntry::SplitVectorize) { |
| 12971 | assert(E->CombinedEntriesWithIndices.size() == 2 && |
| 12972 | "Expected exactly 2 combined entries." ); |
| 12973 | assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask." ); |
| 12974 | InstructionCost VectorCost = 0; |
| 12975 | if (E->ReorderIndices.empty()) { |
| 12976 | VectorCost = ::getShuffleCost( |
| 12977 | TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: FinalVecTy, Mask: {}, CostKind, |
| 12978 | Index: E->CombinedEntriesWithIndices.back().second, |
| 12979 | SubTp: getWidenedType( |
| 12980 | ScalarTy, |
| 12981 | VF: VectorizableTree[E->CombinedEntriesWithIndices.back().first] |
| 12982 | ->getVectorFactor())); |
| 12983 | } else { |
| 12984 | unsigned CommonVF = |
| 12985 | std::max(a: VectorizableTree[E->CombinedEntriesWithIndices.front().first] |
| 12986 | ->getVectorFactor(), |
| 12987 | b: VectorizableTree[E->CombinedEntriesWithIndices.back().first] |
| 12988 | ->getVectorFactor()); |
| 12989 | VectorCost = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, |
| 12990 | Tp: getWidenedType(ScalarTy, VF: CommonVF), |
| 12991 | Mask: E->getSplitMask(), CostKind); |
| 12992 | } |
| 12993 | LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree" )); |
| 12994 | return VectorCost; |
| 12995 | } |
| 12996 | InstructionCost CommonCost = 0; |
| 12997 | SmallVector<int> Mask; |
| 12998 | if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize && |
| 12999 | (E->State != TreeEntry::StridedVectorize || |
| 13000 | !isReverseOrder(Order: E->ReorderIndices))) { |
| 13001 | SmallVector<int> NewMask; |
| 13002 | if (E->getOpcode() == Instruction::Store) { |
| 13003 | // For stores the order is actually a mask. |
| 13004 | NewMask.resize(N: E->ReorderIndices.size()); |
| 13005 | copy(Range: E->ReorderIndices, Out: NewMask.begin()); |
| 13006 | } else { |
| 13007 | inversePermutation(Indices: E->ReorderIndices, Mask&: NewMask); |
| 13008 | } |
| 13009 | ::addMask(Mask, SubMask: NewMask); |
| 13010 | } |
| 13011 | if (!E->ReuseShuffleIndices.empty()) |
| 13012 | ::addMask(Mask, SubMask: E->ReuseShuffleIndices); |
| 13013 | if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size())) |
| 13014 | CommonCost = |
| 13015 | ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask); |
| 13016 | assert((E->State == TreeEntry::Vectorize || |
| 13017 | E->State == TreeEntry::ScatterVectorize || |
| 13018 | E->State == TreeEntry::StridedVectorize || |
| 13019 | E->State == TreeEntry::CompressVectorize) && |
| 13020 | "Unhandled state" ); |
| 13021 | assert(E->getOpcode() && |
| 13022 | ((allSameType(VL) && allSameBlock(VL)) || |
| 13023 | (E->getOpcode() == Instruction::GetElementPtr && |
| 13024 | E->getMainOp()->getType()->isPointerTy())) && |
| 13025 | "Invalid VL" ); |
| 13026 | Instruction *VL0 = E->getMainOp(); |
| 13027 | unsigned ShuffleOrOp = |
| 13028 | E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); |
| 13029 | if (E->CombinedOp != TreeEntry::NotCombinedOp) |
| 13030 | ShuffleOrOp = E->CombinedOp; |
| 13031 | SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end()); |
| 13032 | const unsigned Sz = UniqueValues.size(); |
| 13033 | SmallBitVector UsedScalars(Sz, false); |
| 13034 | for (unsigned I = 0; I < Sz; ++I) { |
| 13035 | if (isa<Instruction>(Val: UniqueValues[I]) && |
| 13036 | getTreeEntries(V: UniqueValues[I]).front() == E) |
| 13037 | continue; |
| 13038 | UsedScalars.set(I); |
| 13039 | } |
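| | // Cast context hint for an operand value: prefer the hint of its tree |
| | // entry; otherwise, if the node's first operand is a plain (non-alternate) |
| | // load sequence, assume a gather/scatter context. |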
| 13040 | auto GetCastContextHint = [&](Value *V) { |
| 13041 | if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1) |
| 13042 | return getCastContextHint(TE: *OpTEs.front()); |
| 13043 | InstructionsState SrcState = getSameOpcode(VL: E->getOperand(OpIdx: 0), TLI: *TLI); |
| 13044 | if (SrcState && SrcState.getOpcode() == Instruction::Load && |
| 13045 | !SrcState.isAltShuffle()) |
| 13046 | return TTI::CastContextHint::GatherScatter; |
| 13047 | return TTI::CastContextHint::None; |
| 13048 | }; |
| 13049 | auto GetCostDiff = |
| 13050 | [=](function_ref<InstructionCost(unsigned)> ScalarEltCost, |
| 13051 | function_ref<InstructionCost(InstructionCost)> VectorCost) { |
| 13052 | // Calculate the cost of this instruction. |
| 13053 | InstructionCost ScalarCost = 0; |
| 13054 | if (isa<CastInst, CallInst>(Val: VL0)) { |
| 13055 | // For some of the instructions no need to calculate cost for each |
| 13056 | // particular instruction, we can use the cost of the single |
| 13057 | // instruction x total number of scalar instructions. |
| 13058 | ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0); |
| 13059 | } else { |
| 13060 | for (unsigned I = 0; I < Sz; ++I) { |
| 13061 | if (UsedScalars.test(Idx: I)) |
| 13062 | continue; |
| 13063 | ScalarCost += ScalarEltCost(I); |
| 13064 | } |
| 13065 | } |
| 13066 | |
| 13067 | InstructionCost VecCost = VectorCost(CommonCost); |
| 13068 | // Check if the current node must be resized, if the parent node is not |
| 13069 | // resized. |
| 13070 | if (It != MinBWs.end() && !UnaryInstruction::isCast(Opcode: E->getOpcode()) && |
| 13071 | E->Idx != 0 && |
| 13072 | (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) { |
| 13073 | const EdgeInfo &EI = E->UserTreeIndex; |
| 13074 | if (!EI.UserTE->hasState() || |
| 13075 | EI.UserTE->getOpcode() != Instruction::Select || |
| 13076 | EI.EdgeIdx != 0) { |
| 13077 | auto UserBWIt = MinBWs.find(Val: EI.UserTE); |
| 13078 | Type *UserScalarTy = |
| 13079 | (EI.UserTE->isGather() || |
| 13080 | EI.UserTE->State == TreeEntry::SplitVectorize) |
| 13081 | ? EI.UserTE->Scalars.front()->getType() |
| 13082 | : EI.UserTE->getOperand(OpIdx: EI.EdgeIdx).front()->getType(); |
| 13083 | if (UserBWIt != MinBWs.end()) |
| 13084 | UserScalarTy = IntegerType::get(C&: ScalarTy->getContext(), |
| 13085 | NumBits: UserBWIt->second.first); |
| 13086 | if (ScalarTy != UserScalarTy) { |
| 13087 | unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy); |
| 13088 | unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: UserScalarTy); |
| 13089 | unsigned VecOpcode; |
| 13090 | auto *UserVecTy = getWidenedType(ScalarTy: UserScalarTy, VF: E->Scalars.size()); |
| 13091 | if (BWSz > SrcBWSz) |
| 13092 | VecOpcode = Instruction::Trunc; |
| 13093 | else |
| 13094 | VecOpcode = |
| 13095 | It->second.second ? Instruction::SExt : Instruction::ZExt; |
| 13096 | TTI::CastContextHint CCH = GetCastContextHint(VL0); |
| 13097 | VecCost += TTI->getCastInstrCost(Opcode: VecOpcode, Dst: UserVecTy, Src: VecTy, CCH, |
| 13098 | CostKind); |
| 13099 | } |
| 13100 | } |
| 13101 | } |
| 13102 | LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost, |
| 13103 | ScalarCost, "Calculated costs for Tree" )); |
| 13104 | return VecCost - ScalarCost; |
| 13105 | }; |
| 13106 | // Calculate cost difference from vectorizing set of GEPs. |
| 13107 | // Negative value means vectorizing is profitable. |
| 13108 | auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) { |
| 13109 | assert((E->State == TreeEntry::Vectorize || |
| 13110 | E->State == TreeEntry::StridedVectorize || |
| 13111 | E->State == TreeEntry::CompressVectorize) && |
| 13112 | "Entry state expected to be Vectorize, StridedVectorize or " |
| 13113 | "MaskedLoadCompressVectorize here." ); |
| 13114 | InstructionCost ScalarCost = 0; |
| 13115 | InstructionCost VecCost = 0; |
| 13116 | std::tie(args&: ScalarCost, args&: VecCost) = getGEPCosts( |
| 13117 | TTI: *TTI, Ptrs, BasePtr, Opcode: E->getOpcode(), CostKind, ScalarTy: OrigScalarTy, VecTy); |
| 13118 | LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost, |
| 13119 | "Calculated GEPs cost for Tree" )); |
| 13120 | |
| 13121 | return VecCost - ScalarCost; |
| 13122 | }; |
| 13123 | |
| 13124 | auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) { |
| 13125 | auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: VI ? VI : VL); |
| 13126 | if (MinMaxID == Intrinsic::not_intrinsic) |
| 13127 | return InstructionCost::getInvalid(); |
| 13128 | Type *CanonicalType = Ty; |
| 13129 | if (CanonicalType->isPtrOrPtrVectorTy()) |
| 13130 | CanonicalType = CanonicalType->getWithNewType(EltTy: IntegerType::get( |
| 13131 | C&: CanonicalType->getContext(), |
| 13132 | NumBits: DL->getTypeSizeInBits(Ty: CanonicalType->getScalarType()))); |
| 13133 | |
| 13134 | IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType, |
| 13135 | {CanonicalType, CanonicalType}); |
| 13136 | InstructionCost IntrinsicCost = |
| 13137 | TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind); |
| 13138 | // If the selects are the only uses of the compares, they will be |
| 13139 | // dead and we can adjust the cost by removing their cost. |
| 13140 | if (VI && SelectOnly) { |
| 13141 | assert((!Ty->isVectorTy() || SLPReVec) && |
| 13142 | "Expected only for scalar type." ); |
| 13143 | auto *CI = cast<CmpInst>(Val: VI->getOperand(i: 0)); |
| 13144 | IntrinsicCost -= TTI->getCmpSelInstrCost( |
| 13145 | Opcode: CI->getOpcode(), ValTy: Ty, CondTy: Builder.getInt1Ty(), VecPred: CI->getPredicate(), |
| 13146 | CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, |
| 13147 | Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CI); |
| 13148 | } |
| 13149 | return IntrinsicCost; |
| 13150 | }; |
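| | // Dispatch on the (shuffle or) opcode of this tree entry. Most cases build |
| | // a GetScalarCost/GetVectorCost pair and return their difference via |
| | // GetCostDiff; a negative result means vectorizing this node is profitable |
| | // relative to the scalar code. |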
| 13151 | switch (ShuffleOrOp) { |
| 13152 | case Instruction::PHI: { |
| 13153 | // Count reused scalars. |
| 13154 | InstructionCost ScalarCost = 0; |
| 13155 | SmallPtrSet<const TreeEntry *, 4> CountedOps; |
| 13156 | for (Value *V : UniqueValues) { |
| 13157 | auto *PHI = dyn_cast<PHINode>(Val: V); |
| 13158 | if (!PHI) |
| 13159 | continue; |
| 13160 | |
| 13161 | ValueList Operands(PHI->getNumIncomingValues(), nullptr); |
| 13162 | for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) { |
| 13163 | Value *Op = PHI->getIncomingValue(i: I); |
| 13164 | Operands[I] = Op; |
| 13165 | } |
| 13166 | if (const TreeEntry *OpTE = |
| 13167 | getSameValuesTreeEntry(V: Operands.front(), VL: Operands)) |
| 13168 | if (CountedOps.insert(Ptr: OpTE).second && |
| 13169 | !OpTE->ReuseShuffleIndices.empty()) |
| 13170 | ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() - |
| 13171 | OpTE->Scalars.size()); |
| 13172 | } |
| 13173 | |
| 13174 | return CommonCost - ScalarCost; |
| 13175 | } |
| 13176 | case Instruction::ExtractValue: |
| 13177 | case Instruction::ExtractElement: { |
| 13178 | APInt DemandedElts; |
| 13179 | VectorType *SrcVecTy = nullptr; |
| 13180 | auto GetScalarCost = [&](unsigned Idx) { |
| 13181 | if (isa<PoisonValue>(Val: UniqueValues[Idx])) |
| 13182 | return InstructionCost(TTI::TCC_Free); |
| 13183 | |
| 13184 | auto *I = cast<Instruction>(Val: UniqueValues[Idx]); |
| 13185 | if (!SrcVecTy) { |
| 13186 | if (ShuffleOrOp == Instruction::ExtractElement) { |
| 13187 | auto *EE = cast<ExtractElementInst>(Val: I); |
| 13188 | SrcVecTy = EE->getVectorOperandType(); |
| 13189 | } else { |
| 13190 | auto *EV = cast<ExtractValueInst>(Val: I); |
| 13191 | Type *AggregateTy = EV->getAggregateOperand()->getType(); |
| 13192 | unsigned NumElts; |
| 13193 | if (auto *ATy = dyn_cast<ArrayType>(Val: AggregateTy)) |
| 13194 | NumElts = ATy->getNumElements(); |
| 13195 | else |
| 13196 | NumElts = AggregateTy->getStructNumElements(); |
| 13197 | SrcVecTy = getWidenedType(ScalarTy: OrigScalarTy, VF: NumElts); |
| 13198 | } |
| 13199 | } |
| 13200 | if (I->hasOneUse()) { |
| 13201 | Instruction *Ext = I->user_back(); |
| 13202 | if ((isa<SExtInst>(Val: Ext) || isa<ZExtInst>(Val: Ext)) && |
| 13203 | all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) { |
| 13204 | // Use getExtractWithExtendCost() to calculate the cost of |
| 13205 | // extractelement/ext pair. |
| 13206 | InstructionCost Cost = TTI->getExtractWithExtendCost( |
| 13207 | Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: SrcVecTy, Index: *getExtractIndex(E: I), |
| 13208 | CostKind); |
| 13209 | // Subtract the cost of s|zext which is subtracted separately. |
| 13210 | Cost -= TTI->getCastInstrCost( |
| 13211 | Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: I->getType(), |
| 13212 | CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext); |
| 13213 | return Cost; |
| 13214 | } |
| 13215 | } |
| 13216 | if (DemandedElts.isZero()) |
| 13217 | DemandedElts = APInt::getZero(numBits: getNumElements(Ty: SrcVecTy)); |
| 13218 | DemandedElts.setBit(*getExtractIndex(E: I)); |
| 13219 | return InstructionCost(TTI::TCC_Free); |
| 13220 | }; |
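| | // Extracts that survive vectorization are modeled as free scalars above; |
| | // instead, the vector side credits back the scalarization overhead of the |
| | // demanded lanes, since reusing the source vector removes those extracts. |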
| 13221 | auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) { |
| 13222 | return CommonCost - (DemandedElts.isZero() |
| 13223 | ? TTI::TCC_Free |
| 13224 | : TTI.getScalarizationOverhead( |
| 13225 | Ty: SrcVecTy, DemandedElts, /*Insert=*/false, |
| 13226 | /*Extract=*/true, CostKind)); |
| 13227 | }; |
| 13228 | return GetCostDiff(GetScalarCost, GetVectorCost); |
| 13229 | } |
| 13230 | case Instruction::InsertElement: { |
| 13231 | assert(E->ReuseShuffleIndices.empty() && |
| 13232 | "Unique insertelements only are expected." ); |
| 13233 | auto *SrcVecTy = cast<FixedVectorType>(Val: VL0->getType()); |
| 13234 | unsigned const NumElts = SrcVecTy->getNumElements(); |
| 13235 | unsigned const NumScalars = VL.size(); |
| 13236 | |
| 13237 | unsigned NumOfParts = ::getNumberOfParts(TTI: *TTI, VecTy: SrcVecTy); |
| 13238 | |
| 13239 | SmallVector<int> InsertMask(NumElts, PoisonMaskElem); |
| 13240 | unsigned OffsetBeg = *getElementIndex(Inst: VL.front()); |
| 13241 | unsigned OffsetEnd = OffsetBeg; |
| 13242 | InsertMask[OffsetBeg] = 0; |
| 13243 | for (auto [I, V] : enumerate(First: VL.drop_front())) { |
| 13244 | unsigned Idx = *getElementIndex(Inst: V); |
| 13245 | if (OffsetBeg > Idx) |
| 13246 | OffsetBeg = Idx; |
| 13247 | else if (OffsetEnd < Idx) |
| 13248 | OffsetEnd = Idx; |
| 13249 | InsertMask[Idx] = I + 1; |
| 13250 | } |
| 13251 | unsigned VecScalarsSz = PowerOf2Ceil(A: NumElts); |
| 13252 | if (NumOfParts > 0 && NumOfParts < NumElts) |
| 13253 | VecScalarsSz = PowerOf2Ceil(A: (NumElts + NumOfParts - 1) / NumOfParts); |
| 13254 | unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * |
| 13255 | VecScalarsSz; |
| 13256 | unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz); |
| 13257 | unsigned InsertVecSz = std::min<unsigned>( |
| 13258 | a: PowerOf2Ceil(A: OffsetEnd - OffsetBeg + 1), |
| 13259 | b: ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz); |
| 13260 | bool IsWholeSubvector = |
| 13261 | OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0); |
| 13262 | // Check if we can safely insert a subvector. If it is not possible, just |
| 13263 | // generate a whole-sized vector and shuffle the source vector and the new |
| 13264 | // subvector. |
| 13265 | if (OffsetBeg + InsertVecSz > VecSz) { |
| 13266 | // Align OffsetBeg to generate correct mask. |
| 13267 | OffsetBeg = alignDown(Value: OffsetBeg, Align: VecSz, Skew: Offset); |
| 13268 | InsertVecSz = VecSz; |
| 13269 | } |
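| | // Illustrative example (not from the source): with NumElts = 8, insert |
| | // indices {2,3,4,5} and a single register part, OffsetBeg = 2, OffsetEnd = 5, |
| | // VecSz = 8, Offset = 0 and InsertVecSz = 4, so the node is modeled as |
| | // inserting a 4-wide subvector at offset 2 into an 8-wide vector. |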
| 13270 | |
| 13271 | APInt DemandedElts = APInt::getZero(numBits: NumElts); |
| 13272 | // TODO: Add support for Instruction::InsertValue. |
| 13273 | SmallVector<int> Mask; |
| 13274 | if (!E->ReorderIndices.empty()) { |
| 13275 | inversePermutation(Indices: E->ReorderIndices, Mask); |
| 13276 | Mask.append(NumInputs: InsertVecSz - Mask.size(), Elt: PoisonMaskElem); |
| 13277 | } else { |
| 13278 | Mask.assign(NumElts: VecSz, Elt: PoisonMaskElem); |
| 13279 | std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: InsertVecSz), value: 0); |
| 13280 | } |
| 13281 | bool IsIdentity = true; |
| 13282 | SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem); |
| 13283 | Mask.swap(RHS&: PrevMask); |
| 13284 | for (unsigned I = 0; I < NumScalars; ++I) { |
| 13285 | unsigned InsertIdx = *getElementIndex(Inst: VL[PrevMask[I]]); |
| 13286 | DemandedElts.setBit(InsertIdx); |
| 13287 | IsIdentity &= InsertIdx - OffsetBeg == I; |
| 13288 | Mask[InsertIdx - OffsetBeg] = I; |
| 13289 | } |
| 13290 | assert(Offset < NumElts && "Failed to find vector index offset"); |
| 13291 | |
| 13292 | InstructionCost Cost = 0; |
| 13293 | Cost -= |
| 13294 | getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: SrcVecTy, DemandedElts, |
| 13295 | /*Insert*/ true, /*Extract*/ false, CostKind); |
| 13296 | |
| 13297 | // First cost - resize to actual vector size if not identity shuffle or |
| 13298 | // need to shift the vector. |
| 13299 | // Do not calculate the cost if the actual size is the register size and |
| 13300 | // we can merge this shuffle with the following SK_Select. |
| 13301 | auto *InsertVecTy = getWidenedType(ScalarTy, VF: InsertVecSz); |
| 13302 | if (!IsIdentity) |
| 13303 | Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc, |
| 13304 | Tp: InsertVecTy, Mask); |
| 13305 | auto *FirstInsert = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: [E](Value *V) { |
| 13306 | return !is_contained(Range: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0)); |
| 13307 | })); |
| 13308 | // Second cost - permutation with subvector, if some elements are from the |
| 13309 | // initial vector or inserting a subvector. |
| 13310 | // TODO: Implement the analysis of the FirstInsert->getOperand(0) |
| 13311 | // subvector of ActualVecTy. |
| 13312 | SmallBitVector InMask = |
| 13313 | isUndefVector(V: FirstInsert->getOperand(i: 0), |
| 13314 | UseMask: buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask)); |
| 13315 | if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) { |
| 13316 | if (InsertVecSz != VecSz) { |
| 13317 | auto *ActualVecTy = getWidenedType(ScalarTy, VF: VecSz); |
| 13318 | Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: ActualVecTy, Mask: {}, |
| 13319 | CostKind, Index: OffsetBeg - Offset, SubTp: InsertVecTy); |
| 13320 | } else { |
| 13321 | for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) |
| 13322 | Mask[I] = InMask.test(Idx: I) ? PoisonMaskElem : I; |
| 13323 | for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; |
| 13324 | I <= End; ++I) |
| 13325 | if (Mask[I] != PoisonMaskElem) |
| 13326 | Mask[I] = I + VecSz; |
| 13327 | for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) |
| 13328 | Mask[I] = |
| 13329 | ((I >= InMask.size()) || InMask.test(Idx: I)) ? PoisonMaskElem : I; |
| 13330 | Cost += |
| 13331 | ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: InsertVecTy, Mask); |
| 13332 | } |
| 13333 | } |
| 13334 | return Cost; |
| 13335 | } |
| 13336 | case Instruction::ZExt: |
| 13337 | case Instruction::SExt: |
| 13338 | case Instruction::FPToUI: |
| 13339 | case Instruction::FPToSI: |
| 13340 | case Instruction::FPExt: |
| 13341 | case Instruction::PtrToInt: |
| 13342 | case Instruction::IntToPtr: |
| 13343 | case Instruction::SIToFP: |
| 13344 | case Instruction::UIToFP: |
| 13345 | case Instruction::Trunc: |
| 13346 | case Instruction::FPTrunc: |
| 13347 | case Instruction::BitCast: { |
| 13348 | auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0)); |
| 13349 | Type *SrcScalarTy = VL0->getOperand(i: 0)->getType(); |
| 13350 | auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size()); |
| 13351 | unsigned Opcode = ShuffleOrOp; |
| 13352 | unsigned VecOpcode = Opcode; |
| 13353 | if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() && |
| 13354 | (SrcIt != MinBWs.end() || It != MinBWs.end())) { |
| 13355 | // Check if the values are candidates to demote. |
| 13356 | unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy->getScalarType()); |
| 13357 | if (SrcIt != MinBWs.end()) { |
| 13358 | SrcBWSz = SrcIt->second.first; |
| 13359 | unsigned SrcScalarTyNumElements = getNumElements(Ty: SrcScalarTy); |
| 13360 | SrcScalarTy = IntegerType::get(C&: F->getContext(), NumBits: SrcBWSz); |
| 13361 | SrcVecTy = |
| 13362 | getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size() * SrcScalarTyNumElements); |
| 13363 | } |
| 13364 | unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType()); |
| 13365 | if (BWSz == SrcBWSz) { |
| 13366 | VecOpcode = Instruction::BitCast; |
| 13367 | } else if (BWSz < SrcBWSz) { |
| 13368 | VecOpcode = Instruction::Trunc; |
| 13369 | } else if (It != MinBWs.end()) { |
| 13370 | assert(BWSz > SrcBWSz && "Invalid cast!"); |
| 13371 | VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt; |
| 13372 | } else if (SrcIt != MinBWs.end()) { |
| 13373 | assert(BWSz > SrcBWSz && "Invalid cast!"); |
| 13374 | VecOpcode = |
| 13375 | SrcIt->second.second ? Instruction::SExt : Instruction::ZExt; |
| 13376 | } |
| 13377 | } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() && |
| 13378 | !SrcIt->second.second) { |
| 13379 | VecOpcode = Instruction::UIToFP; |
| 13380 | } |
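| | // Illustrative example (not from the source): if the scalars are |
| | // 'zext i32 -> i64' and MinBWs demotes both this entry and its operand to |
| | // i16, then BWSz == SrcBWSz and the vector op degenerates to a bitcast, |
| | // which costs nothing beyond CommonCost. |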
| 13381 | auto GetScalarCost = [&](unsigned Idx) -> InstructionCost { |
| 13382 | assert(Idx == 0 && "Expected 0 index only"); |
| 13383 | return TTI->getCastInstrCost(Opcode, Dst: VL0->getType(), |
| 13384 | Src: VL0->getOperand(i: 0)->getType(), |
| 13385 | CCH: TTI::getCastContextHint(I: VL0), CostKind, I: VL0); |
| 13386 | }; |
| 13387 | auto GetVectorCost = [=](InstructionCost CommonCost) { |
| 13388 | // Do not count cost here if minimum bitwidth is in effect and it is just |
| 13389 | // a bitcast (here it is just a noop). |
| 13390 | if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast) |
| 13391 | return CommonCost; |
| 13392 | auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr; |
| 13393 | TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(i: 0)); |
| 13394 | |
| 13395 | bool IsArithmeticExtendedReduction = |
| 13396 | E->Idx == 0 && UserIgnoreList && |
| 13397 | all_of(Range: *UserIgnoreList, P: [](Value *V) { |
| 13398 | auto *I = cast<Instruction>(Val: V); |
| 13399 | return is_contained(Set: {Instruction::Add, Instruction::FAdd, |
| 13400 | Instruction::Mul, Instruction::FMul, |
| 13401 | Instruction::And, Instruction::Or, |
| 13402 | Instruction::Xor}, |
| 13403 | Element: I->getOpcode()); |
| 13404 | }); |
| 13405 | if (IsArithmeticExtendedReduction && |
| 13406 | (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt)) |
| 13407 | return CommonCost; |
| 13408 | return CommonCost + |
| 13409 | TTI->getCastInstrCost(Opcode: VecOpcode, Dst: VecTy, Src: SrcVecTy, CCH, CostKind, |
| 13410 | I: VecOpcode == Opcode ? VI : nullptr); |
| 13411 | }; |
| 13412 | return GetCostDiff(GetScalarCost, GetVectorCost); |
| 13413 | } |
| 13414 | case Instruction::FCmp: |
| 13415 | case Instruction::ICmp: |
| 13416 | case Instruction::Select: { |
| 13417 | CmpPredicate VecPred, SwappedVecPred; |
| 13418 | auto MatchCmp = m_Cmp(Pred&: VecPred, L: m_Value(), R: m_Value()); |
| 13419 | if (match(V: VL0, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) || |
| 13420 | match(V: VL0, P: MatchCmp)) |
| 13421 | SwappedVecPred = CmpInst::getSwappedPredicate(pred: VecPred); |
| 13422 | else |
| 13423 | SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy() |
| 13424 | ? CmpInst::BAD_FCMP_PREDICATE |
| 13425 | : CmpInst::BAD_ICMP_PREDICATE; |
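| | // VecPred/SwappedVecPred seed the expected vector predicate from VL0; if any |
| | // scalar below uses a different (and not merely swapped) predicate, both are |
| | // reset to the BAD_*_PREDICATE sentinel and a generic compare cost is used. |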
| 13426 | auto GetScalarCost = [&](unsigned Idx) { |
| 13427 | if (isa<PoisonValue>(Val: UniqueValues[Idx])) |
| 13428 | return InstructionCost(TTI::TCC_Free); |
| 13429 | |
| 13430 | auto *VI = cast<Instruction>(Val: UniqueValues[Idx]); |
| 13431 | CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy() |
| 13432 | ? CmpInst::BAD_FCMP_PREDICATE |
| 13433 | : CmpInst::BAD_ICMP_PREDICATE; |
| 13434 | auto MatchCmp = m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value()); |
| 13435 | if ((!match(V: VI, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) && |
| 13436 | !match(V: VI, P: MatchCmp)) || |
| 13437 | (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) && |
| 13438 | CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred))) |
| 13439 | VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy() |
| 13440 | ? CmpInst::BAD_FCMP_PREDICATE |
| 13441 | : CmpInst::BAD_ICMP_PREDICATE; |
| 13442 | |
| 13443 | InstructionCost ScalarCost = TTI->getCmpSelInstrCost( |
| 13444 | Opcode: E->getOpcode(), ValTy: OrigScalarTy, CondTy: Builder.getInt1Ty(), VecPred: CurrentPred, |
| 13445 | CostKind, Op1Info: getOperandInfo(Ops: VI->getOperand(i: 0)), |
| 13446 | Op2Info: getOperandInfo(Ops: VI->getOperand(i: 1)), I: VI); |
| 13447 | InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI); |
| 13448 | if (IntrinsicCost.isValid()) |
| 13449 | ScalarCost = IntrinsicCost; |
| 13450 | |
| 13451 | return ScalarCost; |
| 13452 | }; |
| 13453 | auto GetVectorCost = [&](InstructionCost CommonCost) { |
| 13454 | auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size()); |
| 13455 | |
| 13456 | InstructionCost VecCost = |
| 13457 | TTI->getCmpSelInstrCost(Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred, |
| 13458 | CostKind, Op1Info: getOperandInfo(Ops: E->getOperand(OpIdx: 0)), |
| 13459 | Op2Info: getOperandInfo(Ops: E->getOperand(OpIdx: 1)), I: VL0); |
| 13460 | if (auto *SI = dyn_cast<SelectInst>(Val: VL0)) { |
| 13461 | auto *CondType = |
| 13462 | getWidenedType(ScalarTy: SI->getCondition()->getType(), VF: VL.size()); |
| 13463 | unsigned CondNumElements = CondType->getNumElements(); |
| 13464 | unsigned VecTyNumElements = getNumElements(Ty: VecTy); |
| 13465 | assert(VecTyNumElements >= CondNumElements && |
| 13466 | VecTyNumElements % CondNumElements == 0 && |
| 13467 | "Cannot vectorize Instruction::Select" ); |
| 13468 | if (CondNumElements != VecTyNumElements) { |
| 13469 | // When the return type is i1 but the source is a fixed vector type, we |
| 13470 | // need to duplicate the condition value. |
| 13471 | VecCost += ::getShuffleCost( |
| 13472 | TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: CondType, |
| 13473 | Mask: createReplicatedMask(ReplicationFactor: VecTyNumElements / CondNumElements, |
| 13474 | VF: CondNumElements)); |
| 13475 | } |
| 13476 | } |
| 13477 | return VecCost + CommonCost; |
| 13478 | }; |
| 13479 | return GetCostDiff(GetScalarCost, GetVectorCost); |
| 13480 | } |
| 13481 | case TreeEntry::MinMax: { |
| 13482 | auto GetScalarCost = [&](unsigned Idx) { |
| 13483 | return GetMinMaxCost(OrigScalarTy); |
| 13484 | }; |
| 13485 | auto GetVectorCost = [&](InstructionCost CommonCost) { |
| 13486 | InstructionCost VecCost = GetMinMaxCost(VecTy); |
| 13487 | return VecCost + CommonCost; |
| 13488 | }; |
| 13489 | return GetCostDiff(GetScalarCost, GetVectorCost); |
| 13490 | } |
| 13491 | case Instruction::FNeg: |
| 13492 | case Instruction::Add: |
| 13493 | case Instruction::FAdd: |
| 13494 | case Instruction::Sub: |
| 13495 | case Instruction::FSub: |
| 13496 | case Instruction::Mul: |
| 13497 | case Instruction::FMul: |
| 13498 | case Instruction::UDiv: |
| 13499 | case Instruction::SDiv: |
| 13500 | case Instruction::FDiv: |
| 13501 | case Instruction::URem: |
| 13502 | case Instruction::SRem: |
| 13503 | case Instruction::FRem: |
| 13504 | case Instruction::Shl: |
| 13505 | case Instruction::LShr: |
| 13506 | case Instruction::AShr: |
| 13507 | case Instruction::And: |
| 13508 | case Instruction::Or: |
| 13509 | case Instruction::Xor: { |
| 13510 | auto GetScalarCost = [&](unsigned Idx) { |
| 13511 | if (isa<PoisonValue>(Val: UniqueValues[Idx])) |
| 13512 | return InstructionCost(TTI::TCC_Free); |
| 13513 | |
| 13514 | // We cannot retrieve the operand from UniqueValues[Idx] because an |
| 13515 | // interchangeable instruction may be used. The order and the actual |
| 13516 | // operand might differ from what is retrieved from UniqueValues[Idx]. |
| 13517 | Value *Op1 = E->getOperand(OpIdx: 0)[Idx]; |
| 13518 | Value *Op2; |
| 13519 | SmallVector<const Value *, 2> Operands(1, Op1); |
| 13520 | if (isa<UnaryOperator>(Val: UniqueValues[Idx])) { |
| 13521 | Op2 = Op1; |
| 13522 | } else { |
| 13523 | Op2 = E->getOperand(OpIdx: 1)[Idx]; |
| 13524 | Operands.push_back(Elt: Op2); |
| 13525 | } |
| 13526 | TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(V: Op1); |
| 13527 | TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(V: Op2); |
| 13528 | return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: OrigScalarTy, CostKind, |
| 13529 | Opd1Info: Op1Info, Opd2Info: Op2Info, Args: Operands); |
| 13530 | }; |
| 13531 | auto GetVectorCost = [=](InstructionCost CommonCost) { |
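| | // If this is an 'and' with a constant operand whose low bits are all ones up |
| | // to the minimized bit width, the 'and' becomes redundant after truncation, |
| | // so only the common (shuffle) cost remains. |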
| 13532 | if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) { |
| 13533 | for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) { |
| 13534 | ArrayRef<Value *> Ops = E->getOperand(OpIdx: I); |
| 13535 | if (all_of(Range&: Ops, P: [&](Value *Op) { |
| 13536 | auto *CI = dyn_cast<ConstantInt>(Val: Op); |
| 13537 | return CI && CI->getValue().countr_one() >= It->second.first; |
| 13538 | })) |
| 13539 | return CommonCost; |
| 13540 | } |
| 13541 | } |
| 13542 | unsigned OpIdx = isa<UnaryOperator>(Val: VL0) ? 0 : 1; |
| 13543 | TTI::OperandValueInfo Op1Info = getOperandInfo(Ops: E->getOperand(OpIdx: 0)); |
| 13544 | TTI::OperandValueInfo Op2Info = getOperandInfo(Ops: E->getOperand(OpIdx)); |
| 13545 | return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: VecTy, CostKind, Opd1Info: Op1Info, |
| 13546 | Opd2Info: Op2Info, Args: {}, CxtI: nullptr, TLibInfo: TLI) + |
| 13547 | CommonCost; |
| 13548 | }; |
| 13549 | return GetCostDiff(GetScalarCost, GetVectorCost); |
| 13550 | } |
| 13551 | case Instruction::GetElementPtr: { |
| 13552 | return CommonCost + GetGEPCostDiff(VL, VL0); |
| 13553 | } |
| 13554 | case Instruction::Load: { |
| 13555 | auto GetScalarCost = [&](unsigned Idx) { |
| 13556 | auto *VI = cast<LoadInst>(Val: UniqueValues[Idx]); |
| 13557 | return TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: OrigScalarTy, |
| 13558 | Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(), |
| 13559 | CostKind, OpdInfo: TTI::OperandValueInfo(), I: VI); |
| 13560 | }; |
| 13561 | auto *LI0 = cast<LoadInst>(Val: VL0); |
| 13562 | auto GetVectorCost = [&](InstructionCost CommonCost) { |
| 13563 | InstructionCost VecLdCost; |
| 13564 | switch (E->State) { |
| 13565 | case TreeEntry::Vectorize: |
| 13566 | if (unsigned Factor = E->getInterleaveFactor()) { |
| 13567 | VecLdCost = TTI->getInterleavedMemoryOpCost( |
| 13568 | Opcode: Instruction::Load, VecTy, Factor, Indices: {}, Alignment: LI0->getAlign(), |
| 13569 | AddressSpace: LI0->getPointerAddressSpace(), CostKind); |
| 13570 | |
| 13571 | } else { |
| 13572 | VecLdCost = TTI->getMemoryOpCost( |
| 13573 | Opcode: Instruction::Load, Src: VecTy, Alignment: LI0->getAlign(), |
| 13574 | AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo()); |
| 13575 | } |
| 13576 | break; |
| 13577 | case TreeEntry::StridedVectorize: { |
| 13578 | Align CommonAlignment = |
| 13579 | computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef()); |
| 13580 | VecLdCost = TTI->getStridedMemoryOpCost( |
| 13581 | Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(), |
| 13582 | /*VariableMask=*/false, Alignment: CommonAlignment, CostKind); |
| 13583 | break; |
| 13584 | } |
| 13585 | case TreeEntry::CompressVectorize: { |
| 13586 | bool IsMasked; |
| 13587 | unsigned InterleaveFactor; |
| 13588 | SmallVector<int> CompressMask; |
| 13589 | VectorType *LoadVecTy; |
| 13590 | SmallVector<Value *> Scalars(VL); |
| 13591 | if (!E->ReorderIndices.empty()) { |
| 13592 | SmallVector<int> Mask(E->ReorderIndices.begin(), |
| 13593 | E->ReorderIndices.end()); |
| 13594 | reorderScalars(Scalars, Mask); |
| 13595 | } |
| 13596 | SmallVector<Value *> PointerOps(Scalars.size()); |
| 13597 | for (auto [I, V] : enumerate(First&: Scalars)) |
| 13598 | PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand(); |
| 13599 | [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress( |
| 13600 | VL: Scalars, PointerOps, Order: E->ReorderIndices, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT, |
| 13601 | TLI: *TLI, AreAllUsersVectorized: [](Value *) { return true; }, IsMasked, InterleaveFactor, |
| 13602 | CompressMask, LoadVecTy); |
| 13603 | assert(IsVectorized && "Failed to vectorize load"); |
| 13604 | CompressEntryToData.try_emplace(Key: E, Args&: CompressMask, Args&: LoadVecTy, |
| 13605 | Args&: InterleaveFactor, Args&: IsMasked); |
| 13606 | Align CommonAlignment = LI0->getAlign(); |
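| | // Model the compressed load as either an interleaved load, a masked load |
| | // plus a compress shuffle, or a plain wide load plus a compress shuffle, |
| | // depending on what isMaskedLoadCompress decided above. |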
| 13607 | if (InterleaveFactor) { |
| 13608 | VecLdCost = TTI->getInterleavedMemoryOpCost( |
| 13609 | Opcode: Instruction::Load, VecTy: LoadVecTy, Factor: InterleaveFactor, Indices: {}, |
| 13610 | Alignment: CommonAlignment, AddressSpace: LI0->getPointerAddressSpace(), CostKind); |
| 13611 | } else if (IsMasked) { |
| 13612 | VecLdCost = TTI->getMaskedMemoryOpCost( |
| 13613 | Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment, |
| 13614 | AddressSpace: LI0->getPointerAddressSpace(), CostKind); |
| 13615 | // TODO: include this cost into CommonCost. |
| 13616 | VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, |
| 13617 | Tp: LoadVecTy, Mask: CompressMask, CostKind); |
| 13618 | } else { |
| 13619 | VecLdCost = TTI->getMemoryOpCost( |
| 13620 | Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment, |
| 13621 | AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo()); |
| 13622 | // TODO: include this cost into CommonCost. |
| 13623 | VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, |
| 13624 | Tp: LoadVecTy, Mask: CompressMask, CostKind); |
| 13625 | } |
| 13626 | break; |
| 13627 | } |
| 13628 | case TreeEntry::ScatterVectorize: { |
| 13629 | Align CommonAlignment = |
| 13630 | computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef()); |
| 13631 | VecLdCost = TTI->getGatherScatterOpCost( |
| 13632 | Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(), |
| 13633 | /*VariableMask=*/false, Alignment: CommonAlignment, CostKind); |
| 13634 | break; |
| 13635 | } |
| 13636 | case TreeEntry::CombinedVectorize: |
| 13637 | case TreeEntry::SplitVectorize: |
| 13638 | case TreeEntry::NeedToGather: |
| 13639 | llvm_unreachable("Unexpected vectorization state." ); |
| 13640 | } |
| 13641 | return VecLdCost + CommonCost; |
| 13642 | }; |
| 13643 | |
| 13644 | InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost); |
| 13645 | // If this node generates a masked gather load, it is not a terminal node. |
| 13646 | // Hence the address operand cost is estimated separately. |
| 13647 | if (E->State == TreeEntry::ScatterVectorize) |
| 13648 | return Cost; |
| 13649 | |
| 13650 | // Estimate cost of GEPs since this tree node is a terminator. |
| 13651 | SmallVector<Value *> PointerOps(VL.size()); |
| 13652 | for (auto [I, V] : enumerate(First&: VL)) |
| 13653 | PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand(); |
| 13654 | return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand()); |
| 13655 | } |
| 13656 | case Instruction::Store: { |
| 13657 | bool IsReorder = !E->ReorderIndices.empty(); |
| 13658 | auto GetScalarCost = [=](unsigned Idx) { |
| 13659 | auto *VI = cast<StoreInst>(Val: VL[Idx]); |
| 13660 | TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: VI->getValueOperand()); |
| 13661 | return TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: OrigScalarTy, |
| 13662 | Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(), |
| 13663 | CostKind, OpdInfo: OpInfo, I: VI); |
| 13664 | }; |
| 13665 | auto *BaseSI = |
| 13666 | cast<StoreInst>(Val: IsReorder ? VL[E->ReorderIndices.front()] : VL0); |
| 13667 | auto GetVectorCost = [=](InstructionCost CommonCost) { |
| 13668 | // We know that we can merge the stores. Calculate the cost. |
| 13669 | InstructionCost VecStCost; |
| 13670 | if (E->State == TreeEntry::StridedVectorize) { |
| 13671 | Align CommonAlignment = |
| 13672 | computeCommonAlignment<StoreInst>(VL: UniqueValues.getArrayRef()); |
| 13673 | VecStCost = TTI->getStridedMemoryOpCost( |
| 13674 | Opcode: Instruction::Store, DataTy: VecTy, Ptr: BaseSI->getPointerOperand(), |
| 13675 | /*VariableMask=*/false, Alignment: CommonAlignment, CostKind); |
| 13676 | } else { |
| 13677 | assert(E->State == TreeEntry::Vectorize && |
| 13678 | "Expected either strided or consecutive stores." ); |
| 13679 | if (unsigned Factor = E->getInterleaveFactor()) { |
| 13680 | assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() && |
| 13681 | "No reused shuffles expected" ); |
| 13682 | CommonCost = 0; |
| 13683 | VecStCost = TTI->getInterleavedMemoryOpCost( |
| 13684 | Opcode: Instruction::Store, VecTy, Factor, Indices: {}, Alignment: BaseSI->getAlign(), |
| 13685 | AddressSpace: BaseSI->getPointerAddressSpace(), CostKind); |
| 13686 | } else { |
| 13687 | TTI::OperandValueInfo OpInfo = getOperandInfo(Ops: E->getOperand(OpIdx: 0)); |
| 13688 | VecStCost = TTI->getMemoryOpCost( |
| 13689 | Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(), |
| 13690 | AddressSpace: BaseSI->getPointerAddressSpace(), CostKind, OpdInfo: OpInfo); |
| 13691 | } |
| 13692 | } |
| 13693 | return VecStCost + CommonCost; |
| 13694 | }; |
| 13695 | SmallVector<Value *> PointerOps(VL.size()); |
| 13696 | for (auto [I, V] : enumerate(First&: VL)) { |
| 13697 | unsigned Idx = IsReorder ? E->ReorderIndices[I] : I; |
| 13698 | PointerOps[Idx] = cast<StoreInst>(Val: V)->getPointerOperand(); |
| 13699 | } |
| 13700 | |
| 13701 | return GetCostDiff(GetScalarCost, GetVectorCost) + |
| 13702 | GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand()); |
| 13703 | } |
| 13704 | case Instruction::Call: { |
| 13705 | auto GetScalarCost = [&](unsigned Idx) { |
| 13706 | auto *CI = cast<CallInst>(Val: UniqueValues[Idx]); |
| 13707 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
| 13708 | if (ID != Intrinsic::not_intrinsic) { |
| 13709 | IntrinsicCostAttributes CostAttrs(ID, *CI, 1); |
| 13710 | return TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind); |
| 13711 | } |
| 13712 | return TTI->getCallInstrCost(F: CI->getCalledFunction(), |
| 13713 | RetTy: CI->getFunctionType()->getReturnType(), |
| 13714 | Tys: CI->getFunctionType()->params(), CostKind); |
| 13715 | }; |
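| | // For the vector form, take the cheaper of the two estimates returned by |
| | // getVectorCallCosts: lowering the call as a vector intrinsic or as a |
| | // vectorized library call. |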
| 13716 | auto GetVectorCost = [=](InstructionCost CommonCost) { |
| 13717 | auto *CI = cast<CallInst>(Val: VL0); |
| 13718 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
| 13719 | SmallVector<Type *> ArgTys = buildIntrinsicArgTypes( |
| 13720 | CI, ID, VF: VecTy->getNumElements(), |
| 13721 | MinBW: It != MinBWs.end() ? It->second.first : 0, TTI); |
| 13722 | auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys); |
| 13723 | return std::min(a: VecCallCosts.first, b: VecCallCosts.second) + CommonCost; |
| 13724 | }; |
| 13725 | return GetCostDiff(GetScalarCost, GetVectorCost); |
| 13726 | } |
| 13727 | case Instruction::ShuffleVector: { |
| 13728 | if (!SLPReVec || E->isAltShuffle()) |
| 13729 | assert(E->isAltShuffle() && |
| 13730 | ((Instruction::isBinaryOp(E->getOpcode()) && |
| 13731 | Instruction::isBinaryOp(E->getAltOpcode())) || |
| 13732 | (Instruction::isCast(E->getOpcode()) && |
| 13733 | Instruction::isCast(E->getAltOpcode())) || |
| 13734 | (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && |
| 13735 | "Invalid Shuffle Vector Operand" ); |
| 13736 | // Try to find the previous shuffle node with the same operands and same |
| 13737 | // main/alternate ops. |
| 13738 | auto TryFindNodeWithEqualOperands = [=]() { |
| 13739 | for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { |
| 13740 | if (TE.get() == E) |
| 13741 | break; |
| 13742 | if (TE->hasState() && TE->isAltShuffle() && |
| 13743 | ((TE->getOpcode() == E->getOpcode() && |
| 13744 | TE->getAltOpcode() == E->getAltOpcode()) || |
| 13745 | (TE->getOpcode() == E->getAltOpcode() && |
| 13746 | TE->getAltOpcode() == E->getOpcode())) && |
| 13747 | TE->hasEqualOperands(TE: *E)) |
| 13748 | return true; |
| 13749 | } |
| 13750 | return false; |
| 13751 | }; |
| 13752 | auto GetScalarCost = [&](unsigned Idx) { |
| 13753 | if (isa<PoisonValue>(Val: UniqueValues[Idx])) |
| 13754 | return InstructionCost(TTI::TCC_Free); |
| 13755 | |
| 13756 | auto *VI = cast<Instruction>(Val: UniqueValues[Idx]); |
| 13757 | assert(E->getMatchingMainOpOrAltOp(VI) && |
| 13758 | "Unexpected main/alternate opcode" ); |
| 13759 | (void)E; |
| 13760 | return TTI->getInstructionCost(U: VI, CostKind); |
| 13761 | }; |
| 13762 | // Need to clear CommonCost since the final shuffle cost is included into |
| 13763 | // vector cost. |
| 13764 | auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) { |
| 13765 | // VecCost is equal to sum of the cost of creating 2 vectors |
| 13766 | // and the cost of creating shuffle. |
| 13767 | InstructionCost VecCost = 0; |
| 13768 | if (TryFindNodeWithEqualOperands()) { |
| 13769 | LLVM_DEBUG({ |
| 13770 | dbgs() << "SLP: diamond match for alternate node found.\n" ; |
| 13771 | E->dump(); |
| 13772 | }); |
| 13773 | // No need to add new vector costs here since we're going to reuse |
| 13774 | // same main/alternate vector ops, just do different shuffling. |
| 13775 | } else if (Instruction::isBinaryOp(Opcode: E->getOpcode())) { |
| 13776 | VecCost = |
| 13777 | TTIRef.getArithmeticInstrCost(Opcode: E->getOpcode(), Ty: VecTy, CostKind); |
| 13778 | VecCost += |
| 13779 | TTIRef.getArithmeticInstrCost(Opcode: E->getAltOpcode(), Ty: VecTy, CostKind); |
| 13780 | } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) { |
| 13781 | auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size()); |
| 13782 | VecCost = TTIRef.getCmpSelInstrCost( |
| 13783 | Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred: CI0->getPredicate(), CostKind, |
| 13784 | Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, |
| 13785 | I: VL0); |
| 13786 | VecCost += TTIRef.getCmpSelInstrCost( |
| 13787 | Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, |
| 13788 | VecPred: cast<CmpInst>(Val: E->getAltOp())->getPredicate(), CostKind, |
| 13789 | Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, |
| 13790 | I: E->getAltOp()); |
| 13791 | } else { |
| 13792 | Type *SrcSclTy = E->getMainOp()->getOperand(i: 0)->getType(); |
| 13793 | auto *SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size()); |
| 13794 | if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) { |
| 13795 | auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0)); |
| 13796 | unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy); |
| 13797 | unsigned SrcBWSz = |
| 13798 | DL->getTypeSizeInBits(Ty: E->getMainOp()->getOperand(i: 0)->getType()); |
| 13799 | if (SrcIt != MinBWs.end()) { |
| 13800 | SrcBWSz = SrcIt->second.first; |
| 13801 | SrcSclTy = IntegerType::get(C&: SrcSclTy->getContext(), NumBits: SrcBWSz); |
| 13802 | SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size()); |
| 13803 | } |
| 13804 | if (BWSz <= SrcBWSz) { |
| 13805 | if (BWSz < SrcBWSz) |
| 13806 | VecCost = |
| 13807 | TTIRef.getCastInstrCost(Opcode: Instruction::Trunc, Dst: VecTy, Src: SrcTy, |
| 13808 | CCH: TTI::CastContextHint::None, CostKind); |
| 13809 | LLVM_DEBUG({ |
| 13810 | dbgs() |
| 13811 | << "SLP: alternate extension, which should be truncated.\n" ; |
| 13812 | E->dump(); |
| 13813 | }); |
| 13814 | return VecCost; |
| 13815 | } |
| 13816 | } |
| 13817 | VecCost = TTIRef.getCastInstrCost(Opcode: E->getOpcode(), Dst: VecTy, Src: SrcTy, |
| 13818 | CCH: TTI::CastContextHint::None, CostKind); |
| 13819 | VecCost += |
| 13820 | TTIRef.getCastInstrCost(Opcode: E->getAltOpcode(), Dst: VecTy, Src: SrcTy, |
| 13821 | CCH: TTI::CastContextHint::None, CostKind); |
| 13822 | } |
| 13823 | SmallVector<int> Mask; |
| 13824 | E->buildAltOpShuffleMask( |
| 13825 | IsAltOp: [&](Instruction *I) { |
| 13826 | assert(E->getMatchingMainOpOrAltOp(I) && |
| 13827 | "Unexpected main/alternate opcode" ); |
| 13828 | return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(), |
| 13829 | TLI: *TLI); |
| 13830 | }, |
| 13831 | Mask); |
| 13832 | VecCost += ::getShuffleCost(TTI: TTIRef, Kind: TargetTransformInfo::SK_PermuteTwoSrc, |
| 13833 | Tp: FinalVecTy, Mask, CostKind); |
| 13834 | // Patterns like [fadd,fsub] can be combined into a single instruction |
| 13835 | // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we |
| 13836 | // need to take into account their order when looking for the most used |
| 13837 | // order. |
| 13838 | unsigned Opcode0 = E->getOpcode(); |
| 13839 | unsigned Opcode1 = E->getAltOpcode(); |
| 13840 | SmallBitVector OpcodeMask( |
| 13841 | getAltInstrMask(VL: E->Scalars, ScalarTy, Opcode0, Opcode1)); |
| 13842 | // If this pattern is supported by the target then we consider the |
| 13843 | // order. |
| 13844 | if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) { |
| 13845 | InstructionCost AltVecCost = TTIRef.getAltInstrCost( |
| 13846 | VecTy, Opcode0, Opcode1, OpcodeMask, CostKind); |
| 13847 | return AltVecCost < VecCost ? AltVecCost : VecCost; |
| 13848 | } |
| 13849 | // TODO: Check the reverse order too. |
| 13850 | return VecCost; |
| 13851 | }; |
| 13852 | if (SLPReVec && !E->isAltShuffle()) |
| 13853 | return GetCostDiff( |
| 13854 | GetScalarCost, [&](InstructionCost) -> InstructionCost { |
| 13855 | // If a group uses mask in order, the shufflevector can be |
| 13856 | // eliminated by instcombine. Then the cost is 0. |
| 13857 | assert(isa<ShuffleVectorInst>(VL.front()) && |
| 13858 | "Not supported shufflevector usage." ); |
| 13859 | auto *SV = cast<ShuffleVectorInst>(Val: VL.front()); |
| 13860 | unsigned SVNumElements = |
| 13861 | cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType()) |
| 13862 | ->getNumElements(); |
| 13863 | unsigned GroupSize = SVNumElements / SV->getShuffleMask().size(); |
| 13864 | for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) { |
| 13865 | ArrayRef<Value *> Group = VL.slice(N: I, M: GroupSize); |
| 13866 | int NextIndex = 0; |
| 13867 | if (!all_of(Range&: Group, P: [&](Value *V) { |
| 13868 | assert(isa<ShuffleVectorInst>(V) && |
| 13869 | "Not supported shufflevector usage." ); |
| 13870 | auto *SV = cast<ShuffleVectorInst>(Val: V); |
| 13871 | int Index; |
| 13872 | [[maybe_unused]] bool IsExtractSubvectorMask = |
| 13873 | SV->isExtractSubvectorMask(Index); |
| 13874 | assert(IsExtractSubvectorMask && |
| 13875 | "Not supported shufflevector usage." ); |
| 13876 | if (NextIndex != Index) |
| 13877 | return false; |
| 13878 | NextIndex += SV->getShuffleMask().size(); |
| 13879 | return true; |
| 13880 | })) |
| 13881 | return ::getShuffleCost( |
| 13882 | TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: VecTy, |
| 13883 | Mask: calculateShufflevectorMask(VL: E->Scalars)); |
| 13884 | } |
| 13885 | return TTI::TCC_Free; |
| 13886 | }); |
| 13887 | return GetCostDiff(GetScalarCost, GetVectorCost); |
| 13888 | } |
| 13889 | case Instruction::Freeze: |
| 13890 | return CommonCost; |
| 13891 | default: |
| 13892 | llvm_unreachable("Unknown instruction" ); |
| 13893 | } |
| 13894 | } |
| 13895 | |
| 13896 | bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { |
| 13897 | LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height " |
| 13898 | << VectorizableTree.size() << " is fully vectorizable.\n"); |
| 13899 | |
| 13900 | auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) { |
| 13901 | SmallVector<int> Mask; |
| 13902 | return TE->isGather() && |
| 13903 | !any_of(Range: TE->Scalars, |
| 13904 | P: [this](Value *V) { return EphValues.contains(Ptr: V); }) && |
| 13905 | (allConstant(VL: TE->Scalars) || isSplat(VL: TE->Scalars) || |
| 13906 | TE->Scalars.size() < Limit || |
| 13907 | (((TE->hasState() && |
| 13908 | TE->getOpcode() == Instruction::ExtractElement) || |
| 13909 | all_of(Range: TE->Scalars, P: IsaPred<ExtractElementInst, UndefValue>)) && |
| 13910 | isFixedVectorShuffle(VL: TE->Scalars, Mask, AC)) || |
| 13911 | (TE->hasState() && TE->getOpcode() == Instruction::Load && |
| 13912 | !TE->isAltShuffle()) || |
| 13913 | any_of(Range: TE->Scalars, P: IsaPred<LoadInst>)); |
| 13914 | }; |
| 13915 | |
| 13916 | // We only handle trees of heights 1 and 2. |
| 13917 | if (VectorizableTree.size() == 1 && |
| 13918 | (VectorizableTree[0]->State == TreeEntry::Vectorize || |
| 13919 | VectorizableTree[0]->State == TreeEntry::StridedVectorize || |
| 13920 | VectorizableTree[0]->State == TreeEntry::CompressVectorize || |
| 13921 | (ForReduction && |
| 13922 | AreVectorizableGathers(VectorizableTree[0].get(), |
| 13923 | VectorizableTree[0]->Scalars.size()) && |
| 13924 | VectorizableTree[0]->getVectorFactor() > 2))) |
| 13925 | return true; |
| 13926 | |
| 13927 | if (VectorizableTree.size() != 2) |
| 13928 | return false; |
| 13929 | |
| 13930 | // Handle splat and all-constant stores. Also try to vectorize tiny trees |
| 13931 | // whose second node is a gather with fewer scalar operands than the initial |
| 13932 | // tree element (it may be profitable to shuffle the second gather), or whose |
| 13933 | // gathered scalars are extractelements that form a shuffle. |
| 13934 | if (VectorizableTree[0]->State == TreeEntry::Vectorize && |
| 13935 | AreVectorizableGathers(VectorizableTree[1].get(), |
| 13936 | VectorizableTree[0]->Scalars.size())) |
| 13937 | return true; |
| 13938 | |
| 13939 | // Gathering cost would be too much for tiny trees. |
| 13940 | if (VectorizableTree[0]->isGather() || |
| 13941 | (VectorizableTree[1]->isGather() && |
| 13942 | VectorizableTree[0]->State != TreeEntry::ScatterVectorize && |
| 13943 | VectorizableTree[0]->State != TreeEntry::StridedVectorize && |
| 13944 | VectorizableTree[0]->State != TreeEntry::CompressVectorize)) |
| 13945 | return false; |
| 13946 | |
| 13947 | return true; |
| 13948 | } |
| 13949 | |
| 13950 | static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, |
| 13951 | TargetTransformInfo *TTI, |
| 13952 | bool MustMatchOrInst) { |
| 13953 | // Look past the root to find a source value. Arbitrarily follow the |
| 13954 | // path through operand 0 of any 'or'. Also, peek through optional |
| 13955 | // shift-left-by-multiple-of-8-bits. |
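| | // Illustrative shape of a combinable root (hypothetical, not from the |
| | // source): %hi = zext i8 %b1 to i32; %lo = zext i8 %b0 to i32; |
| | //          %root = or (shl %hi, 8), %lo |
| | // where %b0/%b1 are byte loads; the backend can fold such a sequence back |
| | // into a single wider load. |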
| 13956 | Value *ZextLoad = Root; |
| 13957 | const APInt *ShAmtC; |
| 13958 | bool FoundOr = false; |
| 13959 | while (!isa<ConstantExpr>(Val: ZextLoad) && |
| 13960 | (match(V: ZextLoad, P: m_Or(L: m_Value(), R: m_Value())) || |
| 13961 | (match(V: ZextLoad, P: m_Shl(L: m_Value(), R: m_APInt(Res&: ShAmtC))) && |
| 13962 | ShAmtC->urem(RHS: 8) == 0))) { |
| 13963 | auto *BinOp = cast<BinaryOperator>(Val: ZextLoad); |
| 13964 | ZextLoad = BinOp->getOperand(i_nocapture: 0); |
| 13965 | if (BinOp->getOpcode() == Instruction::Or) |
| 13966 | FoundOr = true; |
| 13967 | } |
| 13968 | // Check if the input is an extended load of the required or/shift expression. |
| 13969 | Value *Load; |
| 13970 | if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root || |
| 13971 | !match(V: ZextLoad, P: m_ZExt(Op: m_Value(V&: Load))) || !isa<LoadInst>(Val: Load)) |
| 13972 | return false; |
| 13973 | |
| 13974 | // Require that the total load bit width is a legal integer type. |
| 13975 | // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target. |
| 13976 | // But <16 x i8> --> i128 is not, so the backend probably can't reduce it. |
| 13977 | Type *SrcTy = Load->getType(); |
| 13978 | unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts; |
| 13979 | if (!TTI->isTypeLegal(Ty: IntegerType::get(C&: Root->getContext(), NumBits: LoadBitWidth))) |
| 13980 | return false; |
| 13981 | |
| 13982 | // Everything matched - assume that we can fold the whole sequence using |
| 13983 | // load combining. |
| 13984 | LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at " |
| 13985 | << *(cast<Instruction>(Root)) << "\n"); |
| 13986 | |
| 13987 | return true; |
| 13988 | } |
| 13989 | |
| 13990 | bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const { |
| 13991 | if (RdxKind != RecurKind::Or) |
| 13992 | return false; |
| 13993 | |
| 13994 | unsigned NumElts = VectorizableTree[0]->Scalars.size(); |
| 13995 | Value *FirstReduced = VectorizableTree[0]->Scalars[0]; |
| 13996 | return isLoadCombineCandidateImpl(Root: FirstReduced, NumElts, TTI, |
| 13997 | /* MatchOr */ MustMatchOrInst: false); |
| 13998 | } |
| 13999 | |
| 14000 | bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const { |
| 14001 | // Peek through a final sequence of stores and check if all operations are |
| 14002 | // likely to be load-combined. |
| 14003 | unsigned NumElts = Stores.size(); |
| 14004 | for (Value *Scalar : Stores) { |
| 14005 | Value *X; |
| 14006 | if (!match(V: Scalar, P: m_Store(ValueOp: m_Value(V&: X), PointerOp: m_Value())) || |
| 14007 | !isLoadCombineCandidateImpl(Root: X, NumElts, TTI, /* MatchOr */ MustMatchOrInst: true)) |
| 14008 | return false; |
| 14009 | } |
| 14010 | return true; |
| 14011 | } |
| 14012 | |
| 14013 | bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { |
| 14014 | if (!DebugCounter::shouldExecute(CounterName: VectorizedGraphs)) |
| 14015 | return true; |
| 14016 | |
| 14017 | // Graph is empty - do nothing. |
| 14018 | if (VectorizableTree.empty()) { |
| 14019 | assert(ExternalUses.empty() && "We shouldn't have any external users"); |
| 14020 | |
| 14021 | return true; |
| 14022 | } |
| 14023 | |
| 14024 | // No need to vectorize inserts of gathered values. |
| 14025 | if (VectorizableTree.size() == 2 && |
| 14026 | isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) && |
| 14027 | VectorizableTree[1]->isGather() && |
| 14028 | (VectorizableTree[1]->getVectorFactor() <= 2 || |
| 14029 | !(isSplat(VL: VectorizableTree[1]->Scalars) || |
| 14030 | allConstant(VL: VectorizableTree[1]->Scalars)))) |
| 14031 | return true; |
| 14032 | |
| 14033 | // If the graph includes only PHI nodes and gathers, it is definitely not |
| 14034 | // profitable to vectorize; we can skip it if the cost threshold is the |
| 14035 | // default. The cost of vectorized PHI nodes is almost always 0 plus the cost |
| 14036 | // of gathers/buildvectors. |
| 14037 | constexpr int Limit = 4; |
| 14038 | if (!ForReduction && !SLPCostThreshold.getNumOccurrences() && |
| 14039 | !VectorizableTree.empty() && |
| 14040 | all_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) { |
| 14041 | return (TE->isGather() && |
| 14042 | (!TE->hasState() || |
| 14043 | TE->getOpcode() != Instruction::ExtractElement) && |
| 14044 | count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <= Limit) || |
| 14045 | (TE->hasState() && TE->getOpcode() == Instruction::PHI); |
| 14046 | })) |
| 14047 | return true; |
| 14048 | |
| 14049 | // Do not vectorize small tree of phis only, if all vector phis are also |
| 14050 | // gathered. |
| 14051 | if (!ForReduction && SLPCostThreshold.getNumOccurrences() && |
| 14052 | VectorizableTree.size() <= Limit && |
| 14053 | all_of(Range: VectorizableTree, |
| 14054 | P: [&](const std::unique_ptr<TreeEntry> &TE) { |
| 14055 | return (TE->isGather() && |
| 14056 | (!TE->hasState() || |
| 14057 | TE->getOpcode() != Instruction::ExtractElement) && |
| 14058 | count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <= |
| 14059 | Limit) || |
| 14060 | (TE->hasState() && |
| 14061 | (TE->getOpcode() == Instruction::InsertElement || |
| 14062 | (TE->getOpcode() == Instruction::PHI && |
| 14063 | all_of(Range&: TE->Scalars, P: [&](Value *V) { |
| 14064 | return isa<PoisonValue>(Val: V) || MustGather.contains(Ptr: V); |
| 14065 | })))); |
| 14066 | }) && |
| 14067 | any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) { |
| 14068 | return TE->State == TreeEntry::Vectorize && |
| 14069 | TE->getOpcode() == Instruction::PHI; |
| 14070 | })) |
| 14071 | return true; |
| 14072 | |
| 14073 | // If the tree contains only phis, buildvectors, split nodes and |
| 14074 | // small nodes with reuses, we can skip it. |
| 14075 | if (!ForReduction && !SLPCostThreshold.getNumOccurrences() && |
| 14076 | all_of(Range: VectorizableTree, P: [](const std::unique_ptr<TreeEntry> &TE) { |
| 14077 | return TE->State == TreeEntry::SplitVectorize || |
| 14078 | (TE->isGather() && |
| 14079 | none_of(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>)) || |
| 14080 | (TE->hasState() && (TE->getOpcode() == Instruction::PHI || |
| 14081 | (!TE->ReuseShuffleIndices.empty() && |
| 14082 | TE->Scalars.size() == 2))); |
| 14083 | })) |
| 14084 | return true; |
| 14085 | |
| 14086 | // We can vectorize the tree if its size is greater than or equal to the |
| 14087 | // minimum size specified by the MinTreeSize command line option. |
| 14088 | if (VectorizableTree.size() >= MinTreeSize) |
| 14089 | return false; |
| 14090 | |
| 14091 | // If we have a tiny tree (a tree whose size is less than MinTreeSize), we |
| 14092 | // can vectorize it if we can prove it fully vectorizable. |
| 14093 | if (isFullyVectorizableTinyTree(ForReduction)) |
| 14094 | return false; |
| 14095 | |
| 14096 | // Check if any of the gather node forms an insertelement buildvector |
| 14097 | // somewhere. |
| 14098 | bool IsAllowedSingleBVNode = |
| 14099 | VectorizableTree.size() > 1 || |
| 14100 | (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() && |
| 14101 | !VectorizableTree.front()->isAltShuffle() && |
| 14102 | VectorizableTree.front()->getOpcode() != Instruction::PHI && |
| 14103 | VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr && |
| 14104 | allSameBlock(VL: VectorizableTree.front()->Scalars)); |
| 14105 | if (any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) { |
| 14106 | return TE->isGather() && all_of(Range&: TE->Scalars, P: [&](Value *V) { |
| 14107 | return isa<ExtractElementInst, Constant>(Val: V) || |
| 14108 | (IsAllowedSingleBVNode && |
| 14109 | !V->hasNUsesOrMore(N: UsesLimit) && |
| 14110 | any_of(Range: V->users(), P: IsaPred<InsertElementInst>)); |
| 14111 | }); |
| 14112 | })) |
| 14113 | return false; |
| 14114 | |
| 14115 | if (VectorizableTree.back()->isGather() && |
| 14116 | VectorizableTree.back()->hasState() && |
| 14117 | VectorizableTree.back()->isAltShuffle() && |
| 14118 | VectorizableTree.back()->getVectorFactor() > 2 && |
| 14119 | allSameBlock(VL: VectorizableTree.back()->Scalars) && |
| 14120 | !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() && |
| 14121 | TTI->getScalarizationOverhead( |
| 14122 | Ty: getWidenedType(ScalarTy: VectorizableTree.back()->Scalars.front()->getType(), |
| 14123 | VF: VectorizableTree.back()->getVectorFactor()), |
| 14124 | DemandedElts: APInt::getAllOnes(numBits: VectorizableTree.back()->getVectorFactor()), |
| 14125 | /*Insert=*/true, /*Extract=*/false, |
| 14126 | CostKind: TTI::TCK_RecipThroughput) > -SLPCostThreshold) |
| 14127 | return false; |
| 14128 | |
| 14129 | // Otherwise, we can't vectorize the tree. It is both tiny and not fully |
| 14130 | // vectorizable. |
| 14131 | return true; |
| 14132 | } |
| 14133 | |
| 14134 | bool BoUpSLP::isTreeNotExtendable() const { |
| 14135 | if (getCanonicalGraphSize() != getTreeSize()) { |
| 14136 | constexpr unsigned SmallTree = 3; |
| 14137 | if (VectorizableTree.front()->isNonPowOf2Vec() && |
| 14138 | getCanonicalGraphSize() <= SmallTree && |
| 14139 | count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()), |
| 14140 | P: [](const std::unique_ptr<TreeEntry> &TE) { |
| 14141 | return TE->isGather() && TE->hasState() && |
| 14142 | TE->getOpcode() == Instruction::Load && |
| 14143 | !allSameBlock(VL: TE->Scalars); |
| 14144 | }) == 1) |
| 14145 | return true; |
| 14146 | return false; |
| 14147 | } |
| 14148 | bool Res = false; |
| 14149 | for (unsigned Idx : seq<unsigned>(Size: getTreeSize())) { |
| 14150 | TreeEntry &E = *VectorizableTree[Idx]; |
| 14151 | if (E.State == TreeEntry::SplitVectorize) |
| 14152 | return false; |
| 14153 | if (!E.isGather()) |
| 14154 | continue; |
| 14155 | if ((E.hasState() && E.getOpcode() != Instruction::Load) || |
| 14156 | (!E.hasState() && |
| 14157 | all_of(Range&: E.Scalars, P: IsaPred<ExtractElementInst, LoadInst>)) || |
| 14158 | (isa<ExtractElementInst>(Val: E.Scalars.front()) && |
| 14159 | getSameOpcode(VL: ArrayRef(E.Scalars).drop_front(), TLI: *TLI).valid())) |
| 14160 | return false; |
| 14161 | if (isSplat(VL: E.Scalars) || allConstant(VL: E.Scalars)) |
| 14162 | continue; |
| 14163 | Res = true; |
| 14164 | } |
| 14165 | return Res; |
| 14166 | } |
| 14167 | |
| 14168 | InstructionCost BoUpSLP::getSpillCost() { |
| 14169 | // Walk from the bottom of the tree to the top, tracking which values are |
| 14170 | // live. When we see a call instruction that is not part of our tree, |
| 14171 | // query TTI to see if there is a cost to keeping values live over it |
| 14172 | // (for example, if spills and fills are required). |
| 14173 | |
| 14174 | const TreeEntry *Root = VectorizableTree.front().get(); |
| 14175 | if (Root->isGather()) |
| 14176 | return 0; |
| 14177 | |
| 14178 | InstructionCost Cost = 0; |
| 14179 | SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>> |
| 14180 | EntriesToOperands; |
| 14181 | SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction; |
| 14182 | SmallPtrSet<const Instruction *, 8> LastInstructions; |
| 14183 | for (const auto &TEPtr : VectorizableTree) { |
| 14184 | if (!TEPtr->isGather()) { |
| 14185 | Instruction *LastInst = &getLastInstructionInBundle(E: TEPtr.get()); |
| 14186 | EntriesToLastInstruction.try_emplace(Key: TEPtr.get(), Args&: LastInst); |
| 14187 | LastInstructions.insert(Ptr: LastInst); |
| 14188 | } |
| 14189 | if (TEPtr->UserTreeIndex) |
| 14190 | EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(Elt: TEPtr.get()); |
| 14191 | } |
| 14192 | |
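| | // Treat an intrinsic as not being a real call for spill purposes when it is |
| | // assume-like or its intrinsic cost is below that of an equivalent call, |
| | // since such intrinsics are presumably lowered inline rather than as calls. |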
| 14193 | auto NoCallIntrinsic = [this](const Instruction *I) { |
| 14194 | const auto *II = dyn_cast<IntrinsicInst>(Val: I); |
| 14195 | if (!II) |
| 14196 | return false; |
| 14197 | if (II->isAssumeLikeIntrinsic()) |
| 14198 | return true; |
| 14199 | IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II); |
| 14200 | InstructionCost IntrCost = |
| 14201 | TTI->getIntrinsicInstrCost(ICA, CostKind: TTI::TCK_RecipThroughput); |
| 14202 | InstructionCost CallCost = TTI->getCallInstrCost( |
| 14203 | F: nullptr, RetTy: II->getType(), Tys: ICA.getArgTypes(), CostKind: TTI::TCK_RecipThroughput); |
| 14204 | return IntrCost < CallCost; |
| 14205 | }; |
| 14206 | |
| 14207 | // Maps the last instruction in an entry to the last instruction of one of |
| 14208 | // its operand entries, plus a flag. If the flag is true, there are no calls |
| 14209 | // between these instructions. |
| 14210 | SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>> |
| 14211 | CheckedInstructions; |
| 14212 | unsigned Budget = 0; |
| 14213 | const unsigned BudgetLimit = |
| 14214 | ScheduleRegionSizeBudget / VectorizableTree.size(); |
| 14215 | auto CheckForNonVecCallsInSameBlock = [&](Instruction *First, |
| 14216 | const Instruction *Last) { |
| 14217 | assert(First->getParent() == Last->getParent() && |
| 14218 | "Expected instructions in same block." ); |
| 14219 | if (auto It = CheckedInstructions.find(Val: Last); |
| 14220 | It != CheckedInstructions.end()) { |
| 14221 | const Instruction *Checked = It->second.getPointer(); |
| 14222 | if (Checked == First || Checked->comesBefore(Other: First)) |
| 14223 | return It->second.getInt() != 0; |
| 14224 | Last = Checked; |
| 14225 | } else if (Last == First || Last->comesBefore(Other: First)) { |
| 14226 | return true; |
| 14227 | } |
| 14228 | BasicBlock::const_reverse_iterator InstIt = |
| 14229 | ++First->getIterator().getReverse(), |
| 14230 | PrevInstIt = |
| 14231 | Last->getIterator().getReverse(); |
| 14232 | SmallVector<const Instruction *> LastInstsInRange; |
| 14233 | while (InstIt != PrevInstIt && Budget <= BudgetLimit) { |
| 14234 | // Debug information does not impact spill cost. |
| 14235 | // Vectorized calls, represented as vector intrinsics, do not impact spill |
| 14236 | // cost. |
| 14237 | if (const auto *CB = dyn_cast<CallBase>(Val: &*PrevInstIt); |
| 14238 | CB && !NoCallIntrinsic(CB) && !isVectorized(V: CB)) { |
| 14239 | for (const Instruction *LastInst : LastInstsInRange) |
| 14240 | CheckedInstructions.try_emplace(Key: LastInst, Args: &*PrevInstIt, Args: 0); |
| 14241 | return false; |
| 14242 | } |
| 14243 | if (LastInstructions.contains(Ptr: &*PrevInstIt)) |
| 14244 | LastInstsInRange.push_back(Elt: &*PrevInstIt); |
| 14245 | |
| 14246 | ++PrevInstIt; |
| 14247 | ++Budget; |
| 14248 | } |
| 14249 | for (const Instruction *LastInst : LastInstsInRange) |
| 14250 | CheckedInstructions.try_emplace( |
| 14251 | Key: LastInst, Args: PrevInstIt == InstIt ? First : &*PrevInstIt, |
| 14252 | Args: Budget <= BudgetLimit ? 1 : 0); |
| 14253 | return Budget <= BudgetLimit; |
| 14254 | }; |
| 14255 | auto AddCosts = [&](const TreeEntry *Op) { |
| 14256 | Type *ScalarTy = Op->Scalars.front()->getType(); |
| 14257 | auto It = MinBWs.find(Val: Op); |
| 14258 | if (It != MinBWs.end()) |
| 14259 | ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first); |
| 14260 | auto *VecTy = getWidenedType(ScalarTy, VF: Op->getVectorFactor()); |
| 14261 | Cost += TTI->getCostOfKeepingLiveOverCall(Tys: VecTy); |
| 14262 | if (ScalarTy->isVectorTy()) { |
| 14263 | // Handle revec dead vector instructions. |
| 14264 | Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(Tys: ScalarTy); |
| 14265 | } |
| 14266 | }; |
// Memoize the relationship between blocks, i.e. whether there is (at least
// one) non-vectorized call between them. This allows skipping the analysis of
// the same block paths multiple times.
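// For example, if walking the predecessors from block A back to block B finds
// no non-vectorized calls, the pair (A, B) is cached as call-free and later
// queries for entries in A with operands defined in B reuse that answer
// without re-walking the predecessors.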
| 14270 | SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool> |
| 14271 | ParentOpParentToPreds; |
| 14272 | auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred, |
| 14273 | BasicBlock *OpParent) { |
| 14274 | auto Key = std::make_pair(x&: Root, y&: OpParent); |
| 14275 | if (auto It = ParentOpParentToPreds.find(Val: Key); |
| 14276 | It != ParentOpParentToPreds.end()) |
| 14277 | return It->second; |
| 14278 | SmallVector<BasicBlock *> Worklist; |
| 14279 | if (Pred) |
| 14280 | Worklist.push_back(Elt: Pred); |
| 14281 | else |
| 14282 | Worklist.append(in_start: pred_begin(BB: Root), in_end: pred_end(BB: Root)); |
| 14283 | SmallPtrSet<const BasicBlock *, 16> Visited; |
| 14284 | SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>> |
| 14285 | ParentsPairsToAdd; |
| 14286 | bool Res = false; |
| 14287 | auto Cleanup = make_scope_exit(F: [&]() { |
| 14288 | for (const auto &KeyPair : ParentsPairsToAdd) { |
| 14289 | assert(!ParentOpParentToPreds.contains(KeyPair) && |
| 14290 | "Should not have been added before." ); |
| 14291 | ParentOpParentToPreds.try_emplace(Key: KeyPair, Args&: Res); |
| 14292 | } |
| 14293 | }); |
| 14294 | while (!Worklist.empty()) { |
| 14295 | BasicBlock *BB = Worklist.pop_back_val(); |
| 14296 | if (BB == OpParent || !Visited.insert(Ptr: BB).second) |
| 14297 | continue; |
| 14298 | auto Pair = std::make_pair(x&: BB, y&: OpParent); |
| 14299 | if (auto It = ParentOpParentToPreds.find(Val: Pair); |
| 14300 | It != ParentOpParentToPreds.end()) { |
| 14301 | Res = It->second; |
| 14302 | return Res; |
| 14303 | } |
| 14304 | ParentsPairsToAdd.insert(V: Pair); |
| 14305 | unsigned BlockSize = BB->size(); |
| 14306 | if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget)) |
| 14307 | return Res; |
| 14308 | Budget += BlockSize; |
| 14309 | if (Budget > BudgetLimit) |
| 14310 | return Res; |
| 14311 | if (!isa<CatchSwitchInst>(Val: BB->getTerminator()) && |
| 14312 | !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(), |
| 14313 | BB->getTerminator())) |
| 14314 | return Res; |
| 14315 | Worklist.append(in_start: pred_begin(BB), in_end: pred_end(BB)); |
| 14316 | } |
| 14317 | Res = true; |
| 14318 | return Res; |
| 14319 | }; |
| 14320 | SmallVector<const TreeEntry *> LiveEntries(1, Root); |
| 14321 | while (!LiveEntries.empty()) { |
| 14322 | const TreeEntry *Entry = LiveEntries.pop_back_val(); |
| 14323 | SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Val: Entry); |
| 14324 | if (Operands.empty()) |
| 14325 | continue; |
| 14326 | Instruction *LastInst = EntriesToLastInstruction.at(Val: Entry); |
| 14327 | BasicBlock *Parent = LastInst->getParent(); |
| 14328 | for (const TreeEntry *Op : Operands) { |
| 14329 | if (!Op->isGather()) |
| 14330 | LiveEntries.push_back(Elt: Op); |
| 14331 | if (Entry->State == TreeEntry::SplitVectorize || |
| 14332 | (Entry->getOpcode() != Instruction::PHI && Op->isGather()) || |
| 14333 | (Op->isGather() && allConstant(VL: Op->Scalars))) |
| 14334 | continue; |
| 14335 | Budget = 0; |
| 14336 | BasicBlock *Pred = nullptr; |
| 14337 | if (auto *Phi = dyn_cast<PHINode>(Val: Entry->getMainOp())) |
| 14338 | Pred = Phi->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx); |
| 14339 | BasicBlock *OpParent; |
| 14340 | Instruction *OpLastInst; |
| 14341 | if (Op->isGather()) { |
| 14342 | assert(Entry->getOpcode() == Instruction::PHI && |
| 14343 | "Expected phi node only." ); |
| 14344 | OpParent = cast<PHINode>(Val: Entry->getMainOp()) |
| 14345 | ->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx); |
| 14346 | OpLastInst = OpParent->getTerminator(); |
| 14347 | for (Value *V : Op->Scalars) { |
| 14348 | auto *Inst = dyn_cast<Instruction>(Val: V); |
| 14349 | if (!Inst) |
| 14350 | continue; |
| 14351 | if (isVectorized(V)) { |
| 14352 | OpParent = Inst->getParent(); |
| 14353 | OpLastInst = Inst; |
| 14354 | break; |
| 14355 | } |
| 14356 | } |
| 14357 | } else { |
| 14358 | OpLastInst = EntriesToLastInstruction.at(Val: Op); |
| 14359 | OpParent = OpLastInst->getParent(); |
| 14360 | } |
| 14361 | // Check the call instructions within the same basic blocks. |
| 14362 | if (OpParent == Parent) { |
| 14363 | if (Entry->getOpcode() == Instruction::PHI) { |
| 14364 | if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst)) |
| 14365 | AddCosts(Op); |
| 14366 | continue; |
| 14367 | } |
| 14368 | if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst)) |
| 14369 | AddCosts(Op); |
| 14370 | continue; |
| 14371 | } |
| 14372 | // Check for call instruction in between blocks. |
| 14373 | // 1. Check entry's block to the head. |
| 14374 | if (Entry->getOpcode() != Instruction::PHI && |
| 14375 | !CheckForNonVecCallsInSameBlock( |
| 14376 | &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(), |
| 14377 | LastInst)) { |
| 14378 | AddCosts(Op); |
| 14379 | continue; |
| 14380 | } |
| 14381 | // 2. Check op's block from the end. |
| 14382 | if (!CheckForNonVecCallsInSameBlock(OpLastInst, |
| 14383 | OpParent->getTerminator())) { |
| 14384 | AddCosts(Op); |
| 14385 | continue; |
| 14386 | } |
| 14387 | // 3. Check the predecessors of entry's block till op's block. |
| 14388 | if (!CheckPredecessors(Parent, Pred, OpParent)) { |
| 14389 | AddCosts(Op); |
| 14390 | continue; |
| 14391 | } |
| 14392 | } |
| 14393 | } |
| 14394 | |
| 14395 | return Cost; |
| 14396 | } |
| 14397 | |
/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
/// the buildvector sequence.
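/// For example, in the buildvector sequence
///   %i0 = insertelement <2 x i32> poison, i32 %a, i32 0
///   %i1 = insertelement <2 x i32> %i0, i32 %b, i32 1
/// isFirstInsertElement(%i0, %i1) returns true, since \p IE1 is reached while
/// walking back through the vector operands of \p IE2.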
| 14400 | static bool isFirstInsertElement(const InsertElementInst *IE1, |
| 14401 | const InsertElementInst *IE2) { |
| 14402 | if (IE1 == IE2) |
| 14403 | return false; |
| 14404 | const auto *I1 = IE1; |
| 14405 | const auto *I2 = IE2; |
| 14406 | const InsertElementInst *PrevI1; |
| 14407 | const InsertElementInst *PrevI2; |
| 14408 | unsigned Idx1 = *getElementIndex(Inst: IE1); |
| 14409 | unsigned Idx2 = *getElementIndex(Inst: IE2); |
| 14410 | do { |
| 14411 | if (I2 == IE1) |
| 14412 | return true; |
| 14413 | if (I1 == IE2) |
| 14414 | return false; |
| 14415 | PrevI1 = I1; |
| 14416 | PrevI2 = I2; |
| 14417 | if (I1 && (I1 == IE1 || I1->hasOneUse()) && |
| 14418 | getElementIndex(Inst: I1).value_or(u&: Idx2) != Idx2) |
| 14419 | I1 = dyn_cast<InsertElementInst>(Val: I1->getOperand(i_nocapture: 0)); |
| 14420 | if (I2 && ((I2 == IE2 || I2->hasOneUse())) && |
| 14421 | getElementIndex(Inst: I2).value_or(u&: Idx1) != Idx1) |
| 14422 | I2 = dyn_cast<InsertElementInst>(Val: I2->getOperand(i_nocapture: 0)); |
| 14423 | } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2)); |
| 14424 | llvm_unreachable("Two different buildvectors not expected." ); |
| 14425 | } |
| 14426 | |
| 14427 | namespace { |
/// Returns the incoming Value * if the requested type is Value * too, or a
/// default-constructed value otherwise.
| 14430 | struct ValueSelect { |
| 14431 | template <typename U> |
| 14432 | static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) { |
| 14433 | return V; |
| 14434 | } |
| 14435 | template <typename U> |
| 14436 | static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) { |
| 14437 | return U(); |
| 14438 | } |
| 14439 | }; |
| 14440 | } // namespace |
| 14441 | |
/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps.
/// 1. If the Base vector is not an undef vector, resize the very first mask to
/// have a common VF and perform the action for 2 input vectors (including the
/// non-undef Base). Other shuffle masks are combined with the result of the
/// first stage and processed as a shuffle of 2 elements.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
/// the action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combining the masks properly between the steps.
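/// For example, with an undef Base and two inputs of VF 4 carrying the masks
/// {0, 1, poison, poison} and {poison, poison, 2, 3}, the two masks are merged
/// into the two-source mask {0, 1, 6, 7} before the action is invoked on both
/// vectors.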
| 14454 | template <typename T> |
static T *performExtractsShuffleAction(
| 14456 | MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base, |
| 14457 | function_ref<unsigned(T *)> GetVF, |
| 14458 | function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction, |
| 14459 | function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) { |
| 14460 | assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts." ); |
| 14461 | SmallVector<int> Mask(ShuffleMask.begin()->second); |
| 14462 | auto VMIt = std::next(ShuffleMask.begin()); |
| 14463 | T *Prev = nullptr; |
| 14464 | SmallBitVector UseMask = |
| 14465 | buildUseMask(VF: Mask.size(), Mask, MaskArg: UseMask::UndefsAsMask); |
| 14466 | SmallBitVector IsBaseUndef = isUndefVector(V: Base, UseMask); |
| 14467 | if (!IsBaseUndef.all()) { |
| 14468 | // Base is not undef, need to combine it with the next subvectors. |
| 14469 | std::pair<T *, bool> Res = |
| 14470 | ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false); |
| 14471 | SmallBitVector IsBasePoison = isUndefVector<true>(V: Base, UseMask); |
| 14472 | for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) { |
| 14473 | if (Mask[Idx] == PoisonMaskElem) |
| 14474 | Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx; |
| 14475 | else |
| 14476 | Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF; |
| 14477 | } |
| 14478 | [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base); |
| 14479 | assert((!V || GetVF(V) == Mask.size()) && |
| 14480 | "Expected base vector of VF number of elements." ); |
| 14481 | Prev = Action(Mask, {nullptr, Res.first}); |
| 14482 | } else if (ShuffleMask.size() == 1) { |
| 14483 | // Base is undef and only 1 vector is shuffled - perform the action only for |
| 14484 | // single vector, if the mask is not the identity mask. |
| 14485 | std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask, |
| 14486 | /*ForSingleMask=*/true); |
| 14487 | if (Res.second) |
| 14488 | // Identity mask is found. |
| 14489 | Prev = Res.first; |
| 14490 | else |
| 14491 | Prev = Action(Mask, {ShuffleMask.begin()->first}); |
| 14492 | } else { |
// Base is undef and at least 2 input vectors are shuffled - perform shuffles
// of 2 vectors step by step, combining the shuffle masks between the steps.
| 14495 | unsigned Vec1VF = GetVF(ShuffleMask.begin()->first); |
| 14496 | unsigned Vec2VF = GetVF(VMIt->first); |
| 14497 | if (Vec1VF == Vec2VF) { |
| 14498 | // No need to resize the input vectors since they are of the same size, we |
| 14499 | // can shuffle them directly. |
| 14500 | ArrayRef<int> SecMask = VMIt->second; |
| 14501 | for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { |
| 14502 | if (SecMask[I] != PoisonMaskElem) { |
| 14503 | assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars." ); |
| 14504 | Mask[I] = SecMask[I] + Vec1VF; |
| 14505 | } |
| 14506 | } |
| 14507 | Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first}); |
| 14508 | } else { |
| 14509 | // Vectors of different sizes - resize and reshuffle. |
| 14510 | std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask, |
| 14511 | /*ForSingleMask=*/false); |
| 14512 | std::pair<T *, bool> Res2 = |
| 14513 | ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false); |
| 14514 | ArrayRef<int> SecMask = VMIt->second; |
| 14515 | for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { |
| 14516 | if (Mask[I] != PoisonMaskElem) { |
| 14517 | assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars." ); |
| 14518 | if (Res1.second) |
| 14519 | Mask[I] = I; |
| 14520 | } else if (SecMask[I] != PoisonMaskElem) { |
| 14521 | assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars." ); |
| 14522 | Mask[I] = (Res2.second ? I : SecMask[I]) + VF; |
| 14523 | } |
| 14524 | } |
| 14525 | Prev = Action(Mask, {Res1.first, Res2.first}); |
| 14526 | } |
| 14527 | VMIt = std::next(VMIt); |
| 14528 | } |
| 14529 | [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all(); |
| 14530 | // Perform requested actions for the remaining masks/vectors. |
| 14531 | for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) { |
| 14532 | // Shuffle other input vectors, if any. |
| 14533 | std::pair<T *, bool> Res = |
| 14534 | ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false); |
| 14535 | ArrayRef<int> SecMask = VMIt->second; |
| 14536 | for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) { |
| 14537 | if (SecMask[I] != PoisonMaskElem) { |
| 14538 | assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) && |
| 14539 | "Multiple uses of scalars." ); |
| 14540 | Mask[I] = (Res.second ? I : SecMask[I]) + VF; |
| 14541 | } else if (Mask[I] != PoisonMaskElem) { |
| 14542 | Mask[I] = I; |
| 14543 | } |
| 14544 | } |
| 14545 | Prev = Action(Mask, {Prev, Res.first}); |
| 14546 | } |
| 14547 | return Prev; |
| 14548 | } |
| 14549 | |
| 14550 | namespace { |
| 14551 | /// Data type for handling buildvector sequences with the reused scalars from |
| 14552 | /// other tree entries. |
| 14553 | template <typename T> struct ShuffledInsertData { |
| 14554 | /// List of insertelements to be replaced by shuffles. |
| 14555 | SmallVector<InsertElementInst *> InsertElements; |
| 14556 | /// The parent vectors and shuffle mask for the given list of inserts. |
| 14557 | MapVector<T, SmallVector<int>> ValueMasks; |
| 14558 | }; |
| 14559 | } // namespace |
| 14560 | |
| 14561 | InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals, |
| 14562 | InstructionCost ReductionCost) { |
| 14563 | InstructionCost Cost = ReductionCost; |
| 14564 | LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " |
| 14565 | << VectorizableTree.size() << ".\n" ); |
| 14566 | |
| 14567 | unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); |
| 14568 | |
SmallPtrSet<Value *, 4> CheckedExtracts;
| 14570 | for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { |
| 14571 | TreeEntry &TE = *VectorizableTree[I]; |
// No need to count the cost for combined entries; they are combined with
// other nodes, so just skip their cost.
| 14574 | if (TE.State == TreeEntry::CombinedVectorize) { |
| 14575 | LLVM_DEBUG( |
| 14576 | dbgs() << "SLP: Skipping cost for combined node that starts with " |
| 14577 | << *TE.Scalars[0] << ".\n" ; |
| 14578 | TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n" ); |
| 14579 | continue; |
| 14580 | } |
| 14581 | if (TE.hasState() && |
| 14582 | (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) { |
| 14583 | if (const TreeEntry *E = |
| 14584 | getSameValuesTreeEntry(V: TE.getMainOp(), VL: TE.Scalars); |
| 14585 | E && E->getVectorFactor() == TE.getVectorFactor()) { |
| 14586 | // Some gather nodes might be absolutely the same as some vectorizable |
| 14587 | // nodes after reordering, need to handle it. |
| 14588 | LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle " |
| 14589 | << shortBundleName(TE.Scalars, TE.Idx) << ".\n" |
| 14590 | << "SLP: Current total cost = " << Cost << "\n" ); |
| 14591 | continue; |
| 14592 | } |
| 14593 | } |
| 14594 | |
| 14595 | // Exclude cost of gather loads nodes which are not used. These nodes were |
| 14596 | // built as part of the final attempt to vectorize gathered loads. |
| 14597 | assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) && |
| 14598 | "Expected gather nodes with users only." ); |
| 14599 | |
| 14600 | InstructionCost C = getEntryCost(E: &TE, VectorizedVals, CheckedExtracts); |
| 14601 | Cost += C; |
| 14602 | LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " |
| 14603 | << shortBundleName(TE.Scalars, TE.Idx) << ".\n" |
| 14604 | << "SLP: Current total cost = " << Cost << "\n" ); |
| 14605 | } |
| 14606 | |
| 14607 | if (Cost >= -SLPCostThreshold && |
| 14608 | none_of(Range&: ExternalUses, P: [](const ExternalUser &EU) { |
| 14609 | return isa_and_nonnull<InsertElementInst>(Val: EU.User); |
| 14610 | })) |
| 14611 | return Cost; |
| 14612 | |
SmallPtrSet<Value *, 16> ExtractCostCalculated;
InstructionCost ExtractCost = 0;
| 14615 | SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts; |
| 14616 | SmallVector<APInt> DemandedElts; |
| 14617 | SmallDenseSet<Value *, 4> UsedInserts; |
| 14618 | DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts; |
| 14619 | std::optional<DenseMap<Value *, unsigned>> ValueToExtUses; |
DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
| 14621 | SmallPtrSet<Value *, 4> ScalarOpsFromCasts; |
| 14622 | // Keep track {Scalar, Index, User} tuple. |
| 14623 | // On AArch64, this helps in fusing a mov instruction, associated with |
| 14624 | // extractelement, with fmul in the backend so that extractelement is free. |
| 14625 | SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx; |
| 14626 | for (ExternalUser &EU : ExternalUses) { |
| 14627 | ScalarUserAndIdx.emplace_back(Args&: EU.Scalar, Args&: EU.User, Args&: EU.Lane); |
| 14628 | } |
| 14629 | SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser; |
| 14630 | for (ExternalUser &EU : ExternalUses) { |
| 14631 | // Uses by ephemeral values are free (because the ephemeral value will be |
| 14632 | // removed prior to code generation, and so the extraction will be |
| 14633 | // removed as well). |
| 14634 | if (EphValues.count(Ptr: EU.User)) |
| 14635 | continue; |
| 14636 | |
// Check if the scalar for the given user, or for all users, is already
// accounted for.
| 14638 | if (!CheckedScalarUser.insert(V: std::make_pair(x&: EU.Scalar, y&: EU.User)).second || |
| 14639 | (EU.User && |
| 14640 | CheckedScalarUser.contains(V: std::make_pair(x&: EU.Scalar, y: nullptr)))) |
| 14641 | continue; |
| 14642 | |
| 14643 | // Used in unreachable blocks or in EH pads (rarely executed) or is |
| 14644 | // terminated with unreachable instruction. |
| 14645 | if (BasicBlock *UserParent = |
| 14646 | EU.User ? cast<Instruction>(Val: EU.User)->getParent() : nullptr; |
| 14647 | UserParent && |
| 14648 | (!DT->isReachableFromEntry(A: UserParent) || UserParent->isEHPad() || |
| 14649 | isa_and_present<UnreachableInst>(Val: UserParent->getTerminator()))) |
| 14650 | continue; |
| 14651 | |
| 14652 | // We only add extract cost once for the same scalar. |
| 14653 | if (!isa_and_nonnull<InsertElementInst>(Val: EU.User) && |
| 14654 | !ExtractCostCalculated.insert(Ptr: EU.Scalar).second) |
| 14655 | continue; |
| 14656 | |
| 14657 | // No extract cost for vector "scalar" if REVEC is disabled |
| 14658 | if (!SLPReVec && isa<FixedVectorType>(Val: EU.Scalar->getType())) |
| 14659 | continue; |
| 14660 | |
| 14661 | // If found user is an insertelement, do not calculate extract cost but try |
| 14662 | // to detect it as a final shuffled/identity match. |
| 14663 | // TODO: what if a user is insertvalue when REVEC is enabled? |
| 14664 | if (auto *VU = dyn_cast_or_null<InsertElementInst>(Val: EU.User); |
| 14665 | VU && VU->getOperand(i_nocapture: 1) == EU.Scalar) { |
| 14666 | if (auto *FTy = dyn_cast<FixedVectorType>(Val: VU->getType())) { |
| 14667 | if (!UsedInserts.insert(V: VU).second) |
| 14668 | continue; |
| 14669 | std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU); |
| 14670 | if (InsertIdx) { |
| 14671 | const TreeEntry *ScalarTE = &EU.E; |
| 14672 | auto *It = find_if( |
| 14673 | Range&: ShuffledInserts, |
| 14674 | P: [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) { |
| 14675 | // Checks if 2 insertelements are from the same buildvector. |
| 14676 | InsertElementInst *VecInsert = Data.InsertElements.front(); |
| 14677 | return areTwoInsertFromSameBuildVector( |
| 14678 | VU, V: VecInsert, GetBaseOperand: [this](InsertElementInst *II) -> Value * { |
| 14679 | Value *Op0 = II->getOperand(i_nocapture: 0); |
| 14680 | if (isVectorized(V: II) && !isVectorized(V: Op0)) |
| 14681 | return nullptr; |
| 14682 | return Op0; |
| 14683 | }); |
| 14684 | }); |
| 14685 | int VecId = -1; |
| 14686 | if (It == ShuffledInserts.end()) { |
| 14687 | auto &Data = ShuffledInserts.emplace_back(); |
| 14688 | Data.InsertElements.emplace_back(Args&: VU); |
| 14689 | DemandedElts.push_back(Elt: APInt::getZero(numBits: FTy->getNumElements())); |
| 14690 | VecId = ShuffledInserts.size() - 1; |
| 14691 | auto It = MinBWs.find(Val: ScalarTE); |
| 14692 | if (It != MinBWs.end() && |
| 14693 | VectorCasts |
| 14694 | .insert(V: std::make_pair(x&: ScalarTE, y: FTy->getElementType())) |
| 14695 | .second) { |
| 14696 | unsigned BWSz = It->second.first; |
| 14697 | unsigned DstBWSz = DL->getTypeSizeInBits(Ty: FTy->getElementType()); |
| 14698 | unsigned VecOpcode; |
| 14699 | if (DstBWSz < BWSz) |
| 14700 | VecOpcode = Instruction::Trunc; |
| 14701 | else |
| 14702 | VecOpcode = |
| 14703 | It->second.second ? Instruction::SExt : Instruction::ZExt; |
| 14704 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 14705 | InstructionCost C = TTI->getCastInstrCost( |
| 14706 | Opcode: VecOpcode, Dst: FTy, |
| 14707 | Src: getWidenedType(ScalarTy: IntegerType::get(C&: FTy->getContext(), NumBits: BWSz), |
| 14708 | VF: FTy->getNumElements()), |
| 14709 | CCH: TTI::CastContextHint::None, CostKind); |
| 14710 | LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C |
| 14711 | << " for extending externally used vector with " |
| 14712 | "non-equal minimum bitwidth.\n" ); |
| 14713 | Cost += C; |
| 14714 | } |
| 14715 | } else { |
| 14716 | if (isFirstInsertElement(IE1: VU, IE2: It->InsertElements.front())) |
| 14717 | It->InsertElements.front() = VU; |
| 14718 | VecId = std::distance(first: ShuffledInserts.begin(), last: It); |
| 14719 | } |
| 14720 | int InIdx = *InsertIdx; |
| 14721 | SmallVectorImpl<int> &Mask = |
| 14722 | ShuffledInserts[VecId].ValueMasks[ScalarTE]; |
| 14723 | if (Mask.empty()) |
| 14724 | Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem); |
| 14725 | Mask[InIdx] = EU.Lane; |
| 14726 | DemandedElts[VecId].setBit(InIdx); |
| 14727 | continue; |
| 14728 | } |
| 14729 | } |
| 14730 | } |
| 14731 | |
| 14732 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 14733 | // If we plan to rewrite the tree in a smaller type, we will need to sign |
| 14734 | // extend the extracted value back to the original type. Here, we account |
| 14735 | // for the extract and the added cost of the sign extend if needed. |
InstructionCost ExtraCost = TTI::TCC_Free;
| 14737 | auto *ScalarTy = EU.Scalar->getType(); |
| 14738 | auto *VecTy = getWidenedType(ScalarTy, VF: BundleWidth); |
| 14739 | const TreeEntry *Entry = &EU.E; |
| 14740 | auto It = MinBWs.find(Val: Entry); |
| 14741 | if (It != MinBWs.end()) { |
| 14742 | Type *MinTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first); |
| 14743 | if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) |
| 14744 | MinTy = getWidenedType(ScalarTy: MinTy, VF: VecTy->getNumElements()); |
| 14745 | unsigned Extend = isKnownNonNegative(V: EU.Scalar, SQ: SimplifyQuery(*DL)) |
| 14746 | ? Instruction::ZExt |
| 14747 | : Instruction::SExt; |
| 14748 | VecTy = getWidenedType(ScalarTy: MinTy, VF: BundleWidth); |
| 14749 | ExtraCost = |
| 14750 | getExtractWithExtendCost(TTI: *TTI, Opcode: Extend, Dst: ScalarTy, VecTy, Index: EU.Lane); |
| 14751 | } else { |
| 14752 | ExtraCost = |
| 14753 | getVectorInstrCost(TTI: *TTI, ScalarTy, Opcode: Instruction::ExtractElement, Val: VecTy, |
| 14754 | CostKind, Index: EU.Lane, Scalar: EU.Scalar, ScalarUserAndIdx); |
| 14755 | } |
| 14756 | // Leave the scalar instructions as is if they are cheaper than extracts. |
| 14757 | if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr || |
| 14758 | Entry->getOpcode() == Instruction::Load) { |
| 14759 | // Checks if the user of the external scalar is phi in loop body. |
| 14760 | auto IsPhiInLoop = [&](const ExternalUser &U) { |
| 14761 | if (auto *Phi = dyn_cast_if_present<PHINode>(Val: U.User)) { |
| 14762 | auto *I = cast<Instruction>(Val: U.Scalar); |
| 14763 | const Loop *L = LI->getLoopFor(BB: Phi->getParent()); |
| 14764 | return L && (Phi->getParent() == I->getParent() || |
| 14765 | L == LI->getLoopFor(BB: I->getParent())); |
| 14766 | } |
| 14767 | return false; |
| 14768 | }; |
| 14769 | if (!ValueToExtUses) { |
| 14770 | ValueToExtUses.emplace(); |
| 14771 | for (const auto &P : enumerate(First&: ExternalUses)) { |
| 14772 | // Ignore phis in loops. |
| 14773 | if (IsPhiInLoop(P.value())) |
| 14774 | continue; |
| 14775 | |
| 14776 | ValueToExtUses->try_emplace(Key: P.value().Scalar, Args: P.index()); |
| 14777 | } |
| 14778 | } |
// Can use the original instruction if no operands are vectorized or they are
// already marked as externally used.
| 14781 | auto *Inst = cast<Instruction>(Val: EU.Scalar); |
| 14782 | InstructionCost ScalarCost = TTI->getInstructionCost(U: Inst, CostKind); |
| 14783 | auto OperandIsScalar = [&](Value *V) { |
| 14784 | if (!isVectorized(V)) { |
| 14785 | // Some extractelements might be not vectorized, but |
| 14786 | // transformed into shuffle and removed from the function, |
| 14787 | // consider it here. |
| 14788 | if (auto *EE = dyn_cast<ExtractElementInst>(Val: V)) |
| 14789 | return !EE->hasOneUse() || !MustGather.contains(Ptr: EE); |
| 14790 | return true; |
| 14791 | } |
| 14792 | return ValueToExtUses->contains(Val: V); |
| 14793 | }; |
| 14794 | bool CanBeUsedAsScalar = all_of(Range: Inst->operands(), P: OperandIsScalar); |
| 14795 | bool CanBeUsedAsScalarCast = false; |
| 14796 | if (auto *CI = dyn_cast<CastInst>(Val: Inst); CI && !CanBeUsedAsScalar) { |
| 14797 | if (auto *Op = dyn_cast<Instruction>(Val: CI->getOperand(i_nocapture: 0)); |
| 14798 | Op && all_of(Range: Op->operands(), P: OperandIsScalar)) { |
| 14799 | InstructionCost OpCost = |
| 14800 | (isVectorized(V: Op) && !ValueToExtUses->contains(Val: Op)) |
| 14801 | ? TTI->getInstructionCost(U: Op, CostKind) |
| 14802 | : 0; |
| 14803 | if (ScalarCost + OpCost <= ExtraCost) { |
| 14804 | CanBeUsedAsScalar = CanBeUsedAsScalarCast = true; |
| 14805 | ScalarCost += OpCost; |
| 14806 | } |
| 14807 | } |
| 14808 | } |
| 14809 | if (CanBeUsedAsScalar) { |
| 14810 | bool KeepScalar = ScalarCost <= ExtraCost; |
// Try to keep the original scalar if the user is a phi node from the same
// block as the root phis currently being vectorized. This preserves better
// ordering info for the PHIs being vectorized.
| 14814 | bool IsProfitablePHIUser = |
| 14815 | (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic && |
| 14816 | VectorizableTree.front()->Scalars.size() > 2)) && |
| 14817 | VectorizableTree.front()->getOpcode() == Instruction::PHI && |
| 14818 | !Inst->hasNUsesOrMore(N: UsesLimit) && |
| 14819 | none_of(Range: Inst->users(), |
| 14820 | P: [&](User *U) { |
| 14821 | auto *PHIUser = dyn_cast<PHINode>(Val: U); |
| 14822 | return (!PHIUser || |
| 14823 | PHIUser->getParent() != |
| 14824 | cast<Instruction>( |
| 14825 | Val: VectorizableTree.front()->getMainOp()) |
| 14826 | ->getParent()) && |
| 14827 | !isVectorized(V: U); |
| 14828 | }) && |
| 14829 | count_if(Range: Entry->Scalars, P: [&](Value *V) { |
| 14830 | return ValueToExtUses->contains(Val: V); |
| 14831 | }) <= 2; |
| 14832 | if (IsProfitablePHIUser) { |
| 14833 | KeepScalar = true; |
| 14834 | } else if (KeepScalar && ScalarCost != TTI::TCC_Free && |
| 14835 | ExtraCost - ScalarCost <= TTI::TCC_Basic && |
| 14836 | (!GatheredLoadsEntriesFirst.has_value() || |
| 14837 | Entry->Idx < *GatheredLoadsEntriesFirst)) { |
| 14838 | unsigned ScalarUsesCount = count_if(Range: Entry->Scalars, P: [&](Value *V) { |
| 14839 | return ValueToExtUses->contains(Val: V); |
| 14840 | }); |
| 14841 | auto It = ExtractsCount.find(Val: Entry); |
| 14842 | if (It != ExtractsCount.end()) { |
| 14843 | assert(ScalarUsesCount >= It->getSecond().size() && |
| 14844 | "Expected total number of external uses not less than " |
| 14845 | "number of scalar uses." ); |
| 14846 | ScalarUsesCount -= It->getSecond().size(); |
| 14847 | } |
// Keep the original scalar if the number of externally used instructions in
// the same entry is not a power of 2. It may help to do some extra
// vectorization for now.
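// E.g. 3 externally used scalars out of a 4-wide entry keep the scalar, while
// exactly 2 or 4 such uses prefer the extracts.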
| 14851 | KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(Value: ScalarUsesCount); |
| 14852 | } |
| 14853 | if (KeepScalar) { |
| 14854 | ExternalUsesAsOriginalScalar.insert(Ptr: EU.Scalar); |
| 14855 | for (Value *V : Inst->operands()) { |
| 14856 | auto It = ValueToExtUses->find(Val: V); |
| 14857 | if (It != ValueToExtUses->end()) { |
| 14858 | // Replace all uses to avoid compiler crash. |
| 14859 | ExternalUses[It->second].User = nullptr; |
| 14860 | } |
| 14861 | } |
| 14862 | ExtraCost = ScalarCost; |
| 14863 | if (!IsPhiInLoop(EU)) |
| 14864 | ExtractsCount[Entry].insert(V: Inst); |
| 14865 | if (CanBeUsedAsScalarCast) { |
| 14866 | ScalarOpsFromCasts.insert(Ptr: Inst->getOperand(i: 0)); |
| 14867 | // Update the users of the operands of the cast operand to avoid |
| 14868 | // compiler crash. |
| 14869 | if (auto *IOp = dyn_cast<Instruction>(Val: Inst->getOperand(i: 0))) { |
| 14870 | for (Value *V : IOp->operands()) { |
| 14871 | auto It = ValueToExtUses->find(Val: V); |
| 14872 | if (It != ValueToExtUses->end()) { |
| 14873 | // Replace all uses to avoid compiler crash. |
| 14874 | ExternalUses[It->second].User = nullptr; |
| 14875 | } |
| 14876 | } |
| 14877 | } |
| 14878 | } |
| 14879 | } |
| 14880 | } |
| 14881 | } |
| 14882 | |
| 14883 | ExtractCost += ExtraCost; |
| 14884 | } |
// Insert external uses for the operands of casts that are to be emitted as
// scalars instead of extractelement.
| 14887 | for (Value *V : ScalarOpsFromCasts) { |
| 14888 | ExternalUsesAsOriginalScalar.insert(Ptr: V); |
| 14889 | if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) { |
| 14890 | ExternalUses.emplace_back(Args&: V, Args: nullptr, Args&: *TEs.front(), |
| 14891 | Args: TEs.front()->findLaneForValue(V)); |
| 14892 | } |
| 14893 | } |
| 14894 | // Add reduced value cost, if resized. |
| 14895 | if (!VectorizedVals.empty()) { |
| 14896 | const TreeEntry &Root = *VectorizableTree.front(); |
| 14897 | auto BWIt = MinBWs.find(Val: &Root); |
| 14898 | if (BWIt != MinBWs.end()) { |
| 14899 | Type *DstTy = Root.Scalars.front()->getType(); |
| 14900 | unsigned OriginalSz = DL->getTypeSizeInBits(Ty: DstTy->getScalarType()); |
| 14901 | unsigned SrcSz = |
| 14902 | ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth; |
| 14903 | if (OriginalSz != SrcSz) { |
| 14904 | unsigned Opcode = Instruction::Trunc; |
| 14905 | if (OriginalSz > SrcSz) |
| 14906 | Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt; |
| 14907 | Type *SrcTy = IntegerType::get(C&: DstTy->getContext(), NumBits: SrcSz); |
| 14908 | if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DstTy)) { |
| 14909 | assert(SLPReVec && "Only supported by REVEC." ); |
| 14910 | SrcTy = getWidenedType(ScalarTy: SrcTy, VF: VecTy->getNumElements()); |
| 14911 | } |
| 14912 | Cost += TTI->getCastInstrCost(Opcode, Dst: DstTy, Src: SrcTy, |
| 14913 | CCH: TTI::CastContextHint::None, |
| 14914 | CostKind: TTI::TCK_RecipThroughput); |
| 14915 | } |
| 14916 | } |
| 14917 | } |
| 14918 | |
| 14919 | Cost += ExtractCost; |
| 14920 | auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask, |
| 14921 | bool ForSingleMask) { |
| 14922 | InstructionCost C = 0; |
| 14923 | unsigned VF = Mask.size(); |
| 14924 | unsigned VecVF = TE->getVectorFactor(); |
| 14925 | bool HasLargeIndex = |
| 14926 | any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); }); |
| 14927 | if ((VF != VecVF && HasLargeIndex) || |
| 14928 | !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) { |
| 14929 | |
| 14930 | if (HasLargeIndex) { |
| 14931 | SmallVector<int> OrigMask(VecVF, PoisonMaskElem); |
| 14932 | std::copy(first: Mask.begin(), last: std::next(x: Mask.begin(), n: std::min(a: VF, b: VecVF)), |
| 14933 | result: OrigMask.begin()); |
| 14934 | C = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, |
| 14935 | Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF), |
| 14936 | Mask: OrigMask); |
| 14937 | LLVM_DEBUG( |
| 14938 | dbgs() << "SLP: Adding cost " << C |
| 14939 | << " for final shuffle of insertelement external users.\n" ; |
| 14940 | TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n" ); |
| 14941 | Cost += C; |
| 14942 | return std::make_pair(x&: TE, y: true); |
| 14943 | } |
| 14944 | |
| 14945 | if (!ForSingleMask) { |
| 14946 | SmallVector<int> ResizeMask(VF, PoisonMaskElem); |
| 14947 | for (unsigned I = 0; I < VF; ++I) { |
| 14948 | if (Mask[I] != PoisonMaskElem) |
| 14949 | ResizeMask[Mask[I]] = Mask[I]; |
| 14950 | } |
| 14951 | if (!ShuffleVectorInst::isIdentityMask(Mask: ResizeMask, NumSrcElts: VF)) |
| 14952 | C = ::getShuffleCost( |
| 14953 | TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, |
| 14954 | Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF), Mask: ResizeMask); |
| 14955 | LLVM_DEBUG( |
| 14956 | dbgs() << "SLP: Adding cost " << C |
| 14957 | << " for final shuffle of insertelement external users.\n" ; |
| 14958 | TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n" ); |
| 14959 | |
| 14960 | Cost += C; |
| 14961 | } |
| 14962 | } |
| 14963 | return std::make_pair(x&: TE, y: false); |
| 14964 | }; |
| 14965 | // Calculate the cost of the reshuffled vectors, if any. |
| 14966 | for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) { |
| 14967 | Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(i_nocapture: 0); |
| 14968 | auto Vector = ShuffledInserts[I].ValueMasks.takeVector(); |
| 14969 | unsigned VF = 0; |
| 14970 | auto EstimateShufflesCost = [&](ArrayRef<int> Mask, |
| 14971 | ArrayRef<const TreeEntry *> TEs) { |
| 14972 | assert((TEs.size() == 1 || TEs.size() == 2) && |
| 14973 | "Expected exactly 1 or 2 tree entries." ); |
| 14974 | if (TEs.size() == 1) { |
| 14975 | if (VF == 0) |
| 14976 | VF = TEs.front()->getVectorFactor(); |
| 14977 | auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF); |
| 14978 | if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF) && |
| 14979 | !all_of(Range: enumerate(First&: Mask), P: [=](const auto &Data) { |
| 14980 | return Data.value() == PoisonMaskElem || |
| 14981 | (Data.index() < VF && |
| 14982 | static_cast<int>(Data.index()) == Data.value()); |
| 14983 | })) { |
| 14984 | InstructionCost C = |
| 14985 | ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FTy, Mask); |
| 14986 | LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C |
| 14987 | << " for final shuffle of insertelement " |
| 14988 | "external users.\n" ; |
| 14989 | TEs.front()->dump(); |
| 14990 | dbgs() << "SLP: Current total cost = " << Cost << "\n" ); |
| 14991 | Cost += C; |
| 14992 | } |
| 14993 | } else { |
| 14994 | if (VF == 0) { |
| 14995 | if (TEs.front() && |
| 14996 | TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor()) |
| 14997 | VF = TEs.front()->getVectorFactor(); |
| 14998 | else |
| 14999 | VF = Mask.size(); |
| 15000 | } |
| 15001 | auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF); |
| 15002 | InstructionCost C = |
| 15003 | ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: FTy, Mask); |
| 15004 | LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C |
| 15005 | << " for final shuffle of vector node and external " |
| 15006 | "insertelement users.\n" ; |
| 15007 | if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump(); |
| 15008 | dbgs() << "SLP: Current total cost = " << Cost << "\n" ); |
| 15009 | Cost += C; |
| 15010 | } |
| 15011 | VF = Mask.size(); |
| 15012 | return TEs.back(); |
| 15013 | }; |
| 15014 | (void)performExtractsShuffleAction<const TreeEntry>( |
| 15015 | ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()), Base, |
| 15016 | GetVF: [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeAction: ResizeToVF, |
| 15017 | Action: EstimateShufflesCost); |
| 15018 | InstructionCost InsertCost = TTI->getScalarizationOverhead( |
| 15019 | Ty: cast<FixedVectorType>( |
| 15020 | Val: ShuffledInserts[I].InsertElements.front()->getType()), |
| 15021 | DemandedElts: DemandedElts[I], |
| 15022 | /*Insert*/ true, /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput); |
| 15023 | Cost -= InsertCost; |
| 15024 | } |
| 15025 | |
| 15026 | // Add the cost for reduced value resize (if required). |
| 15027 | if (ReductionBitWidth != 0) { |
| 15028 | assert(UserIgnoreList && "Expected reduction tree." ); |
| 15029 | const TreeEntry &E = *VectorizableTree.front(); |
| 15030 | auto It = MinBWs.find(Val: &E); |
| 15031 | if (It != MinBWs.end() && It->second.first != ReductionBitWidth) { |
| 15032 | unsigned SrcSize = It->second.first; |
| 15033 | unsigned DstSize = ReductionBitWidth; |
| 15034 | unsigned Opcode = Instruction::Trunc; |
| 15035 | if (SrcSize < DstSize) { |
| 15036 | bool IsArithmeticExtendedReduction = |
| 15037 | all_of(Range: *UserIgnoreList, P: [](Value *V) { |
| 15038 | auto *I = cast<Instruction>(Val: V); |
| 15039 | return is_contained(Set: {Instruction::Add, Instruction::FAdd, |
| 15040 | Instruction::Mul, Instruction::FMul, |
| 15041 | Instruction::And, Instruction::Or, |
| 15042 | Instruction::Xor}, |
| 15043 | Element: I->getOpcode()); |
| 15044 | }); |
| 15045 | if (IsArithmeticExtendedReduction) |
| 15046 | Opcode = |
| 15047 | Instruction::BitCast; // Handle it by getExtendedReductionCost |
| 15048 | else |
| 15049 | Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt; |
| 15050 | } |
| 15051 | if (Opcode != Instruction::BitCast) { |
| 15052 | auto *SrcVecTy = |
| 15053 | getWidenedType(ScalarTy: Builder.getIntNTy(N: SrcSize), VF: E.getVectorFactor()); |
| 15054 | auto *DstVecTy = |
| 15055 | getWidenedType(ScalarTy: Builder.getIntNTy(N: DstSize), VF: E.getVectorFactor()); |
| 15056 | TTI::CastContextHint CCH = getCastContextHint(TE: E); |
| 15057 | InstructionCost CastCost; |
| 15058 | switch (E.getOpcode()) { |
| 15059 | case Instruction::SExt: |
| 15060 | case Instruction::ZExt: |
| 15061 | case Instruction::Trunc: { |
| 15062 | const TreeEntry *OpTE = getOperandEntry(E: &E, Idx: 0); |
| 15063 | CCH = getCastContextHint(TE: *OpTE); |
| 15064 | break; |
| 15065 | } |
| 15066 | default: |
| 15067 | break; |
| 15068 | } |
| 15069 | CastCost += TTI->getCastInstrCost(Opcode, Dst: DstVecTy, Src: SrcVecTy, CCH, |
| 15070 | CostKind: TTI::TCK_RecipThroughput); |
| 15071 | Cost += CastCost; |
| 15072 | LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost |
| 15073 | << " for final resize for reduction from " << SrcVecTy |
| 15074 | << " to " << DstVecTy << "\n" ; |
| 15075 | dbgs() << "SLP: Current total cost = " << Cost << "\n" ); |
| 15076 | } |
| 15077 | } |
| 15078 | } |
| 15079 | |
| 15080 | std::optional<InstructionCost> SpillCost; |
| 15081 | if (Cost < -SLPCostThreshold) { |
| 15082 | SpillCost = getSpillCost(); |
| 15083 | Cost += *SpillCost; |
| 15084 | } |
| 15085 | #ifndef NDEBUG |
| 15086 | SmallString<256> Str; |
| 15087 | { |
| 15088 | raw_svector_ostream OS(Str); |
| 15089 | OS << "SLP: Spill Cost = " ; |
| 15090 | if (SpillCost) |
| 15091 | OS << *SpillCost; |
| 15092 | else |
| 15093 | OS << "<skipped>" ; |
| 15094 | OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n" |
| 15095 | << "SLP: Total Cost = " << Cost << ".\n" ; |
| 15096 | } |
| 15097 | LLVM_DEBUG(dbgs() << Str); |
| 15098 | if (ViewSLPTree) |
| 15099 | ViewGraph(this, "SLP" + F->getName(), false, Str); |
| 15100 | #endif |
| 15101 | |
| 15102 | return Cost; |
| 15103 | } |
| 15104 | |
/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gathers such instructions into a bunch, which is
/// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
/// attempt was successful, the matched scalars are replaced by poison values
/// in \p VL for future analysis.
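/// For example, a gather of
///   {extractelement <4 x i32> %v, i32 0, extractelement <4 x i32> %v, i32 2,
///    poison, poison}
/// can likely be modelled as a single-source shuffle of %v with the mask
/// {0, 2, poison, poison}.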
| 15110 | std::optional<TTI::ShuffleKind> |
BoUpSLP::tryToGatherSingleRegisterExtractElements(
| 15112 | MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const { |
| 15113 | // Scan list of gathered scalars for extractelements that can be represented |
| 15114 | // as shuffles. |
| 15115 | MapVector<Value *, SmallVector<int>> VectorOpToIdx; |
SmallVector<int> UndefVectorExtracts;
| 15117 | for (int I = 0, E = VL.size(); I < E; ++I) { |
| 15118 | auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]); |
| 15119 | if (!EI) { |
| 15120 | if (isa<UndefValue>(Val: VL[I])) |
| 15121 | UndefVectorExtracts.push_back(Elt: I); |
| 15122 | continue; |
| 15123 | } |
| 15124 | auto *VecTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType()); |
| 15125 | if (!VecTy || !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand())) |
| 15126 | continue; |
| 15127 | std::optional<unsigned> Idx = getExtractIndex(E: EI); |
| 15128 | // Undefined index. |
| 15129 | if (!Idx) { |
| 15130 | UndefVectorExtracts.push_back(Elt: I); |
| 15131 | continue; |
| 15132 | } |
| 15133 | if (Idx >= VecTy->getNumElements()) { |
| 15134 | UndefVectorExtracts.push_back(Elt: I); |
| 15135 | continue; |
| 15136 | } |
SmallBitVector ExtractMask(VecTy->getNumElements(), true);
| 15138 | ExtractMask.reset(Idx: *Idx); |
| 15139 | if (isUndefVector(V: EI->getVectorOperand(), UseMask: ExtractMask).all()) { |
| 15140 | UndefVectorExtracts.push_back(Elt: I); |
| 15141 | continue; |
| 15142 | } |
| 15143 | VectorOpToIdx[EI->getVectorOperand()].push_back(Elt: I); |
| 15144 | } |
| 15145 | // Sort the vector operands by the maximum number of uses in extractelements. |
| 15146 | SmallVector<std::pair<Value *, SmallVector<int>>> Vectors = |
| 15147 | VectorOpToIdx.takeVector(); |
| 15148 | stable_sort(Range&: Vectors, C: [](const auto &P1, const auto &P2) { |
| 15149 | return P1.second.size() > P2.second.size(); |
| 15150 | }); |
| 15151 | // Find the best pair of the vectors or a single vector. |
| 15152 | const int UndefSz = UndefVectorExtracts.size(); |
| 15153 | unsigned SingleMax = 0; |
| 15154 | unsigned PairMax = 0; |
| 15155 | if (!Vectors.empty()) { |
| 15156 | SingleMax = Vectors.front().second.size() + UndefSz; |
| 15157 | if (Vectors.size() > 1) { |
| 15158 | auto *ItNext = std::next(x: Vectors.begin()); |
| 15159 | PairMax = SingleMax + ItNext->second.size(); |
| 15160 | } |
| 15161 | } |
| 15162 | if (SingleMax == 0 && PairMax == 0 && UndefSz == 0) |
| 15163 | return std::nullopt; |
| 15164 | // Check if better to perform a shuffle of 2 vectors or just of a single |
| 15165 | // vector. |
| 15166 | SmallVector<Value *> SavedVL(VL.begin(), VL.end()); |
SmallVector<Value *> GatheredExtracts(
| 15168 | VL.size(), PoisonValue::get(T: VL.front()->getType())); |
| 15169 | if (SingleMax >= PairMax && SingleMax) { |
| 15170 | for (int Idx : Vectors.front().second) |
| 15171 | std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]); |
| 15172 | } else if (!Vectors.empty()) { |
| 15173 | for (unsigned Idx : {0, 1}) |
| 15174 | for (int Idx : Vectors[Idx].second) |
| 15175 | std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]); |
| 15176 | } |
| 15177 | // Add extracts from undefs too. |
| 15178 | for (int Idx : UndefVectorExtracts) |
| 15179 | std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]); |
// Check that the gather of extractelements can be represented as just a
// shuffle of one or two vectors from which the scalars are extracted.
| 15182 | std::optional<TTI::ShuffleKind> Res = |
| 15183 | isFixedVectorShuffle(VL: GatheredExtracts, Mask, AC); |
| 15184 | if (!Res || all_of(Range&: Mask, P: [](int Idx) { return Idx == PoisonMaskElem; })) { |
| 15185 | // TODO: try to check other subsets if possible. |
| 15186 | // Restore the original VL if attempt was not successful. |
| 15187 | copy(Range&: SavedVL, Out: VL.begin()); |
| 15188 | return std::nullopt; |
| 15189 | } |
| 15190 | // Restore unused scalars from mask, if some of the extractelements were not |
| 15191 | // selected for shuffle. |
| 15192 | for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) { |
| 15193 | if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(Val: GatheredExtracts[I]) && |
| 15194 | isa<UndefValue>(Val: GatheredExtracts[I])) { |
| 15195 | std::swap(a&: VL[I], b&: GatheredExtracts[I]); |
| 15196 | continue; |
| 15197 | } |
| 15198 | auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]); |
| 15199 | if (!EI || !isa<FixedVectorType>(Val: EI->getVectorOperandType()) || |
| 15200 | !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()) || |
| 15201 | is_contained(Range&: UndefVectorExtracts, Element: I)) |
| 15202 | continue; |
| 15203 | } |
| 15204 | return Res; |
| 15205 | } |
| 15206 | |
/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gathers such instructions into a bunch, which is
/// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
/// attempt was successful, the matched scalars are replaced by poison values
/// in \p VL for future analysis.
| 15212 | SmallVector<std::optional<TTI::ShuffleKind>> |
BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
| 15214 | SmallVectorImpl<int> &Mask, |
| 15215 | unsigned NumParts) const { |
assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
| 15217 | SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts); |
| 15218 | Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem); |
| 15219 | unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts); |
| 15220 | for (unsigned Part : seq<unsigned>(Size: NumParts)) { |
| 15221 | // Scan list of gathered scalars for extractelements that can be represented |
| 15222 | // as shuffles. |
| 15223 | MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice( |
| 15224 | N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part)); |
| 15225 | SmallVector<int> SubMask; |
| 15226 | std::optional<TTI::ShuffleKind> Res = |
| 15227 | tryToGatherSingleRegisterExtractElements(VL: SubVL, Mask&: SubMask); |
| 15228 | ShufflesRes[Part] = Res; |
| 15229 | copy(Range&: SubMask, Out: std::next(x: Mask.begin(), n: Part * SliceSize)); |
| 15230 | } |
| 15231 | if (none_of(Range&: ShufflesRes, P: [](const std::optional<TTI::ShuffleKind> &Res) { |
| 15232 | return Res.has_value(); |
| 15233 | })) |
| 15234 | ShufflesRes.clear(); |
| 15235 | return ShufflesRes; |
| 15236 | } |
| 15237 | |
| 15238 | std::optional<TargetTransformInfo::ShuffleKind> |
| 15239 | BoUpSLP::isGatherShuffledSingleRegisterEntry( |
| 15240 | const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask, |
| 15241 | SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) { |
| 15242 | Entries.clear(); |
| 15243 | // TODO: currently checking only for Scalars in the tree entry, need to count |
| 15244 | // reused elements too for better cost estimation. |
| 15245 | auto GetUserEntry = [&](const TreeEntry *TE) { |
| 15246 | while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX) |
| 15247 | TE = TE->UserTreeIndex.UserTE; |
| 15248 | if (TE == VectorizableTree.front().get()) |
| 15249 | return EdgeInfo(const_cast<TreeEntry *>(TE), 0); |
| 15250 | return TE->UserTreeIndex; |
| 15251 | }; |
| 15252 | auto HasGatherUser = [&](const TreeEntry *TE) { |
| 15253 | while (TE->Idx != 0 && TE->UserTreeIndex) { |
| 15254 | if (TE->UserTreeIndex.EdgeIdx == UINT_MAX) |
| 15255 | return true; |
| 15256 | TE = TE->UserTreeIndex.UserTE; |
| 15257 | } |
| 15258 | return false; |
| 15259 | }; |
| 15260 | const EdgeInfo TEUseEI = GetUserEntry(TE); |
| 15261 | if (!TEUseEI) |
| 15262 | return std::nullopt; |
| 15263 | const Instruction *TEInsertPt = &getLastInstructionInBundle(E: TEUseEI.UserTE); |
| 15264 | const BasicBlock *TEInsertBlock = nullptr; |
| 15265 | // Main node of PHI entries keeps the correct order of operands/incoming |
| 15266 | // blocks. |
| 15267 | if (auto *PHI = dyn_cast<PHINode>(Val: TEUseEI.UserTE->getMainOp()); |
| 15268 | PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) { |
| 15269 | TEInsertBlock = PHI->getIncomingBlock(i: TEUseEI.EdgeIdx); |
| 15270 | TEInsertPt = TEInsertBlock->getTerminator(); |
| 15271 | } else { |
| 15272 | TEInsertBlock = TEInsertPt->getParent(); |
| 15273 | } |
| 15274 | if (!DT->isReachableFromEntry(A: TEInsertBlock)) |
| 15275 | return std::nullopt; |
| 15276 | auto *NodeUI = DT->getNode(BB: TEInsertBlock); |
| 15277 | assert(NodeUI && "Should only process reachable instructions" ); |
| 15278 | SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL); |
| 15279 | auto CheckOrdering = [&](const Instruction *InsertPt) { |
| 15280 | // Argument InsertPt is an instruction where vector code for some other |
| 15281 | // tree entry (one that shares one or more scalars with TE) is going to be |
| 15282 | // generated. This lambda returns true if insertion point of vector code |
| 15283 | // for the TE dominates that point (otherwise dependency is the other way |
| 15284 | // around). The other node is not limited to be of a gather kind. Gather |
| 15285 | // nodes are not scheduled and their vector code is inserted before their |
| 15286 | // first user. If user is PHI, that is supposed to be at the end of a |
| 15287 | // predecessor block. Otherwise it is the last instruction among scalars of |
| 15288 | // the user node. So, instead of checking dependency between instructions |
| 15289 | // themselves, we check dependency between their insertion points for vector |
| 15290 | // code (since each scalar instruction ends up as a lane of a vector |
| 15291 | // instruction). |
| 15292 | const BasicBlock *InsertBlock = InsertPt->getParent(); |
| 15293 | auto *NodeEUI = DT->getNode(BB: InsertBlock); |
| 15294 | if (!NodeEUI) |
| 15295 | return false; |
| 15296 | assert((NodeUI == NodeEUI) == |
| 15297 | (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) && |
| 15298 | "Different nodes should have different DFS numbers" ); |
| 15299 | // Check the order of the gather nodes users. |
| 15300 | if (TEInsertPt->getParent() != InsertBlock && |
| 15301 | (DT->dominates(A: NodeUI, B: NodeEUI) || !DT->dominates(A: NodeEUI, B: NodeUI))) |
| 15302 | return false; |
| 15303 | if (TEInsertPt->getParent() == InsertBlock && |
| 15304 | TEInsertPt->comesBefore(Other: InsertPt)) |
| 15305 | return false; |
| 15306 | return true; |
| 15307 | }; |
// Find all tree entries used by the gathered values. If no common entries are
// found - not a shuffle.
// Here we build a set of tree nodes for each gathered value and try to find
// the intersection between these sets. If we have at least one common tree
// node for each gathered value - we have just a permutation of a single
// vector. If we have 2 different sets, we are in a situation where we have a
// permutation of 2 input vectors.
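// For example, if every gathered scalar is also a lane of vectorized node N,
// UsedTEs ends up as the single set {N} and the gather is a permutation of
// N's vector; if the scalars are split between nodes N1 and N2, two sets are
// kept and the gather becomes a two-source shuffle of N1 and N2.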
| 15315 | SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs; |
| 15316 | SmallDenseMap<Value *, int> UsedValuesEntry; |
| 15317 | SmallPtrSet<const Value *, 16> VisitedValue; |
| 15318 | auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) { |
| 15319 | // The node is reused - exit. |
| 15320 | if ((TEPtr->getVectorFactor() != VL.size() && |
| 15321 | TEPtr->Scalars.size() != VL.size()) || |
| 15322 | (!TEPtr->isSame(VL) && !TEPtr->isSame(VL: TE->Scalars))) |
| 15323 | return false; |
| 15324 | UsedTEs.clear(); |
| 15325 | UsedTEs.emplace_back().insert(Ptr: TEPtr); |
| 15326 | for (Value *V : VL) { |
| 15327 | if (isConstant(V)) |
| 15328 | continue; |
| 15329 | UsedValuesEntry.try_emplace(Key: V, Args: 0); |
| 15330 | } |
| 15331 | return true; |
| 15332 | }; |
| 15333 | auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2, |
| 15334 | unsigned EdgeIdx) { |
| 15335 | const TreeEntry *Ptr1 = User1; |
| 15336 | const TreeEntry *Ptr2 = User2; |
| 15337 | SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx; |
| 15338 | while (Ptr2) { |
| 15339 | PtrToIdx.try_emplace(Key: Ptr2, Args&: EdgeIdx); |
| 15340 | EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx; |
| 15341 | Ptr2 = Ptr2->UserTreeIndex.UserTE; |
| 15342 | } |
| 15343 | while (Ptr1) { |
| 15344 | unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx; |
| 15345 | Ptr1 = Ptr1->UserTreeIndex.UserTE; |
| 15346 | if (auto It = PtrToIdx.find(Val: Ptr1); It != PtrToIdx.end()) |
| 15347 | return Idx < It->second; |
| 15348 | } |
| 15349 | return false; |
| 15350 | }; |
| 15351 | for (Value *V : VL) { |
| 15352 | if (isConstant(V) || !VisitedValue.insert(Ptr: V).second) |
| 15353 | continue; |
| 15354 | // Build a list of tree entries where V is used. |
| 15355 | SmallPtrSet<const TreeEntry *, 4> VToTEs; |
| 15356 | for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(Val: V)) { |
| 15357 | if (TEPtr == TE || TEPtr->Idx == 0) |
| 15358 | continue; |
| 15359 | assert(any_of(TEPtr->Scalars, |
| 15360 | [&](Value *V) { return GatheredScalars.contains(V); }) && |
| 15361 | "Must contain at least single gathered value." ); |
| 15362 | assert(TEPtr->UserTreeIndex && |
| 15363 | "Expected only single user of a gather node." ); |
| 15364 | const EdgeInfo &UseEI = TEPtr->UserTreeIndex; |
| 15365 | |
| 15366 | PHINode *UserPHI = UseEI.UserTE->State != TreeEntry::SplitVectorize |
| 15367 | ? dyn_cast<PHINode>(Val: UseEI.UserTE->getMainOp()) |
| 15368 | : nullptr; |
| 15369 | Instruction *InsertPt = |
| 15370 | UserPHI ? UserPHI->getIncomingBlock(i: UseEI.EdgeIdx)->getTerminator() |
| 15371 | : &getLastInstructionInBundle(E: UseEI.UserTE); |
| 15372 | if (TEInsertPt == InsertPt) { |
| 15373 | // Check nodes, which might be emitted first. |
| 15374 | if (TEUseEI.UserTE->State == TreeEntry::Vectorize && |
| 15375 | (TEUseEI.UserTE->getOpcode() != Instruction::PHI || |
| 15376 | TEUseEI.UserTE->isAltShuffle()) && |
| 15377 | all_of(Range&: TEUseEI.UserTE->Scalars, P: isUsedOutsideBlock)) { |
| 15378 | if (UseEI.UserTE->State != TreeEntry::Vectorize || |
| 15379 | (UseEI.UserTE->getOpcode() == Instruction::PHI && |
| 15380 | !UseEI.UserTE->isAltShuffle()) || |
| 15381 | !all_of(Range&: UseEI.UserTE->Scalars, P: isUsedOutsideBlock)) |
| 15382 | continue; |
| 15383 | } |
| 15384 | |
| 15385 | // If the schedulable insertion point is used in multiple entries - just |
| 15386 | // exit, no known ordering at this point, available only after real |
| 15387 | // scheduling. |
| 15388 | if (!doesNotNeedToBeScheduled(V: InsertPt) && |
| 15389 | (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx)) |
| 15390 | continue; |
| 15391 | // If the users are the PHI nodes with the same incoming blocks - skip. |
| 15392 | if (TEUseEI.UserTE->State == TreeEntry::Vectorize && |
| 15393 | TEUseEI.UserTE->getOpcode() == Instruction::PHI && |
| 15394 | UseEI.UserTE->State == TreeEntry::Vectorize && |
| 15395 | UseEI.UserTE->getOpcode() == Instruction::PHI && |
| 15396 | TEUseEI.UserTE != UseEI.UserTE) |
| 15397 | continue; |
| 15398 | // If 2 gathers are operands of the same entry (regardless of whether |
| 15399 | // user is PHI or else), compare operands indices, use the earlier one |
| 15400 | // as the base. |
| 15401 | if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx) |
| 15402 | continue; |
| 15403 | // If the user instruction is used for some reason in different |
| 15404 | // vectorized nodes - make it depend on index. |
| 15405 | if (TEUseEI.UserTE != UseEI.UserTE && |
| 15406 | (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx || |
| 15407 | HasGatherUser(TEUseEI.UserTE))) |
| 15408 | continue; |
| 15409 | // If the user node is the operand of the other user node - skip. |
| 15410 | if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx)) |
| 15411 | continue; |
| 15412 | } |
| 15413 | |
| 15414 | if (!TEUseEI.UserTE->isGather() && !UserPHI && |
| 15415 | TEUseEI.UserTE->doesNotNeedToSchedule() != |
| 15416 | UseEI.UserTE->doesNotNeedToSchedule() && |
| 15417 | is_contained(Range&: UseEI.UserTE->Scalars, Element: TEInsertPt)) |
| 15418 | continue; |
| 15419 | // Check if the user node of the TE comes after user node of TEPtr, |
| 15420 | // otherwise TEPtr depends on TE. |
| 15421 | if ((TEInsertBlock != InsertPt->getParent() || |
| 15422 | TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) && |
| 15423 | !CheckOrdering(InsertPt)) |
| 15424 | continue; |
| 15425 | // The node is reused - exit. |
| 15426 | if (CheckAndUseSameNode(TEPtr)) |
| 15427 | break; |
| 15428 | VToTEs.insert(Ptr: TEPtr); |
| 15429 | } |
| 15430 | if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) { |
| 15431 | const auto *It = find_if( |
| 15432 | Range&: VTEs, P: [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; }); |
| 15433 | if (It != VTEs.end()) { |
| 15434 | const TreeEntry *VTE = *It; |
| 15435 | if (none_of(Range: TE->CombinedEntriesWithIndices, |
| 15436 | P: [&](const auto &P) { return P.first == VTE->Idx; })) { |
| 15437 | Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE); |
| 15438 | if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) |
| 15439 | continue; |
| 15440 | } |
| 15441 | // The node is reused - exit. |
| 15442 | if (CheckAndUseSameNode(VTE)) |
| 15443 | break; |
| 15444 | VToTEs.insert(Ptr: VTE); |
| 15445 | } |
| 15446 | } |
| 15447 | if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) { |
| 15448 | const TreeEntry *VTE = VTEs.front(); |
| 15449 | if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(u: 0) && |
| 15450 | VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) { |
| 15451 | VTEs = VTEs.drop_front(); |
| 15452 | // Iterate through all vectorized nodes. |
| 15453 | const auto *MIt = find_if(Range&: VTEs, P: [](const TreeEntry *MTE) { |
| 15454 | return MTE->State == TreeEntry::Vectorize; |
| 15455 | }); |
| 15456 | if (MIt == VTEs.end()) |
| 15457 | continue; |
| 15458 | VTE = *MIt; |
| 15459 | } |
| 15460 | if (none_of(Range: TE->CombinedEntriesWithIndices, |
| 15461 | P: [&](const auto &P) { return P.first == VTE->Idx; })) { |
| 15462 | Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE); |
| 15463 | if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst)) |
| 15464 | continue; |
| 15465 | } |
| 15466 | // The node is reused - exit. |
| 15467 | if (CheckAndUseSameNode(VTE)) |
| 15468 | break; |
| 15469 | VToTEs.insert(Ptr: VTE); |
| 15470 | } |
| 15471 | if (VToTEs.empty()) |
| 15472 | continue; |
| 15473 | if (UsedTEs.empty()) { |
| 15474 | // The first iteration, just insert the list of nodes to vector. |
| 15475 | UsedTEs.push_back(Elt: VToTEs); |
| 15476 | UsedValuesEntry.try_emplace(Key: V, Args: 0); |
| 15477 | } else { |
// Need to check if there are any previously used tree nodes which use V.
// If there are no such nodes, consider that we have one more input
// vector.
| 15481 | SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs); |
| 15482 | unsigned Idx = 0; |
| 15483 | for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) { |
| 15484 | // Do we have a non-empty intersection of previously listed tree entries |
| 15485 | // and tree entries using current V? |
| 15486 | set_intersect(S1&: VToTEs, S2: Set); |
| 15487 | if (!VToTEs.empty()) { |
| 15488 | // Yes, write the new subset and continue analysis for the next |
| 15489 | // scalar. |
| 15490 | Set.swap(RHS&: VToTEs); |
| 15491 | break; |
| 15492 | } |
| 15493 | VToTEs = SavedVToTEs; |
| 15494 | ++Idx; |
| 15495 | } |
| 15496 | // No non-empty intersection found - need to add a second set of possible |
| 15497 | // source vectors. |
| 15498 | if (Idx == UsedTEs.size()) { |
// If the number of input vectors is greater than 2 - not a permutation,
// fall back to the regular gather.
| 15501 | // TODO: support multiple reshuffled nodes. |
| 15502 | if (UsedTEs.size() == 2) |
| 15503 | continue; |
| 15504 | UsedTEs.push_back(Elt: SavedVToTEs); |
| 15505 | Idx = UsedTEs.size() - 1; |
| 15506 | } |
| 15507 | UsedValuesEntry.try_emplace(Key: V, Args&: Idx); |
| 15508 | } |
| 15509 | } |
| 15510 | |
| 15511 | if (UsedTEs.empty()) { |
| 15512 | Entries.clear(); |
| 15513 | return std::nullopt; |
| 15514 | } |
| 15515 | |
| 15516 | unsigned VF = 0; |
| 15517 | if (UsedTEs.size() == 1) { |
| 15518 | // Keep the order to avoid non-determinism. |
| 15519 | SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(), |
| 15520 | UsedTEs.front().end()); |
| 15521 | sort(C&: FirstEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) { |
| 15522 | return TE1->Idx < TE2->Idx; |
| 15523 | }); |
| 15524 | // Try to find the perfect match in another gather node at first. |
| 15525 | auto *It = find_if(Range&: FirstEntries, P: [=](const TreeEntry *EntryPtr) { |
| 15526 | return EntryPtr->isSame(VL) || EntryPtr->isSame(VL: TE->Scalars); |
| 15527 | }); |
| 15528 | if (It != FirstEntries.end() && |
| 15529 | ((*It)->getVectorFactor() == VL.size() || |
| 15530 | ((*It)->getVectorFactor() == TE->Scalars.size() && |
| 15531 | TE->ReuseShuffleIndices.size() == VL.size() && |
| 15532 | (*It)->isSame(VL: TE->Scalars)))) { |
| 15533 | Entries.push_back(Elt: *It); |
| 15534 | if ((*It)->getVectorFactor() == VL.size()) { |
| 15535 | std::iota(first: std::next(x: Mask.begin(), n: Part * VL.size()), |
| 15536 | last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: 0); |
| 15537 | } else { |
| 15538 | SmallVector<int> CommonMask = TE->getCommonMask(); |
| 15539 | copy(Range&: CommonMask, Out: Mask.begin()); |
| 15540 | } |
| 15541 | // Clear undef scalars. |
| 15542 | for (unsigned I : seq<unsigned>(Size: VL.size())) |
| 15543 | if (isa<PoisonValue>(Val: VL[I])) |
| 15544 | Mask[Part * VL.size() + I] = PoisonMaskElem; |
| 15545 | return TargetTransformInfo::SK_PermuteSingleSrc; |
| 15546 | } |
| 15547 | // No perfect match, just shuffle, so choose the first tree node from the |
| 15548 | // tree. |
| 15549 | Entries.push_back(Elt: FirstEntries.front()); |
| 15550 | // Update mapping between values and corresponding tree entries. |
| 15551 | for (auto &P : UsedValuesEntry) |
| 15552 | P.second = 0; |
| 15553 | VF = FirstEntries.front()->getVectorFactor(); |
| 15554 | } else { |
| 15555 | // Try to find nodes with the same vector factor. |
| 15556 | assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries." ); |
| 15557 | // Keep the order of tree nodes to avoid non-determinism. |
| 15558 | DenseMap<int, const TreeEntry *> VFToTE; |
| 15559 | for (const TreeEntry *TE : UsedTEs.front()) { |
| 15560 | unsigned VF = TE->getVectorFactor(); |
| 15561 | auto It = VFToTE.find(Val: VF); |
| 15562 | if (It != VFToTE.end()) { |
| 15563 | if (It->second->Idx > TE->Idx) |
| 15564 | It->getSecond() = TE; |
| 15565 | continue; |
| 15566 | } |
| 15567 | VFToTE.try_emplace(Key: VF, Args&: TE); |
| 15568 | } |
| 15569 | // Same, keep the order to avoid non-determinism. |
| 15570 | SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(), |
| 15571 | UsedTEs.back().end()); |
| 15572 | sort(C&: SecondEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) { |
| 15573 | return TE1->Idx < TE2->Idx; |
| 15574 | }); |
| 15575 | for (const TreeEntry *TE : SecondEntries) { |
| 15576 | auto It = VFToTE.find(Val: TE->getVectorFactor()); |
| 15577 | if (It != VFToTE.end()) { |
| 15578 | VF = It->first; |
| 15579 | Entries.push_back(Elt: It->second); |
| 15580 | Entries.push_back(Elt: TE); |
| 15581 | break; |
| 15582 | } |
| 15583 | } |
| 15584 | // No 2 source vectors with the same vector factor - just choose 2 with max |
| 15585 | // index. |
| 15586 | if (Entries.empty()) { |
| 15587 | Entries.push_back(Elt: *llvm::max_element( |
| 15588 | Range&: UsedTEs.front(), C: [](const TreeEntry *TE1, const TreeEntry *TE2) { |
| 15589 | return TE1->Idx < TE2->Idx; |
| 15590 | })); |
| 15591 | Entries.push_back(Elt: SecondEntries.front()); |
| 15592 | VF = std::max(a: Entries.front()->getVectorFactor(), |
| 15593 | b: Entries.back()->getVectorFactor()); |
| 15594 | } else { |
| 15595 | VF = Entries.front()->getVectorFactor(); |
| 15596 | } |
| 15597 | SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries; |
| 15598 | for (const TreeEntry *E : Entries) |
| 15599 | ValuesToEntries.emplace_back().insert(I: E->Scalars.begin(), |
| 15600 | E: E->Scalars.end()); |
| 15601 | // Update mapping between values and corresponding tree entries. |
| 15602 | for (auto &P : UsedValuesEntry) { |
| 15603 | for (unsigned Idx : seq<unsigned>(Size: ValuesToEntries.size())) |
| 15604 | if (ValuesToEntries[Idx].contains(Ptr: P.first)) { |
| 15605 | P.second = Idx; |
| 15606 | break; |
| 15607 | } |
| 15608 | } |
| 15609 | } |
| 15610 | |
| 15611 | bool IsSplatOrUndefs = isSplat(VL) || all_of(Range&: VL, P: IsaPred<UndefValue>); |
// Checks if the 2 PHIs are compatible, i.e. have a high chance of being
// vectorized together.
| 15614 | auto AreCompatiblePHIs = [&](Value *V, Value *V1) { |
| 15615 | auto *PHI = cast<PHINode>(Val: V); |
| 15616 | auto *PHI1 = cast<PHINode>(Val: V1); |
// Check that all incoming values are compatible/from same parent (if they
// are instructions).
// The incoming values are compatible if they all are constants, or
// instructions with the same/alternate opcodes from the same basic block.
| 15621 | for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) { |
| 15622 | Value *In = PHI->getIncomingValue(i: I); |
| 15623 | Value *In1 = PHI1->getIncomingValue(i: I); |
| 15624 | if (isConstant(V: In) && isConstant(V: In1)) |
| 15625 | continue; |
| 15626 | if (!getSameOpcode(VL: {In, In1}, TLI: *TLI)) |
| 15627 | return false; |
| 15628 | if (cast<Instruction>(Val: In)->getParent() != |
| 15629 | cast<Instruction>(Val: In1)->getParent()) |
| 15630 | return false; |
| 15631 | } |
| 15632 | return true; |
| 15633 | }; |
// Check if the value can be ignored during analysis for shuffled gathers.
// We suppose it is better to ignore instructions which do not form splats,
// are not vectorized/not extractelements (these instructions will be handled
// by the extractelements processing) or may form a vector node in the future.
| 15638 | auto MightBeIgnored = [=](Value *V) { |
| 15639 | auto *I = dyn_cast<Instruction>(Val: V); |
| 15640 | return I && !IsSplatOrUndefs && !isVectorized(V: I) && |
| 15641 | !isVectorLikeInstWithConstOps(V: I) && |
| 15642 | !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList) && isSimple(I); |
| 15643 | }; |
// Check that the neighbor instruction may form a full vector node with the
// current instruction V. It is possible if they have the same/alternate
// opcode and the same parent basic block.
| 15647 | auto NeighborMightBeIgnored = [&](Value *V, int Idx) { |
| 15648 | Value *V1 = VL[Idx]; |
| 15649 | bool UsedInSameVTE = false; |
| 15650 | auto It = UsedValuesEntry.find(Val: V1); |
| 15651 | if (It != UsedValuesEntry.end()) |
| 15652 | UsedInSameVTE = It->second == UsedValuesEntry.find(Val: V)->second; |
| 15653 | return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE && |
| 15654 | getSameOpcode(VL: {V, V1}, TLI: *TLI) && |
| 15655 | cast<Instruction>(Val: V)->getParent() == |
| 15656 | cast<Instruction>(Val: V1)->getParent() && |
| 15657 | (!isa<PHINode>(Val: V1) || AreCompatiblePHIs(V, V1)); |
| 15658 | }; |
| 15659 | // Build a shuffle mask for better cost estimation and vector emission. |
| 15660 | SmallBitVector UsedIdxs(Entries.size()); |
| 15661 | SmallVector<std::pair<unsigned, int>> EntryLanes; |
| 15662 | for (int I = 0, E = VL.size(); I < E; ++I) { |
| 15663 | Value *V = VL[I]; |
| 15664 | auto It = UsedValuesEntry.find(Val: V); |
| 15665 | if (It == UsedValuesEntry.end()) |
| 15666 | continue; |
// Do not try to shuffle scalars if they are constants, or instructions
// that can be vectorized as a result of subsequent buildvector
// vectorization.
| 15670 | if (isConstant(V) || (MightBeIgnored(V) && |
| 15671 | ((I > 0 && NeighborMightBeIgnored(V, I - 1)) || |
| 15672 | (I != E - 1 && NeighborMightBeIgnored(V, I + 1))))) |
| 15673 | continue; |
| 15674 | unsigned Idx = It->second; |
| 15675 | EntryLanes.emplace_back(Args&: Idx, Args&: I); |
| 15676 | UsedIdxs.set(Idx); |
| 15677 | } |
| 15678 | // Iterate through all shuffled scalars and select entries, which can be used |
| 15679 | // for final shuffle. |
| 15680 | SmallVector<const TreeEntry *> TempEntries; |
| 15681 | for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) { |
| 15682 | if (!UsedIdxs.test(Idx: I)) |
| 15683 | continue; |
// Fix the entry number for the given scalar. If it is the first entry, set
// Pair.first to 0, otherwise to 1 (currently we select at most 2 nodes).
// These indices are used as the vector offset when calculating the final
// shuffle mask.
| 15688 | for (std::pair<unsigned, int> &Pair : EntryLanes) |
| 15689 | if (Pair.first == I) |
| 15690 | Pair.first = TempEntries.size(); |
| 15691 | TempEntries.push_back(Elt: Entries[I]); |
| 15692 | } |
| 15693 | Entries.swap(RHS&: TempEntries); |
| 15694 | if (EntryLanes.size() == Entries.size() && |
| 15695 | !VL.equals(RHS: ArrayRef(TE->Scalars) |
| 15696 | .slice(N: Part * VL.size(), |
| 15697 | M: std::min<int>(a: VL.size(), b: TE->Scalars.size())))) { |
| 15698 | // We may have here 1 or 2 entries only. If the number of scalars is equal |
| 15699 | // to the number of entries, no need to do the analysis, it is not very |
| 15700 | // profitable. Since VL is not the same as TE->Scalars, it means we already |
// have some shuffles before. Cut off the non-profitable case.
| 15702 | Entries.clear(); |
| 15703 | return std::nullopt; |
| 15704 | } |
| 15705 | // Build the final mask, check for the identity shuffle, if possible. |
| 15706 | bool IsIdentity = Entries.size() == 1; |
| 15707 | // Pair.first is the offset to the vector, while Pair.second is the index of |
| 15708 | // scalar in the list. |
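// E.g. with VF == 4, if Entries[0] holds VL[0] at lane 2 and Entries[1]
// holds VL[1] at lane 3, then EntryLanes == {(0, 0), (1, 1)} and the loop
// below sets Mask[Part * VL.size() + 0] to 2 and
// Mask[Part * VL.size() + 1] to 1 * 4 + 3 == 7.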
| 15709 | for (const std::pair<unsigned, int> &Pair : EntryLanes) { |
| 15710 | unsigned Idx = Part * VL.size() + Pair.second; |
| 15711 | Mask[Idx] = |
| 15712 | Pair.first * VF + |
| 15713 | (ForOrder ? std::distance( |
| 15714 | first: Entries[Pair.first]->Scalars.begin(), |
| 15715 | last: find(Range: Entries[Pair.first]->Scalars, Val: VL[Pair.second])) |
| 15716 | : Entries[Pair.first]->findLaneForValue(V: VL[Pair.second])); |
| 15717 | IsIdentity &= Mask[Idx] == Pair.second; |
| 15718 | } |
| 15719 | if (ForOrder || IsIdentity || Entries.empty()) { |
| 15720 | switch (Entries.size()) { |
| 15721 | case 1: |
| 15722 | if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2) |
| 15723 | return TargetTransformInfo::SK_PermuteSingleSrc; |
| 15724 | break; |
| 15725 | case 2: |
| 15726 | if (EntryLanes.size() > 2 || VL.size() <= 2) |
| 15727 | return TargetTransformInfo::SK_PermuteTwoSrc; |
| 15728 | break; |
| 15729 | default: |
| 15730 | break; |
| 15731 | } |
| 15732 | } else if (!isa<VectorType>(Val: VL.front()->getType()) && |
| 15733 | (EntryLanes.size() > Entries.size() || VL.size() <= 2)) { |
// Do the cost estimation to check if a shuffle is more beneficial than a
// buildvector.
| 15735 | SmallVector<int> SubMask(std::next(x: Mask.begin(), n: Part * VL.size()), |
| 15736 | std::next(x: Mask.begin(), n: (Part + 1) * VL.size())); |
| 15737 | int MinElement = SubMask.front(), MaxElement = SubMask.front(); |
| 15738 | for (int Idx : SubMask) { |
| 15739 | if (Idx == PoisonMaskElem) |
| 15740 | continue; |
| 15741 | if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF) |
| 15742 | MinElement = Idx; |
| 15743 | if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF) |
| 15744 | MaxElement = Idx; |
| 15745 | } |
| 15746 | assert(MaxElement >= 0 && MinElement >= 0 && |
| 15747 | MaxElement % VF >= MinElement % VF && |
| 15748 | "Expected at least single element." ); |
| 15749 | unsigned NewVF = std::max<unsigned>( |
| 15750 | a: VL.size(), b: getFullVectorNumberOfElements(TTI: *TTI, Ty: VL.front()->getType(), |
| 15751 | Sz: (MaxElement % VF) - |
| 15752 | (MinElement % VF) + 1)); |
| 15753 | if (NewVF < VF) { |
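// E.g. with VF == 16, NewVF == 4 and MinElement % VF == 8, an index of 9
// (first source) becomes (9 - 8) % 4 == 1, while an index of 25 (second
// source, 25 % 16 == 9) becomes 1 + NewVF == 5.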
| 15754 | for (int &Idx : SubMask) { |
| 15755 | if (Idx == PoisonMaskElem) |
| 15756 | continue; |
| 15757 | Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF + |
| 15758 | (Idx >= static_cast<int>(VF) ? NewVF : 0); |
| 15759 | } |
| 15760 | } else { |
| 15761 | NewVF = VF; |
| 15762 | } |
| 15763 | |
| 15764 | constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 15765 | auto *VecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: NewVF); |
| 15766 | auto *MaskVecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: SubMask.size()); |
| 15767 | auto GetShuffleCost = [&, |
| 15768 | &TTI = *TTI](ArrayRef<int> Mask, |
| 15769 | ArrayRef<const TreeEntry *> Entries, |
| 15770 | VectorType *VecTy) -> InstructionCost { |
| 15771 | if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 && |
| 15772 | ShuffleVectorInst::isDeInterleaveMaskOfFactor( |
| 15773 | Mask, Factor: Entries.front()->getInterleaveFactor())) |
| 15774 | return TTI::TCC_Free; |
| 15775 | return ::getShuffleCost(TTI, |
| 15776 | Kind: Entries.size() > 1 ? TTI::SK_PermuteTwoSrc |
| 15777 | : TTI::SK_PermuteSingleSrc, |
| 15778 | Tp: VecTy, Mask, CostKind); |
| 15779 | }; |
| 15780 | InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy); |
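// Compare four options for this part: shuffling both entries (ShuffleCost),
// shuffling only the first or only the second entry plus inserting the
// remaining scalars (FirstShuffleCost/SecondShuffleCost), and building the
// whole subvector from scalars (BuildVectorCost).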
| 15781 | InstructionCost FirstShuffleCost = 0; |
| 15782 | SmallVector<int> FirstMask(SubMask.begin(), SubMask.end()); |
| 15783 | if (Entries.size() == 1 || !Entries[0]->isGather()) { |
| 15784 | FirstShuffleCost = ShuffleCost; |
| 15785 | } else { |
// Transform the mask to include only the first entry.
| 15787 | APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size()); |
| 15788 | bool IsIdentity = true; |
| 15789 | for (auto [I, Idx] : enumerate(First&: FirstMask)) { |
| 15790 | if (Idx >= static_cast<int>(NewVF)) { |
| 15791 | Idx = PoisonMaskElem; |
| 15792 | } else { |
| 15793 | DemandedElts.clearBit(BitPosition: I); |
| 15794 | if (Idx != PoisonMaskElem) |
| 15795 | IsIdentity &= static_cast<int>(I) == Idx; |
| 15796 | } |
| 15797 | } |
| 15798 | if (!IsIdentity) |
| 15799 | FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy); |
| 15800 | FirstShuffleCost += getScalarizationOverhead( |
| 15801 | TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true, |
| 15802 | /*Extract=*/false, CostKind); |
| 15803 | } |
| 15804 | InstructionCost SecondShuffleCost = 0; |
| 15805 | SmallVector<int> SecondMask(SubMask.begin(), SubMask.end()); |
| 15806 | if (Entries.size() == 1 || !Entries[1]->isGather()) { |
| 15807 | SecondShuffleCost = ShuffleCost; |
| 15808 | } else { |
// Transform the mask to include only the second entry.
| 15810 | APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size()); |
| 15811 | bool IsIdentity = true; |
| 15812 | for (auto [I, Idx] : enumerate(First&: SecondMask)) { |
| 15813 | if (Idx < static_cast<int>(NewVF) && Idx >= 0) { |
| 15814 | Idx = PoisonMaskElem; |
| 15815 | } else { |
| 15816 | DemandedElts.clearBit(BitPosition: I); |
| 15817 | if (Idx != PoisonMaskElem) { |
| 15818 | Idx -= NewVF; |
| 15819 | IsIdentity &= static_cast<int>(I) == Idx; |
| 15820 | } |
| 15821 | } |
| 15822 | } |
| 15823 | if (!IsIdentity) |
| 15824 | SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy); |
| 15825 | SecondShuffleCost += getScalarizationOverhead( |
| 15826 | TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true, |
| 15827 | /*Extract=*/false, CostKind); |
| 15828 | } |
| 15829 | APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size()); |
| 15830 | for (auto [I, Idx] : enumerate(First&: SubMask)) |
| 15831 | if (Idx == PoisonMaskElem) |
| 15832 | DemandedElts.clearBit(BitPosition: I); |
| 15833 | InstructionCost BuildVectorCost = getScalarizationOverhead( |
| 15834 | TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true, |
| 15835 | /*Extract=*/false, CostKind); |
| 15836 | const TreeEntry *BestEntry = nullptr; |
| 15837 | if (FirstShuffleCost < ShuffleCost) { |
| 15838 | std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()), |
| 15839 | last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), |
| 15840 | f: [&](int &Idx) { |
| 15841 | if (Idx >= static_cast<int>(VF)) |
| 15842 | Idx = PoisonMaskElem; |
| 15843 | }); |
| 15844 | BestEntry = Entries.front(); |
| 15845 | ShuffleCost = FirstShuffleCost; |
| 15846 | } |
| 15847 | if (SecondShuffleCost < ShuffleCost) { |
| 15848 | std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()), |
| 15849 | last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), |
| 15850 | f: [&](int &Idx) { |
| 15851 | if (Idx < static_cast<int>(VF)) |
| 15852 | Idx = PoisonMaskElem; |
| 15853 | else |
| 15854 | Idx -= VF; |
| 15855 | }); |
| 15856 | BestEntry = Entries[1]; |
| 15857 | ShuffleCost = SecondShuffleCost; |
| 15858 | } |
| 15859 | if (BuildVectorCost >= ShuffleCost) { |
| 15860 | if (BestEntry) { |
| 15861 | Entries.clear(); |
| 15862 | Entries.push_back(Elt: BestEntry); |
| 15863 | } |
| 15864 | return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc |
| 15865 | : TargetTransformInfo::SK_PermuteSingleSrc; |
| 15866 | } |
| 15867 | } |
| 15868 | Entries.clear(); |
| 15869 | // Clear the corresponding mask elements. |
| 15870 | std::fill(first: std::next(x: Mask.begin(), n: Part * VL.size()), |
| 15871 | last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: PoisonMaskElem); |
| 15872 | return std::nullopt; |
| 15873 | } |
| 15874 | |
| 15875 | SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> |
| 15876 | BoUpSLP::isGatherShuffledEntry( |
| 15877 | const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask, |
| 15878 | SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts, |
| 15879 | bool ForOrder) { |
| 15880 | assert(NumParts > 0 && NumParts < VL.size() && |
| 15881 | "Expected positive number of registers." ); |
| 15882 | Entries.clear(); |
| 15883 | // No need to check for the topmost gather node. |
| 15884 | if (TE == VectorizableTree.front().get() && |
| 15885 | (!GatheredLoadsEntriesFirst.has_value() || |
| 15886 | none_of(Range: ArrayRef(VectorizableTree).drop_front(), |
| 15887 | P: [](const std::unique_ptr<TreeEntry> &TE) { |
| 15888 | return !TE->isGather(); |
| 15889 | }))) |
| 15890 | return {}; |
| 15891 | // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not |
| 15892 | // implemented yet. |
| 15893 | if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI)) |
| 15894 | return {}; |
| 15895 | Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem); |
| 15896 | assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) && |
| 15897 | "Expected only single user of the gather node." ); |
| 15898 | assert(VL.size() % NumParts == 0 && |
| 15899 | "Number of scalars must be divisible by NumParts." ); |
| 15900 | if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() && |
| 15901 | TE->UserTreeIndex.EdgeIdx == UINT_MAX && |
| 15902 | (TE->Idx == 0 || |
| 15903 | (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) || |
| 15904 | isSplat(VL: TE->Scalars) || |
| 15905 | (TE->hasState() && |
| 15906 | getSameValuesTreeEntry(V: TE->getMainOp(), VL: TE->Scalars)))) |
| 15907 | return {}; |
| 15908 | unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts); |
| 15909 | SmallVector<std::optional<TTI::ShuffleKind>> Res; |
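// Each part (a slice of SliceSize scalars, i.e. one vector register) is
// matched against the existing tree entries independently; the per-part
// results are combined in Res/Entries.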
| 15910 | for (unsigned Part : seq<unsigned>(Size: NumParts)) { |
| 15911 | ArrayRef<Value *> SubVL = |
| 15912 | VL.slice(N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part)); |
| 15913 | SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back(); |
| 15914 | std::optional<TTI::ShuffleKind> SubRes = |
| 15915 | isGatherShuffledSingleRegisterEntry(TE, VL: SubVL, Mask, Entries&: SubEntries, Part, |
| 15916 | ForOrder); |
| 15917 | if (!SubRes) |
| 15918 | SubEntries.clear(); |
| 15919 | Res.push_back(Elt: SubRes); |
| 15920 | if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc && |
| 15921 | SubEntries.front()->getVectorFactor() == VL.size() && |
| 15922 | (SubEntries.front()->isSame(VL: TE->Scalars) || |
| 15923 | SubEntries.front()->isSame(VL))) { |
| 15924 | SmallVector<const TreeEntry *> LocalSubEntries; |
| 15925 | LocalSubEntries.swap(RHS&: SubEntries); |
| 15926 | Entries.clear(); |
| 15927 | Res.clear(); |
| 15928 | std::iota(first: Mask.begin(), last: Mask.end(), value: 0); |
| 15929 | // Clear undef scalars. |
| 15930 | for (int I = 0, Sz = VL.size(); I < Sz; ++I) |
| 15931 | if (isa<PoisonValue>(Val: VL[I])) |
| 15932 | Mask[I] = PoisonMaskElem; |
| 15933 | Entries.emplace_back(Args: 1, Args&: LocalSubEntries.front()); |
| 15934 | Res.push_back(Elt: TargetTransformInfo::SK_PermuteSingleSrc); |
| 15935 | return Res; |
| 15936 | } |
| 15937 | } |
| 15938 | if (all_of(Range&: Res, |
| 15939 | P: [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) { |
| 15940 | Entries.clear(); |
| 15941 | return {}; |
| 15942 | } |
| 15943 | return Res; |
| 15944 | } |
| 15945 | |
| 15946 | InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc, |
| 15947 | Type *ScalarTy) const { |
| 15948 | const unsigned VF = VL.size(); |
| 15949 | auto *VecTy = getWidenedType(ScalarTy, VF); |
| 15950 | // Find the cost of inserting/extracting values from the vector. |
| 15951 | // Check if the same elements are inserted several times and count them as |
| 15952 | // shuffle candidates. |
| 15953 | APInt DemandedElements = APInt::getZero(numBits: VF); |
| 15954 | constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 15955 | InstructionCost Cost; |
| 15956 | auto EstimateInsertCost = [&](unsigned I, Value *V) { |
| 15957 | DemandedElements.setBit(I); |
| 15958 | if (V->getType() != ScalarTy) |
| 15959 | Cost += TTI->getCastInstrCost(Opcode: Instruction::Trunc, Dst: ScalarTy, Src: V->getType(), |
| 15960 | CCH: TTI::CastContextHint::None, CostKind); |
| 15961 | }; |
| 15962 | SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem); |
| 15963 | std::iota(first: ConstantShuffleMask.begin(), last: ConstantShuffleMask.end(), value: 0); |
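// E.g. for VL == {%a, 7, %b, undef} with ForPoisonSrc == false, the loop
// below marks lanes 0 and 2 as scalar insertions (DemandedElements) and
// rewrites ConstantShuffleMask to {0, 5, 2, 3}, so lane 1 is taken from the
// materialized constant vector by the two-source shuffle added below.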
| 15964 | for (auto [I, V] : enumerate(First&: VL)) { |
| 15965 | // No need to shuffle duplicates for constants. |
| 15966 | if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(Val: V)) |
| 15967 | continue; |
| 15968 | |
| 15969 | if (isConstant(V)) { |
| 15970 | ConstantShuffleMask[I] = I + VF; |
| 15971 | continue; |
| 15972 | } |
| 15973 | EstimateInsertCost(I, V); |
| 15974 | } |
| 15975 | // FIXME: add a cost for constant vector materialization. |
| 15976 | bool IsAnyNonUndefConst = |
| 15977 | any_of(Range&: VL, P: [](Value *V) { return !isa<UndefValue>(Val: V) && isConstant(V); }); |
| 15978 | // 1. Shuffle input source vector and constant vector. |
| 15979 | if (!ForPoisonSrc && IsAnyNonUndefConst) { |
| 15980 | Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: VecTy, |
| 15981 | Mask: ConstantShuffleMask); |
| 15982 | } |
| 15983 | |
| 15984 | // 2. Insert unique non-constants. |
| 15985 | if (!DemandedElements.isZero()) |
| 15986 | Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: DemandedElements, |
| 15987 | /*Insert=*/true, |
| 15988 | /*Extract=*/false, CostKind, |
| 15989 | ForPoisonSrc: ForPoisonSrc && !IsAnyNonUndefConst, VL); |
| 15990 | return Cost; |
| 15991 | } |
| 15992 | |
| 15993 | Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { |
| 15994 | auto It = EntryToLastInstruction.find(Val: E); |
| 15995 | if (It != EntryToLastInstruction.end()) |
| 15996 | return *cast<Instruction>(Val&: It->second); |
| 15997 | Instruction *Res = nullptr; |
| 15998 | // Get the basic block this bundle is in. All instructions in the bundle |
| 15999 | // should be in this block (except for extractelement-like instructions with |
| 16000 | // constant indices or gathered loads). |
| 16001 | auto *Front = E->getMainOp(); |
| 16002 | auto *BB = Front->getParent(); |
| 16003 | assert(((GatheredLoadsEntriesFirst.has_value() && |
| 16004 | E->getOpcode() == Instruction::Load && E->isGather() && |
| 16005 | E->Idx < *GatheredLoadsEntriesFirst) || |
| 16006 | E->State == TreeEntry::SplitVectorize || |
| 16007 | all_of(E->Scalars, |
| 16008 | [=](Value *V) -> bool { |
| 16009 | if (E->getOpcode() == Instruction::GetElementPtr && |
| 16010 | !isa<GetElementPtrInst>(V)) |
| 16011 | return true; |
| 16012 | auto *I = dyn_cast<Instruction>(V); |
| 16013 | return !I || !E->getMatchingMainOpOrAltOp(I) || |
| 16014 | I->getParent() == BB || |
| 16015 | isVectorLikeInstWithConstOps(I); |
| 16016 | })) && |
| 16017 | "Expected gathered loads or GEPs or instructions from same basic " |
| 16018 | "block." ); |
| 16019 | |
| 16020 | auto FindLastInst = [&]() { |
| 16021 | Instruction *LastInst = Front; |
| 16022 | for (Value *V : E->Scalars) { |
| 16023 | auto *I = dyn_cast<Instruction>(Val: V); |
| 16024 | if (!I) |
| 16025 | continue; |
| 16026 | if (LastInst->getParent() == I->getParent()) { |
| 16027 | if (LastInst->comesBefore(Other: I)) |
| 16028 | LastInst = I; |
| 16029 | continue; |
| 16030 | } |
| 16031 | assert(((E->getOpcode() == Instruction::GetElementPtr && |
| 16032 | !isa<GetElementPtrInst>(I)) || |
| 16033 | E->State == TreeEntry::SplitVectorize || |
| 16034 | (isVectorLikeInstWithConstOps(LastInst) && |
| 16035 | isVectorLikeInstWithConstOps(I)) || |
| 16036 | (GatheredLoadsEntriesFirst.has_value() && |
| 16037 | E->getOpcode() == Instruction::Load && E->isGather() && |
| 16038 | E->Idx < *GatheredLoadsEntriesFirst)) && |
| 16039 | "Expected vector-like or non-GEP in GEP node insts only." ); |
| 16040 | if (!DT->isReachableFromEntry(A: LastInst->getParent())) { |
| 16041 | LastInst = I; |
| 16042 | continue; |
| 16043 | } |
| 16044 | if (!DT->isReachableFromEntry(A: I->getParent())) |
| 16045 | continue; |
| 16046 | auto *NodeA = DT->getNode(BB: LastInst->getParent()); |
| 16047 | auto *NodeB = DT->getNode(BB: I->getParent()); |
| 16048 | assert(NodeA && "Should only process reachable instructions" ); |
| 16049 | assert(NodeB && "Should only process reachable instructions" ); |
| 16050 | assert((NodeA == NodeB) == |
| 16051 | (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && |
| 16052 | "Different nodes should have different DFS numbers" ); |
| 16053 | if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) |
| 16054 | LastInst = I; |
| 16055 | } |
| 16056 | BB = LastInst->getParent(); |
| 16057 | return LastInst; |
| 16058 | }; |
| 16059 | |
| 16060 | auto FindFirstInst = [&]() { |
| 16061 | Instruction *FirstInst = Front; |
| 16062 | for (Value *V : E->Scalars) { |
| 16063 | auto *I = dyn_cast<Instruction>(Val: V); |
| 16064 | if (!I) |
| 16065 | continue; |
| 16066 | if (FirstInst->getParent() == I->getParent()) { |
| 16067 | if (I->comesBefore(Other: FirstInst)) |
| 16068 | FirstInst = I; |
| 16069 | continue; |
| 16070 | } |
| 16071 | assert(((E->getOpcode() == Instruction::GetElementPtr && |
| 16072 | !isa<GetElementPtrInst>(I)) || |
| 16073 | (isVectorLikeInstWithConstOps(FirstInst) && |
| 16074 | isVectorLikeInstWithConstOps(I))) && |
| 16075 | "Expected vector-like or non-GEP in GEP node insts only." ); |
| 16076 | if (!DT->isReachableFromEntry(A: FirstInst->getParent())) { |
| 16077 | FirstInst = I; |
| 16078 | continue; |
| 16079 | } |
| 16080 | if (!DT->isReachableFromEntry(A: I->getParent())) |
| 16081 | continue; |
| 16082 | auto *NodeA = DT->getNode(BB: FirstInst->getParent()); |
| 16083 | auto *NodeB = DT->getNode(BB: I->getParent()); |
| 16084 | assert(NodeA && "Should only process reachable instructions" ); |
| 16085 | assert(NodeB && "Should only process reachable instructions" ); |
| 16086 | assert((NodeA == NodeB) == |
| 16087 | (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && |
| 16088 | "Different nodes should have different DFS numbers" ); |
| 16089 | if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn()) |
| 16090 | FirstInst = I; |
| 16091 | } |
| 16092 | return FirstInst; |
| 16093 | }; |
| 16094 | |
| 16095 | if (E->State == TreeEntry::SplitVectorize) { |
| 16096 | Res = FindLastInst(); |
| 16097 | if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V: Res); !Entries.empty()) { |
| 16098 | for (auto *E : Entries) { |
| 16099 | auto *I = dyn_cast_or_null<Instruction>(Val&: E->VectorizedValue); |
| 16100 | if (!I) |
| 16101 | I = &getLastInstructionInBundle(E); |
| 16102 | if (Res->getParent() == I->getParent() && Res->comesBefore(Other: I)) |
| 16103 | Res = I; |
| 16104 | } |
| 16105 | } |
| 16106 | EntryToLastInstruction.try_emplace(Key: E, Args&: Res); |
| 16107 | return *Res; |
| 16108 | } |
| 16109 | |
// Set the insert point for gathered loads to the very first load.
| 16111 | if (GatheredLoadsEntriesFirst.has_value() && |
| 16112 | E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && |
| 16113 | E->getOpcode() == Instruction::Load) { |
| 16114 | Res = FindFirstInst(); |
| 16115 | EntryToLastInstruction.try_emplace(Key: E, Args&: Res); |
| 16116 | return *Res; |
| 16117 | } |
| 16118 | |
| 16119 | // Set the insert point to the beginning of the basic block if the entry |
| 16120 | // should not be scheduled. |
| 16121 | auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * { |
| 16122 | if (E->isGather()) |
| 16123 | return nullptr; |
// It was previously found that the instructions do not need to be scheduled.
| 16125 | const auto *It = BlocksSchedules.find(Key: BB); |
| 16126 | if (It == BlocksSchedules.end()) |
| 16127 | return nullptr; |
| 16128 | for (Value *V : E->Scalars) { |
| 16129 | auto *I = dyn_cast<Instruction>(Val: V); |
| 16130 | if (!I || isa<PHINode>(Val: I) || doesNotNeedToBeScheduled(V: I)) |
| 16131 | continue; |
| 16132 | ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(V: I); |
| 16133 | if (Bundles.empty()) |
| 16134 | continue; |
| 16135 | const auto *It = find_if( |
| 16136 | Range&: Bundles, P: [&](ScheduleBundle *B) { return B->getTreeEntry() == E; }); |
| 16137 | if (It != Bundles.end()) |
| 16138 | return *It; |
| 16139 | } |
| 16140 | return nullptr; |
| 16141 | }; |
| 16142 | const ScheduleBundle *Bundle = FindScheduleBundle(E); |
| 16143 | if (!E->isGather() && !Bundle) { |
| 16144 | if ((E->getOpcode() == Instruction::GetElementPtr && |
| 16145 | any_of(Range: E->Scalars, |
| 16146 | P: [](Value *V) { |
| 16147 | return !isa<GetElementPtrInst>(Val: V) && isa<Instruction>(Val: V); |
| 16148 | })) || |
| 16149 | all_of(Range: E->Scalars, P: [](Value *V) { |
| 16150 | return isa<PoisonValue>(Val: V) || |
| 16151 | (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V)); |
| 16152 | })) |
| 16153 | Res = FindLastInst(); |
| 16154 | else |
| 16155 | Res = FindFirstInst(); |
| 16156 | EntryToLastInstruction.try_emplace(Key: E, Args&: Res); |
| 16157 | return *Res; |
| 16158 | } |
| 16159 | |
| 16160 | // Find the last instruction. The common case should be that BB has been |
| 16161 | // scheduled, and the last instruction is VL.back(). So we start with |
| 16162 | // VL.back() and iterate over schedule data until we reach the end of the |
| 16163 | // bundle. The end of the bundle is marked by null ScheduleData. |
| 16164 | if (Bundle) { |
| 16165 | assert(!E->isGather() && "Gathered instructions should not be scheduled" ); |
| 16166 | Res = Bundle->getBundle().back()->getInst(); |
| 16167 | EntryToLastInstruction.try_emplace(Key: E, Args&: Res); |
| 16168 | return *Res; |
| 16169 | } |
| 16170 | |
| 16171 | // LastInst can still be null at this point if there's either not an entry |
| 16172 | // for BB in BlocksSchedules or there's no ScheduleData available for |
| 16173 | // VL.back(). This can be the case if buildTreeRec aborts for various |
| 16174 | // reasons (e.g., the maximum recursion depth is reached, the maximum region |
| 16175 | // size is reached, etc.). ScheduleData is initialized in the scheduling |
| 16176 | // "dry-run". |
| 16177 | // |
| 16178 | // If this happens, we can still find the last instruction by brute force. We |
| 16179 | // iterate forwards from Front (inclusive) until we either see all |
| 16180 | // instructions in the bundle or reach the end of the block. If Front is the |
| 16181 | // last instruction in program order, LastInst will be set to Front, and we |
| 16182 | // will visit all the remaining instructions in the block. |
| 16183 | // |
| 16184 | // One of the reasons we exit early from buildTreeRec is to place an upper |
| 16185 | // bound on compile-time. Thus, taking an additional compile-time hit here is |
| 16186 | // not ideal. However, this should be exceedingly rare since it requires that |
| 16187 | // we both exit early from buildTreeRec and that the bundle be out-of-order |
| 16188 | // (causing us to iterate all the way to the end of the block). |
| 16189 | if (!Res) |
| 16190 | Res = FindLastInst(); |
| 16191 | assert(Res && "Failed to find last instruction in bundle" ); |
| 16192 | EntryToLastInstruction.try_emplace(Key: E, Args&: Res); |
| 16193 | return *Res; |
| 16194 | } |
| 16195 | |
| 16196 | void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { |
| 16197 | auto *Front = E->getMainOp(); |
| 16198 | Instruction *LastInst = &getLastInstructionInBundle(E); |
| 16199 | assert(LastInst && "Failed to find last instruction in bundle" ); |
| 16200 | BasicBlock::iterator LastInstIt = LastInst->getIterator(); |
| 16201 | // If the instruction is PHI, set the insert point after all the PHIs. |
| 16202 | bool IsPHI = isa<PHINode>(Val: LastInst); |
| 16203 | if (IsPHI) { |
| 16204 | LastInstIt = LastInst->getParent()->getFirstNonPHIIt(); |
| 16205 | if (LastInstIt != LastInst->getParent()->end() && |
| 16206 | LastInstIt->getParent()->isLandingPad()) |
| 16207 | LastInstIt = std::next(x: LastInstIt); |
| 16208 | } |
| 16209 | if (IsPHI || |
| 16210 | (!E->isGather() && E->State != TreeEntry::SplitVectorize && |
| 16211 | E->doesNotNeedToSchedule()) || |
| 16212 | (GatheredLoadsEntriesFirst.has_value() && |
| 16213 | E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() && |
| 16214 | E->getOpcode() == Instruction::Load)) { |
| 16215 | Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: LastInstIt); |
| 16216 | } else { |
| 16217 | // Set the insertion point after the last instruction in the bundle. Set the |
| 16218 | // debug location to Front. |
| 16219 | Builder.SetInsertPoint( |
| 16220 | TheBB: LastInst->getParent(), |
| 16221 | IP: LastInst->getNextNonDebugInstruction()->getIterator()); |
| 16222 | } |
| 16223 | Builder.SetCurrentDebugLocation(Front->getDebugLoc()); |
| 16224 | } |
| 16225 | |
| 16226 | Value *BoUpSLP::gather( |
| 16227 | ArrayRef<Value *> VL, Value *Root, Type *ScalarTy, |
| 16228 | function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) { |
// List of instructions/lanes from the current block and/or the blocks which
// are part of the current loop. These instructions will be inserted at the
// end to make it possible to optimize loops and hoist invariant instructions
// out of the loop body with better chances for success.
| 16233 | SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts; |
| 16234 | SmallSet<int, 4> PostponedIndices; |
| 16235 | Loop *L = LI->getLoopFor(BB: Builder.GetInsertBlock()); |
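// CheckPredecessor returns true if InstBB is reached from InsertBB by
// walking up through single predecessors only (so InstBB dominates the
// insertion block).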
| 16236 | auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) { |
| 16237 | SmallPtrSet<BasicBlock *, 4> Visited; |
| 16238 | while (InsertBB && InsertBB != InstBB && Visited.insert(Ptr: InsertBB).second) |
| 16239 | InsertBB = InsertBB->getSinglePredecessor(); |
| 16240 | return InsertBB && InsertBB == InstBB; |
| 16241 | }; |
| 16242 | for (int I = 0, E = VL.size(); I < E; ++I) { |
| 16243 | if (auto *Inst = dyn_cast<Instruction>(Val: VL[I])) |
| 16244 | if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) || |
| 16245 | isVectorized(V: Inst) || |
| 16246 | (L && (!Root || L->isLoopInvariant(V: Root)) && L->contains(Inst))) && |
| 16247 | PostponedIndices.insert(V: I).second) |
| 16248 | PostponedInsts.emplace_back(Args&: Inst, Args&: I); |
| 16249 | } |
| 16250 | |
| 16251 | auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos, |
| 16252 | Type *Ty) { |
| 16253 | Value *Scalar = V; |
| 16254 | if (Scalar->getType() != Ty) { |
| 16255 | assert(Scalar->getType()->isIntOrIntVectorTy() && |
| 16256 | Ty->isIntOrIntVectorTy() && "Expected integer types only." ); |
| 16257 | Value *V = Scalar; |
| 16258 | if (auto *CI = dyn_cast<CastInst>(Val: Scalar); |
| 16259 | isa_and_nonnull<SExtInst, ZExtInst>(Val: CI)) { |
| 16260 | Value *Op = CI->getOperand(i_nocapture: 0); |
| 16261 | if (auto *IOp = dyn_cast<Instruction>(Val: Op); |
| 16262 | !IOp || !(isDeleted(I: IOp) || isVectorized(V: IOp))) |
| 16263 | V = Op; |
| 16264 | } |
| 16265 | Scalar = Builder.CreateIntCast( |
| 16266 | V, DestTy: Ty, isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL))); |
| 16267 | } |
| 16268 | |
| 16269 | Instruction *InsElt; |
| 16270 | if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Scalar->getType())) { |
| 16271 | assert(SLPReVec && "FixedVectorType is not expected." ); |
| 16272 | Vec = |
| 16273 | createInsertVector(Builder, Vec, V: Scalar, Index: Pos * getNumElements(Ty: VecTy)); |
| 16274 | auto *II = dyn_cast<IntrinsicInst>(Val: Vec); |
| 16275 | if (!II || II->getIntrinsicID() != Intrinsic::vector_insert) |
| 16276 | return Vec; |
| 16277 | InsElt = II; |
| 16278 | } else { |
| 16279 | Vec = Builder.CreateInsertElement(Vec, NewElt: Scalar, Idx: Builder.getInt32(C: Pos)); |
| 16280 | InsElt = dyn_cast<InsertElementInst>(Val: Vec); |
| 16281 | if (!InsElt) |
| 16282 | return Vec; |
| 16283 | } |
| 16284 | GatherShuffleExtractSeq.insert(X: InsElt); |
| 16285 | CSEBlocks.insert(V: InsElt->getParent()); |
| 16286 | // Add to our 'need-to-extract' list. |
| 16287 | if (isa<Instruction>(Val: V)) { |
| 16288 | if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) { |
| 16289 | // Find which lane we need to extract. |
| 16290 | User *UserOp = nullptr; |
| 16291 | if (Scalar != V) { |
| 16292 | if (auto *SI = dyn_cast<Instruction>(Val: Scalar)) |
| 16293 | UserOp = SI; |
| 16294 | } else { |
| 16295 | UserOp = InsElt; |
| 16296 | } |
| 16297 | if (UserOp) { |
| 16298 | unsigned FoundLane = Entries.front()->findLaneForValue(V); |
| 16299 | ExternalUses.emplace_back(Args&: V, Args&: UserOp, Args&: *Entries.front(), Args&: FoundLane); |
| 16300 | } |
| 16301 | } |
| 16302 | } |
| 16303 | return Vec; |
| 16304 | }; |
| 16305 | auto *VecTy = getWidenedType(ScalarTy, VF: VL.size()); |
| 16306 | Value *Vec = PoisonValue::get(T: VecTy); |
| 16307 | SmallVector<int> NonConsts; |
| 16308 | SmallVector<int> Mask(VL.size()); |
| 16309 | std::iota(first: Mask.begin(), last: Mask.end(), value: 0); |
| 16310 | Value *OriginalRoot = Root; |
| 16311 | if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Val: Root); |
| 16312 | SV && isa<PoisonValue>(Val: SV->getOperand(i_nocapture: 1)) && |
| 16313 | SV->getOperand(i_nocapture: 0)->getType() == VecTy) { |
| 16314 | Root = SV->getOperand(i_nocapture: 0); |
| 16315 | Mask.assign(in_start: SV->getShuffleMask().begin(), in_end: SV->getShuffleMask().end()); |
| 16316 | } |
| 16317 | // Insert constant values at first. |
| 16318 | for (int I = 0, E = VL.size(); I < E; ++I) { |
| 16319 | if (PostponedIndices.contains(V: I)) |
| 16320 | continue; |
| 16321 | if (!isConstant(V: VL[I])) { |
| 16322 | NonConsts.push_back(Elt: I); |
| 16323 | continue; |
| 16324 | } |
| 16325 | if (isa<PoisonValue>(Val: VL[I])) |
| 16326 | continue; |
| 16327 | Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy); |
| 16328 | Mask[I] = I + E; |
| 16329 | } |
| 16330 | if (Root) { |
| 16331 | if (isa<PoisonValue>(Val: Vec)) { |
| 16332 | Vec = OriginalRoot; |
| 16333 | } else { |
| 16334 | Vec = CreateShuffle(Root, Vec, Mask); |
| 16335 | if (auto *OI = dyn_cast<Instruction>(Val: OriginalRoot); |
| 16336 | OI && OI->use_empty() && |
| 16337 | none_of(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) { |
| 16338 | return TE->VectorizedValue == OI; |
| 16339 | })) |
| 16340 | eraseInstruction(I: OI); |
| 16341 | } |
| 16342 | } |
| 16343 | // Insert non-constant values. |
| 16344 | for (int I : NonConsts) |
| 16345 | Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy); |
// Append instructions which are/may be part of the loop at the end to make
// it possible to hoist non-loop-based instructions.
| 16348 | for (const std::pair<Value *, unsigned> &Pair : PostponedInsts) |
| 16349 | Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy); |
| 16350 | |
| 16351 | return Vec; |
| 16352 | } |
| 16353 | |
| 16354 | /// Merges shuffle masks and emits final shuffle instruction, if required. It |
| 16355 | /// supports shuffling of 2 input vectors. It implements lazy shuffles emission, |
| 16356 | /// when the actual shuffle instruction is generated only if this is actually |
| 16357 | /// required. Otherwise, the shuffle instruction emission is delayed till the |
| 16358 | /// end of the process, to reduce the number of emitted instructions and further |
| 16359 | /// analysis/transformations. |
| 16360 | /// The class also will look through the previously emitted shuffle instructions |
| 16361 | /// and properly mark indices in mask as undef. |
| 16362 | /// For example, given the code |
| 16363 | /// \code |
| 16364 | /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0> |
| 16365 | /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0> |
| 16366 | /// \endcode |
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
| 16368 | /// look through %s1 and %s2 and emit |
| 16369 | /// \code |
| 16370 | /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3> |
| 16371 | /// \endcode |
| 16372 | /// instead. |
| 16373 | /// If 2 operands are of different size, the smallest one will be resized and |
| 16374 | /// the mask recalculated properly. |
| 16375 | /// For example, given the code |
| 16376 | /// \code |
| 16377 | /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0> |
| 16378 | /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0> |
| 16379 | /// \endcode |
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
| 16381 | /// look through %s1 and %s2 and emit |
| 16382 | /// \code |
| 16383 | /// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3> |
| 16384 | /// \endcode |
| 16385 | /// instead. |
| 16386 | class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { |
| 16387 | bool IsFinalized = false; |
| 16388 | /// Combined mask for all applied operands and masks. It is built during |
| 16389 | /// analysis and actual emission of shuffle vector instructions. |
| 16390 | SmallVector<int> CommonMask; |
/// List of operands for the shuffle vector instruction. It holds at most 2
/// operands; if the 3rd one is going to be added, the first 2 are combined
/// into a shuffle with \p CommonMask mask, the first operand is set to be the
/// resulting shuffle and the second operand is set to be the newly added
/// operand. The \p CommonMask is transformed accordingly after that.
| 16396 | SmallVector<Value *, 2> InVectors; |
| 16397 | IRBuilderBase &Builder; |
| 16398 | BoUpSLP &R; |
| 16399 | |
| 16400 | class ShuffleIRBuilder { |
| 16401 | IRBuilderBase &Builder; |
| 16402 | /// Holds all of the instructions that we gathered. |
SetVector<Instruction *> &GatherShuffleExtractSeq;
| 16404 | /// A list of blocks that we are going to CSE. |
| 16405 | DenseSet<BasicBlock *> &CSEBlocks; |
| 16406 | /// Data layout. |
| 16407 | const DataLayout &DL; |
| 16408 | |
| 16409 | public: |
| 16410 | ShuffleIRBuilder(IRBuilderBase &Builder, |
SetVector<Instruction *> &GatherShuffleExtractSeq,
| 16412 | DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL) |
| 16413 | : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq), |
| 16414 | CSEBlocks(CSEBlocks), DL(DL) {} |
| 16415 | ~ShuffleIRBuilder() = default; |
| 16416 | /// Creates shufflevector for the 2 operands with the given mask. |
| 16417 | Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) { |
| 16418 | if (V1->getType() != V2->getType()) { |
| 16419 | assert(V1->getType()->isIntOrIntVectorTy() && |
V2->getType()->isIntOrIntVectorTy() &&
| 16421 | "Expected integer vector types only." ); |
| 16422 | if (V1->getType() != V2->getType()) { |
| 16423 | if (cast<VectorType>(Val: V2->getType()) |
| 16424 | ->getElementType() |
| 16425 | ->getIntegerBitWidth() < cast<VectorType>(Val: V1->getType()) |
| 16426 | ->getElementType() |
| 16427 | ->getIntegerBitWidth()) |
| 16428 | V2 = Builder.CreateIntCast( |
| 16429 | V: V2, DestTy: V1->getType(), isSigned: !isKnownNonNegative(V: V2, SQ: SimplifyQuery(DL))); |
| 16430 | else |
| 16431 | V1 = Builder.CreateIntCast( |
| 16432 | V: V1, DestTy: V2->getType(), isSigned: !isKnownNonNegative(V: V1, SQ: SimplifyQuery(DL))); |
| 16433 | } |
| 16434 | } |
| 16435 | Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask); |
| 16436 | if (auto *I = dyn_cast<Instruction>(Val: Vec)) { |
| 16437 | GatherShuffleExtractSeq.insert(X: I); |
| 16438 | CSEBlocks.insert(V: I->getParent()); |
| 16439 | } |
| 16440 | return Vec; |
| 16441 | } |
/// Creates a permutation of the single vector operand with the given mask,
/// if it is not an identity mask.
| 16444 | Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) { |
| 16445 | if (Mask.empty()) |
| 16446 | return V1; |
| 16447 | unsigned VF = Mask.size(); |
| 16448 | unsigned LocalVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements(); |
| 16449 | if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) |
| 16450 | return V1; |
| 16451 | Value *Vec = Builder.CreateShuffleVector(V: V1, Mask); |
| 16452 | if (auto *I = dyn_cast<Instruction>(Val: Vec)) { |
| 16453 | GatherShuffleExtractSeq.insert(X: I); |
| 16454 | CSEBlocks.insert(V: I->getParent()); |
| 16455 | } |
| 16456 | return Vec; |
| 16457 | } |
| 16458 | Value *createIdentity(Value *V) { return V; } |
| 16459 | Value *createPoison(Type *Ty, unsigned VF) { |
| 16460 | return PoisonValue::get(T: getWidenedType(ScalarTy: Ty, VF)); |
| 16461 | } |
/// Resizes 2 input vectors to match their sizes, if they are not equal
/// yet. The smaller vector is resized to the size of the larger vector.
| 16464 | void resizeToMatch(Value *&V1, Value *&V2) { |
| 16465 | if (V1->getType() == V2->getType()) |
| 16466 | return; |
| 16467 | int V1VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements(); |
| 16468 | int V2VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements(); |
| 16469 | int VF = std::max(a: V1VF, b: V2VF); |
| 16470 | int MinVF = std::min(a: V1VF, b: V2VF); |
| 16471 | SmallVector<int> IdentityMask(VF, PoisonMaskElem); |
| 16472 | std::iota(first: IdentityMask.begin(), last: std::next(x: IdentityMask.begin(), n: MinVF), |
| 16473 | value: 0); |
| 16474 | Value *&Op = MinVF == V1VF ? V1 : V2; |
| 16475 | Op = Builder.CreateShuffleVector(V: Op, Mask: IdentityMask); |
| 16476 | if (auto *I = dyn_cast<Instruction>(Val: Op)) { |
| 16477 | GatherShuffleExtractSeq.insert(X: I); |
| 16478 | CSEBlocks.insert(V: I->getParent()); |
| 16479 | } |
| 16480 | if (MinVF == V1VF) |
| 16481 | V1 = Op; |
| 16482 | else |
| 16483 | V2 = Op; |
| 16484 | } |
| 16485 | }; |
| 16486 | |
/// Smart shuffle instruction emission, walks through shuffle trees and
/// tries to find the best matching vector for the actual shuffle
/// instruction.
| 16490 | Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) { |
| 16491 | assert(V1 && "Expected at least one vector value." ); |
| 16492 | ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq, |
| 16493 | R.CSEBlocks, *R.DL); |
| 16494 | return BaseShuffleAnalysis::createShuffle<Value *>( |
| 16495 | V1, V2, Mask, Builder&: ShuffleBuilder, ScalarTy); |
| 16496 | } |
| 16497 | |
| 16498 | /// Cast value \p V to the vector type with the same number of elements, but |
| 16499 | /// the base type \p ScalarTy. |
| 16500 | Value *castToScalarTyElem(Value *V, |
| 16501 | std::optional<bool> IsSigned = std::nullopt) { |
| 16502 | auto *VecTy = cast<VectorType>(Val: V->getType()); |
| 16503 | assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0); |
| 16504 | if (VecTy->getElementType() == ScalarTy->getScalarType()) |
| 16505 | return V; |
| 16506 | return Builder.CreateIntCast( |
| 16507 | V, DestTy: VectorType::get(ElementType: ScalarTy->getScalarType(), EC: VecTy->getElementCount()), |
| 16508 | isSigned: IsSigned.value_or(u: !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL)))); |
| 16509 | } |
| 16510 | |
| 16511 | public: |
| 16512 | ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R) |
| 16513 | : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {} |
| 16514 | |
| 16515 | /// Adjusts extractelements after reusing them. |
Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
| 16517 | ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds, |
| 16518 | unsigned NumParts, bool &UseVecBaseAsInput) { |
| 16519 | UseVecBaseAsInput = false; |
| 16520 | SmallPtrSet<Value *, 4> UniqueBases; |
| 16521 | Value *VecBase = nullptr; |
| 16522 | SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end()); |
| 16523 | if (!E->ReorderIndices.empty()) { |
| 16524 | SmallVector<int> ReorderMask(E->ReorderIndices.begin(), |
| 16525 | E->ReorderIndices.end()); |
| 16526 | reorderScalars(Scalars&: VL, Mask: ReorderMask); |
| 16527 | } |
| 16528 | for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { |
| 16529 | int Idx = Mask[I]; |
| 16530 | if (Idx == PoisonMaskElem) |
| 16531 | continue; |
| 16532 | auto *EI = cast<ExtractElementInst>(Val: VL[I]); |
| 16533 | VecBase = EI->getVectorOperand(); |
| 16534 | if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecBase); !TEs.empty()) |
| 16535 | VecBase = TEs.front()->VectorizedValue; |
| 16536 | assert(VecBase && "Expected vectorized value." ); |
| 16537 | UniqueBases.insert(Ptr: VecBase); |
// If the only use is vectorized - the extractelement itself can be
// deleted.
| 16540 | if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(Ptr: EI) || |
| 16541 | (NumParts != 1 && count(Range&: VL, Element: EI) > 1) || |
| 16542 | any_of(Range: EI->users(), P: [&](User *U) { |
| 16543 | ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(V: U); |
| 16544 | return UTEs.empty() || UTEs.size() > 1 || |
| 16545 | (isa<GetElementPtrInst>(Val: U) && |
| 16546 | !R.areAllUsersVectorized(I: cast<Instruction>(Val: U))) || |
| 16547 | (!UTEs.empty() && |
| 16548 | count_if(Range&: R.VectorizableTree, |
| 16549 | P: [&](const std::unique_ptr<TreeEntry> &TE) { |
| 16550 | return TE->UserTreeIndex.UserTE == |
| 16551 | UTEs.front() && |
| 16552 | is_contained(Range&: VL, Element: EI); |
| 16553 | }) != 1); |
| 16554 | })) |
| 16555 | continue; |
| 16556 | R.eraseInstruction(I: EI); |
| 16557 | } |
| 16558 | if (NumParts == 1 || UniqueBases.size() == 1) { |
| 16559 | assert(VecBase && "Expected vectorized value." ); |
| 16560 | return castToScalarTyElem(V: VecBase); |
| 16561 | } |
| 16562 | UseVecBaseAsInput = true; |
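// TransformToIdentity rewrites every defined index of a mask to its own
// position, e.g. {1, PoisonMaskElem, 0, 3} -> {0, PoisonMaskElem, 2, 3}.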
| 16563 | auto TransformToIdentity = [](MutableArrayRef<int> Mask) { |
| 16564 | for (auto [I, Idx] : enumerate(First&: Mask)) |
| 16565 | if (Idx != PoisonMaskElem) |
| 16566 | Idx = I; |
| 16567 | }; |
| 16568 | // Perform a multi-register vector shuffle, joining the parts into a single
| 16569 | // virtual long vector.
| 16570 | // Each part is shuffled independently and then all parts are inserted into
| 16571 | // a long virtual vector register, forming the original vector.
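|       | // E.g. with NumParts == 2 and an 8-element mask, lanes [0, 4) and [4, 8)
|       | // are shuffled separately and the two results are then combined by a final
|       | // two-source shuffle.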
| 16572 | Value *Vec = nullptr; |
| 16573 | SmallVector<int> VecMask(Mask.size(), PoisonMaskElem); |
| 16574 | unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts); |
| 16575 | for (unsigned Part : seq<unsigned>(Size: NumParts)) { |
| 16576 | unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part); |
| 16577 | ArrayRef<Value *> SubVL = ArrayRef(VL).slice(N: Part * SliceSize, M: Limit); |
| 16578 | MutableArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit); |
| 16579 | constexpr int MaxBases = 2; |
| 16580 | SmallVector<Value *, MaxBases> Bases(MaxBases); |
| 16581 | auto VLMask = zip(t&: SubVL, u&: SubMask); |
| 16582 | const unsigned VF = std::accumulate( |
| 16583 | first: VLMask.begin(), last: VLMask.end(), init: 0U, binary_op: [&](unsigned S, const auto &D) { |
| 16584 | if (std::get<1>(D) == PoisonMaskElem) |
| 16585 | return S; |
| 16586 | Value *VecOp = |
| 16587 | cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand(); |
| 16588 | if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp); |
| 16589 | !TEs.empty()) |
| 16590 | VecOp = TEs.front()->VectorizedValue; |
| 16591 | assert(VecOp && "Expected vectorized value." ); |
| 16592 | const unsigned Size = |
| 16593 | cast<FixedVectorType>(Val: VecOp->getType())->getNumElements(); |
| 16594 | return std::max(a: S, b: Size); |
| 16595 | }); |
| 16596 | for (const auto [V, I] : VLMask) { |
| 16597 | if (I == PoisonMaskElem) |
| 16598 | continue; |
| 16599 | Value *VecOp = cast<ExtractElementInst>(Val: V)->getVectorOperand(); |
| 16600 | if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp); !TEs.empty()) |
| 16601 | VecOp = TEs.front()->VectorizedValue; |
| 16602 | assert(VecOp && "Expected vectorized value." ); |
| 16603 | VecOp = castToScalarTyElem(V: VecOp); |
| 16604 | Bases[I / VF] = VecOp; |
| 16605 | } |
| 16606 | if (!Bases.front()) |
| 16607 | continue; |
| 16608 | Value *SubVec; |
| 16609 | if (Bases.back()) { |
| 16610 | SubVec = createShuffle(V1: Bases.front(), V2: Bases.back(), Mask: SubMask); |
| 16611 | TransformToIdentity(SubMask); |
| 16612 | } else { |
| 16613 | SubVec = Bases.front(); |
| 16614 | } |
| 16615 | if (!Vec) { |
| 16616 | Vec = SubVec; |
| 16617 | assert((Part == 0 || all_of(seq<unsigned>(0, Part), |
| 16618 | [&](unsigned P) { |
| 16619 | ArrayRef<int> SubMask = |
| 16620 | Mask.slice(P * SliceSize, |
| 16621 | getNumElems(Mask.size(), |
| 16622 | SliceSize, P)); |
| 16623 | return all_of(SubMask, [](int Idx) { |
| 16624 | return Idx == PoisonMaskElem; |
| 16625 | }); |
| 16626 | })) && |
| 16627 | "Expected first part or all previous parts masked." ); |
| 16628 | copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize)); |
| 16629 | } else { |
| 16630 | unsigned NewVF = |
| 16631 | cast<FixedVectorType>(Val: Vec->getType())->getNumElements(); |
| 16632 | if (Vec->getType() != SubVec->getType()) { |
| 16633 | unsigned SubVecVF = |
| 16634 | cast<FixedVectorType>(Val: SubVec->getType())->getNumElements(); |
| 16635 | NewVF = std::max(a: NewVF, b: SubVecVF); |
| 16636 | } |
| 16637 | // Adjust SubMask. |
| 16638 | for (int &Idx : SubMask) |
| 16639 | if (Idx != PoisonMaskElem) |
| 16640 | Idx += NewVF; |
| 16641 | copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize)); |
| 16642 | Vec = createShuffle(V1: Vec, V2: SubVec, Mask: VecMask); |
| 16643 | TransformToIdentity(VecMask); |
| 16644 | } |
| 16645 | } |
| 16646 | copy(Range&: VecMask, Out: Mask.begin()); |
| 16647 | return Vec; |
| 16648 | } |
| 16649 | /// Checks if the specified entry \p E needs to be delayed because of its |
| 16650 | /// dependency nodes. |
| 16651 | std::optional<Value *> |
| 16652 | needToDelay(const TreeEntry *E, |
| 16653 | ArrayRef<SmallVector<const TreeEntry *>> Deps) const { |
| 16654 | // No need to delay emission if all deps are ready. |
| 16655 | if (all_of(Range&: Deps, P: [](ArrayRef<const TreeEntry *> TEs) { |
| 16656 | return all_of( |
| 16657 | Range&: TEs, P: [](const TreeEntry *TE) { return TE->VectorizedValue; }); |
| 16658 | })) |
| 16659 | return std::nullopt; |
| 16660 | // Postpone gather emission, will be emitted after the end of the |
| 16661 | // process to keep correct order. |
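|       | // The aligned load of a poison pointer below only serves as a typed
|       | // placeholder for the postponed gather; it is expected to be replaced once
|       | // the dependent entries have been vectorized.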
| 16662 | auto *ResVecTy = getWidenedType(ScalarTy, VF: E->getVectorFactor()); |
| 16663 | return Builder.CreateAlignedLoad( |
| 16664 | Ty: ResVecTy, |
| 16665 | Ptr: PoisonValue::get(T: PointerType::getUnqual(C&: ScalarTy->getContext())), |
| 16666 | Align: MaybeAlign()); |
| 16667 | } |
| 16668 | /// Reset the builder to handle perfect diamond match. |
| 16669 | void resetForSameNode() { |
| 16670 | IsFinalized = false; |
| 16671 | CommonMask.clear(); |
| 16672 | InVectors.clear(); |
| 16673 | } |
| 16674 | /// Adds 2 input vectors (in the form of tree entries) and the mask for
| 16675 | /// their shuffling.
| 16676 | void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) { |
| 16677 | Value *V1 = E1.VectorizedValue; |
| 16678 | if (V1->getType()->isIntOrIntVectorTy()) |
| 16679 | V1 = castToScalarTyElem(V: V1, IsSigned: any_of(Range: E1.Scalars, P: [&](Value *V) { |
| 16680 | if (isa<PoisonValue>(Val: V)) |
| 16681 | return false; |
| 16682 | return !isKnownNonNegative( |
| 16683 | V, SQ: SimplifyQuery(*R.DL)); |
| 16684 | })); |
| 16685 | Value *V2 = E2.VectorizedValue; |
| 16686 | if (V2->getType()->isIntOrIntVectorTy()) |
| 16687 | V2 = castToScalarTyElem(V: V2, IsSigned: any_of(Range: E2.Scalars, P: [&](Value *V) { |
| 16688 | if (isa<PoisonValue>(Val: V)) |
| 16689 | return false; |
| 16690 | return !isKnownNonNegative( |
| 16691 | V, SQ: SimplifyQuery(*R.DL)); |
| 16692 | })); |
| 16693 | add(V1, V2, Mask); |
| 16694 | } |
| 16695 | /// Adds a single input vector (in the form of a tree entry) and the mask
| 16696 | /// for its shuffling.
| 16697 | void add(const TreeEntry &E1, ArrayRef<int> Mask) { |
| 16698 | Value *V1 = E1.VectorizedValue; |
| 16699 | if (V1->getType()->isIntOrIntVectorTy()) |
| 16700 | V1 = castToScalarTyElem(V: V1, IsSigned: any_of(Range: E1.Scalars, P: [&](Value *V) { |
| 16701 | if (isa<PoisonValue>(Val: V)) |
| 16702 | return false; |
| 16703 | return !isKnownNonNegative( |
| 16704 | V, SQ: SimplifyQuery(*R.DL)); |
| 16705 | })); |
| 16706 | add(V1, Mask); |
| 16707 | } |
| 16708 | /// Adds 2 input vectors and the mask for their shuffling. |
| 16709 | void add(Value *V1, Value *V2, ArrayRef<int> Mask) { |
| 16710 | assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors." ); |
| 16711 | assert(isa<FixedVectorType>(V1->getType()) && |
| 16712 | isa<FixedVectorType>(V2->getType()) && |
| 16713 | "castToScalarTyElem expects V1 and V2 to be FixedVectorType" ); |
| 16714 | V1 = castToScalarTyElem(V: V1); |
| 16715 | V2 = castToScalarTyElem(V: V2); |
| 16716 | if (InVectors.empty()) { |
| 16717 | InVectors.push_back(Elt: V1); |
| 16718 | InVectors.push_back(Elt: V2); |
| 16719 | CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end()); |
| 16720 | return; |
| 16721 | } |
| 16722 | Value *Vec = InVectors.front(); |
| 16723 | if (InVectors.size() == 2) { |
| 16724 | Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask); |
| 16725 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 16726 | } else if (cast<FixedVectorType>(Val: Vec->getType())->getNumElements() != |
| 16727 | Mask.size()) { |
| 16728 | Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask); |
| 16729 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 16730 | } |
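|       | // Merge the newly added pair into a single vector; its lanes are addressed
|       | // through the second operand of the accumulated shuffle, hence the +VF
|       | // offset recorded in CommonMask below (the pair has already been
|       | // pre-shuffled by Mask, so each live lane refers to its own position).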
| 16731 | V1 = createShuffle(V1, V2, Mask); |
| 16732 | unsigned VF = std::max(a: getVF(V: V1), b: getVF(V: Vec)); |
| 16733 | for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) |
| 16734 | if (Mask[Idx] != PoisonMaskElem) |
| 16735 | CommonMask[Idx] = Idx + VF; |
| 16736 | InVectors.front() = Vec; |
| 16737 | if (InVectors.size() == 2) |
| 16738 | InVectors.back() = V1; |
| 16739 | else |
| 16740 | InVectors.push_back(Elt: V1); |
| 16741 | } |
| 16742 | /// Adds one more input vector and the mask for its shuffling.
| 16743 | void add(Value *V1, ArrayRef<int> Mask, bool = false) { |
| 16744 | assert(isa<FixedVectorType>(V1->getType()) && |
| 16745 | "castToScalarTyElem expects V1 to be FixedVectorType" ); |
| 16746 | V1 = castToScalarTyElem(V: V1); |
| 16747 | if (InVectors.empty()) { |
| 16748 | InVectors.push_back(Elt: V1); |
| 16749 | CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end()); |
| 16750 | return; |
| 16751 | } |
| 16752 | const auto *It = find(Range&: InVectors, Val: V1); |
| 16753 | if (It == InVectors.end()) { |
| 16754 | if (InVectors.size() == 2 || |
| 16755 | InVectors.front()->getType() != V1->getType()) { |
| 16756 | Value *V = InVectors.front(); |
| 16757 | if (InVectors.size() == 2) { |
| 16758 | V = createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask); |
| 16759 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 16760 | } else if (cast<FixedVectorType>(Val: V->getType())->getNumElements() != |
| 16761 | CommonMask.size()) { |
| 16762 | V = createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask); |
| 16763 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 16764 | } |
| 16765 | unsigned VF = std::max(a: CommonMask.size(), b: Mask.size()); |
| 16766 | for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) |
| 16767 | if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem) |
| 16768 | CommonMask[Idx] = V->getType() != V1->getType() |
| 16769 | ? Idx + VF |
| 16770 | : Mask[Idx] + getVF(V: V1); |
| 16771 | if (V->getType() != V1->getType()) |
| 16772 | V1 = createShuffle(V1, V2: nullptr, Mask); |
| 16773 | InVectors.front() = V; |
| 16774 | if (InVectors.size() == 2) |
| 16775 | InVectors.back() = V1; |
| 16776 | else |
| 16777 | InVectors.push_back(Elt: V1); |
| 16778 | return; |
| 16779 | } |
| 16780 | // Only add the second vector if it provides elements that are not already
| 16781 | // covered by the first one.
| 16782 | for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) |
| 16783 | if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) { |
| 16784 | InVectors.push_back(Elt: V1); |
| 16785 | break; |
| 16786 | } |
| 16787 | } |
| 16788 | unsigned VF = 0; |
| 16789 | for (Value *V : InVectors) |
| 16790 | VF = std::max(a: VF, b: getVF(V)); |
| 16791 | for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) |
| 16792 | if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) |
| 16793 | CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF); |
| 16794 | } |
| 16795 | /// Adds one more input vector and its ordering (converted to a shuffle mask).
| 16796 | void addOrdered(Value *V1, ArrayRef<unsigned> Order) { |
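|       | // Convert the ordering into a shuffle mask: e.g. Order <2, 0, 1> becomes
|       | // the mask <1, 2, 0>, placing source element I at lane Order[I].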
| 16797 | SmallVector<int> NewMask; |
| 16798 | inversePermutation(Indices: Order, Mask&: NewMask); |
| 16799 | add(V1, Mask: NewMask); |
| 16800 | } |
| 16801 | Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0, |
| 16802 | Value *Root = nullptr) { |
| 16803 | return R.gather(VL, Root, ScalarTy, |
| 16804 | CreateShuffle: [&](Value *V1, Value *V2, ArrayRef<int> Mask) { |
| 16805 | return createShuffle(V1, V2, Mask); |
| 16806 | }); |
| 16807 | } |
| 16808 | Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); } |
| 16809 | /// Finalize emission of the shuffles. |
| 16810 | /// \param Action the action (if any) to be performed before the final
| 16811 | /// application of the \p ExtMask mask.
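|       | /// \param ExtMask the external mask (e.g. reuse shuffle indices) composed on
|       | /// top of the accumulated common mask.
|       | /// \param SubVectors (tree entry, insertion index) pairs whose vectorized
|       | /// values are inserted into the result as subvectors.
|       | /// \param SubVectorsMask optional mask used to blend the inserted subvectors
|       | /// with the already built vector.
|       | /// \param VF the expected vector length of the value passed to \p Action
|       | /// (must be non-zero when \p Action is set).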
| 16812 | Value *finalize( |
| 16813 | ArrayRef<int> ExtMask, |
| 16814 | ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors, |
| 16815 | ArrayRef<int> SubVectorsMask, unsigned VF = 0, |
| 16816 | function_ref<void(Value *&, SmallVectorImpl<int> &, |
| 16817 | function_ref<Value *(Value *, Value *, ArrayRef<int>)>)> |
| 16818 | Action = {}) { |
| 16819 | IsFinalized = true; |
| 16820 | if (Action) { |
| 16821 | Value *Vec = InVectors.front(); |
| 16822 | if (InVectors.size() == 2) { |
| 16823 | Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask); |
| 16824 | InVectors.pop_back(); |
| 16825 | } else { |
| 16826 | Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask); |
| 16827 | } |
| 16828 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 16829 | assert(VF > 0 && |
| 16830 | "Expected vector length for the final value before action." ); |
| 16831 | unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements(); |
| 16832 | if (VecVF < VF) { |
| 16833 | SmallVector<int> ResizeMask(VF, PoisonMaskElem); |
| 16834 | std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0); |
| 16835 | Vec = createShuffle(V1: Vec, V2: nullptr, Mask: ResizeMask); |
| 16836 | } |
| 16837 | Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) { |
| 16838 | return createShuffle(V1, V2, Mask); |
| 16839 | }); |
| 16840 | InVectors.front() = Vec; |
| 16841 | } |
| 16842 | if (!SubVectors.empty()) { |
| 16843 | Value *Vec = InVectors.front(); |
| 16844 | if (InVectors.size() == 2) { |
| 16845 | Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask); |
| 16846 | InVectors.pop_back(); |
| 16847 | } else { |
| 16848 | Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask); |
| 16849 | } |
| 16850 | transformMaskAfterShuffle(CommonMask, Mask: CommonMask); |
| 16851 | auto CreateSubVectors = [&](Value *Vec, |
| 16852 | SmallVectorImpl<int> &CommonMask) { |
| 16853 | for (auto [E, Idx] : SubVectors) { |
| 16854 | Value *V = E->VectorizedValue; |
| 16855 | if (V->getType()->isIntOrIntVectorTy()) |
| 16856 | V = castToScalarTyElem(V, IsSigned: any_of(Range: E->Scalars, P: [&](Value *V) { |
| 16857 | if (isa<PoisonValue>(Val: V)) |
| 16858 | return false; |
| 16859 | return !isKnownNonNegative( |
| 16860 | V, SQ: SimplifyQuery(*R.DL)); |
| 16861 | })); |
| 16862 | unsigned InsertionIndex = Idx * getNumElements(Ty: ScalarTy); |
| 16863 | Vec = createInsertVector( |
| 16864 | Builder, Vec, V, Index: InsertionIndex, |
| 16865 | Generator: std::bind(f: &ShuffleInstructionBuilder::createShuffle, args: this, args: _1, args: _2, |
| 16866 | args: _3)); |
| 16867 | if (!CommonMask.empty()) { |
| 16868 | std::iota(first: std::next(x: CommonMask.begin(), n: Idx), |
| 16869 | last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()), |
| 16870 | value: Idx); |
| 16871 | } |
| 16872 | } |
| 16873 | return Vec; |
| 16874 | }; |
| 16875 | if (SubVectorsMask.empty()) { |
| 16876 | Vec = CreateSubVectors(Vec, CommonMask); |
| 16877 | } else { |
| 16878 | SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem); |
| 16879 | copy(Range&: SubVectorsMask, Out: SVMask.begin()); |
| 16880 | for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) { |
| 16881 | if (I2 != PoisonMaskElem) { |
| 16882 | assert(I1 == PoisonMaskElem && "Expected unused subvectors mask" ); |
| 16883 | I1 = I2 + CommonMask.size(); |
| 16884 | } |
| 16885 | } |
| 16886 | Value *InsertVec = |
| 16887 | CreateSubVectors(PoisonValue::get(T: Vec->getType()), CommonMask); |
| 16888 | Vec = createShuffle(V1: InsertVec, V2: Vec, Mask: SVMask); |
| 16889 | transformMaskAfterShuffle(CommonMask, Mask: SVMask); |
| 16890 | } |
| 16891 | InVectors.front() = Vec; |
| 16892 | } |
| 16893 | |
| 16894 | if (!ExtMask.empty()) { |
| 16895 | if (CommonMask.empty()) { |
| 16896 | CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end()); |
| 16897 | } else { |
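|       | // Compose the two permutations: lane I of the final mask reads the lane
|       | // of the accumulated mask that ExtMask redirects it to.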
| 16898 | SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem); |
| 16899 | for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) { |
| 16900 | if (ExtMask[I] == PoisonMaskElem) |
| 16901 | continue; |
| 16902 | NewMask[I] = CommonMask[ExtMask[I]]; |
| 16903 | } |
| 16904 | CommonMask.swap(RHS&: NewMask); |
| 16905 | } |
| 16906 | } |
| 16907 | if (CommonMask.empty()) { |
| 16908 | assert(InVectors.size() == 1 && "Expected only one vector with no mask" ); |
| 16909 | return InVectors.front(); |
| 16910 | } |
| 16911 | if (InVectors.size() == 2) |
| 16912 | return createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask); |
| 16913 | return createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask); |
| 16914 | } |
| 16915 | |
| 16916 | ~ShuffleInstructionBuilder() { |
| 16917 | assert((IsFinalized || CommonMask.empty()) && |
| 16918 | "Shuffle construction must be finalized." ); |
| 16919 | } |
| 16920 | }; |
| 16921 | |
| 16922 | Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) { |
| 16923 | return vectorizeTree(E: getOperandEntry(E, Idx: NodeIdx)); |
| 16924 | } |
| 16925 | |
| 16926 | template <typename BVTy, typename ResTy, typename... Args> |
| 16927 | ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, |
| 16928 | Args &...Params) { |
| 16929 | assert(E->isGather() && "Expected gather node." ); |
| 16930 | unsigned VF = E->getVectorFactor(); |
| 16931 | |
| 16932 | bool NeedFreeze = false; |
| 16933 | SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end()); |
| 16934 | // Clear values, to be replaced by insertvector instructions. |
| 16935 | for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices) |
| 16936 | for_each(MutableArrayRef(GatheredScalars) |
| 16937 | .slice(N: Idx, M: VectorizableTree[EIdx]->getVectorFactor()), |
| 16938 | [&](Value *&V) { V = PoisonValue::get(T: V->getType()); }); |
| 16939 | SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors( |
| 16940 | E->CombinedEntriesWithIndices.size()); |
| 16941 | transform(E->CombinedEntriesWithIndices, SubVectors.begin(), |
| 16942 | [&](const auto &P) { |
| 16943 | return std::make_pair(VectorizableTree[P.first].get(), P.second); |
| 16944 | }); |
| 16945 | // Build a mask out of the reorder indices and reorder scalars per this |
| 16946 | // mask. |
| 16947 | SmallVector<int> ReorderMask(E->ReorderIndices.begin(), |
| 16948 | E->ReorderIndices.end()); |
| 16949 | if (!ReorderMask.empty()) |
| 16950 | reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask); |
| 16951 | SmallVector<int> SubVectorsMask; |
| 16952 | inversePermutation(Indices: E->ReorderIndices, Mask&: SubVectorsMask); |
| 16953 | // Transform non-clustered elements in the mask to poison (-1). |
| 16954 | // "Clustered" operations will be reordered using this mask later. |
| 16955 | if (!SubVectors.empty() && !SubVectorsMask.empty()) { |
| 16956 | for (unsigned I : seq<unsigned>(Size: GatheredScalars.size())) |
| 16957 | if (E->Scalars[I] == GatheredScalars[ReorderMask[I]]) |
| 16958 | SubVectorsMask[ReorderMask[I]] = PoisonMaskElem; |
| 16959 | } else { |
| 16960 | SubVectorsMask.clear(); |
| 16961 | } |
| 16962 | SmallVector<Value *> StoredGS(GatheredScalars); |
| 16963 | auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF, |
| 16964 | unsigned I, unsigned SliceSize, |
| 16965 | bool IsNotPoisonous) { |
| 16966 | if (!isSplat(VL: E->Scalars) || none_of(E->Scalars, [](Value *V) { |
| 16967 | return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V); |
| 16968 | })) |
| 16969 | return false; |
| 16970 | TreeEntry *UserTE = E->UserTreeIndex.UserTE; |
| 16971 | unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx; |
| 16972 | if (UserTE->getNumOperands() != 2) |
| 16973 | return false; |
| 16974 | if (!IsNotPoisonous) { |
| 16975 | auto *It = find_if(ArrayRef(VectorizableTree).drop_front(N: UserTE->Idx + 1), |
| 16976 | [=](const std::unique_ptr<TreeEntry> &TE) { |
| 16977 | return TE->UserTreeIndex.UserTE == UserTE && |
| 16978 | TE->UserTreeIndex.EdgeIdx != EdgeIdx; |
| 16979 | }); |
| 16980 | if (It == VectorizableTree.end()) |
| 16981 | return false; |
| 16982 | SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end()); |
| 16983 | if (!(*It)->ReorderIndices.empty()) { |
| 16984 | inversePermutation((*It)->ReorderIndices, ReorderMask); |
| 16985 | reorderScalars(Scalars&: GS, Mask: ReorderMask); |
| 16986 | } |
| 16987 | if (!all_of(zip(t&: GatheredScalars, u&: GS), [&](const auto &P) { |
| 16988 | Value *V0 = std::get<0>(P); |
| 16989 | Value *V1 = std::get<1>(P); |
| 16990 | return !isa<UndefValue>(Val: V0) || isa<PoisonValue>(Val: V0) || |
| 16991 | (isa<UndefValue>(Val: V0) && !isa<PoisonValue>(Val: V0) && |
| 16992 | is_contained(Range: E->Scalars, Element: V1)); |
| 16993 | })) |
| 16994 | return false; |
| 16995 | } |
| 16996 | int Idx; |
| 16997 | if ((Mask.size() < InputVF && |
| 16998 | ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: InputVF, Index&: Idx) && |
| 16999 | Idx == 0) || |
| 17000 | (Mask.size() == InputVF && |
| 17001 | ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))) { |
| 17002 | std::iota( |
| 17003 | first: std::next(x: Mask.begin(), n: I * SliceSize), |
| 17004 | last: std::next(x: Mask.begin(), |
| 17005 | n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)), |
| 17006 | value: 0); |
| 17007 | } else { |
| 17008 | unsigned IVal = |
| 17009 | *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; }); |
| 17010 | std::fill( |
| 17011 | first: std::next(x: Mask.begin(), n: I * SliceSize), |
| 17012 | last: std::next(x: Mask.begin(), |
| 17013 | n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)), |
| 17014 | value: IVal); |
| 17015 | } |
| 17016 | return true; |
| 17017 | }; |
| 17018 | BVTy ShuffleBuilder(ScalarTy, Params...); |
| 17019 | ResTy Res = ResTy(); |
| 17020 | SmallVector<int> Mask; |
| 17021 | SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
| 17022 | SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
| 17023 | Value *ExtractVecBase = nullptr;
| 17024 | bool UseVecBaseAsInput = false; |
| 17025 | SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles; |
| 17026 | SmallVector<SmallVector<const TreeEntry *>> Entries; |
| 17027 | Type *OrigScalarTy = GatheredScalars.front()->getType(); |
| 17028 | auto *VecTy = getWidenedType(ScalarTy, VF: GatheredScalars.size()); |
| 17029 | unsigned NumParts = ::getNumberOfParts(TTI: *TTI, VecTy, Limit: GatheredScalars.size()); |
| 17030 | if (!all_of(Range&: GatheredScalars, P: IsaPred<UndefValue>)) { |
| 17031 | // Check for gathered extracts. |
| 17032 | bool Resized = false; |
| 17033 | ExtractShuffles = |
| 17034 | tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts); |
| 17035 | if (!ExtractShuffles.empty()) { |
| 17036 | SmallVector<const TreeEntry *> ExtractEntries;
| 17037 | for (auto [Idx, I] : enumerate(First&: ExtractMask)) { |
| 17038 | if (I == PoisonMaskElem) |
| 17039 | continue; |
| 17040 | if (ArrayRef<TreeEntry *> TEs = getTreeEntries( |
| 17041 | V: cast<ExtractElementInst>(Val: StoredGS[Idx])->getVectorOperand()); |
| 17042 | !TEs.empty()) |
| 17043 | ExtractEntries.append(in_start: TEs.begin(), in_end: TEs.end()); |
| 17044 | } |
| 17045 | if (std::optional<ResTy> Delayed = |
| 17046 | ShuffleBuilder.needToDelay(E, ExtractEntries)) { |
| 17047 | // Delay emission of gathers which are not ready yet. |
| 17048 | PostponedGathers.insert(X: E); |
| 17049 | // Postpone gather emission, will be emitted after the end of the |
| 17050 | // process to keep correct order. |
| 17051 | return *Delayed; |
| 17052 | } |
| 17053 | if (Value *VecBase = ShuffleBuilder.adjustExtracts( |
| 17054 | E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) { |
| 17055 | ExtractVecBase = VecBase; |
| 17056 | if (auto *VecBaseTy = dyn_cast<FixedVectorType>(Val: VecBase->getType())) |
| 17057 | if (VF == VecBaseTy->getNumElements() && |
| 17058 | GatheredScalars.size() != VF) { |
| 17059 | Resized = true; |
| 17060 | GatheredScalars.append(NumInputs: VF - GatheredScalars.size(), |
| 17061 | Elt: PoisonValue::get(T: OrigScalarTy)); |
| 17062 | NumParts = |
| 17063 | ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF), Limit: VF); |
| 17064 | } |
| 17065 | } |
| 17066 | } |
| 17067 | // Gather extracts after we check for full matched gathers only. |
| 17068 | if (!ExtractShuffles.empty() || !E->hasState() || |
| 17069 | E->getOpcode() != Instruction::Load || |
| 17070 | (((E->hasState() && E->getOpcode() == Instruction::Load) || |
| 17071 | any_of(Range: E->Scalars, P: IsaPred<LoadInst>)) && |
| 17072 | any_of(E->Scalars, |
| 17073 | [this](Value *V) { |
| 17074 | return isa<LoadInst>(Val: V) && isVectorized(V); |
| 17075 | })) || |
| 17076 | (E->hasState() && E->isAltShuffle()) || |
| 17077 | all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) || |
| 17078 | isSplat(VL: E->Scalars) || |
| 17079 | (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { |
| 17080 | GatherShuffles = |
| 17081 | isGatherShuffledEntry(TE: E, VL: GatheredScalars, Mask, Entries, NumParts); |
| 17082 | } |
| 17083 | if (!GatherShuffles.empty()) { |
| 17084 | if (std::optional<ResTy> Delayed = |
| 17085 | ShuffleBuilder.needToDelay(E, Entries)) { |
| 17086 | // Delay emission of gathers which are not ready yet. |
| 17087 | PostponedGathers.insert(X: E); |
| 17088 | // Postpone gather emission, will be emitted after the end of the |
| 17089 | // process to keep correct order. |
| 17090 | return *Delayed; |
| 17091 | } |
| 17092 | if (GatherShuffles.size() == 1 && |
| 17093 | *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && |
| 17094 | Entries.front().front()->isSame(VL: E->Scalars)) { |
| 17095 | // Perfect match in the graph, will reuse the previously vectorized |
| 17096 | // node. Cost is 0. |
| 17097 | LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle " |
| 17098 | << shortBundleName(E->Scalars, E->Idx) << ".\n" ); |
| 17099 | // Restore the mask for previous partially matched values. |
| 17100 | Mask.resize(N: E->Scalars.size()); |
| 17101 | const TreeEntry *FrontTE = Entries.front().front(); |
| 17102 | if (FrontTE->ReorderIndices.empty() && |
| 17103 | ((FrontTE->ReuseShuffleIndices.empty() && |
| 17104 | E->Scalars.size() == FrontTE->Scalars.size()) || |
| 17105 | (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) { |
| 17106 | std::iota(first: Mask.begin(), last: Mask.end(), value: 0); |
| 17107 | } else { |
| 17108 | for (auto [I, V] : enumerate(First: E->Scalars)) { |
| 17109 | if (isa<PoisonValue>(Val: V)) { |
| 17110 | Mask[I] = PoisonMaskElem; |
| 17111 | continue; |
| 17112 | } |
| 17113 | Mask[I] = FrontTE->findLaneForValue(V); |
| 17114 | } |
| 17115 | } |
| 17116 | // Reset the builder(s) to correctly handle perfect diamond matched |
| 17117 | // nodes. |
| 17118 | ShuffleBuilder.resetForSameNode(); |
| 17119 | ShuffleBuilder.add(*FrontTE, Mask); |
| 17120 | // Full matched entry found, no need to insert subvectors. |
| 17121 | Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {}); |
| 17122 | return Res; |
| 17123 | } |
| 17124 | if (!Resized) { |
| 17125 | if (GatheredScalars.size() != VF && |
| 17126 | any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) { |
| 17127 | return any_of(TEs, [&](const TreeEntry *TE) { |
| 17128 | return TE->getVectorFactor() == VF; |
| 17129 | }); |
| 17130 | })) |
| 17131 | GatheredScalars.append(NumInputs: VF - GatheredScalars.size(), |
| 17132 | Elt: PoisonValue::get(T: OrigScalarTy)); |
| 17133 | } |
| 17134 | // Remove shuffled elements from list of gathers. |
| 17135 | for (int I = 0, Sz = Mask.size(); I < Sz; ++I) { |
| 17136 | if (Mask[I] != PoisonMaskElem) |
| 17137 | GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy); |
| 17138 | } |
| 17139 | } |
| 17140 | } |
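|       | // Packs the scalars that still need to be gathered: constants keep their
|       | // lanes and repeated values are folded into the reuse mask; for splats,
|       | // undef lanes are either redirected to a non-poisonous scalar or turned
|       | // into poison (which then requires a final freeze).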
| 17141 | auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars, |
| 17142 | SmallVectorImpl<int> &ReuseMask, |
| 17143 | bool IsRootPoison) { |
| 17144 | // For splats we can emit broadcasts instead of gathers, so try to find
| 17145 | // such sequences.
| 17146 | bool IsSplat = IsRootPoison && isSplat(VL: Scalars) && |
| 17147 | (Scalars.size() > 2 || Scalars.front() == Scalars.back()); |
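|       | // When IsSplat holds, e.g. <x, x, x, x> with a non-constant x, the value is
|       | // effectively emitted as a single insert of x into lane 0 followed by a
|       | // zero-mask (broadcast) shuffle.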
| 17148 | Scalars.append(NumInputs: VF - Scalars.size(), Elt: PoisonValue::get(T: OrigScalarTy)); |
| 17149 | SmallVector<int> UndefPos; |
| 17150 | DenseMap<Value *, unsigned> UniquePositions; |
| 17151 | // Gather unique non-const values and all constant values. |
| 17152 | // For repeated values, just shuffle them. |
| 17153 | int NumNonConsts = 0; |
| 17154 | int SinglePos = 0; |
| 17155 | for (auto [I, V] : enumerate(First&: Scalars)) { |
| 17156 | if (isa<UndefValue>(Val: V)) { |
| 17157 | if (!isa<PoisonValue>(Val: V)) { |
| 17158 | ReuseMask[I] = I; |
| 17159 | UndefPos.push_back(Elt: I); |
| 17160 | } |
| 17161 | continue; |
| 17162 | } |
| 17163 | if (isConstant(V)) { |
| 17164 | ReuseMask[I] = I; |
| 17165 | continue; |
| 17166 | } |
| 17167 | ++NumNonConsts; |
| 17168 | SinglePos = I; |
| 17169 | Value *OrigV = V; |
| 17170 | Scalars[I] = PoisonValue::get(T: OrigScalarTy); |
| 17171 | if (IsSplat) { |
| 17172 | Scalars.front() = OrigV; |
| 17173 | ReuseMask[I] = 0; |
| 17174 | } else { |
| 17175 | const auto Res = UniquePositions.try_emplace(Key: OrigV, Args&: I); |
| 17176 | Scalars[Res.first->second] = OrigV; |
| 17177 | ReuseMask[I] = Res.first->second; |
| 17178 | } |
| 17179 | } |
| 17180 | if (NumNonConsts == 1) { |
| 17181 | // Restore single insert element. |
| 17182 | if (IsSplat) { |
| 17183 | ReuseMask.assign(NumElts: VF, Elt: PoisonMaskElem); |
| 17184 | std::swap(a&: Scalars.front(), b&: Scalars[SinglePos]); |
| 17185 | if (!UndefPos.empty() && UndefPos.front() == 0) |
| 17186 | Scalars.front() = UndefValue::get(T: OrigScalarTy); |
| 17187 | } |
| 17188 | ReuseMask[SinglePos] = SinglePos; |
| 17189 | } else if (!UndefPos.empty() && IsSplat) { |
| 17190 | // For undef values, try to replace them with the simple broadcast. |
| 17191 | // We can do it if the broadcasted value is guaranteed to be |
| 17192 | // non-poisonous, or by freezing the incoming scalar value first. |
| 17193 | auto *It = find_if(Scalars, [this, E](Value *V) { |
| 17194 | return !isa<UndefValue>(Val: V) && |
| 17195 | (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) || |
| 17196 | (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) { |
| 17197 | // Check if the value already used in the same operation in |
| 17198 | // one of the nodes already. |
| 17199 | return E->UserTreeIndex.EdgeIdx != U.getOperandNo() && |
| 17200 | is_contained(Range&: E->UserTreeIndex.UserTE->Scalars, |
| 17201 | Element: U.getUser()); |
| 17202 | }))); |
| 17203 | }); |
| 17204 | if (It != Scalars.end()) { |
| 17205 | // Replace undefs by the non-poisoned scalars and emit broadcast. |
| 17206 | int Pos = std::distance(Scalars.begin(), It); |
| 17207 | for (int I : UndefPos) { |
| 17208 | // Set the undef position to the non-poisoned scalar. |
| 17209 | ReuseMask[I] = Pos; |
| 17210 | // Replace the undef with poison; in the mask it has already been
| 17211 | // replaced by the non-poisoned scalar.
| 17212 | if (I != Pos) |
| 17213 | Scalars[I] = PoisonValue::get(T: OrigScalarTy); |
| 17214 | } |
| 17215 | } else { |
| 17216 | // Replace undefs by the poisons, emit broadcast and then emit |
| 17217 | // freeze. |
| 17218 | for (int I : UndefPos) { |
| 17219 | ReuseMask[I] = PoisonMaskElem; |
| 17220 | if (isa<UndefValue>(Val: Scalars[I])) |
| 17221 | Scalars[I] = PoisonValue::get(T: OrigScalarTy); |
| 17222 | } |
| 17223 | NeedFreeze = true; |
| 17224 | } |
| 17225 | } |
| 17226 | }; |
| 17227 | if (!ExtractShuffles.empty() || !GatherShuffles.empty()) { |
| 17228 | bool IsNonPoisoned = true; |
| 17229 | bool IsUsedInExpr = true; |
| 17230 | Value *Vec1 = nullptr; |
| 17231 | if (!ExtractShuffles.empty()) { |
| 17232 | // A gather of extractelements can be represented as just a shuffle of
| 17233 | // the one or two vectors the scalars are extracted from.
| 17234 | // Find input vectors. |
| 17235 | Value *Vec2 = nullptr; |
| 17236 | for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { |
| 17237 | if (!Mask.empty() && Mask[I] != PoisonMaskElem) |
| 17238 | ExtractMask[I] = PoisonMaskElem; |
| 17239 | } |
| 17240 | if (UseVecBaseAsInput) { |
| 17241 | Vec1 = ExtractVecBase; |
| 17242 | } else { |
| 17243 | for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) { |
| 17244 | if (ExtractMask[I] == PoisonMaskElem) |
| 17245 | continue; |
| 17246 | if (isa<UndefValue>(Val: StoredGS[I])) |
| 17247 | continue; |
| 17248 | auto *EI = cast<ExtractElementInst>(Val: StoredGS[I]); |
| 17249 | Value *VecOp = EI->getVectorOperand(); |
| 17250 | if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: VecOp); |
| 17251 | !TEs.empty() && TEs.front()->VectorizedValue) |
| 17252 | VecOp = TEs.front()->VectorizedValue; |
| 17253 | if (!Vec1) { |
| 17254 | Vec1 = VecOp; |
| 17255 | } else if (Vec1 != VecOp) { |
| 17256 | assert((!Vec2 || Vec2 == VecOp) && |
| 17257 | "Expected only 1 or 2 vectors shuffle." ); |
| 17258 | Vec2 = VecOp; |
| 17259 | } |
| 17260 | } |
| 17261 | } |
| 17262 | if (Vec2) { |
| 17263 | IsUsedInExpr = false; |
| 17264 | IsNonPoisoned &= isGuaranteedNotToBePoison(V: Vec1, AC) && |
| 17265 | isGuaranteedNotToBePoison(V: Vec2, AC); |
| 17266 | ShuffleBuilder.add(Vec1, Vec2, ExtractMask); |
| 17267 | } else if (Vec1) { |
| 17268 | bool IsNotPoisonedVec = isGuaranteedNotToBePoison(V: Vec1, AC); |
| 17269 | IsUsedInExpr &= FindReusedSplat( |
| 17270 | ExtractMask, |
| 17271 | cast<FixedVectorType>(Val: Vec1->getType())->getNumElements(), 0, |
| 17272 | ExtractMask.size(), IsNotPoisonedVec); |
| 17273 | ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true); |
| 17274 | IsNonPoisoned &= IsNotPoisonedVec; |
| 17275 | } else { |
| 17276 | IsUsedInExpr = false; |
| 17277 | ShuffleBuilder.add(PoisonValue::get(T: VecTy), ExtractMask, |
| 17278 | /*ForExtracts=*/true); |
| 17279 | } |
| 17280 | } |
| 17281 | if (!GatherShuffles.empty()) { |
| 17282 | unsigned SliceSize = |
| 17283 | getPartNumElems(Size: E->Scalars.size(), |
| 17284 | NumParts: ::getNumberOfParts(TTI: *TTI, VecTy, Limit: E->Scalars.size())); |
| 17285 | SmallVector<int> VecMask(Mask.size(), PoisonMaskElem); |
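|       | // Each matched part contributes a full-width mask that is poison outside
|       | // its own slice.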
| 17286 | for (const auto [I, TEs] : enumerate(First&: Entries)) { |
| 17287 | if (TEs.empty()) { |
| 17288 | assert(!GatherShuffles[I] && |
| 17289 | "No shuffles with empty entries list expected." ); |
| 17290 | continue; |
| 17291 | } |
| 17292 | assert((TEs.size() == 1 || TEs.size() == 2) && |
| 17293 | "Expected shuffle of 1 or 2 entries." ); |
| 17294 | unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I); |
| 17295 | auto SubMask = ArrayRef(Mask).slice(N: I * SliceSize, M: Limit); |
| 17296 | VecMask.assign(NumElts: VecMask.size(), Elt: PoisonMaskElem); |
| 17297 | copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: I * SliceSize)); |
| 17298 | if (TEs.size() == 1) { |
| 17299 | bool IsNotPoisonedVec = |
| 17300 | TEs.front()->VectorizedValue |
| 17301 | ? isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC) |
| 17302 | : true; |
| 17303 | IsUsedInExpr &= |
| 17304 | FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I, |
| 17305 | SliceSize, IsNotPoisonedVec); |
| 17306 | ShuffleBuilder.add(*TEs.front(), VecMask); |
| 17307 | IsNonPoisoned &= IsNotPoisonedVec; |
| 17308 | } else { |
| 17309 | IsUsedInExpr = false; |
| 17310 | ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask); |
| 17311 | if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue) |
| 17312 | IsNonPoisoned &= |
| 17313 | isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC) && |
| 17314 | isGuaranteedNotToBePoison(V: TEs.back()->VectorizedValue, AC); |
| 17315 | } |
| 17316 | } |
| 17317 | } |
| 17318 | // Try to figure out the best way to combine the values: build a shuffle and
| 17319 | // insert elements, or just build several shuffles.
| 17320 | // Insert non-constant scalars. |
| 17321 | SmallVector<Value *> NonConstants(GatheredScalars); |
| 17322 | int EMSz = ExtractMask.size(); |
| 17323 | int MSz = Mask.size(); |
| 17324 | // Try to build a constant vector and shuffle with it only if we currently
| 17325 | // have a single permutation and more than one scalar constant.
| 17326 | bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty(); |
| 17327 | bool IsIdentityShuffle = |
| 17328 | ((UseVecBaseAsInput || |
| 17329 | all_of(ExtractShuffles, |
| 17330 | [](const std::optional<TTI::ShuffleKind> &SK) { |
| 17331 | return SK.value_or(u: TTI::SK_PermuteTwoSrc) == |
| 17332 | TTI::SK_PermuteSingleSrc; |
| 17333 | })) && |
| 17334 | none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && |
| 17335 | ShuffleVectorInst::isIdentityMask(Mask: ExtractMask, NumSrcElts: EMSz)) || |
| 17336 | (!GatherShuffles.empty() && |
| 17337 | all_of(GatherShuffles, |
| 17338 | [](const std::optional<TTI::ShuffleKind> &SK) { |
| 17339 | return SK.value_or(u: TTI::SK_PermuteTwoSrc) == |
| 17340 | TTI::SK_PermuteSingleSrc; |
| 17341 | }) && |
| 17342 | none_of(Mask, [&](int I) { return I >= MSz; }) && |
| 17343 | ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: MSz)); |
| 17344 | bool EnoughConstsForShuffle = |
| 17345 | IsSingleShuffle && |
| 17346 | (none_of(GatheredScalars, |
| 17347 | [](Value *V) { |
| 17348 | return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V); |
| 17349 | }) || |
| 17350 | any_of(GatheredScalars, |
| 17351 | [](Value *V) { |
| 17352 | return isa<Constant>(Val: V) && !isa<UndefValue>(Val: V); |
| 17353 | })) && |
| 17354 | (!IsIdentityShuffle || |
| 17355 | (GatheredScalars.size() == 2 && |
| 17356 | any_of(GatheredScalars, |
| 17357 | [](Value *V) { return !isa<UndefValue>(Val: V); })) || |
| 17358 | count_if(GatheredScalars, [](Value *V) { |
| 17359 | return isa<Constant>(Val: V) && !isa<PoisonValue>(Val: V); |
| 17360 | }) > 1); |
| 17361 | // The NonConstants array contains just the non-constant values; GatheredScalars
| 17362 | // contains only constants used to build the final vector, which is then shuffled.
| 17363 | for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { |
| 17364 | if (EnoughConstsForShuffle && isa<Constant>(Val: GatheredScalars[I])) |
| 17365 | NonConstants[I] = PoisonValue::get(T: OrigScalarTy); |
| 17366 | else |
| 17367 | GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy); |
| 17368 | } |
| 17369 | // Generate constants for final shuffle and build a mask for them. |
| 17370 | if (!all_of(Range&: GatheredScalars, P: IsaPred<PoisonValue>)) { |
| 17371 | SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem); |
| 17372 | TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true); |
| 17373 | Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size()); |
| 17374 | ShuffleBuilder.add(BV, BVMask); |
| 17375 | } |
| 17376 | if (all_of(NonConstants, [=](Value *V) { |
| 17377 | return isa<PoisonValue>(Val: V) || |
| 17378 | (IsSingleShuffle && ((IsIdentityShuffle && |
| 17379 | IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(Val: V)); |
| 17380 | })) |
| 17381 | Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, |
| 17382 | SubVectorsMask); |
| 17383 | else |
| 17384 | Res = ShuffleBuilder.finalize( |
| 17385 | E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(), |
| 17386 | [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) { |
| 17387 | bool IsSplat = isSplat(VL: NonConstants); |
| 17388 | SmallVector<int> BVMask(Mask.size(), PoisonMaskElem); |
| 17389 | TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false); |
| 17390 | auto CheckIfSplatIsProfitable = [&]() { |
| 17391 | // Estimate the cost of splatting + shuffle and compare with |
| 17392 | // insert + shuffle. |
| 17393 | constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 17394 | Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>); |
| 17395 | if (isa<ExtractElementInst>(Val: V) || isVectorized(V)) |
| 17396 | return false; |
| 17397 | InstructionCost SplatCost = TTI->getVectorInstrCost( |
| 17398 | Opcode: Instruction::InsertElement, Val: VecTy, CostKind, /*Index=*/0, |
| 17399 | Op0: PoisonValue::get(T: VecTy), Op1: V); |
| 17400 | SmallVector<int> NewMask(Mask.begin(), Mask.end()); |
| 17401 | for (auto [Idx, I] : enumerate(First&: BVMask)) |
| 17402 | if (I != PoisonMaskElem) |
| 17403 | NewMask[Idx] = Mask.size(); |
| 17404 | SplatCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, |
| 17405 | Mask: NewMask, CostKind); |
| 17406 | InstructionCost BVCost = TTI->getVectorInstrCost( |
| 17407 | Instruction::InsertElement, VecTy, CostKind, |
| 17408 | *find_if(Mask, [](int I) { return I != PoisonMaskElem; }), |
| 17409 | Vec, V); |
| 17410 | // Shuffle required? |
| 17411 | if (count(Range&: BVMask, Element: PoisonMaskElem) < |
| 17412 | static_cast<int>(BVMask.size() - 1)) { |
| 17413 | SmallVector<int> NewMask(Mask.begin(), Mask.end()); |
| 17414 | for (auto [Idx, I] : enumerate(First&: BVMask)) |
| 17415 | if (I != PoisonMaskElem) |
| 17416 | NewMask[Idx] = I; |
| 17417 | BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, |
| 17418 | Tp: VecTy, Mask: NewMask, CostKind); |
| 17419 | } |
| 17420 | return SplatCost <= BVCost; |
| 17421 | }; |
| 17422 | if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) { |
| 17423 | for (auto [Idx, I] : enumerate(First&: BVMask)) |
| 17424 | if (I != PoisonMaskElem) |
| 17425 | Mask[Idx] = I; |
| 17426 | Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec); |
| 17427 | } else { |
| 17428 | Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>); |
| 17429 | SmallVector<Value *> Values(NonConstants.size(), |
| 17430 | PoisonValue::get(T: ScalarTy)); |
| 17431 | Values[0] = V; |
| 17432 | Value *BV = ShuffleBuilder.gather(Values, BVMask.size()); |
| 17433 | SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem); |
| 17434 | transform(BVMask, SplatMask.begin(), [](int I) { |
| 17435 | return I == PoisonMaskElem ? PoisonMaskElem : 0; |
| 17436 | }); |
| 17437 | if (!ShuffleVectorInst::isIdentityMask(Mask: SplatMask, NumSrcElts: VF)) |
| 17438 | BV = CreateShuffle(BV, nullptr, SplatMask); |
| 17439 | for (auto [Idx, I] : enumerate(First&: BVMask)) |
| 17440 | if (I != PoisonMaskElem) |
| 17441 | Mask[Idx] = BVMask.size() + Idx; |
| 17442 | Vec = CreateShuffle(Vec, BV, Mask); |
| 17443 | for (auto [Idx, I] : enumerate(First&: Mask)) |
| 17444 | if (I != PoisonMaskElem) |
| 17445 | Mask[Idx] = Idx; |
| 17446 | } |
| 17447 | }); |
| 17448 | } else if (!allConstant(VL: GatheredScalars)) { |
| 17449 | // Gather unique scalars and all constants. |
| 17450 | SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem); |
| 17451 | TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true); |
| 17452 | Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size()); |
| 17453 | ShuffleBuilder.add(BV, ReuseMask); |
| 17454 | Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, |
| 17455 | SubVectorsMask); |
| 17456 | } else { |
| 17457 | // Gather all constants. |
| 17458 | SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem); |
| 17459 | for (auto [I, V] : enumerate(First&: GatheredScalars)) { |
| 17460 | if (!isa<PoisonValue>(Val: V)) |
| 17461 | Mask[I] = I; |
| 17462 | } |
| 17463 | Value *BV = ShuffleBuilder.gather(GatheredScalars); |
| 17464 | ShuffleBuilder.add(BV, Mask); |
| 17465 | Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, |
| 17466 | SubVectorsMask); |
| 17467 | } |
| 17468 | |
| 17469 | if (NeedFreeze) |
| 17470 | Res = ShuffleBuilder.createFreeze(Res); |
| 17471 | return Res; |
| 17472 | } |
| 17473 | |
| 17474 | Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) { |
| 17475 | for (auto [EIdx, _] : E->CombinedEntriesWithIndices) |
| 17476 | (void)vectorizeTree(E: VectorizableTree[EIdx].get()); |
| 17477 | return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy, |
| 17478 | Params&: Builder, Params&: *this); |
| 17479 | } |
| 17480 | |
| 17481 | /// \returns \p I after propagating metadata from \p VL only for instructions in |
| 17482 | /// \p VL. |
| 17483 | static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) { |
| 17484 | SmallVector<Value *> Insts; |
| 17485 | for (Value *V : VL) |
| 17486 | if (isa<Instruction>(Val: V)) |
| 17487 | Insts.push_back(Elt: V); |
| 17488 | return llvm::propagateMetadata(I: Inst, VL: Insts); |
| 17489 | } |
| 17490 | |
| 17491 | static DebugLoc getDebugLocFromPHI(PHINode &PN) { |
| 17492 | if (DebugLoc DL = PN.getDebugLoc()) |
| 17493 | return DL; |
| 17494 | return DebugLoc::getUnknown(); |
| 17495 | } |
| 17496 | |
| 17497 | Value *BoUpSLP::vectorizeTree(TreeEntry *E) { |
| 17498 | IRBuilderBase::InsertPointGuard Guard(Builder); |
| 17499 | |
| 17500 | Value *V = E->Scalars.front(); |
| 17501 | Type *ScalarTy = V->getType(); |
| 17502 | if (!isa<CmpInst>(Val: V)) |
| 17503 | ScalarTy = getValueType(V); |
| 17504 | auto It = MinBWs.find(Val: E); |
| 17505 | if (It != MinBWs.end()) { |
| 17506 | auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy); |
| 17507 | ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first); |
| 17508 | if (VecTy) |
| 17509 | ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements()); |
| 17510 | } |
| 17511 | if (E->VectorizedValue) |
| 17512 | return E->VectorizedValue; |
| 17513 | auto *VecTy = getWidenedType(ScalarTy, VF: E->Scalars.size()); |
| 17514 | if (E->isGather()) { |
| 17515 | // Set insert point for non-reduction initial nodes. |
| 17516 | if (E->hasState() && E->Idx == 0 && !UserIgnoreList) |
| 17517 | setInsertPointAfterBundle(E); |
| 17518 | Value *Vec = createBuildVector(E, ScalarTy); |
| 17519 | E->VectorizedValue = Vec; |
| 17520 | return Vec; |
| 17521 | } |
| 17522 | if (E->State == TreeEntry::SplitVectorize) { |
| 17523 | assert(E->CombinedEntriesWithIndices.size() == 2 && |
| 17524 | "Expected exactly 2 combined entries." ); |
| 17525 | setInsertPointAfterBundle(E); |
| 17526 | TreeEntry &OpTE1 = |
| 17527 | *VectorizableTree[E->CombinedEntriesWithIndices.front().first]; |
| 17528 | assert(OpTE1.isSame( |
| 17529 | ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) && |
| 17530 | "Expected same first part of scalars." ); |
| 17531 | Value *Op1 = vectorizeTree(E: &OpTE1); |
| 17532 | TreeEntry &OpTE2 = |
| 17533 | *VectorizableTree[E->CombinedEntriesWithIndices.back().first]; |
| 17534 | assert( |
| 17535 | OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) && |
| 17536 | "Expected same second part of scalars." ); |
| 17537 | Value *Op2 = vectorizeTree(E: &OpTE2); |
| 17538 | auto GetOperandSignedness = [&](const TreeEntry *OpE) { |
| 17539 | bool IsSigned = false; |
| 17540 | auto It = MinBWs.find(Val: OpE); |
| 17541 | if (It != MinBWs.end()) |
| 17542 | IsSigned = It->second.second; |
| 17543 | else |
| 17544 | IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) { |
| 17545 | if (isa<PoisonValue>(Val: V)) |
| 17546 | return false; |
| 17547 | return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL)); |
| 17548 | }); |
| 17549 | return IsSigned; |
| 17550 | }; |
| 17551 | if (cast<VectorType>(Val: Op1->getType())->getElementType() != |
| 17552 | ScalarTy->getScalarType()) { |
| 17553 | assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs." ); |
| 17554 | Op1 = Builder.CreateIntCast( |
| 17555 | V: Op1, |
| 17556 | DestTy: getWidenedType( |
| 17557 | ScalarTy, |
| 17558 | VF: cast<FixedVectorType>(Val: Op1->getType())->getNumElements()), |
| 17559 | isSigned: GetOperandSignedness(&OpTE1)); |
| 17560 | } |
| 17561 | if (cast<VectorType>(Val: Op2->getType())->getElementType() != |
| 17562 | ScalarTy->getScalarType()) { |
| 17563 | assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs." ); |
| 17564 | Op2 = Builder.CreateIntCast( |
| 17565 | V: Op2, |
| 17566 | DestTy: getWidenedType( |
| 17567 | ScalarTy, |
| 17568 | VF: cast<FixedVectorType>(Val: Op2->getType())->getNumElements()), |
| 17569 | isSigned: GetOperandSignedness(&OpTE2)); |
| 17570 | } |
| 17571 | if (E->ReorderIndices.empty()) { |
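|       | // No reordering: e.g. with a vector factor of 8 and the second part
|       | // starting at lane 4, Op1 is first widened with the mask
|       | // <0, 1, 2, 3, poison, poison, poison, poison> and Op2 is then inserted
|       | // starting at lane 4.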
| 17572 | SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem); |
| 17573 | std::iota( |
| 17574 | first: Mask.begin(), |
| 17575 | last: std::next(x: Mask.begin(), n: E->CombinedEntriesWithIndices.back().second), |
| 17576 | value: 0); |
| 17577 | unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy); |
| 17578 | if (ScalarTyNumElements != 1) { |
| 17579 | assert(SLPReVec && "Only supported by REVEC." ); |
| 17580 | transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask); |
| 17581 | } |
| 17582 | Value *Vec = Builder.CreateShuffleVector(V: Op1, Mask); |
| 17583 | Vec = createInsertVector(Builder, Vec, V: Op2, |
| 17584 | Index: E->CombinedEntriesWithIndices.back().second * |
| 17585 | ScalarTyNumElements); |
| 17586 | E->VectorizedValue = Vec; |
| 17587 | return Vec; |
| 17588 | } |
| 17589 | unsigned CommonVF = |
| 17590 | std::max(a: OpTE1.getVectorFactor(), b: OpTE2.getVectorFactor()); |
| 17591 | if (getNumElements(Ty: Op1->getType()) != CommonVF) { |
| 17592 | SmallVector<int> Mask(CommonVF, PoisonMaskElem); |
| 17593 | std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: OpTE1.getVectorFactor()), |
| 17594 | value: 0); |
| 17595 | Op1 = Builder.CreateShuffleVector(V: Op1, Mask); |
| 17596 | } |
| 17597 | if (getNumElements(Ty: Op2->getType()) != CommonVF) { |
| 17598 | SmallVector<int> Mask(CommonVF, PoisonMaskElem); |
| 17599 | std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: OpTE2.getVectorFactor()), |
| 17600 | value: 0); |
| 17601 | Op2 = Builder.CreateShuffleVector(V: Op2, Mask); |
| 17602 | } |
| 17603 | Value *Vec = Builder.CreateShuffleVector(V1: Op1, V2: Op2, Mask: E->getSplitMask()); |
| 17604 | E->VectorizedValue = Vec; |
| 17605 | return Vec; |
| 17606 | } |
| 17607 | |
| 17608 | bool IsReverseOrder = |
| 17609 | !E->ReorderIndices.empty() && isReverseOrder(Order: E->ReorderIndices); |
| 17610 | auto FinalShuffle = [&](Value *V, const TreeEntry *E) { |
| 17611 | ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this); |
| 17612 | if (E->getOpcode() == Instruction::Store && |
| 17613 | E->State == TreeEntry::Vectorize) { |
| 17614 | ArrayRef<int> Mask = |
| 17615 | ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()), |
| 17616 | E->ReorderIndices.size()); |
| 17617 | ShuffleBuilder.add(V1: V, Mask); |
| 17618 | } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) || |
| 17619 | E->State == TreeEntry::CompressVectorize) { |
| 17620 | ShuffleBuilder.addOrdered(V1: V, Order: {}); |
| 17621 | } else { |
| 17622 | ShuffleBuilder.addOrdered(V1: V, Order: E->ReorderIndices); |
| 17623 | } |
| 17624 | SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors( |
| 17625 | E->CombinedEntriesWithIndices.size()); |
| 17626 | transform( |
| 17627 | Range: E->CombinedEntriesWithIndices, d_first: SubVectors.begin(), F: [&](const auto &P) { |
| 17628 | return std::make_pair(VectorizableTree[P.first].get(), P.second); |
| 17629 | }); |
| 17630 | assert( |
| 17631 | (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) && |
| 17632 | "Expected either combined subnodes or reordering" ); |
| 17633 | return ShuffleBuilder.finalize(ExtMask: E->ReuseShuffleIndices, SubVectors, SubVectorsMask: {}); |
| 17634 | }; |
| 17635 | |
| 17636 | assert(!E->isGather() && "Unhandled state" ); |
| 17637 | unsigned ShuffleOrOp = |
| 17638 | E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); |
| 17639 | Instruction *VL0 = E->getMainOp(); |
| 17640 | auto GetOperandSignedness = [&](unsigned Idx) { |
| 17641 | const TreeEntry *OpE = getOperandEntry(E, Idx); |
| 17642 | bool IsSigned = false; |
| 17643 | auto It = MinBWs.find(Val: OpE); |
| 17644 | if (It != MinBWs.end()) |
| 17645 | IsSigned = It->second.second; |
| 17646 | else |
| 17647 | IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) { |
| 17648 | if (isa<PoisonValue>(Val: V)) |
| 17649 | return false; |
| 17650 | return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL)); |
| 17651 | }); |
| 17652 | return IsSigned; |
| 17653 | }; |
| 17654 | switch (ShuffleOrOp) { |
| 17655 | case Instruction::PHI: { |
| 17656 | assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() || |
| 17657 | E != VectorizableTree.front().get() || E->UserTreeIndex) && |
| 17658 | "PHI reordering is free." ); |
| 17659 | auto *PH = cast<PHINode>(Val: VL0); |
| 17660 | Builder.SetInsertPoint(TheBB: PH->getParent(), |
| 17661 | IP: PH->getParent()->getFirstNonPHIIt()); |
| 17662 | Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH)); |
| 17663 | PHINode *NewPhi = Builder.CreatePHI(Ty: VecTy, NumReservedValues: PH->getNumIncomingValues()); |
| 17664 | Value *V = NewPhi; |
| 17665 | |
| 17666 | // Adjust insertion point once all PHI's have been generated. |
| 17667 | Builder.SetInsertPoint(TheBB: PH->getParent(), |
| 17668 | IP: PH->getParent()->getFirstInsertionPt()); |
| 17669 | Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH)); |
| 17670 | |
| 17671 | V = FinalShuffle(V, E); |
| 17672 | |
| 17673 | E->VectorizedValue = V; |
| 17674 | // If the phi node is already fully emitted, exit.
| 17675 | if (NewPhi->getNumIncomingValues() != 0) |
| 17676 | return NewPhi; |
| 17677 | |
| 17678 | // PHINodes may have multiple entries from the same block. We want to |
| 17679 | // visit every block once. |
| 17680 | SmallPtrSet<BasicBlock *, 4> VisitedBBs; |
| 17681 | |
| 17682 | for (unsigned I : seq<unsigned>(Size: PH->getNumIncomingValues())) { |
| 17683 | BasicBlock *IBB = PH->getIncomingBlock(i: I); |
| 17684 | |
| 17685 | // Stop emission if all incoming values are generated. |
| 17686 | if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) { |
| 17687 | LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n" ); |
| 17688 | return NewPhi; |
| 17689 | } |
| 17690 | |
| 17691 | if (!VisitedBBs.insert(Ptr: IBB).second) { |
| 17692 | Value *VecOp = NewPhi->getIncomingValueForBlock(BB: IBB); |
| 17693 | NewPhi->addIncoming(V: VecOp, BB: IBB); |
| 17694 | TreeEntry *OpTE = getOperandEntry(E, Idx: I); |
| 17695 | assert(!OpTE->VectorizedValue && "Expected no vectorized value." ); |
| 17696 | OpTE->VectorizedValue = VecOp; |
| 17697 | continue; |
| 17698 | } |
| 17699 | |
| 17700 | Builder.SetInsertPoint(IBB->getTerminator()); |
| 17701 | Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH)); |
| 17702 | Value *Vec = vectorizeOperand(E, NodeIdx: I); |
| 17703 | if (VecTy != Vec->getType()) { |
| 17704 | assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() || |
| 17705 | MinBWs.contains(getOperandEntry(E, I))) && |
| 17706 | "Expected item in MinBWs." ); |
| 17707 | Vec = Builder.CreateIntCast(V: Vec, DestTy: VecTy, isSigned: GetOperandSignedness(I)); |
| 17708 | } |
| 17709 | NewPhi->addIncoming(V: Vec, BB: IBB); |
| 17710 | } |
| 17711 | |
| 17712 | assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && |
| 17713 | "Invalid number of incoming values" ); |
| 17714 | assert(E->VectorizedValue && "Expected vectorized value." ); |
| 17715 | return E->VectorizedValue; |
| 17716 | } |
| 17717 | |
| 17718 | case Instruction::ExtractElement: { |
| 17719 | Value *V = E->getSingleOperand(OpIdx: 0); |
| 17720 | setInsertPointAfterBundle(E); |
| 17721 | V = FinalShuffle(V, E); |
| 17722 | E->VectorizedValue = V; |
| 17723 | return V; |
| 17724 | } |
| 17725 | case Instruction::ExtractValue: { |
| 17726 | auto *LI = cast<LoadInst>(Val: E->getSingleOperand(OpIdx: 0)); |
| 17727 | Builder.SetInsertPoint(LI); |
| 17728 | Value *Ptr = LI->getPointerOperand(); |
| 17729 | LoadInst *V = Builder.CreateAlignedLoad(Ty: VecTy, Ptr, Align: LI->getAlign()); |
| 17730 | Value *NewV = ::propagateMetadata(Inst: V, VL: E->Scalars); |
| 17731 | NewV = FinalShuffle(NewV, E); |
| 17732 | E->VectorizedValue = NewV; |
| 17733 | return NewV; |
| 17734 | } |
| 17735 | case Instruction::InsertElement: { |
| 17736 | assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique" ); |
| 17737 | Builder.SetInsertPoint(cast<Instruction>(Val: E->Scalars.back())); |
| 17738 | Value *V = vectorizeOperand(E, NodeIdx: 1); |
| 17739 | ArrayRef<Value *> Op = E->getOperand(OpIdx: 1); |
| 17740 | Type *ScalarTy = Op.front()->getType(); |
| 17741 | if (cast<VectorType>(Val: V->getType())->getElementType() != ScalarTy) { |
| 17742 | assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs." ); |
| 17743 | std::pair<unsigned, bool> Res = MinBWs.lookup(Val: getOperandEntry(E, Idx: 1)); |
| 17744 | assert(Res.first > 0 && "Expected item in MinBWs." ); |
| 17745 | V = Builder.CreateIntCast( |
| 17746 | V, |
| 17747 | DestTy: getWidenedType( |
| 17748 | ScalarTy, |
| 17749 | VF: cast<FixedVectorType>(Val: V->getType())->getNumElements()), |
| 17750 | isSigned: Res.second); |
| 17751 | } |
| 17752 | |
| 17753 | // Create InsertVector shuffle if necessary |
| 17754 | auto *FirstInsert = cast<Instruction>(Val: *find_if(Range&: E->Scalars, P: [E](Value *V) { |
| 17755 | return !is_contained(Range&: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0)); |
| 17756 | })); |
| 17757 | const unsigned NumElts = |
| 17758 | cast<FixedVectorType>(Val: FirstInsert->getType())->getNumElements(); |
| 17759 | const unsigned NumScalars = E->Scalars.size(); |
| 17760 | |
| 17761 | unsigned Offset = *getElementIndex(Inst: VL0); |
| 17762 | assert(Offset < NumElts && "Failed to find vector index offset" ); |
| 17763 | |
| 17764 | // Create shuffle to resize vector |
| 17765 | SmallVector<int> Mask; |
| 17766 | if (!E->ReorderIndices.empty()) { |
| 17767 | inversePermutation(Indices: E->ReorderIndices, Mask); |
| 17768 | Mask.append(NumInputs: NumElts - NumScalars, Elt: PoisonMaskElem); |
| 17769 | } else { |
| 17770 | Mask.assign(NumElts, Elt: PoisonMaskElem); |
| 17771 | std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: NumScalars), value: 0); |
| 17772 | } |
| 17773 | // Create InsertVector shuffle if necessary |
| 17774 | bool IsIdentity = true; |
| 17775 | SmallVector<int> PrevMask(NumElts, PoisonMaskElem); |
| 17776 | Mask.swap(RHS&: PrevMask); |
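// Build the mask that places each scalar's vectorized lane at its
// insertelement position (relative to Offset), and detect whether the
// placement is an identity.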
| 17777 | for (unsigned I = 0; I < NumScalars; ++I) { |
| 17778 | Value *Scalar = E->Scalars[PrevMask[I]]; |
| 17779 | unsigned InsertIdx = *getElementIndex(Inst: Scalar); |
| 17780 | IsIdentity &= InsertIdx - Offset == I; |
| 17781 | Mask[InsertIdx - Offset] = I; |
| 17782 | } |
| 17783 | if (!IsIdentity || NumElts != NumScalars) { |
| 17784 | Value *V2 = nullptr; |
| 17785 | bool IsVNonPoisonous = |
| 17786 | !isConstant(V) && isGuaranteedNotToBePoison(V, AC); |
| 17787 | SmallVector<int> InsertMask(Mask); |
| 17788 | if (NumElts != NumScalars && Offset == 0) { |
| 17789 | // Follow all insert element instructions from the current buildvector |
| 17790 | // sequence. |
| 17791 | InsertElementInst *Ins = cast<InsertElementInst>(Val: VL0); |
| 17792 | do { |
| 17793 | std::optional<unsigned> InsertIdx = getElementIndex(Inst: Ins); |
| 17794 | if (!InsertIdx) |
| 17795 | break; |
| 17796 | if (InsertMask[*InsertIdx] == PoisonMaskElem) |
| 17797 | InsertMask[*InsertIdx] = *InsertIdx; |
| 17798 | if (!Ins->hasOneUse()) |
| 17799 | break; |
| 17800 | Ins = dyn_cast_or_null<InsertElementInst>( |
| 17801 | Val: Ins->getUniqueUndroppableUser()); |
| 17802 | } while (Ins); |
| 17803 | SmallBitVector UseMask = |
| 17804 | buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask); |
| 17805 | SmallBitVector IsFirstPoison = |
| 17806 | isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask); |
| 17807 | SmallBitVector IsFirstUndef = |
| 17808 | isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask); |
| 17809 | if (!IsFirstPoison.all()) { |
| 17810 | unsigned Idx = 0; |
| 17811 | for (unsigned I = 0; I < NumElts; I++) { |
| 17812 | if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I) && |
| 17813 | IsFirstUndef.test(Idx: I)) { |
| 17814 | if (IsVNonPoisonous) { |
| 17815 | InsertMask[I] = I < NumScalars ? I : 0; |
| 17816 | continue; |
| 17817 | } |
| 17818 | if (!V2) |
| 17819 | V2 = UndefValue::get(T: V->getType()); |
| 17820 | if (Idx >= NumScalars) |
| 17821 | Idx = NumScalars - 1; |
| 17822 | InsertMask[I] = NumScalars + Idx; |
| 17823 | ++Idx; |
| 17824 | } else if (InsertMask[I] != PoisonMaskElem && |
| 17825 | Mask[I] == PoisonMaskElem) { |
| 17826 | InsertMask[I] = PoisonMaskElem; |
| 17827 | } |
| 17828 | } |
| 17829 | } else { |
| 17830 | InsertMask = Mask; |
| 17831 | } |
| 17832 | } |
| 17833 | if (!V2) |
| 17834 | V2 = PoisonValue::get(T: V->getType()); |
| 17835 | V = Builder.CreateShuffleVector(V1: V, V2, Mask: InsertMask); |
| 17836 | if (auto *I = dyn_cast<Instruction>(Val: V)) { |
| 17837 | GatherShuffleExtractSeq.insert(X: I); |
| 17838 | CSEBlocks.insert(V: I->getParent()); |
| 17839 | } |
| 17840 | } |
| 17841 | |
| 17842 | SmallVector<int> InsertMask(NumElts, PoisonMaskElem); |
| 17843 | for (unsigned I = 0; I < NumElts; I++) { |
| 17844 | if (Mask[I] != PoisonMaskElem) |
| 17845 | InsertMask[Offset + I] = I; |
| 17846 | } |
| 17847 | SmallBitVector UseMask = |
| 17848 | buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask); |
| 17849 | SmallBitVector IsFirstUndef = |
| 17850 | isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask); |
| 17851 | if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) && |
| 17852 | NumElts != NumScalars) { |
| 17853 | if (IsFirstUndef.all()) { |
| 17854 | if (!ShuffleVectorInst::isIdentityMask(Mask: InsertMask, NumSrcElts: NumElts)) { |
| 17855 | SmallBitVector IsFirstPoison = |
| 17856 | isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask); |
| 17857 | if (!IsFirstPoison.all()) { |
| 17858 | for (unsigned I = 0; I < NumElts; I++) { |
| 17859 | if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I)) |
| 17860 | InsertMask[I] = I + NumElts; |
| 17861 | } |
| 17862 | } |
| 17863 | V = Builder.CreateShuffleVector( |
| 17864 | V1: V, |
| 17865 | V2: IsFirstPoison.all() ? PoisonValue::get(T: V->getType()) |
| 17866 | : FirstInsert->getOperand(i: 0), |
| 17867 | Mask: InsertMask, Name: cast<Instruction>(Val: E->Scalars.back())->getName()); |
| 17868 | if (auto *I = dyn_cast<Instruction>(Val: V)) { |
| 17869 | GatherShuffleExtractSeq.insert(X: I); |
| 17870 | CSEBlocks.insert(V: I->getParent()); |
| 17871 | } |
| 17872 | } |
| 17873 | } else { |
| 17874 | SmallBitVector IsFirstPoison = |
| 17875 | isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask); |
| 17876 | for (unsigned I = 0; I < NumElts; I++) { |
| 17877 | if (InsertMask[I] == PoisonMaskElem) |
| 17878 | InsertMask[I] = IsFirstPoison.test(Idx: I) ? PoisonMaskElem : I; |
| 17879 | else |
| 17880 | InsertMask[I] += NumElts; |
| 17881 | } |
| 17882 | V = Builder.CreateShuffleVector( |
| 17883 | V1: FirstInsert->getOperand(i: 0), V2: V, Mask: InsertMask, |
| 17884 | Name: cast<Instruction>(Val: E->Scalars.back())->getName()); |
| 17885 | if (auto *I = dyn_cast<Instruction>(Val: V)) { |
| 17886 | GatherShuffleExtractSeq.insert(X: I); |
| 17887 | CSEBlocks.insert(V: I->getParent()); |
| 17888 | } |
| 17889 | } |
| 17890 | } |
| 17891 | |
| 17892 | ++NumVectorInstructions; |
| 17893 | E->VectorizedValue = V; |
| 17894 | return V; |
| 17895 | } |
| 17896 | case Instruction::ZExt: |
| 17897 | case Instruction::SExt: |
| 17898 | case Instruction::FPToUI: |
| 17899 | case Instruction::FPToSI: |
| 17900 | case Instruction::FPExt: |
| 17901 | case Instruction::PtrToInt: |
| 17902 | case Instruction::IntToPtr: |
| 17903 | case Instruction::SIToFP: |
| 17904 | case Instruction::UIToFP: |
| 17905 | case Instruction::Trunc: |
| 17906 | case Instruction::FPTrunc: |
| 17907 | case Instruction::BitCast: { |
| 17908 | setInsertPointAfterBundle(E); |
| 17909 | |
| 17910 | Value *InVec = vectorizeOperand(E, NodeIdx: 0); |
| 17911 | |
| 17912 | auto *CI = cast<CastInst>(Val: VL0); |
| 17913 | Instruction::CastOps VecOpcode = CI->getOpcode(); |
| 17914 | Type *SrcScalarTy = cast<VectorType>(Val: InVec->getType())->getElementType(); |
| 17915 | auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0)); |
| 17916 | if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() && |
| 17917 | (SrcIt != MinBWs.end() || It != MinBWs.end() || |
| 17918 | SrcScalarTy != CI->getOperand(i_nocapture: 0)->getType()->getScalarType())) { |
| 17919 | // Check if the values are candidates to demote. |
| 17920 | unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy); |
| 17921 | if (SrcIt != MinBWs.end()) |
| 17922 | SrcBWSz = SrcIt->second.first; |
| 17923 | unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType()); |
| 17924 | if (BWSz == SrcBWSz) { |
| 17925 | VecOpcode = Instruction::BitCast; |
| 17926 | } else if (BWSz < SrcBWSz) { |
| 17927 | VecOpcode = Instruction::Trunc; |
| 17928 | } else if (It != MinBWs.end()) { |
| 17929 | assert(BWSz > SrcBWSz && "Invalid cast!" ); |
| 17930 | VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt; |
| 17931 | } else if (SrcIt != MinBWs.end()) { |
| 17932 | assert(BWSz > SrcBWSz && "Invalid cast!" ); |
| 17933 | VecOpcode = |
| 17934 | SrcIt->second.second ? Instruction::SExt : Instruction::ZExt; |
| 17935 | } |
| 17936 | } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() && |
| 17937 | !SrcIt->second.second) { |
| 17938 | VecOpcode = Instruction::UIToFP; |
| 17939 | } |
| 17940 | Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast) |
| 17941 | ? InVec |
| 17942 | : Builder.CreateCast(Op: VecOpcode, V: InVec, DestTy: VecTy); |
| 17943 | V = FinalShuffle(V, E); |
| 17944 | |
| 17945 | E->VectorizedValue = V; |
| 17946 | ++NumVectorInstructions; |
| 17947 | return V; |
| 17948 | } |
| 17949 | case Instruction::FCmp: |
| 17950 | case Instruction::ICmp: { |
| 17951 | setInsertPointAfterBundle(E); |
| 17952 | |
| 17953 | Value *L = vectorizeOperand(E, NodeIdx: 0); |
| 17954 | Value *R = vectorizeOperand(E, NodeIdx: 1); |
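// Operands may have been demoted to different bit widths; widen the
// narrower side so both compare operands have the same vector type.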
| 17955 | if (L->getType() != R->getType()) { |
| 17956 | assert((getOperandEntry(E, 0)->isGather() || |
| 17957 | getOperandEntry(E, 1)->isGather() || |
| 17958 | MinBWs.contains(getOperandEntry(E, 0)) || |
| 17959 | MinBWs.contains(getOperandEntry(E, 1))) && |
| 17960 | "Expected item in MinBWs." ); |
| 17961 | if (cast<VectorType>(Val: L->getType()) |
| 17962 | ->getElementType() |
| 17963 | ->getIntegerBitWidth() < cast<VectorType>(Val: R->getType()) |
| 17964 | ->getElementType() |
| 17965 | ->getIntegerBitWidth()) { |
| 17966 | Type *CastTy = R->getType(); |
| 17967 | L = Builder.CreateIntCast(V: L, DestTy: CastTy, isSigned: GetOperandSignedness(0)); |
| 17968 | } else { |
| 17969 | Type *CastTy = L->getType(); |
| 17970 | R = Builder.CreateIntCast(V: R, DestTy: CastTy, isSigned: GetOperandSignedness(1)); |
| 17971 | } |
| 17972 | } |
| 17973 | |
| 17974 | CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate(); |
| 17975 | Value *V = Builder.CreateCmp(Pred: P0, LHS: L, RHS: R); |
| 17976 | propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0); |
| 17977 | if (auto *ICmp = dyn_cast<ICmpInst>(Val: V); ICmp && It == MinBWs.end()) |
| 17978 | ICmp->setSameSign(/*B=*/false); |
| 17979 | // Do not cast for cmps. |
| 17980 | VecTy = cast<FixedVectorType>(Val: V->getType()); |
| 17981 | V = FinalShuffle(V, E); |
| 17982 | |
| 17983 | E->VectorizedValue = V; |
| 17984 | ++NumVectorInstructions; |
| 17985 | return V; |
| 17986 | } |
| 17987 | case Instruction::Select: { |
| 17988 | setInsertPointAfterBundle(E); |
| 17989 | |
| 17990 | Value *Cond = vectorizeOperand(E, NodeIdx: 0); |
| 17991 | Value *True = vectorizeOperand(E, NodeIdx: 1); |
| 17992 | Value *False = vectorizeOperand(E, NodeIdx: 2); |
| 17993 | if (True->getType() != VecTy || False->getType() != VecTy) { |
| 17994 | assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() || |
| 17995 | getOperandEntry(E, 2)->isGather() || |
| 17996 | MinBWs.contains(getOperandEntry(E, 1)) || |
| 17997 | MinBWs.contains(getOperandEntry(E, 2))) && |
| 17998 | "Expected item in MinBWs." ); |
| 17999 | if (True->getType() != VecTy) |
| 18000 | True = Builder.CreateIntCast(V: True, DestTy: VecTy, isSigned: GetOperandSignedness(1)); |
| 18001 | if (False->getType() != VecTy) |
| 18002 | False = Builder.CreateIntCast(V: False, DestTy: VecTy, isSigned: GetOperandSignedness(2)); |
| 18003 | } |
| 18004 | |
| 18005 | unsigned CondNumElements = getNumElements(Ty: Cond->getType()); |
| 18006 | unsigned TrueNumElements = getNumElements(Ty: True->getType()); |
| 18007 | assert(TrueNumElements >= CondNumElements && |
| 18008 | TrueNumElements % CondNumElements == 0 && |
| 18009 | "Cannot vectorize Instruction::Select" ); |
| 18010 | assert(TrueNumElements == getNumElements(False->getType()) && |
| 18011 | "Cannot vectorize Instruction::Select" ); |
| 18012 | if (CondNumElements != TrueNumElements) { |
// The condition has fewer elements than the true/false operands (each
// scalar select picks a whole sub-vector), so replicate every condition
// bit across its group of value elements.
| 18015 | Cond = Builder.CreateShuffleVector( |
| 18016 | V: Cond, Mask: createReplicatedMask(ReplicationFactor: TrueNumElements / CondNumElements, |
| 18017 | VF: CondNumElements)); |
| 18018 | } |
| 18019 | assert(getNumElements(Cond->getType()) == TrueNumElements && |
| 18020 | "Cannot vectorize Instruction::Select" ); |
| 18021 | Value *V = Builder.CreateSelect(C: Cond, True, False); |
| 18022 | V = FinalShuffle(V, E); |
| 18023 | |
| 18024 | E->VectorizedValue = V; |
| 18025 | ++NumVectorInstructions; |
| 18026 | return V; |
| 18027 | } |
| 18028 | case Instruction::FNeg: { |
| 18029 | setInsertPointAfterBundle(E); |
| 18030 | |
| 18031 | Value *Op = vectorizeOperand(E, NodeIdx: 0); |
| 18032 | |
| 18033 | Value *V = Builder.CreateUnOp( |
| 18034 | Opc: static_cast<Instruction::UnaryOps>(E->getOpcode()), V: Op); |
| 18035 | propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0); |
| 18036 | if (auto *I = dyn_cast<Instruction>(Val: V)) |
| 18037 | V = ::propagateMetadata(Inst: I, VL: E->Scalars); |
| 18038 | |
| 18039 | V = FinalShuffle(V, E); |
| 18040 | |
| 18041 | E->VectorizedValue = V; |
| 18042 | ++NumVectorInstructions; |
| 18043 | |
| 18044 | return V; |
| 18045 | } |
| 18046 | case Instruction::Freeze: { |
| 18047 | setInsertPointAfterBundle(E); |
| 18048 | |
| 18049 | Value *Op = vectorizeOperand(E, NodeIdx: 0); |
| 18050 | |
| 18051 | if (Op->getType() != VecTy) { |
| 18052 | assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() || |
| 18053 | MinBWs.contains(getOperandEntry(E, 0))) && |
| 18054 | "Expected item in MinBWs." ); |
| 18055 | Op = Builder.CreateIntCast(V: Op, DestTy: VecTy, isSigned: GetOperandSignedness(0)); |
| 18056 | } |
| 18057 | Value *V = Builder.CreateFreeze(V: Op); |
| 18058 | V = FinalShuffle(V, E); |
| 18059 | |
| 18060 | E->VectorizedValue = V; |
| 18061 | ++NumVectorInstructions; |
| 18062 | |
| 18063 | return V; |
| 18064 | } |
| 18065 | case Instruction::Add: |
| 18066 | case Instruction::FAdd: |
| 18067 | case Instruction::Sub: |
| 18068 | case Instruction::FSub: |
| 18069 | case Instruction::Mul: |
| 18070 | case Instruction::FMul: |
| 18071 | case Instruction::UDiv: |
| 18072 | case Instruction::SDiv: |
| 18073 | case Instruction::FDiv: |
| 18074 | case Instruction::URem: |
| 18075 | case Instruction::SRem: |
| 18076 | case Instruction::FRem: |
| 18077 | case Instruction::Shl: |
| 18078 | case Instruction::LShr: |
| 18079 | case Instruction::AShr: |
| 18080 | case Instruction::And: |
| 18081 | case Instruction::Or: |
| 18082 | case Instruction::Xor: { |
| 18083 | setInsertPointAfterBundle(E); |
| 18084 | |
| 18085 | Value *LHS = vectorizeOperand(E, NodeIdx: 0); |
| 18086 | Value *RHS = vectorizeOperand(E, NodeIdx: 1); |
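// If one operand of the demoted 'and' is all constants whose low
// (demoted-width) bits are all ones, the mask is a no-op on the demoted
// value; just reuse the other operand.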
| 18087 | if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) { |
| 18088 | for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) { |
| 18089 | ArrayRef<Value *> Ops = E->getOperand(OpIdx: I); |
| 18090 | if (all_of(Range&: Ops, P: [&](Value *Op) { |
| 18091 | auto *CI = dyn_cast<ConstantInt>(Val: Op); |
| 18092 | return CI && CI->getValue().countr_one() >= It->second.first; |
| 18093 | })) { |
Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
| 18095 | E->VectorizedValue = V; |
| 18096 | ++NumVectorInstructions; |
| 18097 | return V; |
| 18098 | } |
| 18099 | } |
| 18100 | } |
| 18101 | if (LHS->getType() != VecTy || RHS->getType() != VecTy) { |
| 18102 | assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() || |
| 18103 | getOperandEntry(E, 1)->isGather() || |
| 18104 | MinBWs.contains(getOperandEntry(E, 0)) || |
| 18105 | MinBWs.contains(getOperandEntry(E, 1))) && |
| 18106 | "Expected item in MinBWs." ); |
| 18107 | if (LHS->getType() != VecTy) |
| 18108 | LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: GetOperandSignedness(0)); |
| 18109 | if (RHS->getType() != VecTy) |
| 18110 | RHS = Builder.CreateIntCast(V: RHS, DestTy: VecTy, isSigned: GetOperandSignedness(1)); |
| 18111 | } |
| 18112 | |
| 18113 | Value *V = Builder.CreateBinOp( |
| 18114 | Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, |
| 18115 | RHS); |
| 18116 | propagateIRFlags(I: V, VL: E->Scalars, OpValue: nullptr, IncludeWrapFlags: It == MinBWs.end()); |
| 18117 | if (auto *I = dyn_cast<Instruction>(Val: V)) { |
| 18118 | V = ::propagateMetadata(Inst: I, VL: E->Scalars); |
| 18119 | // Drop nuw flags for abs(sub(commutative), true). |
| 18120 | if (!MinBWs.contains(Val: E) && ShuffleOrOp == Instruction::Sub && |
| 18121 | any_of(Range&: E->Scalars, P: [](Value *V) { |
| 18122 | return isa<PoisonValue>(Val: V) || isCommutative(I: cast<Instruction>(Val: V)); |
| 18123 | })) |
| 18124 | I->setHasNoUnsignedWrap(/*b=*/false); |
| 18125 | } |
| 18126 | |
| 18127 | V = FinalShuffle(V, E); |
| 18128 | |
| 18129 | E->VectorizedValue = V; |
| 18130 | ++NumVectorInstructions; |
| 18131 | |
| 18132 | return V; |
| 18133 | } |
| 18134 | case Instruction::Load: { |
| 18135 | // Loads are inserted at the head of the tree because we don't want to |
| 18136 | // sink them all the way down past store instructions. |
| 18137 | setInsertPointAfterBundle(E); |
| 18138 | |
| 18139 | LoadInst *LI = cast<LoadInst>(Val: VL0); |
| 18140 | Instruction *NewLI; |
| 18141 | Value *PO = LI->getPointerOperand(); |
| 18142 | if (E->State == TreeEntry::Vectorize) { |
| 18143 | NewLI = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: PO, Align: LI->getAlign()); |
| 18144 | } else if (E->State == TreeEntry::CompressVectorize) { |
| 18145 | auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] = |
| 18146 | CompressEntryToData.at(Val: E); |
| 18147 | Align CommonAlignment = LI->getAlign(); |
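// Load a wider contiguous chunk (masked if some lanes must not be touched)
// and compress the needed elements into place with a shuffle.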
| 18148 | if (IsMasked) { |
| 18149 | unsigned VF = getNumElements(Ty: LoadVecTy); |
| 18150 | SmallVector<Constant *> MaskValues( |
| 18151 | VF / getNumElements(Ty: LI->getType()), |
| 18152 | ConstantInt::getFalse(Context&: VecTy->getContext())); |
| 18153 | for (int I : CompressMask) |
| 18154 | MaskValues[I] = ConstantInt::getTrue(Context&: VecTy->getContext()); |
| 18155 | if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) { |
| 18156 | assert(SLPReVec && "Only supported by REVEC." ); |
| 18157 | MaskValues = replicateMask(Val: MaskValues, VF: VecTy->getNumElements()); |
| 18158 | } |
| 18159 | Constant *MaskValue = ConstantVector::get(V: MaskValues); |
| 18160 | NewLI = Builder.CreateMaskedLoad(Ty: LoadVecTy, Ptr: PO, Alignment: CommonAlignment, |
| 18161 | Mask: MaskValue); |
| 18162 | } else { |
| 18163 | NewLI = Builder.CreateAlignedLoad(Ty: LoadVecTy, Ptr: PO, Align: CommonAlignment); |
| 18164 | } |
| 18165 | NewLI = ::propagateMetadata(Inst: NewLI, VL: E->Scalars); |
| 18166 | // TODO: include this cost into CommonCost. |
| 18167 | if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) { |
| 18168 | assert(SLPReVec && "FixedVectorType is not expected." ); |
| 18169 | transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(), |
| 18170 | Mask&: CompressMask); |
| 18171 | } |
| 18172 | NewLI = |
| 18173 | cast<Instruction>(Val: Builder.CreateShuffleVector(V: NewLI, Mask: CompressMask)); |
| 18174 | } else if (E->State == TreeEntry::StridedVectorize) { |
| 18175 | Value *Ptr0 = cast<LoadInst>(Val: E->Scalars.front())->getPointerOperand(); |
| 18176 | Value *PtrN = cast<LoadInst>(Val: E->Scalars.back())->getPointerOperand(); |
| 18177 | PO = IsReverseOrder ? PtrN : Ptr0; |
| 18178 | std::optional<int64_t> Diff = getPointersDiff( |
| 18179 | ElemTyA: VL0->getType(), PtrA: Ptr0, ElemTyB: VL0->getType(), PtrB: PtrN, DL: *DL, SE&: *SE); |
| 18180 | Type *StrideTy = DL->getIndexType(PtrTy: PO->getType()); |
| 18181 | Value *StrideVal; |
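// Use a constant stride when the distance between the first and last
// pointers is known at compile time; otherwise compute the stride at run
// time.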
| 18182 | if (Diff) { |
| 18183 | int64_t Stride = |
| 18184 | *Diff / (static_cast<int64_t>(E->Scalars.size()) - 1); |
| 18185 | StrideVal = |
| 18186 | ConstantInt::get(Ty: StrideTy, V: (IsReverseOrder ? -1 : 1) * Stride * |
| 18187 | DL->getTypeAllocSize(Ty: ScalarTy)); |
| 18188 | } else { |
| 18189 | SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr); |
| 18190 | transform(Range&: E->Scalars, d_first: PointerOps.begin(), F: [](Value *V) { |
| 18191 | return cast<LoadInst>(Val: V)->getPointerOperand(); |
| 18192 | }); |
| 18193 | OrdersType Order; |
| 18194 | std::optional<Value *> Stride = |
| 18195 | calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order, |
| 18196 | Inst: &*Builder.GetInsertPoint()); |
| 18197 | Value *NewStride = |
| 18198 | Builder.CreateIntCast(V: *Stride, DestTy: StrideTy, /*isSigned=*/true); |
| 18199 | StrideVal = Builder.CreateMul( |
| 18200 | LHS: NewStride, |
| 18201 | RHS: ConstantInt::get( |
| 18202 | Ty: StrideTy, |
| 18203 | V: (IsReverseOrder ? -1 : 1) * |
| 18204 | static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy)))); |
| 18205 | } |
| 18206 | Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars); |
| 18207 | auto *Inst = Builder.CreateIntrinsic( |
| 18208 | ID: Intrinsic::experimental_vp_strided_load, |
| 18209 | Types: {VecTy, PO->getType(), StrideTy}, |
| 18210 | Args: {PO, StrideVal, Builder.getAllOnesMask(NumElts: VecTy->getElementCount()), |
| 18211 | Builder.getInt32(C: E->Scalars.size())}); |
| 18212 | Inst->addParamAttr( |
| 18213 | /*ArgNo=*/0, |
| 18214 | Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment)); |
| 18215 | NewLI = Inst; |
| 18216 | } else { |
| 18217 | assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state" ); |
| 18218 | Value *VecPtr = vectorizeOperand(E, NodeIdx: 0); |
| 18219 | if (isa<FixedVectorType>(Val: ScalarTy)) { |
| 18220 | assert(SLPReVec && "FixedVectorType is not expected." ); |
// CreateMaskedGather expects VecTy and VecPtr to have the same number of
// elements. We need to expand VecPtr if ScalarTy is a vector type.
| 18223 | unsigned ScalarTyNumElements = |
| 18224 | cast<FixedVectorType>(Val: ScalarTy)->getNumElements(); |
| 18225 | unsigned VecTyNumElements = |
| 18226 | cast<FixedVectorType>(Val: VecTy)->getNumElements(); |
| 18227 | assert(VecTyNumElements % ScalarTyNumElements == 0 && |
| 18228 | "Cannot expand getelementptr." ); |
| 18229 | unsigned VF = VecTyNumElements / ScalarTyNumElements; |
| 18230 | SmallVector<Constant *> Indices(VecTyNumElements); |
| 18231 | transform(Range: seq(Size: VecTyNumElements), d_first: Indices.begin(), F: [=](unsigned I) { |
| 18232 | return Builder.getInt64(C: I % ScalarTyNumElements); |
| 18233 | }); |
| 18234 | VecPtr = Builder.CreateGEP( |
| 18235 | Ty: VecTy->getElementType(), |
| 18236 | Ptr: Builder.CreateShuffleVector( |
| 18237 | V: VecPtr, Mask: createReplicatedMask(ReplicationFactor: ScalarTyNumElements, VF)), |
| 18238 | IdxList: ConstantVector::get(V: Indices)); |
| 18239 | } |
| 18240 | // Use the minimum alignment of the gathered loads. |
| 18241 | Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars); |
| 18242 | NewLI = Builder.CreateMaskedGather(Ty: VecTy, Ptrs: VecPtr, Alignment: CommonAlignment); |
| 18243 | } |
| 18244 | Value *V = E->State == TreeEntry::CompressVectorize |
| 18245 | ? NewLI |
| 18246 | : ::propagateMetadata(Inst: NewLI, VL: E->Scalars); |
| 18247 | |
| 18248 | V = FinalShuffle(V, E); |
| 18249 | E->VectorizedValue = V; |
| 18250 | ++NumVectorInstructions; |
| 18251 | return V; |
| 18252 | } |
| 18253 | case Instruction::Store: { |
| 18254 | auto *SI = cast<StoreInst>(Val: VL0); |
| 18255 | |
| 18256 | setInsertPointAfterBundle(E); |
| 18257 | |
| 18258 | Value *VecValue = vectorizeOperand(E, NodeIdx: 0); |
| 18259 | if (VecValue->getType() != VecTy) |
| 18260 | VecValue = |
| 18261 | Builder.CreateIntCast(V: VecValue, DestTy: VecTy, isSigned: GetOperandSignedness(0)); |
| 18262 | VecValue = FinalShuffle(VecValue, E); |
| 18263 | |
| 18264 | Value *Ptr = SI->getPointerOperand(); |
| 18265 | Instruction *ST; |
| 18266 | if (E->State == TreeEntry::Vectorize) { |
| 18267 | ST = Builder.CreateAlignedStore(Val: VecValue, Ptr, Align: SI->getAlign()); |
| 18268 | } else { |
| 18269 | assert(E->State == TreeEntry::StridedVectorize && |
| 18270 | "Expected either strided or consecutive stores." ); |
| 18271 | if (!E->ReorderIndices.empty()) { |
| 18272 | SI = cast<StoreInst>(Val: E->Scalars[E->ReorderIndices.front()]); |
| 18273 | Ptr = SI->getPointerOperand(); |
| 18274 | } |
| 18275 | Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E->Scalars); |
| 18276 | Type *StrideTy = DL->getIndexType(PtrTy: SI->getPointerOperandType()); |
| 18277 | auto *Inst = Builder.CreateIntrinsic( |
| 18278 | ID: Intrinsic::experimental_vp_strided_store, |
| 18279 | Types: {VecTy, Ptr->getType(), StrideTy}, |
| 18280 | Args: {VecValue, Ptr, |
| 18281 | ConstantInt::get( |
| 18282 | Ty: StrideTy, V: -static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))), |
| 18283 | Builder.getAllOnesMask(NumElts: VecTy->getElementCount()), |
| 18284 | Builder.getInt32(C: E->Scalars.size())}); |
| 18285 | Inst->addParamAttr( |
| 18286 | /*ArgNo=*/1, |
| 18287 | Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment)); |
| 18288 | ST = Inst; |
| 18289 | } |
| 18290 | |
| 18291 | Value *V = ::propagateMetadata(Inst: ST, VL: E->Scalars); |
| 18292 | |
| 18293 | E->VectorizedValue = V; |
| 18294 | ++NumVectorInstructions; |
| 18295 | return V; |
| 18296 | } |
| 18297 | case Instruction::GetElementPtr: { |
| 18298 | auto *GEP0 = cast<GetElementPtrInst>(Val: VL0); |
| 18299 | setInsertPointAfterBundle(E); |
| 18300 | |
| 18301 | Value *Op0 = vectorizeOperand(E, NodeIdx: 0); |
| 18302 | |
| 18303 | SmallVector<Value *> OpVecs; |
| 18304 | for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) { |
| 18305 | Value *OpVec = vectorizeOperand(E, NodeIdx: J); |
| 18306 | OpVecs.push_back(Elt: OpVec); |
| 18307 | } |
| 18308 | |
| 18309 | Value *V = Builder.CreateGEP(Ty: GEP0->getSourceElementType(), Ptr: Op0, IdxList: OpVecs); |
| 18310 | if (Instruction *I = dyn_cast<GetElementPtrInst>(Val: V)) { |
| 18311 | SmallVector<Value *> GEPs; |
| 18312 | for (Value *V : E->Scalars) { |
| 18313 | if (isa<GetElementPtrInst>(Val: V)) |
| 18314 | GEPs.push_back(Elt: V); |
| 18315 | } |
| 18316 | V = ::propagateMetadata(Inst: I, VL: GEPs); |
| 18317 | } |
| 18318 | |
| 18319 | V = FinalShuffle(V, E); |
| 18320 | |
| 18321 | E->VectorizedValue = V; |
| 18322 | ++NumVectorInstructions; |
| 18323 | |
| 18324 | return V; |
| 18325 | } |
| 18326 | case Instruction::Call: { |
| 18327 | CallInst *CI = cast<CallInst>(Val: VL0); |
| 18328 | setInsertPointAfterBundle(E); |
| 18329 | |
| 18330 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); |
| 18331 | |
| 18332 | SmallVector<Type *> ArgTys = buildIntrinsicArgTypes( |
| 18333 | CI, ID, VF: VecTy->getNumElements(), |
| 18334 | MinBW: It != MinBWs.end() ? It->second.first : 0, TTI); |
| 18335 | auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys); |
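// Use the vector intrinsic only when it exists and is no more expensive
// than a vector library call.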
| 18336 | bool UseIntrinsic = ID != Intrinsic::not_intrinsic && |
| 18337 | VecCallCosts.first <= VecCallCosts.second; |
| 18338 | |
| 18339 | Value *ScalarArg = nullptr; |
| 18340 | SmallVector<Value *> OpVecs; |
| 18341 | SmallVector<Type *, 2> TysForDecl; |
| 18342 | // Add return type if intrinsic is overloaded on it. |
| 18343 | if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: -1, TTI)) |
| 18344 | TysForDecl.push_back(Elt: VecTy); |
| 18345 | auto *CEI = cast<CallInst>(Val: VL0); |
| 18346 | for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) { |
| 18347 | // Some intrinsics have scalar arguments. This argument should not be |
| 18348 | // vectorized. |
| 18349 | if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI)) { |
| 18350 | ScalarArg = CEI->getArgOperand(i: I); |
// If we decided to reduce the bitwidth of the abs intrinsic, its second
// argument must be set to false (do not return poison if the value is the
// signed minimum).
| 18353 | if (ID == Intrinsic::abs && It != MinBWs.end() && |
| 18354 | It->second.first < DL->getTypeSizeInBits(Ty: CEI->getType())) |
| 18355 | ScalarArg = Builder.getFalse(); |
| 18356 | OpVecs.push_back(Elt: ScalarArg); |
| 18357 | if (isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI)) |
| 18358 | TysForDecl.push_back(Elt: ScalarArg->getType()); |
| 18359 | continue; |
| 18360 | } |
| 18361 | |
| 18362 | Value *OpVec = vectorizeOperand(E, NodeIdx: I); |
| 18363 | ScalarArg = CEI->getArgOperand(i: I); |
| 18364 | if (cast<VectorType>(Val: OpVec->getType())->getElementType() != |
| 18365 | ScalarArg->getType()->getScalarType() && |
| 18366 | It == MinBWs.end()) { |
| 18367 | auto *CastTy = |
| 18368 | getWidenedType(ScalarTy: ScalarArg->getType(), VF: VecTy->getNumElements()); |
| 18369 | OpVec = Builder.CreateIntCast(V: OpVec, DestTy: CastTy, isSigned: GetOperandSignedness(I)); |
| 18370 | } else if (It != MinBWs.end()) { |
| 18371 | OpVec = Builder.CreateIntCast(V: OpVec, DestTy: VecTy, isSigned: GetOperandSignedness(I)); |
| 18372 | } |
| 18373 | LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n" ); |
| 18374 | OpVecs.push_back(Elt: OpVec); |
| 18375 | if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI)) |
| 18376 | TysForDecl.push_back(Elt: OpVec->getType()); |
| 18377 | } |
| 18378 | |
| 18379 | Function *CF; |
| 18380 | if (!UseIntrinsic) { |
| 18381 | VFShape Shape = |
| 18382 | VFShape::get(FTy: CI->getFunctionType(), |
| 18383 | EC: ElementCount::getFixed( |
| 18384 | MinVal: static_cast<unsigned>(VecTy->getNumElements())), |
| 18385 | HasGlobalPred: false /*HasGlobalPred*/); |
| 18386 | CF = VFDatabase(*CI).getVectorizedFunction(Shape); |
| 18387 | } else { |
| 18388 | CF = Intrinsic::getOrInsertDeclaration(M: F->getParent(), id: ID, Tys: TysForDecl); |
| 18389 | } |
| 18390 | |
| 18391 | SmallVector<OperandBundleDef, 1> OpBundles; |
| 18392 | CI->getOperandBundlesAsDefs(Defs&: OpBundles); |
| 18393 | Value *V = Builder.CreateCall(Callee: CF, Args: OpVecs, OpBundles); |
| 18394 | |
| 18395 | propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0); |
| 18396 | V = FinalShuffle(V, E); |
| 18397 | |
| 18398 | E->VectorizedValue = V; |
| 18399 | ++NumVectorInstructions; |
| 18400 | return V; |
| 18401 | } |
| 18402 | case Instruction::ShuffleVector: { |
| 18403 | Value *V; |
| 18404 | if (SLPReVec && !E->isAltShuffle()) { |
| 18405 | setInsertPointAfterBundle(E); |
| 18406 | Value *Src = vectorizeOperand(E, NodeIdx: 0); |
| 18407 | SmallVector<int> ThisMask(calculateShufflevectorMask(VL: E->Scalars)); |
| 18408 | if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Val: Src)) { |
| 18409 | SmallVector<int> NewMask(ThisMask.size()); |
| 18410 | transform(Range&: ThisMask, d_first: NewMask.begin(), F: [&SVSrc](int Mask) { |
| 18411 | return SVSrc->getShuffleMask()[Mask]; |
| 18412 | }); |
| 18413 | V = Builder.CreateShuffleVector(V1: SVSrc->getOperand(i_nocapture: 0), |
| 18414 | V2: SVSrc->getOperand(i_nocapture: 1), Mask: NewMask); |
| 18415 | } else { |
| 18416 | V = Builder.CreateShuffleVector(V: Src, Mask: ThisMask); |
| 18417 | } |
| 18418 | propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0); |
| 18419 | if (auto *I = dyn_cast<Instruction>(Val: V)) |
| 18420 | V = ::propagateMetadata(Inst: I, VL: E->Scalars); |
| 18421 | V = FinalShuffle(V, E); |
| 18422 | } else { |
| 18423 | assert(E->isAltShuffle() && |
| 18424 | ((Instruction::isBinaryOp(E->getOpcode()) && |
| 18425 | Instruction::isBinaryOp(E->getAltOpcode())) || |
| 18426 | (Instruction::isCast(E->getOpcode()) && |
| 18427 | Instruction::isCast(E->getAltOpcode())) || |
| 18428 | (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) && |
| 18429 | "Invalid Shuffle Vector Operand" ); |
| 18430 | |
| 18431 | Value *LHS = nullptr, *RHS = nullptr; |
| 18432 | if (Instruction::isBinaryOp(Opcode: E->getOpcode()) || isa<CmpInst>(Val: VL0)) { |
| 18433 | setInsertPointAfterBundle(E); |
| 18434 | LHS = vectorizeOperand(E, NodeIdx: 0); |
| 18435 | RHS = vectorizeOperand(E, NodeIdx: 1); |
| 18436 | } else { |
| 18437 | setInsertPointAfterBundle(E); |
| 18438 | LHS = vectorizeOperand(E, NodeIdx: 0); |
| 18439 | } |
| 18440 | if (LHS && RHS && |
| 18441 | ((Instruction::isBinaryOp(Opcode: E->getOpcode()) && |
| 18442 | (LHS->getType() != VecTy || RHS->getType() != VecTy)) || |
| 18443 | (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()))) { |
| 18444 | assert((It != MinBWs.end() || |
| 18445 | getOperandEntry(E, 0)->State == TreeEntry::NeedToGather || |
| 18446 | getOperandEntry(E, 1)->State == TreeEntry::NeedToGather || |
| 18447 | MinBWs.contains(getOperandEntry(E, 0)) || |
| 18448 | MinBWs.contains(getOperandEntry(E, 1))) && |
| 18449 | "Expected item in MinBWs." ); |
| 18450 | Type *CastTy = VecTy; |
| 18451 | if (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()) { |
| 18452 | if (cast<VectorType>(Val: LHS->getType()) |
| 18453 | ->getElementType() |
| 18454 | ->getIntegerBitWidth() < cast<VectorType>(Val: RHS->getType()) |
| 18455 | ->getElementType() |
| 18456 | ->getIntegerBitWidth()) |
| 18457 | CastTy = RHS->getType(); |
| 18458 | else |
| 18459 | CastTy = LHS->getType(); |
| 18460 | } |
| 18461 | if (LHS->getType() != CastTy) |
| 18462 | LHS = Builder.CreateIntCast(V: LHS, DestTy: CastTy, isSigned: GetOperandSignedness(0)); |
| 18463 | if (RHS->getType() != CastTy) |
| 18464 | RHS = Builder.CreateIntCast(V: RHS, DestTy: CastTy, isSigned: GetOperandSignedness(1)); |
| 18465 | } |
| 18466 | |
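// Emit both the main and the alternate operation over the whole vector;
// the alternate-op shuffle built below selects, per lane, which of the two
// results to use.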
| 18467 | Value *V0, *V1; |
| 18468 | if (Instruction::isBinaryOp(Opcode: E->getOpcode())) { |
| 18469 | V0 = Builder.CreateBinOp( |
| 18470 | Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS); |
| 18471 | V1 = Builder.CreateBinOp( |
| 18472 | Opc: static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS); |
| 18473 | } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) { |
| 18474 | V0 = Builder.CreateCmp(Pred: CI0->getPredicate(), LHS, RHS); |
| 18475 | auto *AltCI = cast<CmpInst>(Val: E->getAltOp()); |
| 18476 | CmpInst::Predicate AltPred = AltCI->getPredicate(); |
| 18477 | V1 = Builder.CreateCmp(Pred: AltPred, LHS, RHS); |
| 18478 | } else { |
| 18479 | if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) { |
| 18480 | unsigned SrcBWSz = DL->getTypeSizeInBits( |
| 18481 | Ty: cast<VectorType>(Val: LHS->getType())->getElementType()); |
| 18482 | unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy); |
| 18483 | if (BWSz <= SrcBWSz) { |
| 18484 | if (BWSz < SrcBWSz) |
| 18485 | LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: It->second.first); |
| 18486 | assert(LHS->getType() == VecTy && |
| 18487 | "Expected same type as operand." ); |
| 18488 | if (auto *I = dyn_cast<Instruction>(Val: LHS)) |
| 18489 | LHS = ::propagateMetadata(Inst: I, VL: E->Scalars); |
| 18490 | LHS = FinalShuffle(LHS, E); |
| 18491 | E->VectorizedValue = LHS; |
| 18492 | ++NumVectorInstructions; |
| 18493 | return LHS; |
| 18494 | } |
| 18495 | } |
| 18496 | V0 = Builder.CreateCast( |
| 18497 | Op: static_cast<Instruction::CastOps>(E->getOpcode()), V: LHS, DestTy: VecTy); |
| 18498 | V1 = Builder.CreateCast( |
| 18499 | Op: static_cast<Instruction::CastOps>(E->getAltOpcode()), V: LHS, DestTy: VecTy); |
| 18500 | } |
| 18501 | // Add V0 and V1 to later analysis to try to find and remove matching |
| 18502 | // instruction, if any. |
| 18503 | for (Value *V : {V0, V1}) { |
| 18504 | if (auto *I = dyn_cast<Instruction>(Val: V)) { |
| 18505 | GatherShuffleExtractSeq.insert(X: I); |
| 18506 | CSEBlocks.insert(V: I->getParent()); |
| 18507 | } |
| 18508 | } |
| 18509 | |
| 18510 | // Create shuffle to take alternate operations from the vector. |
| 18511 | // Also, gather up main and alt scalar ops to propagate IR flags to |
| 18512 | // each vector operation. |
| 18513 | ValueList OpScalars, AltScalars; |
| 18514 | SmallVector<int> Mask; |
| 18515 | E->buildAltOpShuffleMask( |
| 18516 | IsAltOp: [E, this](Instruction *I) { |
| 18517 | assert(E->getMatchingMainOpOrAltOp(I) && |
| 18518 | "Unexpected main/alternate opcode" ); |
| 18519 | return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(), |
| 18520 | TLI: *TLI); |
| 18521 | }, |
| 18522 | Mask, OpScalars: &OpScalars, AltScalars: &AltScalars); |
| 18523 | |
| 18524 | propagateIRFlags(I: V0, VL: OpScalars, OpValue: E->getMainOp(), IncludeWrapFlags: It == MinBWs.end()); |
| 18525 | propagateIRFlags(I: V1, VL: AltScalars, OpValue: E->getAltOp(), IncludeWrapFlags: It == MinBWs.end()); |
| 18526 | auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) { |
| 18527 | // Drop nuw flags for abs(sub(commutative), true). |
| 18528 | if (auto *I = dyn_cast<Instruction>(Val: Vec); |
| 18529 | I && Opcode == Instruction::Sub && !MinBWs.contains(Val: E) && |
| 18530 | any_of(Range&: E->Scalars, P: [](Value *V) { |
| 18531 | if (isa<PoisonValue>(Val: V)) |
| 18532 | return false; |
| 18533 | auto *IV = cast<Instruction>(Val: V); |
| 18534 | return IV->getOpcode() == Instruction::Sub && isCommutative(I: IV); |
| 18535 | })) |
| 18536 | I->setHasNoUnsignedWrap(/*b=*/false); |
| 18537 | }; |
| 18538 | DropNuwFlag(V0, E->getOpcode()); |
| 18539 | DropNuwFlag(V1, E->getAltOpcode()); |
| 18540 | |
| 18541 | if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) { |
| 18542 | assert(SLPReVec && "FixedVectorType is not expected." ); |
| 18543 | transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(), Mask); |
| 18544 | } |
| 18545 | V = Builder.CreateShuffleVector(V1: V0, V2: V1, Mask); |
| 18546 | if (auto *I = dyn_cast<Instruction>(Val: V)) { |
| 18547 | V = ::propagateMetadata(Inst: I, VL: E->Scalars); |
| 18548 | GatherShuffleExtractSeq.insert(X: I); |
| 18549 | CSEBlocks.insert(V: I->getParent()); |
| 18550 | } |
| 18551 | } |
| 18552 | |
| 18553 | E->VectorizedValue = V; |
| 18554 | ++NumVectorInstructions; |
| 18555 | |
| 18556 | return V; |
| 18557 | } |
| 18558 | default: |
| 18559 | llvm_unreachable("unknown inst" ); |
| 18560 | } |
| 18561 | return nullptr; |
| 18562 | } |
| 18563 | |
| 18564 | Value *BoUpSLP::vectorizeTree() { |
| 18565 | ExtraValueToDebugLocsMap ExternallyUsedValues; |
| 18566 | return vectorizeTree(ExternallyUsedValues); |
| 18567 | } |
| 18568 | |
| 18569 | Value *BoUpSLP::vectorizeTree( |
| 18570 | const ExtraValueToDebugLocsMap &ExternallyUsedValues, |
| 18571 | Instruction *ReductionRoot, |
| 18572 | ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) { |
// Clear the Entry-to-LastInstruction table. It can be invalidated by
// scheduling and needs to be rebuilt.
| 18575 | EntryToLastInstruction.clear(); |
| 18576 | // All blocks must be scheduled before any instructions are inserted. |
| 18577 | for (auto &BSIter : BlocksSchedules) |
| 18578 | scheduleBlock(BS: BSIter.second.get()); |
// Cache the last instruction for each node to avoid side effects that may
// appear during vectorization, like extra uses, etc.
| 18581 | for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { |
| 18582 | if (TE->isGather()) |
| 18583 | continue; |
| 18584 | (void)getLastInstructionInBundle(E: TE.get()); |
| 18585 | } |
| 18586 | |
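// Set the initial insertion point: at the reduction root when vectorizing
// a reduction, otherwise at the start of the function entry block.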
| 18587 | if (ReductionRoot) |
| 18588 | Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(), |
| 18589 | IP: ReductionRoot->getIterator()); |
| 18590 | else |
| 18591 | Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin()); |
| 18592 | |
| 18593 | // Vectorize gather operands of the nodes with the external uses only. |
| 18594 | SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries; |
| 18595 | for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { |
| 18596 | if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE && |
| 18597 | TE->UserTreeIndex.UserTE->hasState() && |
| 18598 | TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize && |
| 18599 | (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI || |
| 18600 | TE->UserTreeIndex.UserTE->isAltShuffle()) && |
| 18601 | all_of(Range&: TE->UserTreeIndex.UserTE->Scalars, |
| 18602 | P: [](Value *V) { return isUsedOutsideBlock(V); })) { |
| 18603 | Instruction &LastInst = |
| 18604 | getLastInstructionInBundle(E: TE->UserTreeIndex.UserTE); |
| 18605 | GatherEntries.emplace_back(Args: TE.get(), Args: &LastInst); |
| 18606 | } |
| 18607 | } |
| 18608 | for (auto &Entry : GatherEntries) { |
| 18609 | IRBuilderBase::InsertPointGuard Guard(Builder); |
| 18610 | Builder.SetInsertPoint(Entry.second); |
| 18611 | Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc()); |
| 18612 | (void)vectorizeTree(E: Entry.first); |
| 18613 | } |
// Emit gathered loads first to generate better code for their users.
| 18616 | for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { |
| 18617 | if (GatheredLoadsEntriesFirst.has_value() && |
| 18618 | TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue && |
| 18619 | (!TE->isGather() || TE->UserTreeIndex)) { |
| 18620 | assert((TE->UserTreeIndex || |
| 18621 | (TE->getOpcode() == Instruction::Load && !TE->isGather())) && |
| 18622 | "Expected gathered load node." ); |
| 18623 | (void)vectorizeTree(E: TE.get()); |
| 18624 | } |
| 18625 | } |
| 18626 | (void)vectorizeTree(E: VectorizableTree[0].get()); |
| 18627 | // Run through the list of postponed gathers and emit them, replacing the temp |
| 18628 | // emitted allocas with actual vector instructions. |
| 18629 | ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef(); |
| 18630 | DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues; |
| 18631 | for (const TreeEntry *E : PostponedNodes) { |
| 18632 | auto *TE = const_cast<TreeEntry *>(E); |
| 18633 | auto *PrevVec = cast<Instruction>(Val&: TE->VectorizedValue); |
| 18634 | TE->VectorizedValue = nullptr; |
| 18635 | auto *UserI = cast<Instruction>(Val&: TE->UserTreeIndex.UserTE->VectorizedValue); |
// If the user is a PHI node, its vector code has to be inserted right
// before the block terminator. Since the node was delayed, there were some
// unresolved dependencies at the moment the stub instruction was emitted.
// If any of these dependencies turns out to be an operand of another PHI
// coming from this same block, the position of the stub instruction becomes
// invalid. This is because the source vector that is supposed to feed this
// gather node was inserted at the end of the block [after the stub
// instruction]. So we need to adjust the insertion point again, to the end
// of the block.
| 18644 | if (isa<PHINode>(Val: UserI)) { |
| 18645 | // Insert before all users. |
| 18646 | Instruction *InsertPt = PrevVec->getParent()->getTerminator(); |
| 18647 | for (User *U : PrevVec->users()) { |
| 18648 | if (U == UserI) |
| 18649 | continue; |
| 18650 | auto *UI = dyn_cast<Instruction>(Val: U); |
| 18651 | if (!UI || isa<PHINode>(Val: UI) || UI->getParent() != InsertPt->getParent()) |
| 18652 | continue; |
| 18653 | if (UI->comesBefore(Other: InsertPt)) |
| 18654 | InsertPt = UI; |
| 18655 | } |
| 18656 | Builder.SetInsertPoint(InsertPt); |
| 18657 | } else { |
| 18658 | Builder.SetInsertPoint(PrevVec); |
| 18659 | } |
| 18660 | Builder.SetCurrentDebugLocation(UserI->getDebugLoc()); |
| 18661 | Value *Vec = vectorizeTree(E: TE); |
| 18662 | if (auto *VecI = dyn_cast<Instruction>(Val: Vec); |
| 18663 | VecI && VecI->getParent() == Builder.GetInsertBlock() && |
| 18664 | Builder.GetInsertPoint()->comesBefore(Other: VecI)) |
| 18665 | VecI->moveBeforePreserving(BB&: *Builder.GetInsertBlock(), |
| 18666 | I: Builder.GetInsertPoint()); |
| 18667 | if (Vec->getType() != PrevVec->getType()) { |
| 18668 | assert(Vec->getType()->isIntOrIntVectorTy() && |
| 18669 | PrevVec->getType()->isIntOrIntVectorTy() && |
| 18670 | "Expected integer vector types only." ); |
| 18671 | std::optional<bool> IsSigned; |
| 18672 | for (Value *V : TE->Scalars) { |
| 18673 | if (isVectorized(V)) { |
| 18674 | for (const TreeEntry *MNTE : getTreeEntries(V)) { |
| 18675 | auto It = MinBWs.find(Val: MNTE); |
| 18676 | if (It != MinBWs.end()) { |
| 18677 | IsSigned = IsSigned.value_or(u: false) || It->second.second; |
| 18678 | if (*IsSigned) |
| 18679 | break; |
| 18680 | } |
| 18681 | } |
| 18682 | if (IsSigned.value_or(u: false)) |
| 18683 | break; |
| 18684 | // Scan through gather nodes. |
| 18685 | for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) { |
| 18686 | auto It = MinBWs.find(Val: BVE); |
| 18687 | if (It != MinBWs.end()) { |
| 18688 | IsSigned = IsSigned.value_or(u: false) || It->second.second; |
| 18689 | if (*IsSigned) |
| 18690 | break; |
| 18691 | } |
| 18692 | } |
| 18693 | if (IsSigned.value_or(u: false)) |
| 18694 | break; |
| 18695 | if (auto *EE = dyn_cast<ExtractElementInst>(Val: V)) { |
| 18696 | IsSigned = |
| 18697 | IsSigned.value_or(u: false) || |
| 18698 | !isKnownNonNegative(V: EE->getVectorOperand(), SQ: SimplifyQuery(*DL)); |
| 18699 | continue; |
| 18700 | } |
| 18701 | if (IsSigned.value_or(u: false)) |
| 18702 | break; |
| 18703 | } |
| 18704 | } |
| 18705 | if (IsSigned.value_or(u: false)) { |
| 18706 | // Final attempt - check user node. |
| 18707 | auto It = MinBWs.find(Val: TE->UserTreeIndex.UserTE); |
| 18708 | if (It != MinBWs.end()) |
| 18709 | IsSigned = It->second.second; |
| 18710 | } |
| 18711 | assert(IsSigned && |
| 18712 | "Expected user node or perfect diamond match in MinBWs." ); |
| 18713 | Vec = Builder.CreateIntCast(V: Vec, DestTy: PrevVec->getType(), isSigned: *IsSigned); |
| 18714 | } |
| 18715 | PrevVec->replaceAllUsesWith(V: Vec); |
| 18716 | PostponedValues.try_emplace(Key: Vec).first->second.push_back(Elt: TE); |
// Replace the stub vector node if it was already used for one of the
// buildvector nodes.
| 18719 | auto It = PostponedValues.find(Val: PrevVec); |
| 18720 | if (It != PostponedValues.end()) { |
| 18721 | for (TreeEntry *VTE : It->getSecond()) |
| 18722 | VTE->VectorizedValue = Vec; |
| 18723 | } |
| 18724 | eraseInstruction(I: PrevVec); |
| 18725 | } |
| 18726 | |
| 18727 | LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() |
| 18728 | << " values .\n" ); |
| 18729 | |
| 18730 | SmallVector<ShuffledInsertData<Value *>> ShuffledInserts; |
| 18731 | // Maps vector instruction to original insertelement instruction |
| 18732 | DenseMap<Value *, InsertElementInst *> VectorToInsertElement; |
| 18733 | // Maps extract Scalar to the corresponding extractelement instruction in the |
| 18734 | // basic block. Only one extractelement per block should be emitted. |
| 18735 | DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>> |
| 18736 | ScalarToEEs; |
| 18737 | SmallDenseSet<Value *, 4> UsedInserts; |
| 18738 | DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts; |
| 18739 | SmallDenseSet<Value *, 4> ScalarsWithNullptrUser; |
SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
| 18741 | // Extract all of the elements with the external uses. |
| 18742 | for (const auto &ExternalUse : ExternalUses) { |
| 18743 | Value *Scalar = ExternalUse.Scalar; |
| 18744 | llvm::User *User = ExternalUse.User; |
| 18745 | |
// Skip users that we have already RAUWed. This happens when one
// instruction has multiple uses of the same value.
| 18748 | if (User && !is_contained(Range: Scalar->users(), Element: User)) |
| 18749 | continue; |
| 18750 | const TreeEntry *E = &ExternalUse.E; |
| 18751 | assert(E && "Invalid scalar" ); |
| 18752 | assert(!E->isGather() && "Extracting from a gather list" ); |
| 18753 | // Non-instruction pointers are not deleted, just skip them. |
| 18754 | if (E->getOpcode() == Instruction::GetElementPtr && |
| 18755 | !isa<GetElementPtrInst>(Val: Scalar)) |
| 18756 | continue; |
| 18757 | |
| 18758 | Value *Vec = E->VectorizedValue; |
| 18759 | assert(Vec && "Can't find vectorizable value" ); |
| 18760 | |
| 18761 | Value *Lane = Builder.getInt32(C: ExternalUse.Lane); |
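// Helper: extract Scalar's lane from the vectorized value, extending it
// back to the original scalar type if the node was demoted; vector-typed
// scalars (insertelements) are returned as the vector itself.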
| 18762 | auto ExtractAndExtendIfNeeded = [&](Value *Vec) { |
| 18763 | if (Scalar->getType() != Vec->getType()) { |
| 18764 | Value *Ex = nullptr; |
| 18765 | Value *ExV = nullptr; |
| 18766 | auto *Inst = dyn_cast<Instruction>(Val: Scalar); |
| 18767 | bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Ptr: Inst); |
| 18768 | auto It = ScalarToEEs.find(Val: Scalar); |
| 18769 | if (It != ScalarToEEs.end()) { |
| 18770 | // No need to emit many extracts, just move the only one in the |
| 18771 | // current block. |
| 18772 | auto EEIt = It->second.find(Val: ReplaceInst ? Inst->getParent() |
| 18773 | : Builder.GetInsertBlock()); |
| 18774 | if (EEIt != It->second.end()) { |
| 18775 | Value *PrevV = EEIt->second.first; |
| 18776 | if (auto *I = dyn_cast<Instruction>(Val: PrevV); |
| 18777 | I && !ReplaceInst && |
| 18778 | Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() && |
| 18779 | Builder.GetInsertPoint()->comesBefore(Other: I)) { |
| 18780 | I->moveBefore(BB&: *Builder.GetInsertPoint()->getParent(), |
| 18781 | I: Builder.GetInsertPoint()); |
| 18782 | if (auto *CI = dyn_cast<Instruction>(Val: EEIt->second.second)) |
| 18783 | CI->moveAfter(MovePos: I); |
| 18784 | } |
| 18785 | Ex = PrevV; |
| 18786 | ExV = EEIt->second.second ? EEIt->second.second : Ex; |
| 18787 | } |
| 18788 | } |
| 18789 | if (!Ex) { |
| 18790 | // "Reuse" the existing extract to improve final codegen. |
| 18791 | if (ReplaceInst) { |
// Leave the instruction as is if extracting is cheaper and all operands
// are scalar.
| 18794 | if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst)) { |
| 18795 | IgnoredExtracts.insert(V: EE); |
| 18796 | Ex = EE; |
| 18797 | } else { |
| 18798 | auto *CloneInst = Inst->clone(); |
| 18799 | CloneInst->insertBefore(InsertPos: Inst->getIterator()); |
| 18800 | if (Inst->hasName()) |
| 18801 | CloneInst->takeName(V: Inst); |
| 18802 | Ex = CloneInst; |
| 18803 | } |
| 18804 | } else if (auto *ES = dyn_cast<ExtractElementInst>(Val: Scalar); |
| 18805 | ES && isa<Instruction>(Val: Vec)) { |
| 18806 | Value *V = ES->getVectorOperand(); |
| 18807 | auto *IVec = cast<Instruction>(Val: Vec); |
| 18808 | if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty()) |
| 18809 | V = ETEs.front()->VectorizedValue; |
| 18810 | if (auto *IV = dyn_cast<Instruction>(Val: V); |
| 18811 | !IV || IV == Vec || IV->getParent() != IVec->getParent() || |
| 18812 | IV->comesBefore(Other: IVec)) |
| 18813 | Ex = Builder.CreateExtractElement(Vec: V, Idx: ES->getIndexOperand()); |
| 18814 | else |
| 18815 | Ex = Builder.CreateExtractElement(Vec, Idx: Lane); |
| 18816 | } else if (auto *VecTy = |
| 18817 | dyn_cast<FixedVectorType>(Val: Scalar->getType())) { |
| 18818 | assert(SLPReVec && "FixedVectorType is not expected." ); |
| 18819 | unsigned VecTyNumElements = VecTy->getNumElements(); |
| 18820 | // When REVEC is enabled, we need to extract a vector. |
| 18821 | // Note: The element size of Scalar may be different from the |
| 18822 | // element size of Vec. |
| 18823 | Ex = createExtractVector(Builder, Vec, SubVecVF: VecTyNumElements, |
| 18824 | Index: ExternalUse.Lane * VecTyNumElements); |
| 18825 | } else { |
| 18826 | Ex = Builder.CreateExtractElement(Vec, Idx: Lane); |
| 18827 | } |
| 18828 | // If necessary, sign-extend or zero-extend ScalarRoot |
| 18829 | // to the larger type. |
| 18830 | ExV = Ex; |
| 18831 | if (Scalar->getType() != Ex->getType()) |
| 18832 | ExV = Builder.CreateIntCast( |
| 18833 | V: Ex, DestTy: Scalar->getType(), |
| 18834 | isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL))); |
| 18835 | auto *I = dyn_cast<Instruction>(Val: Ex); |
| 18836 | ScalarToEEs[Scalar].try_emplace(Key: I ? I->getParent() |
| 18837 | : &F->getEntryBlock(), |
| 18838 | Args: std::make_pair(x&: Ex, y&: ExV)); |
| 18839 | } |
// The then-branch of the previous if may produce constants, since
// operand 0 might be a constant.
| 18842 | if (auto *ExI = dyn_cast<Instruction>(Val: Ex); |
| 18843 | ExI && !isa<PHINode>(Val: ExI) && !mayHaveNonDefUseDependency(I: *ExI)) { |
| 18844 | GatherShuffleExtractSeq.insert(X: ExI); |
| 18845 | CSEBlocks.insert(V: ExI->getParent()); |
| 18846 | } |
| 18847 | return ExV; |
| 18848 | } |
| 18849 | assert(isa<FixedVectorType>(Scalar->getType()) && |
| 18850 | isa<InsertElementInst>(Scalar) && |
| 18851 | "In-tree scalar of vector type is not insertelement?" ); |
| 18852 | auto *IE = cast<InsertElementInst>(Val: Scalar); |
| 18853 | VectorToInsertElement.try_emplace(Key: Vec, Args&: IE); |
| 18854 | return Vec; |
| 18855 | }; |
// If User == nullptr, the Scalar remains as a scalar in vectorized
// instructions or is used as an extra argument. Generate an ExtractElement
// instruction and update the record for this scalar in
// ExternallyUsedValues.
| 18859 | if (!User) { |
| 18860 | if (!ScalarsWithNullptrUser.insert(V: Scalar).second) |
| 18861 | continue; |
| 18862 | assert( |
| 18863 | (ExternallyUsedValues.count(Scalar) || |
| 18864 | Scalar->hasNUsesOrMore(UsesLimit) || |
| 18865 | ExternalUsesAsOriginalScalar.contains(Scalar) || |
| 18866 | any_of( |
| 18867 | Scalar->users(), |
| 18868 | [&, TTI = TTI](llvm::User *U) { |
| 18869 | if (ExternalUsesAsOriginalScalar.contains(U)) |
| 18870 | return true; |
| 18871 | ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U); |
| 18872 | return !UseEntries.empty() && |
| 18873 | (E->State == TreeEntry::Vectorize || |
| 18874 | E->State == TreeEntry::StridedVectorize || |
| 18875 | E->State == TreeEntry::CompressVectorize) && |
| 18876 | any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) { |
| 18877 | return (UseEntry->State == TreeEntry::Vectorize || |
| 18878 | UseEntry->State == |
| 18879 | TreeEntry::StridedVectorize || |
| 18880 | UseEntry->State == |
| 18881 | TreeEntry::CompressVectorize) && |
| 18882 | doesInTreeUserNeedToExtract( |
| 18883 | Scalar, getRootEntryInstruction(*UseEntry), |
| 18884 | TLI, TTI); |
| 18885 | }); |
| 18886 | })) && |
| 18887 | "Scalar with nullptr User must be registered in " |
| 18888 | "ExternallyUsedValues map or remain as scalar in vectorized " |
| 18889 | "instructions" ); |
| 18890 | if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) { |
| 18891 | if (auto *PHI = dyn_cast<PHINode>(Val: VecI)) { |
| 18892 | if (PHI->getParent()->isLandingPad()) |
| 18893 | Builder.SetInsertPoint( |
| 18894 | TheBB: PHI->getParent(), |
| 18895 | IP: std::next( |
| 18896 | x: PHI->getParent()->getLandingPadInst()->getIterator())); |
| 18897 | else |
| 18898 | Builder.SetInsertPoint(TheBB: PHI->getParent(), |
| 18899 | IP: PHI->getParent()->getFirstNonPHIIt()); |
| 18900 | } else { |
| 18901 | Builder.SetInsertPoint(TheBB: VecI->getParent(), |
| 18902 | IP: std::next(x: VecI->getIterator())); |
| 18903 | } |
| 18904 | } else { |
| 18905 | Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin()); |
| 18906 | } |
| 18907 | Value *NewInst = ExtractAndExtendIfNeeded(Vec); |
| 18908 | // Required to update internally referenced instructions. |
| 18909 | if (Scalar != NewInst) { |
| 18910 | assert((!isa<ExtractElementInst>(Scalar) || |
| 18911 | !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) && |
| 18912 | "Extractelements should not be replaced." ); |
| 18913 | Scalar->replaceAllUsesWith(V: NewInst); |
| 18914 | } |
| 18915 | continue; |
| 18916 | } |
| 18917 | |
| 18918 | if (auto *VU = dyn_cast<InsertElementInst>(Val: User); |
| 18919 | VU && VU->getOperand(i_nocapture: 1) == Scalar) { |
| 18920 | // Skip if the scalar is another vector op or Vec is not an instruction. |
| 18921 | if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Val: Vec)) { |
| 18922 | if (auto *FTy = dyn_cast<FixedVectorType>(Val: User->getType())) { |
| 18923 | if (!UsedInserts.insert(V: VU).second) |
| 18924 | continue; |
| 18925 | // Need to use original vector, if the root is truncated. |
| 18926 | auto BWIt = MinBWs.find(Val: E); |
| 18927 | if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) { |
| 18928 | auto *ScalarTy = FTy->getElementType(); |
| 18929 | auto Key = std::make_pair(x&: Vec, y&: ScalarTy); |
| 18930 | auto VecIt = VectorCasts.find(Val: Key); |
| 18931 | if (VecIt == VectorCasts.end()) { |
| 18932 | IRBuilderBase::InsertPointGuard Guard(Builder); |
| 18933 | if (auto *IVec = dyn_cast<PHINode>(Val: Vec)) { |
| 18934 | if (IVec->getParent()->isLandingPad()) |
| 18935 | Builder.SetInsertPoint(TheBB: IVec->getParent(), |
| 18936 | IP: std::next(x: IVec->getParent() |
| 18937 | ->getLandingPadInst() |
| 18938 | ->getIterator())); |
| 18939 | else |
| 18940 | Builder.SetInsertPoint( |
| 18941 | IVec->getParent()->getFirstNonPHIOrDbgOrLifetime()); |
| 18942 | } else if (auto *IVec = dyn_cast<Instruction>(Val: Vec)) { |
| 18943 | Builder.SetInsertPoint(IVec->getNextNonDebugInstruction()); |
| 18944 | } |
| 18945 | Vec = Builder.CreateIntCast( |
| 18946 | V: Vec, |
| 18947 | DestTy: getWidenedType( |
| 18948 | ScalarTy, |
| 18949 | VF: cast<FixedVectorType>(Val: Vec->getType())->getNumElements()), |
| 18950 | isSigned: BWIt->second.second); |
| 18951 | VectorCasts.try_emplace(Key, Args&: Vec); |
| 18952 | } else { |
| 18953 | Vec = VecIt->second; |
| 18954 | } |
| 18955 | } |
| 18956 | |
| 18957 | std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU); |
| 18958 | if (InsertIdx) { |
| 18959 | auto *It = find_if( |
| 18960 | Range&: ShuffledInserts, P: [VU](const ShuffledInsertData<Value *> &Data) { |
| 18961 | // Checks if 2 insertelements are from the same buildvector. |
| 18962 | InsertElementInst *VecInsert = Data.InsertElements.front(); |
| 18963 | return areTwoInsertFromSameBuildVector( |
| 18964 | VU, V: VecInsert, |
| 18965 | GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); }); |
| 18966 | }); |
| 18967 | unsigned Idx = *InsertIdx; |
| 18968 | if (It == ShuffledInserts.end()) { |
| 18969 | (void)ShuffledInserts.emplace_back(); |
| 18970 | It = std::next(x: ShuffledInserts.begin(), |
| 18971 | n: ShuffledInserts.size() - 1); |
| 18972 | } |
| 18973 | SmallVectorImpl<int> &Mask = It->ValueMasks[Vec]; |
| 18974 | if (Mask.empty()) |
| 18975 | Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem); |
| 18976 | Mask[Idx] = ExternalUse.Lane; |
| 18977 | It->InsertElements.push_back(Elt: cast<InsertElementInst>(Val: User)); |
| 18978 | continue; |
| 18979 | } |
| 18980 | } |
| 18981 | } |
| 18982 | } |
| 18983 | |
| 18984 | // Generate extracts for out-of-tree users. |
| 18985 | // Find the insertion point for the extractelement lane. |
| 18986 | if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) { |
| 18987 | if (PHINode *PH = dyn_cast<PHINode>(Val: User)) { |
| 18988 | for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) { |
| 18989 | if (PH->getIncomingValue(i: I) == Scalar) { |
| 18990 | Instruction *IncomingTerminator = |
| 18991 | PH->getIncomingBlock(i: I)->getTerminator(); |
| 18992 | if (isa<CatchSwitchInst>(Val: IncomingTerminator)) { |
| 18993 | Builder.SetInsertPoint(TheBB: VecI->getParent(), |
| 18994 | IP: std::next(x: VecI->getIterator())); |
| 18995 | } else { |
| 18996 | Builder.SetInsertPoint(PH->getIncomingBlock(i: I)->getTerminator()); |
| 18997 | } |
| 18998 | Value *NewInst = ExtractAndExtendIfNeeded(Vec); |
| 18999 | PH->setOperand(i_nocapture: I, Val_nocapture: NewInst); |
| 19000 | } |
| 19001 | } |
| 19002 | } else { |
| 19003 | Builder.SetInsertPoint(cast<Instruction>(Val: User)); |
| 19004 | Value *NewInst = ExtractAndExtendIfNeeded(Vec); |
| 19005 | User->replaceUsesOfWith(From: Scalar, To: NewInst); |
| 19006 | } |
| 19007 | } else { |
| 19008 | Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin()); |
| 19009 | Value *NewInst = ExtractAndExtendIfNeeded(Vec); |
| 19010 | User->replaceUsesOfWith(From: Scalar, To: NewInst); |
| 19011 | } |
| 19012 | |
| 19013 | LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n" ); |
| 19014 | } |
| 19015 | |
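// CreateShuffle splits a combined two-source mask into per-source masks and
// lets ShuffleInstructionBuilder emit the (possibly single-source) shuffle.
// A hypothetical example with VF = 4: Mask <0, 5, 2, 7> becomes
//   CombinedMask1 = <0, poison, 2, poison>  (elements taken from V1)
//   CombinedMask2 = <poison, 1, poison, 3>  (elements taken from V2)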
| 19016 | auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) { |
| 19017 | SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem); |
| 19018 | SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem); |
| 19019 | int VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements(); |
| 19020 | for (int I = 0, E = Mask.size(); I < E; ++I) { |
| 19021 | if (Mask[I] < VF) |
| 19022 | CombinedMask1[I] = Mask[I]; |
| 19023 | else |
| 19024 | CombinedMask2[I] = Mask[I] - VF; |
| 19025 | } |
| 19026 | ShuffleInstructionBuilder ShuffleBuilder( |
| 19027 | cast<VectorType>(Val: V1->getType())->getElementType(), Builder, *this); |
| 19028 | ShuffleBuilder.add(V1, Mask: CombinedMask1); |
| 19029 | if (V2) |
| 19030 | ShuffleBuilder.add(V1: V2, Mask: CombinedMask2); |
| 19031 | return ShuffleBuilder.finalize(ExtMask: {}, SubVectors: {}, SubVectorsMask: {}); |
| 19032 | }; |
| 19033 | |
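// ResizeToVF adjusts a vector whose element count differs from the mask
// size. A rough sketch of the behavior: if the mask refers to lanes beyond
// its own size, the whole mask is materialized with CreateShuffle and the
// returned flag is true; otherwise (unless ForSingleMask is set) the vector
// is only resized with an identity-like mask and the flag stays false.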
| 19034 | auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask, |
| 19035 | bool ForSingleMask) { |
| 19036 | unsigned VF = Mask.size(); |
| 19037 | unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements(); |
| 19038 | if (VF != VecVF) { |
| 19039 | if (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); })) { |
| 19040 | Vec = CreateShuffle(Vec, nullptr, Mask); |
| 19041 | return std::make_pair(x&: Vec, y: true); |
| 19042 | } |
| 19043 | if (!ForSingleMask) { |
| 19044 | SmallVector<int> ResizeMask(VF, PoisonMaskElem); |
| 19045 | for (unsigned I = 0; I < VF; ++I) { |
| 19046 | if (Mask[I] != PoisonMaskElem) |
| 19047 | ResizeMask[Mask[I]] = Mask[I]; |
| 19048 | } |
| 19049 | Vec = CreateShuffle(Vec, nullptr, ResizeMask); |
| 19050 | } |
| 19051 | } |
| 19052 | |
| 19053 | return std::make_pair(x&: Vec, y: false); |
| 19054 | }; |
// Perform shuffling of the vectorized tree entries for better handling of
// external extracts.
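// Illustrative example (values invented): a buildvector chain such as
//   %i0 = insertelement <4 x i32> poison, i32 %a, i32 0
//   %i1 = insertelement <4 x i32> %i0, i32 %b, i32 1
// whose scalars were vectorized is rewritten below into shuffles of the
// vectorized value(s); the now-dead insertelements are erased afterwards.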
| 19057 | for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) { |
| 19058 | // Find the first and the last instruction in the list of insertelements. |
| 19059 | sort(C&: ShuffledInserts[I].InsertElements, Comp: isFirstInsertElement); |
| 19060 | InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front(); |
| 19061 | InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back(); |
| 19062 | Builder.SetInsertPoint(LastInsert); |
| 19063 | auto Vector = ShuffledInserts[I].ValueMasks.takeVector(); |
| 19064 | Value *NewInst = performExtractsShuffleAction<Value>( |
| 19065 | ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()), |
| 19066 | Base: FirstInsert->getOperand(i_nocapture: 0), |
| 19067 | GetVF: [](Value *Vec) { |
| 19068 | return cast<VectorType>(Val: Vec->getType()) |
| 19069 | ->getElementCount() |
| 19070 | .getKnownMinValue(); |
| 19071 | }, |
| 19072 | ResizeAction: ResizeToVF, |
| 19073 | Action: [FirstInsert, &CreateShuffle](ArrayRef<int> Mask, |
| 19074 | ArrayRef<Value *> Vals) { |
| 19075 | assert((Vals.size() == 1 || Vals.size() == 2) && |
| 19076 | "Expected exactly 1 or 2 input values." ); |
| 19077 | if (Vals.size() == 1) { |
| 19078 | // Do not create shuffle if the mask is a simple identity |
| 19079 | // non-resizing mask. |
| 19080 | if (Mask.size() != cast<FixedVectorType>(Val: Vals.front()->getType()) |
| 19081 | ->getNumElements() || |
| 19082 | !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size())) |
| 19083 | return CreateShuffle(Vals.front(), nullptr, Mask); |
| 19084 | return Vals.front(); |
| 19085 | } |
| 19086 | return CreateShuffle(Vals.front() ? Vals.front() |
| 19087 | : FirstInsert->getOperand(i_nocapture: 0), |
| 19088 | Vals.back(), Mask); |
| 19089 | }); |
| 19090 | auto It = ShuffledInserts[I].InsertElements.rbegin(); |
| 19091 | // Rebuild buildvector chain. |
| 19092 | InsertElementInst *II = nullptr; |
| 19093 | if (It != ShuffledInserts[I].InsertElements.rend()) |
| 19094 | II = *It; |
| 19095 | SmallVector<Instruction *> Inserts; |
| 19096 | while (It != ShuffledInserts[I].InsertElements.rend()) { |
| 19097 | assert(II && "Must be an insertelement instruction." ); |
| 19098 | if (*It == II) |
| 19099 | ++It; |
| 19100 | else |
| 19101 | Inserts.push_back(Elt: cast<Instruction>(Val: II)); |
| 19102 | II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0)); |
| 19103 | } |
| 19104 | for (Instruction *II : reverse(C&: Inserts)) { |
| 19105 | II->replaceUsesOfWith(From: II->getOperand(i: 0), To: NewInst); |
| 19106 | if (auto *NewI = dyn_cast<Instruction>(Val: NewInst)) |
| 19107 | if (II->getParent() == NewI->getParent() && II->comesBefore(Other: NewI)) |
| 19108 | II->moveAfter(MovePos: NewI); |
| 19109 | NewInst = II; |
| 19110 | } |
| 19111 | LastInsert->replaceAllUsesWith(V: NewInst); |
| 19112 | for (InsertElementInst *IE : reverse(C&: ShuffledInserts[I].InsertElements)) { |
| 19113 | IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 0), |
| 19114 | To: PoisonValue::get(T: IE->getOperand(i_nocapture: 0)->getType())); |
| 19115 | IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 1), |
| 19116 | To: PoisonValue::get(T: IE->getOperand(i_nocapture: 1)->getType())); |
| 19117 | eraseInstruction(I: IE); |
| 19118 | } |
| 19119 | CSEBlocks.insert(V: LastInsert->getParent()); |
| 19120 | } |
| 19121 | |
| 19122 | SmallVector<Instruction *> RemovedInsts; |
| 19123 | // For each vectorized value: |
| 19124 | for (auto &TEPtr : VectorizableTree) { |
| 19125 | TreeEntry *Entry = TEPtr.get(); |
| 19126 | |
| 19127 | // No need to handle users of gathered values. |
| 19128 | if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize) |
| 19129 | continue; |
| 19130 | |
| 19131 | assert(Entry->VectorizedValue && "Can't find vectorizable value" ); |
| 19132 | |
| 19133 | // For each lane: |
| 19134 | for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { |
| 19135 | Value *Scalar = Entry->Scalars[Lane]; |
| 19136 | |
| 19137 | if (Entry->getOpcode() == Instruction::GetElementPtr && |
| 19138 | !isa<GetElementPtrInst>(Val: Scalar)) |
| 19139 | continue; |
| 19140 | if (auto *EE = dyn_cast<ExtractElementInst>(Val: Scalar); |
| 19141 | EE && IgnoredExtracts.contains(V: EE)) |
| 19142 | continue; |
| 19143 | if (isa<PoisonValue>(Val: Scalar)) |
| 19144 | continue; |
| 19145 | #ifndef NDEBUG |
| 19146 | Type *Ty = Scalar->getType(); |
| 19147 | if (!Ty->isVoidTy()) { |
| 19148 | for (User *U : Scalar->users()) { |
| 19149 | LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n" ); |
| 19150 | |
| 19151 | // It is legal to delete users in the ignorelist. |
| 19152 | assert((isVectorized(U) || |
| 19153 | (UserIgnoreList && UserIgnoreList->contains(U)) || |
| 19154 | (isa_and_nonnull<Instruction>(U) && |
| 19155 | isDeleted(cast<Instruction>(U)))) && |
| 19156 | "Deleting out-of-tree value" ); |
| 19157 | } |
| 19158 | } |
| 19159 | #endif |
| 19160 | LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n" ); |
| 19161 | auto *I = cast<Instruction>(Val: Scalar); |
| 19162 | RemovedInsts.push_back(Elt: I); |
| 19163 | } |
| 19164 | } |
| 19165 | |
| 19166 | // Merge the DIAssignIDs from the about-to-be-deleted instructions into the |
| 19167 | // new vector instruction. |
| 19168 | if (auto *V = dyn_cast<Instruction>(Val&: VectorizableTree[0]->VectorizedValue)) |
| 19169 | V->mergeDIAssignID(SourceInstructions: RemovedInsts); |
| 19170 | |
| 19171 | // Clear up reduction references, if any. |
| 19172 | if (UserIgnoreList) { |
| 19173 | for (Instruction *I : RemovedInsts) { |
| 19174 | const TreeEntry *IE = getTreeEntries(V: I).front(); |
| 19175 | if (IE->Idx != 0 && |
| 19176 | !(VectorizableTree.front()->isGather() && IE->UserTreeIndex && |
| 19177 | (ValueToGatherNodes.lookup(Val: I).contains( |
| 19178 | key: VectorizableTree.front().get()) || |
| 19179 | (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() && |
| 19180 | IE->UserTreeIndex.EdgeIdx == UINT_MAX))) && |
| 19181 | !(VectorizableTree.front()->State == TreeEntry::SplitVectorize && |
| 19182 | IE->UserTreeIndex && |
| 19183 | is_contained(Range&: VectorizableTree.front()->Scalars, Element: I)) && |
| 19184 | !(GatheredLoadsEntriesFirst.has_value() && |
| 19185 | IE->Idx >= *GatheredLoadsEntriesFirst && |
| 19186 | VectorizableTree.front()->isGather() && |
| 19187 | is_contained(Range&: VectorizableTree.front()->Scalars, Element: I))) |
| 19188 | continue; |
| 19189 | SmallVector<SelectInst *> LogicalOpSelects; |
| 19190 | I->replaceUsesWithIf(New: PoisonValue::get(T: I->getType()), ShouldReplace: [&](Use &U) { |
// Do not replace the condition operand of logical and/or ops in select form.
| 19192 | bool IsPoisoningLogicalOp = isa<SelectInst>(Val: U.getUser()) && |
| 19193 | (match(V: U.getUser(), P: m_LogicalAnd()) || |
| 19194 | match(V: U.getUser(), P: m_LogicalOr())) && |
| 19195 | U.getOperandNo() == 0; |
| 19196 | if (IsPoisoningLogicalOp) { |
| 19197 | LogicalOpSelects.push_back(Elt: cast<SelectInst>(Val: U.getUser())); |
| 19198 | return false; |
| 19199 | } |
| 19200 | return UserIgnoreList->contains(V: U.getUser()); |
| 19201 | }); |
| 19202 | // Replace conditions of the poisoning logical ops with the non-poison |
| 19203 | // constant value. |
| 19204 | for (SelectInst *SI : LogicalOpSelects) |
| 19205 | SI->setCondition(Constant::getNullValue(Ty: SI->getCondition()->getType())); |
| 19206 | } |
| 19207 | } |
| 19208 | // Retain to-be-deleted instructions for some debug-info bookkeeping and alias |
| 19209 | // cache correctness. |
// NOTE: removeInstructionsAndOperands only marks the instructions for
// deletion - they are not actually deleted until later.
| 19212 | removeInstructionsAndOperands(DeadVals: ArrayRef(RemovedInsts), VectorValuesAndScales); |
| 19213 | |
| 19214 | Builder.ClearInsertionPoint(); |
| 19215 | InstrElementSize.clear(); |
| 19216 | |
| 19217 | const TreeEntry &RootTE = *VectorizableTree.front(); |
| 19218 | Value *Vec = RootTE.VectorizedValue; |
| 19219 | if (auto It = MinBWs.find(Val: &RootTE); ReductionBitWidth != 0 && |
| 19220 | It != MinBWs.end() && |
| 19221 | ReductionBitWidth != It->second.first) { |
| 19222 | IRBuilder<>::InsertPointGuard Guard(Builder); |
| 19223 | Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(), |
| 19224 | IP: ReductionRoot->getIterator()); |
| 19225 | Vec = Builder.CreateIntCast( |
| 19226 | V: Vec, |
| 19227 | DestTy: VectorType::get(ElementType: Builder.getIntNTy(N: ReductionBitWidth), |
| 19228 | EC: cast<VectorType>(Val: Vec->getType())->getElementCount()), |
| 19229 | isSigned: It->second.second); |
| 19230 | } |
| 19231 | return Vec; |
| 19232 | } |
| 19233 | |
| 19234 | void BoUpSLP::optimizeGatherSequence() { |
| 19235 | LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size() |
| 19236 | << " gather sequences instructions.\n" ); |
| 19237 | // LICM InsertElementInst sequences. |
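// E.g. an insertelement/shuffle sequence emitted inside a loop whose
// operands are all defined outside the loop is moved to the preheader, so
// the gather is built once instead of on every iteration (a sketch of the
// intent; the legality checks are performed below).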
| 19238 | for (Instruction *I : GatherShuffleExtractSeq) { |
| 19239 | if (isDeleted(I)) |
| 19240 | continue; |
| 19241 | |
| 19242 | // Check if this block is inside a loop. |
| 19243 | Loop *L = LI->getLoopFor(BB: I->getParent()); |
| 19244 | if (!L) |
| 19245 | continue; |
| 19246 | |
| 19247 | // Check if it has a preheader. |
BasicBlock *PreHeader = L->getLoopPreheader();
| 19249 | if (!PreHeader) |
| 19250 | continue; |
| 19251 | |
| 19252 | // If the vector or the element that we insert into it are |
| 19253 | // instructions that are defined in this basic block then we can't |
| 19254 | // hoist this instruction. |
| 19255 | if (any_of(Range: I->operands(), P: [L](Value *V) { |
| 19256 | auto *OpI = dyn_cast<Instruction>(Val: V); |
| 19257 | return OpI && L->contains(Inst: OpI); |
| 19258 | })) |
| 19259 | continue; |
| 19260 | |
| 19261 | // We can hoist this instruction. Move it to the pre-header. |
| 19262 | I->moveBefore(InsertPos: PreHeader->getTerminator()->getIterator()); |
| 19263 | CSEBlocks.insert(V: PreHeader); |
| 19264 | } |
| 19265 | |
| 19266 | // Make a list of all reachable blocks in our CSE queue. |
| 19267 | SmallVector<const DomTreeNode *, 8> CSEWorkList; |
| 19268 | CSEWorkList.reserve(N: CSEBlocks.size()); |
| 19269 | for (BasicBlock *BB : CSEBlocks) |
| 19270 | if (DomTreeNode *N = DT->getNode(BB)) { |
| 19271 | assert(DT->isReachableFromEntry(N)); |
| 19272 | CSEWorkList.push_back(Elt: N); |
| 19273 | } |
| 19274 | |
| 19275 | // Sort blocks by domination. This ensures we visit a block after all blocks |
| 19276 | // dominating it are visited. |
| 19277 | llvm::sort(C&: CSEWorkList, Comp: [](const DomTreeNode *A, const DomTreeNode *B) { |
| 19278 | assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) && |
| 19279 | "Different nodes should have different DFS numbers" ); |
| 19280 | return A->getDFSNumIn() < B->getDFSNumIn(); |
| 19281 | }); |
| 19282 | |
// Less defined shuffles can be replaced by more defined copies.
// Of two shuffles, one is less defined if it has the same vector operands
// and each of its mask indices either matches the other's or is undef. E.g.
// shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
// poison, <0, 0, 0, 0>.
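// When one of the two shuffles is replaced, the surviving instruction gets
// the merged mask NewMask, built by filling poison lanes of one mask from
// the other; e.g. merging <0, 0, 0, undef> with <0, 0, 0, 0> gives
// <0, 0, 0, 0> (a simple illustration of the rule implemented below).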
| 19288 | auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1, |
| 19289 | Instruction *I2, |
| 19290 | SmallVectorImpl<int> &NewMask) { |
| 19291 | if (I1->getType() != I2->getType()) |
| 19292 | return false; |
| 19293 | auto *SI1 = dyn_cast<ShuffleVectorInst>(Val: I1); |
| 19294 | auto *SI2 = dyn_cast<ShuffleVectorInst>(Val: I2); |
| 19295 | if (!SI1 || !SI2) |
| 19296 | return I1->isIdenticalTo(I: I2); |
| 19297 | if (SI1->isIdenticalTo(I: SI2)) |
| 19298 | return true; |
| 19299 | for (int I = 0, E = SI1->getNumOperands(); I < E; ++I) |
| 19300 | if (SI1->getOperand(i_nocapture: I) != SI2->getOperand(i_nocapture: I)) |
| 19301 | return false; |
| 19302 | // Check if the second instruction is more defined than the first one. |
| 19303 | NewMask.assign(in_start: SI2->getShuffleMask().begin(), in_end: SI2->getShuffleMask().end()); |
| 19304 | ArrayRef<int> SM1 = SI1->getShuffleMask(); |
| 19305 | // Count trailing undefs in the mask to check the final number of used |
| 19306 | // registers. |
| 19307 | unsigned LastUndefsCnt = 0; |
| 19308 | for (int I = 0, E = NewMask.size(); I < E; ++I) { |
| 19309 | if (SM1[I] == PoisonMaskElem) |
| 19310 | ++LastUndefsCnt; |
| 19311 | else |
| 19312 | LastUndefsCnt = 0; |
| 19313 | if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem && |
| 19314 | NewMask[I] != SM1[I]) |
| 19315 | return false; |
| 19316 | if (NewMask[I] == PoisonMaskElem) |
| 19317 | NewMask[I] = SM1[I]; |
| 19318 | } |
| 19319 | // Check if the last undefs actually change the final number of used vector |
| 19320 | // registers. |
| 19321 | return SM1.size() - LastUndefsCnt > 1 && |
| 19322 | ::getNumberOfParts(TTI: *TTI, VecTy: SI1->getType()) == |
| 19323 | ::getNumberOfParts( |
| 19324 | TTI: *TTI, VecTy: getWidenedType(ScalarTy: SI1->getType()->getElementType(), |
| 19325 | VF: SM1.size() - LastUndefsCnt)); |
| 19326 | }; |
| 19327 | // Perform O(N^2) search over the gather/shuffle sequences and merge identical |
| 19328 | // instructions. TODO: We can further optimize this scan if we split the |
| 19329 | // instructions into different buckets based on the insert lane. |
| 19330 | SmallVector<Instruction *, 16> Visited; |
| 19331 | for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) { |
| 19332 | assert(*I && |
| 19333 | (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) && |
| 19334 | "Worklist not sorted properly!" ); |
| 19335 | BasicBlock *BB = (*I)->getBlock(); |
| 19336 | // For all instructions in blocks containing gather sequences: |
| 19337 | for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) { |
| 19338 | if (isDeleted(I: &In)) |
| 19339 | continue; |
| 19340 | if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(Val: &In) && |
| 19341 | !GatherShuffleExtractSeq.contains(key: &In)) |
| 19342 | continue; |
| 19343 | |
| 19344 | // Check if we can replace this instruction with any of the |
| 19345 | // visited instructions. |
| 19346 | bool Replaced = false; |
| 19347 | for (Instruction *&V : Visited) { |
| 19348 | SmallVector<int> NewMask; |
| 19349 | if (IsIdenticalOrLessDefined(&In, V, NewMask) && |
| 19350 | DT->dominates(A: V->getParent(), B: In.getParent())) { |
| 19351 | In.replaceAllUsesWith(V); |
| 19352 | eraseInstruction(I: &In); |
| 19353 | if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: V)) |
| 19354 | if (!NewMask.empty()) |
| 19355 | SI->setShuffleMask(NewMask); |
| 19356 | Replaced = true; |
| 19357 | break; |
| 19358 | } |
| 19359 | if (isa<ShuffleVectorInst>(Val: In) && isa<ShuffleVectorInst>(Val: V) && |
| 19360 | GatherShuffleExtractSeq.contains(key: V) && |
| 19361 | IsIdenticalOrLessDefined(V, &In, NewMask) && |
| 19362 | DT->dominates(A: In.getParent(), B: V->getParent())) { |
| 19363 | In.moveAfter(MovePos: V); |
| 19364 | V->replaceAllUsesWith(V: &In); |
| 19365 | eraseInstruction(I: V); |
| 19366 | if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: &In)) |
| 19367 | if (!NewMask.empty()) |
| 19368 | SI->setShuffleMask(NewMask); |
| 19369 | V = &In; |
| 19370 | Replaced = true; |
| 19371 | break; |
| 19372 | } |
| 19373 | } |
| 19374 | if (!Replaced) { |
| 19375 | assert(!is_contained(Visited, &In)); |
| 19376 | Visited.push_back(Elt: &In); |
| 19377 | } |
| 19378 | } |
| 19379 | } |
| 19380 | CSEBlocks.clear(); |
| 19381 | GatherShuffleExtractSeq.clear(); |
| 19382 | } |
| 19383 | |
| 19384 | BoUpSLP::ScheduleBundle & |
| 19385 | BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) { |
| 19386 | auto &BundlePtr = |
| 19387 | ScheduledBundlesList.emplace_back(Args: std::make_unique<ScheduleBundle>()); |
| 19388 | for (Value *V : VL) { |
| 19389 | if (doesNotNeedToBeScheduled(V)) |
| 19390 | continue; |
| 19391 | ScheduleData *BundleMember = getScheduleData(V); |
| 19392 | assert(BundleMember && "no ScheduleData for bundle member " |
| 19393 | "(maybe not in same basic block)" ); |
| 19394 | // Group the instructions to a bundle. |
| 19395 | BundlePtr->add(SD: BundleMember); |
| 19396 | ScheduledBundles.try_emplace(Key: cast<Instruction>(Val: V)) |
| 19397 | .first->getSecond() |
| 19398 | .push_back(Elt: BundlePtr.get()); |
| 19399 | } |
| 19400 | assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle" ); |
| 19401 | return *BundlePtr; |
| 19402 | } |
| 19403 | |
// Groups the instructions into a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle becomes ready.
| 19406 | std::optional<BoUpSLP::ScheduleBundle *> |
| 19407 | BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, |
| 19408 | const InstructionsState &S) { |
| 19409 | // No need to schedule PHIs, insertelement, extractelement and extractvalue |
| 19410 | // instructions. |
| 19411 | if (isa<PHINode>(Val: S.getMainOp()) || |
| 19412 | isVectorLikeInstWithConstOps(V: S.getMainOp()) || doesNotNeedToSchedule(VL)) |
| 19413 | return nullptr; |
| 19414 | |
| 19415 | // Initialize the instruction bundle. |
| 19416 | Instruction *OldScheduleEnd = ScheduleEnd; |
| 19417 | LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n" ); |
| 19418 | |
| 19419 | auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) { |
| 19420 | // The scheduling region got new instructions at the lower end (or it is a |
| 19421 | // new region for the first bundle). This makes it necessary to |
| 19422 | // recalculate all dependencies. |
| 19423 | // It is seldom that this needs to be done a second time after adding the |
| 19424 | // initial bundle to the region. |
| 19425 | if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) { |
| 19426 | for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { |
| 19427 | if (ScheduleData *SD = getScheduleData(I)) |
| 19428 | SD->clearDependencies(); |
| 19429 | } |
| 19430 | ReSchedule = true; |
| 19431 | } |
| 19432 | if (Bundle && !Bundle.getBundle().empty()) { |
| 19433 | LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block " |
| 19434 | << BB->getName() << "\n" ); |
| 19435 | calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP); |
| 19436 | } |
| 19437 | |
| 19438 | if (ReSchedule) { |
| 19439 | resetSchedule(); |
| 19440 | initialFillReadyList(ReadyList&: ReadyInsts); |
| 19441 | } |
| 19442 | |
| 19443 | // Now try to schedule the new bundle or (if no bundle) just calculate |
| 19444 | // dependencies. As soon as the bundle is "ready" it means that there are no |
// cyclic dependencies and we can schedule it. Note that it's important that
// we don't "schedule" the bundle yet.
| 19447 | while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) && |
| 19448 | !ReadyInsts.empty()) { |
| 19449 | ScheduleEntity *Picked = ReadyInsts.pop_back_val(); |
| 19450 | assert(Picked->isReady() && "must be ready to schedule" ); |
| 19451 | schedule(Data: Picked, ReadyList&: ReadyInsts); |
| 19452 | if (Picked == &Bundle) |
| 19453 | break; |
| 19454 | } |
| 19455 | }; |
| 19456 | |
| 19457 | // Make sure that the scheduling region contains all |
| 19458 | // instructions of the bundle. |
| 19459 | for (Value *V : VL) { |
| 19460 | if (doesNotNeedToBeScheduled(V)) |
| 19461 | continue; |
| 19462 | if (!extendSchedulingRegion(V, S)) { |
// The scheduling region may already have received new instructions at the
// lower end (or it may be a new region for the first bundle), which makes
// it necessary to recalculate all dependencies.
// Otherwise the compiler may crash trying to incorrectly calculate
// dependencies and emit instructions in the wrong order at the actual
// scheduling.
| 19469 | ScheduleBundle Invalid = ScheduleBundle::invalid(); |
| 19470 | TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid); |
| 19471 | return std::nullopt; |
| 19472 | } |
| 19473 | } |
| 19474 | |
| 19475 | bool ReSchedule = false; |
| 19476 | for (Value *V : VL) { |
| 19477 | if (doesNotNeedToBeScheduled(V)) |
| 19478 | continue; |
| 19479 | ScheduleData *BundleMember = getScheduleData(V); |
| 19480 | assert(BundleMember && |
| 19481 | "no ScheduleData for bundle member (maybe not in same basic block)" ); |
| 19482 | |
| 19483 | // Make sure we don't leave the pieces of the bundle in the ready list when |
// the whole bundle might not be ready.
| 19485 | ReadyInsts.remove(X: BundleMember); |
| 19486 | if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V); |
| 19487 | !Bundles.empty()) { |
| 19488 | for (ScheduleBundle *B : Bundles) |
| 19489 | ReadyInsts.remove(X: B); |
| 19490 | } |
| 19491 | |
| 19492 | if (!BundleMember->isScheduled()) |
| 19493 | continue; |
| 19494 | // A bundle member was scheduled as single instruction before and now |
| 19495 | // needs to be scheduled as part of the bundle. We just get rid of the |
| 19496 | // existing schedule. |
| 19497 | LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember |
| 19498 | << " was already scheduled\n" ); |
| 19499 | ReSchedule = true; |
| 19500 | } |
| 19501 | |
| 19502 | ScheduleBundle &Bundle = buildBundle(VL); |
| 19503 | TryScheduleBundleImpl(ReSchedule, Bundle); |
| 19504 | if (!Bundle.isReady()) { |
| 19505 | for (ScheduleData *BD : Bundle.getBundle()) { |
| 19506 | if (BD->isReady()) { |
| 19507 | ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: BD->getInst()); |
| 19508 | if (Bundles.empty()) { |
| 19509 | ReadyInsts.insert(X: BD); |
| 19510 | continue; |
| 19511 | } |
| 19512 | for (ScheduleBundle *B : Bundles) |
| 19513 | if (B->isReady()) |
| 19514 | ReadyInsts.insert(X: B); |
| 19515 | } |
| 19516 | } |
| 19517 | ScheduledBundlesList.pop_back(); |
| 19518 | for (Value *V : VL) { |
| 19519 | if (doesNotNeedToBeScheduled(V)) |
| 19520 | continue; |
| 19521 | ScheduledBundles.find(Val: cast<Instruction>(Val: V))->getSecond().pop_back(); |
| 19522 | } |
| 19523 | return std::nullopt; |
| 19524 | } |
| 19525 | return &Bundle; |
| 19526 | } |
| 19527 | |
| 19528 | BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { |
| 19529 | // Allocate a new ScheduleData for the instruction. |
| 19530 | if (ChunkPos >= ChunkSize) { |
| 19531 | ScheduleDataChunks.push_back(Elt: std::make_unique<ScheduleData[]>(num: ChunkSize)); |
| 19532 | ChunkPos = 0; |
| 19533 | } |
| 19534 | return &(ScheduleDataChunks.back()[ChunkPos++]); |
| 19535 | } |
| 19536 | |
| 19537 | bool BoUpSLP::BlockScheduling::extendSchedulingRegion( |
| 19538 | Value *V, const InstructionsState &S) { |
| 19539 | Instruction *I = dyn_cast<Instruction>(Val: V); |
| 19540 | assert(I && "bundle member must be an instruction" ); |
| 19541 | assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && |
| 19542 | !doesNotNeedToBeScheduled(I) && |
| 19543 | "phi nodes/insertelements/extractelements/extractvalues don't need to " |
| 19544 | "be scheduled" ); |
| 19545 | if (getScheduleData(I)) |
| 19546 | return true; |
| 19547 | if (!ScheduleStart) { |
| 19548 | // It's the first instruction in the new region. |
| 19549 | initScheduleData(FromI: I, ToI: I->getNextNode(), PrevLoadStore: nullptr, NextLoadStore: nullptr); |
| 19550 | ScheduleStart = I; |
| 19551 | ScheduleEnd = I->getNextNode(); |
| 19552 | assert(ScheduleEnd && "tried to vectorize a terminator?" ); |
| 19553 | LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n" ); |
| 19554 | return true; |
| 19555 | } |
| 19556 | // Search up and down at the same time, because we don't know if the new |
| 19557 | // instruction is above or below the existing scheduling region. |
// Ignore debug info (and other "AssumeLike" intrinsics) so that it's not
// counted against the budget. Otherwise debug info could affect codegen.
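// A rough sketch of the search below: UpIter walks from just above
// ScheduleStart towards the top of the block, DownIter walks from
// ScheduleEnd towards the bottom, one step each per iteration. Whichever
// side reaches I first decides whether the region grows upwards (new
// ScheduleStart) or downwards (new ScheduleEnd), subject to
// ScheduleRegionSizeLimit.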
| 19560 | BasicBlock::reverse_iterator UpIter = |
| 19561 | ++ScheduleStart->getIterator().getReverse(); |
| 19562 | BasicBlock::reverse_iterator UpperEnd = BB->rend(); |
| 19563 | BasicBlock::iterator DownIter = ScheduleEnd->getIterator(); |
| 19564 | BasicBlock::iterator LowerEnd = BB->end(); |
| 19565 | auto IsAssumeLikeIntr = [](const Instruction &I) { |
| 19566 | if (auto *II = dyn_cast<IntrinsicInst>(Val: &I)) |
| 19567 | return II->isAssumeLikeIntrinsic(); |
| 19568 | return false; |
| 19569 | }; |
| 19570 | UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr); |
| 19571 | DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr); |
| 19572 | while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I && |
| 19573 | &*DownIter != I) { |
| 19574 | if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { |
| 19575 | LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n" ); |
| 19576 | return false; |
| 19577 | } |
| 19578 | |
| 19579 | ++UpIter; |
| 19580 | ++DownIter; |
| 19581 | |
| 19582 | UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr); |
| 19583 | DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr); |
| 19584 | } |
| 19585 | if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) { |
| 19586 | assert(I->getParent() == ScheduleStart->getParent() && |
| 19587 | "Instruction is in wrong basic block." ); |
| 19588 | initScheduleData(FromI: I, ToI: ScheduleStart, PrevLoadStore: nullptr, NextLoadStore: FirstLoadStoreInRegion); |
| 19589 | ScheduleStart = I; |
| 19590 | LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I |
| 19591 | << "\n" ); |
| 19592 | return true; |
| 19593 | } |
| 19594 | assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) && |
| 19595 | "Expected to reach top of the basic block or instruction down the " |
| 19596 | "lower end." ); |
| 19597 | assert(I->getParent() == ScheduleEnd->getParent() && |
| 19598 | "Instruction is in wrong basic block." ); |
| 19599 | initScheduleData(FromI: ScheduleEnd, ToI: I->getNextNode(), PrevLoadStore: LastLoadStoreInRegion, |
| 19600 | NextLoadStore: nullptr); |
| 19601 | ScheduleEnd = I->getNextNode(); |
| 19602 | assert(ScheduleEnd && "tried to vectorize a terminator?" ); |
| 19603 | LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n" ); |
| 19604 | return true; |
| 19605 | } |
| 19606 | |
| 19607 | void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, |
| 19608 | Instruction *ToI, |
| 19609 | ScheduleData *PrevLoadStore, |
| 19610 | ScheduleData *NextLoadStore) { |
| 19611 | ScheduleData *CurrentLoadStore = PrevLoadStore; |
| 19612 | for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { |
| 19613 | // No need to allocate data for non-schedulable instructions. |
| 19614 | if (doesNotNeedToBeScheduled(V: I)) |
| 19615 | continue; |
| 19616 | ScheduleData *SD = ScheduleDataMap.lookup(Val: I); |
| 19617 | if (!SD) { |
| 19618 | SD = allocateScheduleDataChunks(); |
| 19619 | ScheduleDataMap[I] = SD; |
| 19620 | } |
| 19621 | assert(!isInSchedulingRegion(SD) && |
| 19622 | "new ScheduleData already in scheduling region" ); |
| 19623 | SD->init(BlockSchedulingRegionID: SchedulingRegionID, I); |
| 19624 | |
| 19625 | if (I->mayReadOrWriteMemory() && |
| 19626 | (!isa<IntrinsicInst>(Val: I) || |
| 19627 | (cast<IntrinsicInst>(Val: I)->getIntrinsicID() != Intrinsic::sideeffect && |
| 19628 | cast<IntrinsicInst>(Val: I)->getIntrinsicID() != |
| 19629 | Intrinsic::pseudoprobe))) { |
| 19630 | // Update the linked list of memory accessing instructions. |
| 19631 | if (CurrentLoadStore) { |
| 19632 | CurrentLoadStore->setNextLoadStore(SD); |
| 19633 | } else { |
| 19634 | FirstLoadStoreInRegion = SD; |
| 19635 | } |
| 19636 | CurrentLoadStore = SD; |
| 19637 | } |
| 19638 | |
| 19639 | if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) || |
| 19640 | match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>())) |
| 19641 | RegionHasStackSave = true; |
| 19642 | } |
| 19643 | if (NextLoadStore) { |
| 19644 | if (CurrentLoadStore) |
| 19645 | CurrentLoadStore->setNextLoadStore(NextLoadStore); |
| 19646 | } else { |
| 19647 | LastLoadStoreInRegion = CurrentLoadStore; |
| 19648 | } |
| 19649 | } |
| 19650 | |
| 19651 | void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, |
| 19652 | bool InsertInReadyList, |
| 19653 | BoUpSLP *SLP) { |
| 19654 | SmallVector<ScheduleData *> WorkList; |
| 19655 | auto ProcessNode = [&](ScheduleData *BundleMember) { |
| 19656 | if (BundleMember->hasValidDependencies()) |
| 19657 | return; |
| 19658 | LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n" ); |
| 19659 | BundleMember->initDependencies(); |
| 19660 | BundleMember->resetUnscheduledDeps(); |
| 19661 | // Handle def-use chain dependencies. |
| 19662 | for (User *U : BundleMember->getInst()->users()) { |
| 19663 | if (ScheduleData *UseSD = getScheduleData(V: U)) { |
| 19664 | BundleMember->incDependencies(); |
| 19665 | if (!UseSD->isScheduled()) |
| 19666 | BundleMember->incrementUnscheduledDeps(Incr: 1); |
| 19667 | WorkList.push_back(Elt: UseSD); |
| 19668 | } |
| 19669 | } |
| 19670 | |
| 19671 | auto MakeControlDependent = [&](Instruction *I) { |
| 19672 | auto *DepDest = getScheduleData(I); |
| 19673 | assert(DepDest && "must be in schedule window" ); |
| 19674 | DepDest->addControlDependency(Dep: BundleMember); |
| 19675 | BundleMember->incDependencies(); |
| 19676 | if (!DepDest->isScheduled()) |
| 19677 | BundleMember->incrementUnscheduledDeps(Incr: 1); |
| 19678 | WorkList.push_back(Elt: DepDest); |
| 19679 | }; |
| 19680 | |
// Any instruction which isn't safe to speculate at the beginning of the
// block is control dependent on any early exit or non-willreturn call
// which precedes it.
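// Hedged example: a udiv appearing after a call that may not return (or
// may throw) must not be hoisted above that call, since the division could
// otherwise execute (and e.g. trap) on a path where it originally did not
// run; the control-dependency edge added below encodes that constraint.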
| 19684 | if (!isGuaranteedToTransferExecutionToSuccessor(I: BundleMember->getInst())) { |
| 19685 | for (Instruction *I = BundleMember->getInst()->getNextNode(); |
| 19686 | I != ScheduleEnd; I = I->getNextNode()) { |
| 19687 | if (isSafeToSpeculativelyExecute(I, CtxI: &*BB->begin(), AC: SLP->AC)) |
| 19688 | continue; |
| 19689 | |
| 19690 | // Add the dependency |
| 19691 | MakeControlDependent(I); |
| 19692 | |
| 19693 | if (!isGuaranteedToTransferExecutionToSuccessor(I)) |
| 19694 | // Everything past here must be control dependent on I. |
| 19695 | break; |
| 19696 | } |
| 19697 | } |
| 19698 | |
| 19699 | if (RegionHasStackSave) { |
// If we have an inalloca alloca instruction, it needs to be scheduled
// after any preceding stacksave. We also need to prevent any alloca
// from reordering above a preceding stackrestore.
| 19703 | if (match(V: BundleMember->getInst(), P: m_Intrinsic<Intrinsic::stacksave>()) || |
| 19704 | match(V: BundleMember->getInst(), |
| 19705 | P: m_Intrinsic<Intrinsic::stackrestore>())) { |
| 19706 | for (Instruction *I = BundleMember->getInst()->getNextNode(); |
| 19707 | I != ScheduleEnd; I = I->getNextNode()) { |
| 19708 | if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) || |
| 19709 | match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>())) |
| 19710 | // Any allocas past here must be control dependent on I, and I |
// must be memory dependent on BundleMember->Inst.
| 19712 | break; |
| 19713 | |
| 19714 | if (!isa<AllocaInst>(Val: I)) |
| 19715 | continue; |
| 19716 | |
| 19717 | // Add the dependency |
| 19718 | MakeControlDependent(I); |
| 19719 | } |
| 19720 | } |
| 19721 | |
// In addition to the cases handled just above, we need to prevent
// allocas and loads/stores from moving below a stacksave or a
// stackrestore. Keeping allocas above a stackrestore is believed to be
// merely conservative, but moving loads/stores below a stackrestore
// can lead to incorrect code.
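// For illustration: a load or store in this region must not sink below a
// later stacksave/stackrestore, e.g. because it may access an alloca
// whose lifetime the stackrestore ends; the dependency added below pins
// the memory access above the first such intrinsic.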
| 19727 | if (isa<AllocaInst>(Val: BundleMember->getInst()) || |
| 19728 | BundleMember->getInst()->mayReadOrWriteMemory()) { |
| 19729 | for (Instruction *I = BundleMember->getInst()->getNextNode(); |
| 19730 | I != ScheduleEnd; I = I->getNextNode()) { |
| 19731 | if (!match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) && |
| 19732 | !match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>())) |
| 19733 | continue; |
| 19734 | |
| 19735 | // Add the dependency |
| 19736 | MakeControlDependent(I); |
| 19737 | break; |
| 19738 | } |
| 19739 | } |
| 19740 | } |
| 19741 | |
| 19742 | // Handle the memory dependencies (if any). |
| 19743 | ScheduleData *NextLoadStore = BundleMember->getNextLoadStore(); |
| 19744 | if (!NextLoadStore) |
| 19745 | return; |
| 19746 | Instruction *SrcInst = BundleMember->getInst(); |
| 19747 | assert(SrcInst->mayReadOrWriteMemory() && |
| 19748 | "NextLoadStore list for non memory effecting bundle?" ); |
| 19749 | MemoryLocation SrcLoc = getLocation(I: SrcInst); |
| 19750 | bool SrcMayWrite = SrcInst->mayWriteToMemory(); |
| 19751 | unsigned NumAliased = 0; |
| 19752 | unsigned DistToSrc = 1; |
| 19753 | bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(I: SrcInst); |
| 19754 | |
| 19755 | for (ScheduleData *DepDest = NextLoadStore; DepDest; |
| 19756 | DepDest = DepDest->getNextLoadStore()) { |
| 19757 | assert(isInSchedulingRegion(DepDest) && "Expected to be in region" ); |
| 19758 | |
| 19759 | // We have two limits to reduce the complexity: |
| 19760 | // 1) AliasedCheckLimit: It's a small limit to reduce calls to |
| 19761 | // SLP->isAliased (which is the expensive part in this loop). |
| 19762 | // 2) MaxMemDepDistance: It's for very large blocks and it aborts |
| 19763 | // the whole loop (even if the loop is fast, it's quadratic). |
| 19764 | // It's important for the loop break condition (see below) to |
| 19765 | // check this limit even between two read-only instructions. |
| 19766 | if (DistToSrc >= MaxMemDepDistance || |
| 19767 | ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) && |
| 19768 | (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit || |
| 19769 | SLP->isAliased(Loc1: SrcLoc, Inst1: SrcInst, Inst2: DepDest->getInst())))) { |
| 19770 | |
| 19771 | // We increment the counter only if the locations are aliased |
| 19772 | // (instead of counting all alias checks). This gives a better |
| 19773 | // balance between reduced runtime and accurate dependencies. |
| 19774 | NumAliased++; |
| 19775 | |
| 19776 | DepDest->addMemoryDependency(Dep: BundleMember); |
| 19777 | BundleMember->incDependencies(); |
| 19778 | if (!DepDest->isScheduled()) |
| 19779 | BundleMember->incrementUnscheduledDeps(Incr: 1); |
| 19780 | WorkList.push_back(Elt: DepDest); |
| 19781 | } |
| 19782 | |
| 19783 | // Example, explaining the loop break condition: Let's assume our |
| 19784 | // starting instruction is i0 and MaxMemDepDistance = 3. |
| 19785 | // |
| 19786 | // +--------v--v--v |
| 19787 | // i0,i1,i2,i3,i4,i5,i6,i7,i8 |
| 19788 | // +--------^--^--^ |
| 19789 | // |
// MaxMemDepDistance lets us stop alias-checking at i3 and we add
| 19791 | // dependencies from i0 to i3,i4,.. (even if they are not aliased). |
| 19792 | // Previously we already added dependencies from i3 to i6,i7,i8 |
| 19793 | // (because of MaxMemDepDistance). As we added a dependency from |
| 19794 | // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 |
| 19795 | // and we can abort this loop at i6. |
| 19796 | if (DistToSrc >= 2 * MaxMemDepDistance) |
| 19797 | break; |
| 19798 | DistToSrc++; |
| 19799 | } |
| 19800 | }; |
| 19801 | |
| 19802 | WorkList.push_back(Elt: Bundle.getBundle().front()); |
| 19803 | SmallPtrSet<ScheduleBundle *, 16> Visited; |
| 19804 | while (!WorkList.empty()) { |
| 19805 | ScheduleData *SD = WorkList.pop_back_val(); |
| 19806 | ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: SD->getInst()); |
| 19807 | if (Bundles.empty()) { |
| 19808 | ProcessNode(SD); |
| 19809 | if (InsertInReadyList && SD->isReady()) { |
| 19810 | ReadyInsts.insert(X: SD); |
| 19811 | LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n" ); |
| 19812 | } |
| 19813 | continue; |
| 19814 | } |
| 19815 | for (ScheduleBundle *Bundle : Bundles) { |
| 19816 | if (!Visited.insert(Ptr: Bundle).second || Bundle->hasValidDependencies()) |
| 19817 | continue; |
| 19818 | assert(isInSchedulingRegion(*Bundle) && |
| 19819 | "ScheduleData not in scheduling region" ); |
| 19820 | for_each(Range: Bundle->getBundle(), F: ProcessNode); |
| 19821 | } |
| 19822 | if (InsertInReadyList && SD->isReady()) { |
| 19823 | for (ScheduleBundle *Bundle : Bundles) { |
| 19824 | assert(isInSchedulingRegion(*Bundle) && |
| 19825 | "ScheduleData not in scheduling region" ); |
| 19826 | if (!Bundle->isReady()) |
| 19827 | continue; |
| 19828 | ReadyInsts.insert(X: Bundle); |
| 19829 | LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle |
| 19830 | << "\n" ); |
| 19831 | } |
| 19832 | } |
| 19833 | } |
| 19834 | } |
| 19835 | |
| 19836 | void BoUpSLP::BlockScheduling::resetSchedule() { |
| 19837 | assert(ScheduleStart && |
| 19838 | "tried to reset schedule on block which has not been scheduled" ); |
| 19839 | for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { |
| 19840 | if (ScheduleData *SD = getScheduleData(I)) { |
| 19841 | assert(isInSchedulingRegion(SD) && |
| 19842 | "ScheduleData not in scheduling region" ); |
| 19843 | SD->setScheduled(/*Scheduled=*/false); |
| 19844 | SD->resetUnscheduledDeps(); |
| 19845 | } |
| 19846 | for (ScheduleBundle *Bundle : getScheduleBundles(V: I)) { |
| 19847 | assert(isInSchedulingRegion(*Bundle) && |
| 19848 | "ScheduleBundle not in scheduling region" ); |
| 19849 | Bundle->setScheduled(/*Scheduled=*/false); |
| 19850 | } |
| 19851 | } |
| 19852 | ReadyInsts.clear(); |
| 19853 | } |
| 19854 | |
| 19855 | void BoUpSLP::scheduleBlock(BlockScheduling *BS) { |
| 19856 | if (!BS->ScheduleStart) |
| 19857 | return; |
| 19858 | |
| 19859 | LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n" ); |
| 19860 | |
| 19861 | // A key point - if we got here, pre-scheduling was able to find a valid |
| 19862 | // scheduling of the sub-graph of the scheduling window which consists |
| 19863 | // of all vector bundles and their transitive users. As such, we do not |
| 19864 | // need to reschedule anything *outside of* that subgraph. |
| 19865 | |
| 19866 | BS->resetSchedule(); |
| 19867 | |
| 19868 | // For the real scheduling we use a more sophisticated ready-list: it is |
| 19869 | // sorted by the original instruction location. This lets the final schedule |
| 19870 | // be as close as possible to the original instruction order. |
| 19871 | // WARNING: If changing this order causes a correctness issue, that means |
| 19872 | // there is some missing dependence edge in the schedule data graph. |
| 19873 | struct ScheduleDataCompare { |
| 19874 | bool operator()(const ScheduleEntity *SD1, |
| 19875 | const ScheduleEntity *SD2) const { |
| 19876 | return SD2->getSchedulingPriority() < SD1->getSchedulingPriority(); |
| 19877 | } |
| 19878 | }; |
| 19879 | std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts; |
| 19880 | |
| 19881 | // Ensure that all dependency data is updated (for nodes in the sub-graph) |
| 19882 | // and fill the ready-list with initial instructions. |
| 19883 | int Idx = 0; |
| 19884 | for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; |
| 19885 | I = I->getNextNode()) { |
| 19886 | ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(V: I); |
| 19887 | if (!Bundles.empty()) { |
| 19888 | for (ScheduleBundle *Bundle : Bundles) { |
| 19889 | Bundle->setSchedulingPriority(Idx++); |
| 19890 | if (!Bundle->hasValidDependencies()) |
| 19891 | BS->calculateDependencies(Bundle&: *Bundle, /*InsertInReadyList=*/false, SLP: this); |
| 19892 | } |
| 19893 | continue; |
| 19894 | } |
| 19895 | if (ScheduleData *SD = BS->getScheduleData(I)) { |
| 19896 | [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(V: I); |
| 19897 | assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() || |
| 19898 | SDTEs.front()->doesNotNeedToSchedule()) && |
| 19899 | "scheduler and vectorizer bundle mismatch" ); |
| 19900 | SD->setSchedulingPriority(Idx++); |
| 19901 | continue; |
| 19902 | } |
| 19903 | } |
| 19904 | BS->initialFillReadyList(ReadyList&: ReadyInsts); |
| 19905 | |
| 19906 | Instruction *LastScheduledInst = BS->ScheduleEnd; |
| 19907 | |
| 19908 | // Do the "real" scheduling. |
| 19909 | SmallPtrSet<Instruction *, 16> Scheduled; |
| 19910 | while (!ReadyInsts.empty()) { |
| 19911 | auto *Picked = *ReadyInsts.begin(); |
| 19912 | ReadyInsts.erase(position: ReadyInsts.begin()); |
| 19913 | |
| 19914 | // Move the scheduled instruction(s) to their dedicated places, if not |
| 19915 | // there yet. |
| 19916 | if (auto *Bundle = dyn_cast<ScheduleBundle>(Val: Picked)) { |
| 19917 | for (const ScheduleData *BundleMember : Bundle->getBundle()) { |
| 19918 | Instruction *PickedInst = BundleMember->getInst(); |
| 19919 | if (!Scheduled.insert(Ptr: PickedInst).second) |
| 19920 | continue; |
| 19921 | if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst) |
| 19922 | PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode()); |
| 19923 | LastScheduledInst = PickedInst; |
| 19924 | } |
| 19925 | EntryToLastInstruction.try_emplace(Key: Bundle->getTreeEntry(), |
| 19926 | Args&: LastScheduledInst); |
| 19927 | } else { |
| 19928 | auto *SD = cast<ScheduleData>(Val: Picked); |
| 19929 | Instruction *PickedInst = SD->getInst(); |
| 19930 | if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst) |
| 19931 | PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode()); |
| 19932 | LastScheduledInst = PickedInst; |
| 19933 | } |
| 19934 | BS->schedule(Data: Picked, ReadyList&: ReadyInsts); |
| 19935 | } |
| 19936 | |
| 19937 | // Check that we didn't break any of our invariants. |
| 19938 | #ifdef EXPENSIVE_CHECKS |
| 19939 | BS->verify(); |
| 19940 | #endif |
| 19941 | |
| 19942 | #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) |
| 19943 | // Check that all schedulable entities got scheduled |
| 19944 | for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; |
| 19945 | I = I->getNextNode()) { |
| 19946 | ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I); |
| 19947 | assert(all_of(Bundles, |
| 19948 | [](const ScheduleBundle *Bundle) { |
| 19949 | return Bundle->isScheduled(); |
| 19950 | }) && |
| 19951 | "must be scheduled at this point" ); |
| 19952 | } |
| 19953 | #endif |
| 19954 | |
| 19955 | // Avoid duplicate scheduling of the block. |
| 19956 | BS->ScheduleStart = nullptr; |
| 19957 | } |
| 19958 | |
| 19959 | unsigned BoUpSLP::getVectorElementSize(Value *V) { |
| 19960 | // If V is a store, just return the width of the stored value (or value |
| 19961 | // truncated just before storing) without traversing the expression tree. |
| 19962 | // This is the common case. |
| 19963 | if (auto *Store = dyn_cast<StoreInst>(Val: V)) |
| 19964 | return DL->getTypeSizeInBits(Ty: Store->getValueOperand()->getType()); |
| 19965 | |
| 19966 | if (auto *IEI = dyn_cast<InsertElementInst>(Val: V)) |
| 19967 | return getVectorElementSize(V: IEI->getOperand(i_nocapture: 1)); |
| 19968 | |
| 19969 | auto E = InstrElementSize.find(Val: V); |
| 19970 | if (E != InstrElementSize.end()) |
| 19971 | return E->second; |
| 19972 | |
| 19973 | // If V is not a store, we can traverse the expression tree to find loads |
| 19974 | // that feed it. The type of the loaded value may indicate a more suitable |
| 19975 | // width than V's type. We want to base the vector element size on the width |
| 19976 | // of memory operations where possible. |
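// Hypothetical example: for an i32 add that is fed (through casts) only by
// i16 loads, the walk below finds the loads and reports 16 rather than 32,
// so the element size is based on the 16-bit memory accesses.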
| 19977 | SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist; |
| 19978 | SmallPtrSet<Instruction *, 16> Visited; |
| 19979 | if (auto *I = dyn_cast<Instruction>(Val: V)) { |
| 19980 | Worklist.emplace_back(Args&: I, Args: I->getParent(), Args: 0); |
| 19981 | Visited.insert(Ptr: I); |
| 19982 | } |
| 19983 | |
| 19984 | // Traverse the expression tree in bottom-up order looking for loads. If we |
| 19985 | // encounter an instruction we don't yet handle, we give up. |
| 19986 | auto Width = 0u; |
| 19987 | Value *FirstNonBool = nullptr; |
| 19988 | while (!Worklist.empty()) { |
| 19989 | auto [I, Parent, Level] = Worklist.pop_back_val(); |
| 19990 | |
| 19991 | // We should only be looking at scalar instructions here. If the current |
| 19992 | // instruction has a vector type, skip. |
| 19993 | auto *Ty = I->getType(); |
| 19994 | if (isa<VectorType>(Val: Ty)) |
| 19995 | continue; |
| 19996 | if (Ty != Builder.getInt1Ty() && !FirstNonBool) |
| 19997 | FirstNonBool = I; |
| 19998 | if (Level > RecursionMaxDepth) |
| 19999 | continue; |
| 20000 | |
// If the current instruction is a load, update Width to reflect the
// width of the loaded value.
| 20003 | if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: I)) |
| 20004 | Width = std::max<unsigned>(a: Width, b: DL->getTypeSizeInBits(Ty)); |
| 20005 | |
| 20006 | // Otherwise, we need to visit the operands of the instruction. We only |
| 20007 | // handle the interesting cases from buildTree here. If an operand is an |
| 20008 | // instruction we haven't yet visited and from the same basic block as the |
// user, or the user is a PHI node, we add it to the worklist.
| 20010 | else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst, |
| 20011 | BinaryOperator, UnaryOperator>(Val: I)) { |
| 20012 | for (Use &U : I->operands()) { |
| 20013 | if (auto *J = dyn_cast<Instruction>(Val: U.get())) |
| 20014 | if (Visited.insert(Ptr: J).second && |
| 20015 | (isa<PHINode>(Val: I) || J->getParent() == Parent)) { |
| 20016 | Worklist.emplace_back(Args&: J, Args: J->getParent(), Args: Level + 1); |
| 20017 | continue; |
| 20018 | } |
| 20019 | if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty()) |
| 20020 | FirstNonBool = U.get(); |
| 20021 | } |
| 20022 | } else { |
| 20023 | break; |
| 20024 | } |
| 20025 | } |
| 20026 | |
| 20027 | // If we didn't encounter a memory access in the expression tree, or if we |
| 20028 | // gave up for some reason, just return the width of V. Otherwise, return the |
| 20029 | // maximum width we found. |
| 20030 | if (!Width) { |
| 20031 | if (V->getType() == Builder.getInt1Ty() && FirstNonBool) |
| 20032 | V = FirstNonBool; |
| 20033 | Width = DL->getTypeSizeInBits(Ty: V->getType()); |
| 20034 | } |
| 20035 | |
| 20036 | for (Instruction *I : Visited) |
| 20037 | InstrElementSize[I] = Width; |
| 20038 | |
| 20039 | return Width; |
| 20040 | } |
| 20041 | |
| 20042 | bool BoUpSLP::collectValuesToDemote( |
| 20043 | const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth, |
| 20044 | SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited, |
| 20045 | const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel, |
| 20046 | bool &IsProfitableToDemote, bool IsTruncRoot) const { |
| 20047 | // We can always demote constants. |
| 20048 | if (all_of(Range: E.Scalars, P: IsaPred<Constant>)) |
| 20049 | return true; |
| 20050 | |
| 20051 | unsigned OrigBitWidth = |
| 20052 | DL->getTypeSizeInBits(Ty: E.Scalars.front()->getType()->getScalarType()); |
| 20053 | if (OrigBitWidth == BitWidth) { |
| 20054 | MaxDepthLevel = 1; |
| 20055 | return true; |
| 20056 | } |
| 20057 | |
| 20058 | // Check if the node was analyzed already and must keep its original bitwidth. |
| 20059 | if (NodesToKeepBWs.contains(V: E.Idx)) |
| 20060 | return false; |
| 20061 | |
| 20062 | // If the value is not a vectorized instruction in the expression and not used |
| 20063 | // by the insertelement instruction and not used in multiple vector nodes, it |
| 20064 | // cannot be demoted. |
| 20065 | bool IsSignedNode = any_of(Range: E.Scalars, P: [&](Value *R) { |
| 20066 | if (isa<PoisonValue>(Val: R)) |
| 20067 | return false; |
| 20068 | return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL)); |
| 20069 | }); |
| 20070 | auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool { |
| 20071 | if (isa<PoisonValue>(Val: V)) |
| 20072 | return true; |
| 20073 | if (getTreeEntries(V).size() > 1) |
| 20074 | return false; |
// For a late shuffle of sext/zext values with many uses, we need to check
// the extra bit for unsigned values, otherwise we may produce incorrect
// casts for the reused scalars.
| 20078 | bool IsSignedVal = !isKnownNonNegative(V, SQ: SimplifyQuery(*DL)); |
| 20079 | if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) { |
| 20080 | APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth); |
| 20081 | if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL))) |
| 20082 | return true; |
| 20083 | } |
| 20084 | unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT); |
| 20085 | unsigned BitWidth1 = OrigBitWidth - NumSignBits; |
| 20086 | if (IsSignedNode) |
| 20087 | ++BitWidth1; |
| 20088 | if (auto *I = dyn_cast<Instruction>(Val: V)) { |
| 20089 | APInt Mask = DB->getDemandedBits(I); |
| 20090 | unsigned BitWidth2 = |
| 20091 | std::max<unsigned>(a: 1, b: Mask.getBitWidth() - Mask.countl_zero()); |
| 20092 | while (!IsSignedNode && BitWidth2 < OrigBitWidth) { |
| 20093 | APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth2 - 1); |
| 20094 | if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL))) |
| 20095 | break; |
| 20096 | BitWidth2 *= 2; |
| 20097 | } |
| 20098 | BitWidth1 = std::min(a: BitWidth1, b: BitWidth2); |
| 20099 | } |
| 20100 | BitWidth = std::max(a: BitWidth, b: BitWidth1); |
| 20101 | return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2); |
| 20102 | }; |
| 20103 | auto FinalAnalysis = [&, TTI = TTI]() { |
| 20104 | if (!IsProfitableToDemote) |
| 20105 | return false; |
| 20106 | bool Res = all_of( |
| 20107 | Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1, args: std::ref(t&: BitWidth))); |
| 20108 | // Demote gathers. |
| 20109 | if (Res && E.isGather()) { |
| 20110 | if (E.hasState()) { |
| 20111 | if (const TreeEntry *SameTE = |
| 20112 | getSameValuesTreeEntry(V: E.getMainOp(), VL: E.Scalars); |
| 20113 | SameTE) |
| 20114 | if (collectValuesToDemote(E: *SameTE, IsProfitableToDemoteRoot, BitWidth, |
| 20115 | ToDemote, Visited, NodesToKeepBWs, |
| 20116 | MaxDepthLevel, IsProfitableToDemote, |
| 20117 | IsTruncRoot)) { |
| 20118 | ToDemote.push_back(Elt: E.Idx); |
| 20119 | return true; |
| 20120 | } |
| 20121 | } |
| 20122 | // Check possible extractelement instructions bases and final vector |
| 20123 | // length. |
| 20124 | SmallPtrSet<Value *, 4> UniqueBases; |
| 20125 | for (Value *V : E.Scalars) { |
| 20126 | auto *EE = dyn_cast<ExtractElementInst>(Val: V); |
| 20127 | if (!EE) |
| 20128 | continue; |
| 20129 | UniqueBases.insert(Ptr: EE->getVectorOperand()); |
| 20130 | } |
| 20131 | const unsigned VF = E.Scalars.size(); |
| 20132 | Type *OrigScalarTy = E.Scalars.front()->getType(); |
| 20133 | if (UniqueBases.size() <= 2 || |
| 20134 | ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF)) >= |
| 20135 | ::getNumberOfParts( |
| 20136 | TTI: *TTI, |
| 20137 | VecTy: getWidenedType( |
| 20138 | ScalarTy: IntegerType::get(C&: OrigScalarTy->getContext(), NumBits: BitWidth), |
| 20139 | VF))) { |
| 20140 | ToDemote.push_back(Elt: E.Idx); |
| 20141 | return true; |
| 20142 | } |
| 20143 | } |
| 20144 | return Res; |
| 20145 | }; |
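|       | // Gather nodes, nodes that were already visited, and nodes with a scalar
|       | // used only by non-vectorized insertelements are not analyzed further -
|       | // only the final analysis is run for them.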
| 20146 | if (E.isGather() || !Visited.insert(V: &E).second || |
| 20147 | any_of(Range: E.Scalars, P: [&](Value *V) { |
| 20148 | return !isa<PoisonValue>(Val: V) && all_of(Range: V->users(), P: [&](User *U) { |
| 20149 | return isa<InsertElementInst>(Val: U) && !isVectorized(V: U); |
| 20150 | }); |
| 20151 | })) |
| 20152 | return FinalAnalysis(); |
| 20153 | |
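|       | // Bail out if some non-constant scalar has a user that is neither
|       | // vectorized, part of the ignored reduction ops, nor already narrower than
|       | // BitWidth, and the scalar itself cannot be truncated.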
| 20154 | if (any_of(Range: E.Scalars, P: [&](Value *V) { |
| 20155 | return !isa<Constant>(Val: V) && !all_of(Range: V->users(), P: [=](User *U) { |
| 20156 | return isVectorized(V: U) || |
| 20157 | (E.Idx == 0 && UserIgnoreList && |
| 20158 | UserIgnoreList->contains(V: U)) || |
| 20159 | (!isa<CmpInst>(Val: U) && U->getType()->isSized() && |
| 20160 | !U->getType()->isScalableTy() && |
| 20161 | DL->getTypeSizeInBits(Ty: U->getType()) <= BitWidth); |
| 20162 | }) && !IsPotentiallyTruncated(V, BitWidth); |
| 20163 | })) |
| 20164 | return false; |
| 20165 | |
| 20166 | auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands, |
| 20167 | bool &NeedToExit) { |
| 20168 | NeedToExit = false; |
| 20169 | unsigned InitLevel = MaxDepthLevel; |
| 20170 | for (const TreeEntry *Op : Operands) { |
| 20171 | unsigned Level = InitLevel; |
| 20172 | if (!collectValuesToDemote(E: *Op, IsProfitableToDemoteRoot, BitWidth, |
| 20173 | ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel&: Level, |
| 20174 | IsProfitableToDemote, IsTruncRoot)) { |
| 20175 | if (!IsProfitableToDemote) |
| 20176 | return false; |
| 20177 | NeedToExit = true; |
| 20178 | if (!FinalAnalysis()) |
| 20179 | return false; |
| 20180 | continue; |
| 20181 | } |
| 20182 | MaxDepthLevel = std::max(a: MaxDepthLevel, b: Level); |
| 20183 | } |
| 20184 | return true; |
| 20185 | }; |
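|       | // AttemptCheckBitwidth probes growing bitwidths (doubling from the current
|       | // BitWidth) until Checker accepts one; if none is accepted, it falls back to
|       | // the smallest bitwidth for which the final analysis succeeded, or gives up
|       | // by restoring the original bitwidth.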
| 20186 | auto AttemptCheckBitwidth = |
| 20187 | [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) { |
| 20188 | // Try all bitwidths < OrigBitWidth.
| 20189 | NeedToExit = false; |
| 20190 | unsigned BestFailBitwidth = 0; |
| 20191 | for (; BitWidth < OrigBitWidth; BitWidth *= 2) { |
| 20192 | if (Checker(BitWidth, OrigBitWidth)) |
| 20193 | return true; |
| 20194 | if (BestFailBitwidth == 0 && FinalAnalysis()) |
| 20195 | BestFailBitwidth = BitWidth; |
| 20196 | } |
| 20197 | if (BitWidth >= OrigBitWidth) { |
| 20198 | if (BestFailBitwidth == 0) { |
| 20199 | BitWidth = OrigBitWidth; |
| 20200 | return false; |
| 20201 | } |
| 20202 | MaxDepthLevel = 1; |
| 20203 | BitWidth = BestFailBitwidth; |
| 20204 | NeedToExit = true; |
| 20205 | return true; |
| 20206 | } |
| 20207 | return false; |
| 20208 | }; |
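|       | // TryProcessInstruction is the common driver for the opcode cases below:
|       | // with no operands it only marks the scalars of this entry as potentially
|       | // truncated; otherwise it validates the bitwidth with the optional Checker,
|       | // recurses into the operand entries and, on success, records the entry in
|       | // the list of nodes to demote.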
| 20209 | auto TryProcessInstruction = |
| 20210 | [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {}, |
| 20211 | function_ref<bool(unsigned, unsigned)> Checker = {}) { |
| 20212 | if (Operands.empty()) { |
| 20213 | if (!IsTruncRoot) |
| 20214 | MaxDepthLevel = 1; |
| 20215 | for (Value *V : E.Scalars) |
| 20216 | (void)IsPotentiallyTruncated(V, BitWidth); |
| 20217 | } else { |
| 20218 | // Several vectorized uses? Check if we can truncate the scalars, otherwise
| 20219 | // exit.
| 20220 | if (any_of(Range: E.Scalars, P: [&](Value *V) { |
| 20221 | return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth); |
| 20222 | })) |
| 20223 | return false; |
| 20224 | bool NeedToExit = false; |
| 20225 | if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit)) |
| 20226 | return false; |
| 20227 | if (NeedToExit) |
| 20228 | return true; |
| 20229 | if (!ProcessOperands(Operands, NeedToExit)) |
| 20230 | return false; |
| 20231 | if (NeedToExit) |
| 20232 | return true; |
| 20233 | } |
| 20234 | |
| 20235 | ++MaxDepthLevel; |
| 20236 | // Record the entry that we can demote. |
| 20237 | ToDemote.push_back(Elt: E.Idx); |
| 20238 | return IsProfitableToDemote; |
| 20239 | }; |
| 20240 | |
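|       | // Split nodes are demoted only if both of their combined sub-entries can be
|       | // demoted.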
| 20241 | if (E.State == TreeEntry::SplitVectorize) |
| 20242 | return TryProcessInstruction( |
| 20243 | BitWidth, |
| 20244 | {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(), |
| 20245 | VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()}); |
| 20246 | |
| 20247 | switch (E.getOpcode()) { |
| 20248 | |
| 20249 | // We can always demote truncations and extensions. Since truncations can |
| 20250 | // seed additional demotion, we save the truncated value. |
| 20251 | case Instruction::Trunc: |
| 20252 | if (IsProfitableToDemoteRoot) |
| 20253 | IsProfitableToDemote = true; |
| 20254 | return TryProcessInstruction(BitWidth); |
| 20255 | case Instruction::ZExt: |
| 20256 | case Instruction::SExt: |
| 20257 | IsProfitableToDemote = true; |
| 20258 | return TryProcessInstruction(BitWidth); |
| 20259 | |
| 20260 | // We can demote certain binary operations if we can demote both of their |
| 20261 | // operands. |
| 20262 | case Instruction::Add: |
| 20263 | case Instruction::Sub: |
| 20264 | case Instruction::Mul: |
| 20265 | case Instruction::And: |
| 20266 | case Instruction::Or: |
| 20267 | case Instruction::Xor: { |
| 20268 | return TryProcessInstruction( |
| 20269 | BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}); |
| 20270 | } |
| 20271 | case Instruction::Freeze: |
| 20272 | return TryProcessInstruction(BitWidth, getOperandEntry(E: &E, Idx: 0)); |
| 20273 | case Instruction::Shl: { |
| 20274 | // If we are truncating the result of this SHL, and if it's a shift of an
| 20275 | // in-range amount, we can always perform a SHL in a smaller type.
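|       | // E.g. (trunc i32 (shl X, C) to i16) can be rewritten as
|       | // (shl (trunc X to i16), C) whenever C is known to be smaller than 16.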
| 20276 | auto ShlChecker = [&](unsigned BitWidth, unsigned) { |
| 20277 | return all_of(Range: E.Scalars, P: [&](Value *V) { |
| 20278 | if (isa<PoisonValue>(Val: V)) |
| 20279 | return true; |
| 20280 | auto *I = cast<Instruction>(Val: V); |
| 20281 | KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL); |
| 20282 | return AmtKnownBits.getMaxValue().ult(RHS: BitWidth); |
| 20283 | }); |
| 20284 | }; |
| 20285 | return TryProcessInstruction( |
| 20286 | BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, ShlChecker); |
| 20287 | } |
| 20288 | case Instruction::LShr: { |
| 20289 | // If this is a truncate of a logical shr, we can truncate it to a smaller |
| 20290 | // lshr iff we know that the bits we would otherwise be shifting in are |
| 20291 | // already zeros. |
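|       | // E.g. (trunc i32 (lshr X, 4) to i16) can become (lshr (trunc X to i16), 4)
|       | // when the upper 16 bits of X are known to be zero.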
| 20292 | auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) { |
| 20293 | return all_of(Range: E.Scalars, P: [&](Value *V) { |
| 20294 | if (isa<PoisonValue>(Val: V)) |
| 20295 | return true; |
| 20296 | auto *I = cast<Instruction>(Val: V); |
| 20297 | KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL); |
| 20298 | APInt ShiftedBits = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth); |
| 20299 | return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) && |
| 20300 | MaskedValueIsZero(V: I->getOperand(i: 0), Mask: ShiftedBits, |
| 20301 | SQ: SimplifyQuery(*DL)); |
| 20302 | }); |
| 20303 | }; |
| 20304 | return TryProcessInstruction( |
| 20305 | BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, |
| 20306 | LShrChecker); |
| 20307 | } |
| 20308 | case Instruction::AShr: { |
| 20309 | // If this is a truncate of an arithmetic shr, we can truncate it to a |
| 20310 | // smaller ashr iff we know that all the bits from the sign bit of the |
| 20311 | // original type down to the sign bit of the truncated type are the same.
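|       | // E.g. (trunc i32 (ashr X, 4) to i16) can become (ashr (trunc X to i16), 4)
|       | // when X has at least 17 sign bits, so only sign-bit copies are discarded.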
| 20312 | auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) { |
| 20313 | return all_of(Range: E.Scalars, P: [&](Value *V) { |
| 20314 | if (isa<PoisonValue>(Val: V)) |
| 20315 | return true; |
| 20316 | auto *I = cast<Instruction>(Val: V); |
| 20317 | KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL); |
| 20318 | unsigned ShiftedBits = OrigBitWidth - BitWidth; |
| 20319 | return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) && |
| 20320 | ShiftedBits < |
| 20321 | ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT); |
| 20322 | }); |
| 20323 | }; |
| 20324 | return TryProcessInstruction( |
| 20325 | BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, |
| 20326 | AShrChecker); |
| 20327 | } |
| 20328 | case Instruction::UDiv: |
| 20329 | case Instruction::URem: { |
| 20330 | // UDiv and URem can be truncated if all the truncated bits are zero. |
| 20331 | auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) { |
| 20332 | assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!" ); |
| 20333 | return all_of(Range: E.Scalars, P: [&](Value *V) { |
| 20334 | auto *I = cast<Instruction>(Val: V); |
| 20335 | APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth); |
| 20336 | return MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL)) && |
| 20337 | MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL)); |
| 20338 | }); |
| 20339 | }; |
| 20340 | return TryProcessInstruction( |
| 20341 | BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, Checker); |
| 20342 | } |
| 20343 | |
| 20344 | // We can demote selects if we can demote their true and false values. |
| 20345 | case Instruction::Select: { |
| 20346 | return TryProcessInstruction( |
| 20347 | BitWidth, {getOperandEntry(E: &E, Idx: 1), getOperandEntry(E: &E, Idx: 2)}); |
| 20348 | } |
| 20349 | |
| 20350 | // We can demote phis if we can demote all their incoming operands. |
| 20351 | case Instruction::PHI: { |
| 20352 | const unsigned NumOps = E.getNumOperands(); |
| 20353 | SmallVector<const TreeEntry *> Ops(NumOps); |
| 20354 | transform(Range: seq<unsigned>(Begin: 0, End: NumOps), d_first: Ops.begin(), |
| 20355 | F: [&](unsigned Idx) { return getOperandEntry(E: &E, Idx); }); |
| 20356 | |
| 20357 | return TryProcessInstruction(BitWidth, Ops); |
| 20358 | } |
| 20359 | |
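|       | // Only calls to the abs/smin/smax/umin/umax intrinsics are considered for
|       | // demotion; the operands must fit into the narrower type and the final
|       | // bitwidth is picked by comparing the vector call costs of the candidates.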
| 20360 | case Instruction::Call: { |
| 20361 | auto *IC = dyn_cast<IntrinsicInst>(Val: E.getMainOp()); |
| 20362 | if (!IC) |
| 20363 | break; |
| 20364 | Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: IC, TLI); |
| 20365 | if (ID != Intrinsic::abs && ID != Intrinsic::smin && |
| 20366 | ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax) |
| 20367 | break; |
| 20368 | SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(E: &E, Idx: 0)); |
| 20369 | function_ref<bool(unsigned, unsigned)> CallChecker; |
| 20370 | auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) { |
| 20371 | assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!" ); |
| 20372 | return all_of(Range: E.Scalars, P: [&](Value *V) { |
| 20373 | auto *I = cast<Instruction>(Val: V); |
| 20374 | if (ID == Intrinsic::umin || ID == Intrinsic::umax) { |
| 20375 | APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth); |
| 20376 | return MaskedValueIsZero(V: I->getOperand(i: 0), Mask, |
| 20377 | SQ: SimplifyQuery(*DL)) && |
| 20378 | MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL)); |
| 20379 | } |
| 20380 | assert((ID == Intrinsic::smin || ID == Intrinsic::smax) && |
| 20381 | "Expected min/max intrinsics only." ); |
| 20382 | unsigned SignBits = OrigBitWidth - BitWidth; |
| 20383 | APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1); |
| 20384 | unsigned Op0SignBits = |
| 20385 | ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT); |
| 20386 | unsigned Op1SignBits = |
| 20387 | ComputeNumSignBits(Op: I->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT); |
| 20388 | return SignBits <= Op0SignBits && |
| 20389 | ((SignBits != Op0SignBits && |
| 20390 | !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) || |
| 20391 | MaskedValueIsZero(V: I->getOperand(i: 0), Mask, |
| 20392 | SQ: SimplifyQuery(*DL))) && |
| 20393 | SignBits <= Op1SignBits && |
| 20394 | ((SignBits != Op1SignBits && |
| 20395 | !isKnownNonNegative(V: I->getOperand(i: 1), SQ: SimplifyQuery(*DL))) || |
| 20396 | MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL))); |
| 20397 | }); |
| 20398 | }; |
| 20399 | auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) { |
| 20400 | assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!" ); |
| 20401 | return all_of(Range: E.Scalars, P: [&](Value *V) { |
| 20402 | auto *I = cast<Instruction>(Val: V); |
| 20403 | unsigned SignBits = OrigBitWidth - BitWidth; |
| 20404 | APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1); |
| 20405 | unsigned Op0SignBits = |
| 20406 | ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT); |
| 20407 | return SignBits <= Op0SignBits && |
| 20408 | ((SignBits != Op0SignBits && |
| 20409 | !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) || |
| 20410 | MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL))); |
| 20411 | }); |
| 20412 | }; |
| 20413 | if (ID != Intrinsic::abs) { |
| 20414 | Operands.push_back(Elt: getOperandEntry(E: &E, Idx: 1)); |
| 20415 | CallChecker = CompChecker; |
| 20416 | } else { |
| 20417 | CallChecker = AbsChecker; |
| 20418 | } |
| 20419 | InstructionCost BestCost = |
| 20420 | std::numeric_limits<InstructionCost::CostType>::max(); |
| 20421 | unsigned BestBitWidth = BitWidth; |
| 20422 | unsigned VF = E.Scalars.size(); |
| 20423 | // Choose the best bitwidth based on cost estimations. |
| 20424 | auto Checker = [&](unsigned BitWidth, unsigned) { |
| 20425 | unsigned MinBW = PowerOf2Ceil(A: BitWidth); |
| 20426 | SmallVector<Type *> ArgTys = |
| 20427 | buildIntrinsicArgTypes(CI: IC, ID, VF, MinBW, TTI); |
| 20428 | auto VecCallCosts = getVectorCallCosts( |
| 20429 | CI: IC, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: IC->getContext(), NumBits: MinBW), VF), |
| 20430 | TTI, TLI, ArgTys); |
| 20431 | InstructionCost Cost = std::min(a: VecCallCosts.first, b: VecCallCosts.second); |
| 20432 | if (Cost < BestCost) { |
| 20433 | BestCost = Cost; |
| 20434 | BestBitWidth = BitWidth; |
| 20435 | } |
| 20436 | return false; |
| 20437 | }; |
| 20438 | [[maybe_unused]] bool NeedToExit; |
| 20439 | (void)AttemptCheckBitwidth(Checker, NeedToExit); |
| 20440 | BitWidth = BestBitWidth; |
| 20441 | return TryProcessInstruction(BitWidth, Operands, CallChecker); |
| 20442 | } |
| 20443 | |
| 20444 | // Otherwise, conservatively give up. |
| 20445 | default: |
| 20446 | break; |
| 20447 | } |
| 20448 | MaxDepthLevel = 1; |
| 20449 | return FinalAnalysis(); |
| 20450 | } |
| 20451 | |
| 20452 | static RecurKind getRdxKind(Value *V); |
| 20453 | |
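|       | // Walk the roots of the vectorizable tree, compute for each the minimal bit
|       | // width that still preserves the computed values, and record the demotable
|       | // entries together with their signedness in MinBWs.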
| 20454 | void BoUpSLP::computeMinimumValueSizes() { |
| 20455 | // We only attempt to truncate integer expressions. |
| 20456 | bool IsStoreOrInsertElt = |
| 20457 | VectorizableTree.front()->hasState() && |
| 20458 | (VectorizableTree.front()->getOpcode() == Instruction::Store || |
| 20459 | VectorizableTree.front()->getOpcode() == Instruction::InsertElement); |
| 20460 | if ((IsStoreOrInsertElt || UserIgnoreList) && |
| 20461 | ExtraBitWidthNodes.size() <= 1 && |
| 20462 | (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 || |
| 20463 | CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2)) |
| 20464 | return; |
| 20465 | |
| 20466 | unsigned NodeIdx = 0; |
| 20467 | if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather()) |
| 20468 | NodeIdx = 1; |
| 20469 | |
| 20470 | // Ensure the roots of the vectorizable tree don't form a cycle. |
| 20471 | assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 || |
| 20472 | !VectorizableTree[NodeIdx]->UserTreeIndex) && |
| 20473 | "Unexpected tree is graph." ); |
| 20474 | |
| 20475 | // If the first value node for a store/insertelement is a sext/zext/trunc,
| 20476 | // skip it and resize to the final type.
| 20477 | bool IsTruncRoot = false; |
| 20478 | bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt; |
| 20479 | SmallVector<unsigned> RootDemotes; |
| 20480 | SmallDenseSet<unsigned, 8> NodesToKeepBWs; |
| 20481 | if (NodeIdx != 0 && |
| 20482 | VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize && |
| 20483 | VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) { |
| 20484 | assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph." ); |
| 20485 | IsTruncRoot = true; |
| 20486 | RootDemotes.push_back(Elt: NodeIdx); |
| 20487 | IsProfitableToDemoteRoot = true; |
| 20488 | ++NodeIdx; |
| 20489 | } |
| 20490 | |
| 20491 | // The reduction was already analyzed and found not profitable - exit.
| 20492 | if (AnalyzedMinBWVals.contains(V: VectorizableTree[NodeIdx]->Scalars.front())) |
| 20493 | return; |
| 20494 | |
| 20495 | SmallVector<unsigned> ToDemote; |
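|       | // Returns the smallest bit width (rounded up to a power of 2) that can
|       | // represent all scalars of the entry, or 0 if narrowing is not possible or
|       | // not profitable, and fills ToDemote with the entries to be demoted.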
| 20496 | auto ComputeMaxBitWidth = |
| 20497 | [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot, |
| 20498 | unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned { |
| 20499 | ToDemote.clear(); |
| 20500 | // If the root is a trunc and the next node is a gather/buildvector, keep the
| 20501 | // trunc in scalars, which is free in most cases.
| 20502 | if (E.isGather() && IsTruncRoot && E.UserTreeIndex && |
| 20503 | !NodesToKeepBWs.contains(V: E.Idx) && |
| 20504 | E.Idx > (IsStoreOrInsertElt ? 2u : 1u) && |
| 20505 | all_of(Range: E.Scalars, P: [&](Value *V) { |
| 20506 | return V->hasOneUse() || isa<Constant>(Val: V) || |
| 20507 | (!V->hasNUsesOrMore(N: UsesLimit) && |
| 20508 | none_of(Range: V->users(), P: [&](User *U) { |
| 20509 | ArrayRef<TreeEntry *> TEs = getTreeEntries(V: U); |
| 20510 | const TreeEntry *UserTE = E.UserTreeIndex.UserTE; |
| 20511 | if (TEs.empty() || is_contained(Range&: TEs, Element: UserTE)) |
| 20512 | return false; |
| 20513 | if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode, |
| 20514 | SelectInst>(Val: U) || |
| 20515 | isa<SIToFPInst, UIToFPInst>(Val: U) || |
| 20516 | !isa<CastInst, BinaryOperator, FreezeInst, PHINode, |
| 20517 | SelectInst>(Val: UserTE->getMainOp()) || |
| 20518 | isa<SIToFPInst, UIToFPInst>(Val: UserTE->getMainOp())) |
| 20519 | return true; |
| 20520 | unsigned UserTESz = DL->getTypeSizeInBits( |
| 20521 | Ty: UserTE->Scalars.front()->getType()); |
| 20522 | if (all_of(Range&: TEs, P: [&](const TreeEntry *TE) { |
| 20523 | auto It = MinBWs.find(Val: TE); |
| 20524 | return It != MinBWs.end() && |
| 20525 | It->second.first > UserTESz; |
| 20526 | })) |
| 20527 | return true; |
| 20528 | return DL->getTypeSizeInBits(Ty: U->getType()) > UserTESz; |
| 20529 | })); |
| 20530 | })) { |
| 20531 | ToDemote.push_back(Elt: E.Idx); |
| 20532 | const TreeEntry *UserTE = E.UserTreeIndex.UserTE; |
| 20533 | auto It = MinBWs.find(Val: UserTE); |
| 20534 | if (It != MinBWs.end()) |
| 20535 | return It->second.first; |
| 20536 | unsigned MaxBitWidth = |
| 20537 | DL->getTypeSizeInBits(Ty: UserTE->Scalars.front()->getType()); |
| 20538 | MaxBitWidth = bit_ceil(Value: MaxBitWidth); |
| 20539 | if (MaxBitWidth < 8 && MaxBitWidth > 1) |
| 20540 | MaxBitWidth = 8; |
| 20541 | return MaxBitWidth; |
| 20542 | } |
| 20543 | |
| 20544 | if (!E.hasState()) |
| 20545 | return 0u; |
| 20546 | |
| 20547 | unsigned VF = E.getVectorFactor(); |
| 20548 | Type *ScalarTy = E.Scalars.front()->getType(); |
| 20549 | unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy); |
| 20550 | auto *TreeRootIT = dyn_cast<IntegerType>(Val: ScalarTy->getScalarType()); |
| 20551 | if (!TreeRootIT) |
| 20552 | return 0u; |
| 20553 | |
| 20554 | if (any_of(Range: E.Scalars, |
| 20555 | P: [&](Value *V) { return AnalyzedMinBWVals.contains(V); })) |
| 20556 | return 0u; |
| 20557 | |
| 20558 | unsigned NumParts = ::getNumberOfParts( |
| 20559 | TTI: *TTI, VecTy: getWidenedType(ScalarTy: TreeRootIT, VF: VF * ScalarTyNumElements)); |
| 20560 | |
| 20561 | // The maximum bit width required to represent all the values that can be |
| 20562 | // demoted without loss of precision. It would be safe to truncate the roots |
| 20563 | // of the expression to this width. |
| 20564 | unsigned MaxBitWidth = 1u; |
| 20565 | |
| 20566 | // True if the roots can be zero-extended back to their original type, |
| 20567 | // rather than sign-extended. We know that if the leading bits are not |
| 20568 | // demanded, we can safely zero-extend. So we initialize IsKnownPositive to |
| 20569 | // True. |
| 20570 | // Determine if the sign bit of all the roots is known to be zero. If not, |
| 20571 | // IsKnownPositive is set to False. |
| 20572 | bool IsKnownPositive = !IsSignedCmp && all_of(Range: E.Scalars, P: [&](Value *R) { |
| 20573 | if (isa<PoisonValue>(Val: R)) |
| 20574 | return true; |
| 20575 | KnownBits Known = computeKnownBits(V: R, DL: *DL); |
| 20576 | return Known.isNonNegative(); |
| 20577 | }); |
| 20578 | |
| 20579 | if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex && |
| 20580 | E.UserTreeIndex.UserTE->hasState() && |
| 20581 | E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP) |
| 20582 | MaxBitWidth = |
| 20583 | std::min(a: DL->getTypeSizeInBits( |
| 20584 | Ty: E.UserTreeIndex.UserTE->Scalars.front()->getType()), |
| 20585 | b: DL->getTypeSizeInBits(Ty: ScalarTy)); |
| 20586 | |
| 20587 | // We first check if all the bits of the roots are demanded. If they're not, |
| 20588 | // we can truncate the roots to this narrower type. |
| 20589 | for (Value *Root : E.Scalars) { |
| 20590 | if (isa<PoisonValue>(Val: Root)) |
| 20591 | continue; |
| 20592 | unsigned NumSignBits = ComputeNumSignBits(Op: Root, DL: *DL, AC, CxtI: nullptr, DT); |
| 20593 | TypeSize NumTypeBits = |
| 20594 | DL->getTypeSizeInBits(Ty: Root->getType()->getScalarType()); |
| 20595 | unsigned BitWidth1 = NumTypeBits - NumSignBits; |
| 20596 | // If we can't prove that the sign bit is zero, we must add one to the |
| 20597 | // maximum bit width to account for the unknown sign bit. This preserves |
| 20598 | // the existing sign bit so we can safely sign-extend the root back to the |
| 20599 | // original type. Otherwise, if we know the sign bit is zero, we will |
| 20600 | // zero-extend the root instead. |
| 20601 | // |
| 20602 | // FIXME: This is somewhat suboptimal, as there will be cases where adding |
| 20603 | // one to the maximum bit width will yield a larger-than-necessary |
| 20604 | // type. In general, we need to add an extra bit only if we can't |
| 20605 | // prove that the upper bit of the original type is equal to the |
| 20606 | // upper bit of the proposed smaller type. If these two bits are |
| 20607 | // the same (either zero or one) we know that sign-extending from |
| 20608 | // the smaller type will result in the same value. Here, since we |
| 20609 | // can't yet prove this, we are just making the proposed smaller |
| 20610 | // type larger to ensure correctness. |
| 20611 | if (!IsKnownPositive) |
| 20612 | ++BitWidth1; |
| 20613 | |
| 20614 | APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: Root)); |
| 20615 | unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero(); |
| 20616 | MaxBitWidth = |
| 20617 | std::max<unsigned>(a: std::min(a: BitWidth1, b: BitWidth2), b: MaxBitWidth); |
| 20618 | } |
| 20619 | |
| 20620 | if (MaxBitWidth < 8 && MaxBitWidth > 1) |
| 20621 | MaxBitWidth = 8; |
| 20622 | |
| 20623 | // If the original type is large but the reduced type does not improve
| 20624 | // register usage - ignore it.
| 20625 | if (NumParts > 1 && |
| 20626 | NumParts == |
| 20627 | ::getNumberOfParts( |
| 20628 | TTI: *TTI, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: F->getContext(), |
| 20629 | NumBits: bit_ceil(Value: MaxBitWidth)), |
| 20630 | VF))) |
| 20631 | return 0u; |
| 20632 | |
| 20633 | unsigned Opcode = E.getOpcode(); |
| 20634 | bool IsProfitableToDemote = Opcode == Instruction::Trunc || |
| 20635 | Opcode == Instruction::SExt || |
| 20636 | Opcode == Instruction::ZExt || NumParts > 1; |
| 20637 | // Conservatively determine if we can actually truncate the roots of the |
| 20638 | // expression. Collect the values that can be demoted in ToDemote and |
| 20639 | // additional roots that require investigating in Roots. |
| 20640 | DenseSet<const TreeEntry *> Visited; |
| 20641 | unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1; |
| 20642 | bool NeedToDemote = IsProfitableToDemote; |
| 20643 | |
| 20644 | if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, BitWidth&: MaxBitWidth, |
| 20645 | ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel, |
| 20646 | IsProfitableToDemote&: NeedToDemote, IsTruncRoot) || |
| 20647 | (MaxDepthLevel <= Limit && |
| 20648 | !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) && |
| 20649 | (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) || |
| 20650 | DL->getTypeSizeInBits(Ty: TreeRootIT) / |
| 20651 | DL->getTypeSizeInBits( |
| 20652 | Ty: E.getMainOp()->getOperand(i: 0)->getType()) > |
| 20653 | 2))))) |
| 20654 | return 0u; |
| 20655 | // Round MaxBitWidth up to the next power-of-two. |
| 20656 | MaxBitWidth = bit_ceil(Value: MaxBitWidth); |
| 20657 | |
| 20658 | return MaxBitWidth; |
| 20659 | }; |
| 20660 | |
| 20661 | // If we can truncate the root, we must collect additional values that might |
| 20662 | // be demoted as a result. That is, those seeded by truncations we will |
| 20663 | // modify. |
| 20664 | // Add reduction ops sizes, if any. |
| 20665 | if (UserIgnoreList && |
| 20666 | isa<IntegerType>(Val: VectorizableTree.front()->Scalars.front()->getType())) { |
| 20667 | // Convert vector_reduce_add(ZExt(<n x i1>)) to
| 20668 | // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
| 20669 | if (all_of(Range: *UserIgnoreList, |
| 20670 | P: [](Value *V) { |
| 20671 | return isa<PoisonValue>(Val: V) || |
| 20672 | cast<Instruction>(Val: V)->getOpcode() == Instruction::Add; |
| 20673 | }) && |
| 20674 | VectorizableTree.front()->State == TreeEntry::Vectorize && |
| 20675 | VectorizableTree.front()->getOpcode() == Instruction::ZExt && |
| 20676 | cast<CastInst>(Val: VectorizableTree.front()->getMainOp())->getSrcTy() == |
| 20677 | Builder.getInt1Ty()) { |
| 20678 | ReductionBitWidth = 1; |
| 20679 | } else { |
| 20680 | for (Value *V : *UserIgnoreList) { |
| 20681 | if (isa<PoisonValue>(Val: V)) |
| 20682 | continue; |
| 20683 | unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT); |
| 20684 | TypeSize NumTypeBits = DL->getTypeSizeInBits(Ty: V->getType()); |
| 20685 | unsigned BitWidth1 = NumTypeBits - NumSignBits; |
| 20686 | if (!isKnownNonNegative(V, SQ: SimplifyQuery(*DL))) |
| 20687 | ++BitWidth1; |
| 20688 | unsigned BitWidth2 = BitWidth1; |
| 20689 | if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: ::getRdxKind(V))) { |
| 20690 | APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: V)); |
| 20691 | BitWidth2 = Mask.getBitWidth() - Mask.countl_zero(); |
| 20692 | } |
| 20693 | ReductionBitWidth = |
| 20694 | std::max(a: std::min(a: BitWidth1, b: BitWidth2), b: ReductionBitWidth); |
| 20695 | } |
| 20696 | if (ReductionBitWidth < 8 && ReductionBitWidth > 1) |
| 20697 | ReductionBitWidth = 8; |
| 20698 | |
| 20699 | ReductionBitWidth = bit_ceil(Value: ReductionBitWidth); |
| 20700 | } |
| 20701 | } |
| 20702 | bool IsTopRoot = NodeIdx == 0; |
| 20703 | while (NodeIdx < VectorizableTree.size() && |
| 20704 | VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize && |
| 20705 | VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) { |
| 20706 | RootDemotes.push_back(Elt: NodeIdx); |
| 20707 | ++NodeIdx; |
| 20708 | IsTruncRoot = true; |
| 20709 | } |
| 20710 | bool IsSignedCmp = false; |
| 20711 | if (UserIgnoreList && all_of(Range: *UserIgnoreList, P: [](Value *V) { |
| 20712 | return match(V, P: m_SMin(L: m_Value(), R: m_Value())) || |
| 20713 | match(V, P: m_SMax(L: m_Value(), R: m_Value())); |
| 20714 | })) |
| 20715 | IsSignedCmp = true; |
| 20716 | while (NodeIdx < VectorizableTree.size()) { |
| 20717 | ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars; |
| 20718 | unsigned Limit = 2; |
| 20719 | if (IsTopRoot && |
| 20720 | ReductionBitWidth == |
| 20721 | DL->getTypeSizeInBits( |
| 20722 | Ty: VectorizableTree.front()->Scalars.front()->getType())) |
| 20723 | Limit = 3; |
| 20724 | unsigned MaxBitWidth = ComputeMaxBitWidth( |
| 20725 | *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit, |
| 20726 | IsTruncRoot, IsSignedCmp); |
| 20727 | if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) { |
| 20728 | if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth) |
| 20729 | ReductionBitWidth = bit_ceil(Value: MaxBitWidth); |
| 20730 | else if (MaxBitWidth == 0) |
| 20731 | ReductionBitWidth = 0; |
| 20732 | } |
| 20733 | |
| 20734 | for (unsigned Idx : RootDemotes) { |
| 20735 | if (all_of(Range&: VectorizableTree[Idx]->Scalars, P: [&](Value *V) { |
| 20736 | uint32_t OrigBitWidth = |
| 20737 | DL->getTypeSizeInBits(Ty: V->getType()->getScalarType()); |
| 20738 | if (OrigBitWidth > MaxBitWidth) { |
| 20739 | APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: MaxBitWidth); |
| 20740 | return MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL)); |
| 20741 | } |
| 20742 | return false; |
| 20743 | })) |
| 20744 | ToDemote.push_back(Elt: Idx); |
| 20745 | } |
| 20746 | RootDemotes.clear(); |
| 20747 | IsTopRoot = false; |
| 20748 | IsProfitableToDemoteRoot = true; |
| 20749 | |
| 20750 | if (ExtraBitWidthNodes.empty()) { |
| 20751 | NodeIdx = VectorizableTree.size(); |
| 20752 | } else { |
| 20753 | unsigned NewIdx = 0; |
| 20754 | do { |
| 20755 | NewIdx = *ExtraBitWidthNodes.begin(); |
| 20756 | ExtraBitWidthNodes.erase(I: ExtraBitWidthNodes.begin()); |
| 20757 | } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty()); |
| 20758 | NodeIdx = NewIdx; |
| 20759 | IsTruncRoot = |
| 20760 | NodeIdx < VectorizableTree.size() && |
| 20761 | VectorizableTree[NodeIdx]->UserTreeIndex && |
| 20762 | VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 && |
| 20763 | VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() == |
| 20764 | Instruction::Trunc && |
| 20765 | !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle(); |
| 20766 | IsSignedCmp = |
| 20767 | NodeIdx < VectorizableTree.size() && |
| 20768 | VectorizableTree[NodeIdx]->UserTreeIndex && |
| 20769 | VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() && |
| 20770 | VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() == |
| 20771 | Instruction::ICmp && |
| 20772 | any_of( |
| 20773 | Range&: VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars, |
| 20774 | P: [&](Value *V) { |
| 20775 | auto *IC = dyn_cast<ICmpInst>(Val: V); |
| 20776 | return IC && (IC->isSigned() || |
| 20777 | !isKnownNonNegative(V: IC->getOperand(i_nocapture: 0), |
| 20778 | SQ: SimplifyQuery(*DL)) || |
| 20779 | !isKnownNonNegative(V: IC->getOperand(i_nocapture: 1), |
| 20780 | SQ: SimplifyQuery(*DL))); |
| 20781 | }); |
| 20782 | } |
| 20783 | |
| 20784 | // If the maximum bit width we compute is less than the width of the roots' |
| 20785 | // type, we can proceed with the narrowing. Otherwise, do nothing. |
| 20786 | if (MaxBitWidth == 0 || |
| 20787 | MaxBitWidth >= |
| 20788 | cast<IntegerType>(Val: TreeRoot.front()->getType()->getScalarType()) |
| 20789 | ->getBitWidth()) { |
| 20790 | if (UserIgnoreList) |
| 20791 | AnalyzedMinBWVals.insert_range(R&: TreeRoot); |
| 20792 | NodesToKeepBWs.insert_range(R&: ToDemote); |
| 20793 | continue; |
| 20794 | } |
| 20795 | |
| 20796 | // Finally, map the values we can demote to the maximum bit width we
| 20797 | // computed.
| 20798 | for (unsigned Idx : ToDemote) { |
| 20799 | TreeEntry *TE = VectorizableTree[Idx].get(); |
| 20800 | if (MinBWs.contains(Val: TE)) |
| 20801 | continue; |
| 20802 | bool IsSigned = any_of(Range&: TE->Scalars, P: [&](Value *R) { |
| 20803 | if (isa<PoisonValue>(Val: R)) |
| 20804 | return false; |
| 20805 | return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL)); |
| 20806 | }); |
| 20807 | MinBWs.try_emplace(Key: TE, Args&: MaxBitWidth, Args&: IsSigned); |
| 20808 | } |
| 20809 | } |
| 20810 | } |
| 20811 | |
| 20812 | PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) { |
| 20813 | auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F); |
| 20814 | auto *TTI = &AM.getResult<TargetIRAnalysis>(IR&: F); |
| 20815 | auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(IR&: F); |
| 20816 | auto *AA = &AM.getResult<AAManager>(IR&: F); |
| 20817 | auto *LI = &AM.getResult<LoopAnalysis>(IR&: F); |
| 20818 | auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F); |
| 20819 | auto *AC = &AM.getResult<AssumptionAnalysis>(IR&: F); |
| 20820 | auto *DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F); |
| 20821 | auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F); |
| 20822 | |
| 20823 | bool Changed = runImpl(F, SE_: SE, TTI_: TTI, TLI_: TLI, AA_: AA, LI_: LI, DT_: DT, AC_: AC, DB_: DB, ORE_: ORE); |
| 20824 | if (!Changed) |
| 20825 | return PreservedAnalyses::all(); |
| 20826 | |
| 20827 | PreservedAnalyses PA; |
| 20828 | PA.preserveSet<CFGAnalyses>(); |
| 20829 | return PA; |
| 20830 | } |
| 20831 | |
| 20832 | bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, |
| 20833 | TargetTransformInfo *TTI_, |
| 20834 | TargetLibraryInfo *TLI_, AAResults *AA_, |
| 20835 | LoopInfo *LI_, DominatorTree *DT_, |
| 20836 | AssumptionCache *AC_, DemandedBits *DB_, |
| 20837 | OptimizationRemarkEmitter *ORE_) { |
| 20838 | if (!RunSLPVectorization) |
| 20839 | return false; |
| 20840 | SE = SE_; |
| 20841 | TTI = TTI_; |
| 20842 | TLI = TLI_; |
| 20843 | AA = AA_; |
| 20844 | LI = LI_; |
| 20845 | DT = DT_; |
| 20846 | AC = AC_; |
| 20847 | DB = DB_; |
| 20848 | DL = &F.getDataLayout(); |
| 20849 | |
| 20850 | Stores.clear(); |
| 20851 | GEPs.clear(); |
| 20852 | bool Changed = false; |
| 20853 | |
| 20854 | // If the target claims to have no vector registers don't attempt |
| 20855 | // vectorization. |
| 20856 | if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true))) { |
| 20857 | LLVM_DEBUG( |
| 20858 | dbgs() << "SLP: Didn't find any vector registers for target, abort.\n" ); |
| 20859 | return false; |
| 20860 | } |
| 20861 | |
| 20862 | // Don't vectorize when the attribute NoImplicitFloat is used. |
| 20863 | if (F.hasFnAttribute(Kind: Attribute::NoImplicitFloat)) |
| 20864 | return false; |
| 20865 | |
| 20866 | LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n" ); |
| 20867 | |
| 20868 | // Use the bottom up slp vectorizer to construct chains that start with |
| 20869 | // store instructions. |
| 20870 | BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_); |
| 20871 | |
| 20872 | // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to |
| 20873 | // delete instructions. |
| 20874 | |
| 20875 | // Update DFS numbers now so that we can use them for ordering. |
| 20876 | DT->updateDFSNumbers(); |
| 20877 | |
| 20878 | // Scan the blocks in the function in post order. |
| 20879 | for (auto *BB : post_order(G: &F.getEntryBlock())) { |
| 20880 | if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator())) |
| 20881 | continue; |
| 20882 | |
| 20883 | // Start new block - clear the list of reduction roots. |
| 20884 | R.clearReductionData(); |
| 20885 | collectSeedInstructions(BB); |
| 20886 | |
| 20887 | // Vectorize trees that end at stores. |
| 20888 | if (!Stores.empty()) { |
| 20889 | LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size() |
| 20890 | << " underlying objects.\n" ); |
| 20891 | Changed |= vectorizeStoreChains(R); |
| 20892 | } |
| 20893 | |
| 20894 | // Vectorize trees that end at reductions. |
| 20895 | Changed |= vectorizeChainsInBlock(BB, R); |
| 20896 | |
| 20897 | // Vectorize the index computations of getelementptr instructions. This |
| 20898 | // is primarily intended to catch gather-like idioms ending at |
| 20899 | // non-consecutive loads. |
| 20900 | if (!GEPs.empty()) { |
| 20901 | LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size() |
| 20902 | << " underlying objects.\n" ); |
| 20903 | Changed |= vectorizeGEPIndices(BB, R); |
| 20904 | } |
| 20905 | } |
| 20906 | |
| 20907 | if (Changed) { |
| 20908 | R.optimizeGatherSequence(); |
| 20909 | LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n" ); |
| 20910 | } |
| 20911 | return Changed; |
| 20912 | } |
| 20913 | |
| 20914 | std::optional<bool> |
| 20915 | SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, |
| 20916 | unsigned Idx, unsigned MinVF, |
| 20917 | unsigned &Size) { |
| 20918 | Size = 0; |
| 20919 | LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() |
| 20920 | << "\n" ); |
| 20921 | const unsigned Sz = R.getVectorElementSize(V: Chain[0]); |
| 20922 | unsigned VF = Chain.size(); |
| 20923 | |
| 20924 | if (!has_single_bit(Value: Sz) || |
| 20925 | !hasFullVectorsOrPowerOf2( |
| 20926 | TTI: *TTI, Ty: cast<StoreInst>(Val: Chain.front())->getValueOperand()->getType(), |
| 20927 | Sz: VF) || |
| 20928 | VF < 2 || VF < MinVF) { |
| 20929 | // Check if vectorizing with a non-power-of-2 VF should be considered. At |
| 20930 | // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost |
| 20931 | // all vector lanes are used. |
| 20932 | if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF)) |
| 20933 | return false; |
| 20934 | } |
| 20935 | |
| 20936 | LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx |
| 20937 | << "\n" ); |
| 20938 | |
| 20939 | SetVector<Value *> ValOps; |
| 20940 | for (Value *V : Chain) |
| 20941 | ValOps.insert(X: cast<StoreInst>(Val: V)->getValueOperand()); |
| 20942 | // Exit if the operands have mismatched opcodes or a non-power-of-2 number of unique values.
| 20943 | InstructionsState S = getSameOpcode(VL: ValOps.getArrayRef(), TLI: *TLI); |
| 20944 | if (all_of(Range&: ValOps, P: IsaPred<Instruction>) && ValOps.size() > 1) { |
| 20945 | DenseSet<Value *> Stores(Chain.begin(), Chain.end()); |
| 20946 | bool IsAllowedSize = |
| 20947 | hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ValOps.front()->getType(), |
| 20948 | Sz: ValOps.size()) || |
| 20949 | (VectorizeNonPowerOf2 && has_single_bit(Value: ValOps.size() + 1)); |
| 20950 | if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load && |
| 20951 | (!S.getMainOp()->isSafeToRemove() || |
| 20952 | any_of(Range: ValOps.getArrayRef(), |
| 20953 | P: [&](Value *V) { |
| 20954 | return !isa<ExtractElementInst>(Val: V) && |
| 20955 | (V->getNumUses() > Chain.size() || |
| 20956 | any_of(Range: V->users(), P: [&](User *U) { |
| 20957 | return !Stores.contains(V: U); |
| 20958 | })); |
| 20959 | }))) || |
| 20960 | (ValOps.size() > Chain.size() / 2 && !S)) { |
| 20961 | Size = (!IsAllowedSize && S) ? 1 : 2; |
| 20962 | return false; |
| 20963 | } |
| 20964 | } |
| 20965 | if (R.isLoadCombineCandidate(Stores: Chain)) |
| 20966 | return true; |
| 20967 | R.buildTree(Roots: Chain); |
| 20968 | // Check if the tree is tiny and the store itself or its value is not vectorized.
| 20969 | if (R.isTreeTinyAndNotFullyVectorizable()) { |
| 20970 | if (R.isGathered(V: Chain.front()) || |
| 20971 | R.isNotScheduled(V: cast<StoreInst>(Val: Chain.front())->getValueOperand())) |
| 20972 | return std::nullopt; |
| 20973 | Size = R.getCanonicalGraphSize(); |
| 20974 | return false; |
| 20975 | } |
| 20976 | if (R.isProfitableToReorder()) { |
| 20977 | R.reorderTopToBottom(); |
| 20978 | R.reorderBottomToTop(); |
| 20979 | } |
| 20980 | R.transformNodes(); |
| 20981 | R.buildExternalUses(); |
| 20982 | |
| 20983 | R.computeMinimumValueSizes(); |
| 20984 | |
| 20985 | Size = R.getCanonicalGraphSize(); |
| 20986 | if (S && S.getOpcode() == Instruction::Load) |
| 20987 | Size = 2; // Cut off small trees that would become masked gathers.
| 20988 | InstructionCost Cost = R.getTreeCost(); |
| 20989 | |
| 20990 | LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n" ); |
| 20991 | if (Cost < -SLPCostThreshold) { |
| 20992 | LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n" ); |
| 20993 | |
| 20994 | using namespace ore; |
| 20995 | |
| 20996 | R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "StoresVectorized" , |
| 20997 | cast<StoreInst>(Val: Chain[0])) |
| 20998 | << "Stores SLP vectorized with cost " << NV("Cost" , Cost) |
| 20999 | << " and with tree size " |
| 21000 | << NV("TreeSize" , R.getTreeSize())); |
| 21001 | |
| 21002 | R.vectorizeTree(); |
| 21003 | return true; |
| 21004 | } |
| 21005 | |
| 21006 | return false; |
| 21007 | } |
| 21008 | |
| 21009 | /// Checks that the quadratic mean deviation of the tree sizes is small
| 21009 | /// compared to the mean size (roughly within 10% of the mean).
| 21010 | static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes, |
| 21011 | bool First) { |
| 21012 | unsigned Num = 0; |
| 21013 | uint64_t Sum = std::accumulate( |
| 21014 | first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0), |
| 21015 | binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) { |
| 21016 | unsigned Size = First ? Val.first : Val.second; |
| 21017 | if (Size == 1) |
| 21018 | return V; |
| 21019 | ++Num; |
| 21020 | return V + Size; |
| 21021 | }); |
| 21022 | if (Num == 0) |
| 21023 | return true; |
| 21024 | uint64_t Mean = Sum / Num; |
| 21025 | if (Mean == 0) |
| 21026 | return true; |
| 21027 | uint64_t Dev = std::accumulate( |
| 21028 | first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0), |
| 21029 | binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) { |
| 21030 | unsigned P = First ? Val.first : Val.second; |
| 21031 | if (P == 1) |
| 21032 | return V; |
| 21033 | return V + (P - Mean) * (P - Mean); |
| 21034 | }) / |
| 21035 | Num; |
| 21036 | return Dev * 96 / (Mean * Mean) == 0; |
| 21037 | } |
| 21038 | |
| 21039 | namespace { |
| 21040 | |
| 21041 | /// A group of stores that we'll try to bundle together using vector ops. |
| 21042 | /// They are ordered using the signed distance of their address operand to the |
| 21043 | /// address of this group's BaseInstr. |
| 21044 | class RelatedStoreInsts { |
| 21045 | public: |
| 21046 | RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores) |
| 21047 | : AllStores(AllStores) { |
| 21048 | reset(NewBaseInstr: BaseInstrIdx); |
| 21049 | } |
| 21050 | |
| 21051 | void reset(unsigned NewBaseInstr) { |
| 21052 | assert(NewBaseInstr < AllStores.size() && |
| 21053 | "Instruction index out of bounds" ); |
| 21054 | BaseInstrIdx = NewBaseInstr; |
| 21055 | Instrs.clear(); |
| 21056 | insertOrLookup(InstrIdx: NewBaseInstr, PtrDist: 0); |
| 21057 | } |
| 21058 | |
| 21059 | /// Tries to insert \p InstrIdx as the store with a pointer distance of |
| 21060 | /// \p PtrDist. |
| 21061 | /// Does nothing if there is already a store with that \p PtrDist. |
| 21062 | /// \returns The previously associated Instruction index, or std::nullopt |
| 21063 | std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) { |
| 21064 | auto [It, Inserted] = Instrs.emplace(args&: PtrDist, args&: InstrIdx); |
| 21065 | return Inserted ? std::nullopt : std::make_optional(t&: It->second); |
| 21066 | } |
| 21067 | |
| 21068 | using DistToInstMap = std::map<int64_t, unsigned>; |
| 21069 | const DistToInstMap &getStores() const { return Instrs; } |
| 21070 | |
| 21071 | /// If \p SI is related to this group of stores, return the distance of its |
| 21072 | /// pointer operand to that of the group's BaseInstr.
| 21073 | std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL, |
| 21074 | ScalarEvolution &SE) const { |
| 21075 | StoreInst &BaseStore = *AllStores[BaseInstrIdx]; |
| 21076 | return getPointersDiff( |
| 21077 | ElemTyA: BaseStore.getValueOperand()->getType(), PtrA: BaseStore.getPointerOperand(), |
| 21078 | ElemTyB: SI.getValueOperand()->getType(), PtrB: SI.getPointerOperand(), DL, SE, |
| 21079 | /*StrictCheck=*/true); |
| 21080 | } |
| 21081 | |
| 21082 | /// Recompute the pointer distances to be based on \p NewBaseInstIdx. |
| 21083 | /// Stores whose index is less than \p MinSafeIdx will be dropped. |
| 21084 | void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx, |
| 21085 | int64_t DistFromCurBase) { |
| 21086 | DistToInstMap PrevSet = std::move(Instrs); |
| 21087 | reset(NewBaseInstr: NewBaseInstIdx); |
| 21088 | |
| 21089 | // Re-insert stores that come after MinSafeIdx to try and vectorize them |
| 21090 | // again. Their distance will be "rebased" to use NewBaseInstIdx as |
| 21091 | // reference. |
| 21092 | for (auto [Dist, InstIdx] : PrevSet) { |
| 21093 | if (InstIdx >= MinSafeIdx) |
| 21094 | insertOrLookup(InstrIdx: InstIdx, PtrDist: Dist - DistFromCurBase); |
| 21095 | } |
| 21096 | } |
| 21097 | |
| 21098 | /// Remove all stores that have been vectorized from this group. |
| 21099 | void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) { |
| 21100 | DistToInstMap::reverse_iterator LastVectorizedStore = find_if( |
| 21101 | Range: reverse(C&: Instrs), P: [&](const std::pair<int64_t, unsigned> &DistAndIdx) { |
| 21102 | return VectorizedStores.contains(Ptr: AllStores[DistAndIdx.second]); |
| 21103 | }); |
| 21104 | |
| 21105 | // Get a forward iterator pointing after the last vectorized store and erase |
| 21106 | // all stores before it so we don't try to vectorize them again. |
| 21107 | DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base(); |
| 21108 | Instrs.erase(first: Instrs.begin(), last: VectorizedStoresEnd); |
| 21109 | } |
| 21110 | |
| 21111 | private: |
| 21112 | /// The index of the Base instruction, i.e. the one with a 0 pointer distance. |
| 21113 | unsigned BaseInstrIdx; |
| 21114 | |
| 21115 | /// Maps a pointer distance from \p BaseInstrIdx to an instruction index. |
| 21116 | DistToInstMap Instrs; |
| 21117 | |
| 21118 | /// Reference to all the stores in the BB being analyzed. |
| 21119 | ArrayRef<StoreInst *> AllStores; |
| 21120 | }; |
| 21121 | |
| 21122 | } // end anonymous namespace |
| 21123 | |
| 21124 | bool SLPVectorizerPass::vectorizeStores( |
| 21125 | ArrayRef<StoreInst *> Stores, BoUpSLP &R, |
| 21126 | DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> |
| 21127 | &Visited) { |
| 21128 | // We may run into multiple chains that merge into a single chain. We mark the |
| 21129 | // stores that we vectorized so that we don't visit the same store twice. |
| 21130 | BoUpSLP::ValueSet VectorizedStores; |
| 21131 | bool Changed = false; |
| 21132 | |
| 21133 | auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) { |
| 21134 | int64_t PrevDist = -1; |
| 21135 | BoUpSLP::ValueList Operands; |
| 21136 | // Collect the chain into a list. |
| 21137 | for (auto [Idx, Data] : enumerate(First: StoreSeq)) { |
| 21138 | auto &[Dist, InstIdx] = Data; |
| 21139 | if (Operands.empty() || Dist - PrevDist == 1) { |
| 21140 | Operands.push_back(Elt: Stores[InstIdx]); |
| 21141 | PrevDist = Dist; |
| 21142 | if (Idx != StoreSeq.size() - 1) |
| 21143 | continue; |
| 21144 | } |
| 21145 | auto E = make_scope_exit(F: [&, &Dist = Dist, &InstIdx = InstIdx]() { |
| 21146 | Operands.clear(); |
| 21147 | Operands.push_back(Elt: Stores[InstIdx]); |
| 21148 | PrevDist = Dist; |
| 21149 | }); |
| 21150 | |
| 21151 | if (Operands.size() <= 1 || |
| 21152 | !Visited |
| 21153 | .insert(V: {Operands.front(), |
| 21154 | cast<StoreInst>(Val: Operands.front())->getValueOperand(), |
| 21155 | Operands.back(), |
| 21156 | cast<StoreInst>(Val: Operands.back())->getValueOperand(), |
| 21157 | Operands.size()}) |
| 21158 | .second) |
| 21159 | continue; |
| 21160 | |
| 21161 | unsigned MaxVecRegSize = R.getMaxVecRegSize(); |
| 21162 | unsigned EltSize = R.getVectorElementSize(V: Operands[0]); |
| 21163 | unsigned MaxElts = llvm::bit_floor(Value: MaxVecRegSize / EltSize); |
| 21164 | |
| 21165 | unsigned MaxVF = |
| 21166 | std::min(a: R.getMaximumVF(ElemWidth: EltSize, Opcode: Instruction::Store), b: MaxElts); |
| 21167 | auto *Store = cast<StoreInst>(Val: Operands[0]); |
| 21168 | Type *StoreTy = Store->getValueOperand()->getType(); |
| 21169 | Type *ValueTy = StoreTy; |
| 21170 | if (auto *Trunc = dyn_cast<TruncInst>(Val: Store->getValueOperand())) |
| 21171 | ValueTy = Trunc->getSrcTy(); |
| 21172 | // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
| 21173 | // getStoreMinimumVF only supports scalar types as arguments. As a result,
| 21174 | // we need to use the element types of StoreTy and ValueTy to retrieve the
| 21175 | // VF and then transform it back.
| 21176 | // Remember: VF is defined as the number of values we want to vectorize, not
| 21177 | // the number of elements in the final vector.
| 21178 | Type *StoreScalarTy = StoreTy->getScalarType(); |
| 21179 | unsigned MinVF = PowerOf2Ceil(A: TTI->getStoreMinimumVF( |
| 21180 | VF: R.getMinVF(Sz: DL->getTypeStoreSizeInBits(Ty: StoreScalarTy)), ScalarMemTy: StoreScalarTy, |
| 21181 | ScalarValTy: ValueTy->getScalarType())); |
| 21182 | MinVF /= getNumElements(Ty: StoreTy); |
| 21183 | MinVF = std::max<unsigned>(a: 2, b: MinVF); |
| 21184 | |
| 21185 | if (MaxVF < MinVF) { |
| 21186 | LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF |
| 21187 | << ") < " |
| 21188 | << "MinVF (" << MinVF << ")\n" ); |
| 21189 | continue; |
| 21190 | } |
| 21191 | |
| 21192 | unsigned NonPowerOf2VF = 0; |
| 21193 | if (VectorizeNonPowerOf2) { |
| 21194 | // First try vectorizing with a non-power-of-2 VF. At the moment, only |
| 21195 | // consider cases where VF + 1 is a power-of-2, i.e. almost all vector |
| 21196 | // lanes are used. |
| 21197 | unsigned CandVF = std::clamp<unsigned>(val: Operands.size(), lo: MinVF, hi: MaxVF); |
| 21198 | if (has_single_bit(Value: CandVF + 1)) { |
| 21199 | NonPowerOf2VF = CandVF; |
| 21200 | assert(NonPowerOf2VF != MaxVF && |
| 21201 | "Non-power-of-2 VF should not be equal to MaxVF" ); |
| 21202 | } |
| 21203 | } |
| 21204 | |
| 21205 | // MaxRegVF represents the number of instructions (scalar, or vector in |
| 21206 | // case of revec) that can be vectorized to naturally fit in a vector |
| 21207 | // register. |
| 21208 | unsigned MaxRegVF = MaxVF; |
| 21209 | |
| 21210 | MaxVF = std::min<unsigned>(a: MaxVF, b: bit_floor(Value: Operands.size())); |
| 21211 | if (MaxVF < MinVF) { |
| 21212 | LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF |
| 21213 | << ") < " |
| 21214 | << "MinVF (" << MinVF << ")\n" ); |
| 21215 | continue; |
| 21216 | } |
| 21217 | |
| 21218 | SmallVector<unsigned> CandidateVFs; |
| 21219 | for (unsigned VF = std::max(a: MaxVF, b: NonPowerOf2VF); VF >= MinVF; |
| 21220 | VF = divideCeil(Numerator: VF, Denominator: 2)) |
| 21221 | CandidateVFs.push_back(Elt: VF); |
| 21222 | |
| 21223 | unsigned End = Operands.size(); |
| 21224 | unsigned Repeat = 0; |
| 21225 | constexpr unsigned MaxAttempts = 4; |
| 21226 | OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size()); |
| 21227 | for (std::pair<unsigned, unsigned> &P : RangeSizes) |
| 21228 | P.first = P.second = 1; |
| 21229 | DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable; |
| 21230 | auto IsNotVectorized = [](bool First, |
| 21231 | const std::pair<unsigned, unsigned> &P) { |
| 21232 | return First ? P.first > 0 : P.second > 0; |
| 21233 | }; |
| 21234 | auto IsVectorized = [](bool First, |
| 21235 | const std::pair<unsigned, unsigned> &P) { |
| 21236 | return First ? P.first == 0 : P.second == 0; |
| 21237 | }; |
| 21238 | auto VFIsProfitable = [](bool First, unsigned Size, |
| 21239 | const std::pair<unsigned, unsigned> &P) { |
| 21240 | return First ? Size >= P.first : Size >= P.second; |
| 21241 | }; |
| 21242 | auto FirstSizeSame = [](unsigned Size, |
| 21243 | const std::pair<unsigned, unsigned> &P) { |
| 21244 | return Size == P.first; |
| 21245 | }; |
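|       | // Repeatedly try the candidate VFs from largest to smallest, marking the
|       | // vectorized slices and recording the achieved tree sizes, until all stores
|       | // are vectorized or no attempt makes further progress.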
| 21246 | while (true) { |
| 21247 | ++Repeat; |
| 21248 | bool RepeatChanged = false; |
| 21249 | bool AnyProfitableGraph = false; |
| 21250 | for (unsigned VF : CandidateVFs) { |
| 21251 | AnyProfitableGraph = false; |
| 21252 | unsigned FirstUnvecStore = |
| 21253 | std::distance(first: RangeSizes.begin(), |
| 21254 | last: find_if(Range&: RangeSizes, P: std::bind(f&: IsNotVectorized, |
| 21255 | args: VF >= MaxRegVF, args: _1))); |
| 21256 | |
| 21257 | // Form slices of size VF starting from FirstUnvecStore and try to |
| 21258 | // vectorize them. |
| 21259 | while (FirstUnvecStore < End) { |
| 21260 | unsigned FirstVecStore = std::distance( |
| 21261 | first: RangeSizes.begin(), |
| 21262 | last: find_if(Range: RangeSizes.drop_front(N: FirstUnvecStore), |
| 21263 | P: std::bind(f&: IsVectorized, args: VF >= MaxRegVF, args: _1))); |
| 21264 | unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore; |
| 21265 | for (unsigned SliceStartIdx = FirstUnvecStore; |
| 21266 | SliceStartIdx + VF <= MaxSliceEnd;) { |
| 21267 | if (!checkTreeSizes(Sizes: RangeSizes.slice(N: SliceStartIdx, M: VF), |
| 21268 | First: VF >= MaxRegVF)) { |
| 21269 | ++SliceStartIdx; |
| 21270 | continue; |
| 21271 | } |
| 21272 | ArrayRef<Value *> Slice = |
| 21273 | ArrayRef(Operands).slice(N: SliceStartIdx, M: VF); |
| 21274 | assert(all_of(Slice, |
| 21275 | [&](Value *V) { |
| 21276 | return cast<StoreInst>(V) |
| 21277 | ->getValueOperand() |
| 21278 | ->getType() == |
| 21279 | cast<StoreInst>(Slice.front()) |
| 21280 | ->getValueOperand() |
| 21281 | ->getType(); |
| 21282 | }) && |
| 21283 | "Expected all operands of same type." ); |
| 21284 | if (!NonSchedulable.empty()) { |
| 21285 | auto [NonSchedSizeMax, NonSchedSizeMin] = |
| 21286 | NonSchedulable.lookup(Val: Slice.front()); |
| 21287 | if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) { |
| 21288 | // VF is too ambitious. Try to vectorize another slice before |
| 21289 | // trying a smaller VF. |
| 21290 | SliceStartIdx += NonSchedSizeMax; |
| 21291 | continue; |
| 21292 | } |
| 21293 | } |
| 21294 | unsigned TreeSize; |
| 21295 | std::optional<bool> Res = |
| 21296 | vectorizeStoreChain(Chain: Slice, R, Idx: SliceStartIdx, MinVF, Size&: TreeSize); |
| 21297 | if (!Res) { |
| 21298 | // Update the range of non schedulable VFs for slices starting |
| 21299 | // at SliceStartIdx. |
| 21300 | NonSchedulable |
| 21301 | .try_emplace(Key: Slice.front(), Args: std::make_pair(x&: VF, y&: VF)) |
| 21302 | .first->getSecond() |
| 21303 | .second = VF; |
| 21304 | } else if (*Res) { |
| 21305 | // Mark the vectorized stores so that we don't vectorize them |
| 21306 | // again. |
| 21307 | VectorizedStores.insert_range(R&: Slice); |
| 21310 | AnyProfitableGraph = RepeatChanged = Changed = true; |
| 21311 | // If we vectorized initial block, no need to try to vectorize |
| 21312 | // it again. |
| 21313 | for (std::pair<unsigned, unsigned> &P : |
| 21314 | RangeSizes.slice(N: SliceStartIdx, M: VF)) |
| 21315 | P.first = P.second = 0; |
| 21316 | if (SliceStartIdx < FirstUnvecStore + MinVF) { |
| 21317 | for (std::pair<unsigned, unsigned> &P : RangeSizes.slice( |
| 21318 | N: FirstUnvecStore, M: SliceStartIdx - FirstUnvecStore)) |
| 21319 | P.first = P.second = 0; |
| 21320 | FirstUnvecStore = SliceStartIdx + VF; |
| 21321 | } |
| 21322 | if (SliceStartIdx > MaxSliceEnd - VF - MinVF) { |
| 21323 | for (std::pair<unsigned, unsigned> &P : |
| 21324 | RangeSizes.slice(N: SliceStartIdx + VF, |
| 21325 | M: MaxSliceEnd - (SliceStartIdx + VF))) |
| 21326 | P.first = P.second = 0; |
| 21327 | if (MaxSliceEnd == End) |
| 21328 | End = SliceStartIdx; |
| 21329 | MaxSliceEnd = SliceStartIdx; |
| 21330 | } |
| 21331 | SliceStartIdx += VF; |
| 21332 | continue; |
| 21333 | } |
| 21334 | if (VF > 2 && Res && |
| 21335 | !all_of(Range: RangeSizes.slice(N: SliceStartIdx, M: VF), |
| 21336 | P: std::bind(f&: VFIsProfitable, args: VF >= MaxRegVF, args&: TreeSize, |
| 21337 | args: _1))) { |
| 21338 | SliceStartIdx += VF; |
| 21339 | continue; |
| 21340 | } |
| 21341 | // For very big VFs, check that we are not rebuilding the same
| 21342 | // trees, just with a larger number of elements.
| 21343 | if (VF > MaxRegVF && TreeSize > 1 && |
| 21344 | all_of(Range: RangeSizes.slice(N: SliceStartIdx, M: VF), |
| 21345 | P: std::bind(f&: FirstSizeSame, args&: TreeSize, args: _1))) { |
| 21346 | SliceStartIdx += VF; |
| 21347 | while (SliceStartIdx != MaxSliceEnd && |
| 21348 | RangeSizes[SliceStartIdx].first == TreeSize) |
| 21349 | ++SliceStartIdx; |
| 21350 | continue; |
| 21351 | } |
| 21352 | if (TreeSize > 1) { |
| 21353 | for (std::pair<unsigned, unsigned> &P : |
| 21354 | RangeSizes.slice(N: SliceStartIdx, M: VF)) { |
| 21355 | if (VF >= MaxRegVF) |
| 21356 | P.second = std::max(a: P.second, b: TreeSize); |
| 21357 | else |
| 21358 | P.first = std::max(a: P.first, b: TreeSize); |
| 21359 | } |
| 21360 | } |
| 21361 | ++SliceStartIdx; |
| 21362 | AnyProfitableGraph = true; |
| 21363 | } |
| 21364 | if (FirstUnvecStore >= End) |
| 21365 | break; |
| 21366 | if (MaxSliceEnd - FirstUnvecStore < VF && |
| 21367 | MaxSliceEnd - FirstUnvecStore >= MinVF) |
| 21368 | AnyProfitableGraph = true; |
| 21369 | FirstUnvecStore = std::distance( |
| 21370 | first: RangeSizes.begin(), |
| 21371 | last: find_if(Range: RangeSizes.drop_front(N: MaxSliceEnd), |
| 21372 | P: std::bind(f&: IsNotVectorized, args: VF >= MaxRegVF, args: _1))); |
| 21373 | } |
| 21374 | if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(Value: VF)) |
| 21375 | break; |
| 21376 | } |
| 21377 | // All values vectorized - exit. |
| 21378 | if (all_of(Range&: RangeSizes, P: [](const std::pair<unsigned, unsigned> &P) { |
| 21379 | return P.first == 0 && P.second == 0; |
| 21380 | })) |
| 21381 | break; |
// Check if we have tried all attempts, or if there is no need for the last
// attempts at all.
| 21383 | if (Repeat >= MaxAttempts || |
| 21384 | (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph))) |
| 21385 | break; |
| 21386 | constexpr unsigned StoresLimit = 64; |
| 21387 | const unsigned MaxTotalNum = std::min<unsigned>( |
| 21388 | a: Operands.size(), |
| 21389 | b: static_cast<unsigned>( |
| 21390 | End - |
| 21391 | std::distance( |
| 21392 | first: RangeSizes.begin(), |
| 21393 | last: find_if(Range&: RangeSizes, P: std::bind(f&: IsNotVectorized, args: true, args: _1))) + |
| 21394 | 1)); |
| 21395 | unsigned VF = bit_ceil(Value: CandidateVFs.front()) * 2; |
| 21396 | unsigned Limit = |
| 21397 | getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: StoreTy, Sz: MaxTotalNum); |
| 21398 | CandidateVFs.clear(); |
| 21399 | if (bit_floor(Value: Limit) == VF) |
| 21400 | CandidateVFs.push_back(Elt: Limit); |
| 21401 | if (VF > MaxTotalNum || VF >= StoresLimit) |
| 21402 | break; |
| 21403 | for (std::pair<unsigned, unsigned> &P : RangeSizes) { |
| 21404 | if (P.first != 0) |
| 21405 | P.first = std::max(a: P.second, b: P.first); |
| 21406 | } |
// Last attempt to vectorize the maximum number of elements, if all previous
// attempts were unsuccessful because of cost issues.
| 21409 | CandidateVFs.push_back(Elt: VF); |
| 21410 | } |
| 21411 | } |
| 21412 | }; |
| 21413 | |
| 21414 | /// Groups of stores to vectorize |
| 21415 | SmallVector<RelatedStoreInsts> SortedStores; |
| 21416 | |
// Inserts the specified store SI with the given index Idx into the set of
// stores. If a store with the same distance is already present, stop the
// insertion and try to vectorize the stores collected so far. If some stores
// from this sequence were not vectorized, try to vectorize them together
// with the new store later. This logic is applied only to the stores that
// come before the previous store with the same distance.
| 21423 | // Example: |
| 21424 | // 1. store x, %p |
| 21425 | // 2. store y, %p+1 |
| 21426 | // 3. store z, %p+2 |
| 21427 | // 4. store a, %p |
| 21428 | // 5. store b, %p+3 |
| 21429 | // - Scan this from the last to first store. The very first bunch of stores is |
| 21430 | // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores |
| 21431 | // vector). |
| 21432 | // - The next store in the list - #1 - has the same distance from store #5 as |
| 21433 | // the store #4. |
| 21434 | // - Try to vectorize sequence of stores 4,2,3,5. |
| 21435 | // - If all these stores are vectorized - just drop them. |
| 21436 | // - If some of them are not vectorized (say, #3 and #5), do extra analysis. |
| 21437 | // - Start new stores sequence. |
| 21438 | // The new bunch of stores is {1, {1, 0}}. |
| 21439 | // - Add the stores from previous sequence, that were not vectorized. |
// Here we consider the stores in reverse order, rather than the order in
// which they appear in the IR (Stores are already reversed, see the
// vectorizeStoreChains() function).
| 21442 | // Store #3 can be added -> comes after store #4 with the same distance as |
| 21443 | // store #1. |
| 21444 | // Store #5 cannot be added - comes before store #4. |
// This logic helps improve compile time: we assume that the stores after the
// previous store with the same distance most likely have memory
// dependencies, so there is no need to waste compile time trying to
// vectorize them.
| 21448 | // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}. |
| 21449 | auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) { |
| 21450 | std::optional<int64_t> PtrDist; |
| 21451 | auto *RelatedStores = find_if( |
| 21452 | Range&: SortedStores, P: [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) { |
| 21453 | PtrDist = StoreSeq.getPointerDiff(SI&: *SI, DL: *DL, SE&: *SE); |
| 21454 | return PtrDist.has_value(); |
| 21455 | }); |
| 21456 | |
| 21457 | // We did not find a comparable store, start a new group. |
| 21458 | if (RelatedStores == SortedStores.end()) { |
| 21459 | SortedStores.emplace_back(Args&: Idx, Args&: Stores); |
| 21460 | return; |
| 21461 | } |
| 21462 | |
| 21463 | // If there is already a store in the group with the same PtrDiff, try to |
| 21464 | // vectorize the existing instructions before adding the current store. |
| 21465 | // Otherwise, insert this store and keep collecting. |
| 21466 | if (std::optional<unsigned> PrevInst = |
| 21467 | RelatedStores->insertOrLookup(InstrIdx: Idx, PtrDist: *PtrDist)) { |
| 21468 | TryToVectorize(RelatedStores->getStores()); |
| 21469 | RelatedStores->clearVectorizedStores(VectorizedStores); |
| 21470 | RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1, |
| 21471 | /*NewBaseInstIdx=*/Idx, |
| 21472 | /*DistFromCurBase=*/*PtrDist); |
| 21473 | } |
| 21474 | }; |
| 21475 | Type *PrevValTy = nullptr; |
| 21476 | for (auto [I, SI] : enumerate(First&: Stores)) { |
| 21477 | if (R.isDeleted(I: SI)) |
| 21478 | continue; |
| 21479 | if (!PrevValTy) |
| 21480 | PrevValTy = SI->getValueOperand()->getType(); |
| 21481 | // Check that we do not try to vectorize stores of different types. |
| 21482 | if (PrevValTy != SI->getValueOperand()->getType()) { |
| 21483 | for (RelatedStoreInsts &StoreSeq : SortedStores) |
| 21484 | TryToVectorize(StoreSeq.getStores()); |
| 21485 | SortedStores.clear(); |
| 21486 | PrevValTy = SI->getValueOperand()->getType(); |
| 21487 | } |
| 21488 | FillStoresSet(I, SI); |
| 21489 | } |
| 21490 | |
| 21491 | // Final vectorization attempt. |
| 21492 | for (RelatedStoreInsts &StoreSeq : SortedStores) |
| 21493 | TryToVectorize(StoreSeq.getStores()); |
| 21494 | |
| 21495 | return Changed; |
| 21496 | } |
| 21497 | |
| 21498 | void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) { |
| 21499 | // Initialize the collections. We will make a single pass over the block. |
| 21500 | Stores.clear(); |
| 21501 | GEPs.clear(); |
| 21502 | |
| 21503 | // Visit the store and getelementptr instructions in BB and organize them in |
| 21504 | // Stores and GEPs according to the underlying objects of their pointer |
| 21505 | // operands. |
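// Illustrative example (hypothetical IR): both stores below share the same
// underlying object %base, so they land in the same Stores[%base] bucket:
//   %g = getelementptr inbounds i32, ptr %base, i64 1
//   store i32 %x, ptr %base
//   store i32 %y, ptr %g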
| 21506 | for (Instruction &I : *BB) { |
// Ignore store instructions that are not simple (volatile or atomic) or
// whose stored value type is not a valid vector element type.
| 21509 | if (auto *SI = dyn_cast<StoreInst>(Val: &I)) { |
| 21510 | if (!SI->isSimple()) |
| 21511 | continue; |
| 21512 | if (!isValidElementType(Ty: SI->getValueOperand()->getType())) |
| 21513 | continue; |
| 21514 | Stores[getUnderlyingObject(V: SI->getPointerOperand())].push_back(Elt: SI); |
| 21515 | } |
| 21516 | |
// Ignore getelementptr instructions that have more than one index, a
// constant index, an index whose type is not a valid element type, or that
// produce a vector of pointers.
| 21520 | else if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) { |
| 21521 | if (GEP->getNumIndices() != 1) |
| 21522 | continue; |
| 21523 | Value *Idx = GEP->idx_begin()->get(); |
| 21524 | if (isa<Constant>(Val: Idx)) |
| 21525 | continue; |
| 21526 | if (!isValidElementType(Ty: Idx->getType())) |
| 21527 | continue; |
| 21528 | if (GEP->getType()->isVectorTy()) |
| 21529 | continue; |
| 21530 | GEPs[GEP->getPointerOperand()].push_back(Elt: GEP); |
| 21531 | } |
| 21532 | } |
| 21533 | } |
| 21534 | |
| 21535 | bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, |
| 21536 | bool MaxVFOnly) { |
| 21537 | if (VL.size() < 2) |
| 21538 | return false; |
| 21539 | |
| 21540 | LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " |
| 21541 | << VL.size() << ".\n" ); |
| 21542 | |
// Check that all of the parts are instructions of the same type;
// an alternate opcode is permitted via InstructionsState.
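// For illustration (hypothetical IR), an add/sub pair can still form a
// single bundle through the alternate opcode mechanism:
//   %a = add i32 %x, %y
//   %b = sub i32 %z, %w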
| 21545 | InstructionsState S = getSameOpcode(VL, TLI: *TLI); |
| 21546 | if (!S) |
| 21547 | return false; |
| 21548 | |
| 21549 | Instruction *I0 = S.getMainOp(); |
| 21550 | // Make sure invalid types (including vector type) are rejected before |
| 21551 | // determining vectorization factor for scalar instructions. |
| 21552 | for (Value *V : VL) { |
| 21553 | Type *Ty = V->getType(); |
| 21554 | if (!isa<InsertElementInst>(Val: V) && !isValidElementType(Ty)) { |
// NOTE: the following will give the user the internal LLVM type name, which
// may not be useful.
| 21557 | R.getORE()->emit(RemarkBuilder: [&]() { |
| 21558 | std::string TypeStr; |
| 21559 | llvm::raw_string_ostream OS(TypeStr); |
| 21560 | Ty->print(O&: OS); |
| 21561 | return OptimizationRemarkMissed(SV_NAME, "UnsupportedType" , I0) |
| 21562 | << "Cannot SLP vectorize list: type " |
| 21563 | << TypeStr + " is unsupported by vectorizer" ; |
| 21564 | }); |
| 21565 | return false; |
| 21566 | } |
| 21567 | } |
| 21568 | |
| 21569 | Type *ScalarTy = getValueType(V: VL[0]); |
| 21570 | unsigned Sz = R.getVectorElementSize(V: I0); |
| 21571 | unsigned MinVF = R.getMinVF(Sz); |
| 21572 | unsigned MaxVF = std::max<unsigned>( |
| 21573 | a: getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: ScalarTy, Sz: VL.size()), b: MinVF); |
| 21574 | MaxVF = std::min(a: R.getMaximumVF(ElemWidth: Sz, Opcode: S.getOpcode()), b: MaxVF); |
| 21575 | if (MaxVF < 2) { |
| 21576 | R.getORE()->emit(RemarkBuilder: [&]() { |
| 21577 | return OptimizationRemarkMissed(SV_NAME, "SmallVF" , I0) |
| 21578 | << "Cannot SLP vectorize list: vectorization factor " |
| 21579 | << "less than 2 is not supported" ; |
| 21580 | }); |
| 21581 | return false; |
| 21582 | } |
| 21583 | |
| 21584 | bool Changed = false; |
| 21585 | bool CandidateFound = false; |
| 21586 | InstructionCost MinCost = SLPCostThreshold.getValue(); |
| 21587 | |
| 21588 | unsigned NextInst = 0, MaxInst = VL.size(); |
| 21589 | for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; |
| 21590 | VF = getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: I0->getType(), Sz: VF - 1)) { |
// No actual vectorization should happen if the number of parts is the same
// as the provided vectorization factor (i.e. the scalar type is used for the
// vector code during codegen).
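// For example (hypothetical target), if an <8 x i64> vector is legalized
// into 8 scalar parts, getNumberOfParts() equals the VF and vectorization
// would only reproduce the scalar code, so such a VF is skipped.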
| 21594 | auto *VecTy = getWidenedType(ScalarTy, VF); |
| 21595 | if (TTI->getNumberOfParts(Tp: VecTy) == VF) |
| 21596 | continue; |
| 21597 | for (unsigned I = NextInst; I < MaxInst; ++I) { |
| 21598 | unsigned ActualVF = std::min(a: MaxInst - I, b: VF); |
| 21599 | |
| 21600 | if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ScalarTy, Sz: ActualVF)) |
| 21601 | continue; |
| 21602 | |
| 21603 | if (MaxVFOnly && ActualVF < MaxVF) |
| 21604 | break; |
| 21605 | if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2)) |
| 21606 | break; |
| 21607 | |
| 21608 | SmallVector<Value *> Ops(ActualVF, nullptr); |
| 21609 | unsigned Idx = 0; |
| 21610 | for (Value *V : VL.drop_front(N: I)) { |
| 21611 | // Check that a previous iteration of this loop did not delete the |
| 21612 | // Value. |
| 21613 | if (auto *Inst = dyn_cast<Instruction>(Val: V); |
| 21614 | !Inst || !R.isDeleted(I: Inst)) { |
| 21615 | Ops[Idx] = V; |
| 21616 | ++Idx; |
| 21617 | if (Idx == ActualVF) |
| 21618 | break; |
| 21619 | } |
| 21620 | } |
| 21621 | // Not enough vectorizable instructions - exit. |
| 21622 | if (Idx != ActualVF) |
| 21623 | break; |
| 21624 | |
| 21625 | LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations " |
| 21626 | << "\n" ); |
| 21627 | |
| 21628 | R.buildTree(Roots: Ops); |
| 21629 | if (R.isTreeTinyAndNotFullyVectorizable()) |
| 21630 | continue; |
| 21631 | if (R.isProfitableToReorder()) { |
| 21632 | R.reorderTopToBottom(); |
| 21633 | R.reorderBottomToTop(IgnoreReorder: !isa<InsertElementInst>(Val: Ops.front())); |
| 21634 | } |
| 21635 | R.transformNodes(); |
| 21636 | R.buildExternalUses(); |
| 21637 | |
| 21638 | R.computeMinimumValueSizes(); |
| 21639 | InstructionCost Cost = R.getTreeCost(); |
| 21640 | CandidateFound = true; |
| 21641 | MinCost = std::min(a: MinCost, b: Cost); |
| 21642 | |
| 21643 | LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost |
| 21644 | << " for VF=" << ActualVF << "\n" ); |
| 21645 | if (Cost < -SLPCostThreshold) { |
| 21646 | LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n" ); |
| 21647 | R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "VectorizedList" , |
| 21648 | cast<Instruction>(Val: Ops[0])) |
| 21649 | << "SLP vectorized with cost " << ore::NV("Cost" , Cost) |
| 21650 | << " and with tree size " |
| 21651 | << ore::NV("TreeSize" , R.getTreeSize())); |
| 21652 | |
| 21653 | R.vectorizeTree(); |
| 21654 | // Move to the next bundle. |
| 21655 | I += VF - 1; |
| 21656 | NextInst = I + 1; |
| 21657 | Changed = true; |
| 21658 | } |
| 21659 | } |
| 21660 | } |
| 21661 | |
| 21662 | if (!Changed && CandidateFound) { |
| 21663 | R.getORE()->emit(RemarkBuilder: [&]() { |
| 21664 | return OptimizationRemarkMissed(SV_NAME, "NotBeneficial" , I0) |
| 21665 | << "List vectorization was possible but not beneficial with cost " |
| 21666 | << ore::NV("Cost" , MinCost) << " >= " |
| 21667 | << ore::NV("Treshold" , -SLPCostThreshold); |
| 21668 | }); |
| 21669 | } else if (!Changed) { |
| 21670 | R.getORE()->emit(RemarkBuilder: [&]() { |
| 21671 | return OptimizationRemarkMissed(SV_NAME, "NotPossible" , I0) |
| 21672 | << "Cannot SLP vectorize list: vectorization was impossible" |
| 21673 | << " with available vectorization factors" ; |
| 21674 | }); |
| 21675 | } |
| 21676 | return Changed; |
| 21677 | } |
| 21678 | |
| 21679 | bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) { |
| 21680 | if (!I) |
| 21681 | return false; |
| 21682 | |
| 21683 | if (!isa<BinaryOperator, CmpInst>(Val: I) || isa<VectorType>(Val: I->getType())) |
| 21684 | return false; |
| 21685 | |
| 21686 | Value *P = I->getParent(); |
| 21687 | |
| 21688 | // Vectorize in current basic block only. |
| 21689 | auto *Op0 = dyn_cast<Instruction>(Val: I->getOperand(i: 0)); |
| 21690 | auto *Op1 = dyn_cast<Instruction>(Val: I->getOperand(i: 1)); |
| 21691 | if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P || |
| 21692 | R.isDeleted(I: Op0) || R.isDeleted(I: Op1)) |
| 21693 | return false; |
| 21694 | |
| 21695 | // First collect all possible candidates |
| 21696 | SmallVector<std::pair<Value *, Value *>, 4> Candidates; |
| 21697 | Candidates.emplace_back(Args&: Op0, Args&: Op1); |
| 21698 | |
| 21699 | auto *A = dyn_cast<BinaryOperator>(Val: Op0); |
| 21700 | auto *B = dyn_cast<BinaryOperator>(Val: Op1); |
| 21701 | // Try to skip B. |
| 21702 | if (A && B && B->hasOneUse()) { |
| 21703 | auto *B0 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 0)); |
| 21704 | auto *B1 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 1)); |
| 21705 | if (B0 && B0->getParent() == P && !R.isDeleted(I: B0)) |
| 21706 | Candidates.emplace_back(Args&: A, Args&: B0); |
| 21707 | if (B1 && B1->getParent() == P && !R.isDeleted(I: B1)) |
| 21708 | Candidates.emplace_back(Args&: A, Args&: B1); |
| 21709 | } |
| 21710 | // Try to skip A. |
| 21711 | if (B && A && A->hasOneUse()) { |
| 21712 | auto *A0 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 0)); |
| 21713 | auto *A1 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 1)); |
| 21714 | if (A0 && A0->getParent() == P && !R.isDeleted(I: A0)) |
| 21715 | Candidates.emplace_back(Args&: A0, Args&: B); |
| 21716 | if (A1 && A1->getParent() == P && !R.isDeleted(I: A1)) |
| 21717 | Candidates.emplace_back(Args&: A1, Args&: B); |
| 21718 | } |
| 21719 | |
| 21720 | if (Candidates.size() == 1) |
| 21721 | return tryToVectorizeList(VL: {Op0, Op1}, R); |
| 21722 | |
| 21723 | // We have multiple options. Try to pick the single best. |
| 21724 | std::optional<int> BestCandidate = R.findBestRootPair(Candidates); |
| 21725 | if (!BestCandidate) |
| 21726 | return false; |
| 21727 | return tryToVectorizeList( |
| 21728 | VL: {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R); |
| 21729 | } |
| 21730 | |
| 21731 | namespace { |
| 21732 | |
| 21733 | /// Model horizontal reductions. |
| 21734 | /// |
| 21735 | /// A horizontal reduction is a tree of reduction instructions that has values |
| 21736 | /// that can be put into a vector as its leaves. For example: |
| 21737 | /// |
| 21738 | /// mul mul mul mul |
| 21739 | /// \ / \ / |
| 21740 | /// + + |
| 21741 | /// \ / |
| 21742 | /// + |
| 21743 | /// This tree has "mul" as its leaf values and "+" as its reduction |
| 21744 | /// instructions. A reduction can feed into a store or a binary operation |
| 21745 | /// feeding a phi. |
| 21746 | /// ... |
| 21747 | /// \ / |
| 21748 | /// + |
| 21749 | /// | |
| 21750 | /// phi += |
| 21751 | /// |
| 21752 | /// Or: |
| 21753 | /// ... |
| 21754 | /// \ / |
| 21755 | /// + |
| 21756 | /// | |
| 21757 | /// *p = |
| 21758 | /// |
| 21759 | class HorizontalReduction { |
| 21760 | using ReductionOpsType = SmallVector<Value *, 16>; |
| 21761 | using ReductionOpsListType = SmallVector<ReductionOpsType, 2>; |
| 21762 | ReductionOpsListType ReductionOps; |
| 21763 | /// List of possibly reduced values. |
| 21764 | SmallVector<SmallVector<Value *>> ReducedVals; |
| 21765 | /// Maps reduced value to the corresponding reduction operation. |
| 21766 | SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps; |
| 21767 | WeakTrackingVH ReductionRoot; |
| 21768 | /// The type of reduction operation. |
| 21769 | RecurKind RdxKind; |
| 21770 | /// Checks if the optimization of original scalar identity operations on |
| 21771 | /// matched horizontal reductions is enabled and allowed. |
| 21772 | bool IsSupportedHorRdxIdentityOp = false; |
| 21773 | /// Contains vector values for reduction including their scale factor and |
| 21774 | /// signedness. |
| 21775 | SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales; |
| 21776 | |
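/// Checks if \p I is a cmp+select min/max pattern, e.g. (illustrative IR):
///   %c = icmp sgt i32 %a, %b
///   %m = select i1 %c, i32 %a, i32 %b   ; smax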
| 21777 | static bool isCmpSelMinMax(Instruction *I) { |
| 21778 | return match(V: I, P: m_Select(C: m_Cmp(), L: m_Value(), R: m_Value())) && |
| 21779 | RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: getRdxKind(V: I)); |
| 21780 | } |
| 21781 | |
| 21782 | // And/or are potentially poison-safe logical patterns like: |
| 21783 | // select x, y, false |
| 21784 | // select x, true, y |
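// In IR form (illustrative):
//   %and = select i1 %x, i1 %y, i1 false   ; logical and
//   %or  = select i1 %x, i1 true, i1 %y    ; logical or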
| 21785 | static bool isBoolLogicOp(Instruction *I) { |
| 21786 | return isa<SelectInst>(Val: I) && |
| 21787 | (match(V: I, P: m_LogicalAnd()) || match(V: I, P: m_LogicalOr())); |
| 21788 | } |
| 21789 | |
| 21790 | /// Checks if instruction is associative and can be vectorized. |
| 21791 | static bool isVectorizable(RecurKind Kind, Instruction *I) { |
| 21792 | if (Kind == RecurKind::None) |
| 21793 | return false; |
| 21794 | |
| 21795 | // Integer ops that map to select instructions or intrinsics are fine. |
| 21796 | if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) || |
| 21797 | isBoolLogicOp(I)) |
| 21798 | return true; |
| 21799 | |
| 21800 | if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) { |
| 21801 | // FP min/max are associative except for NaN and -0.0. We do not |
| 21802 | // have to rule out -0.0 here because the intrinsic semantics do not |
| 21803 | // specify a fixed result for it. |
| 21804 | return I->getFastMathFlags().noNaNs(); |
| 21805 | } |
| 21806 | |
| 21807 | if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum) |
| 21808 | return true; |
| 21809 | |
| 21810 | return I->isAssociative(); |
| 21811 | } |
| 21812 | |
| 21813 | static Value *getRdxOperand(Instruction *I, unsigned Index) { |
| 21814 | // Poison-safe 'or' takes the form: select X, true, Y |
| 21815 | // To make that work with the normal operand processing, we skip the |
| 21816 | // true value operand. |
| 21817 | // TODO: Change the code and data structures to handle this without a hack. |
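// For example, for a poison-safe 'or' (illustrative IR)
//   %or = select i1 %x, i1 true, i1 %y
// requesting operand 1 returns %y (the select's operand 2) instead of the
// constant 'true'.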
| 21818 | if (getRdxKind(V: I) == RecurKind::Or && isa<SelectInst>(Val: I) && Index == 1) |
| 21819 | return I->getOperand(i: 2); |
| 21820 | return I->getOperand(i: Index); |
| 21821 | } |
| 21822 | |
| 21823 | /// Creates reduction operation with the current opcode. |
| 21824 | static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS, |
| 21825 | Value *RHS, const Twine &Name, bool UseSelect) { |
| 21826 | Type *OpTy = LHS->getType(); |
| 21827 | assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type" ); |
| 21828 | switch (Kind) { |
| 21829 | case RecurKind::Or: { |
| 21830 | if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy)) |
| 21831 | return Builder.CreateSelect( |
| 21832 | C: LHS, True: ConstantInt::getAllOnesValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)), |
| 21833 | False: RHS, Name); |
| 21834 | unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); |
| 21835 | return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS, |
| 21836 | Name); |
| 21837 | } |
| 21838 | case RecurKind::And: { |
| 21839 | if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy)) |
| 21840 | return Builder.CreateSelect( |
| 21841 | C: LHS, True: RHS, |
| 21842 | False: ConstantInt::getNullValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)), Name); |
| 21843 | unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); |
| 21844 | return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS, |
| 21845 | Name); |
| 21846 | } |
| 21847 | case RecurKind::Add: |
| 21848 | case RecurKind::Mul: |
| 21849 | case RecurKind::Xor: |
| 21850 | case RecurKind::FAdd: |
| 21851 | case RecurKind::FMul: { |
| 21852 | unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); |
| 21853 | return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS, |
| 21854 | Name); |
| 21855 | } |
| 21856 | case RecurKind::SMax: |
| 21857 | case RecurKind::SMin: |
| 21858 | case RecurKind::UMax: |
| 21859 | case RecurKind::UMin: |
| 21860 | if (UseSelect) { |
| 21861 | CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(RK: Kind); |
| 21862 | Value *Cmp = Builder.CreateICmp(P: Pred, LHS, RHS, Name); |
| 21863 | return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name); |
| 21864 | } |
| 21865 | [[fallthrough]]; |
| 21866 | case RecurKind::FMax: |
| 21867 | case RecurKind::FMin: |
| 21868 | case RecurKind::FMaximum: |
| 21869 | case RecurKind::FMinimum: |
| 21870 | case RecurKind::FMaximumNum: |
| 21871 | case RecurKind::FMinimumNum: { |
| 21872 | Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(RK: Kind); |
| 21873 | return Builder.CreateBinaryIntrinsic(ID: Id, LHS, RHS); |
| 21874 | } |
| 21875 | default: |
| 21876 | llvm_unreachable("Unknown reduction operation." ); |
| 21877 | } |
| 21878 | } |
| 21879 | |
| 21880 | /// Creates reduction operation with the current opcode with the IR flags |
| 21881 | /// from \p ReductionOps, dropping nuw/nsw flags. |
| 21882 | static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS, |
| 21883 | Value *RHS, const Twine &Name, |
| 21884 | const ReductionOpsListType &ReductionOps) { |
| 21885 | bool UseSelect = ReductionOps.size() == 2 || |
| 21886 | // Logical or/and. |
| 21887 | (ReductionOps.size() == 1 && |
| 21888 | any_of(Range: ReductionOps.front(), P: IsaPred<SelectInst>)); |
| 21889 | assert((!UseSelect || ReductionOps.size() != 2 || |
| 21890 | isa<SelectInst>(ReductionOps[1][0])) && |
| 21891 | "Expected cmp + select pairs for reduction" ); |
| 21892 | Value *Op = createOp(Builder, Kind: RdxKind, LHS, RHS, Name, UseSelect); |
| 21893 | if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: RdxKind)) { |
| 21894 | if (auto *Sel = dyn_cast<SelectInst>(Val: Op)) { |
| 21895 | propagateIRFlags(I: Sel->getCondition(), VL: ReductionOps[0], OpValue: nullptr, |
| 21896 | /*IncludeWrapFlags=*/false); |
| 21897 | propagateIRFlags(I: Op, VL: ReductionOps[1], OpValue: nullptr, |
| 21898 | /*IncludeWrapFlags=*/false); |
| 21899 | return Op; |
| 21900 | } |
| 21901 | } |
| 21902 | propagateIRFlags(I: Op, VL: ReductionOps[0], OpValue: nullptr, /*IncludeWrapFlags=*/false); |
| 21903 | return Op; |
| 21904 | } |
| 21905 | |
| 21906 | public: |
| 21907 | static RecurKind getRdxKind(Value *V) { |
| 21908 | auto *I = dyn_cast<Instruction>(Val: V); |
| 21909 | if (!I) |
| 21910 | return RecurKind::None; |
| 21911 | if (match(V: I, P: m_Add(L: m_Value(), R: m_Value()))) |
| 21912 | return RecurKind::Add; |
| 21913 | if (match(V: I, P: m_Mul(L: m_Value(), R: m_Value()))) |
| 21914 | return RecurKind::Mul; |
| 21915 | if (match(V: I, P: m_And(L: m_Value(), R: m_Value())) || |
| 21916 | match(V: I, P: m_LogicalAnd(L: m_Value(), R: m_Value()))) |
| 21917 | return RecurKind::And; |
| 21918 | if (match(V: I, P: m_Or(L: m_Value(), R: m_Value())) || |
| 21919 | match(V: I, P: m_LogicalOr(L: m_Value(), R: m_Value()))) |
| 21920 | return RecurKind::Or; |
| 21921 | if (match(V: I, P: m_Xor(L: m_Value(), R: m_Value()))) |
| 21922 | return RecurKind::Xor; |
| 21923 | if (match(V: I, P: m_FAdd(L: m_Value(), R: m_Value()))) |
| 21924 | return RecurKind::FAdd; |
| 21925 | if (match(V: I, P: m_FMul(L: m_Value(), R: m_Value()))) |
| 21926 | return RecurKind::FMul; |
| 21927 | |
| 21928 | if (match(V: I, P: m_Intrinsic<Intrinsic::maxnum>(Op0: m_Value(), Op1: m_Value()))) |
| 21929 | return RecurKind::FMax; |
| 21930 | if (match(V: I, P: m_Intrinsic<Intrinsic::minnum>(Op0: m_Value(), Op1: m_Value()))) |
| 21931 | return RecurKind::FMin; |
| 21932 | |
| 21933 | if (match(V: I, P: m_FMaximum(Op0: m_Value(), Op1: m_Value()))) |
| 21934 | return RecurKind::FMaximum; |
| 21935 | if (match(V: I, P: m_FMinimum(Op0: m_Value(), Op1: m_Value()))) |
| 21936 | return RecurKind::FMinimum; |
| 21937 | // This matches either cmp+select or intrinsics. SLP is expected to handle |
| 21938 | // either form. |
| 21939 | // TODO: If we are canonicalizing to intrinsics, we can remove several |
| 21940 | // special-case paths that deal with selects. |
| 21941 | if (match(V: I, P: m_SMax(L: m_Value(), R: m_Value()))) |
| 21942 | return RecurKind::SMax; |
| 21943 | if (match(V: I, P: m_SMin(L: m_Value(), R: m_Value()))) |
| 21944 | return RecurKind::SMin; |
| 21945 | if (match(V: I, P: m_UMax(L: m_Value(), R: m_Value()))) |
| 21946 | return RecurKind::UMax; |
| 21947 | if (match(V: I, P: m_UMin(L: m_Value(), R: m_Value()))) |
| 21948 | return RecurKind::UMin; |
| 21949 | |
| 21950 | if (auto *Select = dyn_cast<SelectInst>(Val: I)) { |
| 21951 | // Try harder: look for min/max pattern based on instructions producing |
| 21952 | // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). |
| 21953 | // During the intermediate stages of SLP, it's very common to have |
| 21954 | // pattern like this (since optimizeGatherSequence is run only once |
| 21955 | // at the end): |
| 21956 | // %1 = extractelement <2 x i32> %a, i32 0 |
| 21957 | // %2 = extractelement <2 x i32> %a, i32 1 |
| 21958 | // %cond = icmp sgt i32 %1, %2 |
| 21959 | // %3 = extractelement <2 x i32> %a, i32 0 |
| 21960 | // %4 = extractelement <2 x i32> %a, i32 1 |
| 21961 | // %select = select i1 %cond, i32 %3, i32 %4 |
| 21962 | CmpPredicate Pred; |
| 21963 | Instruction *L1; |
| 21964 | Instruction *L2; |
| 21965 | |
| 21966 | Value *LHS = Select->getTrueValue(); |
| 21967 | Value *RHS = Select->getFalseValue(); |
| 21968 | Value *Cond = Select->getCondition(); |
| 21969 | |
| 21970 | // TODO: Support inverse predicates. |
| 21971 | if (match(V: Cond, P: m_Cmp(Pred, L: m_Specific(V: LHS), R: m_Instruction(I&: L2)))) { |
| 21972 | if (!isa<ExtractElementInst>(Val: RHS) || |
| 21973 | !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS))) |
| 21974 | return RecurKind::None; |
| 21975 | } else if (match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Specific(V: RHS)))) { |
| 21976 | if (!isa<ExtractElementInst>(Val: LHS) || |
| 21977 | !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS))) |
| 21978 | return RecurKind::None; |
| 21979 | } else { |
| 21980 | if (!isa<ExtractElementInst>(Val: LHS) || !isa<ExtractElementInst>(Val: RHS)) |
| 21981 | return RecurKind::None; |
| 21982 | if (!match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Instruction(I&: L2))) || |
| 21983 | !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)) || |
| 21984 | !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS))) |
| 21985 | return RecurKind::None; |
| 21986 | } |
| 21987 | |
| 21988 | switch (Pred) { |
| 21989 | default: |
| 21990 | return RecurKind::None; |
| 21991 | case CmpInst::ICMP_SGT: |
| 21992 | case CmpInst::ICMP_SGE: |
| 21993 | return RecurKind::SMax; |
| 21994 | case CmpInst::ICMP_SLT: |
| 21995 | case CmpInst::ICMP_SLE: |
| 21996 | return RecurKind::SMin; |
| 21997 | case CmpInst::ICMP_UGT: |
| 21998 | case CmpInst::ICMP_UGE: |
| 21999 | return RecurKind::UMax; |
| 22000 | case CmpInst::ICMP_ULT: |
| 22001 | case CmpInst::ICMP_ULE: |
| 22002 | return RecurKind::UMin; |
| 22003 | } |
| 22004 | } |
| 22005 | return RecurKind::None; |
| 22006 | } |
| 22007 | |
| 22008 | /// Get the index of the first operand. |
| 22009 | static unsigned getFirstOperandIndex(Instruction *I) { |
| 22010 | return isCmpSelMinMax(I) ? 1 : 0; |
| 22011 | } |
| 22012 | |
| 22013 | private: |
| 22014 | /// Total number of operands in the reduction operation. |
| 22015 | static unsigned getNumberOfOperands(Instruction *I) { |
| 22016 | return isCmpSelMinMax(I) ? 3 : 2; |
| 22017 | } |
| 22018 | |
| 22019 | /// Checks if the instruction is in basic block \p BB. |
| 22020 | /// For a cmp+sel min/max reduction check that both ops are in \p BB. |
| 22021 | static bool hasSameParent(Instruction *I, BasicBlock *BB) { |
| 22022 | if (isCmpSelMinMax(I) || isBoolLogicOp(I)) { |
| 22023 | auto *Sel = cast<SelectInst>(Val: I); |
| 22024 | auto *Cmp = dyn_cast<Instruction>(Val: Sel->getCondition()); |
| 22025 | return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB; |
| 22026 | } |
| 22027 | return I->getParent() == BB; |
| 22028 | } |
| 22029 | |
| 22030 | /// Expected number of uses for reduction operations/reduced values. |
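/// For a cmp+select min/max chain (illustrative IR), %m0 has exactly two
/// uses (the next cmp and the next select) and %c0 has a single use:
///   %c0 = icmp slt i32 %a, %b
///   %m0 = select i1 %c0, i32 %a, i32 %b
///   %c1 = icmp slt i32 %m0, %x
///   %m1 = select i1 %c1, i32 %m0, i32 %x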
| 22031 | static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) { |
| 22032 | if (IsCmpSelMinMax) { |
// The SelectInst must be used twice, while the condition op must have a
// single use only.
| 22035 | if (auto *Sel = dyn_cast<SelectInst>(Val: I)) |
| 22036 | return Sel->hasNUses(N: 2) && Sel->getCondition()->hasOneUse(); |
| 22037 | return I->hasNUses(N: 2); |
| 22038 | } |
| 22039 | |
| 22040 | // Arithmetic reduction operation must be used once only. |
| 22041 | return I->hasOneUse(); |
| 22042 | } |
| 22043 | |
| 22044 | /// Initializes the list of reduction operations. |
| 22045 | void initReductionOps(Instruction *I) { |
| 22046 | if (isCmpSelMinMax(I)) |
| 22047 | ReductionOps.assign(NumElts: 2, Elt: ReductionOpsType()); |
| 22048 | else |
| 22049 | ReductionOps.assign(NumElts: 1, Elt: ReductionOpsType()); |
| 22050 | } |
| 22051 | |
| 22052 | /// Add all reduction operations for the reduction instruction \p I. |
| 22053 | void addReductionOps(Instruction *I) { |
| 22054 | if (isCmpSelMinMax(I)) { |
| 22055 | ReductionOps[0].emplace_back(Args: cast<SelectInst>(Val: I)->getCondition()); |
| 22056 | ReductionOps[1].emplace_back(Args&: I); |
| 22057 | } else { |
| 22058 | ReductionOps[0].emplace_back(Args&: I); |
| 22059 | } |
| 22060 | } |
| 22061 | |
| 22062 | static bool isGoodForReduction(ArrayRef<Value *> Data) { |
| 22063 | int Sz = Data.size(); |
| 22064 | auto *I = dyn_cast<Instruction>(Val: Data.front()); |
| 22065 | return Sz > 1 || isConstant(V: Data.front()) || |
| 22066 | (I && !isa<LoadInst>(Val: I) && isValidForAlternation(Opcode: I->getOpcode())); |
| 22067 | } |
| 22068 | |
| 22069 | public: |
| 22070 | HorizontalReduction() = default; |
| 22071 | |
| 22072 | /// Try to find a reduction tree. |
| 22073 | bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root, |
| 22074 | ScalarEvolution &SE, const DataLayout &DL, |
| 22075 | const TargetLibraryInfo &TLI) { |
| 22076 | RdxKind = HorizontalReduction::getRdxKind(V: Root); |
| 22077 | if (!isVectorizable(Kind: RdxKind, I: Root)) |
| 22078 | return false; |
| 22079 | |
| 22080 | // Analyze "regular" integer/FP types for reductions - no target-specific |
| 22081 | // types or pointers. |
| 22082 | Type *Ty = Root->getType(); |
| 22083 | if (!isValidElementType(Ty) || Ty->isPointerTy()) |
| 22084 | return false; |
| 22085 | |
// Though the ultimate reduction may have multiple uses, its condition must
// have only a single use.
| 22088 | if (auto *Sel = dyn_cast<SelectInst>(Val: Root)) |
| 22089 | if (!Sel->getCondition()->hasOneUse()) |
| 22090 | return false; |
| 22091 | |
| 22092 | ReductionRoot = Root; |
| 22093 | |
| 22094 | // Iterate through all the operands of the possible reduction tree and |
| 22095 | // gather all the reduced values, sorting them by their value id. |
| 22096 | BasicBlock *BB = Root->getParent(); |
| 22097 | bool IsCmpSelMinMax = isCmpSelMinMax(I: Root); |
| 22098 | SmallVector<std::pair<Instruction *, unsigned>> Worklist( |
| 22099 | 1, std::make_pair(x&: Root, y: 0)); |
| 22100 | // Checks if the operands of the \p TreeN instruction are also reduction |
| 22101 | // operations or should be treated as reduced values or an extra argument, |
| 22102 | // which is not part of the reduction. |
| 22103 | auto CheckOperands = [&](Instruction *TreeN, |
| 22104 | SmallVectorImpl<Value *> &PossibleReducedVals, |
| 22105 | SmallVectorImpl<Instruction *> &ReductionOps, |
| 22106 | unsigned Level) { |
| 22107 | for (int I : reverse(C: seq<int>(Begin: getFirstOperandIndex(I: TreeN), |
| 22108 | End: getNumberOfOperands(I: TreeN)))) { |
| 22109 | Value *EdgeVal = getRdxOperand(I: TreeN, Index: I); |
| 22110 | ReducedValsToOps[EdgeVal].push_back(Elt: TreeN); |
| 22111 | auto *EdgeInst = dyn_cast<Instruction>(Val: EdgeVal); |
// If the edge is not an instruction, differs from the main reduction
// opcode, or has too many uses, treat it as a possible reduced value.
// Also, do not try to reduce constant values if the operation is not
// foldable.
| 22116 | if (!EdgeInst || Level > RecursionMaxDepth || |
| 22117 | getRdxKind(V: EdgeInst) != RdxKind || |
| 22118 | IsCmpSelMinMax != isCmpSelMinMax(I: EdgeInst) || |
| 22119 | !hasRequiredNumberOfUses(IsCmpSelMinMax, I: EdgeInst) || |
| 22120 | !isVectorizable(Kind: RdxKind, I: EdgeInst) || |
| 22121 | (R.isAnalyzedReductionRoot(I: EdgeInst) && |
| 22122 | all_of(Range: EdgeInst->operands(), P: IsaPred<Constant>))) { |
| 22123 | PossibleReducedVals.push_back(Elt: EdgeVal); |
| 22124 | continue; |
| 22125 | } |
| 22126 | ReductionOps.push_back(Elt: EdgeInst); |
| 22127 | } |
| 22128 | }; |
// Try to regroup the reduced values so that it becomes more profitable to
// reduce them. Values are grouped by their value ids, instructions by their
// opcode and/or alternate opcode, with extra analysis for loads (grouping
// them by the distance between pointers) and cmp instructions (grouping
// them by the predicate).
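// Illustrative example (hypothetical IR): given the reduced values
//   %l0 = load i32, ptr %p
//   %l1 = load i32, ptr %q
//   %m0 = mul i32 %a, %b
//   %m1 = mul i32 %c, %d
// the loads are grouped together (with extra pointer-based analysis) and the
// muls are grouped by their opcode, so each group can be tried as its own
// vectorizable sequence.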
| 22134 | SmallMapVector< |
| 22135 | size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>, |
| 22136 | 8> |
| 22137 | PossibleReducedVals; |
| 22138 | initReductionOps(I: Root); |
| 22139 | DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap; |
| 22140 | SmallSet<size_t, 2> LoadKeyUsed; |
| 22141 | |
| 22142 | auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) { |
| 22143 | Key = hash_combine(args: hash_value(ptr: LI->getParent()), args: Key); |
| 22144 | Value *Ptr = |
| 22145 | getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth); |
| 22146 | if (!LoadKeyUsed.insert(V: Key).second) { |
| 22147 | auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr)); |
| 22148 | if (LIt != LoadsMap.end()) { |
| 22149 | for (LoadInst *RLI : LIt->second) { |
| 22150 | if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(), |
| 22151 | ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL, SE, |
| 22152 | /*StrictCheck=*/true)) |
| 22153 | return hash_value(ptr: RLI->getPointerOperand()); |
| 22154 | } |
| 22155 | for (LoadInst *RLI : LIt->second) { |
| 22156 | if (arePointersCompatible(Ptr1: RLI->getPointerOperand(), |
| 22157 | Ptr2: LI->getPointerOperand(), TLI)) { |
| 22158 | hash_code SubKey = hash_value(ptr: RLI->getPointerOperand()); |
| 22159 | return SubKey; |
| 22160 | } |
| 22161 | } |
| 22162 | if (LIt->second.size() > 2) { |
| 22163 | hash_code SubKey = |
| 22164 | hash_value(ptr: LIt->second.back()->getPointerOperand()); |
| 22165 | return SubKey; |
| 22166 | } |
| 22167 | } |
| 22168 | } |
| 22169 | LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr)) |
| 22170 | .first->second.push_back(Elt: LI); |
| 22171 | return hash_value(ptr: LI->getPointerOperand()); |
| 22172 | }; |
| 22173 | |
| 22174 | while (!Worklist.empty()) { |
| 22175 | auto [TreeN, Level] = Worklist.pop_back_val(); |
| 22176 | SmallVector<Value *> PossibleRedVals; |
| 22177 | SmallVector<Instruction *> PossibleReductionOps; |
| 22178 | CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level); |
| 22179 | addReductionOps(I: TreeN); |
| 22180 | // Add reduction values. The values are sorted for better vectorization |
| 22181 | // results. |
| 22182 | for (Value *V : PossibleRedVals) { |
| 22183 | size_t Key, Idx; |
| 22184 | std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey, |
| 22185 | /*AllowAlternate=*/false); |
| 22186 | ++PossibleReducedVals[Key][Idx] |
| 22187 | .insert(KV: std::make_pair(x&: V, y: 0)) |
| 22188 | .first->second; |
| 22189 | } |
| 22190 | for (Instruction *I : reverse(C&: PossibleReductionOps)) |
| 22191 | Worklist.emplace_back(Args&: I, Args: I->getParent() == BB ? 0 : Level + 1); |
| 22192 | } |
| 22193 | auto PossibleReducedValsVect = PossibleReducedVals.takeVector(); |
// Sort values by the total number of value kinds so that the reduction
// starts from the longest possible sequences of reduced values.
| 22196 | for (auto &PossibleReducedVals : PossibleReducedValsVect) { |
| 22197 | auto PossibleRedVals = PossibleReducedVals.second.takeVector(); |
| 22198 | SmallVector<SmallVector<Value *>> PossibleRedValsVect; |
| 22199 | for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end(); |
| 22200 | It != E; ++It) { |
| 22201 | PossibleRedValsVect.emplace_back(); |
| 22202 | auto RedValsVect = It->second.takeVector(); |
| 22203 | stable_sort(Range&: RedValsVect, C: llvm::less_second()); |
| 22204 | for (const std::pair<Value *, unsigned> &Data : RedValsVect) |
| 22205 | PossibleRedValsVect.back().append(NumInputs: Data.second, Elt: Data.first); |
| 22206 | } |
| 22207 | stable_sort(Range&: PossibleRedValsVect, C: [](const auto &P1, const auto &P2) { |
| 22208 | return P1.size() > P2.size(); |
| 22209 | }); |
| 22210 | int NewIdx = -1; |
| 22211 | for (ArrayRef<Value *> Data : PossibleRedValsVect) { |
| 22212 | if (NewIdx < 0 || |
| 22213 | (!isGoodForReduction(Data) && |
| 22214 | (!isa<LoadInst>(Val: Data.front()) || |
| 22215 | !isa<LoadInst>(Val: ReducedVals[NewIdx].front()) || |
| 22216 | getUnderlyingObject( |
| 22217 | V: cast<LoadInst>(Val: Data.front())->getPointerOperand()) != |
| 22218 | getUnderlyingObject( |
| 22219 | V: cast<LoadInst>(Val: ReducedVals[NewIdx].front()) |
| 22220 | ->getPointerOperand())))) { |
| 22221 | NewIdx = ReducedVals.size(); |
| 22222 | ReducedVals.emplace_back(); |
| 22223 | } |
| 22224 | ReducedVals[NewIdx].append(in_start: Data.rbegin(), in_end: Data.rend()); |
| 22225 | } |
| 22226 | } |
// Sort the groups of reduced values by the number of values with the same or
// alternate opcode and/or the same pointer operand.
| 22229 | stable_sort(Range&: ReducedVals, C: [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) { |
| 22230 | return P1.size() > P2.size(); |
| 22231 | }); |
| 22232 | return true; |
| 22233 | } |
| 22234 | |
| 22235 | /// Attempt to vectorize the tree found by matchAssociativeReduction. |
| 22236 | Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI, |
| 22237 | const TargetLibraryInfo &TLI, AssumptionCache *AC) { |
| 22238 | const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4; |
| 22239 | constexpr unsigned RegMaxNumber = 4; |
| 22240 | constexpr unsigned RedValsMaxNumber = 128; |
| 22241 | // If there are a sufficient number of reduction values, reduce |
| 22242 | // to a nearby power-of-2. We can safely generate oversized |
| 22243 | // vectors and rely on the backend to split them to legal sizes. |
| 22244 | if (unsigned NumReducedVals = std::accumulate( |
| 22245 | first: ReducedVals.begin(), last: ReducedVals.end(), init: 0, |
| 22246 | binary_op: [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned { |
| 22247 | if (!isGoodForReduction(Data: Vals)) |
| 22248 | return Num; |
| 22249 | return Num + Vals.size(); |
| 22250 | }); |
| 22251 | NumReducedVals < ReductionLimit && |
| 22252 | all_of(Range&: ReducedVals, P: [](ArrayRef<Value *> RedV) { |
| 22253 | return RedV.size() < 2 || !allConstant(VL: RedV) || !isSplat(VL: RedV); |
| 22254 | })) { |
| 22255 | for (ReductionOpsType &RdxOps : ReductionOps) |
| 22256 | for (Value *RdxOp : RdxOps) |
| 22257 | V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp)); |
| 22258 | return nullptr; |
| 22259 | } |
| 22260 | |
| 22261 | IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(), |
| 22262 | TargetFolder(DL)); |
| 22263 | Builder.SetInsertPoint(cast<Instruction>(Val&: ReductionRoot)); |
| 22264 | |
// Track the reduced values in case they are replaced by extractelement
// instructions because of the vectorization.
| 22267 | DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() * |
| 22268 | ReducedVals.front().size()); |
| 22269 | |
| 22270 | // The compare instruction of a min/max is the insertion point for new |
| 22271 | // instructions and may be replaced with a new compare instruction. |
| 22272 | auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) { |
| 22273 | assert(isa<SelectInst>(RdxRootInst) && |
| 22274 | "Expected min/max reduction to have select root instruction" ); |
| 22275 | Value *ScalarCond = cast<SelectInst>(Val: RdxRootInst)->getCondition(); |
| 22276 | assert(isa<Instruction>(ScalarCond) && |
| 22277 | "Expected min/max reduction to have compare condition" ); |
| 22278 | return cast<Instruction>(Val: ScalarCond); |
| 22279 | }; |
| 22280 | |
| 22281 | bool AnyBoolLogicOp = any_of(Range&: ReductionOps.back(), P: [](Value *V) { |
| 22282 | return isBoolLogicOp(I: cast<Instruction>(Val: V)); |
| 22283 | }); |
| 22284 | // Return new VectorizedTree, based on previous value. |
| 22285 | auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) { |
| 22286 | if (VectorizedTree) { |
| 22287 | // Update the final value in the reduction. |
| 22288 | Builder.SetCurrentDebugLocation( |
| 22289 | cast<Instruction>(Val: ReductionOps.front().front())->getDebugLoc()); |
| 22290 | if (AnyBoolLogicOp) { |
| 22291 | auto It = ReducedValsToOps.find(Val: VectorizedTree); |
| 22292 | auto It1 = ReducedValsToOps.find(Val: Res); |
| 22293 | if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) || |
| 22294 | isGuaranteedNotToBePoison(V: VectorizedTree, AC) || |
| 22295 | (It != ReducedValsToOps.end() && |
| 22296 | any_of(Range&: It->getSecond(), P: [&](Instruction *I) { |
| 22297 | return isBoolLogicOp(I) && |
| 22298 | getRdxOperand(I, Index: 0) == VectorizedTree; |
| 22299 | }))) { |
| 22300 | ; |
| 22301 | } else if (isGuaranteedNotToBePoison(V: Res, AC) || |
| 22302 | (It1 != ReducedValsToOps.end() && |
| 22303 | any_of(Range&: It1->getSecond(), P: [&](Instruction *I) { |
| 22304 | return isBoolLogicOp(I) && getRdxOperand(I, Index: 0) == Res; |
| 22305 | }))) { |
| 22306 | std::swap(a&: VectorizedTree, b&: Res); |
| 22307 | } else { |
| 22308 | VectorizedTree = Builder.CreateFreeze(V: VectorizedTree); |
| 22309 | } |
| 22310 | } |
| 22311 | |
| 22312 | return createOp(Builder, RdxKind, LHS: VectorizedTree, RHS: Res, Name: "op.rdx" , |
| 22313 | ReductionOps); |
| 22314 | } |
| 22315 | // Initialize the final value in the reduction. |
| 22316 | return Res; |
| 22317 | }; |
| 22318 | SmallDenseSet<Value *> IgnoreList(ReductionOps.size() * |
| 22319 | ReductionOps.front().size()); |
| 22320 | for (ReductionOpsType &RdxOps : ReductionOps) |
| 22321 | for (Value *RdxOp : RdxOps) { |
| 22322 | if (!RdxOp) |
| 22323 | continue; |
| 22324 | IgnoreList.insert(V: RdxOp); |
| 22325 | } |
| 22326 | // Intersect the fast-math-flags from all reduction operations. |
| 22327 | FastMathFlags RdxFMF; |
| 22328 | RdxFMF.set(); |
| 22329 | for (Value *U : IgnoreList) |
| 22330 | if (auto *FPMO = dyn_cast<FPMathOperator>(Val: U)) |
| 22331 | RdxFMF &= FPMO->getFastMathFlags(); |
| 22332 | bool IsCmpSelMinMax = isCmpSelMinMax(I: cast<Instruction>(Val&: ReductionRoot)); |
| 22333 | |
| 22334 | // Need to track reduced vals, they may be changed during vectorization of |
| 22335 | // subvectors. |
| 22336 | for (ArrayRef<Value *> Candidates : ReducedVals) |
| 22337 | for (Value *V : Candidates) |
| 22338 | TrackedVals.try_emplace(Key: V, Args&: V); |
| 22339 | |
| 22340 | auto At = [](SmallMapVector<Value *, unsigned, 16> &MV, |
| 22341 | Value *V) -> unsigned & { |
| 22342 | auto *It = MV.find(Key: V); |
| 22343 | assert(It != MV.end() && "Unable to find given key." ); |
| 22344 | return It->second; |
| 22345 | }; |
| 22346 | |
| 22347 | DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size()); |
| 22348 | // List of the values that were reduced in other trees as part of gather |
| 22349 | // nodes and thus requiring extract if fully vectorized in other trees. |
SmallPtrSet<Value *, 4> RequiredExtract;
| 22351 | WeakTrackingVH VectorizedTree = nullptr; |
| 22352 | bool CheckForReusedReductionOps = false; |
| 22353 | // Try to vectorize elements based on their type. |
| 22354 | SmallVector<InstructionsState> States; |
| 22355 | for (ArrayRef<Value *> RV : ReducedVals) |
| 22356 | States.push_back(Elt: getSameOpcode(VL: RV, TLI)); |
| 22357 | for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) { |
| 22358 | ArrayRef<Value *> OrigReducedVals = ReducedVals[I]; |
| 22359 | InstructionsState S = States[I]; |
| 22360 | SmallVector<Value *> Candidates; |
| 22361 | Candidates.reserve(N: 2 * OrigReducedVals.size()); |
| 22362 | DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size()); |
| 22363 | for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) { |
| 22364 | Value *RdxVal = TrackedVals.at(Val: OrigReducedVals[Cnt]); |
// Check if the reduction value was overridden by an extractelement
// instruction because of the vectorization, and exclude it if it is not
// compatible with the other values.
// Also check if the instruction was folded to a constant or another value.
| 22369 | auto *Inst = dyn_cast<Instruction>(Val: RdxVal); |
| 22370 | if ((Inst && isVectorLikeInstWithConstOps(V: Inst) && |
| 22371 | (!S || !S.getMatchingMainOpOrAltOp(I: Inst))) || |
| 22372 | (S && !Inst)) |
| 22373 | continue; |
| 22374 | Candidates.push_back(Elt: RdxVal); |
| 22375 | TrackedToOrig.try_emplace(Key: RdxVal, Args: OrigReducedVals[Cnt]); |
| 22376 | } |
bool ShuffledExtracts = false;
| 22378 | // Try to handle shuffled extractelements. |
| 22379 | if (S && S.getOpcode() == Instruction::ExtractElement && |
| 22380 | !S.isAltShuffle() && I + 1 < E) { |
| 22381 | SmallVector<Value *> CommonCandidates(Candidates); |
| 22382 | for (Value *RV : ReducedVals[I + 1]) { |
| 22383 | Value *RdxVal = TrackedVals.at(Val: RV); |
// Check if the reduction value was overridden by the
// extractelement instruction because of the vectorization, and
// exclude it if it is not compatible with the other values.
| 22387 | auto *Inst = dyn_cast<ExtractElementInst>(Val: RdxVal); |
| 22388 | if (!Inst) |
| 22389 | continue; |
| 22390 | CommonCandidates.push_back(Elt: RdxVal); |
| 22391 | TrackedToOrig.try_emplace(Key: RdxVal, Args&: RV); |
| 22392 | } |
| 22393 | SmallVector<int> Mask; |
| 22394 | if (isFixedVectorShuffle(VL: CommonCandidates, Mask, AC)) { |
| 22395 | ++I; |
| 22396 | Candidates.swap(RHS&: CommonCandidates); |
| 22397 | ShuffledExtracts = true; |
| 22398 | } |
| 22399 | } |
| 22400 | |
| 22401 | // Emit code for constant values. |
| 22402 | if (Candidates.size() > 1 && allConstant(VL: Candidates)) { |
| 22403 | Value *Res = Candidates.front(); |
| 22404 | Value *OrigV = TrackedToOrig.at(Val: Candidates.front()); |
| 22405 | ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond(); |
| 22406 | for (Value *VC : ArrayRef(Candidates).drop_front()) { |
| 22407 | Res = createOp(Builder, RdxKind, LHS: Res, RHS: VC, Name: "const.rdx" , ReductionOps); |
| 22408 | Value *OrigV = TrackedToOrig.at(Val: VC); |
| 22409 | ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond(); |
| 22410 | if (auto *ResI = dyn_cast<Instruction>(Val: Res)) |
| 22411 | V.analyzedReductionRoot(I: ResI); |
| 22412 | } |
| 22413 | VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res); |
| 22414 | continue; |
| 22415 | } |
| 22416 | |
| 22417 | unsigned NumReducedVals = Candidates.size(); |
| 22418 | if (NumReducedVals < ReductionLimit && |
| 22419 | (NumReducedVals < 2 || !isSplat(VL: Candidates))) |
| 22420 | continue; |
| 22421 | |
| 22422 | // Check if we support repeated scalar values processing (optimization of |
| 22423 | // original scalar identity operations on matched horizontal reductions). |
| 22424 | IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul && |
| 22425 | RdxKind != RecurKind::FMul && |
| 22426 | RdxKind != RecurKind::FMulAdd; |
| 22427 | // Gather same values. |
| 22428 | SmallMapVector<Value *, unsigned, 16> SameValuesCounter; |
| 22429 | if (IsSupportedHorRdxIdentityOp) |
| 22430 | for (Value *V : Candidates) { |
| 22431 | Value *OrigV = TrackedToOrig.at(Val: V); |
| 22432 | ++SameValuesCounter.try_emplace(Key: OrigV).first->second; |
| 22433 | } |
// Used to check if the reduced values are used the same number of times. In
// this case the compiler may produce better code. E.g. if the reduced values
// are aabbccdd (8 x values), then the first node of the tree will have a
// node for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
// Plus, the final reduction will be performed on <8 x aabbccdd>.
// Instead, the compiler may build the <4 x abcd> tree immediately and then
// perform the reduction as (4 x abcd) * 2.
// Currently this only handles add/fadd/xor. and/or/min/max do not require
// this analysis; other operations may require an extra estimation of
// profitability.
| 22444 | bool SameScaleFactor = false; |
| 22445 | bool OptReusedScalars = IsSupportedHorRdxIdentityOp && |
| 22446 | SameValuesCounter.size() != Candidates.size(); |
| 22447 | BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues; |
| 22448 | if (OptReusedScalars) { |
| 22449 | SameScaleFactor = |
| 22450 | (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd || |
| 22451 | RdxKind == RecurKind::Xor) && |
| 22452 | all_of(Range: drop_begin(RangeOrContainer&: SameValuesCounter), |
| 22453 | P: [&SameValuesCounter](const std::pair<Value *, unsigned> &P) { |
| 22454 | return P.second == SameValuesCounter.front().second; |
| 22455 | }); |
| 22456 | Candidates.resize(N: SameValuesCounter.size()); |
| 22457 | transform(Range&: SameValuesCounter, d_first: Candidates.begin(), |
| 22458 | F: [&](const auto &P) { return TrackedVals.at(Val: P.first); }); |
| 22459 | NumReducedVals = Candidates.size(); |
| 22460 | // Have a reduction of the same element. |
| 22461 | if (NumReducedVals == 1) { |
| 22462 | Value *OrigV = TrackedToOrig.at(Val: Candidates.front()); |
| 22463 | unsigned Cnt = At(SameValuesCounter, OrigV); |
| 22464 | Value *RedVal = |
| 22465 | emitScaleForReusedOps(VectorizedValue: Candidates.front(), Builder, Cnt); |
| 22466 | VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); |
| 22467 | VectorizedVals.try_emplace(Key: OrigV, Args&: Cnt); |
| 22468 | ExternallyUsedValues.insert(V: OrigV); |
| 22469 | continue; |
| 22470 | } |
| 22471 | } |
| 22472 | |
| 22473 | unsigned MaxVecRegSize = V.getMaxVecRegSize(); |
| 22474 | unsigned EltSize = V.getVectorElementSize(V: Candidates[0]); |
| 22475 | const unsigned MaxElts = std::clamp<unsigned>( |
| 22476 | val: llvm::bit_floor(Value: MaxVecRegSize / EltSize), lo: RedValsMaxNumber, |
| 22477 | hi: RegMaxNumber * RedValsMaxNumber); |
| 22478 | |
| 22479 | unsigned ReduxWidth = NumReducedVals; |
| 22480 | auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) { |
| 22481 | unsigned NumParts, NumRegs; |
| 22482 | Type *ScalarTy = Candidates.front()->getType(); |
| 22483 | ReduxWidth = |
| 22484 | getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: ReduxWidth); |
| 22485 | VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth); |
| 22486 | NumParts = ::getNumberOfParts(TTI, VecTy: Tp); |
| 22487 | NumRegs = |
| 22488 | TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp)); |
| 22489 | while (NumParts > NumRegs) { |
| 22490 | assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0." ); |
| 22491 | ReduxWidth = bit_floor(Value: ReduxWidth - 1); |
| 22492 | VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth); |
| 22493 | NumParts = ::getNumberOfParts(TTI, VecTy: Tp); |
| 22494 | NumRegs = |
| 22495 | TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp)); |
| 22496 | } |
| 22497 | if (NumParts > NumRegs / 2) |
| 22498 | ReduxWidth = bit_floor(Value: ReduxWidth); |
| 22499 | return ReduxWidth; |
| 22500 | }; |
| 22501 | if (!VectorizeNonPowerOf2 || !has_single_bit(Value: ReduxWidth + 1)) |
| 22502 | ReduxWidth = GetVectorFactor(ReduxWidth); |
| 22503 | ReduxWidth = std::min(a: ReduxWidth, b: MaxElts); |
| 22504 | |
| 22505 | unsigned Start = 0; |
| 22506 | unsigned Pos = Start; |
| 22507 | // Restarts vectorization attempt with lower vector factor. |
| 22508 | unsigned PrevReduxWidth = ReduxWidth; |
| 22509 | bool CheckForReusedReductionOpsLocal = false; |
| 22510 | auto AdjustReducedVals = [&](bool IgnoreVL = false) { |
| 22511 | bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(Vals: IgnoreList); |
| 22512 | if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) { |
// Check if any of the reduction ops are gathered. If so, it is worth
// trying again with a smaller number of reduction ops.
| 22515 | CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered; |
| 22516 | } |
| 22517 | ++Pos; |
| 22518 | if (Pos < NumReducedVals - ReduxWidth + 1) |
| 22519 | return IsAnyRedOpGathered; |
| 22520 | Pos = Start; |
| 22521 | --ReduxWidth; |
| 22522 | if (ReduxWidth > 1) |
| 22523 | ReduxWidth = GetVectorFactor(ReduxWidth); |
| 22524 | return IsAnyRedOpGathered; |
| 22525 | }; |
| 22526 | bool AnyVectorized = false; |
| 22527 | SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates; |
| 22528 | while (Pos < NumReducedVals - ReduxWidth + 1 && |
| 22529 | ReduxWidth >= ReductionLimit) { |
| 22530 | // Dependency in tree of the reduction ops - drop this attempt, try |
| 22531 | // later. |
| 22532 | if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth && |
| 22533 | Start == 0) { |
| 22534 | CheckForReusedReductionOps = true; |
| 22535 | break; |
| 22536 | } |
| 22537 | PrevReduxWidth = ReduxWidth; |
| 22538 | ArrayRef<Value *> VL(std::next(x: Candidates.begin(), n: Pos), ReduxWidth); |
| 22539 | // Been analyzed already - skip. |
| 22540 | if (IgnoredCandidates.contains(V: std::make_pair(x&: Pos, y&: ReduxWidth)) || |
| 22541 | (!has_single_bit(Value: ReduxWidth) && |
| 22542 | (IgnoredCandidates.contains( |
| 22543 | V: std::make_pair(x&: Pos, y: bit_floor(Value: ReduxWidth))) || |
| 22544 | IgnoredCandidates.contains( |
| 22545 | V: std::make_pair(x: Pos + (ReduxWidth - bit_floor(Value: ReduxWidth)), |
| 22546 | y: bit_floor(Value: ReduxWidth))))) || |
| 22547 | V.areAnalyzedReductionVals(VL)) { |
| 22548 | (void)AdjustReducedVals(/*IgnoreVL=*/true); |
| 22549 | continue; |
| 22550 | } |
| 22551 | // Early exit if any of the reduction values were deleted during |
| 22552 | // previous vectorization attempts. |
| 22553 | if (any_of(Range&: VL, P: [&V](Value *RedVal) { |
| 22554 | auto *RedValI = dyn_cast<Instruction>(Val: RedVal); |
| 22555 | if (!RedValI) |
| 22556 | return false; |
| 22557 | return V.isDeleted(I: RedValI); |
| 22558 | })) |
| 22559 | break; |
| 22560 | V.buildTree(Roots: VL, UserIgnoreLst: IgnoreList); |
| 22561 | if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) { |
| 22562 | if (!AdjustReducedVals()) |
| 22563 | V.analyzedReductionVals(VL); |
| 22564 | continue; |
| 22565 | } |
| 22566 | if (V.isLoadCombineReductionCandidate(RdxKind)) { |
| 22567 | if (!AdjustReducedVals()) |
| 22568 | V.analyzedReductionVals(VL); |
| 22569 | continue; |
| 22570 | } |
| 22571 | V.reorderTopToBottom(); |
| 22572 | // No need to reorder the root node at all. |
| 22573 | V.reorderBottomToTop(/*IgnoreReorder=*/true); |
| 22574 | // Keep the other extracted reduction values if they are used in the |
| 22575 | // vectorization trees. |
| 22576 | BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues( |
| 22577 | ExternallyUsedValues); |
| 22578 | // The reduction root is used as the insertion point for new |
| 22579 | // instructions, so set it as externally used to prevent it from being |
| 22580 | // deleted. |
| 22581 | LocalExternallyUsedValues.insert(V: ReductionRoot); |
| 22582 | for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) { |
| 22583 | if (Cnt == I || (ShuffledExtracts && Cnt == I - 1)) |
| 22584 | continue; |
| 22585 | for (Value *V : ReducedVals[Cnt]) |
| 22586 | if (isa<Instruction>(Val: V)) |
| 22587 | LocalExternallyUsedValues.insert(V: TrackedVals[V]); |
| 22588 | } |
| 22589 | if (!IsSupportedHorRdxIdentityOp) { |
| 22590 | // Number of uses of the candidates in the vector of values. |
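| | // (Illustrative) If the original value %x occurs three times among the |
| | // candidates outside the current [Pos, Pos + ReduxWidth) window, its |
| | // counter becomes 3; that count feeds the externally-used check below. |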
| 22591 | assert(SameValuesCounter.empty() && |
| 22592 | "Reused values counter map is not empty" ); |
| 22593 | for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) { |
| 22594 | if (Cnt >= Pos && Cnt < Pos + ReduxWidth) |
| 22595 | continue; |
| 22596 | Value *V = Candidates[Cnt]; |
| 22597 | Value *OrigV = TrackedToOrig.at(Val: V); |
| 22598 | ++SameValuesCounter.try_emplace(Key: OrigV).first->second; |
| 22599 | } |
| 22600 | } |
| 22601 | V.transformNodes(); |
| 22602 | SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL); |
| 22603 | // Gather externally used values. |
| 22604 | SmallPtrSet<Value *, 4> Visited; |
| 22605 | for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) { |
| 22606 | if (Cnt >= Pos && Cnt < Pos + ReduxWidth) |
| 22607 | continue; |
| 22608 | Value *RdxVal = Candidates[Cnt]; |
| 22609 | if (auto It = TrackedVals.find(Val: RdxVal); It != TrackedVals.end()) |
| 22610 | RdxVal = It->second; |
| 22611 | if (!Visited.insert(Ptr: RdxVal).second) |
| 22612 | continue; |
| 22613 | // Check if the scalar was vectorized as part of the vectorization |
| 22614 | // tree but not the top node. |
| 22615 | if (!VLScalars.contains(Ptr: RdxVal) && V.isVectorized(V: RdxVal)) { |
| 22616 | LocalExternallyUsedValues.insert(V: RdxVal); |
| 22617 | continue; |
| 22618 | } |
| 22619 | Value *OrigV = TrackedToOrig.at(Val: RdxVal); |
| 22620 | unsigned NumOps = |
| 22621 | VectorizedVals.lookup(Val: OrigV) + At(SameValuesCounter, OrigV); |
| 22622 | if (NumOps != ReducedValsToOps.at(Val: OrigV).size()) |
| 22623 | LocalExternallyUsedValues.insert(V: RdxVal); |
| 22624 | } |
| 22625 | // Do not need the list of reused scalars in regular mode anymore. |
| 22626 | if (!IsSupportedHorRdxIdentityOp) |
| 22627 | SameValuesCounter.clear(); |
| 22628 | for (Value *RdxVal : VL) |
| 22629 | if (RequiredExtract.contains(Ptr: RdxVal)) |
| 22630 | LocalExternallyUsedValues.insert(V: RdxVal); |
| 22631 | V.buildExternalUses(ExternallyUsedValues: LocalExternallyUsedValues); |
| 22632 | |
| 22633 | V.computeMinimumValueSizes(); |
| 22634 | |
| 22635 | // Estimate cost. |
| 22636 | InstructionCost ReductionCost = |
| 22637 | getReductionCost(TTI, ReducedVals: VL, IsCmpSelMinMax, FMF: RdxFMF, R: V); |
| 22638 | InstructionCost Cost = V.getTreeCost(VectorizedVals: VL, ReductionCost); |
| 22639 | LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost |
| 22640 | << " for reduction\n" ); |
| 22641 | if (!Cost.isValid()) |
| 22642 | break; |
| 22643 | if (Cost >= -SLPCostThreshold) { |
| 22644 | V.getORE()->emit(RemarkBuilder: [&]() { |
| 22645 | return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial" , |
| 22646 | ReducedValsToOps.at(Val: VL[0]).front()) |
| 22647 | << "Vectorizing horizontal reduction is possible " |
| 22648 | << "but not beneficial with cost " << ore::NV("Cost" , Cost) |
| 22649 | << " and threshold " |
| 22650 | << ore::NV("Threshold" , -SLPCostThreshold); |
| 22651 | }); |
| 22652 | if (!AdjustReducedVals()) { |
| 22653 | V.analyzedReductionVals(VL); |
| 22654 | unsigned Offset = Pos == Start ? Pos : Pos - 1; |
| 22655 | if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) { |
| 22656 | // Add subvectors of VL to the list of the analyzed values. |
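| | // E.g. (illustrative): for ReduxWidth == 16 and ReductionLimit == 4 the |
| | // loop below may record the start offsets of the 8- and 4-wide |
| | // full-vector subranges of VL so they are not re-analyzed on later |
| | // attempts. |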
| 22657 | for (unsigned VF = getFloorFullVectorNumberOfElements( |
| 22658 | TTI: *TTI, Ty: VL.front()->getType(), Sz: ReduxWidth - 1); |
| 22659 | VF >= ReductionLimit; |
| 22660 | VF = getFloorFullVectorNumberOfElements( |
| 22661 | TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) { |
| 22662 | if (has_single_bit(Value: VF) && |
| 22663 | V.getCanonicalGraphSize() != V.getTreeSize()) |
| 22664 | continue; |
| 22665 | for (unsigned Idx : seq<unsigned>(Size: ReduxWidth - VF)) |
| 22666 | IgnoredCandidates.insert(V: std::make_pair(x: Offset + Idx, y&: VF)); |
| 22667 | } |
| 22668 | } |
| 22669 | } |
| 22670 | continue; |
| 22671 | } |
| 22672 | |
| 22673 | LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" |
| 22674 | << Cost << ". (HorRdx)\n" ); |
| 22675 | V.getORE()->emit(RemarkBuilder: [&]() { |
| 22676 | return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction" , |
| 22677 | ReducedValsToOps.at(Val: VL[0]).front()) |
| 22678 | << "Vectorized horizontal reduction with cost " |
| 22679 | << ore::NV("Cost" , Cost) << " and with tree size " |
| 22680 | << ore::NV("TreeSize" , V.getTreeSize()); |
| 22681 | }); |
| 22682 | |
| 22683 | Builder.setFastMathFlags(RdxFMF); |
| 22684 | |
| 22685 | // Emit a reduction. If the root is a select (min/max idiom), the insert |
| 22686 | // point is the compare condition of that select. |
| 22687 | Instruction *RdxRootInst = cast<Instruction>(Val&: ReductionRoot); |
| 22688 | Instruction *InsertPt = RdxRootInst; |
| 22689 | if (IsCmpSelMinMax) |
| 22690 | InsertPt = GetCmpForMinMaxReduction(RdxRootInst); |
| 22691 | |
| 22692 | // Vectorize a tree. |
| 22693 | Value *VectorizedRoot = V.vectorizeTree( |
| 22694 | ExternallyUsedValues: LocalExternallyUsedValues, ReductionRoot: InsertPt, VectorValuesAndScales); |
| 22695 | // Update TrackedToOrig mapping, since the tracked values might be |
| 22696 | // updated. |
| 22697 | for (Value *RdxVal : Candidates) { |
| 22698 | Value *OrigVal = TrackedToOrig.at(Val: RdxVal); |
| 22699 | Value *TransformedRdxVal = TrackedVals.at(Val: OrigVal); |
| 22700 | if (TransformedRdxVal != RdxVal) |
| 22701 | TrackedToOrig.try_emplace(Key: TransformedRdxVal, Args&: OrigVal); |
| 22702 | } |
| 22703 | |
| 22704 | Builder.SetInsertPoint(InsertPt); |
| 22705 | |
| 22706 | // To prevent poison from leaking across what used to be sequential, |
| 22707 | // safe, scalar boolean logic operations, the reduction operand must be |
| 22708 | // frozen. |
| 22709 | if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(V: VectorizedRoot, AC)) |
| 22710 | VectorizedRoot = Builder.CreateFreeze(V: VectorizedRoot); |
| 22711 | |
| 22712 | // Emit code to correctly handle reused reduced values, if required. |
| 22713 | if (OptReusedScalars && !SameScaleFactor) { |
| 22714 | VectorizedRoot = emitReusedOps(VectorizedValue: VectorizedRoot, Builder, R&: V, |
| 22715 | SameValuesCounter, TrackedToOrig); |
| 22716 | } |
| 22717 | |
| 22718 | Type *ScalarTy = VL.front()->getType(); |
| 22719 | Type *VecTy = VectorizedRoot->getType(); |
| 22720 | Type *RedScalarTy = VecTy->getScalarType(); |
| 22721 | VectorValuesAndScales.emplace_back( |
| 22722 | Args&: VectorizedRoot, |
| 22723 | Args: OptReusedScalars && SameScaleFactor |
| 22724 | ? SameValuesCounter.front().second |
| 22725 | : 1, |
| 22726 | Args: RedScalarTy != ScalarTy->getScalarType() |
| 22727 | ? V.isSignedMinBitwidthRootNode() |
| 22728 | : true); |
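| | // The tuple recorded above is (vector value, scale, signedness). The |
| | // scale exceeds 1 only when every scalar in the window repeats the same |
| | // number of times, so a single later multiply (or shuffle for i1) can |
| | // stand in for the repeated operands; the signedness drives the final |
| | // integer cast. |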
| 22729 | |
| 22730 | // Count vectorized reduced values to exclude them from final reduction. |
| 22731 | for (Value *RdxVal : VL) { |
| 22732 | Value *OrigV = TrackedToOrig.at(Val: RdxVal); |
| 22733 | if (IsSupportedHorRdxIdentityOp) { |
| 22734 | VectorizedVals.try_emplace(Key: OrigV, Args&: At(SameValuesCounter, OrigV)); |
| 22735 | continue; |
| 22736 | } |
| 22737 | ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond(); |
| 22738 | if (!V.isVectorized(V: RdxVal)) |
| 22739 | RequiredExtract.insert(Ptr: RdxVal); |
| 22740 | } |
| 22741 | Pos += ReduxWidth; |
| 22742 | Start = Pos; |
| 22743 | ReduxWidth = NumReducedVals - Pos; |
| 22744 | if (ReduxWidth > 1) |
| 22745 | ReduxWidth = GetVectorFactor(NumReducedVals - Pos); |
| 22746 | AnyVectorized = true; |
| 22747 | } |
| 22748 | if (OptReusedScalars && !AnyVectorized) { |
| 22749 | for (const std::pair<Value *, unsigned> &P : SameValuesCounter) { |
| 22750 | Value *RdxVal = TrackedVals.at(Val: P.first); |
| 22751 | Value *RedVal = emitScaleForReusedOps(VectorizedValue: RdxVal, Builder, Cnt: P.second); |
| 22752 | VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal); |
| 22753 | VectorizedVals.try_emplace(Key: P.first, Args: P.second); |
| 22754 | } |
| 22755 | continue; |
| 22756 | } |
| 22757 | } |
| 22758 | if (!VectorValuesAndScales.empty()) |
| 22759 | VectorizedTree = GetNewVectorizedTree( |
| 22760 | VectorizedTree, |
| 22761 | emitReduction(Builder, TTI: *TTI, DestTy: ReductionRoot->getType())); |
| 22762 | if (VectorizedTree) { |
| 22763 | // Reorder operands of bool logical op in the natural order to avoid |
| 22764 | // possible problem with poison propagation. If not possible to reorder |
| 22765 | // (both operands are originally RHS), emit an extra freeze instruction |
| 22766 | // for the LHS operand. |
| 22767 | // I.e., if we have original code like this: |
| 22768 | // RedOp1 = select i1 ?, i1 LHS, i1 false |
| 22769 | // RedOp2 = select i1 RHS, i1 ?, i1 false |
| 22770 | |
| 22771 | // Then, we swap LHS/RHS to create a new op that matches the poison |
| 22772 | // semantics of the original code. |
| 22773 | |
| 22774 | // If we have original code like this and both values could be poison: |
| 22775 | // RedOp1 = select i1 ?, i1 LHS, i1 false |
| 22776 | // RedOp2 = select i1 ?, i1 RHS, i1 false |
| 22777 | |
| 22778 | // Then, we must freeze LHS in the new op. |
| 22779 | auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS, |
| 22780 | Instruction *RedOp1, |
| 22781 | Instruction *RedOp2, |
| 22782 | bool InitStep) { |
| 22783 | if (!AnyBoolLogicOp) |
| 22784 | return; |
| 22785 | if (isBoolLogicOp(I: RedOp1) && ((!InitStep && LHS == VectorizedTree) || |
| 22786 | getRdxOperand(I: RedOp1, Index: 0) == LHS || |
| 22787 | isGuaranteedNotToBePoison(V: LHS, AC))) |
| 22788 | return; |
| 22789 | if (isBoolLogicOp(I: RedOp2) && ((!InitStep && RHS == VectorizedTree) || |
| 22790 | getRdxOperand(I: RedOp2, Index: 0) == RHS || |
| 22791 | isGuaranteedNotToBePoison(V: RHS, AC))) { |
| 22792 | std::swap(a&: LHS, b&: RHS); |
| 22793 | return; |
| 22794 | } |
| 22795 | if (LHS != VectorizedTree) |
| 22796 | LHS = Builder.CreateFreeze(V: LHS); |
| 22797 | }; |
| 22798 | // Finish the reduction. |
| 22799 | // We still need to add the extra arguments and any possible reduction |
| 22800 | // values that were not vectorized. |
| 22801 | // Try to avoid dependencies between the scalar remainders after |
| 22802 | // reductions. |
| 22803 | auto FinalGen = |
| 22804 | [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals, |
| 22805 | bool InitStep) { |
| 22806 | unsigned Sz = InstVals.size(); |
| 22807 | SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + |
| 22808 | Sz % 2); |
| 22809 | for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) { |
| 22810 | Instruction *RedOp = InstVals[I + 1].first; |
| 22811 | Builder.SetCurrentDebugLocation(RedOp->getDebugLoc()); |
| 22812 | Value *RdxVal1 = InstVals[I].second; |
| 22813 | Value *StableRdxVal1 = RdxVal1; |
| 22814 | auto It1 = TrackedVals.find(Val: RdxVal1); |
| 22815 | if (It1 != TrackedVals.end()) |
| 22816 | StableRdxVal1 = It1->second; |
| 22817 | Value *RdxVal2 = InstVals[I + 1].second; |
| 22818 | Value *StableRdxVal2 = RdxVal2; |
| 22819 | auto It2 = TrackedVals.find(Val: RdxVal2); |
| 22820 | if (It2 != TrackedVals.end()) |
| 22821 | StableRdxVal2 = It2->second; |
| 22822 | // To prevent poison from leaking across what used to be |
| 22823 | // sequential, safe, scalar boolean logic operations, the |
| 22824 | // reduction operand must be frozen. |
| 22825 | FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first, |
| 22826 | RedOp, InitStep); |
| 22827 | Value *ExtraRed = createOp(Builder, RdxKind, LHS: StableRdxVal1, |
| 22828 | RHS: StableRdxVal2, Name: "op.rdx" , ReductionOps); |
| 22829 | ExtraReds[I / 2] = std::make_pair(x: InstVals[I].first, y&: ExtraRed); |
| 22830 | } |
| 22831 | if (Sz % 2 == 1) |
| 22832 | ExtraReds[Sz / 2] = InstVals.back(); |
| 22833 | return ExtraReds; |
| 22834 | }; |
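| | // FinalGen reduces the remaining scalars pairwise, so each call roughly |
| | // halves the worklist, e.g. (illustrative) 5 entries -> 3 -> 2 -> 1; |
| | // this keeps the dependency chains of the scalar remainder short. |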
| 22835 | SmallVector<std::pair<Instruction *, Value *>> ExtraReductions; |
| 22836 | ExtraReductions.emplace_back(Args: cast<Instruction>(Val&: ReductionRoot), |
| 22837 | Args&: VectorizedTree); |
| 22838 | SmallPtrSet<Value *, 8> Visited; |
| 22839 | for (ArrayRef<Value *> Candidates : ReducedVals) { |
| 22840 | for (Value *RdxVal : Candidates) { |
| 22841 | if (!Visited.insert(Ptr: RdxVal).second) |
| 22842 | continue; |
| 22843 | unsigned NumOps = VectorizedVals.lookup(Val: RdxVal); |
| 22844 | for (Instruction *RedOp : |
| 22845 | ArrayRef(ReducedValsToOps.at(Val: RdxVal)).drop_back(N: NumOps)) |
| 22846 | ExtraReductions.emplace_back(Args&: RedOp, Args&: RdxVal); |
| 22847 | } |
| 22848 | } |
| 22849 | // Iterate through all non-vectorized reduction values/extra arguments. |
| 22850 | bool InitStep = true; |
| 22851 | while (ExtraReductions.size() > 1) { |
| 22852 | SmallVector<std::pair<Instruction *, Value *>> NewReds = |
| 22853 | FinalGen(ExtraReductions, InitStep); |
| 22854 | ExtraReductions.swap(RHS&: NewReds); |
| 22855 | InitStep = false; |
| 22856 | } |
| 22857 | VectorizedTree = ExtraReductions.front().second; |
| 22858 | |
| 22859 | ReductionRoot->replaceAllUsesWith(V: VectorizedTree); |
| 22860 | |
| 22861 | // The original scalar reduction is expected to have no remaining |
| 22862 | // uses outside the reduction tree itself. Assert that we got this |
| 22863 | // correct, replace internal uses with poison, and mark for eventual |
| 22864 | // deletion. |
| 22865 | #ifndef NDEBUG |
| 22866 | SmallSet<Value *, 4> IgnoreSet; |
| 22867 | for (ArrayRef<Value *> RdxOps : ReductionOps) |
| 22868 | IgnoreSet.insert_range(RdxOps); |
| 22869 | #endif |
| 22870 | for (ArrayRef<Value *> RdxOps : ReductionOps) { |
| 22871 | for (Value *Ignore : RdxOps) { |
| 22872 | if (!Ignore) |
| 22873 | continue; |
| 22874 | #ifndef NDEBUG |
| 22875 | for (auto *U : Ignore->users()) { |
| 22876 | assert(IgnoreSet.count(U) && |
| 22877 | "All users must be either in the reduction ops list." ); |
| 22878 | } |
| 22879 | #endif |
| 22880 | if (!Ignore->use_empty()) { |
| 22881 | Value *P = PoisonValue::get(T: Ignore->getType()); |
| 22882 | Ignore->replaceAllUsesWith(V: P); |
| 22883 | } |
| 22884 | } |
| 22885 | V.removeInstructionsAndOperands(DeadVals: RdxOps, VectorValuesAndScales); |
| 22886 | } |
| 22887 | } else if (!CheckForReusedReductionOps) { |
| 22888 | for (ReductionOpsType &RdxOps : ReductionOps) |
| 22889 | for (Value *RdxOp : RdxOps) |
| 22890 | V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp)); |
| 22891 | } |
| 22892 | return VectorizedTree; |
| 22893 | } |
| 22894 | |
| 22895 | private: |
| 22896 | /// Creates the reduction from the given \p Vec vector value with the given |
| 22897 | /// scale \p Scale and signedness \p IsSigned. |
| 22898 | Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI, |
| 22899 | Value *Vec, unsigned Scale, bool IsSigned, |
| 22900 | Type *DestTy) { |
| 22901 | Value *Rdx; |
| 22902 | if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DestTy)) { |
| 22903 | unsigned DestTyNumElements = getNumElements(Ty: VecTy); |
| 22904 | unsigned VF = getNumElements(Ty: Vec->getType()) / DestTyNumElements; |
| 22905 | Rdx = PoisonValue::get( |
| 22906 | T: getWidenedType(ScalarTy: Vec->getType()->getScalarType(), VF: DestTyNumElements)); |
| 22907 | for (unsigned I : seq<unsigned>(Size: DestTyNumElements)) { |
| 22908 | // Do reduction for each lane. |
| 22909 | // e.g., do reduce add for |
| 22910 | // VL[0] = <4 x Ty> <a, b, c, d> |
| 22911 | // VL[1] = <4 x Ty> <e, f, g, h> |
| 22912 | // Lane[0] = <2 x Ty> <a, e> |
| 22913 | // Lane[1] = <2 x Ty> <b, f> |
| 22914 | // Lane[2] = <2 x Ty> <c, g> |
| 22915 | // Lane[3] = <2 x Ty> <d, h> |
| 22916 | // result[0] = reduce add Lane[0] |
| 22917 | // result[1] = reduce add Lane[1] |
| 22918 | // result[2] = reduce add Lane[2] |
| 22919 | // result[3] = reduce add Lane[3] |
| 22920 | SmallVector<int, 16> Mask = createStrideMask(Start: I, Stride: DestTyNumElements, VF); |
| 22921 | Value *Lane = Builder.CreateShuffleVector(V: Vec, Mask); |
| 22922 | Rdx = Builder.CreateInsertElement( |
| 22923 | Vec: Rdx, NewElt: emitReduction(VectorizedValue: Lane, Builder, TTI: &TTI, DestTy), Idx: I); |
| 22924 | } |
| 22925 | } else { |
| 22926 | Rdx = emitReduction(VectorizedValue: Vec, Builder, TTI: &TTI, DestTy); |
| 22927 | } |
| 22928 | if (Rdx->getType() != DestTy) |
| 22929 | Rdx = Builder.CreateIntCast(V: Rdx, DestTy, isSigned: IsSigned); |
| 22930 | // Improved analysis for add/fadd/xor reductions with same scale |
| 22931 | // factor for all operands of reductions. We can emit scalar ops for |
| 22932 | // them instead. |
| 22933 | if (Scale > 1) |
| 22934 | Rdx = emitScaleForReusedOps(VectorizedValue: Rdx, Builder, Cnt: Scale); |
| 22935 | return Rdx; |
| 22936 | } |
| 22937 | |
| 22938 | /// Calculate the cost of a reduction. |
| 22939 | InstructionCost getReductionCost(TargetTransformInfo *TTI, |
| 22940 | ArrayRef<Value *> ReducedVals, |
| 22941 | bool IsCmpSelMinMax, FastMathFlags FMF, |
| 22942 | const BoUpSLP &R) { |
| 22943 | TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; |
| 22944 | Type *ScalarTy = ReducedVals.front()->getType(); |
| 22945 | unsigned ReduxWidth = ReducedVals.size(); |
| 22946 | FixedVectorType *VectorTy = R.getReductionType(); |
| 22947 | InstructionCost VectorCost = 0, ScalarCost; |
| 22948 | // If all of the reduced values are constant, the vector cost is 0, since |
| 22949 | // the reduction value can be calculated at compile time. |
| 22950 | bool AllConsts = allConstant(VL: ReducedVals); |
| 22951 | auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) { |
| 22952 | InstructionCost Cost = 0; |
| 22953 | // Scalar cost is repeated for N-1 elements. |
| 22954 | int Cnt = ReducedVals.size(); |
| 22955 | for (Value *RdxVal : ReducedVals) { |
| 22956 | if (Cnt == 1) |
| 22957 | break; |
| 22958 | --Cnt; |
| 22959 | if (RdxVal->hasNUsesOrMore(N: IsCmpSelMinMax ? 3 : 2)) { |
| 22960 | Cost += GenCostFn(); |
| 22961 | continue; |
| 22962 | } |
| 22963 | InstructionCost ScalarCost = 0; |
| 22964 | for (User *U : RdxVal->users()) { |
| 22965 | auto *RdxOp = cast<Instruction>(Val: U); |
| 22966 | if (hasRequiredNumberOfUses(IsCmpSelMinMax, I: RdxOp)) { |
| 22967 | ScalarCost += TTI->getInstructionCost(U: RdxOp, CostKind); |
| 22968 | continue; |
| 22969 | } |
| 22970 | ScalarCost = InstructionCost::getInvalid(); |
| 22971 | break; |
| 22972 | } |
| 22973 | if (ScalarCost.isValid()) |
| 22974 | Cost += ScalarCost; |
| 22975 | else |
| 22976 | Cost += GenCostFn(); |
| 22977 | } |
| 22978 | return Cost; |
| 22979 | }; |
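| | // EvaluateScalarCost models the N - 1 scalar reduction ops needed for N |
| | // reduced values: each value is charged either the generic scalar op |
| | // cost or, when its uses can be attributed exactly, the summed cost of |
| | // its user instructions. |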
| 22980 | // A real reduction op cost is required if: |
| 22981 | // 1. This type is not a full register type and there is no other vector |
| 22982 | // with the same type in the storage yet (first vector with a small type). |
| 22983 | // 2. The storage does not yet contain any vector with full register use |
| 22984 | // (first vector with full register use). |
| 22985 | bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty(); |
| 22986 | switch (RdxKind) { |
| 22987 | case RecurKind::Add: |
| 22988 | case RecurKind::Mul: |
| 22989 | case RecurKind::Or: |
| 22990 | case RecurKind::And: |
| 22991 | case RecurKind::Xor: |
| 22992 | case RecurKind::FAdd: |
| 22993 | case RecurKind::FMul: { |
| 22994 | unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind); |
| 22995 | if (!AllConsts) { |
| 22996 | if (DoesRequireReductionOp) { |
| 22997 | if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) { |
| 22998 | assert(SLPReVec && "FixedVectorType is not expected." ); |
| 22999 | unsigned ScalarTyNumElements = VecTy->getNumElements(); |
| 23000 | for (unsigned I : seq<unsigned>(Size: ReducedVals.size())) { |
| 23001 | VectorCost += TTI->getShuffleCost( |
| 23002 | Kind: TTI::SK_PermuteSingleSrc, |
| 23003 | DstTy: FixedVectorType::get(ElementType: VecTy->getScalarType(), |
| 23004 | NumElts: ReducedVals.size()), |
| 23005 | SrcTy: VectorTy, |
| 23006 | Mask: createStrideMask(Start: I, Stride: ScalarTyNumElements, VF: ReducedVals.size())); |
| 23007 | VectorCost += TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VecTy, |
| 23008 | FMF, CostKind); |
| 23009 | } |
| 23010 | VectorCost += TTI->getScalarizationOverhead( |
| 23011 | Ty: VecTy, DemandedElts: APInt::getAllOnes(numBits: ScalarTyNumElements), /*Insert*/ true, |
| 23012 | /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput); |
| 23013 | } else { |
| 23014 | Type *RedTy = VectorTy->getElementType(); |
| 23015 | auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( |
| 23016 | u: std::make_pair(x&: RedTy, y: true)); |
| 23017 | if (RType == RedTy) { |
| 23018 | VectorCost = TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VectorTy, |
| 23019 | FMF, CostKind); |
| 23020 | } else { |
| 23021 | VectorCost = TTI->getExtendedReductionCost( |
| 23022 | Opcode: RdxOpcode, IsUnsigned: !IsSigned, ResTy: RedTy, |
| 23023 | Ty: getWidenedType(ScalarTy: RType, VF: ReduxWidth), FMF, CostKind); |
| 23024 | } |
| 23025 | } |
| 23026 | } else { |
| 23027 | Type *RedTy = VectorTy->getElementType(); |
| 23028 | auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( |
| 23029 | u: std::make_pair(x&: RedTy, y: true)); |
| 23030 | VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth); |
| 23031 | VectorCost += |
| 23032 | TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: RVecTy, CostKind); |
| 23033 | if (RType != RedTy) { |
| 23034 | unsigned Opcode = Instruction::Trunc; |
| 23035 | if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits()) |
| 23036 | Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt; |
| 23037 | VectorCost += TTI->getCastInstrCost( |
| 23038 | Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind); |
| 23039 | } |
| 23040 | } |
| 23041 | } |
| 23042 | ScalarCost = EvaluateScalarCost([&]() { |
| 23043 | return TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: ScalarTy, CostKind); |
| 23044 | }); |
| 23045 | break; |
| 23046 | } |
| 23047 | case RecurKind::FMax: |
| 23048 | case RecurKind::FMin: |
| 23049 | case RecurKind::FMaximum: |
| 23050 | case RecurKind::FMinimum: |
| 23051 | case RecurKind::SMax: |
| 23052 | case RecurKind::SMin: |
| 23053 | case RecurKind::UMax: |
| 23054 | case RecurKind::UMin: { |
| 23055 | Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind); |
| 23056 | if (!AllConsts) { |
| 23057 | if (DoesRequireReductionOp) { |
| 23058 | VectorCost = TTI->getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF, CostKind); |
| 23059 | } else { |
| 23060 | // Check if a previous reduction already exists and account for it as a |
| 23061 | // series of operations plus a single final reduction. |
| 23062 | Type *RedTy = VectorTy->getElementType(); |
| 23063 | auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( |
| 23064 | u: std::make_pair(x&: RedTy, y: true)); |
| 23065 | VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth); |
| 23066 | IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF); |
| 23067 | VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind); |
| 23068 | if (RType != RedTy) { |
| 23069 | unsigned Opcode = Instruction::Trunc; |
| 23070 | if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits()) |
| 23071 | Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt; |
| 23072 | VectorCost += TTI->getCastInstrCost( |
| 23073 | Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind); |
| 23074 | } |
| 23075 | } |
| 23076 | } |
| 23077 | ScalarCost = EvaluateScalarCost([&]() { |
| 23078 | IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF); |
| 23079 | return TTI->getIntrinsicInstrCost(ICA, CostKind); |
| 23080 | }); |
| 23081 | break; |
| 23082 | } |
| 23083 | default: |
| 23084 | llvm_unreachable("Expected arithmetic or min/max reduction operation" ); |
| 23085 | } |
| 23086 | |
| 23087 | LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost |
| 23088 | << " for reduction of " << shortBundleName(ReducedVals) |
| 23089 | << " (It is a splitting reduction)\n" ); |
| 23090 | return VectorCost - ScalarCost; |
| 23091 | } |
| 23092 | |
| 23093 | /// Splits the values, stored in VectorValuesAndScales, into registers/free |
| 23094 | /// sub-registers, combines them with the given reduction operation as a |
| 23095 | /// vector operation and then performs a single (small enough) reduction. |
| 23096 | Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI, |
| 23097 | Type *DestTy) { |
| 23098 | Value *ReducedSubTree = nullptr; |
| 23099 | // Creates reduction and combines with the previous reduction. |
| 23100 | auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) { |
| 23101 | Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy); |
| 23102 | if (ReducedSubTree) |
| 23103 | ReducedSubTree = createOp(Builder, RdxKind, LHS: ReducedSubTree, RHS: Rdx, |
| 23104 | Name: "op.rdx" , ReductionOps); |
| 23105 | else |
| 23106 | ReducedSubTree = Rdx; |
| 23107 | }; |
| 23108 | if (VectorValuesAndScales.size() == 1) { |
| 23109 | const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front(); |
| 23110 | CreateSingleOp(Vec, Scale, IsSigned); |
| 23111 | return ReducedSubTree; |
| 23112 | } |
| 23113 | // Scales Vec by the given Cnt scale factor and then combines it with the |
| 23114 | // previously accumulated vector value in VecRes. |
| 23115 | Value *VecRes = nullptr; |
| 23116 | bool VecResSignedness = false; |
| 23117 | auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) { |
| 23118 | Type *ScalarTy = Vec->getType()->getScalarType(); |
| 23119 | // Scale Vec using given Cnt scale factor. |
| 23120 | if (Cnt > 1) { |
| 23121 | ElementCount EC = cast<VectorType>(Val: Vec->getType())->getElementCount(); |
| 23122 | switch (RdxKind) { |
| 23123 | case RecurKind::Add: { |
| 23124 | if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) { |
| 23125 | unsigned VF = getNumElements(Ty: Vec->getType()); |
| 23126 | LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec |
| 23127 | << ". (HorRdx)\n" ); |
| 23128 | SmallVector<int> Mask(Cnt * VF, PoisonMaskElem); |
| 23129 | for (unsigned I : seq<unsigned>(Size: Cnt)) |
| 23130 | std::iota(first: std::next(x: Mask.begin(), n: VF * I), |
| 23131 | last: std::next(x: Mask.begin(), n: VF * (I + 1)), value: 0); |
| 23132 | ++NumVectorInstructions; |
| 23133 | Vec = Builder.CreateShuffleVector(V: Vec, Mask); |
| 23134 | break; |
| 23135 | } |
| 23136 | // res = mul vv, n |
| 23137 | if (ScalarTy != DestTy->getScalarType()) |
| 23138 | Vec = Builder.CreateIntCast( |
| 23139 | V: Vec, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: Vec->getType())), |
| 23140 | isSigned: IsSigned); |
| 23141 | Value *Scale = ConstantVector::getSplat( |
| 23142 | EC, Elt: ConstantInt::get(Ty: DestTy->getScalarType(), V: Cnt)); |
| 23143 | LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec |
| 23144 | << ". (HorRdx)\n" ); |
| 23145 | ++NumVectorInstructions; |
| 23146 | Vec = Builder.CreateMul(LHS: Vec, RHS: Scale); |
| 23147 | break; |
| 23148 | } |
| 23149 | case RecurKind::Xor: { |
| 23150 | // res = n % 2 == 0 ? 0 : vv |
| 23151 | LLVM_DEBUG(dbgs() |
| 23152 | << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n" ); |
| 23153 | if (Cnt % 2 == 0) |
| 23154 | Vec = Constant::getNullValue(Ty: Vec->getType()); |
| 23155 | break; |
| 23156 | } |
| 23157 | case RecurKind::FAdd: { |
| 23158 | // res = fmul v, n |
| 23159 | Value *Scale = |
| 23160 | ConstantVector::getSplat(EC, Elt: ConstantFP::get(Ty: ScalarTy, V: Cnt)); |
| 23161 | LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec |
| 23162 | << ". (HorRdx)\n" ); |
| 23163 | ++NumVectorInstructions; |
| 23164 | Vec = Builder.CreateFMul(L: Vec, R: Scale); |
| 23165 | break; |
| 23166 | } |
| 23167 | case RecurKind::And: |
| 23168 | case RecurKind::Or: |
| 23169 | case RecurKind::SMax: |
| 23170 | case RecurKind::SMin: |
| 23171 | case RecurKind::UMax: |
| 23172 | case RecurKind::UMin: |
| 23173 | case RecurKind::FMax: |
| 23174 | case RecurKind::FMin: |
| 23175 | case RecurKind::FMaximum: |
| 23176 | case RecurKind::FMinimum: |
| 23177 | // res = vv |
| 23178 | break; |
| 23179 | case RecurKind::Mul: |
| 23180 | case RecurKind::FMul: |
| 23181 | case RecurKind::FMulAdd: |
| 23182 | case RecurKind::AnyOf: |
| 23183 | case RecurKind::FindFirstIVSMin: |
| 23184 | case RecurKind::FindLastIVSMax: |
| 23185 | case RecurKind::FindLastIVUMax: |
| 23186 | case RecurKind::FMaximumNum: |
| 23187 | case RecurKind::FMinimumNum: |
| 23188 | case RecurKind::None: |
| 23189 | llvm_unreachable("Unexpected reduction kind for repeated scalar." ); |
| 23190 | } |
| 23191 | } |
| 23192 | // Combine Vec with the previous VecOp. |
| 23193 | if (!VecRes) { |
| 23194 | VecRes = Vec; |
| 23195 | VecResSignedness = IsSigned; |
| 23196 | } else { |
| 23197 | ++NumVectorInstructions; |
| 23198 | if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy && |
| 23199 | VecRes->getType()->getScalarType() == Builder.getInt1Ty()) { |
| 23200 | // Handle ctpop. |
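| | // Both i1 operands stay in i1 form and are concatenated via an identity |
| | // mask (after widening the narrower one), e.g. (illustrative) an |
| | // <8 x i1> VecRes and a <4 x i1> Vec become one <12 x i1> whose |
| | // population count is taken later. |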
| 23201 | unsigned VecResVF = getNumElements(Ty: VecRes->getType()); |
| 23202 | unsigned VecVF = getNumElements(Ty: Vec->getType()); |
| 23203 | SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem); |
| 23204 | std::iota(first: Mask.begin(), last: Mask.end(), value: 0); |
| 23205 | // Ensure that VecRes is always at least as wide as Vec. |
| 23206 | if (VecResVF < VecVF) { |
| 23207 | std::swap(a&: VecRes, b&: Vec); |
| 23208 | std::swap(a&: VecResVF, b&: VecVF); |
| 23209 | } |
| 23210 | if (VecResVF != VecVF) { |
| 23211 | SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem); |
| 23212 | std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0); |
| 23213 | Vec = Builder.CreateShuffleVector(V: Vec, Mask: ResizeMask); |
| 23214 | } |
| 23215 | VecRes = Builder.CreateShuffleVector(V1: VecRes, V2: Vec, Mask, Name: "rdx.op" ); |
| 23216 | return; |
| 23217 | } |
| 23218 | if (VecRes->getType()->getScalarType() != DestTy->getScalarType()) |
| 23219 | VecRes = Builder.CreateIntCast( |
| 23220 | V: VecRes, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: VecRes->getType())), |
| 23221 | isSigned: VecResSignedness); |
| 23222 | if (ScalarTy != DestTy->getScalarType()) |
| 23223 | Vec = Builder.CreateIntCast( |
| 23224 | V: Vec, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: Vec->getType())), |
| 23225 | isSigned: IsSigned); |
| 23226 | unsigned VecResVF = getNumElements(Ty: VecRes->getType()); |
| 23227 | unsigned VecVF = getNumElements(Ty: Vec->getType()); |
| 23228 | // Ensure that VecRes is always at least as wide as Vec. |
| 23229 | if (VecResVF < VecVF) { |
| 23230 | std::swap(a&: VecRes, b&: Vec); |
| 23231 | std::swap(a&: VecResVF, b&: VecVF); |
| 23232 | } |
| 23233 | // extract + op + insert |
| 23234 | Value *Op = VecRes; |
| 23235 | if (VecResVF != VecVF) |
| 23236 | Op = createExtractVector(Builder, Vec: VecRes, SubVecVF: VecVF, /*Index=*/0); |
| 23237 | Op = createOp(Builder, RdxKind, LHS: Op, RHS: Vec, Name: "rdx.op" , ReductionOps); |
| 23238 | if (VecResVF != VecVF) |
| 23239 | Op = createInsertVector(Builder, Vec: VecRes, V: Op, /*Index=*/0); |
| 23240 | VecRes = Op; |
| 23241 | } |
| 23242 | }; |
| 23243 | for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales) |
| 23244 | CreateVecOp(Vec, Scale, IsSigned); |
| 23245 | CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false); |
| 23246 | |
| 23247 | return ReducedSubTree; |
| 23248 | } |
| 23249 | |
| 23250 | /// Emit a horizontal reduction of the vectorized value. |
| 23251 | Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder, |
| 23252 | const TargetTransformInfo *TTI, Type *DestTy) { |
| 23253 | assert(VectorizedValue && "Need to have a vectorized tree node" ); |
| 23254 | assert(RdxKind != RecurKind::FMulAdd && |
| 23255 | "A call to the llvm.fmuladd intrinsic is not handled yet" ); |
| 23256 | |
| 23257 | auto *FTy = cast<FixedVectorType>(Val: VectorizedValue->getType()); |
| 23258 | if (FTy->getScalarType() == Builder.getInt1Ty() && |
| 23259 | RdxKind == RecurKind::Add && |
| 23260 | DestTy->getScalarType() != FTy->getScalarType()) { |
| 23261 | // Convert vector_reduce_add(ZExt(<n x i1>)) to |
| 23262 | // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)). |
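| | // E.g. (illustrative): |
| | //   %bits = bitcast <8 x i1> %v to i8 |
| | //   %sum  = call i8 @llvm.ctpop.i8(i8 %bits) |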
| 23263 | Value *V = Builder.CreateBitCast( |
| 23264 | V: VectorizedValue, DestTy: Builder.getIntNTy(N: FTy->getNumElements())); |
| 23265 | ++NumVectorInstructions; |
| 23266 | return Builder.CreateUnaryIntrinsic(ID: Intrinsic::ctpop, V); |
| 23267 | } |
| 23268 | ++NumVectorInstructions; |
| 23269 | return createSimpleReduction(B&: Builder, Src: VectorizedValue, RdxKind); |
| 23270 | } |
| 23271 | |
| 23272 | /// Emits optimized code for a unique scalar value reused \p Cnt times. |
| 23273 | Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, |
| 23274 | unsigned Cnt) { |
| 23275 | assert(IsSupportedHorRdxIdentityOp && |
| 23276 | "The optimization of matched scalar identity horizontal reductions " |
| 23277 | "must be supported." ); |
| 23278 | if (Cnt == 1) |
| 23279 | return VectorizedValue; |
| 23280 | switch (RdxKind) { |
| 23281 | case RecurKind::Add: { |
| 23282 | // res = mul vv, n |
| 23283 | Value *Scale = ConstantInt::get(Ty: VectorizedValue->getType(), V: Cnt); |
| 23284 | LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " |
| 23285 | << VectorizedValue << ". (HorRdx)\n" ); |
| 23286 | return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale); |
| 23287 | } |
| 23288 | case RecurKind::Xor: { |
| 23289 | // res = n % 2 == 0 ? 0 : vv |
| 23290 | LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue |
| 23291 | << ". (HorRdx)\n" ); |
| 23292 | if (Cnt % 2 == 0) |
| 23293 | return Constant::getNullValue(Ty: VectorizedValue->getType()); |
| 23294 | return VectorizedValue; |
| 23295 | } |
| 23296 | case RecurKind::FAdd: { |
| 23297 | // res = fmul v, n |
| 23298 | Value *Scale = ConstantFP::get(Ty: VectorizedValue->getType(), V: Cnt); |
| 23299 | LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " |
| 23300 | << VectorizedValue << ". (HorRdx)\n" ); |
| 23301 | return Builder.CreateFMul(L: VectorizedValue, R: Scale); |
| 23302 | } |
| 23303 | case RecurKind::And: |
| 23304 | case RecurKind::Or: |
| 23305 | case RecurKind::SMax: |
| 23306 | case RecurKind::SMin: |
| 23307 | case RecurKind::UMax: |
| 23308 | case RecurKind::UMin: |
| 23309 | case RecurKind::FMax: |
| 23310 | case RecurKind::FMin: |
| 23311 | case RecurKind::FMaximum: |
| 23312 | case RecurKind::FMinimum: |
| 23313 | // res = vv |
| 23314 | return VectorizedValue; |
| 23315 | case RecurKind::Mul: |
| 23316 | case RecurKind::FMul: |
| 23317 | case RecurKind::FMulAdd: |
| 23318 | case RecurKind::AnyOf: |
| 23319 | case RecurKind::FindFirstIVSMin: |
| 23320 | case RecurKind::FindLastIVSMax: |
| 23321 | case RecurKind::FindLastIVUMax: |
| 23322 | case RecurKind::FMaximumNum: |
| 23323 | case RecurKind::FMinimumNum: |
| 23324 | case RecurKind::None: |
| 23325 | llvm_unreachable("Unexpected reduction kind for repeated scalar." ); |
| 23326 | } |
| 23327 | return nullptr; |
| 23328 | } |
| 23329 | |
| 23330 | /// Emits actual operation for the scalar identity values, found during |
| 23331 | /// horizontal reduction analysis. |
| 23332 | Value * |
| 23333 | emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R, |
| 23334 | const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter, |
| 23335 | const DenseMap<Value *, Value *> &TrackedToOrig) { |
| 23336 | assert(IsSupportedHorRdxIdentityOp && |
| 23337 | "The optimization of matched scalar identity horizontal reductions " |
| 23338 | "must be supported." ); |
| 23339 | ArrayRef<Value *> VL = R.getRootNodeScalars(); |
| 23340 | auto *VTy = cast<FixedVectorType>(Val: VectorizedValue->getType()); |
| 23341 | if (VTy->getElementType() != VL.front()->getType()) { |
| 23342 | VectorizedValue = Builder.CreateIntCast( |
| 23343 | V: VectorizedValue, |
| 23344 | DestTy: getWidenedType(ScalarTy: VL.front()->getType(), VF: VTy->getNumElements()), |
| 23345 | isSigned: R.isSignedMinBitwidthRootNode()); |
| 23346 | } |
| 23347 | switch (RdxKind) { |
| 23348 | case RecurKind::Add: { |
| 23349 | // root = mul prev_root, <1, 1, n, 1> |
| 23350 | SmallVector<Constant *> Vals; |
| 23351 | for (Value *V : VL) { |
| 23352 | unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V)); |
| 23353 | Vals.push_back(Elt: ConstantInt::get(Ty: V->getType(), V: Cnt, /*IsSigned=*/false)); |
| 23354 | } |
| 23355 | auto *Scale = ConstantVector::get(V: Vals); |
| 23356 | LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of " |
| 23357 | << VectorizedValue << ". (HorRdx)\n" ); |
| 23358 | return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale); |
| 23359 | } |
| 23360 | case RecurKind::And: |
| 23361 | case RecurKind::Or: |
| 23362 | // No need for multiple or/and(s). |
| 23363 | LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue |
| 23364 | << ". (HorRdx)\n" ); |
| 23365 | return VectorizedValue; |
| 23366 | case RecurKind::SMax: |
| 23367 | case RecurKind::SMin: |
| 23368 | case RecurKind::UMax: |
| 23369 | case RecurKind::UMin: |
| 23370 | case RecurKind::FMax: |
| 23371 | case RecurKind::FMin: |
| 23372 | case RecurKind::FMaximum: |
| 23373 | case RecurKind::FMinimum: |
| 23374 | // No need for multiple min/max(s) of the same value. |
| 23375 | LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue |
| 23376 | << ". (HorRdx)\n" ); |
| 23377 | return VectorizedValue; |
| 23378 | case RecurKind::Xor: { |
| 23379 | // Replace values with even number of repeats with 0, since |
| 23380 | // x xor x = 0. |
| 23381 | // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 6, 7>, |
| 23382 | // if the 4th and 6th elements have an even number of repeats. |
| 23383 | SmallVector<int> Mask( |
| 23384 | cast<FixedVectorType>(Val: VectorizedValue->getType())->getNumElements(), |
| 23385 | PoisonMaskElem); |
| 23386 | std::iota(first: Mask.begin(), last: Mask.end(), value: 0); |
| 23387 | bool NeedShuffle = false; |
| 23388 | for (unsigned I = 0, VF = VL.size(); I < VF; ++I) { |
| 23389 | Value *V = VL[I]; |
| 23390 | unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V)); |
| 23391 | if (Cnt % 2 == 0) { |
| 23392 | Mask[I] = VF; |
| 23393 | NeedShuffle = true; |
| 23394 | } |
| 23395 | } |
| 23396 | LLVM_DEBUG(dbgs() << "SLP: Xor <" ; for (int I |
| 23397 | : Mask) dbgs() |
| 23398 | << I << " " ; |
| 23399 | dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n" ); |
| 23400 | if (NeedShuffle) |
| 23401 | VectorizedValue = Builder.CreateShuffleVector( |
| 23402 | V1: VectorizedValue, |
| 23403 | V2: ConstantVector::getNullValue(Ty: VectorizedValue->getType()), Mask); |
| 23404 | return VectorizedValue; |
| 23405 | } |
| 23406 | case RecurKind::FAdd: { |
| 23407 | // root = fmul prev_root, <1.0, 1.0, n.0, 1.0> |
| 23408 | SmallVector<Constant *> Vals; |
| 23409 | for (Value *V : VL) { |
| 23410 | unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V)); |
| 23411 | Vals.push_back(Elt: ConstantFP::get(Ty: V->getType(), V: Cnt)); |
| 23412 | } |
| 23413 | auto *Scale = ConstantVector::get(V: Vals); |
| 23414 | return Builder.CreateFMul(L: VectorizedValue, R: Scale); |
| 23415 | } |
| 23416 | case RecurKind::Mul: |
| 23417 | case RecurKind::FMul: |
| 23418 | case RecurKind::FMulAdd: |
| 23419 | case RecurKind::AnyOf: |
| 23420 | case RecurKind::FindFirstIVSMin: |
| 23421 | case RecurKind::FindLastIVSMax: |
| 23422 | case RecurKind::FindLastIVUMax: |
| 23423 | case RecurKind::FMaximumNum: |
| 23424 | case RecurKind::FMinimumNum: |
| 23425 | case RecurKind::None: |
| 23426 | llvm_unreachable("Unexpected reduction kind for reused scalars." ); |
| 23427 | } |
| 23428 | return nullptr; |
| 23429 | } |
| 23430 | }; |
| 23431 | } // end anonymous namespace |
| 23432 | |
| 23433 | /// Gets recurrence kind from the specified value. |
| 23434 | static RecurKind getRdxKind(Value *V) { |
| 23435 | return HorizontalReduction::getRdxKind(V); |
| 23436 | } |
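| | /// Returns the total number of scalar elements built by an insertelement |
| | /// / insertvalue chain, or std::nullopt for non-homogeneous aggregates. |
| | /// E.g. (illustrative) both {{float, float}, {float, float}} and |
| | /// [2 x <2 x float>] yield 4. |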
| 23437 | static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) { |
| 23438 | if (auto *IE = dyn_cast<InsertElementInst>(Val: InsertInst)) |
| 23439 | return cast<FixedVectorType>(Val: IE->getType())->getNumElements(); |
| 23440 | |
| 23441 | unsigned AggregateSize = 1; |
| 23442 | auto *IV = cast<InsertValueInst>(Val: InsertInst); |
| 23443 | Type *CurrentType = IV->getType(); |
| 23444 | do { |
| 23445 | if (auto *ST = dyn_cast<StructType>(Val: CurrentType)) { |
| 23446 | for (auto *Elt : ST->elements()) |
| 23447 | if (Elt != ST->getElementType(N: 0)) // check homogeneity |
| 23448 | return std::nullopt; |
| 23449 | AggregateSize *= ST->getNumElements(); |
| 23450 | CurrentType = ST->getElementType(N: 0); |
| 23451 | } else if (auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) { |
| 23452 | AggregateSize *= AT->getNumElements(); |
| 23453 | CurrentType = AT->getElementType(); |
| 23454 | } else if (auto *VT = dyn_cast<FixedVectorType>(Val: CurrentType)) { |
| 23455 | AggregateSize *= VT->getNumElements(); |
| 23456 | return AggregateSize; |
| 23457 | } else if (CurrentType->isSingleValueType()) { |
| 23458 | return AggregateSize; |
| 23459 | } else { |
| 23460 | return std::nullopt; |
| 23461 | } |
| 23462 | } while (true); |
| 23463 | } |
| 23464 | |
| 23465 | static void findBuildAggregateRec(Instruction *LastInsertInst, |
| 23466 | TargetTransformInfo *TTI, |
| 23467 | SmallVectorImpl<Value *> &BuildVectorOpds, |
| 23468 | SmallVectorImpl<Value *> &InsertElts, |
| 23469 | unsigned OperandOffset, const BoUpSLP &R) { |
| 23470 | do { |
| 23471 | Value *InsertedOperand = LastInsertInst->getOperand(i: 1); |
| 23472 | std::optional<unsigned> OperandIndex = |
| 23473 | getElementIndex(Inst: LastInsertInst, Offset: OperandOffset); |
| 23474 | if (!OperandIndex || R.isDeleted(I: LastInsertInst)) |
| 23475 | return; |
| 23476 | if (isa<InsertElementInst, InsertValueInst>(Val: InsertedOperand)) { |
| 23477 | findBuildAggregateRec(LastInsertInst: cast<Instruction>(Val: InsertedOperand), TTI, |
| 23478 | BuildVectorOpds, InsertElts, OperandOffset: *OperandIndex, R); |
| 23479 | |
| 23480 | } else { |
| 23481 | BuildVectorOpds[*OperandIndex] = InsertedOperand; |
| 23482 | InsertElts[*OperandIndex] = LastInsertInst; |
| 23483 | } |
| 23484 | LastInsertInst = dyn_cast<Instruction>(Val: LastInsertInst->getOperand(i: 0)); |
| 23485 | } while (LastInsertInst != nullptr && |
| 23486 | isa<InsertValueInst, InsertElementInst>(Val: LastInsertInst) && |
| 23487 | LastInsertInst->hasOneUse()); |
| 23488 | } |
| 23489 | |
| 23490 | /// Recognize construction of vectors like |
| 23491 | /// %ra = insertelement <4 x float> poison, float %s0, i32 0 |
| 23492 | /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 |
| 23493 | /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 |
| 23494 | /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 |
| 23495 | /// starting from the last insertelement or insertvalue instruction. |
| 23496 | /// |
| 23497 | /// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>}, |
| 23498 | /// {{float, float}, {float, float}}, [2 x {float, float}] and so on. |
| 23499 | /// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. |
| 23500 | /// |
| 23501 | /// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. |
| 23502 | /// |
| 23503 | /// \return true if it matches. |
| 23504 | static bool findBuildAggregate(Instruction *LastInsertInst, |
| 23505 | TargetTransformInfo *TTI, |
| 23506 | SmallVectorImpl<Value *> &BuildVectorOpds, |
| 23507 | SmallVectorImpl<Value *> &InsertElts, |
| 23508 | const BoUpSLP &R) { |
| 23509 | |
| 23510 | assert((isa<InsertElementInst>(LastInsertInst) || |
| 23511 | isa<InsertValueInst>(LastInsertInst)) && |
| 23512 | "Expected insertelement or insertvalue instruction!" ); |
| 23513 | |
| 23514 | assert((BuildVectorOpds.empty() && InsertElts.empty()) && |
| 23515 | "Expected empty result vectors!" ); |
| 23516 | |
| 23517 | std::optional<unsigned> AggregateSize = getAggregateSize(InsertInst: LastInsertInst); |
| 23518 | if (!AggregateSize) |
| 23519 | return false; |
| 23520 | BuildVectorOpds.resize(N: *AggregateSize); |
| 23521 | InsertElts.resize(N: *AggregateSize); |
| 23522 | |
| 23523 | findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, OperandOffset: 0, R); |
| 23524 | llvm::erase(C&: BuildVectorOpds, V: nullptr); |
| 23525 | llvm::erase(C&: InsertElts, V: nullptr); |
| 23526 | if (BuildVectorOpds.size() >= 2) |
| 23527 | return true; |
| 23528 | |
| 23529 | return false; |
| 23530 | } |
| 23531 | |
| 23532 | /// Try and get a reduction instruction from a phi node. |
| 23533 | /// |
| 23534 | /// Given a phi node \p P in a block \p ParentBB, consider possible reductions |
| 23535 | /// if they come from either \p ParentBB or a containing loop latch. |
| 23536 | /// |
| 23537 | /// \returns A candidate reduction value if possible, or \code nullptr \endcode |
| 23538 | /// if not possible. |
| 23539 | static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P, |
| 23540 | BasicBlock *ParentBB, LoopInfo *LI) { |
| 23541 | // There are situations where the reduction value is not dominated by the |
| 23542 | // reduction phi. Vectorizing such cases has been reported to cause |
| 23543 | // miscompiles. See PR25787. |
| 23544 | auto DominatedReduxValue = [&](Value *R) { |
| 23545 | return isa<Instruction>(Val: R) && |
| 23546 | DT->dominates(A: P->getParent(), B: cast<Instruction>(Val: R)->getParent()); |
| 23547 | }; |
| 23548 | |
| 23549 | Instruction *Rdx = nullptr; |
| 23550 | |
| 23551 | // Return the incoming value if it comes from the same BB as the phi node. |
| 23552 | if (P->getIncomingBlock(i: 0) == ParentBB) { |
| 23553 | Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0)); |
| 23554 | } else if (P->getIncomingBlock(i: 1) == ParentBB) { |
| 23555 | Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1)); |
| 23556 | } |
| 23557 | |
| 23558 | if (Rdx && DominatedReduxValue(Rdx)) |
| 23559 | return Rdx; |
| 23560 | |
| 23561 | // Otherwise, check whether we have a loop latch to look at. |
| 23562 | Loop *BBL = LI->getLoopFor(BB: ParentBB); |
| 23563 | if (!BBL) |
| 23564 | return nullptr; |
| 23565 | BasicBlock *BBLatch = BBL->getLoopLatch(); |
| 23566 | if (!BBLatch) |
| 23567 | return nullptr; |
| 23568 | |
| 23569 | // There is a loop latch, return the incoming value if it comes from |
| 23570 | // that. This reduction pattern occasionally turns up. |
| 23571 | if (P->getIncomingBlock(i: 0) == BBLatch) { |
| 23572 | Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0)); |
| 23573 | } else if (P->getIncomingBlock(i: 1) == BBLatch) { |
| 23574 | Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1)); |
| 23575 | } |
| 23576 | |
| 23577 | if (Rdx && DominatedReduxValue(Rdx)) |
| 23578 | return Rdx; |
| 23579 | |
| 23580 | return nullptr; |
| 23581 | } |
| 23582 | |
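| | /// Matches a reduction-style binary operation: either a plain binary |
| | /// operator or one of the min/max intrinsics, capturing its two operands |
| | /// in \p V0 and \p V1. |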
| 23583 | static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) { |
| 23584 | if (match(V: I, P: m_BinOp(L: m_Value(V&: V0), R: m_Value(V&: V1)))) |
| 23585 | return true; |
| 23586 | if (match(V: I, P: m_FMaxNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1)))) |
| 23587 | return true; |
| 23588 | if (match(V: I, P: m_FMinNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1)))) |
| 23589 | return true; |
| 23590 | if (match(V: I, P: m_FMaximum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1)))) |
| 23591 | return true; |
| 23592 | if (match(V: I, P: m_FMinimum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1)))) |
| 23593 | return true; |
| 23594 | if (match(V: I, P: m_Intrinsic<Intrinsic::smax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1)))) |
| 23595 | return true; |
| 23596 | if (match(V: I, P: m_Intrinsic<Intrinsic::smin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1)))) |
| 23597 | return true; |
| 23598 | if (match(V: I, P: m_Intrinsic<Intrinsic::umax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1)))) |
| 23599 | return true; |
| 23600 | if (match(V: I, P: m_Intrinsic<Intrinsic::umin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1)))) |
| 23601 | return true; |
| 23602 | return false; |
| 23603 | } |
| 23604 | |
| 23605 | /// We could have an initial reduction that is not an add. |
| 23606 | /// r *= v1 + v2 + v3 + v4 |
| 23607 | /// In such a case start looking for a tree rooted in the first '+'. |
| 23608 | /// \returns the new root if found, which may be nullptr if it is not an instruction. |
| 23609 | static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi, |
| 23610 | Instruction *Root) { |
| 23611 | assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) || |
| 23612 | isa<IntrinsicInst>(Root)) && |
| 23613 | "Expected binop, select, or intrinsic for reduction matching" ); |
| 23614 | Value *LHS = |
| 23615 | Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root)); |
| 23616 | Value *RHS = |
| 23617 | Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root) + 1); |
| 23618 | if (LHS == Phi) |
| 23619 | return dyn_cast<Instruction>(Val: RHS); |
| 23620 | if (RHS == Phi) |
| 23621 | return dyn_cast<Instruction>(Val: LHS); |
| 23622 | return nullptr; |
| 23623 | } |
| 23624 | |
| 23625 | /// \returns the first operand of \p I that does not match \p Phi. If the |
| 23626 | /// operand is not an instruction, returns nullptr. |
| 23627 | static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) { |
| 23628 | Value *Op0 = nullptr; |
| 23629 | Value *Op1 = nullptr; |
| 23630 | if (!matchRdxBop(I, V0&: Op0, V1&: Op1)) |
| 23631 | return nullptr; |
| 23632 | return dyn_cast<Instruction>(Val: Op0 == Phi ? Op1 : Op0); |
| 23633 | } |
| 23634 | |
| 23635 | /// \returns true if \p I is a candidate instruction for reduction vectorization. |
| 23636 | static bool isReductionCandidate(Instruction *I) { |
| 23637 | bool IsSelect = match(V: I, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value())); |
| 23638 | Value *B0 = nullptr, *B1 = nullptr; |
| 23639 | bool IsBinop = matchRdxBop(I, V0&: B0, V1&: B1); |
| 23640 | return IsBinop || IsSelect; |
| 23641 | } |
| 23642 | |
| 23643 | bool SLPVectorizerPass::vectorizeHorReduction( |
| 23644 | PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, |
| 23645 | SmallVectorImpl<WeakTrackingVH> &PostponedInsts) { |
| 23646 | if (!ShouldVectorizeHor) |
| 23647 | return false; |
| 23648 | bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Val: Root); |
| 23649 | |
| 23650 | if (Root->getParent() != BB || isa<PHINode>(Val: Root)) |
| 23651 | return false; |
| 23652 | |
| 23653 | // If we can find a secondary reduction root, use that instead. |
| 23654 | auto SelectRoot = [&]() { |
| 23655 | if (TryOperandsAsNewSeeds && isReductionCandidate(I: Root) && |
| 23656 | HorizontalReduction::getRdxKind(V: Root) != RecurKind::None) |
| 23657 | if (Instruction *NewRoot = tryGetSecondaryReductionRoot(Phi: P, Root)) |
| 23658 | return NewRoot; |
| 23659 | return Root; |
| 23660 | }; |
| 23661 | |
| 23662 | // Start analysis starting from Root instruction. If horizontal reduction is |
| 23663 | // found, try to vectorize it. If it is not a horizontal reduction or |
| 23664 | // vectorization is not possible or not effective, and currently analyzed |
| 23665 | // instruction is a binary operation, try to vectorize the operands, using |
| 23666 | // pre-order DFS traversal order. If the operands were not vectorized, repeat |
| 23667 | // the same procedure considering each operand as a possible root of the |
| 23668 | // horizontal reduction. |
| 23669 | // Interrupt the process if the Root instruction itself was vectorized or all |
| 23670 | // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized. |
| 23671 | // If a horizontal reduction was not matched or vectorized, we collect the |
| 23672 | // instructions for possible later vectorization attempts. |
| 23673 | std::queue<std::pair<Instruction *, unsigned>> Stack; |
| 23674 | Stack.emplace(args: SelectRoot(), args: 0); |
| 23675 | SmallPtrSet<Value *, 8> VisitedInstrs; |
| 23676 | bool Res = false; |
| 23677 | auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * { |
| 23678 | if (R.isAnalyzedReductionRoot(I: Inst)) |
| 23679 | return nullptr; |
| 23680 | if (!isReductionCandidate(I: Inst)) |
| 23681 | return nullptr; |
| 23682 | HorizontalReduction HorRdx; |
| 23683 | if (!HorRdx.matchAssociativeReduction(R, Root: Inst, SE&: *SE, DL: *DL, TLI: *TLI)) |
| 23684 | return nullptr; |
| 23685 | return HorRdx.tryToReduce(V&: R, DL: *DL, TTI, TLI: *TLI, AC); |
| 23686 | }; |
| 23687 | auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) { |
| 23688 | if (TryOperandsAsNewSeeds && FutureSeed == Root) { |
| 23689 | FutureSeed = getNonPhiOperand(I: Root, Phi: P); |
| 23690 | if (!FutureSeed) |
| 23691 | return false; |
| 23692 | } |
| 23693 | // Do not collect CmpInst or InsertElementInst/InsertValueInst as their |
| 23694 | // analysis is done separately. |
| 23695 | if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Val: FutureSeed)) |
| 23696 | PostponedInsts.push_back(Elt: FutureSeed); |
| 23697 | return true; |
| 23698 | }; |
| 23699 | |
| 23700 | while (!Stack.empty()) { |
| 23701 | Instruction *Inst; |
| 23702 | unsigned Level; |
| 23703 | std::tie(args&: Inst, args&: Level) = Stack.front(); |
| 23704 | Stack.pop(); |
| 23705 | // Do not try to analyze instruction that has already been vectorized. |
| 23706 | // This may happen when we vectorize instruction operands on a previous |
| 23707 | // iteration while stack was populated before that happened. |
| 23708 | if (R.isDeleted(I: Inst)) |
| 23709 | continue; |
| 23710 | if (Value *VectorizedV = TryToReduce(Inst)) { |
| 23711 | Res = true; |
| 23712 | if (auto *I = dyn_cast<Instruction>(Val: VectorizedV)) { |
| 23713 | // Try to find another reduction. |
| 23714 | Stack.emplace(args&: I, args&: Level); |
| 23715 | continue; |
| 23716 | } |
| 23717 | if (R.isDeleted(I: Inst)) |
| 23718 | continue; |
| 23719 | } else { |
| 23720 | // We could not vectorize `Inst` so try to use it as a future seed. |
| 23721 | if (!TryAppendToPostponedInsts(Inst)) { |
| 23722 | assert(Stack.empty() && "Expected empty stack" ); |
| 23723 | break; |
| 23724 | } |
| 23725 | } |
| 23726 | |
| 23727 | // Try to vectorize operands. |
| 23728 | // Continue analysis for the instruction from the same basic block only to |
| 23729 | // save compile time. |
| 23730 | if (++Level < RecursionMaxDepth) |
| 23731 | for (auto *Op : Inst->operand_values()) |
| 23732 | if (VisitedInstrs.insert(Ptr: Op).second) |
| 23733 | if (auto *I = dyn_cast<Instruction>(Val: Op)) |
| 23734 | // Do not try to vectorize CmpInst operands, this is done |
| 23735 | // separately. |
| 23736 | if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(Val: I) && |
| 23737 | !R.isDeleted(I) && I->getParent() == BB) |
| 23738 | Stack.emplace(args&: I, args&: Level); |
| 23739 | } |
| 23740 | return Res; |
| 23741 | } |
| 23742 | |
| 23743 | bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root, |
| 23744 | BasicBlock *BB, BoUpSLP &R) { |
| 23745 | SmallVector<WeakTrackingVH> PostponedInsts; |
| 23746 | bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts); |
| 23747 | Res |= tryToVectorize(Insts: PostponedInsts, R); |
| 23748 | return Res; |
| 23749 | } |
| 23750 | |
| 23751 | bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts, |
| 23752 | BoUpSLP &R) { |
| 23753 | bool Res = false; |
| 23754 | for (Value *V : Insts) |
| 23755 | if (auto *Inst = dyn_cast<Instruction>(Val: V); Inst && !R.isDeleted(I: Inst)) |
| 23756 | Res |= tryToVectorize(I: Inst, R); |
| 23757 | return Res; |
| 23758 | } |
| 23759 | |
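// An illustrative insertvalue chain handled below:
//   %agg0 = insertvalue { float, float } poison, float %a, 0
//   %agg1 = insertvalue { float, float } %agg0, float %b, 1
// The inserted scalars (%a, %b) form the build-vector operand list that is
// handed to tryToVectorizeList.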
| 23760 | bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI, |
| 23761 | BasicBlock *BB, BoUpSLP &R, |
| 23762 | bool MaxVFOnly) { |
| 23763 | if (!R.canMapToVector(T: IVI->getType())) |
| 23764 | return false; |
| 23765 | |
| 23766 | SmallVector<Value *, 16> BuildVectorOpds; |
| 23767 | SmallVector<Value *, 16> BuildVectorInsts; |
| 23768 | if (!findBuildAggregate(LastInsertInst: IVI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts, R)) |
| 23769 | return false; |
| 23770 | |
| 23771 | if (MaxVFOnly && BuildVectorOpds.size() == 2) { |
| 23772 | R.getORE()->emit(RemarkBuilder: [&]() { |
| 23773 | return OptimizationRemarkMissed(SV_NAME, "NotPossible" , IVI) |
| 23774 | << "Cannot SLP vectorize list: only 2 elements of buildvalue, " |
| 23775 | "trying reduction first." ; |
| 23776 | }); |
| 23777 | return false; |
| 23778 | } |
| 23779 | LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n" ); |
// An aggregate value is unlikely to be processed in a vector register.
| 23781 | return tryToVectorizeList(VL: BuildVectorOpds, R, MaxVFOnly); |
| 23782 | } |
| 23783 | |
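// An illustrative insertelement chain handled below:
//   %v0 = insertelement <2 x float> poison, float %a, i32 0
//   %v1 = insertelement <2 x float> %v0, float %b, i32 1
// Here the insertelement instructions themselves form the list handed to
// tryToVectorizeList.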
| 23784 | bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, |
| 23785 | BasicBlock *BB, BoUpSLP &R, |
| 23786 | bool MaxVFOnly) { |
| 23787 | SmallVector<Value *, 16> BuildVectorInsts; |
| 23788 | SmallVector<Value *, 16> BuildVectorOpds; |
| 23789 | SmallVector<int> Mask; |
| 23790 | if (!findBuildAggregate(LastInsertInst: IEI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts, R) || |
| 23791 | (all_of(Range&: BuildVectorOpds, P: IsaPred<ExtractElementInst, UndefValue>) && |
| 23792 | isFixedVectorShuffle(VL: BuildVectorOpds, Mask, AC))) |
| 23793 | return false; |
| 23794 | |
| 23795 | if (MaxVFOnly && BuildVectorInsts.size() == 2) { |
| 23796 | R.getORE()->emit(RemarkBuilder: [&]() { |
| 23797 | return OptimizationRemarkMissed(SV_NAME, "NotPossible" , IEI) |
| 23798 | << "Cannot SLP vectorize list: only 2 elements of buildvector, " |
| 23799 | "trying reduction first." ; |
| 23800 | }); |
| 23801 | return false; |
| 23802 | } |
| 23803 | LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n" ); |
| 23804 | return tryToVectorizeList(VL: BuildVectorInsts, R, MaxVFOnly); |
| 23805 | } |
| 23806 | |
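/// Sorts \p Incoming with \p Comparator, forms maximal runs of values that
/// \p AreCompatible considers equivalent, and passes each run to
/// \p TryToVectorizeHelper (first honoring \p MaxVFOnly, then retrying the
/// remaining candidates with relaxed constraints).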
| 23807 | template <typename T> |
| 23808 | static bool tryToVectorizeSequence( |
| 23809 | SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator, |
| 23810 | function_ref<bool(T *, T *)> AreCompatible, |
| 23811 | function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper, |
| 23812 | bool MaxVFOnly, BoUpSLP &R) { |
| 23813 | bool Changed = false; |
| 23814 | // Sort by type, parent, operands. |
| 23815 | stable_sort(Incoming, Comparator); |
| 23816 | |
// Try to vectorize elements based on their type.
| 23818 | SmallVector<T *> Candidates; |
| 23819 | SmallVector<T *> VL; |
| 23820 | for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E; |
| 23821 | VL.clear()) { |
| 23822 | // Look for the next elements with the same type, parent and operand |
| 23823 | // kinds. |
| 23824 | auto *I = dyn_cast<Instruction>(*IncIt); |
| 23825 | if (!I || R.isDeleted(I)) { |
| 23826 | ++IncIt; |
| 23827 | continue; |
| 23828 | } |
| 23829 | auto *SameTypeIt = IncIt; |
| 23830 | while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) || |
| 23831 | R.isDeleted(I: cast<Instruction>(*SameTypeIt)) || |
| 23832 | AreCompatible(*SameTypeIt, *IncIt))) { |
| 23833 | auto *I = dyn_cast<Instruction>(*SameTypeIt); |
| 23834 | ++SameTypeIt; |
| 23835 | if (I && !R.isDeleted(I)) |
| 23836 | VL.push_back(cast<T>(I)); |
| 23837 | } |
| 23838 | |
| 23839 | // Try to vectorize them. |
| 23840 | unsigned NumElts = VL.size(); |
| 23841 | LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes (" |
| 23842 | << NumElts << ")\n" ); |
// The vectorization is a 3-step attempt:
// 1. Try to vectorize instructions with the same/alternate opcodes with the
// size of the maximal register at first.
// 2. Try to vectorize remaining instructions with the same type, if
// possible. This may give better vectorization results than trying to
// vectorize only instructions with the same/alternate opcodes.
// 3. Finally, try to vectorize all instructions with the same/alternate ops
// only; this may result in some extra final vectorization.
| 23852 | if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) { |
// Success, start over because instructions might have been changed.
| 23854 | Changed = true; |
| 23855 | VL.swap(Candidates); |
| 23856 | Candidates.clear(); |
| 23857 | for (T *V : VL) { |
| 23858 | if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I)) |
| 23859 | Candidates.push_back(V); |
| 23860 | } |
| 23861 | } else { |
| 23862 | /// \Returns the minimum number of elements that we will attempt to |
| 23863 | /// vectorize. |
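/// E.g. with a 128-bit maximal vector register and 32-bit elements this
/// returns 4.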
| 23864 | auto GetMinNumElements = [&R](Value *V) { |
| 23865 | unsigned EltSize = R.getVectorElementSize(V); |
| 23866 | return std::max(a: 2U, b: R.getMaxVecRegSize() / EltSize); |
| 23867 | }; |
| 23868 | if (NumElts < GetMinNumElements(*IncIt) && |
| 23869 | (Candidates.empty() || |
| 23870 | Candidates.front()->getType() == (*IncIt)->getType())) { |
| 23871 | for (T *V : VL) { |
| 23872 | if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I)) |
| 23873 | Candidates.push_back(V); |
| 23874 | } |
| 23875 | } |
| 23876 | } |
| 23877 | // Final attempt to vectorize instructions with the same types. |
| 23878 | if (Candidates.size() > 1 && |
| 23879 | (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) { |
| 23880 | if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) { |
// Success, start over because instructions might have been changed.
| 23882 | Changed = true; |
| 23883 | } else if (MaxVFOnly) { |
| 23884 | // Try to vectorize using small vectors. |
| 23885 | SmallVector<T *> VL; |
| 23886 | for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End; |
| 23887 | VL.clear()) { |
| 23888 | auto *I = dyn_cast<Instruction>(*It); |
| 23889 | if (!I || R.isDeleted(I)) { |
| 23890 | ++It; |
| 23891 | continue; |
| 23892 | } |
| 23893 | auto *SameTypeIt = It; |
| 23894 | while (SameTypeIt != End && |
| 23895 | (!isa<Instruction>(*SameTypeIt) || |
| 23896 | R.isDeleted(I: cast<Instruction>(*SameTypeIt)) || |
| 23897 | AreCompatible(*SameTypeIt, *It))) { |
| 23898 | auto *I = dyn_cast<Instruction>(*SameTypeIt); |
| 23899 | ++SameTypeIt; |
| 23900 | if (I && !R.isDeleted(I)) |
| 23901 | VL.push_back(cast<T>(I)); |
| 23902 | } |
| 23903 | unsigned NumElts = VL.size(); |
| 23904 | if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), |
| 23905 | /*MaxVFOnly=*/false)) |
| 23906 | Changed = true; |
| 23907 | It = SameTypeIt; |
| 23908 | } |
| 23909 | } |
| 23910 | Candidates.clear(); |
| 23911 | } |
| 23912 | |
| 23913 | // Start over at the next instruction of a different type (or the end). |
| 23914 | IncIt = SameTypeIt; |
| 23915 | } |
| 23916 | return Changed; |
| 23917 | } |
| 23918 | |
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps have the same or swapped predicates and
/// compatible corresponding operands. If IsCompatibility is false, the
/// function implements a strict weak ordering relation between the two cmp
/// instructions, returning true if the first instruction is "less" than the
/// second, i.e. its predicate is less than the predicate of the second or
/// the operand IDs are less than the operand IDs of the second cmp
/// instruction.
| 23926 | template <bool IsCompatibility> |
| 23927 | static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, |
| 23928 | const DominatorTree &DT) { |
| 23929 | assert(isValidElementType(V->getType()) && |
| 23930 | isValidElementType(V2->getType()) && |
| 23931 | "Expected valid element types only." ); |
| 23932 | if (V == V2) |
| 23933 | return IsCompatibility; |
| 23934 | auto *CI1 = cast<CmpInst>(Val: V); |
| 23935 | auto *CI2 = cast<CmpInst>(Val: V2); |
| 23936 | if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() < |
| 23937 | CI2->getOperand(i_nocapture: 0)->getType()->getTypeID()) |
| 23938 | return !IsCompatibility; |
| 23939 | if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() > |
| 23940 | CI2->getOperand(i_nocapture: 0)->getType()->getTypeID()) |
| 23941 | return false; |
| 23942 | if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() < |
| 23943 | CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits()) |
| 23944 | return !IsCompatibility; |
| 23945 | if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() > |
| 23946 | CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits()) |
| 23947 | return false; |
| 23948 | CmpInst::Predicate Pred1 = CI1->getPredicate(); |
| 23949 | CmpInst::Predicate Pred2 = CI2->getPredicate(); |
| 23950 | CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(pred: Pred1); |
| 23951 | CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(pred: Pred2); |
| 23952 | CmpInst::Predicate BasePred1 = std::min(a: Pred1, b: SwapPred1); |
| 23953 | CmpInst::Predicate BasePred2 = std::min(a: Pred2, b: SwapPred2); |
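// E.g. "icmp sgt %a, %b" and "icmp slt %b, %a" map to the same base
// predicate, so operand-swapped forms are treated alike below.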
| 23954 | if (BasePred1 < BasePred2) |
| 23955 | return !IsCompatibility; |
| 23956 | if (BasePred1 > BasePred2) |
| 23957 | return false; |
| 23958 | // Compare operands. |
| 23959 | bool CI1Preds = Pred1 == BasePred1; |
| 23960 | bool CI2Preds = Pred2 == BasePred1; |
| 23961 | for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) { |
| 23962 | auto *Op1 = CI1->getOperand(i_nocapture: CI1Preds ? I : E - I - 1); |
| 23963 | auto *Op2 = CI2->getOperand(i_nocapture: CI2Preds ? I : E - I - 1); |
| 23964 | if (Op1 == Op2) |
| 23965 | continue; |
| 23966 | if (Op1->getValueID() < Op2->getValueID()) |
| 23967 | return !IsCompatibility; |
| 23968 | if (Op1->getValueID() > Op2->getValueID()) |
| 23969 | return false; |
| 23970 | if (auto *I1 = dyn_cast<Instruction>(Val: Op1)) |
| 23971 | if (auto *I2 = dyn_cast<Instruction>(Val: Op2)) { |
| 23972 | if (IsCompatibility) { |
| 23973 | if (I1->getParent() != I2->getParent()) |
| 23974 | return false; |
| 23975 | } else { |
| 23976 | // Try to compare nodes with same parent. |
| 23977 | DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(BB: I1->getParent()); |
| 23978 | DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(BB: I2->getParent()); |
| 23979 | if (!NodeI1) |
| 23980 | return NodeI2 != nullptr; |
| 23981 | if (!NodeI2) |
| 23982 | return false; |
| 23983 | assert((NodeI1 == NodeI2) == |
| 23984 | (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && |
| 23985 | "Different nodes should have different DFS numbers" ); |
| 23986 | if (NodeI1 != NodeI2) |
| 23987 | return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); |
| 23988 | } |
| 23989 | InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI); |
| 23990 | if (S && (IsCompatibility || !S.isAltShuffle())) |
| 23991 | continue; |
| 23992 | if (IsCompatibility) |
| 23993 | return false; |
| 23994 | if (I1->getOpcode() != I2->getOpcode()) |
| 23995 | return I1->getOpcode() < I2->getOpcode(); |
| 23996 | } |
| 23997 | } |
| 23998 | return IsCompatibility; |
| 23999 | } |
| 24000 | |
| 24001 | template <typename ItT> |
| 24002 | bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts, |
| 24003 | BasicBlock *BB, BoUpSLP &R) { |
| 24004 | bool Changed = false; |
| 24005 | // Try to find reductions first. |
| 24006 | for (CmpInst *I : CmpInsts) { |
| 24007 | if (R.isDeleted(I)) |
| 24008 | continue; |
| 24009 | for (Value *Op : I->operands()) |
| 24010 | if (auto *RootOp = dyn_cast<Instruction>(Val: Op)) { |
| 24011 | Changed |= vectorizeRootInstruction(P: nullptr, Root: RootOp, BB, R); |
| 24012 | if (R.isDeleted(I)) |
| 24013 | break; |
| 24014 | } |
| 24015 | } |
| 24016 | // Try to vectorize operands as vector bundles. |
| 24017 | for (CmpInst *I : CmpInsts) { |
| 24018 | if (R.isDeleted(I)) |
| 24019 | continue; |
| 24020 | Changed |= tryToVectorize(I, R); |
| 24021 | } |
| 24022 | // Try to vectorize list of compares. |
| 24023 | // Sort by type, compare predicate, etc. |
| 24024 | auto CompareSorter = [&](Value *V, Value *V2) { |
| 24025 | if (V == V2) |
| 24026 | return false; |
| 24027 | return compareCmp<false>(V, V2, TLI&: *TLI, DT: *DT); |
| 24028 | }; |
| 24029 | |
| 24030 | auto AreCompatibleCompares = [&](Value *V1, Value *V2) { |
| 24031 | if (V1 == V2) |
| 24032 | return true; |
| 24033 | return compareCmp<true>(V: V1, V2, TLI&: *TLI, DT: *DT); |
| 24034 | }; |
| 24035 | |
| 24036 | SmallVector<Value *> Vals; |
| 24037 | for (Instruction *V : CmpInsts) |
| 24038 | if (!R.isDeleted(I: V) && isValidElementType(Ty: getValueType(V))) |
| 24039 | Vals.push_back(Elt: V); |
| 24040 | if (Vals.size() <= 1) |
| 24041 | return Changed; |
| 24042 | Changed |= tryToVectorizeSequence<Value>( |
| 24043 | Vals, CompareSorter, AreCompatibleCompares, |
| 24044 | [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) { |
| 24045 | // Exclude possible reductions from other blocks. |
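// E.g. a cmp whose user is a select in another block is likely part of
// a min/max reduction rooted there and is better handled by that attempt.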
| 24046 | bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) { |
| 24047 | return any_of(V->users(), [V](User *U) { |
| 24048 | auto *Select = dyn_cast<SelectInst>(Val: U); |
| 24049 | return Select && |
| 24050 | Select->getParent() != cast<Instruction>(Val: V)->getParent(); |
| 24051 | }); |
| 24052 | }); |
| 24053 | if (ArePossiblyReducedInOtherBlock) |
| 24054 | return false; |
| 24055 | return tryToVectorizeList(VL: Candidates, R, MaxVFOnly); |
| 24056 | }, |
| 24057 | /*MaxVFOnly=*/true, R); |
| 24058 | return Changed; |
| 24059 | } |
| 24060 | |
| 24061 | bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions, |
| 24062 | BasicBlock *BB, BoUpSLP &R) { |
| 24063 | assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) && |
| 24064 | "This function only accepts Insert instructions" ); |
| 24065 | bool OpsChanged = false; |
| 24066 | SmallVector<WeakTrackingVH> PostponedInsts; |
| 24067 | for (auto *I : reverse(C&: Instructions)) { |
| 24068 | // pass1 - try to match and vectorize a buildvector sequence for MaxVF only. |
| 24069 | if (R.isDeleted(I) || isa<CmpInst>(Val: I)) |
| 24070 | continue; |
| 24071 | if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) { |
| 24072 | OpsChanged |= |
| 24073 | vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/true); |
| 24074 | } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) { |
| 24075 | OpsChanged |= |
| 24076 | vectorizeInsertElementInst(IEI: LastInsertElem, BB, R, /*MaxVFOnly=*/true); |
| 24077 | } |
| 24078 | // pass2 - try to vectorize reductions only |
| 24079 | if (R.isDeleted(I)) |
| 24080 | continue; |
| 24081 | OpsChanged |= vectorizeHorReduction(P: nullptr, Root: I, BB, R, PostponedInsts); |
| 24082 | if (R.isDeleted(I) || isa<CmpInst>(Val: I)) |
| 24083 | continue; |
| 24084 | // pass3 - try to match and vectorize a buildvector sequence. |
| 24085 | if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) { |
| 24086 | OpsChanged |= |
| 24087 | vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/false); |
| 24088 | } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) { |
| 24089 | OpsChanged |= vectorizeInsertElementInst(IEI: LastInsertElem, BB, R, |
| 24090 | /*MaxVFOnly=*/false); |
| 24091 | } |
| 24092 | } |
| 24093 | // Now try to vectorize postponed instructions. |
| 24094 | OpsChanged |= tryToVectorize(Insts: PostponedInsts, R); |
| 24095 | |
| 24096 | Instructions.clear(); |
| 24097 | return OpsChanged; |
| 24098 | } |
| 24099 | |
| 24100 | bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { |
| 24101 | bool Changed = false; |
| 24102 | SmallVector<Value *, 4> Incoming; |
| 24103 | SmallPtrSet<Value *, 16> VisitedInstrs; |
// Maps phi nodes to the non-phi nodes found in the use tree for each phi
// node. This makes it easier to identify the chains that can be vectorized
// in the best way.
| 24107 | DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes; |
| 24108 | auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) { |
| 24109 | assert(isValidElementType(V1->getType()) && |
| 24110 | isValidElementType(V2->getType()) && |
| 24111 | "Expected vectorizable types only." ); |
| 24112 | if (V1 == V2) |
| 24113 | return false; |
// It is fine to compare type IDs here, since we expect only vectorizable
// types, like ints, floats and pointers; we don't care about other types.
| 24116 | if (V1->getType()->getTypeID() < V2->getType()->getTypeID()) |
| 24117 | return true; |
| 24118 | if (V1->getType()->getTypeID() > V2->getType()->getTypeID()) |
| 24119 | return false; |
| 24120 | if (V1->getType()->getScalarSizeInBits() < |
| 24121 | V2->getType()->getScalarSizeInBits()) |
| 24122 | return true; |
| 24123 | if (V1->getType()->getScalarSizeInBits() > |
| 24124 | V2->getType()->getScalarSizeInBits()) |
| 24125 | return false; |
| 24126 | ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; |
| 24127 | ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2]; |
| 24128 | if (Opcodes1.size() < Opcodes2.size()) |
| 24129 | return true; |
| 24130 | if (Opcodes1.size() > Opcodes2.size()) |
| 24131 | return false; |
| 24132 | for (int I = 0, E = Opcodes1.size(); I < E; ++I) { |
| 24133 | { |
| 24134 | // Instructions come first. |
| 24135 | auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]); |
| 24136 | auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I]); |
| 24137 | if (I1 && I2) { |
| 24138 | DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent()); |
| 24139 | DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent()); |
| 24140 | if (!NodeI1) |
| 24141 | return NodeI2 != nullptr; |
| 24142 | if (!NodeI2) |
| 24143 | return false; |
| 24144 | assert((NodeI1 == NodeI2) == |
| 24145 | (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && |
| 24146 | "Different nodes should have different DFS numbers" ); |
| 24147 | if (NodeI1 != NodeI2) |
| 24148 | return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); |
| 24149 | InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI); |
| 24150 | if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) { |
| 24151 | const auto *E1 = dyn_cast<ExtractElementInst>(Val: I1); |
| 24152 | const auto *E2 = dyn_cast<ExtractElementInst>(Val: I2); |
| 24153 | if (!E1 || !E2) |
| 24154 | continue; |
| 24155 | |
| 24156 | // Sort on ExtractElementInsts primarily by vector operands. Prefer |
| 24157 | // program order of the vector operands. |
| 24158 | const auto *V1 = dyn_cast<Instruction>(Val: E1->getVectorOperand()); |
| 24159 | const auto *V2 = dyn_cast<Instruction>(Val: E2->getVectorOperand()); |
| 24160 | if (V1 != V2) { |
| 24161 | if (V1 && !V2) |
| 24162 | return true; |
| 24163 | if (!V1 && V2) |
| 24164 | return false; |
| 24165 | DomTreeNodeBase<BasicBlock> *NodeI1 = |
| 24166 | DT->getNode(BB: V1->getParent()); |
| 24167 | DomTreeNodeBase<BasicBlock> *NodeI2 = |
| 24168 | DT->getNode(BB: V2->getParent()); |
| 24169 | if (!NodeI1) |
| 24170 | return NodeI2 != nullptr; |
| 24171 | if (!NodeI2) |
| 24172 | return false; |
| 24173 | assert((NodeI1 == NodeI2) == |
| 24174 | (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && |
| 24175 | "Different nodes should have different DFS numbers" ); |
| 24176 | if (NodeI1 != NodeI2) |
| 24177 | return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); |
| 24178 | return V1->comesBefore(Other: V2); |
| 24179 | } |
| 24180 | // If we have the same vector operand, try to sort by constant |
| 24181 | // index. |
| 24182 | std::optional<unsigned> Id1 = getExtractIndex(E: E1); |
| 24183 | std::optional<unsigned> Id2 = getExtractIndex(E: E2); |
| 24184 | // Bring constants to the top |
| 24185 | if (Id1 && !Id2) |
| 24186 | return true; |
| 24187 | if (!Id1 && Id2) |
| 24188 | return false; |
| 24189 | // First elements come first. |
| 24190 | if (Id1 && Id2) |
| 24191 | return *Id1 < *Id2; |
| 24192 | |
| 24193 | continue; |
| 24194 | } |
| 24195 | if (I1->getOpcode() == I2->getOpcode()) |
| 24196 | continue; |
| 24197 | return I1->getOpcode() < I2->getOpcode(); |
| 24198 | } |
| 24199 | if (I1) |
| 24200 | return true; |
| 24201 | if (I2) |
| 24202 | return false; |
| 24203 | } |
| 24204 | { |
| 24205 | // Non-undef constants come next. |
| 24206 | bool C1 = isa<Constant>(Val: Opcodes1[I]) && !isa<UndefValue>(Val: Opcodes1[I]); |
| 24207 | bool C2 = isa<Constant>(Val: Opcodes2[I]) && !isa<UndefValue>(Val: Opcodes2[I]); |
| 24208 | if (C1 && C2) |
| 24209 | continue; |
| 24210 | if (C1) |
| 24211 | return true; |
| 24212 | if (C2) |
| 24213 | return false; |
| 24214 | } |
| 24215 | bool U1 = isa<UndefValue>(Val: Opcodes1[I]); |
| 24216 | bool U2 = isa<UndefValue>(Val: Opcodes2[I]); |
| 24217 | { |
| 24218 | // Non-constant non-instructions come next. |
| 24219 | if (!U1 && !U2) { |
| 24220 | auto ValID1 = Opcodes1[I]->getValueID(); |
| 24221 | auto ValID2 = Opcodes2[I]->getValueID(); |
| 24222 | if (ValID1 == ValID2) |
| 24223 | continue; |
| 24224 | if (ValID1 < ValID2) |
| 24225 | return true; |
| 24226 | if (ValID1 > ValID2) |
| 24227 | return false; |
| 24228 | } |
| 24229 | if (!U1) |
| 24230 | return true; |
| 24231 | if (!U2) |
| 24232 | return false; |
| 24233 | } |
| 24234 | // Undefs come last. |
| 24235 | assert(U1 && U2 && "The only thing left should be undef & undef." ); |
| 24236 | } |
| 24237 | return false; |
| 24238 | }; |
| 24239 | auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) { |
| 24240 | if (V1 == V2) |
| 24241 | return true; |
| 24242 | if (V1->getType() != V2->getType()) |
| 24243 | return false; |
| 24244 | ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1]; |
| 24245 | ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2]; |
| 24246 | if (Opcodes1.size() != Opcodes2.size()) |
| 24247 | return false; |
| 24248 | for (int I = 0, E = Opcodes1.size(); I < E; ++I) { |
| 24249 | // Undefs are compatible with any other value. |
| 24250 | if (isa<UndefValue>(Val: Opcodes1[I]) || isa<UndefValue>(Val: Opcodes2[I])) |
| 24251 | continue; |
| 24252 | if (auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I])) |
| 24253 | if (auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I])) { |
| 24254 | if (R.isDeleted(I: I1) || R.isDeleted(I: I2)) |
| 24255 | return false; |
| 24256 | if (I1->getParent() != I2->getParent()) |
| 24257 | return false; |
| 24258 | if (getSameOpcode(VL: {I1, I2}, TLI: *TLI)) |
| 24259 | continue; |
| 24260 | return false; |
| 24261 | } |
| 24262 | if (isa<Constant>(Val: Opcodes1[I]) && isa<Constant>(Val: Opcodes2[I])) |
| 24263 | continue; |
| 24264 | if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID()) |
| 24265 | return false; |
| 24266 | } |
| 24267 | return true; |
| 24268 | }; |
| 24269 | |
| 24270 | bool HaveVectorizedPhiNodes = false; |
| 24271 | do { |
| 24272 | // Collect the incoming values from the PHIs. |
| 24273 | Incoming.clear(); |
| 24274 | for (Instruction &I : *BB) { |
| 24275 | auto *P = dyn_cast<PHINode>(Val: &I); |
| 24276 | if (!P || P->getNumIncomingValues() > MaxPHINumOperands) |
| 24277 | break; |
| 24278 | |
| 24279 | // No need to analyze deleted, vectorized and non-vectorizable |
| 24280 | // instructions. |
| 24281 | if (!VisitedInstrs.count(Ptr: P) && !R.isDeleted(I: P) && |
| 24282 | isValidElementType(Ty: P->getType())) |
| 24283 | Incoming.push_back(Elt: P); |
| 24284 | } |
| 24285 | |
| 24286 | if (Incoming.size() <= 1) |
| 24287 | break; |
| 24288 | |
| 24289 | // Find the corresponding non-phi nodes for better matching when trying to |
| 24290 | // build the tree. |
| 24291 | for (Value *V : Incoming) { |
| 24292 | SmallVectorImpl<Value *> &Opcodes = |
| 24293 | PHIToOpcodes.try_emplace(Key: V).first->getSecond(); |
| 24294 | if (!Opcodes.empty()) |
| 24295 | continue; |
| 24296 | SmallVector<Value *, 4> Nodes(1, V); |
| 24297 | SmallPtrSet<Value *, 4> Visited; |
| 24298 | while (!Nodes.empty()) { |
| 24299 | auto *PHI = cast<PHINode>(Val: Nodes.pop_back_val()); |
| 24300 | if (!Visited.insert(Ptr: PHI).second) |
| 24301 | continue; |
| 24302 | for (Value *V : PHI->incoming_values()) { |
| 24303 | if (auto *PHI1 = dyn_cast<PHINode>(Val: (V))) { |
| 24304 | Nodes.push_back(Elt: PHI1); |
| 24305 | continue; |
| 24306 | } |
| 24307 | Opcodes.emplace_back(Args&: V); |
| 24308 | } |
| 24309 | } |
| 24310 | } |
| 24311 | |
| 24312 | HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>( |
| 24313 | Incoming, Comparator: PHICompare, AreCompatible: AreCompatiblePHIs, |
| 24314 | TryToVectorizeHelper: [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) { |
| 24315 | return tryToVectorizeList(VL: Candidates, R, MaxVFOnly); |
| 24316 | }, |
| 24317 | /*MaxVFOnly=*/true, R); |
| 24318 | Changed |= HaveVectorizedPhiNodes; |
| 24319 | if (HaveVectorizedPhiNodes && any_of(Range&: PHIToOpcodes, P: [&](const auto &P) { |
| 24320 | auto *PHI = dyn_cast<PHINode>(P.first); |
| 24321 | return !PHI || R.isDeleted(I: PHI); |
| 24322 | })) |
| 24323 | PHIToOpcodes.clear(); |
| 24324 | VisitedInstrs.insert_range(R&: Incoming); |
| 24325 | } while (HaveVectorizedPhiNodes); |
| 24326 | |
| 24327 | VisitedInstrs.clear(); |
| 24328 | |
| 24329 | InstSetVector PostProcessInserts; |
| 24330 | SmallSetVector<CmpInst *, 8> PostProcessCmps; |
// Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
// true, also vectorizes `PostProcessCmps`.
| 24333 | auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) { |
| 24334 | bool Changed = vectorizeInserts(Instructions&: PostProcessInserts, BB, R); |
| 24335 | if (VectorizeCmps) { |
| 24336 | Changed |= vectorizeCmpInsts(CmpInsts: reverse(C&: PostProcessCmps), BB, R); |
| 24337 | PostProcessCmps.clear(); |
| 24338 | } |
| 24339 | PostProcessInserts.clear(); |
| 24340 | return Changed; |
| 24341 | }; |
| 24342 | // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`. |
| 24343 | auto IsInPostProcessInstrs = [&](Instruction *I) { |
| 24344 | if (auto *Cmp = dyn_cast<CmpInst>(Val: I)) |
| 24345 | return PostProcessCmps.contains(key: Cmp); |
| 24346 | return isa<InsertElementInst, InsertValueInst>(Val: I) && |
| 24347 | PostProcessInserts.contains(key: I); |
| 24348 | }; |
// Returns true if `I` has no users and is either of void type (e.g. a
// terminator or a store) or a call/invoke whose return value is ignored;
// unused instructions of other types are not considered here.
| 24352 | auto HasNoUsers = [](Instruction *I) { |
| 24353 | return I->use_empty() && |
| 24354 | (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(Val: I)); |
| 24355 | }; |
| 24356 | for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) { |
// Skip instructions with a scalable type. The number of elements is
// unknown at compile time for scalable types.
| 24359 | if (isa<ScalableVectorType>(Val: It->getType())) |
| 24360 | continue; |
| 24361 | |
// Skip instructions marked for deletion.
| 24363 | if (R.isDeleted(I: &*It)) |
| 24364 | continue; |
// We may go through BB multiple times so skip the ones already checked.
| 24366 | if (!VisitedInstrs.insert(Ptr: &*It).second) { |
| 24367 | if (HasNoUsers(&*It) && |
| 24368 | VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) { |
// We would like to start over since some instructions are deleted
// and the iterator may become invalid.
| 24371 | Changed = true; |
| 24372 | It = BB->begin(); |
| 24373 | E = BB->end(); |
| 24374 | } |
| 24375 | continue; |
| 24376 | } |
| 24377 | |
| 24378 | // Try to vectorize reductions that use PHINodes. |
| 24379 | if (PHINode *P = dyn_cast<PHINode>(Val&: It)) { |
| 24380 | // Check that the PHI is a reduction PHI. |
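// A typical reduction PHI has two incoming values, e.g.
//   %sum = phi i32 [ 0, %preheader ], [ %sum.next, %loop ]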
| 24381 | if (P->getNumIncomingValues() == 2) { |
| 24382 | // Try to match and vectorize a horizontal reduction. |
| 24383 | Instruction *Root = getReductionInstr(DT, P, ParentBB: BB, LI); |
| 24384 | if (Root && vectorizeRootInstruction(P, Root, BB, R)) { |
| 24385 | Changed = true; |
| 24386 | It = BB->begin(); |
| 24387 | E = BB->end(); |
| 24388 | continue; |
| 24389 | } |
| 24390 | } |
| 24391 | // Try to vectorize the incoming values of the PHI, to catch reductions |
| 24392 | // that feed into PHIs. |
| 24393 | for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) { |
| 24394 | // Skip if the incoming block is the current BB for now. Also, bypass |
| 24395 | // unreachable IR for efficiency and to avoid crashing. |
| 24396 | // TODO: Collect the skipped incoming values and try to vectorize them |
| 24397 | // after processing BB. |
| 24398 | if (BB == P->getIncomingBlock(i: I) || |
| 24399 | !DT->isReachableFromEntry(A: P->getIncomingBlock(i: I))) |
| 24400 | continue; |
| 24401 | |
| 24402 | // Postponed instructions should not be vectorized here, delay their |
| 24403 | // vectorization. |
| 24404 | if (auto *PI = dyn_cast<Instruction>(Val: P->getIncomingValue(i: I)); |
| 24405 | PI && !IsInPostProcessInstrs(PI)) { |
| 24406 | bool Res = |
| 24407 | vectorizeRootInstruction(P: nullptr, Root: PI, BB: P->getIncomingBlock(i: I), R); |
| 24408 | Changed |= Res; |
| 24409 | if (Res && R.isDeleted(I: P)) { |
| 24410 | It = BB->begin(); |
| 24411 | E = BB->end(); |
| 24412 | break; |
| 24413 | } |
| 24414 | } |
| 24415 | } |
| 24416 | continue; |
| 24417 | } |
| 24418 | |
| 24419 | if (HasNoUsers(&*It)) { |
| 24420 | bool OpsChanged = false; |
| 24421 | auto *SI = dyn_cast<StoreInst>(Val&: It); |
| 24422 | bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI; |
| 24423 | if (SI) { |
| 24424 | auto *I = Stores.find(Key: getUnderlyingObject(V: SI->getPointerOperand())); |
// Try to vectorize the chain in the store, if this is the only store to
// the address in the block.
// TODO: This is just a temporary solution to save compile time. Need to
// investigate if we can safely turn on slp-vectorize-hor-store instead to
// allow lookup for reduction chains in all non-vectorized stores (need to
// check side effects and compile time).
| 24431 | TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) && |
| 24432 | SI->getValueOperand()->hasOneUse(); |
| 24433 | } |
| 24434 | if (TryToVectorizeRoot) { |
| 24435 | for (auto *V : It->operand_values()) { |
| 24436 | // Postponed instructions should not be vectorized here, delay their |
| 24437 | // vectorization. |
| 24438 | if (auto *VI = dyn_cast<Instruction>(Val: V); |
| 24439 | VI && !IsInPostProcessInstrs(VI)) |
| 24440 | // Try to match and vectorize a horizontal reduction. |
| 24441 | OpsChanged |= vectorizeRootInstruction(P: nullptr, Root: VI, BB, R); |
| 24442 | } |
| 24443 | } |
// Start vectorization of the post-process list of instructions from the
// top-tree instructions to try to vectorize as many instructions as
// possible.
| 24447 | OpsChanged |= |
| 24448 | VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator()); |
| 24449 | if (OpsChanged) { |
// We would like to start over since some instructions are deleted
// and the iterator may become invalid.
| 24452 | Changed = true; |
| 24453 | It = BB->begin(); |
| 24454 | E = BB->end(); |
| 24455 | continue; |
| 24456 | } |
| 24457 | } |
| 24458 | |
| 24459 | if (isa<InsertElementInst, InsertValueInst>(Val: It)) |
| 24460 | PostProcessInserts.insert(X: &*It); |
| 24461 | else if (isa<CmpInst>(Val: It)) |
| 24462 | PostProcessCmps.insert(X: cast<CmpInst>(Val: &*It)); |
| 24463 | } |
| 24464 | |
| 24465 | return Changed; |
| 24466 | } |
| 24467 | |
| 24468 | bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) { |
| 24469 | auto Changed = false; |
| 24470 | for (auto &Entry : GEPs) { |
| 24471 | // If the getelementptr list has fewer than two elements, there's nothing |
| 24472 | // to do. |
| 24473 | if (Entry.second.size() < 2) |
| 24474 | continue; |
| 24475 | |
| 24476 | LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length " |
| 24477 | << Entry.second.size() << ".\n" ); |
| 24478 | |
| 24479 | // Process the GEP list in chunks suitable for the target's supported |
| 24480 | // vector size. If a vector register can't hold 1 element, we are done. We |
| 24481 | // are trying to vectorize the index computations, so the maximum number of |
| 24482 | // elements is based on the size of the index expression, rather than the |
| 24483 | // size of the GEP itself (the target's pointer size). |
| 24484 | auto *It = find_if(Range&: Entry.second, P: [&](GetElementPtrInst *GEP) { |
| 24485 | return !R.isDeleted(I: GEP); |
| 24486 | }); |
| 24487 | if (It == Entry.second.end()) |
| 24488 | continue; |
| 24489 | unsigned MaxVecRegSize = R.getMaxVecRegSize(); |
| 24490 | unsigned EltSize = R.getVectorElementSize(V: *(*It)->idx_begin()); |
| 24491 | if (MaxVecRegSize < EltSize) |
| 24492 | continue; |
| 24493 | |
| 24494 | unsigned MaxElts = MaxVecRegSize / EltSize; |
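// E.g. 128-bit registers and 64-bit index expressions give chunks of two
// GEPs per attempt.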
| 24495 | for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) { |
| 24496 | auto Len = std::min<unsigned>(a: BE - BI, b: MaxElts); |
| 24497 | ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len); |
| 24498 | |
// Initialize a set of candidate getelementptrs. Note that we use a
| 24500 | // SetVector here to preserve program order. If the index computations |
| 24501 | // are vectorizable and begin with loads, we want to minimize the chance |
| 24502 | // of having to reorder them later. |
| 24503 | SetVector<Value *> Candidates(llvm::from_range, GEPList); |
| 24504 | |
// Some of the candidates may have already been vectorized after we
// initially collected them, or their index may have been optimized to a
// constant value. If so, they are marked as deleted, so remove them from
// the set of candidates.
| 24509 | Candidates.remove_if(P: [&R](Value *I) { |
| 24510 | return R.isDeleted(I: cast<Instruction>(Val: I)) || |
| 24511 | isa<Constant>(Val: cast<GetElementPtrInst>(Val: I)->idx_begin()->get()); |
| 24512 | }); |
| 24513 | |
| 24514 | // Remove from the set of candidates all pairs of getelementptrs with |
| 24515 | // constant differences. Such getelementptrs are likely not good |
| 24516 | // candidates for vectorization in a bottom-up phase since one can be |
| 24517 | // computed from the other. We also ensure all candidate getelementptr |
| 24518 | // indices are unique. |
| 24519 | for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) { |
| 24520 | auto *GEPI = GEPList[I]; |
| 24521 | if (!Candidates.count(key: GEPI)) |
| 24522 | continue; |
| 24523 | const SCEV *SCEVI = SE->getSCEV(V: GEPList[I]); |
| 24524 | for (int J = I + 1; J < E && Candidates.size() > 1; ++J) { |
| 24525 | auto *GEPJ = GEPList[J]; |
| 24526 | const SCEV *SCEVJ = SE->getSCEV(V: GEPList[J]); |
| 24527 | if (isa<SCEVConstant>(Val: SE->getMinusSCEV(LHS: SCEVI, RHS: SCEVJ))) { |
| 24528 | Candidates.remove(X: GEPI); |
| 24529 | Candidates.remove(X: GEPJ); |
| 24530 | } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) { |
| 24531 | Candidates.remove(X: GEPJ); |
| 24532 | } |
| 24533 | } |
| 24534 | } |
| 24535 | |
| 24536 | // We break out of the above computation as soon as we know there are |
| 24537 | // fewer than two candidates remaining. |
| 24538 | if (Candidates.size() < 2) |
| 24539 | continue; |
| 24540 | |
| 24541 | // Add the single, non-constant index of each candidate to the bundle. We |
| 24542 | // ensured the indices met these constraints when we originally collected |
| 24543 | // the getelementptrs. |
| 24544 | SmallVector<Value *, 16> Bundle(Candidates.size()); |
| 24545 | auto BundleIndex = 0u; |
| 24546 | for (auto *V : Candidates) { |
| 24547 | auto *GEP = cast<GetElementPtrInst>(Val: V); |
| 24548 | auto *GEPIdx = GEP->idx_begin()->get(); |
| 24549 | assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx)); |
| 24550 | Bundle[BundleIndex++] = GEPIdx; |
| 24551 | } |
| 24552 | |
| 24553 | // Try and vectorize the indices. We are currently only interested in |
| 24554 | // gather-like cases of the form: |
| 24555 | // |
| 24556 | // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ... |
| 24557 | // |
| 24558 | // where the loads of "a", the loads of "b", and the subtractions can be |
| 24559 | // performed in parallel. It's likely that detecting this pattern in a |
| 24560 | // bottom-up phase will be simpler and less costly than building a |
| 24561 | // full-blown top-down phase beginning at the consecutive loads. |
| 24562 | Changed |= tryToVectorizeList(VL: Bundle, R); |
| 24563 | } |
| 24564 | } |
| 24565 | return Changed; |
| 24566 | } |
| 24567 | |
| 24568 | bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { |
| 24569 | bool Changed = false; |
// Sort by type, base pointers and value operands. Value operands must be
// compatible (have the same opcode, same parent); otherwise it is
// definitely not profitable to try to vectorize them.
| 24573 | auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) { |
| 24574 | if (V->getValueOperand()->getType()->getTypeID() < |
| 24575 | V2->getValueOperand()->getType()->getTypeID()) |
| 24576 | return true; |
| 24577 | if (V->getValueOperand()->getType()->getTypeID() > |
| 24578 | V2->getValueOperand()->getType()->getTypeID()) |
| 24579 | return false; |
| 24580 | if (V->getPointerOperandType()->getTypeID() < |
| 24581 | V2->getPointerOperandType()->getTypeID()) |
| 24582 | return true; |
| 24583 | if (V->getPointerOperandType()->getTypeID() > |
| 24584 | V2->getPointerOperandType()->getTypeID()) |
| 24585 | return false; |
| 24586 | if (V->getValueOperand()->getType()->getScalarSizeInBits() < |
| 24587 | V2->getValueOperand()->getType()->getScalarSizeInBits()) |
| 24588 | return true; |
| 24589 | if (V->getValueOperand()->getType()->getScalarSizeInBits() > |
| 24590 | V2->getValueOperand()->getType()->getScalarSizeInBits()) |
| 24591 | return false; |
| 24592 | // UndefValues are compatible with all other values. |
| 24593 | if (auto *I1 = dyn_cast<Instruction>(Val: V->getValueOperand())) |
| 24594 | if (auto *I2 = dyn_cast<Instruction>(Val: V2->getValueOperand())) { |
| 24595 | DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = |
| 24596 | DT->getNode(BB: I1->getParent()); |
| 24597 | DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = |
| 24598 | DT->getNode(BB: I2->getParent()); |
| 24599 | assert(NodeI1 && "Should only process reachable instructions" ); |
| 24600 | assert(NodeI2 && "Should only process reachable instructions" ); |
| 24601 | assert((NodeI1 == NodeI2) == |
| 24602 | (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) && |
| 24603 | "Different nodes should have different DFS numbers" ); |
| 24604 | if (NodeI1 != NodeI2) |
| 24605 | return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn(); |
| 24606 | return I1->getOpcode() < I2->getOpcode(); |
| 24607 | } |
| 24608 | return V->getValueOperand()->getValueID() < |
| 24609 | V2->getValueOperand()->getValueID(); |
| 24610 | }; |
| 24611 | |
| 24612 | auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) { |
| 24613 | if (V1 == V2) |
| 24614 | return true; |
| 24615 | if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType()) |
| 24616 | return false; |
| 24617 | if (V1->getPointerOperandType() != V2->getPointerOperandType()) |
| 24618 | return false; |
| 24619 | // Undefs are compatible with any other value. |
| 24620 | if (isa<UndefValue>(Val: V1->getValueOperand()) || |
| 24621 | isa<UndefValue>(Val: V2->getValueOperand())) |
| 24622 | return true; |
| 24623 | if (auto *I1 = dyn_cast<Instruction>(Val: V1->getValueOperand())) |
| 24624 | if (auto *I2 = dyn_cast<Instruction>(Val: V2->getValueOperand())) { |
| 24625 | if (I1->getParent() != I2->getParent()) |
| 24626 | return false; |
| 24627 | return getSameOpcode(VL: {I1, I2}, TLI: *TLI).valid(); |
| 24628 | } |
| 24629 | if (isa<Constant>(Val: V1->getValueOperand()) && |
| 24630 | isa<Constant>(Val: V2->getValueOperand())) |
| 24631 | return true; |
| 24632 | return V1->getValueOperand()->getValueID() == |
| 24633 | V2->getValueOperand()->getValueID(); |
| 24634 | }; |
| 24635 | |
| 24636 | // Attempt to sort and vectorize each of the store-groups. |
| 24637 | DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted; |
| 24638 | for (auto &Pair : Stores) { |
| 24639 | if (Pair.second.size() < 2) |
| 24640 | continue; |
| 24641 | |
| 24642 | LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " |
| 24643 | << Pair.second.size() << ".\n" ); |
| 24644 | |
| 24645 | if (!isValidElementType(Ty: Pair.second.front()->getValueOperand()->getType())) |
| 24646 | continue; |
| 24647 | |
// Reverse the stores to do bottom-to-top analysis. This is important if
// the same address is stored to several times; in this case we need to
// follow the store order (reversed to meet the memory dependencies).
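// E.g. if the same location is stored to twice, the later store must stay
// after the earlier one, so the bottom-up scan starts from the last store.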
| 24651 | SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(), |
| 24652 | Pair.second.rend()); |
| 24653 | Changed |= tryToVectorizeSequence<StoreInst>( |
| 24654 | Incoming&: ReversedStores, Comparator: StoreSorter, AreCompatible: AreCompatibleStores, |
| 24655 | TryToVectorizeHelper: [&](ArrayRef<StoreInst *> Candidates, bool) { |
| 24656 | return vectorizeStores(Stores: Candidates, R, Visited&: Attempted); |
| 24657 | }, |
| 24658 | /*MaxVFOnly=*/false, R); |
| 24659 | } |
| 24660 | return Changed; |
| 24661 | } |
| 24662 | |