//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/PriorityQueue.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/DOTGraphTraits.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <memory>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
              "Controls which SLP graphs should be vectorized.");

static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
    "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
    cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
             "heuristics and makes vectorization decision via cost modeling."));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
    cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<bool> SplitAlternateInstructions(
    "slp-split-alternate-instructions", cl::init(true), cl::Hidden,
    cl::desc("Improve the code quality by splitting alternate instructions"));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
    cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand ordering this is less frequently used, hence
// the impact of a higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.
static constexpr int UsesLimit = 64;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// Returns the type of the given value/instruction \p V. If it is a store,
/// returns the type of its value operand; for Cmp - the type of the compare
/// operands; and for insertelement - the type of the inserted operand.
/// Otherwise, just the type of the value is returned.
static Type *getValueType(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  return V->getType();
}

/// \returns the number of elements for Ty.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}

/// \returns the vector type of ScalarTy based on vectorization factor.
static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
  return FixedVectorType::get(ScalarTy->getScalarType(),
                              VF * getNumElements(ScalarTy));
}
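
// Illustrative examples for getWidenedType: getWidenedType(i32, 4) produces
// <4 x i32>; under REVEC, where ScalarTy may itself be a fixed vector,
// getWidenedType(<2 x i8>, 4) produces <8 x i8>.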

/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_ceil(Sz);
  // Find the number of elements, which forms full vectors.
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);
  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}
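
// Worked example for getFullVectorNumberOfElements; the exact result is
// target-dependent. Assuming TTI reports 2 parts for the widened type when
// Sz == 7, the result is bit_ceil(divideCeil(7, 2)) * 2 == 4 * 2 == 8, i.e.
// the 7 scalars are rounded up so each register holds a whole power-of-2
// vector.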

/// Returns the number of elements of the given type \p Ty, not greater than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
                                   unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_floor(Sz);
  // Find the number of elements, which forms full vectors.
  unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
  if (RegVF > Sz)
    return bit_floor(Sz);
  return (Sz / RegVF) * RegVF;
}

static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {
  // The ShuffleBuilder implementation uses shufflevector to splat an
  // "element". But the element has a different meaning for SLP (scalar) and
  // REVEC (vector). We need to expand Mask into masks which shufflevector can
  // use directly.
  SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}
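
// Illustrative example: with VecTyNumElements == 2,
// transformScalarShuffleIndiciesToVector expands Mask == {1, 0} into
// {2, 3, 0, 1}, i.e. each scalar index becomes a run of vector-lane indices.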

/// \returns the number of groups of shufflevector
/// A group has the following features
/// 1. All of the values in a group are shufflevectors.
/// 2. The mask of each shufflevector is an extract-subvector mask
///    (isExtractSubvectorMask).
/// 3. Together, the masks of the shufflevectors in a group use all of the
///    elements of the source.
/// e.g., it is 1 group (%0)
/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
/// it is 2 groups (%3 and %4)
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// it is 0 groups
/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (VL.empty())
    return 0;
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    ArrayRef<Value *> Group = VL.slice(I, GroupSize);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Group, [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // From the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}

/// \returns a shufflevector mask which is used to vectorize shufflevectors
/// e.g.,
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// the result is
/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}

/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}

/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}

/// Returns the power-of-2 number of elements in a single register (part),
/// given the total number of elements \p Size and number of registers (parts)
/// \p NumParts.
static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
  return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
}

/// Returns the correct remaining number of elements, considering total amount
/// \p Size, (power-of-2 number) of elements in a single register \p
/// PartNumElems and current register (part) \p Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
}
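
// Worked example: for Size == 13 and NumParts == 4,
// getPartNumElems(13, 4) == min(13, bit_ceil(divideCeil(13, 4))) == 4, and the
// last register holds getNumElems(13, 4, /*Part=*/3) == min(4, 13 - 12) == 1
// element.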

#if !defined(NDEBUG)
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
#endif

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  Instruction *I0 = cast<Instruction>(*It);
  if (all_of(VL, isVectorLikeInstWithConstOps))
    return true;

  BasicBlock *BB = I0->getParent();
  for (Value *V : iterator_range(It, VL.end())) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;

    if (BB != II->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal
  // integer/FP constants.
  return all_of(VL, isConstant);
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}

/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
/// For BinaryOperator, it also checks if \p InstWithUses is used in specific
/// patterns that make it effectively commutative (like equality comparisons
/// with zero).
/// In most cases, users should not call this function directly (since \p I and
/// \p InstWithUses are the same). However, when analyzing interchangeable
/// instructions, we need to use the converted opcode along with the original
/// uses.
/// \param I The instruction to check for commutativity
/// \param InstWithUses The instruction whose uses are analyzed for special
/// patterns
static bool isCommutative(Instruction *I, Instruction *InstWithUses) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !InstWithUses->hasNUsesOrMore(UsesLimit) &&
            all_of(
                InstWithUses->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpPredicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !InstWithUses->hasNUsesOrMore(UsesLimit) &&
            all_of(InstWithUses->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}

/// This is a helper function to check whether \p I is commutative.
/// This is a convenience wrapper that calls the two-parameter version of
/// isCommutative with the same instruction for both parameters. This is
/// the common case where the instruction being checked for commutativity
/// is the same as the instruction whose uses are analyzed for special
/// patterns (see the two-parameter version above for details).
/// \param I The instruction to check for commutativity
/// \returns true if the instruction is commutative, false otherwise
static bool isCommutative(Instruction *I) { return isCommutative(I, I); }
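
// Illustrative example (hypothetical IR): isCommutative treats the following
// 'sub' as effectively commutative, because its only user is an equality
// compare against zero, so swapping the operands cannot change the result:
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0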

template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}

/// \returns inserting or extracting index of InsertElement, ExtractElement or
/// InsertValue instruction, using Offset as base offset for index.
/// \returns std::nullopt if the index is not an immediate.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;

  int Index = Offset;

  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
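
// Worked example (hypothetical IR): for
//   %agg.new = insertvalue {[2 x i32], [2 x i32]} %agg, i32 %x, 1, 0
// getElementIndex flattens the indices: the outer struct index 1 is scaled by
// the inner array size 2 and the inner index 0 is added, returning 2.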

/// \returns true if all of the values in \p VL use the same opcode.
/// For comparison instructions, also checks if predicates match.
/// PoisonValues are considered matching.
/// Interchangeable instructions are not considered.
static bool allSameOpcode(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return true;
  Instruction *MainOp = cast<Instruction>(*It);
  unsigned Opcode = MainOp->getOpcode();
  bool IsCmpOp = isa<CmpInst>(MainOp);
  CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
                                        : CmpInst::BAD_ICMP_PREDICATE;
  return std::all_of(It, VL.end(), [&](Value *V) {
    if (auto *CI = dyn_cast<CmpInst>(V))
      return BasePred == CI->getPredicate();
    if (auto *I = dyn_cast<Instruction>(V))
      return I->getOpcode() == Opcode;
    return isa<PoisonValue>(V);
  });
}
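
// Illustrative examples: allSameOpcode({add, add, poison}) is true (poison
// matches anything), allSameOpcode({add, sub}) is false, and two icmp
// instructions with different predicates also yield false.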

namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg,    ///< The mask is expected to be for permutation of 1-2 vectors,
               ///< check for the mask elements for the first argument (mask
               ///< indices are in range [0:VF)).
  SecondArg,   ///< The mask is expected to be for permutation of 2 vectors,
               ///< check for the mask elements for the second argument (mask
               ///< indices are in range [VF:2*VF)).
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused since
               ///< they're already marked as used in the mask.
};
} // namespace

/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
                                   UseMask MaskArg) {
  SmallBitVector UseMask(VF, true);
  for (auto [Idx, Value] : enumerate(Mask)) {
    if (Value == PoisonMaskElem) {
      if (MaskArg == UseMask::UndefsAsMask)
        UseMask.reset(Idx);
      continue;
    }
    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
  }
  return UseMask;
}
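
// Illustrative example: with VF == 4 and Mask == {0, 5, PoisonMaskElem, 2},
// buildUseMask with FirstArg clears bits 0 and 2, with SecondArg clears
// bit 1 (element 5 - VF), and with UndefsAsMask clears only bit 2 (the
// position holding the poison element).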

/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(V);
  if (!C) {
    if (!UseMask.empty()) {
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        Base = II->getOperand(0);
        if (isa<T>(II->getOperand(1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(II);
        if (!Idx) {
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(*Idx))
          Res.reset(*Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
        Res.reset(I);
  }
  return Res;
}

/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
///                                                         i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
                     AssumptionCache *AC) {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  unsigned Size =
      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  bool HasNonUndefVec = any_of(VL, [&](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))
      return false;
    return isGuaranteedNotToBePoison(Vec, AC);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*IsPoisonOnly=*/true>(Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    if (isUndefVector(Vec).all() && HasNonUndefVec)
      continue;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector, otherwise
  // we have a permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static std::optional<unsigned> getExtractIndex(const Instruction *E) {
  unsigned Opcode = E->getOpcode();
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
}

namespace {
/// \returns true if \p Opcode is allowed as part of the main/alternate
/// instruction for SLP vectorization.
///
/// An example of an unsupported opcode is SDIV, which can potentially cause UB
/// if the "shuffled out" lane would result in division by zero.
bool isValidForAlternation(unsigned Opcode) {
  return !Instruction::isIntDivRem(Opcode);
}

/// Helper class that determines whether VL can use the same opcode.
/// Alternate instructions are supported. In addition, it supports
/// interchangeable instructions. An interchangeable instruction is an
/// instruction that can be converted to another instruction with the same
/// semantics. For example, x << 1 is equal to x * 2. x * 1 is equal to x | 0.
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  /// Keep SupportedOp sorted because it is used by binary_search.
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
  enum : MaskType {
    ShlBIT = 0b1,
    AShrBIT = 0b10,
    MulBIT = 0b100,
    AddBIT = 0b1000,
    SubBIT = 0b10000,
    AndBIT = 0b100000,
    OrBIT = 0b1000000,
    XorBIT = 0b10000000,
    MainOpBIT = 0b100000000,
    LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT)
  };
  /// Return a non-nullptr if either operand of I is a ConstantInt.
  /// The second return value represents the operand position. We check the
  /// right-hand side first (1). If the right-hand side is not a ConstantInt and
  /// the instruction is neither Sub, Shl, nor AShr, we then check the left-hand
  /// side (0).
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
    (void)SupportedOp;
    auto *BinOp = cast<BinaryOperator>(I);
    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1)))
      return {CI, 1};
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
      return {nullptr, 0};
    if (auto *CI = dyn_cast<ConstantInt>(BinOp->getOperand(0)))
      return {CI, 0};
    return {nullptr, 0};
  }
  struct InterchangeableInfo {
    const Instruction *I = nullptr;
    /// Each set bit represents an opcode that MainOp can be converted to.
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    /// We cannot create an interchangeable instruction that does not exist in
    /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
    /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
    /// 1]. SeenBefore is used to know which operations have been seen before.
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    /// Returning false allows BinOpSameOpcodeHelper to find an alternate
    /// instruction. Directly setting the mask would destroy the mask state,
    /// preventing us from determining which instruction it should convert to.
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
        return true;
      }
      return false;
    }
    bool equal(unsigned Opcode) {
      if (Opcode == I->getOpcode())
        return trySet(MainOpBIT, MainOpBIT);
      return false;
    }
    unsigned getOpcode() const {
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
      llvm_unreachable("Cannot find interchangeable instruction.");
    }
    SmallVector<Value *> getOperand(const Instruction *To) const {
      unsigned ToOpcode = To->getOpcode();
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
        return SmallVector<Value *>(I->operands());
      assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      APInt ToCIValue;
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
          ToCIValue = APInt::getOneBitSet(FromCIValueBitWidth,
                                          FromCIValue.getZExtValue());
        } else {
          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          ? APInt::getAllOnes(FromCIValueBitWidth)
                          : APInt::getZero(FromCIValueBitWidth);
        }
        break;
      case Instruction::Mul:
        assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
        if (ToOpcode == Instruction::Shl) {
          ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
        } else {
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          ToCIValue = ToOpcode == Instruction::And
                          ? APInt::getAllOnes(FromCIValueBitWidth)
                          : APInt::getZero(FromCIValueBitWidth);
        }
        break;
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
          ToCIValue = APInt::getZero(FromCIValueBitWidth);
        } else {
          assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
                 "Cannot convert the instruction.");
          ToCIValue = FromCIValue;
          ToCIValue.negate();
        }
        break;
      case Instruction::And:
        assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
        ToCIValue = ToOpcode == Instruction::Mul
                        ? APInt::getOneBitSet(FromCIValueBitWidth, 0)
                        : APInt::getZero(FromCIValueBitWidth);
        break;
      default:
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        ToCIValue = APInt::getZero(FromCIValueBitWidth);
        break;
      }
      Value *LHS = I->getOperand(1 - Pos);
      Constant *RHS =
          ConstantInt::get(I->getOperand(Pos)->getType(), ToCIValue);
      // constant + x cannot be -constant - x
      // instead, it should be x - -constant
      if (Pos == 1 ||
          (FromOpcode == Instruction::Add && ToOpcode == Instruction::Sub))
        return SmallVector<Value *>({LHS, RHS});
      return SmallVector<Value *>({RHS, LHS});
    }
  };
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(MainOp.I->getOpcode()) &&
           ::isValidForAlternation(I->getOpcode());
  }
  bool initializeAltOp(const Instruction *I) {
    if (AltOp.I)
      return true;
    if (!isValidForAlternation(I))
      return false;
    AltOp.I = I;
    return true;
  }

public:
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {
    assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
  }
  bool add(const Instruction *I) {
    assert(isa<BinaryOperator>(I) &&
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
    switch (Opcode) {
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
      break;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
      break;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
      break;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
      break;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
      break;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
      break;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
      break;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      break;
    default:
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    }
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
    if (CI) {
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      switch (Opcode) {
      case Instruction::Shl:
        if (CIValue.ult(CIValue.getBitWidth()))
          InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
        break;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
          break;
        }
        if (CIValue.isPowerOf2())
          InterchangeableMask = MulBIT | ShlBIT;
        break;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
        break;
      case Instruction::And:
        if (CIValue.isAllOnes())
          InterchangeableMask = CanBeAll;
        break;
      default:
        if (CIValue.isZero())
          InterchangeableMask = CanBeAll;
        break;
      }
    }
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  }
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
  }
  SmallVector<Value *> getOperand(const Instruction *I) const {
    return MainOp.getOperand(I);
  }
};
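
// Illustrative example: for VL = {x << 1, y * 2} both instructions report the
// interchangeable mask MulBIT | ShlBIT, BinOpSameOpcodeHelper settles on Shl
// as the common opcode, and getOperand() rewrites y * 2 with operands (y, 1)
// so that both lanes can participate in a single shl bundle.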

/// Main data required for vectorization of instructions.
class InstructionsState {
  /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
  /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
  /// (i.e., AltOp is not equal to MainOp; this can be checked using
  /// isAltShuffle).
  /// A rare exception is TrySplitNode, where the InstructionsState is derived
  /// from getMainAltOpsNoStateVL.
  /// For those InstructionsState that use alternate instructions, the resulting
  /// vectorized output ultimately comes from a shufflevector. For example,
  /// given a vector list (VL):
  /// VL[0] = add i32 a, e
  /// VL[1] = sub i32 b, f
  /// VL[2] = add i32 c, g
  /// VL[3] = sub i32 d, h
  /// The vectorized result would be:
  /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// result = shufflevector <4 x i32> intermediated_0,
  ///                        <4 x i32> intermediated_1,
  ///                        <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  /// Since shufflevector is used in the final result, when calculating the cost
  /// (getEntryCost), we must account for the usage of shufflevector in
  /// GetVectorCost.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  /// Checks if the instruction matches either the main or alternate opcode.
  /// \returns
  /// - MainOp if \param I matches MainOp's opcode directly or can be converted
  /// to it
  /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
  /// it
  /// - nullptr if \param I cannot be matched or converted to either opcode
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
      return MainOp;
    // Prefer AltOp instead of an interchangeable instruction of MainOp.
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
      return AltOp;
    if (!I->isBinaryOp())
      return nullptr;
    BinOpSameOpcodeHelper Converter(MainOp);
    if (Converter.add(I) && Converter.add(MainOp) && !Converter.hasAltOp())
      return MainOp;
    return AltOp;
  }

  /// Checks if main/alt instructions are shift operations.
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
  }

  /// Checks if main/alt instructions are bitwise logic operations.
  bool isBitwiseLogicOp() const {
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  }

  /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
    return is_contained(MulDiv, getOpcode()) &&
           is_contained(MulDiv, getAltOpcode());
  }

  /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
        Instruction::FSub};
    return is_contained(AddSub, getOpcode()) &&
           is_contained(AddSub, getAltOpcode());
  }

  /// Checks if main/alt instructions are cmp operations.
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
            getOpcode() == Instruction::FCmp) &&
           getAltOpcode() == getOpcode();
  }

  /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp)
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
};

std::pair<Instruction *, SmallVector<Value *>>
convertTo(Instruction *I, const InstructionsState &S) {
  Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
  assert(SelectedOp && "Cannot convert the instruction.");
  if (I->isBinaryOp()) {
    BinOpSameOpcodeHelper Converter(I);
    return std::make_pair(SelectedOp, Converter.getOperand(SelectedOp));
  }
  return std::make_pair(SelectedOp, SmallVector<Value *>(I->operands()));
}

} // end anonymous namespace

static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI);

/// Find an instruction with a specific opcode in VL.
/// \param VL Array of values to search through. Must contain only Instructions
/// and PoisonValues.
/// \param Opcode The instruction opcode to search for
/// \returns
/// - The first instruction found with matching opcode
/// - nullptr if no matching instruction is found
static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
                                              unsigned Opcode) {
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
    auto *Inst = cast<Instruction>(V);
    if (Inst->getOpcode() == Opcode)
      return Inst;
  }
  return nullptr;
}

/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
/// compatible instructions or constants, or just some other regular values.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
                                Value *Op1, const TargetLibraryInfo &TLI) {
  return (isConstant(BaseOp0) && isConstant(Op0)) ||
         (isConstant(BaseOp1) && isConstant(Op1)) ||
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||
         getSameOpcode({BaseOp0, Op0}, TLI) ||
         getSameOpcode({BaseOp1, Op1}, TLI);
}

/// \returns true if a compare instruction \p CI has similar "look" and
/// same predicate as \p BaseCI, "as is" or with its operands and predicate
/// swapped, false otherwise.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
                               const TargetLibraryInfo &TLI) {
  assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
         "Assessing comparisons of different types?");
  CmpInst::Predicate BasePred = BaseCI->getPredicate();
  CmpInst::Predicate Pred = CI->getPredicate();
  CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);

  Value *BaseOp0 = BaseCI->getOperand(0);
  Value *BaseOp1 = BaseCI->getOperand(1);
  Value *Op0 = CI->getOperand(0);
  Value *Op1 = CI->getOperand(1);

  return (BasePred == Pred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
         (BasePred == SwappedPred &&
          areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
}
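
// Illustrative example (hypothetical IR): isCmpSameOrSwapped treats
// 'icmp sgt i32 %a, %b' and 'icmp slt i32 %b, %a' as the same comparison,
// since the second is the first with both the predicate and the operands
// swapped.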
1315
1316/// \returns analysis of the Instructions in \p VL described in
1317/// InstructionsState, the Opcode that we suppose the whole list
1318/// could be vectorized even if its structure is diverse.
1319static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1320 const TargetLibraryInfo &TLI) {
1321 // Make sure these are all Instructions.
1322 if (!all_of(Range&: VL, P: IsaPred<Instruction, PoisonValue>))
1323 return InstructionsState::invalid();
1324
1325 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
1326 if (It == VL.end())
1327 return InstructionsState::invalid();
1328
1329 Instruction *MainOp = cast<Instruction>(Val: *It);
1330 unsigned InstCnt = std::count_if(first: It, last: VL.end(), pred: IsaPred<Instruction>);
1331 if ((VL.size() > 2 && !isa<PHINode>(Val: MainOp) && InstCnt < VL.size() / 2) ||
1332 (VL.size() == 2 && InstCnt < 2))
1333 return InstructionsState::invalid();
1334
1335 bool IsCastOp = isa<CastInst>(Val: MainOp);
1336 bool IsBinOp = isa<BinaryOperator>(Val: MainOp);
1337 bool IsCmpOp = isa<CmpInst>(Val: MainOp);
1338 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(Val: MainOp)->getPredicate()
1339 : CmpInst::BAD_ICMP_PREDICATE;
1340 Instruction *AltOp = MainOp;
1341 unsigned Opcode = MainOp->getOpcode();
1342 unsigned AltOpcode = Opcode;
1343
1344 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1345 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1346 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1347 UniquePreds.insert(X: BasePred);
1348 UniqueNonSwappedPreds.insert(X: BasePred);
1349 for (Value *V : VL) {
1350 auto *I = dyn_cast<CmpInst>(Val: V);
1351 if (!I)
1352 return false;
1353 CmpInst::Predicate CurrentPred = I->getPredicate();
1354 CmpInst::Predicate SwappedCurrentPred =
1355 CmpInst::getSwappedPredicate(pred: CurrentPred);
1356 UniqueNonSwappedPreds.insert(X: CurrentPred);
1357 if (!UniquePreds.contains(key: CurrentPred) &&
1358 !UniquePreds.contains(key: SwappedCurrentPred))
1359 UniquePreds.insert(X: CurrentPred);
1360 }
1361 // Total number of predicates > 2, but if consider swapped predicates
1362 // compatible only 2, consider swappable predicates as compatible opcodes,
1363 // not alternate.
1364 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1365 }();
1366 // Check for one alternate opcode from another BinaryOperator.
1367 // TODO - generalize to support all operators (types, calls etc.).
1368 Intrinsic::ID BaseID = 0;
1369 SmallVector<VFInfo> BaseMappings;
1370 if (auto *CallBase = dyn_cast<CallInst>(Val: MainOp)) {
1371 BaseID = getVectorIntrinsicIDForCall(CI: CallBase, TLI: &TLI);
1372 BaseMappings = VFDatabase(*CallBase).getMappings(CI: *CallBase);
1373 if (!isTriviallyVectorizable(ID: BaseID) && BaseMappings.empty())
1374 return InstructionsState::invalid();
1375 }
1376 bool AnyPoison = InstCnt != VL.size();
1377 // Check MainOp too to be sure that it matches the requirements for the
1378 // instructions.
1379 for (Value *V : iterator_range(It, VL.end())) {
1380 auto *I = dyn_cast<Instruction>(Val: V);
1381 if (!I)
1382 continue;
1383
1384 // Cannot combine poison and divisions.
1385 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1386 // intrinsics/functions only.
1387 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(Val: I)))
1388 return InstructionsState::invalid();
1389 unsigned InstOpcode = I->getOpcode();
1390 if (IsBinOp && isa<BinaryOperator>(Val: I)) {
1391 if (BinOpHelper.add(I))
1392 continue;
1393 } else if (IsCastOp && isa<CastInst>(Val: I)) {
1394 Value *Op0 = MainOp->getOperand(i: 0);
1395 Type *Ty0 = Op0->getType();
1396 Value *Op1 = I->getOperand(i: 0);
1397 Type *Ty1 = Op1->getType();
1398 if (Ty0 == Ty1) {
1399 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1400 continue;
1401 if (Opcode == AltOpcode) {
1402 assert(isValidForAlternation(Opcode) &&
1403 isValidForAlternation(InstOpcode) &&
1404 "Cast isn't safe for alternation, logic needs to be updated!");
1405 AltOpcode = InstOpcode;
1406 AltOp = I;
1407 continue;
1408 }
1409 }
1410 } else if (auto *Inst = dyn_cast<CmpInst>(Val: I); Inst && IsCmpOp) {
1411 auto *BaseInst = cast<CmpInst>(Val: MainOp);
1412 Type *Ty0 = BaseInst->getOperand(i_nocapture: 0)->getType();
1413 Type *Ty1 = Inst->getOperand(i_nocapture: 0)->getType();
1414 if (Ty0 == Ty1) {
1415 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1416 assert(InstOpcode == AltOpcode &&
1417 "Alternate instructions are only supported by BinaryOperator "
1418 "and CastInst.");
1419 // Check for compatible operands. If the corresponding operands are not
1420 // compatible, we need to perform alternate vectorization.
1421 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1422 CmpInst::Predicate SwappedCurrentPred =
1423 CmpInst::getSwappedPredicate(pred: CurrentPred);
1424
1425 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1426 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1427 continue;
1428
1429 if (isCmpSameOrSwapped(BaseCI: BaseInst, CI: Inst, TLI))
1430 continue;
1431 auto *AltInst = cast<CmpInst>(Val: AltOp);
1432 if (MainOp != AltOp) {
1433 if (isCmpSameOrSwapped(BaseCI: AltInst, CI: Inst, TLI))
1434 continue;
1435 } else if (BasePred != CurrentPred) {
1436 assert(
1437 isValidForAlternation(InstOpcode) &&
1438 "CmpInst isn't safe for alternation, logic needs to be updated!");
1439 AltOp = I;
1440 continue;
1441 }
1442 CmpInst::Predicate AltPred = AltInst->getPredicate();
1443 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1444 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1445 continue;
1446 }
1447 } else if (InstOpcode == Opcode) {
1448 assert(InstOpcode == AltOpcode &&
1449 "Alternate instructions are only supported by BinaryOperator and "
1450 "CastInst.");
1451 if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
1452 if (Gep->getNumOperands() != 2 ||
1453 Gep->getOperand(i_nocapture: 0)->getType() != MainOp->getOperand(i: 0)->getType())
1454 return InstructionsState::invalid();
1455 } else if (auto *EI = dyn_cast<ExtractElementInst>(Val: I)) {
1456 if (!isVectorLikeInstWithConstOps(V: EI))
1457 return InstructionsState::invalid();
1458 } else if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
1459 auto *BaseLI = cast<LoadInst>(Val: MainOp);
1460 if (!LI->isSimple() || !BaseLI->isSimple())
1461 return InstructionsState::invalid();
1462 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
1463 auto *CallBase = cast<CallInst>(Val: MainOp);
1464 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1465 return InstructionsState::invalid();
1466 if (Call->hasOperandBundles() &&
1467 (!CallBase->hasOperandBundles() ||
1468 !std::equal(first1: Call->op_begin() + Call->getBundleOperandsStartIndex(),
1469 last1: Call->op_begin() + Call->getBundleOperandsEndIndex(),
1470 first2: CallBase->op_begin() +
1471 CallBase->getBundleOperandsStartIndex())))
1472 return InstructionsState::invalid();
1473 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI: &TLI);
1474 if (ID != BaseID)
1475 return InstructionsState::invalid();
1476 if (!ID) {
1477 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(CI: *Call);
1478 if (Mappings.size() != BaseMappings.size() ||
1479 Mappings.front().ISA != BaseMappings.front().ISA ||
1480 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1481 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1482 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1483 Mappings.front().Shape.Parameters !=
1484 BaseMappings.front().Shape.Parameters)
1485 return InstructionsState::invalid();
1486 }
1487 }
1488 continue;
1489 }
1490 return InstructionsState::invalid();
1491 }
1492
1493 if (IsBinOp) {
1494 MainOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getMainOpcode());
1495 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1496 AltOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getAltOpcode());
1497 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1498 }
1499 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1500 "Incorrect implementation of allSameOpcode.");
1501 InstructionsState S(MainOp, AltOp);
1502 assert(all_of(VL,
1503 [&](Value *V) {
1504 return isa<PoisonValue>(V) ||
1505 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1506 }) &&
1507 "Invalid InstructionsState.");
1508 return S;
1509}
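// Illustrative example (hypothetical IR, not taken from this file): for
// VL = { add %a, %b; sub %c, %d; add %e, %f; sub %g, %h } the loop above takes
// the BinaryOperator path, BinOpHelper records the add/sub pair, and the
// returned state pairs one opcode as MainOp and the other as AltOp, i.e. an
// alternate (add/sub) node rather than a single-opcode node.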
1510
1511/// \returns true if all of the values in \p VL have the same type or false
1512/// otherwise.
1513static bool allSameType(ArrayRef<Value *> VL) {
1514 Type *Ty = VL.front()->getType();
1515 return all_of(Range: VL.drop_front(), P: [&](Value *V) { return V->getType() == Ty; });
1516}
1517
1518/// \returns True if an in-tree use also needs an extract. This refers to a
1519/// possible scalar operand of a vectorized instruction.
1520static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1521 TargetLibraryInfo *TLI,
1522 const TargetTransformInfo *TTI) {
1523 if (!UserInst)
1524 return false;
1525 unsigned Opcode = UserInst->getOpcode();
1526 switch (Opcode) {
1527 case Instruction::Load: {
1528 LoadInst *LI = cast<LoadInst>(Val: UserInst);
1529 return (LI->getPointerOperand() == Scalar);
1530 }
1531 case Instruction::Store: {
1532 StoreInst *SI = cast<StoreInst>(Val: UserInst);
1533 return (SI->getPointerOperand() == Scalar);
1534 }
1535 case Instruction::Call: {
1536 CallInst *CI = cast<CallInst>(Val: UserInst);
1537 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1538 return any_of(Range: enumerate(First: CI->args()), P: [&](auto &&Arg) {
1539 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1540 Arg.value().get() == Scalar;
1541 });
1542 }
1543 default:
1544 return false;
1545 }
1546}
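// For illustration (hypothetical user, not taken from the surrounding code): if
// Scalar is %p and UserInst is "store i32 %v, ptr %p", the vectorized store
// still needs %p as a scalar pointer operand, so this returns true; if Scalar
// were only the stored value %v, it would return false.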
1547
1548/// \returns the AA location that is being accessed by the instruction.
1549static MemoryLocation getLocation(Instruction *I) {
1550 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
1551 return MemoryLocation::get(SI);
1552 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
1553 return MemoryLocation::get(LI);
1554 return MemoryLocation();
1555}
1556
1557/// \returns True if the instruction is not a volatile or atomic load/store.
1558static bool isSimple(Instruction *I) {
1559 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
1560 return LI->isSimple();
1561 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
1562 return SI->isSimple();
1563 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: I))
1564 return !MI->isVolatile();
1565 return true;
1566}
1567
1568/// Shuffles \p Mask in accordance with the given \p SubMask.
1569/// \param ExtendingManyInputs If true, supports reshuffling of the mask with
1570/// not only one but two input vectors.
1571static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1572 bool ExtendingManyInputs = false) {
1573 if (SubMask.empty())
1574 return;
1575 assert(
1576 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1577 // Check if input scalars were extended to match the size of other node.
1578 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1579 "SubMask with many inputs support must be larger than the mask.");
1580 if (Mask.empty()) {
1581 Mask.append(in_start: SubMask.begin(), in_end: SubMask.end());
1582 return;
1583 }
1584 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1585 int TermValue = std::min(a: Mask.size(), b: SubMask.size());
1586 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1587 if (SubMask[I] == PoisonMaskElem ||
1588 (!ExtendingManyInputs &&
1589 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1590 continue;
1591 NewMask[I] = Mask[SubMask[I]];
1592 }
1593 Mask.swap(RHS&: NewMask);
1594}
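// Worked example with hypothetical masks: for Mask = {1,0,3,2} and
// SubMask = {2,3,0,1} (ExtendingManyInputs == false), each SubMask element
// indexes into the existing Mask, so NewMask[I] = Mask[SubMask[I]] and Mask
// becomes {3,2,1,0}; PoisonMaskElem entries stay PoisonMaskElem.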
1595
1596/// Order may have elements assigned the special value (size), which is out of
1597/// bounds. Such indices appear only at positions which correspond to undef
1598/// values (see canReuseExtract for details) and are used to prevent undef
1599/// values from affecting the ordering of the operands.
1600/// The first loop below simply finds all unused indices and then the next loop
1601/// nest assigns these indices to the positions of the undef values.
1602/// As an example below Order has two undef positions and they have assigned
1603/// values 3 and 7 respectively:
1604/// before: 6 9 5 4 9 2 1 0
1605/// after: 6 3 5 4 7 2 1 0
1606static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1607 const size_t Sz = Order.size();
1608 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1609 SmallBitVector MaskedIndices(Sz);
1610 for (unsigned I = 0; I < Sz; ++I) {
1611 if (Order[I] < Sz)
1612 UnusedIndices.reset(Idx: Order[I]);
1613 else
1614 MaskedIndices.set(I);
1615 }
1616 if (MaskedIndices.none())
1617 return;
1618 assert(UnusedIndices.count() == MaskedIndices.count() &&
1619 "Non-synced masked/available indices.");
1620 int Idx = UnusedIndices.find_first();
1621 int MIdx = MaskedIndices.find_first();
1622 while (MIdx >= 0) {
1623 assert(Idx >= 0 && "Indices must be synced.");
1624 Order[MIdx] = Idx;
1625 Idx = UnusedIndices.find_next(Prev: Idx);
1626 MIdx = MaskedIndices.find_next(Prev: MIdx);
1627 }
1628}
1629
1630/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1631/// Opcode1.
1632static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1633 unsigned Opcode0, unsigned Opcode1) {
1634 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
1635 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1636 for (unsigned Lane : seq<unsigned>(Size: VL.size())) {
1637 if (isa<PoisonValue>(Val: VL[Lane]))
1638 continue;
1639 if (cast<Instruction>(Val: VL[Lane])->getOpcode() == Opcode1)
1640 OpcodeMask.set(I: Lane * ScalarTyNumElements,
1641 E: Lane * ScalarTyNumElements + ScalarTyNumElements);
1642 }
1643 return OpcodeMask;
1644}
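// For illustration, assuming VL = { add, sub, add, sub }, ScalarTy == i32 (so
// ScalarTyNumElements == 1), Opcode0 == Add and Opcode1 == Sub: lanes 1 and 3
// carry Opcode1, so bits 1 and 3 of the returned mask are set; poison lanes
// are simply skipped and left unset.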
1645
1646/// Replicates the given \p Val \p VF times.
1647static SmallVector<Constant *> replicateMask(ArrayRef<Constant *> Val,
1648 unsigned VF) {
1649 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1650 "Expected scalar constants.");
1651 SmallVector<Constant *> NewVal(Val.size() * VF);
1652 for (auto [I, V] : enumerate(First&: Val))
1653 std::fill_n(first: NewVal.begin() + I * VF, n: VF, value: V);
1654 return NewVal;
1655}
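// For illustration with hypothetical constants: replicateMask({C0, C1}, /*VF=*/3)
// returns {C0, C0, C0, C1, C1, C1}, i.e. each scalar constant is repeated VF
// times in place.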
1656
1657namespace llvm {
1658
1659static void inversePermutation(ArrayRef<unsigned> Indices,
1660 SmallVectorImpl<int> &Mask) {
1661 Mask.clear();
1662 const unsigned E = Indices.size();
1663 Mask.resize(N: E, NV: PoisonMaskElem);
1664 for (unsigned I = 0; I < E; ++I)
1665 Mask[Indices[I]] = I;
1666}
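// For illustration: since Mask[Indices[I]] = I, inversePermutation({2, 0, 1},
// Mask) produces Mask = {1, 2, 0}, the inverse of the permutation described by
// Indices.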
1667
1668/// Reorders the list of scalars in accordance with the given \p Mask.
1669static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1670 ArrayRef<int> Mask) {
1671 assert(!Mask.empty() && "Expected non-empty mask.");
1672 SmallVector<Value *> Prev(Scalars.size(),
1673 PoisonValue::get(T: Scalars.front()->getType()));
1674 Prev.swap(RHS&: Scalars);
1675 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1676 if (Mask[I] != PoisonMaskElem)
1677 Scalars[Mask[I]] = Prev[I];
1678}
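// For illustration with hypothetical scalars: reorderScalars on {a, b, c} with
// Mask = {2, 0, 1} places Prev[I] at position Mask[I], yielding {b, c, a};
// lanes whose mask element is PoisonMaskElem are left as poison of the scalar
// type.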
1679
1680/// Checks if the provided value does not require scheduling. It does not
1681/// require scheduling if this is not an instruction or it is an instruction
1682/// that does not read/write memory and all of its operands are either not
1683/// instructions, or are phi nodes or instructions from different blocks.
1684static bool areAllOperandsNonInsts(Value *V) {
1685 auto *I = dyn_cast<Instruction>(Val: V);
1686 if (!I)
1687 return true;
1688 return !mayHaveNonDefUseDependency(I: *I) &&
1689 all_of(Range: I->operands(), P: [I](Value *V) {
1690 auto *IO = dyn_cast<Instruction>(Val: V);
1691 if (!IO)
1692 return true;
1693 return isa<PHINode>(Val: IO) || IO->getParent() != I->getParent();
1694 });
1695}
1696
1697/// Checks if the provided value does not require scheduling. It does not
1698/// require scheduling if this is not an instruction or it is an instruction
1699/// that does not read/write memory and all of its users are phi nodes or
1700/// instructions from different blocks.
1701static bool isUsedOutsideBlock(Value *V) {
1702 auto *I = dyn_cast<Instruction>(Val: V);
1703 if (!I)
1704 return true;
1705 // Limits the number of uses to save compile time.
1706 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(N: UsesLimit) &&
1707 all_of(Range: I->users(), P: [I](User *U) {
1708 auto *IU = dyn_cast<Instruction>(Val: U);
1709 if (!IU)
1710 return true;
1711 return IU->getParent() != I->getParent() || isa<PHINode>(Val: IU);
1712 });
1713}
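// For illustration (hypothetical IR): an add whose only user is a phi in a
// successor block, and which neither reads nor writes memory, satisfies this
// check; a value feeding a non-phi instruction in its own block does not.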
1714
1715/// Checks if the specified value does not require scheduling. It does not
1716/// require scheduling if all operands and all users do not need to be scheduled
1717/// in the current basic block.
1718static bool doesNotNeedToBeScheduled(Value *V) {
1719 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1720}
1721
1722/// Checks if the specified array of instructions does not require scheduling.
1723/// That is the case if either all instructions have operands that do not
1724/// require scheduling, or all their users do not require scheduling because
1725/// they are phis or live in other basic blocks.
1726static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1727 return !VL.empty() &&
1728 (all_of(Range&: VL, P: isUsedOutsideBlock) || all_of(Range&: VL, P: areAllOperandsNonInsts));
1729}
1730
1731/// Returns true if the widened type of \p Ty elements with size \p Sz
1732/// represents a full vector type, i.e. adding an extra element results in
1733/// extra parts upon type legalization.
1734static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1735 unsigned Sz) {
1736 if (Sz <= 1)
1737 return false;
1738 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Val: Ty))
1739 return false;
1740 if (has_single_bit(Value: Sz))
1741 return true;
1742 const unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
1743 return NumParts > 0 && NumParts < Sz && has_single_bit(Value: Sz / NumParts) &&
1744 Sz % NumParts == 0;
1745}
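// For illustration (target dependent, so only a sketch): any power-of-2 Sz > 1
// with a valid element type returns true immediately. For a non-power-of-2
// size such as Sz == 12 with i32 elements, the answer depends on
// TTI.getNumberOfParts(<12 x i32>): if it were to report 3 parts, then
// 12 / 3 == 4 is a power of 2 and 12 % 3 == 0, so the size is treated as a set
// of full registers and the function returns true.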
1746
1747/// Returns the number of parts the type \p VecTy will be split into at the
1748/// codegen phase. If the type is going to be scalarized or does not use whole
1749/// registers, returns 1.
1750static unsigned
1751getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1752 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1753 unsigned NumParts = TTI.getNumberOfParts(Tp: VecTy);
1754 if (NumParts == 0 || NumParts >= Limit)
1755 return 1;
1756 unsigned Sz = getNumElements(Ty: VecTy);
1757 if (NumParts >= Sz || Sz % NumParts != 0 ||
1758 !hasFullVectorsOrPowerOf2(TTI, Ty: VecTy->getElementType(), Sz: Sz / NumParts))
1759 return 1;
1760 return NumParts;
1761}
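// For illustration (again target dependent): if TTI were to report 2 parts for
// a hypothetical <8 x i64>, each part holds 8 / 2 == 4 elements, which forms a
// full vector, so the function returns 2; if NumParts were 0, >= Limit, or the
// per-part element count did not form full vectors, it would return 1.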
1762
1763namespace slpvectorizer {
1764
1765/// Bottom Up SLP Vectorizer.
1766class BoUpSLP {
1767 class TreeEntry;
1768 class ScheduleEntity;
1769 class ScheduleData;
1770 class ScheduleBundle;
1771 class ShuffleCostEstimator;
1772 class ShuffleInstructionBuilder;
1773
1774public:
1775 /// Tracks the state in which the loads in the given sequence can be represented.
1776 enum class LoadsState {
1777 Gather,
1778 Vectorize,
1779 ScatterVectorize,
1780 StridedVectorize,
1781 CompressVectorize
1782 };
1783
1784 using ValueList = SmallVector<Value *, 8>;
1785 using InstrList = SmallVector<Instruction *, 16>;
1786 using ValueSet = SmallPtrSet<Value *, 16>;
1787 using StoreList = SmallVector<StoreInst *, 8>;
1788 using ExtraValueToDebugLocsMap = SmallDenseSet<Value *, 4>;
1789 using OrdersType = SmallVector<unsigned, 4>;
1790
1791 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1792 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1793 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1794 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1795 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1796 AC(AC), DB(DB), DL(DL), ORE(ORE),
1797 Builder(Se->getContext(), TargetFolder(*DL)) {
1798 CodeMetrics::collectEphemeralValues(L: F, AC, EphValues);
1799 // Use the vector register size specified by the target unless overridden
1800 // by a command-line option.
1801 // TODO: It would be better to limit the vectorization factor based on
1802 // data type rather than just register size. For example, x86 AVX has
1803 // 256-bit registers, but it does not support integer operations
1804 // at that width (that requires AVX2).
1805 if (MaxVectorRegSizeOption.getNumOccurrences())
1806 MaxVecRegSize = MaxVectorRegSizeOption;
1807 else
1808 MaxVecRegSize =
1809 TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
1810 .getFixedValue();
1811
1812 if (MinVectorRegSizeOption.getNumOccurrences())
1813 MinVecRegSize = MinVectorRegSizeOption;
1814 else
1815 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1816 }
1817
1818 /// Vectorize the tree that starts with the elements in \p VL.
1819 /// Returns the vectorized root.
1820 Value *vectorizeTree();
1821
1822 /// Vectorize the tree but with the list of externally used values \p
1823 /// ExternallyUsedValues. Values in this set can be replaced by the
1824 /// generated extract instructions.
1825 Value *vectorizeTree(
1826 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1827 Instruction *ReductionRoot = nullptr,
1828 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
1829
1830 /// \returns the cost incurred by unwanted spills and fills, caused by
1831 /// holding live values over call sites.
1832 InstructionCost getSpillCost();
1833
1834 /// \returns the vectorization cost of the subtree that starts at \p VL.
1835 /// A negative number means that this is profitable.
1836 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {},
1837 InstructionCost ReductionCost = TTI::TCC_Free);
1838
1839 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1840 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1841 void buildTree(ArrayRef<Value *> Roots,
1842 const SmallDenseSet<Value *> &UserIgnoreLst);
1843
1844 /// Construct a vectorizable tree that starts at \p Roots.
1845 void buildTree(ArrayRef<Value *> Roots);
1846
1847 /// Return the scalars of the root node.
1848 ArrayRef<Value *> getRootNodeScalars() const {
1849 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1850 return VectorizableTree.front()->Scalars;
1851 }
1852
1853 /// Returns the type/is-signed info for the root node in the graph without
1854 /// casting.
1855 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
1856 const TreeEntry &Root = *VectorizableTree.front();
1857 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1858 !Root.Scalars.front()->getType()->isIntegerTy())
1859 return std::nullopt;
1860 auto It = MinBWs.find(Val: &Root);
1861 if (It != MinBWs.end())
1862 return std::make_pair(x: IntegerType::get(C&: Root.Scalars.front()->getContext(),
1863 NumBits: It->second.first),
1864 y: It->second.second);
1865 if (Root.getOpcode() == Instruction::ZExt ||
1866 Root.getOpcode() == Instruction::SExt)
1867 return std::make_pair(x: cast<CastInst>(Val: Root.getMainOp())->getSrcTy(),
1868 y: Root.getOpcode() == Instruction::SExt);
1869 return std::nullopt;
1870 }
1871
1872 /// Checks if the root graph node can be emitted with narrower bitwidth at
1873 /// codegen and returns its signedness, if so.
1874 bool isSignedMinBitwidthRootNode() const {
1875 return MinBWs.at(Val: VectorizableTree.front().get()).second;
1876 }
1877
1878 /// Returns the reduction type after minbitwidth analysis.
1879 FixedVectorType *getReductionType() const {
1880 if (ReductionBitWidth == 0 ||
1881 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
1882 ReductionBitWidth >=
1883 DL->getTypeSizeInBits(
1884 Ty: VectorizableTree.front()->Scalars.front()->getType()))
1885 return getWidenedType(
1886 ScalarTy: VectorizableTree.front()->Scalars.front()->getType(),
1887 VF: VectorizableTree.front()->getVectorFactor());
1888 return getWidenedType(
1889 ScalarTy: IntegerType::get(
1890 C&: VectorizableTree.front()->Scalars.front()->getContext(),
1891 NumBits: ReductionBitWidth),
1892 VF: VectorizableTree.front()->getVectorFactor());
1893 }
1894
1895 /// Builds external uses of the vectorized scalars, i.e. the list of
1896 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1897 /// ExternallyUsedValues contains an additional list of external uses to handle
1898 /// vectorization of reductions.
1899 void
1900 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1901
1902 /// Transforms graph nodes to target specific representations, if profitable.
1903 void transformNodes();
1904
1905 /// Clear the internal data structures that are created by 'buildTree'.
1906 void deleteTree() {
1907 VectorizableTree.clear();
1908 ScalarToTreeEntries.clear();
1909 OperandsToTreeEntry.clear();
1910 ScalarsInSplitNodes.clear();
1911 MustGather.clear();
1912 NonScheduledFirst.clear();
1913 EntryToLastInstruction.clear();
1914 LoadEntriesToVectorize.clear();
1915 IsGraphTransformMode = false;
1916 GatheredLoadsEntriesFirst.reset();
1917 CompressEntryToData.clear();
1918 ExternalUses.clear();
1919 ExternalUsesAsOriginalScalar.clear();
1920 for (auto &Iter : BlocksSchedules) {
1921 BlockScheduling *BS = Iter.second.get();
1922 BS->clear();
1923 }
1924 MinBWs.clear();
1925 ReductionBitWidth = 0;
1926 BaseGraphSize = 1;
1927 CastMaxMinBWSizes.reset();
1928 ExtraBitWidthNodes.clear();
1929 InstrElementSize.clear();
1930 UserIgnoreList = nullptr;
1931 PostponedGathers.clear();
1932 ValueToGatherNodes.clear();
1933 }
1934
1935 unsigned getTreeSize() const { return VectorizableTree.size(); }
1936
1937 /// Returns the base graph size, before any transformations.
1938 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1939
1940 /// Perform LICM and CSE on the newly generated gather sequences.
1941 void optimizeGatherSequence();
1942
1943 /// Does this non-empty order represent an identity order? Identity
1944 /// should be represented as an empty order, so this is used to
1945 /// decide if we can canonicalize a computed order. Undef elements
1946 /// (represented as size) are ignored.
1947 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
1948 assert(!Order.empty() && "expected non-empty order");
1949 const unsigned Sz = Order.size();
1950 return all_of(Range: enumerate(First&: Order), P: [&](const auto &P) {
1951 return P.value() == P.index() || P.value() == Sz;
1952 });
1953 }
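// For illustration: with Sz == 4, Order == {0, 1, 4, 3} counts as an identity
// order because the out-of-bounds value 4 (== Sz) marks an ignored undef
// position, whereas {1, 0, 2, 3} does not.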
1954
1955 /// Checks if the specified gather tree entry \p TE can be represented as a
1956 /// shuffled vector entry + (possibly) permutation with other gathers. It
1957 /// implements the checks only for possibly ordered scalars (Loads,
1958 /// ExtractElement, ExtractValue), which can be part of the graph.
1959 /// \param TopToBottom If true, used for the whole-tree rotation; if false,
1960 /// for sub-tree rotations.
1961 /// \param IgnoreReorder If true, the order of the root node might be ignored.
1962 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
1963 bool TopToBottom,
1964 bool IgnoreReorder);
1965
1966 /// Sort loads into increasing pointer offsets to allow greater clustering.
1967 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1968
1969 /// Gets reordering data for the given tree entry. If the entry is vectorized
1970 /// - just return ReorderIndices, otherwise check if the scalars can be
1971 /// reordered and return the most optimal order.
1972 /// \return std::nullopt if ordering is not important, empty order, if
1973 /// identity order is important, or the actual order.
1974 /// \param TopToBottom If true, include the order of vectorized stores and
1975 /// insertelement nodes, otherwise skip them.
1976 /// \param IgnoreReorder If true, the root node order can be ignored.
1977 std::optional<OrdersType>
1978 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
1979
1980 /// Checks if it is profitable to reorder the current tree.
1981 /// If the tree does not contain many profitably reorderable nodes, it is
1982 /// better to skip reordering to save compile time.
1983 bool isProfitableToReorder() const;
1984
1985 /// Reorders the current graph to the most profitable order starting from the
1986 /// root node to the leaf nodes. The best order is chosen only from the nodes
1987 /// of the same size (vectorization factor). Smaller nodes are considered
1988 /// parts of a subgraph with a smaller VF and are reordered independently. We
1989 /// can do this because we still need to extend smaller nodes to the wider VF
1990 /// and we can merge the reordering shuffles with the widening shuffles.
1991 void reorderTopToBottom();
1992
1993 /// Reorders the current graph to the most profitable order starting from
1994 /// the leaves to the root. It allows rotating small subgraphs and reducing
1995 /// the number of reshuffles if the leaf nodes use the same order. In this
1996 /// case we can merge the orders and just shuffle the user node instead of
1997 /// shuffling its operands. Moreover, even if the leaf nodes have different
1998 /// orders, it allows sinking reordering in the graph closer to the root node
1999 /// and merging it later during analysis.
2000 void reorderBottomToTop(bool IgnoreReorder = false);
2001
2002 /// \return The vector element size in bits to use when vectorizing the
2003 /// expression tree ending at \p V. If V is a store, the size is the width of
2004 /// the stored value. Otherwise, the size is the width of the largest loaded
2005 /// value reaching V. This method is used by the vectorizer to calculate
2006 /// vectorization factors.
2007 unsigned getVectorElementSize(Value *V);
2008
2009 /// Compute the minimum type sizes required to represent the entries in a
2010 /// vectorizable tree.
2011 void computeMinimumValueSizes();
2012
2013 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2014 unsigned getMaxVecRegSize() const {
2015 return MaxVecRegSize;
2016 }
2017
2018 // \returns minimum vector register size as set by cl::opt.
2019 unsigned getMinVecRegSize() const {
2020 return MinVecRegSize;
2021 }
2022
2023 unsigned getMinVF(unsigned Sz) const {
2024 return std::max(a: 2U, b: getMinVecRegSize() / Sz);
2025 }
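// For illustration: with MinVecRegSize == 128 and 32-bit elements,
// getMinVF(32) returns std::max(2U, 128 / 32) == 4; for 128-bit elements the
// ratio is 1, so the result is clamped to the minimum of 2 lanes.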
2026
2027 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2028 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2029 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2030 return MaxVF ? MaxVF : UINT_MAX;
2031 }
2032
2033 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2034 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2035 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2036 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2037 ///
2038 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2039 unsigned canMapToVector(Type *T) const;
2040
2041 /// \returns True if the VectorizableTree is both tiny and not fully
2042 /// vectorizable. We do not vectorize such trees.
2043 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2044
2045 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2046 /// It may happen, if all gather nodes are loads and they cannot be
2047 /// "clusterized". In this case even subgraphs cannot be vectorized more
2048 /// effectively than the base graph.
2049 bool isTreeNotExtendable() const;
2050
2051 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2052 /// can be load combined in the backend. Load combining may not be allowed in
2053 /// the IR optimizer, so we do not want to alter the pattern. For example,
2054 /// partially transforming a scalar bswap() pattern into vector code is
2055 /// effectively impossible for the backend to undo.
2056 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2057 /// may not be necessary.
2058 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2059
2060 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2061 /// can be load combined in the backend. Load combining may not be allowed in
2062 /// the IR optimizer, so we do not want to alter the pattern. For example,
2063 /// partially transforming a scalar bswap() pattern into vector code is
2064 /// effectively impossible for the backend to undo.
2065 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2066 /// may not be necessary.
2067 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2068
2069 /// Checks if the given array of loads can be represented as a vectorized,
2070 /// scatter or just simple gather.
2071 /// \param VL list of loads.
2072 /// \param VL0 main load value.
2073 /// \param Order returned order of load instructions.
2074 /// \param PointerOps returned list of pointer operands.
2075 /// \param BestVF return best vector factor, if recursive check found better
2076 /// vectorization sequences rather than masked gather.
2077 /// \param TryRecursiveCheck used to check if a long masked gather can be
2078 /// represented as a series of loads/insert subvector, if profitable.
2079 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2080 SmallVectorImpl<unsigned> &Order,
2081 SmallVectorImpl<Value *> &PointerOps,
2082 unsigned *BestVF = nullptr,
2083 bool TryRecursiveCheck = true) const;
2084
2085 /// Registers a non-vectorizable sequence of loads.
2086 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2087 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2088 }
2089
2090 /// Checks if the given sequence of loads is known to be non-vectorizable.
2091 template <typename T>
2092 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
2093 return ListOfKnonwnNonVectorizableLoads.contains(V: hash_value(VL));
2094 }
2095
2096 OptimizationRemarkEmitter *getORE() { return ORE; }
2097
2098 /// This structure holds any data we need about the edges being traversed
2099 /// during buildTreeRec(). We keep track of:
2100 /// (i) the user TreeEntry index, and
2101 /// (ii) the index of the edge.
2102 struct EdgeInfo {
2103 EdgeInfo() = default;
2104 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2105 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2106 /// The user TreeEntry.
2107 TreeEntry *UserTE = nullptr;
2108 /// The operand index of the use.
2109 unsigned EdgeIdx = UINT_MAX;
2110#ifndef NDEBUG
2111 friend inline raw_ostream &operator<<(raw_ostream &OS,
2112 const BoUpSLP::EdgeInfo &EI) {
2113 EI.dump(OS);
2114 return OS;
2115 }
2116 /// Debug print.
2117 void dump(raw_ostream &OS) const {
2118 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2119 << " EdgeIdx:" << EdgeIdx << "}";
2120 }
2121 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2122#endif
2123 bool operator == (const EdgeInfo &Other) const {
2124 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2125 }
2126
2127 operator bool() const { return UserTE != nullptr; }
2128 };
2129
2130 /// A helper class used for scoring candidates for two consecutive lanes.
2131 class LookAheadHeuristics {
2132 const TargetLibraryInfo &TLI;
2133 const DataLayout &DL;
2134 ScalarEvolution &SE;
2135 const BoUpSLP &R;
2136 int NumLanes; // Total number of lanes (aka vectorization factor).
2137 int MaxLevel; // The maximum recursion depth for accumulating score.
2138
2139 public:
2140 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2141 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2142 int MaxLevel)
2143 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2144 MaxLevel(MaxLevel) {}
2145
2146 // The hard-coded scores listed here are not very important, though they
2147 // should be higher for better matches to improve the resulting cost. When
2148 // computing the scores of matching one sub-tree with another, we are
2149 // basically counting the number of values that are matching. So even if all
2150 // scores are set to 1, we would still get a decent matching result.
2151 // However, sometimes we have to break ties. For example we may have to
2152 // choose between matching loads vs matching opcodes. This is what these
2153 // scores are helping us with: they provide the order of preference. Also,
2154 // this is important if the scalar is externally used or used in another
2155 // tree entry node in the different lane.
2156
2157 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2158 static const int ScoreConsecutiveLoads = 4;
2159 /// The same load multiple times. This should have a better score than
2160 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2161 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
2162 /// a vector load and 1.0 for a broadcast.
2163 static const int ScoreSplatLoads = 3;
2164 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2165 static const int ScoreReversedLoads = 3;
2166 /// A load candidate for masked gather.
2167 static const int ScoreMaskedGatherCandidate = 1;
2168 /// ExtractElementInst from same vector and consecutive indexes.
2169 static const int ScoreConsecutiveExtracts = 4;
2170 /// ExtractElementInst from same vector and reversed indices.
2171 static const int ScoreReversedExtracts = 3;
2172 /// Constants.
2173 static const int ScoreConstants = 2;
2174 /// Instructions with the same opcode.
2175 static const int ScoreSameOpcode = 2;
2176 /// Instructions with alt opcodes (e.g, add + sub).
2177 static const int ScoreAltOpcodes = 1;
2178 /// Identical instructions (a.k.a. splat or broadcast).
2179 static const int ScoreSplat = 1;
2180 /// Matching with an undef is preferable to failing.
2181 static const int ScoreUndef = 1;
2182 /// Score for failing to find a decent match.
2183 static const int ScoreFail = 0;
2184 /// Score if all users are vectorized.
2185 static const int ScoreAllUserVectorized = 1;
2186
2187 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2188 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2189 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2190 /// MainAltOps.
2191 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2192 ArrayRef<Value *> MainAltOps) const {
2193 if (!isValidElementType(Ty: V1->getType()) ||
2194 !isValidElementType(Ty: V2->getType()))
2195 return LookAheadHeuristics::ScoreFail;
2196
2197 if (V1 == V2) {
2198 if (isa<LoadInst>(Val: V1)) {
2199 // Returns true if the users of V1 and V2 won't need to be extracted.
2200 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2201 // Bail out if we have too many uses to save compilation time.
2202 if (V1->hasNUsesOrMore(N: UsesLimit) || V2->hasNUsesOrMore(N: UsesLimit))
2203 return false;
2204
2205 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2206 return llvm::all_of(Range: V->users(), P: [U1, U2, this](Value *U) {
2207 return U == U1 || U == U2 || R.isVectorized(V: U);
2208 });
2209 };
2210 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2211 };
2212 // A broadcast of a load can be cheaper on some targets.
2213 if (R.TTI->isLegalBroadcastLoad(ElementTy: V1->getType(),
2214 NumElements: ElementCount::getFixed(MinVal: NumLanes)) &&
2215 ((int)V1->getNumUses() == NumLanes ||
2216 AllUsersAreInternal(V1, V2)))
2217 return LookAheadHeuristics::ScoreSplatLoads;
2218 }
2219 return LookAheadHeuristics::ScoreSplat;
2220 }
2221
2222 auto CheckSameEntryOrFail = [&]() {
2223 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V: V1); !TEs1.empty()) {
2224 SmallPtrSet<TreeEntry *, 4> Set(llvm::from_range, TEs1);
2225 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V: V2);
2226 !TEs2.empty() &&
2227 any_of(Range&: TEs2, P: [&](TreeEntry *E) { return Set.contains(Ptr: E); }))
2228 return LookAheadHeuristics::ScoreSplatLoads;
2229 }
2230 return LookAheadHeuristics::ScoreFail;
2231 };
2232
2233 auto *LI1 = dyn_cast<LoadInst>(Val: V1);
2234 auto *LI2 = dyn_cast<LoadInst>(Val: V2);
2235 if (LI1 && LI2) {
2236 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2237 !LI2->isSimple())
2238 return CheckSameEntryOrFail();
2239
2240 std::optional<int64_t> Dist = getPointersDiff(
2241 ElemTyA: LI1->getType(), PtrA: LI1->getPointerOperand(), ElemTyB: LI2->getType(),
2242 PtrB: LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2243 if (!Dist || *Dist == 0) {
2244 if (getUnderlyingObject(V: LI1->getPointerOperand()) ==
2245 getUnderlyingObject(V: LI2->getPointerOperand()) &&
2246 R.TTI->isLegalMaskedGather(
2247 DataType: getWidenedType(ScalarTy: LI1->getType(), VF: NumLanes), Alignment: LI1->getAlign()))
2248 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2249 return CheckSameEntryOrFail();
2250 }
2251 // The distance is too large - still may be profitable to use masked
2252 // loads/gathers.
2253 if (std::abs(i: *Dist) > NumLanes / 2)
2254 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2255 // This still will detect consecutive loads, but we might have "holes"
2256 // in some cases. It is ok for non-power-2 vectorization and may produce
2257 // better results. It should not affect current vectorization.
2258 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2259 : LookAheadHeuristics::ScoreReversedLoads;
2260 }
2261
2262 auto *C1 = dyn_cast<Constant>(Val: V1);
2263 auto *C2 = dyn_cast<Constant>(Val: V2);
2264 if (C1 && C2)
2265 return LookAheadHeuristics::ScoreConstants;
2266
2267 // Extracts from consecutive indexes of the same vector score better, as
2268 // the extracts could be optimized away.
2269 Value *EV1;
2270 ConstantInt *Ex1Idx;
2271 if (match(V: V1, P: m_ExtractElt(Val: m_Value(V&: EV1), Idx: m_ConstantInt(CI&: Ex1Idx)))) {
2272 // Undefs are always profitable for extractelements.
2273 // Compiler can easily combine poison and extractelement <non-poison> or
2274 // undef and extractelement <poison>. But combining undef +
2275 // extractelement <non-poison-but-may-produce-poison> requires some
2276 // extra operations.
2277 if (isa<UndefValue>(Val: V2))
2278 return (isa<PoisonValue>(Val: V2) || isUndefVector(V: EV1).all())
2279 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2280 : LookAheadHeuristics::ScoreSameOpcode;
2281 Value *EV2 = nullptr;
2282 ConstantInt *Ex2Idx = nullptr;
2283 if (match(V: V2,
2284 P: m_ExtractElt(Val: m_Value(V&: EV2), Idx: m_CombineOr(L: m_ConstantInt(CI&: Ex2Idx),
2285 R: m_Undef())))) {
2286 // Undefs are always profitable for extractelements.
2287 if (!Ex2Idx)
2288 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2289 if (isUndefVector(V: EV2).all() && EV2->getType() == EV1->getType())
2290 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2291 if (EV2 == EV1) {
2292 int Idx1 = Ex1Idx->getZExtValue();
2293 int Idx2 = Ex2Idx->getZExtValue();
2294 int Dist = Idx2 - Idx1;
2295 // The distance is too large - still may be profitable to use
2296 // shuffles.
2297 if (std::abs(x: Dist) == 0)
2298 return LookAheadHeuristics::ScoreSplat;
2299 if (std::abs(x: Dist) > NumLanes / 2)
2300 return LookAheadHeuristics::ScoreSameOpcode;
2301 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2302 : LookAheadHeuristics::ScoreReversedExtracts;
2303 }
2304 return LookAheadHeuristics::ScoreAltOpcodes;
2305 }
2306 return CheckSameEntryOrFail();
2307 }
2308
2309 auto *I1 = dyn_cast<Instruction>(Val: V1);
2310 auto *I2 = dyn_cast<Instruction>(Val: V2);
2311 if (I1 && I2) {
2312 if (I1->getParent() != I2->getParent())
2313 return CheckSameEntryOrFail();
2314 SmallVector<Value *, 4> Ops(MainAltOps);
2315 Ops.push_back(Elt: I1);
2316 Ops.push_back(Elt: I2);
2317 InstructionsState S = getSameOpcode(VL: Ops, TLI);
2318 // Note: Only consider instructions with <= 2 operands to avoid
2319 // complexity explosion.
2320 if (S &&
2321 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2322 !S.isAltShuffle()) &&
2323 all_of(Range&: Ops, P: [&S](Value *V) {
2324 return isa<PoisonValue>(Val: V) ||
2325 cast<Instruction>(Val: V)->getNumOperands() ==
2326 S.getMainOp()->getNumOperands();
2327 }))
2328 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2329 : LookAheadHeuristics::ScoreSameOpcode;
2330 }
2331
2332 if (I1 && isa<PoisonValue>(Val: V2))
2333 return LookAheadHeuristics::ScoreSameOpcode;
2334
2335 if (isa<UndefValue>(Val: V2))
2336 return LookAheadHeuristics::ScoreUndef;
2337
2338 return CheckSameEntryOrFail();
2339 }
2340
2341 /// Go through the operands of \p LHS and \p RHS recursively until
2342 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2343 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2344 /// of \p U1 and \p U2), except at the beginning of the recursion where
2345 /// these are set to nullptr.
2346 ///
2347 /// For example:
2348 /// \verbatim
2349 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2350 /// \ / \ / \ / \ /
2351 /// + + + +
2352 /// G1 G2 G3 G4
2353 /// \endverbatim
2354 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2355 /// each level recursively, accumulating the score. It starts from matching
2356 /// the additions at level 0, then moves on to the loads (level 1). The
2357 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2358 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2359 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2360 /// Please note that the order of the operands does not matter, as we
2361 /// evaluate the score of all profitable combinations of operands. In
2362 /// other words the score of G1 and G4 is the same as G1 and G2. This
2363 /// heuristic is based on ideas described in:
2364 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2365 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2366 /// Luís F. W. Góes
2367 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2368 Instruction *U2, int CurrLevel,
2369 ArrayRef<Value *> MainAltOps) const {
2370
2371 // Get the shallow score of V1 and V2.
2372 int ShallowScoreAtThisLevel =
2373 getShallowScore(V1: LHS, V2: RHS, U1, U2, MainAltOps);
2374
2375 // If reached MaxLevel,
2376 // or if V1 and V2 are not instructions,
2377 // or if they are SPLAT,
2378 // or if they are not consecutive,
2379 // or if profitable to vectorize loads or extractelements, early return
2380 // the current cost.
2381 auto *I1 = dyn_cast<Instruction>(Val: LHS);
2382 auto *I2 = dyn_cast<Instruction>(Val: RHS);
2383 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2384 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2385 (((isa<LoadInst>(Val: I1) && isa<LoadInst>(Val: I2)) ||
2386 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2387 (isa<ExtractElementInst>(Val: I1) && isa<ExtractElementInst>(Val: I2))) &&
2388 ShallowScoreAtThisLevel))
2389 return ShallowScoreAtThisLevel;
2390 assert(I1 && I2 && "Should have early exited.");
2391
2392 // Contains the I2 operand indexes that got matched with I1 operands.
2393 SmallSet<unsigned, 4> Op2Used;
2394
2395 // Recursion towards the operands of I1 and I2. We are trying all possible
2396 // operand pairs, and keeping track of the best score.
2397 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2398 OpIdx1 != NumOperands1; ++OpIdx1) {
2399 // Try to pair op1I with the best operand of I2.
2400 int MaxTmpScore = 0;
2401 unsigned MaxOpIdx2 = 0;
2402 bool FoundBest = false;
2403 // If I2 is commutative try all combinations.
2404 unsigned FromIdx = isCommutative(I: I2) ? 0 : OpIdx1;
2405 unsigned ToIdx = isCommutative(I: I2)
2406 ? I2->getNumOperands()
2407 : std::min(a: I2->getNumOperands(), b: OpIdx1 + 1);
2408 assert(FromIdx <= ToIdx && "Bad index");
2409 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2410 // Skip operands already paired with OpIdx1.
2411 if (Op2Used.count(V: OpIdx2))
2412 continue;
2413 // Recursively calculate the cost at each level
2414 int TmpScore =
2415 getScoreAtLevelRec(LHS: I1->getOperand(i: OpIdx1), RHS: I2->getOperand(i: OpIdx2),
2416 U1: I1, U2: I2, CurrLevel: CurrLevel + 1, MainAltOps: {});
2417 // Look for the best score.
2418 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2419 TmpScore > MaxTmpScore) {
2420 MaxTmpScore = TmpScore;
2421 MaxOpIdx2 = OpIdx2;
2422 FoundBest = true;
2423 }
2424 }
2425 if (FoundBest) {
2426 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2427 Op2Used.insert(V: MaxOpIdx2);
2428 ShallowScoreAtThisLevel += MaxTmpScore;
2429 }
2430 }
2431 return ShallowScoreAtThisLevel;
2432 }
2433 };
2434 /// A helper data structure to hold the operands of a vector of instructions.
2435 /// This supports a fixed vector length for all operand vectors.
2436 class VLOperands {
2437 /// For each operand we need (i) the value, and (ii) the opcode that it
2438 /// would be attached to if the expression was in a left-linearized form.
2439 /// This is required to avoid illegal operand reordering.
2440 /// For example:
2441 /// \verbatim
2442 /// 0 Op1
2443 /// |/
2444 /// Op1 Op2 Linearized + Op2
2445 /// \ / ----------> |/
2446 /// - -
2447 ///
2448 /// Op1 - Op2 (0 + Op1) - Op2
2449 /// \endverbatim
2450 ///
2451 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2452 ///
2453 /// Another way to think of this is to track all the operations across the
2454 /// path from the operand all the way to the root of the tree and to
2455 /// calculate the operation that corresponds to this path. For example, the
2456 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2457 /// corresponding operation is a '-' (which matches the one in the
2458 /// linearized tree, as shown above).
2459 ///
2460 /// For lack of a better term, we refer to this operation as Accumulated
2461 /// Path Operation (APO).
2462 struct OperandData {
2463 OperandData() = default;
2464 OperandData(Value *V, bool APO, bool IsUsed)
2465 : V(V), APO(APO), IsUsed(IsUsed) {}
2466 /// The operand value.
2467 Value *V = nullptr;
2468 /// TreeEntries only allow a single opcode, or an alternate sequence of
2469 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
2470 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2471 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2472 /// (e.g., Add/Mul)
2473 bool APO = false;
2474 /// Helper data for the reordering function.
2475 bool IsUsed = false;
2476 };
2477
2478 /// During operand reordering, we are trying to select the operand at lane
2479 /// that matches best with the operand at the neighboring lane. Our
2480 /// selection is based on the type of value we are looking for. For example,
2481 /// if the neighboring lane has a load, we need to look for a load that is
2482 /// accessing a consecutive address. These strategies are summarized in the
2483 /// 'ReorderingMode' enumerator.
2484 enum class ReorderingMode {
2485 Load, ///< Matching loads to consecutive memory addresses
2486 Opcode, ///< Matching instructions based on opcode (same or alternate)
2487 Constant, ///< Matching constants
2488 Splat, ///< Matching the same instruction multiple times (broadcast)
2489 Failed, ///< We failed to create a vectorizable group
2490 };
2491
2492 using OperandDataVec = SmallVector<OperandData, 2>;
2493
2494 /// A vector of operand vectors.
2495 SmallVector<OperandDataVec, 4> OpsVec;
2496 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2497 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2498 unsigned ArgSize = 0;
2499
2500 const TargetLibraryInfo &TLI;
2501 const DataLayout &DL;
2502 ScalarEvolution &SE;
2503 const BoUpSLP &R;
2504 const Loop *L = nullptr;
2505
2506 /// \returns the operand data at \p OpIdx and \p Lane.
2507 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2508 return OpsVec[OpIdx][Lane];
2509 }
2510
2511 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2512 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2513 return OpsVec[OpIdx][Lane];
2514 }
2515
2516 /// Clears the used flag for all entries.
2517 void clearUsed() {
2518 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2519 OpIdx != NumOperands; ++OpIdx)
2520 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2521 ++Lane)
2522 OpsVec[OpIdx][Lane].IsUsed = false;
2523 }
2524
2525 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2526 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2527 std::swap(a&: OpsVec[OpIdx1][Lane], b&: OpsVec[OpIdx2][Lane]);
2528 }
2529
2530 /// \param Lane lane of the operands under analysis.
2531 /// \param OpIdx operand index in \p Lane lane for which we're looking for
2532 /// the best candidate.
2533 /// \param Idx operand index of the current candidate value.
2534 /// \returns The additional score due to possible broadcasting of the
2535 /// elements in the lane. It is more profitable to have a power-of-2 number
2536 /// of unique elements in the lane, since they will be vectorized with higher
2537 /// probability after removing duplicates. Currently the SLP vectorizer
2538 /// supports only vectorization of a power-of-2 number of unique scalars.
2539 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2540 const SmallBitVector &UsedLanes) const {
2541 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
2542 if (!isa<Instruction>(Val: IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2543 isa<ExtractElementInst>(Val: IdxLaneV))
2544 return 0;
2545 SmallDenseMap<Value *, unsigned, 4> Uniques;
2546 for (unsigned Ln : seq<unsigned>(Size: getNumLanes())) {
2547 if (Ln == Lane)
2548 continue;
2549 Value *OpIdxLnV = getData(OpIdx, Lane: Ln).V;
2550 if (!isa<Instruction>(Val: OpIdxLnV))
2551 return 0;
2552 Uniques.try_emplace(Key: OpIdxLnV, Args&: Ln);
2553 }
2554 unsigned UniquesCount = Uniques.size();
2555 auto IdxIt = Uniques.find(Val: IdxLaneV);
2556 unsigned UniquesCntWithIdxLaneV =
2557 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2558 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2559 auto OpIdxIt = Uniques.find(Val: OpIdxLaneV);
2560 unsigned UniquesCntWithOpIdxLaneV =
2561 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2562 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2563 return 0;
2564 return std::min(a: bit_ceil(Value: UniquesCntWithOpIdxLaneV) -
2565 UniquesCntWithOpIdxLaneV,
2566 b: UniquesCntWithOpIdxLaneV -
2567 bit_floor(Value: UniquesCntWithOpIdxLaneV)) -
2568 ((IdxIt != Uniques.end() && UsedLanes.test(Idx: IdxIt->second))
2569 ? UniquesCntWithIdxLaneV - bit_floor(Value: UniquesCntWithIdxLaneV)
2570 : bit_ceil(Value: UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2571 }
2572
2573 /// \param Lane lane of the operands under analysis.
2574 /// \param OpIdx operand index in \p Lane lane for which we're looking for
2575 /// the best candidate.
2576 /// \param Idx operand index of the current candidate value.
2577 /// \returns The additional score for the scalar whose users are all
2578 /// vectorized.
2579 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2580 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
2581 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2582 // Do not care about number of uses for vector-like instructions
2583 // (extractelement/extractvalue with constant indices), they are extracts
2584 // themselves and already externally used. Vectorization of such
2585 // instructions does not add extra extractelement instruction, just may
2586 // remove it.
2587 if (isVectorLikeInstWithConstOps(V: IdxLaneV) &&
2588 isVectorLikeInstWithConstOps(V: OpIdxLaneV))
2589 return LookAheadHeuristics::ScoreAllUserVectorized;
2590 auto *IdxLaneI = dyn_cast<Instruction>(Val: IdxLaneV);
2591 if (!IdxLaneI || !isa<Instruction>(Val: OpIdxLaneV))
2592 return 0;
2593 return R.areAllUsersVectorized(I: IdxLaneI)
2594 ? LookAheadHeuristics::ScoreAllUserVectorized
2595 : 0;
2596 }
2597
2598 /// Score scaling factor for fully compatible instructions but with
2599 /// different number of external uses. Allows better selection of the
2600 /// instructions with fewer external uses.
2601 static const int ScoreScaleFactor = 10;
2602
2603 /// \returns the look-ahead score, which tells us how much the sub-trees
2604 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2605 /// score. This helps break ties in an informed way when we cannot decide on
2606 /// the order of the operands by just considering the immediate
2607 /// predecessors.
2608 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2609 int Lane, unsigned OpIdx, unsigned Idx,
2610 bool &IsUsed, const SmallBitVector &UsedLanes) {
2611 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2612 LookAheadMaxDepth);
2613 // Keep track of the instruction stack as we recurse into the operands
2614 // during the look-ahead score exploration.
2615 int Score =
2616 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2617 /*CurrLevel=*/1, MainAltOps);
2618 if (Score) {
2619 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2620 if (Score <= -SplatScore) {
2621 // Failed score.
2622 Score = 0;
2623 } else {
2624 Score += SplatScore;
2625 // Scale score to see the difference between different operands
2626 // and similar operands but all vectorized/not all vectorized
2627 // uses. It does not affect actual selection of the best
2628 // compatible operand in general, just allows to select the
2629 // operand with all vectorized uses.
2630 Score *= ScoreScaleFactor;
2631 Score += getExternalUseScore(Lane, OpIdx, Idx);
2632 IsUsed = true;
2633 }
2634 }
2635 return Score;
2636 }
2637
2638 /// Best defined scores per lanes between the passes. Used to choose the
2639 /// best operand (with the highest score) between the passes.
2640 /// The key - {Operand Index, Lane}.
2641 /// The value - the best score between the passes for the lane and the
2642 /// operand.
2643 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2644 BestScoresPerLanes;
2645
2646 // Search all operands in Ops[*][Lane] for the one that best matches
2647 // Ops[OpIdx][LastLane] and return its operand index.
2648 // If no good match can be found, return std::nullopt.
2649 std::optional<unsigned>
2650 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2651 ArrayRef<ReorderingMode> ReorderingModes,
2652 ArrayRef<Value *> MainAltOps,
2653 const SmallBitVector &UsedLanes) {
2654 unsigned NumOperands = getNumOperands();
2655
2656 // The operand of the previous lane at OpIdx.
2657 Value *OpLastLane = getData(OpIdx, Lane: LastLane).V;
2658
2659 // Our strategy mode for OpIdx.
2660 ReorderingMode RMode = ReorderingModes[OpIdx];
2661 if (RMode == ReorderingMode::Failed)
2662 return std::nullopt;
2663
2664 // The linearized opcode of the operand at OpIdx, Lane.
2665 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2666
2667 // The best operand index and its score.
2668 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2669 // are using the score to differentiate between the two.
2670 struct BestOpData {
2671 std::optional<unsigned> Idx;
2672 unsigned Score = 0;
2673 } BestOp;
2674 BestOp.Score =
2675 BestScoresPerLanes.try_emplace(Key: std::make_pair(x&: OpIdx, y&: Lane), Args: 0)
2676 .first->second;
2677
2678 // Track if the operand must be marked as used. If the operand is set to
2679 // Score 1 explicitly (because of a non-power-of-2 number of unique scalars),
2680 // we may want to reestimate the operands again on the following iterations.
2681 bool IsUsed = RMode == ReorderingMode::Splat ||
2682 RMode == ReorderingMode::Constant ||
2683 RMode == ReorderingMode::Load;
2684 // Iterate through all unused operands and look for the best.
2685 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2686 // Get the operand at Idx and Lane.
2687 OperandData &OpData = getData(OpIdx: Idx, Lane);
2688 Value *Op = OpData.V;
2689 bool OpAPO = OpData.APO;
2690
2691 // Skip already selected operands.
2692 if (OpData.IsUsed)
2693 continue;
2694
2695 // Skip if we are trying to move the operand to a position with a
2696 // different opcode in the linearized tree form. This would break the
2697 // semantics.
2698 if (OpAPO != OpIdxAPO)
2699 continue;
2700
2701 // Look for an operand that matches the current mode.
2702 switch (RMode) {
2703 case ReorderingMode::Load:
2704 case ReorderingMode::Opcode: {
2705 bool LeftToRight = Lane > LastLane;
2706 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2707 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2708 int Score = getLookAheadScore(LHS: OpLeft, RHS: OpRight, MainAltOps, Lane,
2709 OpIdx, Idx, IsUsed, UsedLanes);
2710 if (Score > static_cast<int>(BestOp.Score) ||
2711 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2712 Idx == OpIdx)) {
2713 BestOp.Idx = Idx;
2714 BestOp.Score = Score;
2715 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] = Score;
2716 }
2717 break;
2718 }
2719 case ReorderingMode::Constant:
2720 if (isa<Constant>(Val: Op) ||
2721 (!BestOp.Score && L && L->isLoopInvariant(V: Op))) {
2722 BestOp.Idx = Idx;
2723 if (isa<Constant>(Val: Op)) {
2724 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2725 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
2726 LookAheadHeuristics::ScoreConstants;
2727 }
2728 if (isa<UndefValue>(Val: Op) || !isa<Constant>(Val: Op))
2729 IsUsed = false;
2730 }
2731 break;
2732 case ReorderingMode::Splat:
2733 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Val: Op))) {
2734 IsUsed = Op == OpLastLane;
2735 if (Op == OpLastLane) {
2736 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2737 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
2738 LookAheadHeuristics::ScoreSplat;
2739 }
2740 BestOp.Idx = Idx;
2741 }
2742 break;
2743 case ReorderingMode::Failed:
2744 llvm_unreachable("Not expected Failed reordering mode.");
2745 }
2746 }
2747
2748 if (BestOp.Idx) {
2749 getData(OpIdx: *BestOp.Idx, Lane).IsUsed = IsUsed;
2750 return BestOp.Idx;
2751 }
2752 // If we could not find a good match return std::nullopt.
2753 return std::nullopt;
2754 }
2755
    /// Helper for reorder().
    /// \returns the lane that we should start reordering from. This is the
    /// one which has the least number of operands that can freely move
    /// about, or which is less profitable to reorder because it already has
    /// the most optimal set of operands.
2760 unsigned getBestLaneToStartReordering() const {
2761 unsigned Min = UINT_MAX;
2762 unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> is used to implement a simple voting
      // algorithm and choose the lane with the least number of operands that
      // can freely move about, or that is less profitable to reorder because
      // it already has the most optimal set of operands. The first unsigned
      // is the vote counter for the operand-ordering hash, the second is the
      // lane that contributed that ordering.
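      // Illustrative note (roughly): lanes first compete on NumOfAPOs
      // (fewer freely movable operands is better), then on the number of
      // operands sharing the same opcode and parent block. Remaining ties
      // are resolved by voting: each lane contributes a hash of its operand
      // ordering, and the selection below picks the lane associated with
      // the hash bucket that received the fewest votes.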
2769 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
2770 // Try to be closer to the original results, if we have multiple lanes
2771 // with same cost. If 2 lanes have the same cost, use the one with the
2772 // highest index.
2773 for (int I = getNumLanes(); I > 0; --I) {
2774 unsigned Lane = I - 1;
2775 OperandsOrderData NumFreeOpsHash =
2776 getMaxNumOperandsThatCanBeReordered(Lane);
2777 // Compare the number of operands that can move and choose the one with
2778 // the least number.
2779 if (NumFreeOpsHash.NumOfAPOs < Min) {
2780 Min = NumFreeOpsHash.NumOfAPOs;
2781 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2782 HashMap.clear();
2783 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
2784 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2785 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2786 // Select the most optimal lane in terms of number of operands that
2787 // should be moved around.
2788 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2789 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
2790 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2791 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2792 auto [It, Inserted] =
2793 HashMap.try_emplace(Key: NumFreeOpsHash.Hash, Args: 1, Args&: Lane);
2794 if (!Inserted)
2795 ++It->second.first;
2796 }
2797 }
2798 // Select the lane with the minimum counter.
2799 unsigned BestLane = 0;
2800 unsigned CntMin = UINT_MAX;
2801 for (const auto &Data : reverse(C&: HashMap)) {
2802 if (Data.second.first < CntMin) {
2803 CntMin = Data.second.first;
2804 BestLane = Data.second.second;
2805 }
2806 }
2807 return BestLane;
2808 }
2809
2810 /// Data structure that helps to reorder operands.
2811 struct OperandsOrderData {
2812 /// The best number of operands with the same APOs, which can be
2813 /// reordered.
2814 unsigned NumOfAPOs = UINT_MAX;
2815 /// Number of operands with the same/alternate instruction opcode and
2816 /// parent.
2817 unsigned NumOpsWithSameOpcodeParent = 0;
2818 /// Hash for the actual operands ordering.
      /// Used to count operands, actually their position ids combined with
      /// opcode values. It is used in the voting mechanism to find the lane
      /// with the least number of operands that can freely move about, or
      /// that is less profitable to reorder because it already has the most
      /// optimal set of operands. Could be replaced with a
      /// SmallVector<unsigned>, but the hash code is faster and requires
      /// less memory.
2825 unsigned Hash = 0;
2826 };
2827 /// \returns the maximum number of operands that are allowed to be reordered
    /// for \p Lane and the number of compatible instructions (with the same
2829 /// parent/opcode). This is used as a heuristic for selecting the first lane
2830 /// to start operand reordering.
2831 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2832 unsigned CntTrue = 0;
2833 unsigned NumOperands = getNumOperands();
2834 // Operands with the same APO can be reordered. We therefore need to count
2835 // how many of them we have for each APO, like this: Cnt[APO] = x.
2836 // Since we only have two APOs, namely true and false, we can avoid using
2837 // a map. Instead we can simply count the number of operands that
2838 // correspond to one of them (in this case the 'true' APO), and calculate
2839 // the other by subtracting it from the total number of operands.
2840 // Operands with the same instruction opcode and parent are more
      // profitable since we don't need to move them in many cases; with high
      // probability such a lane can already be vectorized effectively.
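      // For example, with two operands per lane, a lane like 'b - a' has
      // APOs {false, true}, so CntTrue == 1 and NumOfAPOs becomes
      // max(1, 2 - 1) == 1, while a lane like 'b + c' has APOs
      // {false, false} and NumOfAPOs == 2. The smaller value marks the more
      // constrained lane, which is preferred as the starting point.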
2843 bool AllUndefs = true;
2844 unsigned NumOpsWithSameOpcodeParent = 0;
2845 Instruction *OpcodeI = nullptr;
2846 BasicBlock *Parent = nullptr;
2847 unsigned Hash = 0;
2848 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2849 const OperandData &OpData = getData(OpIdx, Lane);
2850 if (OpData.APO)
2851 ++CntTrue;
2852 // Use Boyer-Moore majority voting for finding the majority opcode and
2853 // the number of times it occurs.
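      // Illustrative sketch: the current majority candidate (OpcodeI,
      // Parent) keeps a counter; an operand with the same opcode/parent
      // increments it, a different instruction decrements it, and when the
      // counter drops to zero the next instruction becomes the new
      // candidate. Non-instruction operands simply do not vote.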
2854 if (auto *I = dyn_cast<Instruction>(Val: OpData.V)) {
2855 if (!OpcodeI || !getSameOpcode(VL: {OpcodeI, I}, TLI) ||
2856 I->getParent() != Parent) {
2857 if (NumOpsWithSameOpcodeParent == 0) {
2858 NumOpsWithSameOpcodeParent = 1;
2859 OpcodeI = I;
2860 Parent = I->getParent();
2861 } else {
2862 --NumOpsWithSameOpcodeParent;
2863 }
2864 } else {
2865 ++NumOpsWithSameOpcodeParent;
2866 }
2867 }
2868 Hash = hash_combine(
2869 args: Hash, args: hash_value(value: (OpIdx + 1) * (OpData.V->getValueID() + 1)));
2870 AllUndefs = AllUndefs && isa<UndefValue>(Val: OpData.V);
2871 }
2872 if (AllUndefs)
2873 return {};
2874 OperandsOrderData Data;
2875 Data.NumOfAPOs = std::max(a: CntTrue, b: NumOperands - CntTrue);
2876 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2877 Data.Hash = Hash;
2878 return Data;
2879 }
2880
2881 /// Go through the instructions in VL and append their operands.
2882 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
2883 const InstructionsState &S) {
2884 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
2885 assert((empty() || all_of(Operands,
2886 [this](const ValueList &VL) {
2887 return VL.size() == getNumLanes();
2888 })) &&
2889 "Expected same number of lanes");
2890 assert(S.valid() && "InstructionsState is invalid.");
2891 // IntrinsicInst::isCommutative returns true if swapping the first "two"
2892 // arguments to the intrinsic produces the same result.
2893 constexpr unsigned IntrinsicNumOperands = 2;
2894 Instruction *MainOp = S.getMainOp();
2895 unsigned NumOperands = MainOp->getNumOperands();
2896 ArgSize = isa<IntrinsicInst>(Val: MainOp) ? IntrinsicNumOperands : NumOperands;
2897 OpsVec.resize(N: ArgSize);
2898 unsigned NumLanes = VL.size();
2899 for (OperandDataVec &Ops : OpsVec)
2900 Ops.resize(N: NumLanes);
2901 for (unsigned Lane : seq<unsigned>(Size: NumLanes)) {
2902 Value *V = VL[Lane];
2903 assert((isa<Instruction>(V) || isa<PoisonValue>(V)) &&
2904 "Expected instruction or poison value");
2905 // Our tree has just 3 nodes: the root and two operands.
2906 // It is therefore trivial to get the APO. We only need to check the
2907 // opcode of V and whether the operand at OpIdx is the LHS or RHS
2908 // operand. The LHS operand of both add and sub is never attached to an
      // inverse operation in the linearized form, therefore its APO is
2910 // false. The RHS is true only if V is an inverse operation.
2911
2912 // Since operand reordering is performed on groups of commutative
2913 // operations or alternating sequences (e.g., +, -), we can safely tell
2914 // the inverse operations by checking commutativity.
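      // For example, for V == 'a - b' the subtraction is not commutative,
      // so IsInverseOperation is true below: operand 0 (a) gets APO == false
      // and operand 1 (b) gets APO == true. For V == 'a + b' both operands
      // get APO == false. (Sketch of the common binary case.)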
2915 if (isa<PoisonValue>(Val: V)) {
2916 for (unsigned OpIdx : seq<unsigned>(Size: NumOperands))
2917 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
2918 continue;
2919 }
2920 auto [SelectedOp, Ops] = convertTo(I: cast<Instruction>(Val: V), S);
2921 // We cannot check commutativity by the converted instruction
2922 // (SelectedOp) because isCommutative also examines def-use
2923 // relationships.
2924 bool IsInverseOperation =
2925 !isCommutative(I: SelectedOp, InstWithUses: cast<Instruction>(Val: V));
2926 for (unsigned OpIdx : seq<unsigned>(Size: ArgSize)) {
2927 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2928 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
2929 }
2930 }
2931 }
2932
2933 /// \returns the number of operands.
2934 unsigned getNumOperands() const { return ArgSize; }
2935
2936 /// \returns the number of lanes.
2937 unsigned getNumLanes() const { return OpsVec[0].size(); }
2938
2939 /// \returns the operand value at \p OpIdx and \p Lane.
2940 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2941 return getData(OpIdx, Lane).V;
2942 }
2943
2944 /// \returns true if the data structure is empty.
2945 bool empty() const { return OpsVec.empty(); }
2946
2947 /// Clears the data.
2948 void clear() { OpsVec.clear(); }
2949
    /// \returns true if there are enough operands identical to \p Op to fill
    /// the whole vector (possibly mixed with constants or loop-invariant
    /// values).
2952 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
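    /// For example, if \p Op appears (unused) as an operand in every other
    /// lane, or the other lanes can contribute constants that may be
    /// permuted into its position, the value can be broadcast and the
    /// remaining elements handled as a shuffle with constants. This is an
    /// informal illustration of the checks performed below.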
2953 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2954 assert(Op == getValue(OpIdx, Lane) &&
2955 "Op is expected to be getValue(OpIdx, Lane).");
2956 // Small number of loads - try load matching.
2957 if (isa<LoadInst>(Val: Op) && getNumLanes() == 2 && getNumOperands() == 2)
2958 return false;
2959 bool OpAPO = getData(OpIdx, Lane).APO;
2960 bool IsInvariant = L && L->isLoopInvariant(V: Op);
2961 unsigned Cnt = 0;
2962 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2963 if (Ln == Lane)
2964 continue;
2965 // This is set to true if we found a candidate for broadcast at Lane.
2966 bool FoundCandidate = false;
2967 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2968 OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
2969 if (Data.APO != OpAPO || Data.IsUsed)
2970 continue;
2971 Value *OpILane = getValue(OpIdx: OpI, Lane);
2972 bool IsConstantOp = isa<Constant>(Val: OpILane);
2973 // Consider the broadcast candidate if:
2974 // 1. Same value is found in one of the operands.
2975 if (Data.V == Op ||
2976 // 2. The operand in the given lane is not constant but there is a
2977 // constant operand in another lane (which can be moved to the
2978 // given lane). In this case we can represent it as a simple
2979 // permutation of constant and broadcast.
2980 (!IsConstantOp &&
2981 ((Lns > 2 && isa<Constant>(Val: Data.V)) ||
                 // 2.1. If we have only 2 lanes, we need to check that the
                 // other operand in that lane does not form a same-opcode
                 // sequence with Op.
2984 (Lns == 2 &&
2985 !getSameOpcode(VL: {Op, getValue(OpIdx: (OpI + 1) % OpE, Lane: Ln)}, TLI) &&
2986 isa<Constant>(Val: Data.V)))) ||
2987 // 3. The operand in the current lane is loop invariant (can be
2988 // hoisted out) and another operand is also a loop invariant
2989 // (though not a constant). In this case the whole vector can be
2990 // hoisted out.
2991 // FIXME: need to teach the cost model about this case for better
2992 // estimation.
2993 (IsInvariant && !isa<Constant>(Val: Data.V) &&
2994 !getSameOpcode(VL: {Op, Data.V}, TLI) &&
2995 L->isLoopInvariant(V: Data.V))) {
2996 FoundCandidate = true;
2997 Data.IsUsed = Data.V == Op;
2998 if (Data.V == Op)
2999 ++Cnt;
3000 break;
3001 }
3002 }
3003 if (!FoundCandidate)
3004 return false;
3005 }
3006 return getNumLanes() == 2 || Cnt > 1;
3007 }
3008
    /// Checks if there is at least one operand in a lane other than \p Lane
    /// that is compatible with the operand \p Op.
3011 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3012 assert(Op == getValue(OpIdx, Lane) &&
3013 "Op is expected to be getValue(OpIdx, Lane).");
3014 bool OpAPO = getData(OpIdx, Lane).APO;
3015 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3016 if (Ln == Lane)
3017 continue;
3018 if (any_of(Range: seq<unsigned>(Size: getNumOperands()), P: [&](unsigned OpI) {
3019 const OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
3020 if (Data.APO != OpAPO || Data.IsUsed)
3021 return true;
3022 Value *OpILn = getValue(OpIdx: OpI, Lane: Ln);
3023 return (L && L->isLoopInvariant(V: OpILn)) ||
3024 (getSameOpcode(VL: {Op, OpILn}, TLI) &&
3025 allSameBlock(VL: {Op, OpILn}));
3026 }))
3027 return true;
3028 }
3029 return false;
3030 }
3031
3032 public:
3033 /// Initialize with all the operands of the instruction vector \p RootVL.
3034 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3035 const InstructionsState &S, const BoUpSLP &R)
3036 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3037 L(R.LI->getLoopFor(BB: S.getMainOp()->getParent())) {
3038 // Append all the operands of RootVL.
3039 appendOperands(VL: RootVL, Operands, S);
3040 }
3041
    /// \returns a value vector with the operands across all lanes for the
    /// operand at \p OpIdx.
3044 ValueList getVL(unsigned OpIdx) const {
3045 ValueList OpVL(OpsVec[OpIdx].size());
3046 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3047 "Expected same num of lanes across all operands");
3048 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3049 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3050 return OpVL;
3051 }
3052
3053 // Performs operand reordering for 2 or more operands.
3054 // The original operands are in OrigOps[OpIdx][Lane].
3055 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
3056 void reorder() {
3057 unsigned NumOperands = getNumOperands();
3058 unsigned NumLanes = getNumLanes();
3059 // Each operand has its own mode. We are using this mode to help us select
3060 // the instructions for each lane, so that they match best with the ones
3061 // we have selected so far.
3062 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3063
3064 // This is a greedy single-pass algorithm. We are going over each lane
3065 // once and deciding on the best order right away with no back-tracking.
3066 // However, in order to increase its effectiveness, we start with the lane
3067 // that has operands that can move the least. For example, given the
3068 // following lanes:
3069 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3070 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3071 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3072 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3073 // we will start at Lane 1, since the operands of the subtraction cannot
3074 // be reordered. Then we will visit the rest of the lanes in a circular
      // fashion. That is, Lane 2, then Lane 0, and finally Lane 3.
3076
3077 // Find the first lane that we will start our search from.
3078 unsigned FirstLane = getBestLaneToStartReordering();
3079
3080 // Initialize the modes.
3081 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3082 Value *OpLane0 = getValue(OpIdx, Lane: FirstLane);
3083 // Keep track if we have instructions with all the same opcode on one
3084 // side.
3085 if (auto *OpILane0 = dyn_cast<Instruction>(Val: OpLane0)) {
3086 // Check if OpLane0 should be broadcast.
3087 if (shouldBroadcast(Op: OpLane0, OpIdx, Lane: FirstLane) ||
3088 !canBeVectorized(Op: OpILane0, OpIdx, Lane: FirstLane))
3089 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3090 else if (isa<LoadInst>(Val: OpILane0))
3091 ReorderingModes[OpIdx] = ReorderingMode::Load;
3092 else
3093 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3094 } else if (isa<Constant>(Val: OpLane0)) {
3095 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3096 } else if (isa<Argument>(Val: OpLane0)) {
3097 // Our best hope is a Splat. It may save some cost in some cases.
3098 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3099 } else {
3100 llvm_unreachable("Unexpected value kind.");
3101 }
3102 }
3103
      // Check that we don't have the same operands. There is no need to
      // reorder if the operands are just a perfect diamond or shuffled
      // diamond match. Skipping is not done for possible broadcasts or a
      // non-power-of-2 number of scalars (just for now).
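      // For example, with 4 lanes and operands Op0 == {a, b, c, d} and
      // Op1 == {b, a, d, c}, every value of Op1 already appears in Op0, the
      // number of unique values (4) is not 2 and forms a full/power-of-2
      // vector, so reordering is skipped (a shuffled diamond match).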
3108 auto &&SkipReordering = [this]() {
3109 SmallPtrSet<Value *, 4> UniqueValues;
3110 ArrayRef<OperandData> Op0 = OpsVec.front();
3111 for (const OperandData &Data : Op0)
3112 UniqueValues.insert(Ptr: Data.V);
3113 for (ArrayRef<OperandData> Op :
3114 ArrayRef(OpsVec).slice(N: 1, M: getNumOperands() - 1)) {
3115 if (any_of(Range&: Op, P: [&UniqueValues](const OperandData &Data) {
3116 return !UniqueValues.contains(Ptr: Data.V);
3117 }))
3118 return false;
3119 }
3120 // TODO: Check if we can remove a check for non-power-2 number of
3121 // scalars after full support of non-power-2 vectorization.
3122 return UniqueValues.size() != 2 &&
3123 hasFullVectorsOrPowerOf2(TTI: *R.TTI, Ty: Op0.front().V->getType(),
3124 Sz: UniqueValues.size());
3125 };
3126
3127 // If the initial strategy fails for any of the operand indexes, then we
3128 // perform reordering again in a second pass. This helps avoid assigning
3129 // high priority to the failed strategy, and should improve reordering for
3130 // the non-failed operand indexes.
3131 for (int Pass = 0; Pass != 2; ++Pass) {
        // Check if there is no need to reorder the operands since they are a
        // perfect or shuffled diamond match.
3134 // Need to do it to avoid extra external use cost counting for
3135 // shuffled matches, which may cause regressions.
3136 if (SkipReordering())
3137 break;
3138 // Skip the second pass if the first pass did not fail.
3139 bool StrategyFailed = false;
3140 // Mark all operand data as free to use.
3141 clearUsed();
3142 // We keep the original operand order for the FirstLane, so reorder the
3143 // rest of the lanes. We are visiting the nodes in a circular fashion,
3144 // using FirstLane as the center point and increasing the radius
3145 // distance.
3146 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3147 for (unsigned I = 0; I < NumOperands; ++I)
3148 MainAltOps[I].push_back(Elt: getData(OpIdx: I, Lane: FirstLane).V);
3149
3150 SmallBitVector UsedLanes(NumLanes);
3151 UsedLanes.set(FirstLane);
3152 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3153 // Visit the lane on the right and then the lane on the left.
3154 for (int Direction : {+1, -1}) {
3155 int Lane = FirstLane + Direction * Distance;
3156 if (Lane < 0 || Lane >= (int)NumLanes)
3157 continue;
3158 UsedLanes.set(Lane);
3159 int LastLane = Lane - Direction;
3160 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3161 "Out of bounds");
3162 // Look for a good match for each operand.
3163 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3164 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
3165 std::optional<unsigned> BestIdx =
3166 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3167 MainAltOps: MainAltOps[OpIdx], UsedLanes);
3168 // By not selecting a value, we allow the operands that follow to
3169 // select a better matching value. We will get a non-null value in
3170 // the next run of getBestOperand().
3171 if (BestIdx) {
3172 // Swap the current operand with the one returned by
3173 // getBestOperand().
3174 swap(OpIdx1: OpIdx, OpIdx2: *BestIdx, Lane);
3175 } else {
3176 // Enable the second pass.
3177 StrategyFailed = true;
3178 }
3179 // Try to get the alternate opcode and follow it during analysis.
3180 if (MainAltOps[OpIdx].size() != 2) {
3181 OperandData &AltOp = getData(OpIdx, Lane);
3182 InstructionsState OpS =
3183 getSameOpcode(VL: {MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3184 if (OpS && OpS.isAltShuffle())
3185 MainAltOps[OpIdx].push_back(Elt: AltOp.V);
3186 }
3187 }
3188 }
3189 }
3190 // Skip second pass if the strategy did not fail.
3191 if (!StrategyFailed)
3192 break;
3193 }
3194 }
3195
3196#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3197 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3198 switch (RMode) {
3199 case ReorderingMode::Load:
3200 return "Load";
3201 case ReorderingMode::Opcode:
3202 return "Opcode";
3203 case ReorderingMode::Constant:
3204 return "Constant";
3205 case ReorderingMode::Splat:
3206 return "Splat";
3207 case ReorderingMode::Failed:
3208 return "Failed";
3209 }
3210 llvm_unreachable("Unimplemented Reordering Type");
3211 }
3212
3213 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3214 raw_ostream &OS) {
3215 return OS << getModeStr(RMode);
3216 }
3217
3218 /// Debug print.
3219 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3220 printMode(RMode, dbgs());
3221 }
3222
3223 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3224 return printMode(RMode, OS);
3225 }
3226
3227 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3228 const unsigned Indent = 2;
3229 unsigned Cnt = 0;
3230 for (const OperandDataVec &OpDataVec : OpsVec) {
3231 OS << "Operand " << Cnt++ << "\n";
3232 for (const OperandData &OpData : OpDataVec) {
3233 OS.indent(Indent) << "{";
3234 if (Value *V = OpData.V)
3235 OS << *V;
3236 else
3237 OS << "null";
3238 OS << ", APO:" << OpData.APO << "}\n";
3239 }
3240 OS << "\n";
3241 }
3242 return OS;
3243 }
3244
3245 /// Debug print.
3246 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3247#endif
3248 };
3249
  /// Evaluate each pair in \p Candidates and return the index into \p
  /// Candidates of the pair with the highest score, deemed to have the best
  /// chance to form the root of a profitable tree to vectorize. Return
  /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
  /// \param Limit The lower limit of the score considered to be good enough.
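  /// Roughly, each candidate pair is scored with
  /// LookAheadHeuristics::getScoreAtLevelRec, which also looks through the
  /// operands of the pair up to RootLookAheadMaxDepth, and the index of the
  /// highest-scoring pair above \p Limit is returned.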
3255 std::optional<int>
3256 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3257 int Limit = LookAheadHeuristics::ScoreFail) const {
3258 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3259 RootLookAheadMaxDepth);
3260 int BestScore = Limit;
3261 std::optional<int> Index;
3262 for (int I : seq<int>(Begin: 0, End: Candidates.size())) {
3263 int Score = LookAhead.getScoreAtLevelRec(LHS: Candidates[I].first,
3264 RHS: Candidates[I].second,
3265 /*U1=*/nullptr, /*U2=*/nullptr,
3266 /*CurrLevel=*/1, MainAltOps: {});
3267 if (Score > BestScore) {
3268 BestScore = Score;
3269 Index = I;
3270 }
3271 }
3272 return Index;
3273 }
3274
3275 /// Checks if the instruction is marked for deletion.
3276 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(V: I); }
3277
3278 /// Removes an instruction from its block and eventually deletes it.
3279 /// It's like Instruction::eraseFromParent() except that the actual deletion
3280 /// is delayed until BoUpSLP is destructed.
3281 void eraseInstruction(Instruction *I) {
3282 DeletedInstructions.insert(V: I);
3283 }
3284
3285 /// Remove instructions from the parent function and clear the operands of \p
3286 /// DeadVals instructions, marking for deletion trivially dead operands.
3287 template <typename T>
3288 void removeInstructionsAndOperands(
3289 ArrayRef<T *> DeadVals,
3290 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
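    // Rough outline of the steps below: (1) mark all DeadVals as deleted;
    // (2) salvage their debug info, collect single-use operands that become
    // trivially dead into a worklist and drop all references; (3) detach the
    // instructions from their blocks and invalidate SCEV; (4) drain the
    // worklist, nulling operands and deleting newly dead instructions, while
    // skipping vector values tracked in VectorValuesAndScales.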
3291 SmallVector<WeakTrackingVH> DeadInsts;
3292 for (T *V : DeadVals) {
3293 auto *I = cast<Instruction>(V);
3294 eraseInstruction(I);
3295 }
3296 DenseSet<Value *> Processed;
3297 for (T *V : DeadVals) {
3298 if (!V || !Processed.insert(V).second)
3299 continue;
3300 auto *I = cast<Instruction>(V);
3301 salvageDebugInfo(*I);
3302 ArrayRef<TreeEntry *> Entries = getTreeEntries(V: I);
3303 for (Use &U : I->operands()) {
3304 if (auto *OpI = dyn_cast_if_present<Instruction>(Val: U.get());
3305 OpI && !DeletedInstructions.contains(V: OpI) && OpI->hasOneUser() &&
3306 wouldInstructionBeTriviallyDead(I: OpI, TLI) &&
3307 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3308 return Entry->VectorizedValue == OpI;
3309 })))
3310 DeadInsts.push_back(Elt: OpI);
3311 }
3312 I->dropAllReferences();
3313 }
3314 for (T *V : DeadVals) {
3315 auto *I = cast<Instruction>(V);
3316 if (!I->getParent())
3317 continue;
3318 assert((I->use_empty() || all_of(I->uses(),
3319 [&](Use &U) {
3320 return isDeleted(
3321 cast<Instruction>(U.getUser()));
3322 })) &&
3323 "trying to erase instruction with users.");
3324 I->removeFromParent();
3325 SE->forgetValue(V: I);
3326 }
3327 // Process the dead instruction list until empty.
3328 while (!DeadInsts.empty()) {
3329 Value *V = DeadInsts.pop_back_val();
3330 Instruction *VI = cast_or_null<Instruction>(Val: V);
3331 if (!VI || !VI->getParent())
3332 continue;
3333 assert(isInstructionTriviallyDead(VI, TLI) &&
3334 "Live instruction found in dead worklist!");
3335 assert(VI->use_empty() && "Instructions with uses are not dead.");
3336
3337 // Don't lose the debug info while deleting the instructions.
3338 salvageDebugInfo(I&: *VI);
3339
3340 // Null out all of the instruction's operands to see if any operand
3341 // becomes dead as we go.
3342 for (Use &OpU : VI->operands()) {
3343 Value *OpV = OpU.get();
3344 if (!OpV)
3345 continue;
3346 OpU.set(nullptr);
3347
3348 if (!OpV->use_empty())
3349 continue;
3350
3351 // If the operand is an instruction that became dead as we nulled out
3352 // the operand, and if it is 'trivially' dead, delete it in a future
3353 // loop iteration.
3354 if (auto *OpI = dyn_cast<Instruction>(Val: OpV))
3355 if (!DeletedInstructions.contains(V: OpI) &&
3356 (!OpI->getType()->isVectorTy() ||
3357 none_of(VectorValuesAndScales,
3358 [&](const std::tuple<Value *, unsigned, bool> &V) {
3359 return std::get<0>(t: V) == OpI;
3360 })) &&
3361 isInstructionTriviallyDead(I: OpI, TLI))
3362 DeadInsts.push_back(Elt: OpI);
3363 }
3364
3365 VI->removeFromParent();
3366 eraseInstruction(I: VI);
3367 SE->forgetValue(V: VI);
3368 }
3369 }
3370
3371 /// Checks if the instruction was already analyzed for being possible
3372 /// reduction root.
3373 bool isAnalyzedReductionRoot(Instruction *I) const {
3374 return AnalyzedReductionsRoots.count(Ptr: I);
3375 }
3376 /// Register given instruction as already analyzed for being possible
3377 /// reduction root.
3378 void analyzedReductionRoot(Instruction *I) {
3379 AnalyzedReductionsRoots.insert(Ptr: I);
3380 }
3381 /// Checks if the provided list of reduced values was checked already for
3382 /// vectorization.
3383 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3384 return AnalyzedReductionVals.contains(V: hash_value(S: VL));
3385 }
  /// Adds the list of reduced values to the list of values already checked
  /// for vectorization.
3388 void analyzedReductionVals(ArrayRef<Value *> VL) {
3389 AnalyzedReductionVals.insert(V: hash_value(S: VL));
3390 }
3391 /// Clear the list of the analyzed reduction root instructions.
3392 void clearReductionData() {
3393 AnalyzedReductionsRoots.clear();
3394 AnalyzedReductionVals.clear();
3395 AnalyzedMinBWVals.clear();
3396 }
3397 /// Checks if the given value is gathered in one of the nodes.
3398 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3399 return any_of(Range: MustGather, P: [&](Value *V) { return Vals.contains(V); });
3400 }
3401 /// Checks if the given value is gathered in one of the nodes.
3402 bool isGathered(const Value *V) const {
3403 return MustGather.contains(Ptr: V);
3404 }
  /// Checks if the specified value was not scheduled.
3406 bool isNotScheduled(const Value *V) const {
3407 return NonScheduledFirst.contains(Ptr: V);
3408 }
3409
3410 /// Check if the value is vectorized in the tree.
3411 bool isVectorized(const Value *V) const {
3412 assert(V && "V cannot be nullptr.");
3413 return ScalarToTreeEntries.contains(Val: V);
3414 }
3415
3416 ~BoUpSLP();
3417
3418private:
  /// Determine if a node \p E can be demoted to a smaller type with a
3420 /// truncation. We collect the entries that will be demoted in ToDemote.
3421 /// \param E Node for analysis
3422 /// \param ToDemote indices of the nodes to be demoted.
3423 bool collectValuesToDemote(
3424 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3425 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3426 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3427 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3428
3429 /// Builds the list of reorderable operands on the edges \p Edges of the \p
  /// UserTE, which allow reordering (i.e. the operands can be reordered because
  /// they have only one user and are themselves reorderable).
  /// \param ReorderableGathers List of all gather nodes that require reordering
  /// (e.g., gathers of extractelements or partially vectorizable loads).
3434 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3435 /// reordering, subset of \p NonVectorized.
3436 void buildReorderableOperands(
3437 TreeEntry *UserTE,
3438 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3439 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3440 SmallVectorImpl<TreeEntry *> &GatherOps);
3441
3442 /// Checks if the given \p TE is a gather node with clustered reused scalars
3443 /// and reorders it per given \p Mask.
3444 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3445
3446 /// Checks if all users of \p I are the part of the vectorization tree.
3447 bool areAllUsersVectorized(
3448 Instruction *I,
3449 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3450
3451 /// Return information about the vector formed for the specified index
3452 /// of a vector of (the same) instruction.
3453 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
3454
3455 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3456 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3457 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3458 return const_cast<TreeEntry *>(
3459 getOperandEntry(E: const_cast<const TreeEntry *>(E), Idx));
3460 }
3461
3462 /// Gets the root instruction for the given node. If the node is a strided
3463 /// load/store node with the reverse order, the root instruction is the last
3464 /// one.
3465 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3466
3467 /// \returns Cast context for the given graph node.
3468 TargetTransformInfo::CastContextHint
3469 getCastContextHint(const TreeEntry &TE) const;
3470
3471 /// \returns the cost of the vectorizable entry.
3472 InstructionCost getEntryCost(const TreeEntry *E,
3473 ArrayRef<Value *> VectorizedVals,
3474 SmallPtrSetImpl<Value *> &CheckedExtracts);
3475
3476 /// Checks if it is legal and profitable to build SplitVectorize node for the
3477 /// given \p VL.
3478 /// \param Op1 first homogeneous scalars.
3479 /// \param Op2 second homogeneous scalars.
3480 /// \param ReorderIndices indices to reorder the scalars.
3481 /// \returns true if the node was successfully built.
3482 bool canBuildSplitNode(ArrayRef<Value *> VL,
3483 const InstructionsState &LocalState,
3484 SmallVectorImpl<Value *> &Op1,
3485 SmallVectorImpl<Value *> &Op2,
3486 OrdersType &ReorderIndices) const;
3487
3488 /// This is the recursive part of buildTree.
3489 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3490 unsigned InterleaveFactor = 0);
3491
3492 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3493 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3494 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3495 /// returns false, setting \p CurrentOrder to either an empty vector or a
3496 /// non-identity permutation that allows to reuse extract instructions.
3497 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3498 /// extract order.
3499 bool canReuseExtract(ArrayRef<Value *> VL,
3500 SmallVectorImpl<unsigned> &CurrentOrder,
3501 bool ResizeAllowed = false) const;
3502
3503 /// Vectorize a single entry in the tree.
3504 Value *vectorizeTree(TreeEntry *E);
3505
3506 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3507 /// \p E.
3508 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3509
3510 /// Create a new vector from a list of scalar values. Produces a sequence
3511 /// which exploits values reused across lanes, and arranges the inserts
3512 /// for ease of later optimization.
3513 template <typename BVTy, typename ResTy, typename... Args>
3514 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3515
3516 /// Create a new vector from a list of scalar values. Produces a sequence
3517 /// which exploits values reused across lanes, and arranges the inserts
3518 /// for ease of later optimization.
3519 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3520
3521 /// Returns the instruction in the bundle, which can be used as a base point
3522 /// for scheduling. Usually it is the last instruction in the bundle, except
3523 /// for the case when all operands are external (in this case, it is the first
3524 /// instruction in the list).
3525 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3526
3527 /// Tries to find extractelement instructions with constant indices from fixed
  /// vector type and gather such instructions into a bunch, which will most
  /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3530 /// was successful, the matched scalars are replaced by poison values in \p VL
3531 /// for future analysis.
3532 std::optional<TargetTransformInfo::ShuffleKind>
3533 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3534 SmallVectorImpl<int> &Mask) const;
3535
3536 /// Tries to find extractelement instructions with constant indices from fixed
  /// vector type and gather such instructions into a bunch, which will most
  /// likely be detected as a shuffle of 1 or 2 input vectors. If this attempt
3539 /// was successful, the matched scalars are replaced by poison values in \p VL
3540 /// for future analysis.
3541 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3542 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3543 SmallVectorImpl<int> &Mask,
3544 unsigned NumParts) const;
3545
3546 /// Checks if the gathered \p VL can be represented as a single register
3547 /// shuffle(s) of previous tree entries.
3548 /// \param TE Tree entry checked for permutation.
  /// \param VL List of scalars (a subset of the TE scalars), checked for
  /// permutations. Must form a single-register vector.
3551 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3552 /// commands to build the mask using the original vector value, without
3553 /// relying on the potential reordering.
3554 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3555 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3556 std::optional<TargetTransformInfo::ShuffleKind>
3557 isGatherShuffledSingleRegisterEntry(
3558 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3559 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3560 bool ForOrder);
3561
3562 /// Checks if the gathered \p VL can be represented as multi-register
3563 /// shuffle(s) of previous tree entries.
3564 /// \param TE Tree entry checked for permutation.
  /// \param VL List of scalars (a subset of the TE scalars), checked for
3566 /// permutations.
3567 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3568 /// commands to build the mask using the original vector value, without
3569 /// relying on the potential reordering.
3570 /// \returns per-register series of ShuffleKind, if gathered values can be
3571 /// represented as shuffles of previous tree entries. \p Mask is filled with
3572 /// the shuffle mask (also on per-register base).
3573 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3574 isGatherShuffledEntry(
3575 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3576 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3577 unsigned NumParts, bool ForOrder = false);
3578
3579 /// \returns the cost of gathering (inserting) the values in \p VL into a
3580 /// vector.
3581 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3582 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3583 Type *ScalarTy) const;
3584
3585 /// Set the Builder insert point to one after the last instruction in
3586 /// the bundle
3587 void setInsertPointAfterBundle(const TreeEntry *E);
3588
  /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3590 /// specified, the starting vector value is poison.
3591 Value *
3592 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3593 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3594
3595 /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
3597 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3598
3599 /// Run through the list of all gathered loads in the graph and try to find
3600 /// vector loads/masked gathers instead of regular gathers. Later these loads
  /// are reshuffled to build the final gathered nodes.
3602 void tryToVectorizeGatheredLoads(
3603 const SmallMapVector<
3604 std::tuple<BasicBlock *, Value *, Type *>,
3605 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3606 &GatheredLoads);
3607
  /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
  /// users of \p TE and collects the stores, grouped into bundles of related
  /// store pointers.
3611 SmallVector<SmallVector<StoreInst *>>
3612 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3613
3614 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3615 /// stores in \p StoresVec can form a vector instruction. If so it returns
3616 /// true and populates \p ReorderIndices with the shuffle indices of the
3617 /// stores when compared to the sorted vector.
3618 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3619 OrdersType &ReorderIndices) const;
3620
3621 /// Iterates through the users of \p TE, looking for scalar stores that can be
3622 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3623 /// their order and builds an order index vector for each store bundle. It
3624 /// returns all these order vectors found.
3625 /// We run this after the tree has formed, otherwise we may come across user
3626 /// instructions that are not yet in the tree.
3627 SmallVector<OrdersType, 1>
3628 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3629
3630 /// Tries to reorder the gathering node for better vectorization
3631 /// opportunities.
3632 void reorderGatherNode(TreeEntry &TE);
3633
3634 class TreeEntry {
3635 public:
3636 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3637 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3638
3639 /// \returns Common mask for reorder indices and reused scalars.
3640 SmallVector<int> getCommonMask() const {
3641 if (State == TreeEntry::SplitVectorize)
3642 return {};
3643 SmallVector<int> Mask;
3644 inversePermutation(Indices: ReorderIndices, Mask);
3645 ::addMask(Mask, SubMask: ReuseShuffleIndices);
3646 return Mask;
3647 }
3648
3649 /// \returns The mask for split nodes.
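    /// For example, with 6 scalars split at offset 2 the common width is
    /// max(2, 6 - 2) == 4, so lanes taken from the first sub-node keep
    /// indices 0..1 while lanes from the second sub-node are shifted by
    /// 4 - 2 == 2, i.e. map to 4..7 in the concatenation of the two
    /// sub-vectors (an illustrative sketch of the computation below).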
3650 SmallVector<int> getSplitMask() const {
3651 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3652 "Expected only split vectorize node.");
3653 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3654 unsigned CommonVF = std::max<unsigned>(
3655 a: CombinedEntriesWithIndices.back().second,
3656 b: Scalars.size() - CombinedEntriesWithIndices.back().second);
3657 for (auto [Idx, I] : enumerate(First: ReorderIndices))
3658 Mask[I] =
3659 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3660 ? CommonVF - CombinedEntriesWithIndices.back().second
3661 : 0);
3662 return Mask;
3663 }
3664
3665 /// Updates (reorders) SplitVectorize node according to the given mask \p
3666 /// Mask and order \p MaskOrder.
3667 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3668 ArrayRef<int> MaskOrder);
3669
3670 /// \returns true if the scalars in VL are equal to this entry.
3671 bool isSame(ArrayRef<Value *> VL) const {
3672 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3673 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3674 return std::equal(first1: VL.begin(), last1: VL.end(), first2: Scalars.begin());
3675 return VL.size() == Mask.size() &&
3676 std::equal(first1: VL.begin(), last1: VL.end(), first2: Mask.begin(),
3677 binary_pred: [Scalars](Value *V, int Idx) {
3678 return (isa<UndefValue>(Val: V) &&
3679 Idx == PoisonMaskElem) ||
3680 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3681 });
3682 };
3683 if (!ReorderIndices.empty()) {
3684 // TODO: implement matching if the nodes are just reordered, still can
3685 // treat the vector as the same if the list of scalars matches VL
3686 // directly, without reordering.
3687 SmallVector<int> Mask;
3688 inversePermutation(Indices: ReorderIndices, Mask);
3689 if (VL.size() == Scalars.size())
3690 return IsSame(Scalars, Mask);
3691 if (VL.size() == ReuseShuffleIndices.size()) {
3692 ::addMask(Mask, SubMask: ReuseShuffleIndices);
3693 return IsSame(Scalars, Mask);
3694 }
3695 return false;
3696 }
3697 return IsSame(Scalars, ReuseShuffleIndices);
3698 }
3699
3700 /// \returns true if current entry has same operands as \p TE.
3701 bool hasEqualOperands(const TreeEntry &TE) const {
3702 if (TE.getNumOperands() != getNumOperands())
3703 return false;
3704 SmallBitVector Used(getNumOperands());
3705 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3706 unsigned PrevCount = Used.count();
3707 for (unsigned K = 0; K < E; ++K) {
3708 if (Used.test(Idx: K))
3709 continue;
3710 if (getOperand(OpIdx: K) == TE.getOperand(OpIdx: I)) {
3711 Used.set(K);
3712 break;
3713 }
3714 }
3715 // Check if we actually found the matching operand.
3716 if (PrevCount == Used.count())
3717 return false;
3718 }
3719 return true;
3720 }
3721
3722 /// \return Final vectorization factor for the node. Defined by the total
    /// number of vectorized scalars, including those used several times in the
    /// entry and counted in the \a ReuseShuffleIndices, if any.
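    /// For example, an entry with Scalars == {a, b} and ReuseShuffleIndices
    /// == {0, 1, 0, 1} has a vectorization factor of 4, not 2.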
3725 unsigned getVectorFactor() const {
3726 if (!ReuseShuffleIndices.empty())
3727 return ReuseShuffleIndices.size();
3728 return Scalars.size();
3729 };
3730
3731 /// Checks if the current node is a gather node.
3732 bool isGather() const { return State == NeedToGather; }
3733
3734 /// A vector of scalars.
3735 ValueList Scalars;
3736
3737 /// The Scalars are vectorized into this value. It is initialized to Null.
3738 WeakTrackingVH VectorizedValue = nullptr;
3739
3740 /// Do we need to gather this sequence or vectorize it
3741 /// (either with vector instruction or with scatter/gather
3742 /// intrinsics for store/load)?
3743 enum EntryState {
3744 Vectorize, ///< The node is regularly vectorized.
3745 ScatterVectorize, ///< Masked scatter/gather node.
3746 StridedVectorize, ///< Strided loads (and stores)
3747 CompressVectorize, ///< (Masked) load with compress.
3748 NeedToGather, ///< Gather/buildvector node.
3749 CombinedVectorize, ///< Vectorized node, combined with its user into more
3750 ///< complex node like select/cmp to minmax, mul/add to
3751 ///< fma, etc. Must be used for the following nodes in
3752 ///< the pattern, not the very first one.
3753 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
3754 ///< independently and then combines back.
3755 };
3756 EntryState State;
3757
3758 /// List of combined opcodes supported by the vectorizer.
3759 enum CombinedOpcode {
3760 NotCombinedOp = -1,
3761 MinMax = Instruction::OtherOpsEnd + 1,
3762 };
3763 CombinedOpcode CombinedOp = NotCombinedOp;
3764
3765 /// Does this sequence require some shuffling?
3766 SmallVector<int, 4> ReuseShuffleIndices;
3767
3768 /// Does this entry require reordering?
3769 SmallVector<unsigned, 4> ReorderIndices;
3770
3771 /// Points back to the VectorizableTree.
3772 ///
3773 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3774 /// to be a pointer and needs to be able to initialize the child iterator.
3775 /// Thus we need a reference back to the container to translate the indices
3776 /// to entries.
3777 VecTreeTy &Container;
3778
3779 /// The TreeEntry index containing the user of this entry.
3780 EdgeInfo UserTreeIndex;
3781
3782 /// The index of this treeEntry in VectorizableTree.
3783 unsigned Idx = 0;
3784
3785 /// For gather/buildvector/alt opcode nodes, which are combined from
3786 /// other nodes as a series of insertvector instructions.
3787 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3788
3789 private:
3790 /// The operands of each instruction in each lane Operands[op_index][lane].
3791 /// Note: This helps avoid the replication of the code that performs the
3792 /// reordering of operands during buildTreeRec() and vectorizeTree().
3793 SmallVector<ValueList, 2> Operands;
3794
3795 /// MainOp and AltOp are recorded inside. S should be obtained from
3796 /// newTreeEntry.
3797 InstructionsState S = InstructionsState::invalid();
3798
3799 /// Interleaving factor for interleaved loads Vectorize nodes.
3800 unsigned InterleaveFactor = 0;
3801
3802 /// True if the node does not require scheduling.
3803 bool DoesNotNeedToSchedule = false;
3804
3805 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3806 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3807 if (Operands.size() < OpIdx + 1)
3808 Operands.resize(N: OpIdx + 1);
3809 assert(Operands[OpIdx].empty() && "Already resized?");
3810 assert(OpVL.size() <= Scalars.size() &&
3811 "Number of operands is greater than the number of scalars.");
3812 Operands[OpIdx].resize(N: OpVL.size());
3813 copy(Range&: OpVL, Out: Operands[OpIdx].begin());
3814 }
3815
3816 public:
3817 /// Returns interleave factor for interleave nodes.
3818 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3819 /// Sets interleaving factor for the interleaving nodes.
3820 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3821
3822 /// Marks the node as one that does not require scheduling.
3823 void setDoesNotNeedToSchedule() {
3824 assert(::doesNotNeedToSchedule(Scalars) &&
3825 "Expected to not need scheduling");
3826 DoesNotNeedToSchedule = true;
3827 }
3828 /// Returns true if the node is marked as one that does not require
3829 /// scheduling.
3830 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
3831
3832 /// Set this bundle's operands from \p Operands.
3833 void setOperands(ArrayRef<ValueList> Operands) {
3834 for (unsigned I : seq<unsigned>(Size: Operands.size()))
3835 setOperand(OpIdx: I, OpVL: Operands[I]);
3836 }
3837
3838 /// Reorders operands of the node to the given mask \p Mask.
3839 void reorderOperands(ArrayRef<int> Mask) {
3840 for (ValueList &Operand : Operands)
3841 reorderScalars(Scalars&: Operand, Mask);
3842 }
3843
3844 /// \returns the \p OpIdx operand of this TreeEntry.
3845 ValueList &getOperand(unsigned OpIdx) {
3846 assert(OpIdx < Operands.size() && "Off bounds");
3847 return Operands[OpIdx];
3848 }
3849
3850 /// \returns the \p OpIdx operand of this TreeEntry.
3851 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3852 assert(OpIdx < Operands.size() && "Off bounds");
3853 return Operands[OpIdx];
3854 }
3855
3856 /// \returns the number of operands.
3857 unsigned getNumOperands() const { return Operands.size(); }
3858
3859 /// \return the single \p OpIdx operand.
3860 Value *getSingleOperand(unsigned OpIdx) const {
3861 assert(OpIdx < Operands.size() && "Off bounds");
3862 assert(!Operands[OpIdx].empty() && "No operand available");
3863 return Operands[OpIdx][0];
3864 }
3865
3866 /// Some of the instructions in the list have alternate opcodes.
3867 bool isAltShuffle() const { return S.isAltShuffle(); }
3868
3869 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
3870 return S.getMatchingMainOpOrAltOp(I);
3871 }
3872
    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as the main operation of the bundle, the key is \p
    /// Op. Otherwise the key is the main operation itself.
3876 Value *isOneOf(Value *Op) const {
3877 auto *I = dyn_cast<Instruction>(Val: Op);
3878 if (I && getMatchingMainOpOrAltOp(I))
3879 return Op;
3880 return S.getMainOp();
3881 }
3882
3883 void setOperations(const InstructionsState &S) {
3884 assert(S && "InstructionsState is invalid.");
3885 this->S = S;
3886 }
3887
3888 Instruction *getMainOp() const { return S.getMainOp(); }
3889
3890 Instruction *getAltOp() const { return S.getAltOp(); }
3891
3892 /// The main/alternate opcodes for the list of instructions.
3893 unsigned getOpcode() const { return S.getOpcode(); }
3894
3895 unsigned getAltOpcode() const { return S.getAltOpcode(); }
3896
3897 bool hasState() const { return S.valid(); }
3898
    /// When ReorderIndices/ReuseShuffleIndices are empty it just returns the
    /// position of \p V within the vector of Scalars. Otherwise, remaps it
    /// via the reorder and reuse indices.
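    /// For example, if \p V is found at position 2 of Scalars and
    /// ReorderIndices == {1, 2, 3, 0}, the lane becomes ReorderIndices[2]
    /// == 3; with a non-empty ReuseShuffleIndices the result is further
    /// remapped to the first reuse-index position equal to 3.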
3901 int findLaneForValue(Value *V) const {
3902 unsigned FoundLane = getVectorFactor();
3903 for (auto *It = find(Range: Scalars, Val: V), *End = Scalars.end(); It != End;
3904 std::advance(i&: It, n: 1)) {
3905 if (*It != V)
3906 continue;
3907 FoundLane = std::distance(first: Scalars.begin(), last: It);
3908 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3909 if (!ReorderIndices.empty())
3910 FoundLane = ReorderIndices[FoundLane];
3911 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3912 if (ReuseShuffleIndices.empty())
3913 break;
3914 if (auto *RIt = find(Range: ReuseShuffleIndices, Val: FoundLane);
3915 RIt != ReuseShuffleIndices.end()) {
3916 FoundLane = std::distance(first: ReuseShuffleIndices.begin(), last: RIt);
3917 break;
3918 }
3919 }
3920 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
3921 return FoundLane;
3922 }
3923
3924 /// Build a shuffle mask for graph entry which represents a merge of main
3925 /// and alternate operations.
3926 void
3927 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3928 SmallVectorImpl<int> &Mask,
3929 SmallVectorImpl<Value *> *OpScalars = nullptr,
3930 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3931
3932 /// Return true if this is a non-power-of-2 node.
3933 bool isNonPowOf2Vec() const {
3934 bool IsNonPowerOf2 = !has_single_bit(Value: Scalars.size());
3935 return IsNonPowerOf2;
3936 }
3937
3938 /// Return true if this is a node, which tries to vectorize number of
3939 /// elements, forming whole vectors.
3940 bool
3941 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3942 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3943 TTI, Ty: getValueType(V: Scalars.front()), Sz: Scalars.size());
3944 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3945 "Reshuffling not supported with non-power-of-2 vectors yet.");
3946 return IsNonPowerOf2;
3947 }
3948
3949 Value *getOrdered(unsigned Idx) const {
3950 assert(isGather() && "Must be used only for buildvectors/gathers.");
3951 if (ReorderIndices.empty())
3952 return Scalars[Idx];
3953 SmallVector<int> Mask;
3954 inversePermutation(Indices: ReorderIndices, Mask);
3955 return Scalars[Mask[Idx]];
3956 }
3957
3958#ifndef NDEBUG
3959 /// Debug printer.
3960 LLVM_DUMP_METHOD void dump() const {
3961 dbgs() << Idx << ".\n";
3962 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3963 dbgs() << "Operand " << OpI << ":\n";
3964 for (const Value *V : Operands[OpI])
3965 dbgs().indent(2) << *V << "\n";
3966 }
3967 dbgs() << "Scalars: \n";
3968 for (Value *V : Scalars)
3969 dbgs().indent(2) << *V << "\n";
3970 dbgs() << "State: ";
3971 switch (State) {
3972 case Vectorize:
3973 if (InterleaveFactor > 0) {
3974 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3975 << "\n";
3976 } else {
3977 dbgs() << "Vectorize\n";
3978 }
3979 break;
3980 case ScatterVectorize:
3981 dbgs() << "ScatterVectorize\n";
3982 break;
3983 case StridedVectorize:
3984 dbgs() << "StridedVectorize\n";
3985 break;
3986 case CompressVectorize:
3987 dbgs() << "CompressVectorize\n";
3988 break;
3989 case NeedToGather:
3990 dbgs() << "NeedToGather\n";
3991 break;
3992 case CombinedVectorize:
3993 dbgs() << "CombinedVectorize\n";
3994 break;
3995 case SplitVectorize:
3996 dbgs() << "SplitVectorize\n";
3997 break;
3998 }
3999 if (S) {
4000 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4001 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4002 } else {
4003 dbgs() << "MainOp: NULL\n";
4004 dbgs() << "AltOp: NULL\n";
4005 }
4006 dbgs() << "VectorizedValue: ";
4007 if (VectorizedValue)
4008 dbgs() << *VectorizedValue << "\n";
4009 else
4010 dbgs() << "NULL\n";
4011 dbgs() << "ReuseShuffleIndices: ";
4012 if (ReuseShuffleIndices.empty())
4013 dbgs() << "Empty";
4014 else
4015 for (int ReuseIdx : ReuseShuffleIndices)
4016 dbgs() << ReuseIdx << ", ";
4017 dbgs() << "\n";
4018 dbgs() << "ReorderIndices: ";
4019 for (unsigned ReorderIdx : ReorderIndices)
4020 dbgs() << ReorderIdx << ", ";
4021 dbgs() << "\n";
4022 dbgs() << "UserTreeIndex: ";
4023 if (UserTreeIndex)
4024 dbgs() << UserTreeIndex;
4025 else
4026 dbgs() << "<invalid>";
4027 dbgs() << "\n";
4028 if (!CombinedEntriesWithIndices.empty()) {
4029 dbgs() << "Combined entries: ";
4030 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4031 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4032 });
4033 dbgs() << "\n";
4034 }
4035 }
4036#endif
4037 };
4038
4039#ifndef NDEBUG
4040 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4041 InstructionCost VecCost, InstructionCost ScalarCost,
4042 StringRef Banner) const {
4043 dbgs() << "SLP: " << Banner << ":\n";
4044 E->dump();
4045 dbgs() << "SLP: Costs:\n";
4046 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4047 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4048 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4049 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4050 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4051 }
4052#endif
4053
4054 /// Create a new gather TreeEntry
4055 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4056 const InstructionsState &S,
4057 const EdgeInfo &UserTreeIdx,
4058 ArrayRef<int> ReuseShuffleIndices = {}) {
4059 auto Invalid = ScheduleBundle::invalid();
4060 return newTreeEntry(VL, Bundle&: Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4061 }
4062
4063 /// Create a new VectorizableTree entry.
4064 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4065 const InstructionsState &S,
4066 const EdgeInfo &UserTreeIdx,
4067 ArrayRef<int> ReuseShuffleIndices = {},
4068 ArrayRef<unsigned> ReorderIndices = {},
4069 unsigned InterleaveFactor = 0) {
4070 TreeEntry::EntryState EntryState =
4071 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4072 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4073 ReuseShuffleIndices, ReorderIndices);
4074 if (E && InterleaveFactor > 0)
4075 E->setInterleave(InterleaveFactor);
4076 return E;
4077 }
4078
4079 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4080 TreeEntry::EntryState EntryState,
4081 ScheduleBundle &Bundle, const InstructionsState &S,
4082 const EdgeInfo &UserTreeIdx,
4083 ArrayRef<int> ReuseShuffleIndices = {},
4084 ArrayRef<unsigned> ReorderIndices = {}) {
4085 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4086 EntryState == TreeEntry::SplitVectorize)) ||
4087 (Bundle && EntryState != TreeEntry::NeedToGather &&
4088 EntryState != TreeEntry::SplitVectorize)) &&
4089 "Need to vectorize gather entry?");
    // Are the gathered loads still gathered? Do not create a new entry, use
    // the original one.
4091 if (GatheredLoadsEntriesFirst.has_value() &&
4092 EntryState == TreeEntry::NeedToGather && S &&
4093 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4094 !UserTreeIdx.UserTE)
4095 return nullptr;
    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last);
4103 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4104 // for non-power-of-two vectors.
4105 assert(
4106 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4107 ReuseShuffleIndices.empty()) &&
4108 "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(VL.begin(), VL.end());
      if (S)
        Last->setOperations(S);
    } else {
      // Reorder scalars and build final mask.
      Last->Scalars.assign(VL.size(), nullptr);
      transform(ReorderIndices, Last->Scalars.begin(),
                [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(VL.front()->getType());
                  return VL[Idx];
                });
      InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
      if (S)
        Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto *I = dyn_cast<Instruction>(V);
        if (!I)
          continue;
        auto It = ScalarsInSplitNodes.find(V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
    } else if (!Last->isGather()) {
      if (doesNotNeedToSchedule(VL))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (isa<PoisonValue>(V))
          continue;
        auto It = ScalarToTreeEntries.find(V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last);
          (void)Processed.insert(V);
        } else if (Processed.insert(V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Last);
        }
      }
4164 // Update the scheduler bundle to point to this TreeEntry.
4165 assert((!Bundle.getBundle().empty() || isa<PHINode>(S.getMainOp()) ||
4166 isVectorLikeInstWithConstOps(S.getMainOp()) ||
4167 Last->doesNotNeedToSchedule()) &&
4168 "Bundle and VL out of sync");
4169 if (!Bundle.getBundle().empty()) {
4170#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4171 auto *BundleMember = Bundle.getBundle().begin();
4172 SmallPtrSet<Value *, 4> Processed;
4173 for (Value *V : VL) {
4174 if (doesNotNeedToBeScheduled(V) || !Processed.insert(V).second)
4175 continue;
4176 ++BundleMember;
4177 }
4178 assert(BundleMember == Bundle.getBundle().end() &&
4179 "Bundle and VL out of sync");
4180#endif
4181 Bundle.setTreeEntry(Last);
4182 }
4183 } else {
      // Build a map for gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL)
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
        }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert_range(VL);
4198 }
4199
4200 if (UserTreeIdx.UserTE)
4201 Last->UserTreeIndex = UserTreeIdx;
4202 return Last;
4203 }
4204
4205 /// -- Vectorization State --
4206 /// Holds all of the tree entries.
4207 TreeEntry::VecTreeTy VectorizableTree;
4208
4209#ifndef NDEBUG
4210 /// Debug printer.
4211 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4212 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4213 VectorizableTree[Id]->dump();
4214 dbgs() << "\n";
4215 }
4216 }
4217#endif
4218
4219 /// Get list of vector entries, associated with the value \p V.
4220 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4221 assert(V && "V cannot be nullptr.");
    auto It = ScalarToTreeEntries.find(V);
4223 if (It == ScalarToTreeEntries.end())
4224 return {};
4225 return It->getSecond();
4226 }
4227
4228 /// Get list of split vector entries, associated with the value \p V.
4229 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4230 assert(V && "V cannot be nullptr.");
    auto It = ScalarsInSplitNodes.find(V);
4232 if (It == ScalarsInSplitNodes.end())
4233 return {};
4234 return It->getSecond();
4235 }
4236
4237 /// Returns first vector node for value \p V, matching values \p VL.
4238 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4239 bool SameVF = false) const {
4240 assert(V && "V cannot be nullptr.");
    for (TreeEntry *TE : ScalarToTreeEntries.lookup(V))
4242 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4243 return TE;
4244 return nullptr;
4245 }
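  // Illustrative sketch of how the accessors above are typically combined
  // (hypothetical caller; "reuse" is a placeholder, not a function in this
  // file):
  //   if (TreeEntry *TE = getSameValuesTreeEntry(V, VL, /*SameVF=*/true))
  //     reuse(TE); // same scalars and same vector factor as VL
  //   else if (!getTreeEntries(V).empty())
  //     ...;       // V is vectorized, but in a differently shaped node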
4246
  /// Checks that the operand node of an alternate node does not generate a
  /// buildvector sequence. If it does, it is probably not worth building an
  /// alternate shuffle: the number of buildvector operands plus the alternate
  /// instruction may exceed the number of buildvector instructions.
4251 /// \param S the instructions state of the analyzed values.
4252 /// \param VL list of the instructions with alternate opcodes.
4253 bool areAltOperandsProfitable(const InstructionsState &S,
4254 ArrayRef<Value *> VL) const;
4255
4256 /// Contains all the outputs of legality analysis for a list of values to
4257 /// vectorize.
4258 class ScalarsVectorizationLegality {
4259 InstructionsState S;
4260 bool IsLegal;
4261 bool TryToFindDuplicates;
4262 bool TrySplitVectorize;
4263
4264 public:
4265 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4266 bool TryToFindDuplicates = true,
4267 bool TrySplitVectorize = false)
4268 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4269 TrySplitVectorize(TrySplitVectorize) {
4270 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4271 "Inconsistent state");
4272 }
4273 const InstructionsState &getInstructionsState() const { return S; };
4274 bool isLegal() const { return IsLegal; }
4275 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4276 bool trySplitVectorize() const { return TrySplitVectorize; }
4277 };
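  // Illustrative sketch of how a legality result is typically consumed
  // (hypothetical control flow, not a quote of the actual tree builder):
  //   ScalarsVectorizationLegality Legality =
  //       getScalarsVectorizationLegality(VL, Depth, UserTreeIdx);
  //   if (!Legality.isLegal()) {
  //     if (Legality.trySplitVectorize())
  //       ...; // try to build a SplitVectorize node instead
  //     else if (Legality.tryToFindDuplicates())
  //       ...; // gather, possibly after de-duplicating the scalars
  //     return;
  //   }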
4278
4279 /// Checks if the specified list of the instructions/values can be vectorized
4280 /// in general.
4281 ScalarsVectorizationLegality
4282 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4283 const EdgeInfo &UserTreeIdx) const;
4284
4285 /// Checks if the specified list of the instructions/values can be vectorized
4286 /// and fills required data before actual scheduling of the instructions.
4287 TreeEntry::EntryState
4288 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
4289 bool IsScatterVectorizeUserTE,
4290 OrdersType &CurrentOrder,
4291 SmallVectorImpl<Value *> &PointerOps);
4292
4293 /// Maps a specific scalar to its tree entry(ies).
4294 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4295
4296 /// Maps the operand index and entry to the corresponding tree entry.
4297 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4298 OperandsToTreeEntry;
4299
4300 /// Scalars, used in split vectorize nodes.
4301 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4302
4303 /// Maps a value to the proposed vectorizable size.
4304 SmallDenseMap<Value *, unsigned> InstrElementSize;
4305
4306 /// A list of scalars that we found that we need to keep as scalars.
4307 ValueSet MustGather;
4308
4309 /// A set of first non-schedulable values.
4310 ValueSet NonScheduledFirst;
4311
  /// A map between the vectorized entries and the last instructions in the
  /// bundles. The bundles are built in use order, not in the def order of the
  /// instructions, so we cannot rely on the last instruction in the bundle
  /// being the last instruction in program order during vectorization. The
  /// basic blocks are modified in the process, so the last instructions need
  /// to be pre-gathered beforehand.
4318 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4319
  /// List of gather nodes that depend on other gather/vector nodes and should
  /// be emitted after the vector instruction emission process, to correctly
  /// handle the order of the vector instructions and shuffles.
4323 SetVector<const TreeEntry *> PostponedGathers;
4324
4325 using ValueToGatherNodesMap =
4326 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4327 ValueToGatherNodesMap ValueToGatherNodes;
4328
  /// A list of the load entries (node indices) that can be vectorized using a
  /// strided or masked gather approach, but which we first attempt to
  /// represent as contiguous loads.
4332 SetVector<unsigned> LoadEntriesToVectorize;
4333
4334 /// true if graph nodes transforming mode is on.
4335 bool IsGraphTransformMode = false;
4336
4337 /// The index of the first gathered load entry in the VectorizeTree.
4338 std::optional<unsigned> GatheredLoadsEntriesFirst;
4339
4340 /// Maps compress entries to their mask data for the final codegen.
4341 SmallDenseMap<const TreeEntry *,
4342 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4343 CompressEntryToData;
4344
4345 /// This POD struct describes one external user in the vectorized tree.
4346 struct ExternalUser {
4347 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L)
4348 : Scalar(S), User(U), E(E), Lane(L) {}
4349
4350 /// Which scalar in our function.
4351 Value *Scalar = nullptr;
4352
4353 /// Which user that uses the scalar.
4354 llvm::User *User = nullptr;
4355
4356 /// Vector node, the value is part of.
4357 const TreeEntry &E;
4358
4359 /// Which lane does the scalar belong to.
4360 int Lane;
4361 };
4362 using UserList = SmallVector<ExternalUser, 16>;
4363
4364 /// Checks if two instructions may access the same memory.
4365 ///
4366 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4367 /// is invariant in the calling loop.
4368 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4369 Instruction *Inst2) {
4370 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
    if (!isSimple(Inst2))
      return true;
    // First check if the result is already in the cache.
    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto Res = AliasCache.try_emplace(Key);
    if (!Res.second)
      return Res.first->second;
    bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
    // Store the result in the cache.
    Res.first->getSecond() = Aliased;
    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
4382 return Aliased;
4383 }
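  // Illustrative note: the cache is filled for both orderings of the pair, so
  // a later query with the operands swapped is answered without invoking
  // alias analysis again. Hypothetical simple memory instructions I1 and I2:
  //   bool A = isAliased(MemoryLocation::get(I1), I1, I2); // queries BatchAA
  //   bool B = isAliased(MemoryLocation::get(I2), I2, I1); // cache hit
  //   assert(A == B);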
4384
4385 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4386
4387 /// Cache for alias results.
4388 /// TODO: consider moving this to the AliasAnalysis itself.
4389 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4390
4391 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4392 // globally through SLP because we don't perform any action which
4393 // invalidates capture results.
4394 BatchAAResults BatchAA;
4395
4396 /// Temporary store for deleted instructions. Instructions will be deleted
4397 /// eventually when the BoUpSLP is destructed. The deferral is required to
4398 /// ensure that there are no incorrect collisions in the AliasCache, which
4399 /// can happen if a new instruction is allocated at the same address as a
4400 /// previously deleted instruction.
4401 DenseSet<Instruction *> DeletedInstructions;
4402
4403 /// Set of the instruction, being analyzed already for reductions.
4404 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4405
4406 /// Set of hashes for the list of reduction values already being analyzed.
4407 DenseSet<size_t> AnalyzedReductionVals;
4408
  /// Values that have already been analyzed for minimal bitwidth and found
  /// to be non-profitable.
4411 DenseSet<Value *> AnalyzedMinBWVals;
4412
  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User). External User
  /// can be nullptr, meaning that this Internal Scalar will be used later,
  /// after vectorization.
4417 UserList ExternalUses;
4418
  /// A list of GEPs which can be replaced by scalar GEPs instead of
  /// extractelement instructions.
4421 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4422
4423 /// Values used only by @llvm.assume calls.
4424 SmallPtrSet<const Value *, 32> EphValues;
4425
4426 /// Holds all of the instructions that we gathered, shuffle instructions and
4427 /// extractelements.
4428 SetVector<Instruction *> GatherShuffleExtractSeq;
4429
4430 /// A list of blocks that we are going to CSE.
4431 DenseSet<BasicBlock *> CSEBlocks;
4432
  /// List of hashes of vectors of loads, which are known to be
  /// non-vectorizable.
4434 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4435
4436 /// Represents a scheduling entity, either ScheduleData or ScheduleBundle.
  /// ScheduleData is used to gather dependencies for a single instruction,
  /// while ScheduleBundle represents a batch of instructions that are going
  /// to be grouped together.
4440 class ScheduleEntity {
4441 friend class ScheduleBundle;
4442 friend class ScheduleData;
4443
4444 protected:
4445 enum class Kind { ScheduleData, ScheduleBundle };
4446 Kind getKind() const { return K; }
4447 ScheduleEntity(Kind K) : K(K) {}
4448
4449 private:
4450 /// Used for getting a "good" final ordering of instructions.
4451 int SchedulingPriority = 0;
4452 /// True if this instruction (or bundle) is scheduled (or considered as
4453 /// scheduled in the dry-run).
4454 bool IsScheduled = false;
4455 /// The kind of the ScheduleEntity.
4456 const Kind K = Kind::ScheduleData;
4457
4458 public:
4459 ScheduleEntity() = delete;
4460 /// Gets/sets the scheduling priority.
4461 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4462 int getSchedulingPriority() const { return SchedulingPriority; }
4463 bool isReady() const {
      if (auto *SD = dyn_cast<ScheduleData>(this))
        return SD->isReady();
      return cast<ScheduleBundle>(this)->isReady();
4467 }
4468 /// Gets/sets if the bundle is scheduled.
4469 bool isScheduled() const { return IsScheduled; }
4470 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4471
4472 static bool classof(const ScheduleEntity *) { return true; }
4473 };
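  // The Kind tag above enables LLVM-style RTTI, so scheduling code can branch
  // on the concrete entity without virtual dispatch. Illustrative sketch
  // ("visit" is a hypothetical helper, not a function in this file):
  //   void visit(ScheduleEntity *E) {
  //     if (auto *SD = dyn_cast<ScheduleData>(E))
  //       ...; // a single instruction
  //     else
  //       ...; // cast<ScheduleBundle>(E): a group of instructions
  //   }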
4474
4475 /// Contains all scheduling relevant data for an instruction.
4476 /// A ScheduleData either represents a single instruction or a member of an
4477 /// instruction bundle (= a group of instructions which is combined into a
4478 /// vector instruction).
4479 class ScheduleData final : public ScheduleEntity {
4480 public:
4481 // The initial value for the dependency counters. It means that the
4482 // dependencies are not calculated yet.
4483 enum { InvalidDeps = -1 };
4484
4485 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4486 static bool classof(const ScheduleEntity *Entity) {
4487 return Entity->getKind() == Kind::ScheduleData;
4488 }
4489
4490 void init(int BlockSchedulingRegionID, Instruction *I) {
4491 NextLoadStore = nullptr;
4492 IsScheduled = false;
4493 SchedulingRegionID = BlockSchedulingRegionID;
4494 clearDependencies();
4495 Inst = I;
4496 }
4497
4498 /// Verify basic self consistency properties
4499 void verify() {
4500 if (hasValidDependencies()) {
4501 assert(UnscheduledDeps <= Dependencies && "invariant");
4502 } else {
4503 assert(UnscheduledDeps == Dependencies && "invariant");
4504 }
4505
4506 if (IsScheduled) {
4507 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4508 "unexpected scheduled state");
4509 }
4510 }
4511
    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
4515 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4516
4517 /// Returns true if it is ready for scheduling, i.e. it has no more
4518 /// unscheduled depending instructions/bundles.
4519 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4520
4521 /// Modifies the number of unscheduled dependencies for this instruction,
4522 /// and returns the number of remaining dependencies for the containing
4523 /// bundle.
4524 int incrementUnscheduledDeps(int Incr) {
4525 assert(hasValidDependencies() &&
4526 "increment of unscheduled deps would be meaningless");
4527 UnscheduledDeps += Incr;
4528 return UnscheduledDeps;
4529 }
4530
4531 /// Sets the number of unscheduled dependencies to the number of
4532 /// dependencies.
4533 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4534
4535 /// Clears all dependency information.
4536 void clearDependencies() {
4537 Dependencies = InvalidDeps;
4538 resetUnscheduledDeps();
4539 MemoryDependencies.clear();
4540 ControlDependencies.clear();
4541 IsScheduled = false;
4542 }
4543
4544 /// Gets the number of unscheduled dependencies.
4545 int getUnscheduledDeps() const { return UnscheduledDeps; }
4546 /// Gets the number of dependencies.
4547 int getDependencies() const { return Dependencies; }
4548 /// Initializes the number of dependencies.
4549 void initDependencies() { Dependencies = 0; }
4550 /// Increments the number of dependencies.
4551 void incDependencies() { Dependencies++; }
4552
4553 /// Gets scheduling region ID.
4554 int getSchedulingRegionID() const { return SchedulingRegionID; }
4555
4556 /// Gets the instruction.
4557 Instruction *getInst() const { return Inst; }
4558
4559 /// Gets the list of memory dependencies.
4560 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4561 return MemoryDependencies;
4562 }
4563 /// Adds a memory dependency.
4564 void addMemoryDependency(ScheduleData *Dep) {
      MemoryDependencies.push_back(Dep);
4566 }
4567 /// Gets the list of control dependencies.
4568 ArrayRef<ScheduleData *> getControlDependencies() const {
4569 return ControlDependencies;
4570 }
4571 /// Adds a control dependency.
4572 void addControlDependency(ScheduleData *Dep) {
      ControlDependencies.push_back(Dep);
4574 }
4575 /// Gets/sets the next load/store instruction in the block.
4576 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
4577 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
4578
4579 void dump(raw_ostream &OS) const { OS << *Inst; }
4580
4581 LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
4583 dbgs() << '\n';
4584 }
4585
4586 private:
4587 Instruction *Inst = nullptr;
4588
4589 /// Single linked list of all memory instructions (e.g. load, store, call)
4590 /// in the block - until the end of the scheduling region.
4591 ScheduleData *NextLoadStore = nullptr;
4592
4593 /// The dependent memory instructions.
4594 /// This list is derived on demand in calculateDependencies().
4595 SmallVector<ScheduleData *> MemoryDependencies;
4596
4597 /// List of instructions which this instruction could be control dependent
4598 /// on. Allowing such nodes to be scheduled below this one could introduce
4599 /// a runtime fault which didn't exist in the original program.
    /// For example, a load or udiv following a readonly call which
    /// infinitely loops.
4601 SmallVector<ScheduleData *> ControlDependencies;
4602
4603 /// This ScheduleData is in the current scheduling region if this matches
4604 /// the current SchedulingRegionID of BlockScheduling.
4605 int SchedulingRegionID = 0;
4606
    /// The number of dependencies. Consists of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
4609 /// This value is calculated on demand.
4610 /// If InvalidDeps, the number of dependencies is not calculated yet.
4611 int Dependencies = InvalidDeps;
4612
4613 /// The number of dependencies minus the number of dependencies of scheduled
4614 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4615 /// for scheduling.
4616 /// Note that this is negative as long as Dependencies is not calculated.
4617 int UnscheduledDeps = InvalidDeps;
4618 };
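  // Illustrative lifecycle of the two counters above (hypothetical numbers):
  // once the dependencies have been computed and, say, 3 were found,
  // Dependencies == 3 and UnscheduledDeps == 3. Each time a dependency gets
  // scheduled, incrementUnscheduledDeps(-1) is called; when UnscheduledDeps
  // reaches 0 the instruction reports isReady(). resetUnscheduledDeps()
  // restores UnscheduledDeps back to Dependencies so the counters can be
  // reused for another scheduling attempt.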
4619
4620#ifndef NDEBUG
4621 friend inline raw_ostream &operator<<(raw_ostream &OS,
4622 const BoUpSLP::ScheduleData &SD) {
4623 SD.dump(OS);
4624 return OS;
4625 }
4626#endif
4627
4628 class ScheduleBundle final : public ScheduleEntity {
4629 /// The schedule data for the instructions in the bundle.
4630 SmallVector<ScheduleData *> Bundle;
4631 /// True if this bundle is valid.
4632 bool IsValid = true;
4633 /// The TreeEntry that this instruction corresponds to.
4634 TreeEntry *TE = nullptr;
4635 ScheduleBundle(bool IsValid)
4636 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
4637
4638 public:
4639 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
4640 static bool classof(const ScheduleEntity *Entity) {
4641 return Entity->getKind() == Kind::ScheduleBundle;
4642 }
4643
4644 /// Verify basic self consistency properties
4645 void verify() const {
4646 for (const ScheduleData *SD : Bundle) {
4647 if (SD->hasValidDependencies()) {
4648 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
4649 "invariant");
4650 } else {
4651 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
4652 "invariant");
4653 }
4654
4655 if (isScheduled()) {
4656 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
4657 "unexpected scheduled state");
4658 }
4659 }
4660 }
4661
4662 /// Returns the number of unscheduled dependencies in the bundle.
4663 int unscheduledDepsInBundle() const {
4664 assert(*this && "bundle must not be empty");
4665 int Sum = 0;
4666 for (const ScheduleData *BundleMember : Bundle) {
4667 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
4668 return ScheduleData::InvalidDeps;
4669 Sum += BundleMember->getUnscheduledDeps();
4670 }
4671 return Sum;
4672 }
4673
    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
4677 bool hasValidDependencies() const {
      return all_of(Bundle, [](const ScheduleData *SD) {
4679 return SD->hasValidDependencies();
4680 });
4681 }
4682
4683 /// Returns true if it is ready for scheduling, i.e. it has no more
4684 /// unscheduled depending instructions/bundles.
4685 bool isReady() const {
4686 assert(*this && "bundle must not be empty");
4687 return unscheduledDepsInBundle() == 0 && !isScheduled();
4688 }
4689
4690 /// Returns the bundle of scheduling data, associated with the current
4691 /// instruction.
4692 ArrayRef<ScheduleData *> getBundle() { return Bundle; }
4693 ArrayRef<const ScheduleData *> getBundle() const { return Bundle; }
4694 /// Adds an instruction to the bundle.
    void add(ScheduleData *SD) { Bundle.push_back(SD); }
4696
4697 /// Gets/sets the associated tree entry.
4698 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
4699 TreeEntry *getTreeEntry() const { return TE; }
4700
4701 static ScheduleBundle invalid() { return {false}; }
4702
4703 operator bool() const { return IsValid; }
4704
4705#ifndef NDEBUG
4706 void dump(raw_ostream &OS) const {
4707 if (!*this) {
4708 OS << "[]";
4709 return;
4710 }
4711 OS << '[';
4712 interleaveComma(Bundle, OS,
4713 [&](const ScheduleData *SD) { OS << *SD->getInst(); });
4714 OS << ']';
4715 }
4716
4717 LLVM_DUMP_METHOD void dump() const {
4718 dump(dbgs());
4719 dbgs() << '\n';
4720 }
4721#endif // NDEBUG
4722 };
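  // Illustrative note on bundle readiness (hypothetical per-member values):
  // for a bundle whose members report unscheduled deps {0, 2, 1},
  // unscheduledDepsInBundle() returns 3, so the bundle is not ready yet; if
  // any member still reports InvalidDeps, the whole bundle reports
  // InvalidDeps as well.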
4723
4724#ifndef NDEBUG
4725 friend inline raw_ostream &operator<<(raw_ostream &OS,
4726 const BoUpSLP::ScheduleBundle &Bundle) {
4727 Bundle.dump(OS);
4728 return OS;
4729 }
4730#endif
4731
4732 friend struct GraphTraits<BoUpSLP *>;
4733 friend struct DOTGraphTraits<BoUpSLP *>;
4734
  /// Contains all scheduling data for a basic block.
  /// It does not schedule instructions that are not memory read/write
  /// instructions and whose operands are either constants, arguments, phis,
  /// or instructions from other blocks, or whose users are phis or live in
  /// other blocks. The resulting vector instructions can be placed at the
  /// beginning of the basic block without scheduling (if their operands do
  /// not need to be scheduled) or at the end of the block (if their users are
  /// outside of the block). This saves some compile time and memory used by
  /// the compiler.
  /// ScheduleData is assigned to each instruction in between the boundaries
  /// of the tree entry, even for those that are not part of the graph. It is
  /// required to correctly follow the dependencies between the instructions
  /// and their correct scheduling. ScheduleData is not allocated for
  /// instructions that do not require scheduling, like phis, nodes containing
  /// only extractelements/insertelements, or nodes whose instructions have
  /// uses/operands outside of the block.
4751 struct BlockScheduling {
4752 BlockScheduling(BasicBlock *BB)
4753 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4754
4755 void clear() {
4756 ScheduledBundles.clear();
4757 ScheduledBundlesList.clear();
4758 ReadyInsts.clear();
4759 ScheduleStart = nullptr;
4760 ScheduleEnd = nullptr;
4761 FirstLoadStoreInRegion = nullptr;
4762 LastLoadStoreInRegion = nullptr;
4763 RegionHasStackSave = false;
4764
4765 // Reduce the maximum schedule region size by the size of the
4766 // previous scheduling run.
4767 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4768 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
4769 ScheduleRegionSizeLimit = MinScheduleRegionSize;
4770 ScheduleRegionSize = 0;
4771
4772 // Make a new scheduling region, i.e. all existing ScheduleData is not
4773 // in the new region yet.
4774 ++SchedulingRegionID;
4775 }
4776
4777 ScheduleData *getScheduleData(Instruction *I) {
4778 if (!I)
4779 return nullptr;
4780 if (BB != I->getParent())
4781 // Avoid lookup if can't possibly be in map.
4782 return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
4784 if (SD && isInSchedulingRegion(SD))
4785 return SD;
4786 return nullptr;
4787 }
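    // Illustrative note on the SchedulingRegionID trick: clear() only bumps
    // the ID instead of erasing ScheduleDataMap, so stale ScheduleData from a
    // previous region is filtered out cheaply:
    //   ScheduleData *SD = ScheduleDataMap.lookup(I);
    //   if (SD && SD->getSchedulingRegionID() == SchedulingRegionID)
    //     ...; // SD belongs to the current region
    // This is the same check performed by isInSchedulingRegion(SD).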
4788
4789 ScheduleData *getScheduleData(Value *V) {
      return getScheduleData(dyn_cast<Instruction>(V));
4791 }
4792
4793 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
      auto *I = dyn_cast<Instruction>(V);
      if (!I)
        return {};
      auto It = ScheduledBundles.find(I);
4798 if (It == ScheduledBundles.end())
4799 return {};
4800 return It->getSecond();
4801 }
4802
4803 bool isInSchedulingRegion(ScheduleData *SD) const {
4804 return SD->getSchedulingRegionID() == SchedulingRegionID;
4805 }
4806
4807 bool isInSchedulingRegion(const ScheduleBundle &Bundle) const {
      return all_of(Bundle.getBundle(), [&](const ScheduleData *BundleMember) {
4809 return BundleMember->getSchedulingRegionID() == SchedulingRegionID;
4810 });
4811 }
4812
4813 /// Marks an instruction as scheduled and puts all dependent ready
4814 /// instructions into the ready-list.
4815 template <typename ReadyListType>
4816 void schedule(ScheduleEntity *Data, ReadyListType &ReadyList) {
4817 auto ProcessBundleMember = [&](ScheduleData *BundleMember,
4818 ScheduleBundle *Bundle) {
4819 // Handle the def-use chain dependencies.
4820
4821 // Decrement the unscheduled counter and insert to ready list if ready.
4822 auto DecrUnsched = [&](ScheduleData *Data, bool IsControl = false) {
4823 if ((IsControl || Data->hasValidDependencies()) &&
              Data->incrementUnscheduledDeps(-1) == 0) {
4825 // There are no more unscheduled dependencies after
4826 // decrementing, so we can put the dependent instruction
4827 // into the ready list.
            if (ArrayRef<ScheduleBundle *> Bundles =
                    getScheduleBundles(Data->getInst());
4830 !Bundles.empty()) {
4831 for (ScheduleBundle *Bundle : Bundles) {
4832 if (Bundle->unscheduledDepsInBundle() == 0) {
4833 assert(!Bundle->isScheduled() &&
4834 "already scheduled bundle gets ready");
4835 ReadyList.insert(Bundle);
4836 LLVM_DEBUG(dbgs()
4837 << "SLP: gets ready: " << *Bundle << "\n");
4838 }
4839 }
4840 return;
4841 }
4842 assert(!Data->isScheduled() &&
4843 "already scheduled bundle gets ready");
4844 ReadyList.insert(Data);
4845 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
4846 }
4847 };
4848
4849 auto DecrUnschedForInst = [&](Instruction *I) {
4850 if (ScheduleData *OpSD = getScheduleData(I))
4851 DecrUnsched(OpSD, /*IsControl=*/false);
4852 };
4853
4854 // If BundleMember is a vector bundle, its operands may have been
4855 // reordered during buildTree(). We therefore need to get its operands
4856 // through the TreeEntry.
4857 if (Bundle) {
4858 // Need to search for the lane since the tree entry can be reordered.
4859 auto *In = BundleMember->getInst();
          int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(),
                                   find(Bundle->getTreeEntry()->Scalars, In));
4862 assert(Lane >= 0 && "Lane not set");
4863
4864 // Since vectorization tree is being built recursively this assertion
4865 // ensures that the tree entry has all operands set before reaching
4866 // this code. Couple of exceptions known at the moment are extracts
          // this code. A couple of known exceptions at the moment are
          // extracts, where their second (immediate) operand is not added.
          // Since immediates do not affect scheduler behavior this is
          // considered okay.
4870 assert(In &&
4871 (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
4872 In->getNumOperands() ==
4873 Bundle->getTreeEntry()->getNumOperands()) &&
4874 "Missed TreeEntry operands?");
4875
          for (unsigned OpIdx :
               seq<unsigned>(Bundle->getTreeEntry()->getNumOperands()))
            if (auto *I = dyn_cast<Instruction>(
                    Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
4880 LLVM_DEBUG(dbgs()
4881 << "SLP: check for readiness (def): " << *I << "\n");
4882 DecrUnschedForInst(I);
4883 }
4884 } else {
4885 // If BundleMember is a stand-alone instruction, no operand reordering
4886 // has taken place, so we directly access its operands.
4887 for (Use &U : BundleMember->getInst()->operands())
          if (auto *I = dyn_cast<Instruction>(U.get())) {
4889 LLVM_DEBUG(dbgs()
4890 << "SLP: check for readiness (def): " << *I << "\n");
4891 DecrUnschedForInst(I);
4892 }
4893 }
4894 // Handle the memory dependencies.
4895 for (ScheduleData *MemoryDep : BundleMember->getMemoryDependencies()) {
4896 // There are no more unscheduled dependencies after decrementing,
4897 // so we can put the dependent instruction into the ready list.
4898 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
4899 << *MemoryDep << "\n");
4900 DecrUnsched(MemoryDep);
4901 }
4902 // Handle the control dependencies.
4903 for (ScheduleData *Dep : BundleMember->getControlDependencies()) {
4904 // There are no more unscheduled dependencies after decrementing,
4905 // so we can put the dependent instruction into the ready list.
4906 LLVM_DEBUG(dbgs()
4907 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
4908 DecrUnsched(Dep, /*IsControl=*/true);
4909 }
4910 };
      if (auto *SD = dyn_cast<ScheduleData>(Data)) {
4912 SD->setScheduled(/*Scheduled=*/true);
4913 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
4914 ProcessBundleMember(SD, nullptr);
4915 } else {
        ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data);
4917 Bundle.setScheduled(/*Scheduled=*/true);
4918 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
4919 auto AreAllBundlesScheduled = [&](const ScheduleData *SD) {
          ArrayRef<ScheduleBundle *> SDBundles =
              getScheduleBundles(SD->getInst());
4922 return !SDBundles.empty() &&
4923 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
4924 return SDBundle->isScheduled();
4925 });
4926 };
4927 for (ScheduleData *SD : Bundle.getBundle()) {
4928 if (AreAllBundlesScheduled(SD)) {
4929 SD->setScheduled(/*Scheduled=*/true);
4930 ProcessBundleMember(SD, &Bundle);
4931 }
4932 }
4933 }
4934 }
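    // ReadyListType only needs to provide insert(ScheduleEntity *) for this
    // function; the ReadyInsts SetVector declared below satisfies that. A
    // minimal illustrative stand-in (hypothetical, for exposition only):
    //   struct DummyReadyList {
    //     SmallVector<ScheduleEntity *> Items;
    //     void insert(ScheduleEntity *E) { Items.push_back(E); }
    //   };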
4935
4936 /// Verify basic self consistency properties of the data structure.
4937 void verify() {
4938 if (!ScheduleStart)
4939 return;
4940
4941 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4942 ScheduleStart->comesBefore(ScheduleEnd) &&
4943 "Not a valid scheduling region?");
4944
4945 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
4947 if (!Bundles.empty()) {
4948 for (ScheduleBundle *Bundle : Bundles) {
4949 assert(isInSchedulingRegion(*Bundle) &&
4950 "primary schedule data not in window?");
4951 Bundle->verify();
4952 }
4953 continue;
4954 }
4955 auto *SD = getScheduleData(I);
4956 if (!SD)
4957 continue;
4958 assert(isInSchedulingRegion(SD) &&
4959 "primary schedule data not in window?");
4960 SD->verify();
4961 }
4962
4963 assert(all_of(ReadyInsts,
4964 [](const ScheduleEntity *Bundle) {
4965 return Bundle->isReady();
4966 }) &&
4967 "item in ready list not ready?");
4968 }
4969
4970 /// Put all instructions into the ReadyList which are ready for scheduling.
4971 template <typename ReadyListType>
4972 void initialFillReadyList(ReadyListType &ReadyList) {
4973 SmallPtrSet<ScheduleBundle *, 16> Visited;
4974 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4975 ScheduleData *SD = getScheduleData(I);
4976 if (SD && SD->hasValidDependencies() && SD->isReady()) {
          if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(I);
4978 !Bundles.empty()) {
4979 for (ScheduleBundle *Bundle : Bundles) {
              if (!Visited.insert(Bundle).second)
4981 continue;
4982 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
4983 ReadyList.insert(Bundle);
4984 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
4985 << *Bundle << "\n");
4986 }
4987 }
4988 continue;
4989 }
4990 ReadyList.insert(SD);
4991 LLVM_DEBUG(dbgs()
4992 << "SLP: initially in ready list: " << *SD << "\n");
4993 }
4994 }
4995 }
4996
4997 /// Build a bundle from the ScheduleData nodes corresponding to the
4998 /// scalar instruction for each lane.
4999 ScheduleBundle &buildBundle(ArrayRef<Value *> VL);
5000
5001 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5002 /// cyclic dependencies. This is only a dry-run, no instructions are
5003 /// actually moved at this stage.
5004 /// \returns the scheduling bundle. The returned Optional value is not
5005 /// std::nullopt if \p VL is allowed to be scheduled.
5006 std::optional<ScheduleBundle *>
5007 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5008 const InstructionsState &S);
5009
5010 /// Allocates schedule data chunk.
5011 ScheduleData *allocateScheduleDataChunks();
5012
5013 /// Extends the scheduling region so that V is inside the region.
5014 /// \returns true if the region size is within the limit.
5015 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5016
5017 /// Initialize the ScheduleData structures for new instructions in the
5018 /// scheduling region.
5019 void initScheduleData(Instruction *FromI, Instruction *ToI,
5020 ScheduleData *PrevLoadStore,
5021 ScheduleData *NextLoadStore);
5022
5023 /// Updates the dependency information of a bundle and of all instructions/
5024 /// bundles which depend on the original bundle.
5025 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5026 BoUpSLP *SLP);
5027
    /// Sets all instructions in the scheduling region to un-scheduled.
5029 void resetSchedule();
5030
5031 BasicBlock *BB;
5032
5033 /// Simple memory allocation for ScheduleData.
5034 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
5035
5036 /// The size of a ScheduleData array in ScheduleDataChunks.
5037 int ChunkSize;
5038
5039 /// The allocator position in the current chunk, which is the last entry
5040 /// of ScheduleDataChunks.
5041 int ChunkPos;
5042
5043 /// Attaches ScheduleData to Instruction.
5044 /// Note that the mapping survives during all vectorization iterations, i.e.
5045 /// ScheduleData structures are recycled.
5046 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
5047
5048 /// Attaches ScheduleBundle to Instruction.
5049 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
5050 ScheduledBundles;
5051 /// The list of ScheduleBundles.
5052 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
5053
5054 /// The ready-list for scheduling (only used for the dry-run).
5055 SetVector<ScheduleEntity *> ReadyInsts;
5056
5057 /// The first instruction of the scheduling region.
5058 Instruction *ScheduleStart = nullptr;
5059
5060 /// The first instruction _after_ the scheduling region.
5061 Instruction *ScheduleEnd = nullptr;
5062
5063 /// The first memory accessing instruction in the scheduling region
5064 /// (can be null).
5065 ScheduleData *FirstLoadStoreInRegion = nullptr;
5066
5067 /// The last memory accessing instruction in the scheduling region
5068 /// (can be null).
5069 ScheduleData *LastLoadStoreInRegion = nullptr;
5070
5071 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
5072 /// region? Used to optimize the dependence calculation for the
5073 /// common case where there isn't.
5074 bool RegionHasStackSave = false;
5075
5076 /// The current size of the scheduling region.
5077 int ScheduleRegionSize = 0;
5078
5079 /// The maximum size allowed for the scheduling region.
5080 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
5081
5082 /// The ID of the scheduling region. For a new vectorization iteration this
5083 /// is incremented which "removes" all ScheduleData from the region.
5084 /// Make sure that the initial SchedulingRegionID is greater than the
5085 /// initial SchedulingRegionID in ScheduleData (which is 0).
5086 int SchedulingRegionID = 1;
5087 };
5088
5089 /// Attaches the BlockScheduling structures to basic blocks.
5090 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
5091
5092 /// Performs the "real" scheduling. Done before vectorization is actually
5093 /// performed in a basic block.
5094 void scheduleBlock(BlockScheduling *BS);
5095
5096 /// List of users to ignore during scheduling and that don't need extracting.
5097 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
5098
5099 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
5100 /// sorted SmallVectors of unsigned.
5101 struct OrdersTypeDenseMapInfo {
5102 static OrdersType getEmptyKey() {
5103 OrdersType V;
      V.push_back(~1U);
5105 return V;
5106 }
5107
5108 static OrdersType getTombstoneKey() {
5109 OrdersType V;
      V.push_back(~2U);
5111 return V;
5112 }
5113
5114 static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V));
5116 }
5117
5118 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
5119 return LHS == RHS;
5120 }
5121 };
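  // Illustrative usage (hypothetical map, not a member of this class): the
  // traits above allow order vectors to be used directly as hash-map keys:
  //   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrderCounters;
  //   OrdersType Order = {1, 0, 3, 2};
  //   ++OrderCounters[Order];
  // The reserved keys {~1U} and {~2U} must never be inserted by clients.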
5122
5123 // Analysis and block reference.
5124 Function *F;
5125 ScalarEvolution *SE;
5126 TargetTransformInfo *TTI;
5127 TargetLibraryInfo *TLI;
5128 LoopInfo *LI;
5129 DominatorTree *DT;
5130 AssumptionCache *AC;
5131 DemandedBits *DB;
5132 const DataLayout *DL;
5133 OptimizationRemarkEmitter *ORE;
5134
5135 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
5136 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
5137
5138 /// Instruction builder to construct the vectorized tree.
5139 IRBuilder<TargetFolder> Builder;
5140
5141 /// A map of scalar integer values to the smallest bit width with which they
5142 /// can legally be represented. The values map to (width, signed) pairs,
5143 /// where "width" indicates the minimum bit width and "signed" is True if the
5144 /// value must be signed-extended, rather than zero-extended, back to its
5145 /// original width.
5146 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
5147
5148 /// Final size of the reduced vector, if the current graph represents the
5149 /// input for the reduction and it was possible to narrow the size of the
5150 /// reduction.
5151 unsigned ReductionBitWidth = 0;
5152
5153 /// Canonical graph size before the transformations.
5154 unsigned BaseGraphSize = 1;
5155
5156 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
5157 /// type sizes, used in the tree.
5158 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
5159
  /// Indices of the vectorized nodes, which are supposed to be the roots of
  /// the new bitwidth analysis attempt, like trunc, IToFP or ICmp.
5162 DenseSet<unsigned> ExtraBitWidthNodes;
5163};
5164
5165} // end namespace slpvectorizer
5166
5167template <> struct GraphTraits<BoUpSLP *> {
5168 using TreeEntry = BoUpSLP::TreeEntry;
5169
5170 /// NodeRef has to be a pointer per the GraphWriter.
5171 using NodeRef = TreeEntry *;
5172
5173 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
5174
5175 /// Add the VectorizableTree to the index iterator to be able to return
5176 /// TreeEntry pointers.
5177 struct ChildIteratorType
5178 : public iterator_adaptor_base<
5179 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
5180 ContainerTy &VectorizableTree;
5181
5182 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
5183 ContainerTy &VT)
5184 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
5185
5186 NodeRef operator*() { return I->UserTE; }
5187 };
5188
5189 static NodeRef getEntryNode(BoUpSLP &R) {
5190 return R.VectorizableTree[0].get();
5191 }
5192
5193 static ChildIteratorType child_begin(NodeRef N) {
5194 return {&N->UserTreeIndex, N->Container};
5195 }
5196
5197 static ChildIteratorType child_end(NodeRef N) {
5198 return {&N->UserTreeIndex + 1, N->Container};
5199 }
5200
5201 /// For the node iterator we just need to turn the TreeEntry iterator into a
5202 /// TreeEntry* iterator so that it dereferences to NodeRef.
5203 class nodes_iterator {
5204 using ItTy = ContainerTy::iterator;
5205 ItTy It;
5206
5207 public:
5208 nodes_iterator(const ItTy &It2) : It(It2) {}
5209 NodeRef operator*() { return It->get(); }
5210 nodes_iterator operator++() {
5211 ++It;
5212 return *this;
5213 }
5214 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
5215 };
5216
5217 static nodes_iterator nodes_begin(BoUpSLP *R) {
5218 return nodes_iterator(R->VectorizableTree.begin());
5219 }
5220
5221 static nodes_iterator nodes_end(BoUpSLP *R) {
5222 return nodes_iterator(R->VectorizableTree.end());
5223 }
5224
5225 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
5226};
5227
5228template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
5229 using TreeEntry = BoUpSLP::TreeEntry;
5230
5231 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
5232
5233 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
5234 std::string Str;
5235 raw_string_ostream OS(Str);
5236 OS << Entry->Idx << ".\n";
    if (isSplat(Entry->Scalars))
5238 OS << "<splat> ";
5239 for (auto *V : Entry->Scalars) {
5240 OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
5242 return EU.Scalar == V;
5243 }))
5244 OS << " <extract>";
5245 OS << "\n";
5246 }
5247 return Str;
5248 }
5249
5250 static std::string getNodeAttributes(const TreeEntry *Entry,
5251 const BoUpSLP *) {
5252 if (Entry->isGather())
5253 return "color=red";
5254 if (Entry->State == TreeEntry::ScatterVectorize ||
5255 Entry->State == TreeEntry::StridedVectorize ||
5256 Entry->State == TreeEntry::CompressVectorize)
5257 return "color=blue";
5258 return "";
5259 }
5260};
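// With the GraphTraits and DOTGraphTraits specializations above, the generic
// graph utilities from GraphWriter.h can be applied to a BoUpSLP instance.
// Illustrative debug snippet (hypothetical call site inside the pass):
//   ViewGraph(this, "slp-tree");
// This renders the vectorizable tree as a DOT graph, with gather nodes drawn
// in red and scatter/strided/compress nodes in blue, per getNodeAttributes().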
5261
5262} // end namespace llvm
5263
5264BoUpSLP::~BoUpSLP() {
5265 SmallVector<WeakTrackingVH> DeadInsts;
5266 for (auto *I : DeletedInstructions) {
5267 if (!I->getParent()) {
      // Temporarily insert instructions back into a block so they can be
      // erased from their parent and from memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
5283 }
5284 I->dropAllReferences();
5285 }
5286 for (auto *I : DeletedInstructions) {
5287 assert(I->use_empty() &&
5288 "trying to erase instruction with users.");
5289 I->eraseFromParent();
5290 }
5291
5292 // Cleanup any dead scalar code feeding the vectorized instructions
5293 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
5294
5295#ifdef EXPENSIVE_CHECKS
5296 // If we could guarantee that this call is not extremely slow, we could
5297 // remove the ifdef limitation (see PR47712).
5298 assert(!verifyFunction(*F, &dbgs()));
5299#endif
5300}
5301
/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains the original mask for the scalars reused in the node. The
/// procedure transforms this mask in accordance with the given \p Mask.
5305static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
5306 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
5307 "Expected non-empty mask.");
5308 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
5310 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
5311 if (Mask[I] != PoisonMaskElem)
5312 Reuses[Mask[I]] = Prev[I];
5313}
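// Illustrative example (hypothetical values): with Reuses = {0, 1, 2, 3} and
// Mask = {2, 0, 3, 1}, each previous element Prev[I] is moved to slot Mask[I],
// giving Reuses = {1, 3, 0, 2}. A PoisonMaskElem entry in Mask leaves the
// corresponding previous value unplaced.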
5314
/// Reorders the given \p Order according to the given \p Mask. \p Order is
/// the original order of the scalars. The procedure transforms the provided
/// order in accordance with the given \p Mask. If the resulting \p Order is
/// just an identity order, \p Order is cleared.
5319static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
5320 bool BottomOrder = false) {
5321 assert(!Mask.empty() && "Expected non-empty mask.");
5322 unsigned Sz = Mask.size();
5323 if (BottomOrder) {
5324 SmallVector<unsigned> PrevOrder;
5325 if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
5336 return Data.value() == Sz || Data.index() == Data.value();
5337 })) {
5338 Order.clear();
5339 return;
5340 }
5341 fixupOrderingIndices(Order);
5342 return;
5343 }
5344 SmallVector<int> MaskOrder;
5345 if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
5357 for (unsigned I = 0; I < Sz; ++I)
5358 if (MaskOrder[I] != PoisonMaskElem)
5359 Order[MaskOrder[I]] = I;
5360 fixupOrderingIndices(Order);
5361}
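// Illustrative example (hypothetical values, BottomOrder == false): for an
// empty Order and Mask = {1, 0, 3, 2}, MaskOrder starts as the identity, is
// permuted by reorderReuses() to {1, 0, 3, 2}, and the final Order becomes
// {1, 0, 3, 2}. If the permuted MaskOrder had been an identity mask, Order
// would have been cleared instead.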
5362
5363std::optional<BoUpSLP::OrdersType>
5364BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
5365 bool TopToBottom, bool IgnoreReorder) {
5366 assert(TE.isGather() && "Expected gather node only.");
5367 // Try to find subvector extract/insert patterns and reorder only such
5368 // patterns.
5369 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
5370 Type *ScalarTy = GatheredScalars.front()->getType();
5371 size_t NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, NumScalars);
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
5383 /*ForOrder=*/true);
5384 // No shuffled operands - ignore.
5385 if (GatherShuffles.empty() && ExtractShuffles.empty())
5386 return std::nullopt;
5387 OrdersType CurrentOrder(NumScalars, NumScalars);
5388 if (GatherShuffles.size() == 1 &&
5389 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
5390 Entries.front().front()->isSame(VL: TE.Scalars)) {
5391 // If the full matched node in whole tree rotation - no need to consider the
5392 // matching order, rotating the whole tree.
5393 if (TopToBottom)
5394 return std::nullopt;
5395 // No need to keep the order for the same user node.
5396 if (Entries.front().front()->UserTreeIndex.UserTE ==
5397 TE.UserTreeIndex.UserTE)
5398 return std::nullopt;
5399 // No need to keep the order for the matched root node, if it can be freely
5400 // reordered.
5401 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
5402 return std::nullopt;
5403 // If shuffling 2 elements only and the matching node has reverse reuses -
5404 // no need to count order, both work fine.
5405 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
5406 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
5407 any_of(Range: enumerate(First: Entries.front().front()->ReuseShuffleIndices),
5408 P: [](const auto &P) {
5409 return P.value() % 2 != static_cast<int>(P.index()) % 2;
5410 }))
5411 return std::nullopt;
5412
5413 // Perfect match in the graph, will reuse the previously vectorized
5414 // node. Cost is 0.
5415 std::iota(first: CurrentOrder.begin(), last: CurrentOrder.end(), value: 0);
5416 return CurrentOrder;
5417 }
5418 auto IsSplatMask = [](ArrayRef<int> Mask) {
5419 int SingleElt = PoisonMaskElem;
5420 return all_of(Range&: Mask, P: [&](int I) {
5421 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
5422 SingleElt = I;
5423 return I == PoisonMaskElem || I == SingleElt;
5424 });
5425 };
5426 // Exclusive broadcast mask - ignore.
5427 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
5428 (Entries.size() != 1 ||
5429 Entries.front().front()->ReorderIndices.empty())) ||
5430 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
5431 return std::nullopt;
5432 SmallBitVector ShuffledSubMasks(NumParts);
5433 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
5434 ArrayRef<int> Mask, int PartSz, int NumParts,
5435 function_ref<unsigned(unsigned)> GetVF) {
5436 for (int I : seq<int>(Begin: 0, End: NumParts)) {
5437 if (ShuffledSubMasks.test(Idx: I))
5438 continue;
5439 const int VF = GetVF(I);
5440 if (VF == 0)
5441 continue;
5442 unsigned Limit = getNumElems(Size: CurrentOrder.size(), PartNumElems: PartSz, Part: I);
5443 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(N: I * PartSz, M: Limit);
5444 // Shuffle of at least 2 vectors - ignore.
5445 if (any_of(Range&: Slice, P: [&](unsigned I) { return I != NumScalars; })) {
5446 llvm::fill(Range&: Slice, Value&: NumScalars);
5447 ShuffledSubMasks.set(I);
5448 continue;
5449 }
5450 // Try to include as much elements from the mask as possible.
5451 int FirstMin = INT_MAX;
5452 int SecondVecFound = false;
5453 for (int K : seq<int>(Size: Limit)) {
5454 int Idx = Mask[I * PartSz + K];
5455 if (Idx == PoisonMaskElem) {
5456 Value *V = GatheredScalars[I * PartSz + K];
5457 if (isConstant(V) && !isa<PoisonValue>(Val: V)) {
5458 SecondVecFound = true;
5459 break;
5460 }
5461 continue;
5462 }
5463 if (Idx < VF) {
5464 if (FirstMin > Idx)
5465 FirstMin = Idx;
5466 } else {
5467 SecondVecFound = true;
5468 break;
5469 }
5470 }
5471 FirstMin = (FirstMin / PartSz) * PartSz;
5472 // Shuffle of at least 2 vectors - ignore.
5473 if (SecondVecFound) {
5474 llvm::fill(Range&: Slice, Value&: NumScalars);
5475 ShuffledSubMasks.set(I);
5476 continue;
5477 }
5478 for (int K : seq<int>(Size: Limit)) {
5479 int Idx = Mask[I * PartSz + K];
5480 if (Idx == PoisonMaskElem)
5481 continue;
5482 Idx -= FirstMin;
5483 if (Idx >= PartSz) {
5484 SecondVecFound = true;
5485 break;
5486 }
5487 if (CurrentOrder[I * PartSz + Idx] >
5488 static_cast<unsigned>(I * PartSz + K) &&
5489 CurrentOrder[I * PartSz + Idx] !=
5490 static_cast<unsigned>(I * PartSz + Idx))
5491 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
5492 }
5493 // Shuffle of at least 2 vectors - ignore.
5494 if (SecondVecFound) {
5495 llvm::fill(Range&: Slice, Value&: NumScalars);
5496 ShuffledSubMasks.set(I);
5497 continue;
5498 }
5499 }
5500 };
5501 int PartSz = getPartNumElems(Size: NumScalars, NumParts);
5502 if (!ExtractShuffles.empty())
5503 TransformMaskToOrder(
5504 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
5505 if (!ExtractShuffles[I])
5506 return 0U;
5507 unsigned VF = 0;
5508 unsigned Sz = getNumElems(Size: TE.getVectorFactor(), PartNumElems: PartSz, Part: I);
5509 for (unsigned Idx : seq<unsigned>(Size: Sz)) {
5510 int K = I * PartSz + Idx;
5511 if (ExtractMask[K] == PoisonMaskElem)
5512 continue;
5513 if (!TE.ReuseShuffleIndices.empty())
5514 K = TE.ReuseShuffleIndices[K];
5515 if (K == PoisonMaskElem)
5516 continue;
5517 if (!TE.ReorderIndices.empty())
5518 K = std::distance(first: TE.ReorderIndices.begin(),
5519 last: find(Range: TE.ReorderIndices, Val: K));
5520 auto *EI = dyn_cast<ExtractElementInst>(Val: TE.Scalars[K]);
5521 if (!EI)
5522 continue;
5523 VF = std::max(a: VF, b: cast<VectorType>(Val: EI->getVectorOperandType())
5524 ->getElementCount()
5525 .getKnownMinValue());
5526 }
5527 return VF;
5528 });
5529 // Check special corner case - single shuffle of the same entry.
5530 if (GatherShuffles.size() == 1 && NumParts != 1) {
5531 if (ShuffledSubMasks.any())
5532 return std::nullopt;
5533 PartSz = NumScalars;
5534 NumParts = 1;
5535 }
5536 if (!Entries.empty())
5537 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
5538 if (!GatherShuffles[I])
5539 return 0U;
5540 return std::max(a: Entries[I].front()->getVectorFactor(),
5541 b: Entries[I].back()->getVectorFactor());
5542 });
5543 unsigned NumUndefs =
5544 count_if(Range&: CurrentOrder, P: [&](unsigned Idx) { return Idx == NumScalars; });
5545 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
5546 return std::nullopt;
5547 return std::move(CurrentOrder);
5548}
5549
5550static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
5551 const TargetLibraryInfo &TLI,
5552 bool CompareOpcodes = true) {
  if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
      getUnderlyingObject(Ptr2, RecursionMaxDepth))
    return false;
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
5565}
5566
5567/// Calculates minimal alignment as a common alignment.
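/// For example (illustrative), for a bundle of loads with alignments
/// {16, 8, 4} this returns Align(4), i.e. the minimum alignment over the
/// whole bundle.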
5568template <typename T>
5569static Align computeCommonAlignment(ArrayRef<Value *> VL) {
5570 Align CommonAlignment = cast<T>(VL.front())->getAlign();
5571 for (Value *V : VL.drop_front())
5572 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
5573 return CommonAlignment;
5574}
5575
5576/// Check if \p Order represents reverse order.
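/// For example (illustrative), for an order of size 4, {3, 2, 1, 0} is a
/// reverse order; an entry equal to the order size (e.g. {3, 4, 1, 0}) is
/// treated as unspecified and does not break the pattern.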
5577static bool isReverseOrder(ArrayRef<unsigned> Order) {
5578 assert(!Order.empty() &&
5579 "Order is empty. Please check it before using isReverseOrder.");
5580 unsigned Sz = Order.size();
5581 return all_of(Range: enumerate(First&: Order), P: [&](const auto &Pair) {
5582 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
5583 });
5584}
5585
5586 /// Checks if the provided list of pointers \p PointerOps represents
5587 /// strided pointers for type ElemTy. If they do not, std::nullopt is
5588 /// returned. Otherwise, if \p Inst is not specified, an engaged optional
5589 /// value is returned to show that the pointers are strided. If \p Inst is
5590 /// specified, the runtime stride is materialized before the given \p Inst.
5591 /// \returns std::nullopt if the pointers have no runtime stride; otherwise
5592 /// nullptr or the actual stride value.
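/// For example (an illustrative sketch): for i32 elements and pointers at
/// byte offsets %p, %p + 4 * %n, %p + 8 * %n, %p + 12 * %n, the runtime
/// stride is %n elements, and SortedIndices stays empty because the pointers
/// are already in stride order.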
5593static std::optional<Value *>
5594calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
5595 const DataLayout &DL, ScalarEvolution &SE,
5596 SmallVectorImpl<unsigned> &SortedIndices,
5597 Instruction *Inst = nullptr) {
5598 SmallVector<const SCEV *> SCEVs;
5599 const SCEV *PtrSCEVLowest = nullptr;
5600 const SCEV *PtrSCEVHighest = nullptr;
5601 // Find the lower/upper pointers from PointerOps (i.e. the ones with the
5602 // lowest and highest addresses).
5603 for (Value *Ptr : PointerOps) {
5604 const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
5605 if (!PtrSCEV)
5606 return std::nullopt;
5607 SCEVs.push_back(Elt: PtrSCEV);
5608 if (!PtrSCEVLowest && !PtrSCEVHighest) {
5609 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
5610 continue;
5611 }
5612 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
5613 if (isa<SCEVCouldNotCompute>(Val: Diff))
5614 return std::nullopt;
5615 if (Diff->isNonConstantNegative()) {
5616 PtrSCEVLowest = PtrSCEV;
5617 continue;
5618 }
5619 const SCEV *Diff1 = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEV);
5620 if (isa<SCEVCouldNotCompute>(Val: Diff1))
5621 return std::nullopt;
5622 if (Diff1->isNonConstantNegative()) {
5623 PtrSCEVHighest = PtrSCEV;
5624 continue;
5625 }
5626 }
5627 // Dist = PtrSCEVHighest - PtrSCEVLowest;
5628 const SCEV *Dist = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEVLowest);
5629 if (isa<SCEVCouldNotCompute>(Val: Dist))
5630 return std::nullopt;
5631 int Size = DL.getTypeStoreSize(Ty: ElemTy);
5632 auto TryGetStride = [&](const SCEV *Dist,
5633 const SCEV *Multiplier) -> const SCEV * {
5634 if (const auto *M = dyn_cast<SCEVMulExpr>(Val: Dist)) {
5635 if (M->getOperand(i: 0) == Multiplier)
5636 return M->getOperand(i: 1);
5637 if (M->getOperand(i: 1) == Multiplier)
5638 return M->getOperand(i: 0);
5639 return nullptr;
5640 }
5641 if (Multiplier == Dist)
5642 return SE.getConstant(Ty: Dist->getType(), V: 1);
5643 return SE.getUDivExactExpr(LHS: Dist, RHS: Multiplier);
5644 };
5645 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
5646 const SCEV *Stride = nullptr;
5647 if (Size != 1 || SCEVs.size() > 2) {
5648 const SCEV *Sz = SE.getConstant(Ty: Dist->getType(), V: Size * (SCEVs.size() - 1));
5649 Stride = TryGetStride(Dist, Sz);
5650 if (!Stride)
5651 return std::nullopt;
5652 }
5653 if (!Stride || isa<SCEVConstant>(Val: Stride))
5654 return std::nullopt;
5655 // Iterate through all pointers and check if all distances are
5656 // unique multiples of Stride.
5657 using DistOrdPair = std::pair<int64_t, int>;
5658 auto Compare = llvm::less_first();
5659 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
5660 int Cnt = 0;
5661 bool IsConsecutive = true;
5662 for (const SCEV *PtrSCEV : SCEVs) {
5663 unsigned Dist = 0;
5664 if (PtrSCEV != PtrSCEVLowest) {
5665 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
5666 const SCEV *Coeff = TryGetStride(Diff, Stride);
5667 if (!Coeff)
5668 return std::nullopt;
5669 const auto *SC = dyn_cast<SCEVConstant>(Val: Coeff);
5670 if (!SC || isa<SCEVCouldNotCompute>(Val: SC))
5671 return std::nullopt;
5672 if (!SE.getMinusSCEV(LHS: PtrSCEV, RHS: SE.getAddExpr(LHS: PtrSCEVLowest,
5673 RHS: SE.getMulExpr(LHS: Stride, RHS: SC)))
5674 ->isZero())
5675 return std::nullopt;
5676 Dist = SC->getAPInt().getZExtValue();
5677 }
5678 // If the strides are not the same or repeated, we can't vectorize.
5679 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
5680 return std::nullopt;
5681 auto Res = Offsets.emplace(args&: Dist, args&: Cnt);
5682 if (!Res.second)
5683 return std::nullopt;
5684 // Consecutive order if the inserted element is the last one.
5685 IsConsecutive = IsConsecutive && std::next(x: Res.first) == Offsets.end();
5686 ++Cnt;
5687 }
5688 if (Offsets.size() != SCEVs.size())
5689 return std::nullopt;
5690 SortedIndices.clear();
5691 if (!IsConsecutive) {
5692 // Fill SortedIndices array only if it is non-consecutive.
5693 SortedIndices.resize(N: PointerOps.size());
5694 Cnt = 0;
5695 for (const std::pair<int64_t, int> &Pair : Offsets) {
5696 SortedIndices[Cnt] = Pair.second;
5697 ++Cnt;
5698 }
5699 }
5700 if (!Inst)
5701 return nullptr;
5702 SCEVExpander Expander(SE, DL, "strided-load-vec");
5703 return Expander.expandCodeFor(SH: Stride, Ty: Stride->getType(), I: Inst);
5704}
5705
5706static std::pair<InstructionCost, InstructionCost>
5707getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
5708 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
5709 Type *ScalarTy, VectorType *VecTy);
5710
5711/// Returns the cost of the shuffle instructions with the given \p Kind, vector
5712 /// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
5713/// subvector pattern.
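/// For example (illustrative), with two 8-element sources the mask
/// {0, 1, 8, 9, 4, 5, 6, 7} matches an insert-subvector pattern: a 2-element
/// subvector taken from the second source is inserted at index 2 of the
/// first source.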
5714static InstructionCost
5715getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
5716 VectorType *Tp, ArrayRef<int> Mask = {},
5717 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
5718 int Index = 0, VectorType *SubTp = nullptr,
5719 ArrayRef<const Value *> Args = {}) {
5720 VectorType *DstTy = Tp;
5721 if (!Mask.empty())
5722 DstTy = FixedVectorType::get(ElementType: Tp->getScalarType(), NumElts: Mask.size());
5723
5724 if (Kind != TTI::SK_PermuteTwoSrc)
5725 return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp,
5726 Args);
5727 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
5728 int NumSubElts;
5729 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
5730 Mask, NumSrcElts, NumSubElts, Index)) {
5731 if (Index + NumSubElts > NumSrcElts &&
5732 Index + NumSrcElts <= static_cast<int>(Mask.size()))
5733 return TTI.getShuffleCost(Kind: TTI::SK_InsertSubvector, DstTy, SrcTy: Tp, Mask,
5734 CostKind: TTI::TCK_RecipThroughput, Index, SubTp: Tp);
5735 }
5736 return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp,
5737 Args);
5738}
5739
5740/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
5741/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
5742/// instead of a scalar.
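/// For example (illustrative), with ScalarTy == <4 x i32> and Ty == <16 x i32>
/// each demanded "element" is a whole 4-wide subvector, so the overhead is
/// modeled with insert/extract-subvector shuffles at indices 0, 4, 8 and 12
/// instead of insertelement/extractelement costs.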
5743static InstructionCost
5744getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
5745 VectorType *Ty, const APInt &DemandedElts, bool Insert,
5746 bool Extract, TTI::TargetCostKind CostKind,
5747 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
5748 assert(!isa<ScalableVectorType>(Ty) &&
5749 "ScalableVectorType is not supported.");
5750 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
5751 getNumElements(Ty) &&
5752 "Incorrect usage.");
5753 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
5754 assert(SLPReVec && "Only supported by REVEC.");
5755 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
5756 // of CreateInsertElement.
5757 unsigned ScalarTyNumElements = VecTy->getNumElements();
5758 InstructionCost Cost = 0;
5759 for (unsigned I : seq(Size: DemandedElts.getBitWidth())) {
5760 if (!DemandedElts[I])
5761 continue;
5762 if (Insert)
5763 Cost += getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: Ty, Mask: {}, CostKind,
5764 Index: I * ScalarTyNumElements, SubTp: VecTy);
5765 if (Extract)
5766 Cost += getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: Ty, Mask: {}, CostKind,
5767 Index: I * ScalarTyNumElements, SubTp: VecTy);
5768 }
5769 return Cost;
5770 }
5771 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
5772 CostKind, ForPoisonSrc, VL);
5773}
5774
5775/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
5776/// is a FixedVectorType, a vector will be extracted instead of a scalar.
5777static InstructionCost getVectorInstrCost(
5778 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
5779 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
5780 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
5781 if (Opcode == Instruction::ExtractElement) {
5782 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
5783 assert(SLPReVec && "Only supported by REVEC.");
5784 assert(isa<VectorType>(Val) && "Val must be a vector type.");
5785 return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector,
5786 Tp: cast<VectorType>(Val), Mask: {}, CostKind,
5787 Index: Index * VecTy->getNumElements(), SubTp: VecTy);
5788 }
5789 }
5790 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
5791 ScalarUserAndIdx);
5792}
5793
5794/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
5795/// is a FixedVectorType, a vector will be extracted instead of a scalar.
5796static InstructionCost getExtractWithExtendCost(
5797 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
5798 VectorType *VecTy, unsigned Index,
5799 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
5800 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Val: Dst)) {
5801 assert(SLPReVec && "Only supported by REVEC.");
5802 auto *SubTp =
5803 getWidenedType(ScalarTy: VecTy->getElementType(), VF: ScalarTy->getNumElements());
5804 return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: VecTy, Mask: {}, CostKind,
5805 Index: Index * ScalarTy->getNumElements(), SubTp) +
5806 TTI.getCastInstrCost(Opcode, Dst, Src: SubTp, CCH: TTI::CastContextHint::None,
5807 CostKind);
5808 }
5809 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
5810}
5811
5812 /// Correctly creates insert_subvector, checking that the index is a multiple
5813 /// of the subvector length. Otherwise, generates a shuffle using \p Generator
5814 /// or a default shuffle.
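/// For example (illustrative), inserting a <2 x float> value into an
/// <8 x float> vector at index 4 uses insert_subvector directly, while index 3
/// is not a multiple of 2 and is lowered to a shuffle with the mask
/// {0, 1, 2, 8, 9, 5, 6, 7}.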
5815static Value *createInsertVector(
5816 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
5817 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
5818 const unsigned SubVecVF = getNumElements(Ty: V->getType());
5819 if (Index % SubVecVF == 0) {
5820 Vec = Builder.CreateInsertVector(DstType: Vec->getType(), SrcVec: Vec, SubVec: V, Idx: Index);
5821 } else {
5822 // Create shuffle, insertvector requires that index is multiple of
5823 // the subvector length.
5824 const unsigned VecVF = getNumElements(Ty: Vec->getType());
5825 SmallVector<int> Mask(VecVF, PoisonMaskElem);
5826 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
5827 for (unsigned I : seq<unsigned>(Size: SubVecVF))
5828 Mask[I + Index] = I + VecVF;
5829 if (Generator) {
5830 Vec = Generator(Vec, V, Mask);
5831 } else {
5832 // 1. Resize V to the size of Vec.
5833 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
5834 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: SubVecVF), value: 0);
5835 V = Builder.CreateShuffleVector(V, Mask: ResizeMask);
5836 Vec = Builder.CreateShuffleVector(V1: Vec, V2: V, Mask);
5837 }
5838 }
5839 return Vec;
5840}
5841
5842 /// Correctly creates extract_subvector, checking that the index is a multiple
5843 /// of the subvector length. Otherwise, generates a shuffle to extract the
5844 /// subvector.
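/// For example (illustrative), extracting a 2-element subvector from an
/// 8-element vector at index 4 uses extract_subvector directly, while index 3
/// is lowered to a single-source shuffle with the mask {3, 4}.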
5845static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
5846 unsigned SubVecVF, unsigned Index) {
5847 if (Index % SubVecVF == 0) {
5848 VectorType *SubVecTy =
5849 getWidenedType(ScalarTy: Vec->getType()->getScalarType(), VF: SubVecVF);
5850 return Builder.CreateExtractVector(DstType: SubVecTy, SrcVec: Vec, Idx: Index);
5851 }
5852 // Create shuffle, extract_subvector requires that index is multiple of
5853 // the subvector length.
5854 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
5855 std::iota(first: Mask.begin(), last: Mask.end(), value: Index);
5856 return Builder.CreateShuffleVector(V: Vec, Mask);
5857}
5858
5859 /// Builds a compress-like shuffle mask for the given \p PointerOps, ordered
5860 /// with \p Order.
5861 /// \return true if the mask represents strided access, false otherwise.
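/// For example (illustrative), pointers at element distances {0, 2, 4, 6} from
/// the first (ordered) pointer produce CompressMask == {0, 2, 4, 6} and the
/// function returns true (stride 2), while distances {0, 1, 3, 7} produce
/// CompressMask == {0, 1, 3, 7} and the function returns false.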
5862static bool buildCompressMask(ArrayRef<Value *> PointerOps,
5863 ArrayRef<unsigned> Order, Type *ScalarTy,
5864 const DataLayout &DL, ScalarEvolution &SE,
5865 SmallVectorImpl<int> &CompressMask) {
5866 const unsigned Sz = PointerOps.size();
5867 CompressMask.assign(NumElts: Sz, Elt: PoisonMaskElem);
5868 // The first element is always set.
5869 CompressMask[0] = 0;
5870 // Check if the mask represents strided access.
5871 std::optional<unsigned> Stride = 0;
5872 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
5873 for (unsigned I : seq<unsigned>(Begin: 1, End: Sz)) {
5874 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
5875 std::optional<int64_t> OptPos =
5876 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL, SE);
5877 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
5878 return false;
5879 unsigned Pos = static_cast<unsigned>(*OptPos);
5880 CompressMask[I] = Pos;
5881 if (!Stride)
5882 continue;
5883 if (*Stride == 0) {
5884 *Stride = Pos;
5885 continue;
5886 }
5887 if (Pos != *Stride * I)
5888 Stride.reset();
5889 }
5890 return Stride.has_value();
5891}
5892
5893 /// Checks if the \p VL can be transformed into a (masked) load + compress or
5894/// (masked) interleaved load.
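/// For example (an illustrative sketch), four i32 loads at element offsets
/// {0, 2, 4, 6} may be emitted as one (masked) load of 7 consecutive elements
/// followed by a compressing shuffle with the mask {0, 2, 4, 6}, provided the
/// cost model prefers this over gathering the scalars.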
5895static bool isMaskedLoadCompress(
5896 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
5897 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
5898 const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
5899 const DominatorTree &DT, const TargetLibraryInfo &TLI,
5900 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
5901 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
5902 VectorType *&LoadVecTy) {
5903 InterleaveFactor = 0;
5904 Type *ScalarTy = VL.front()->getType();
5905 const size_t Sz = VL.size();
5906 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
5907 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5908 SmallVector<int> Mask;
5909 if (!Order.empty())
5910 inversePermutation(Indices: Order, Mask);
5911 // Check external uses.
5912 for (const auto [I, V] : enumerate(First&: VL)) {
5913 if (AreAllUsersVectorized(V))
5914 continue;
5915 InstructionCost ExtractCost =
5916 TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, CostKind,
5917 Index: Mask.empty() ? I : Mask[I]);
5918 InstructionCost ScalarCost =
5919 TTI.getInstructionCost(U: cast<Instruction>(Val: V), CostKind);
5920 if (ExtractCost <= ScalarCost)
5921 return false;
5922 }
5923 Value *Ptr0;
5924 Value *PtrN;
5925 if (Order.empty()) {
5926 Ptr0 = PointerOps.front();
5927 PtrN = PointerOps.back();
5928 } else {
5929 Ptr0 = PointerOps[Order.front()];
5930 PtrN = PointerOps[Order.back()];
5931 }
5932 std::optional<int64_t> Diff =
5933 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL, SE);
5934 if (!Diff)
5935 return false;
5936 const size_t MaxRegSize =
5937 TTI.getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
5938 .getFixedValue();
5939 // Check for very large distances between elements.
5940 if (*Diff / Sz >= MaxRegSize / 8)
5941 return false;
5942 LoadVecTy = getWidenedType(ScalarTy, VF: *Diff + 1);
5943 auto *LI = cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()]);
5944 Align CommonAlignment = LI->getAlign();
5945 IsMasked = !isSafeToLoadUnconditionally(
5946 V: Ptr0, Ty: LoadVecTy, Alignment: CommonAlignment, DL,
5947 ScanFrom: cast<LoadInst>(Val: Order.empty() ? VL.back() : VL[Order.back()]), AC: &AC, DT: &DT,
5948 TLI: &TLI);
5949 if (IsMasked && !TTI.isLegalMaskedLoad(DataType: LoadVecTy, Alignment: CommonAlignment,
5950 AddressSpace: LI->getPointerAddressSpace()))
5951 return false;
5952 // TODO: perform the analysis of each scalar load for better
5953 // safe-load-unconditionally analysis.
5954 bool IsStrided =
5955 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
5956 assert(CompressMask.size() >= 2 && "At least two elements are required");
5957 SmallVector<Value *> OrderedPointerOps(PointerOps);
5958 if (!Order.empty())
5959 reorderScalars(Scalars&: OrderedPointerOps, Mask);
5960 auto [ScalarGEPCost, VectorGEPCost] =
5961 getGEPCosts(TTI, Ptrs: OrderedPointerOps, BasePtr: OrderedPointerOps.front(),
5962 Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy: LoadVecTy);
5963 // The cost of scalar loads.
5964 InstructionCost ScalarLoadsCost =
5965 std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(),
5966 binary_op: [&](InstructionCost C, Value *V) {
5967 return C + TTI.getInstructionCost(U: cast<Instruction>(Val: V),
5968 CostKind);
5969 }) +
5970 ScalarGEPCost;
5971 APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
5972 InstructionCost GatherCost =
5973 getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
5974 /*Insert=*/true,
5975 /*Extract=*/false, CostKind) +
5976 ScalarLoadsCost;
5977 InstructionCost LoadCost = 0;
5978 if (IsMasked) {
5979 LoadCost =
5980 TTI.getMaskedMemoryOpCost(Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
5981 AddressSpace: LI->getPointerAddressSpace(), CostKind);
5982 } else {
5983 LoadCost =
5984 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
5985 AddressSpace: LI->getPointerAddressSpace(), CostKind);
5986 }
5987 if (IsStrided && !IsMasked && Order.empty()) {
5988 // Check for potential segmented (interleaved) loads.
5989 VectorType *AlignedLoadVecTy = getWidenedType(
5990 ScalarTy, VF: getFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: *Diff + 1));
5991 if (!isSafeToLoadUnconditionally(V: Ptr0, Ty: AlignedLoadVecTy, Alignment: CommonAlignment,
5992 DL, ScanFrom: cast<LoadInst>(Val: VL.back()), AC: &AC, DT: &DT,
5993 TLI: &TLI))
5994 AlignedLoadVecTy = LoadVecTy;
5995 if (TTI.isLegalInterleavedAccessType(VTy: AlignedLoadVecTy, Factor: CompressMask[1],
5996 Alignment: CommonAlignment,
5997 AddrSpace: LI->getPointerAddressSpace())) {
5998 InstructionCost InterleavedCost =
5999 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
6000 Opcode: Instruction::Load, VecTy: AlignedLoadVecTy,
6001 Factor: CompressMask[1], Indices: {}, Alignment: CommonAlignment,
6002 AddressSpace: LI->getPointerAddressSpace(), CostKind, UseMaskForCond: IsMasked);
6003 if (InterleavedCost < GatherCost) {
6004 InterleaveFactor = CompressMask[1];
6005 LoadVecTy = AlignedLoadVecTy;
6006 return true;
6007 }
6008 }
6009 }
6010 InstructionCost CompressCost = ::getShuffleCost(
6011 TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: LoadVecTy, Mask: CompressMask, CostKind);
6012 if (!Order.empty()) {
6013 SmallVector<int> NewMask(Sz, PoisonMaskElem);
6014 for (unsigned I : seq<unsigned>(Size: Sz)) {
6015 NewMask[I] = CompressMask[Mask[I]];
6016 }
6017 CompressMask.swap(RHS&: NewMask);
6018 }
6019 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
6020 return TotalVecCost < GatherCost;
6021}
6022
6023 /// Checks if the \p VL can be transformed into a (masked) load + compress or
6024/// (masked) interleaved load.
6025static bool
6026isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6027 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6028 const DataLayout &DL, ScalarEvolution &SE,
6029 AssumptionCache &AC, const DominatorTree &DT,
6030 const TargetLibraryInfo &TLI,
6031 const function_ref<bool(Value *)> AreAllUsersVectorized) {
6032 bool IsMasked;
6033 unsigned InterleaveFactor;
6034 SmallVector<int> CompressMask;
6035 VectorType *LoadVecTy;
6036 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
6037 AreAllUsersVectorized, IsMasked, InterleaveFactor,
6038 CompressMask, LoadVecTy);
6039}
6040
6041/// Checks if strided loads can be generated out of \p VL loads with pointers \p
6042/// PointerOps:
6043/// 1. Target with strided load support is detected.
6044/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
6045/// potential stride <= MaxProfitableLoadStride and the potential stride is
6046/// power-of-2 (to avoid perf regressions for the very small number of loads)
6047/// and max distance > number of loads, or potential stride is -1.
6048/// 3. The loads are ordered, or number of unordered loads <=
6049/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
6050/// to avoid extra costs for very expensive shuffles).
6051 /// 4. Any pointer operand is an instruction with users outside of the
6052 /// current graph (for masked gathers, extra extractelement instructions
6053/// might be required).
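/// For example (an illustrative sketch), eight loads whose distance between
/// the first and the last sorted pointer is 56 elements have a potential
/// stride of 56 / (8 - 1) == 8 elements; the candidate is kept only if every
/// pointer lies at a distinct multiple of that stride, the target supports
/// strided load/store for the widened type, and one of the profitability
/// conditions above holds.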
6054static bool isStridedLoad(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6055 ArrayRef<unsigned> Order,
6056 const TargetTransformInfo &TTI, const DataLayout &DL,
6057 ScalarEvolution &SE,
6058 const bool IsAnyPointerUsedOutGraph,
6059 const int64_t Diff) {
6060 const size_t Sz = VL.size();
6061 const uint64_t AbsoluteDiff = std::abs(i: Diff);
6062 Type *ScalarTy = VL.front()->getType();
6063 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
6064 if (IsAnyPointerUsedOutGraph ||
6065 (AbsoluteDiff > Sz &&
6066 (Sz > MinProfitableStridedLoads ||
6067 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
6068 AbsoluteDiff % Sz == 0 && has_single_bit(Value: AbsoluteDiff / Sz)))) ||
6069 Diff == -(static_cast<int64_t>(Sz) - 1)) {
6070 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
6071 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
6072 return false;
6073 Align Alignment =
6074 cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()])
6075 ->getAlign();
6076 if (!TTI.isLegalStridedLoadStore(DataType: VecTy, Alignment))
6077 return false;
6078 Value *Ptr0;
6079 Value *PtrN;
6080 if (Order.empty()) {
6081 Ptr0 = PointerOps.front();
6082 PtrN = PointerOps.back();
6083 } else {
6084 Ptr0 = PointerOps[Order.front()];
6085 PtrN = PointerOps[Order.back()];
6086 }
6087 // Iterate through all pointers and check if all distances are
6088 // unique multiples of Stride.
6089 SmallSet<int64_t, 4> Dists;
6090 for (Value *Ptr : PointerOps) {
6091 int64_t Dist = 0;
6092 if (Ptr == PtrN)
6093 Dist = Diff;
6094 else if (Ptr != Ptr0)
6095 Dist = *getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL, SE);
6096 // If the strides are not the same or repeated, we can't
6097 // vectorize.
6098 if (((Dist / Stride) * Stride) != Dist || !Dists.insert(V: Dist).second)
6099 break;
6100 }
6101 if (Dists.size() == Sz)
6102 return true;
6103 }
6104 return false;
6105}
6106
6107BoUpSLP::LoadsState
6108BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
6109 SmallVectorImpl<unsigned> &Order,
6110 SmallVectorImpl<Value *> &PointerOps,
6111 unsigned *BestVF, bool TryRecursiveCheck) const {
6112 // Check that a vectorized load would load the same memory as a scalar
6113 // load. For example, we don't want to vectorize loads that are smaller
6114 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6115 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6116 // from such a struct, we read/write packed bits disagreeing with the
6117 // unvectorized version.
6118 if (BestVF)
6119 *BestVF = 0;
6120 if (areKnownNonVectorizableLoads(VL))
6121 return LoadsState::Gather;
6122 Type *ScalarTy = VL0->getType();
6123
6124 if (DL->getTypeSizeInBits(Ty: ScalarTy) != DL->getTypeAllocSizeInBits(Ty: ScalarTy))
6125 return LoadsState::Gather;
6126
6127 // Make sure all loads in the bundle are simple - we can't vectorize
6128 // atomic or volatile loads.
6129 PointerOps.clear();
6130 const size_t Sz = VL.size();
6131 PointerOps.resize(N: Sz);
6132 auto *POIter = PointerOps.begin();
6133 for (Value *V : VL) {
6134 auto *L = dyn_cast<LoadInst>(Val: V);
6135 if (!L || !L->isSimple())
6136 return LoadsState::Gather;
6137 *POIter = L->getPointerOperand();
6138 ++POIter;
6139 }
6140
6141 Order.clear();
6142 // Check the order of pointer operands or that all pointers are the same.
6143 bool IsSorted = sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order);
6144
6145 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
6146 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
6147 if (!IsSorted) {
6148 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(Ty: VecTy)) {
6149 if (TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment) &&
6150 calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
6151 return LoadsState::StridedVectorize;
6152 }
6153
6154 if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) ||
6155 TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment))
6156 return LoadsState::Gather;
6157
6158 if (!all_of(Range&: PointerOps, P: [&](Value *P) {
6159 return arePointersCompatible(Ptr1: P, Ptr2: PointerOps.front(), TLI: *TLI);
6160 }))
6161 return LoadsState::Gather;
6162
6163 } else {
6164 Value *Ptr0;
6165 Value *PtrN;
6166 if (Order.empty()) {
6167 Ptr0 = PointerOps.front();
6168 PtrN = PointerOps.back();
6169 } else {
6170 Ptr0 = PointerOps[Order.front()];
6171 PtrN = PointerOps[Order.back()];
6172 }
6173 std::optional<int64_t> Diff =
6174 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
6175 // Check that the sorted loads are consecutive.
6176 if (static_cast<uint64_t>(*Diff) == Sz - 1)
6177 return LoadsState::Vectorize;
6178 if (isMaskedLoadCompress(VL, PointerOps, Order, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT,
6179 TLI: *TLI, AreAllUsersVectorized: [&](Value *V) {
6180 return areAllUsersVectorized(
6181 I: cast<Instruction>(Val: V), VectorizedVals: UserIgnoreList);
6182 }))
6183 return LoadsState::CompressVectorize;
6184 // Simple check if not a strided access - clear order.
6185 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
6186 // Try to generate strided load node.
6187 auto IsAnyPointerUsedOutGraph =
6188 IsPossibleStrided && any_of(Range&: PointerOps, P: [&](Value *V) {
6189 return isa<Instruction>(Val: V) && any_of(Range: V->users(), P: [&](User *U) {
6190 return !isVectorized(V: U) && !MustGather.contains(Ptr: U);
6191 });
6192 });
6193 if (IsPossibleStrided &&
6194 isStridedLoad(VL, PointerOps, Order, TTI: *TTI, DL: *DL, SE&: *SE,
6195 IsAnyPointerUsedOutGraph, Diff: *Diff))
6196 return LoadsState::StridedVectorize;
6197 }
6198 if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) ||
6199 TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment))
6200 return LoadsState::Gather;
6201 // Compare the cost of loads + shuffles against the cost of
6202 // strided/masked gather loads. Returns true if the vectorized + shuffles
6203 // representation is better than just gather.
6204 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
6205 unsigned *BestVF,
6206 bool ProfitableGatherPointers) {
6207 if (BestVF)
6208 *BestVF = 0;
6209 // Compare masked gather cost and loads + insert subvector costs.
6210 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6211 auto [ScalarGEPCost, VectorGEPCost] =
6212 getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: PointerOps.front(),
6213 Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
6214 // Estimate the cost of masked gather GEP. If not a splat, roughly
6215 // estimate as a buildvector, otherwise estimate as splat.
6216 APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
6217 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
6218 VectorType *PtrVecTy = getWidenedType(ScalarTy: PtrScalarTy, VF: Sz);
6219 if (static_cast<unsigned>(count_if(
6220 Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
6221 any_of(Range&: PointerOps, P: [&](Value *V) {
6222 return getUnderlyingObject(V) !=
6223 getUnderlyingObject(V: PointerOps.front());
6224 }))
6225 VectorGEPCost += getScalarizationOverhead(TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy,
6226 DemandedElts, /*Insert=*/true,
6227 /*Extract=*/false, CostKind);
6228 else
6229 VectorGEPCost +=
6230 getScalarizationOverhead(
6231 TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy, DemandedElts: APInt::getOneBitSet(numBits: Sz, BitNo: 0),
6232 /*Insert=*/true, /*Extract=*/false, CostKind) +
6233 ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: PtrVecTy, Mask: {}, CostKind);
6234 // The cost of scalar loads.
6235 InstructionCost ScalarLoadsCost =
6236 std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(),
6237 binary_op: [&](InstructionCost C, Value *V) {
6238 return C + TTI.getInstructionCost(
6239 U: cast<Instruction>(Val: V), CostKind);
6240 }) +
6241 ScalarGEPCost;
6242 // The cost of masked gather.
6243 InstructionCost MaskedGatherCost =
6244 TTI.getGatherScatterOpCost(
6245 Opcode: Instruction::Load, DataTy: VecTy, Ptr: cast<LoadInst>(Val: VL0)->getPointerOperand(),
6246 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind) +
6247 (ProfitableGatherPointers ? 0 : VectorGEPCost);
6248 InstructionCost GatherCost =
6249 getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
6250 /*Insert=*/true,
6251 /*Extract=*/false, CostKind) +
6252 ScalarLoadsCost;
6253 // The list of loads is small, or a partial check was already performed -
6254 // directly compare the masked gather cost and the gather cost.
6255 constexpr unsigned ListLimit = 4;
6256 if (!TryRecursiveCheck || VL.size() < ListLimit)
6257 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
6258
6259 // FIXME: The following code has not been updated for non-power-of-2
6260 // vectors (and not whole registers). The splitting logic here does not
6261 // cover the original vector if the vector factor is not a power of two.
6262 if (!hasFullVectorsOrPowerOf2(TTI, Ty: ScalarTy, Sz: VL.size()))
6263 return false;
6264
6265 unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
6266 unsigned MinVF = getMinVF(Sz: 2 * Sz);
6267 DemandedElts.clearAllBits();
6268 // Iterate through possible vectorization factors and check if vectorized +
6269 // shuffles is better than just gather.
6270 for (unsigned VF =
6271 getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VL.size() - 1);
6272 VF >= MinVF;
6273 VF = getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VF - 1)) {
6274 SmallVector<LoadsState> States;
6275 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
6276 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
6277 SmallVector<unsigned> Order;
6278 SmallVector<Value *> PointerOps;
6279 LoadsState LS =
6280 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps, BestVF,
6281 /*TryRecursiveCheck=*/false);
6282 // Check that the sorted loads are consecutive.
6283 if (LS == LoadsState::Gather) {
6284 if (BestVF) {
6285 DemandedElts.setAllBits();
6286 break;
6287 }
6288 DemandedElts.setBits(loBit: Cnt, hiBit: Cnt + VF);
6289 continue;
6290 }
6291 // If reordering is needed - consider it as a high-cost masked gather for now.
6292 if ((LS == LoadsState::Vectorize ||
6293 LS == LoadsState::StridedVectorize ||
6294 LS == LoadsState::CompressVectorize) &&
6295 !Order.empty() && !isReverseOrder(Order))
6296 LS = LoadsState::ScatterVectorize;
6297 States.push_back(Elt: LS);
6298 }
6299 if (DemandedElts.isAllOnes())
6300 // All loads gathered - try smaller VF.
6301 continue;
6302 // Can be vectorized later as a series of loads/insertelements.
6303 InstructionCost VecLdCost = 0;
6304 if (!DemandedElts.isZero()) {
6305 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
6306 /*Insert=*/true,
6307 /*Extract=*/false, CostKind) +
6308 ScalarGEPCost;
6309 for (unsigned Idx : seq<unsigned>(Size: VL.size()))
6310 if (DemandedElts[Idx])
6311 VecLdCost +=
6312 TTI.getInstructionCost(U: cast<Instruction>(Val: VL[Idx]), CostKind);
6313 }
6314 auto *SubVecTy = getWidenedType(ScalarTy, VF);
6315 for (auto [I, LS] : enumerate(First&: States)) {
6316 auto *LI0 = cast<LoadInst>(Val: VL[I * VF]);
6317 InstructionCost VectorGEPCost =
6318 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
6319 ? 0
6320 : getGEPCosts(TTI, Ptrs: ArrayRef(PointerOps).slice(N: I * VF, M: VF),
6321 BasePtr: LI0->getPointerOperand(),
6322 Opcode: Instruction::GetElementPtr, CostKind, ScalarTy,
6323 VecTy: SubVecTy)
6324 .second;
6325 if (LS == LoadsState::ScatterVectorize) {
6326 if (static_cast<unsigned>(
6327 count_if(Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) <
6328 PointerOps.size() - 1 ||
6329 any_of(Range&: PointerOps, P: [&](Value *V) {
6330 return getUnderlyingObject(V) !=
6331 getUnderlyingObject(V: PointerOps.front());
6332 }))
6333 VectorGEPCost += getScalarizationOverhead(
6334 TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getAllOnes(numBits: VF),
6335 /*Insert=*/true, /*Extract=*/false, CostKind);
6336 else
6337 VectorGEPCost +=
6338 getScalarizationOverhead(
6339 TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getOneBitSet(numBits: VF, BitNo: 0),
6340 /*Insert=*/true, /*Extract=*/false, CostKind) +
6341 ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: SubVecTy, Mask: {},
6342 CostKind);
6343 }
6344 switch (LS) {
6345 case LoadsState::Vectorize:
6346 VecLdCost +=
6347 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: SubVecTy, Alignment: LI0->getAlign(),
6348 AddressSpace: LI0->getPointerAddressSpace(), CostKind,
6349 OpdInfo: TTI::OperandValueInfo()) +
6350 VectorGEPCost;
6351 break;
6352 case LoadsState::StridedVectorize:
6353 VecLdCost += TTI.getStridedMemoryOpCost(Opcode: Instruction::Load, DataTy: SubVecTy,
6354 Ptr: LI0->getPointerOperand(),
6355 /*VariableMask=*/false,
6356 Alignment: CommonAlignment, CostKind) +
6357 VectorGEPCost;
6358 break;
6359 case LoadsState::CompressVectorize:
6360 VecLdCost += TTI.getMaskedMemoryOpCost(
6361 Opcode: Instruction::Load, Src: SubVecTy, Alignment: CommonAlignment,
6362 AddressSpace: LI0->getPointerAddressSpace(), CostKind) +
6363 VectorGEPCost +
6364 ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: SubVecTy,
6365 Mask: {}, CostKind);
6366 break;
6367 case LoadsState::ScatterVectorize:
6368 VecLdCost += TTI.getGatherScatterOpCost(Opcode: Instruction::Load, DataTy: SubVecTy,
6369 Ptr: LI0->getPointerOperand(),
6370 /*VariableMask=*/false,
6371 Alignment: CommonAlignment, CostKind) +
6372 VectorGEPCost;
6373 break;
6374 case LoadsState::Gather:
6375 // Gathers are already calculated - ignore.
6376 continue;
6377 }
6378 SmallVector<int> ShuffleMask(VL.size());
6379 for (int Idx : seq<int>(Begin: 0, End: VL.size()))
6380 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
6381 if (I > 0)
6382 VecLdCost +=
6383 ::getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: ShuffleMask,
6384 CostKind, Index: I * VF, SubTp: SubVecTy);
6385 }
6386 // If the masked gather cost is higher, it is better to vectorize, so
6387 // consider it as a gather node. It will be better estimated
6388 // later.
6389 if (MaskedGatherCost >= VecLdCost &&
6390 VecLdCost - GatherCost < -SLPCostThreshold) {
6391 if (BestVF)
6392 *BestVF = VF;
6393 return true;
6394 }
6395 }
6396 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
6397 };
6398 // TODO: need to improve analysis of the pointers; if not all of them are
6399 // GEPs or they have > 2 operands, we end up with a gather node, which just
6400 // increases the cost.
6401 Loop *L = LI->getLoopFor(BB: cast<LoadInst>(Val: VL0)->getParent());
6402 bool ProfitableGatherPointers =
6403 L && Sz > 2 && static_cast<unsigned>(count_if(Range&: PointerOps, P: [L](Value *V) {
6404 return L->isLoopInvariant(V);
6405 })) <= Sz / 2;
6406 if (ProfitableGatherPointers || all_of(Range&: PointerOps, P: [](Value *P) {
6407 auto *GEP = dyn_cast<GetElementPtrInst>(Val: P);
6408 return (!GEP && doesNotNeedToBeScheduled(V: P)) ||
6409 (GEP && GEP->getNumOperands() == 2 &&
6410 isa<Constant, Instruction>(Val: GEP->getOperand(i_nocapture: 1)));
6411 })) {
6412 // Check if a potential masked gather can be represented as a series
6413 // of loads + insertsubvectors.
6414 // If the masked gather cost is higher, it is better to vectorize, so
6415 // consider it as a gather node. It will be better estimated
6416 // later.
6417 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
6418 ProfitableGatherPointers))
6419 return LoadsState::ScatterVectorize;
6420 }
6421
6422 return LoadsState::Gather;
6423}
6424
6425static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
6426 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
6427 const DataLayout &DL, ScalarEvolution &SE,
6428 SmallVectorImpl<unsigned> &SortedIndices) {
6429 assert(
6430 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
6431 "Expected list of pointer operands.");
6432 // Map each base to vectors of (Ptr, Offset, OrigIdx) tuples. Insert each
6433 // Ptr into the vector for its base, sort by offset, and return the sorted
6434 // indices so that pointers with the same base end up next to one another.
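// For example (illustrative), for pointers {A, B, A+1, B+1, A+2, B+2, A+3,
// B+3} with two distinct bases A and B in the same basic block, the returned
// indices are {0, 2, 4, 6, 1, 3, 5, 7}: all A-based pointers first, then all
// B-based ones, each group sorted by offset.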
6435 SmallMapVector<
6436 std::pair<BasicBlock *, Value *>,
6437 SmallVector<SmallVector<std::tuple<Value *, int64_t, unsigned>>>, 8>
6438 Bases;
6439 Bases
6440 .try_emplace(Key: std::make_pair(
6441 x: BBs.front(), y: getUnderlyingObject(V: VL.front(), MaxLookup: RecursionMaxDepth)))
6442 .first->second.emplace_back().emplace_back(Args: VL.front(), Args: 0U, Args: 0U);
6443
6444 SortedIndices.clear();
6445 for (auto [Cnt, Ptr] : enumerate(First: VL.drop_front())) {
6446 auto Key = std::make_pair(x: BBs[Cnt + 1],
6447 y: getUnderlyingObject(V: Ptr, MaxLookup: RecursionMaxDepth));
6448 bool Found = any_of(Range&: Bases.try_emplace(Key).first->second,
6449 P: [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
6450 std::optional<int64_t> Diff =
6451 getPointersDiff(ElemTy, std::get<0>(Base.front()),
6452 ElemTy, Ptr, DL, SE,
6453 /*StrictCheck=*/true);
6454 if (!Diff)
6455 return false;
6456
6457 Base.emplace_back(Ptr, *Diff, Cnt + 1);
6458 return true;
6459 });
6460
6461 if (!Found) {
6462 // If we haven't found enough to usefully cluster, return early.
6463 if (Bases.size() > VL.size() / 2 - 1)
6464 return false;
6465
6466 // Not found already - add a new Base
6467 Bases.find(Key)->second.emplace_back().emplace_back(Args: Ptr, Args: 0, Args: Cnt + 1);
6468 }
6469 }
6470
6471 if (Bases.size() == VL.size())
6472 return false;
6473
6474 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
6475 Bases.front().second.size() == VL.size()))
6476 return false;
6477
6478 // For each of the bases, sort the pointers by Offset and check if any of
6479 // the bases contain consecutively allocated pointers.
6480 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
6481 SmallPtrSet<Value *, 13> FirstPointers;
6482 SmallPtrSet<Value *, 13> SecondPointers;
6483 Value *P1 = Ptr1;
6484 Value *P2 = Ptr2;
6485 unsigned Depth = 0;
6486 while (!FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1)) {
6487 if (P1 == P2 || Depth > RecursionMaxDepth)
6488 return false;
6489 FirstPointers.insert(Ptr: P1);
6490 SecondPointers.insert(Ptr: P2);
6491 P1 = getUnderlyingObject(V: P1, /*MaxLookup=*/1);
6492 P2 = getUnderlyingObject(V: P2, /*MaxLookup=*/1);
6493 ++Depth;
6494 }
6495 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
6496 "Unable to find matching root.");
6497 return FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1);
6498 };
6499 for (auto &Base : Bases) {
6500 for (auto &Vec : Base.second) {
6501 if (Vec.size() > 1) {
6502 stable_sort(Range&: Vec, C: llvm::less_second());
6503 int64_t InitialOffset = std::get<1>(t&: Vec[0]);
6504 bool AnyConsecutive =
6505 all_of(Range: enumerate(First&: Vec), P: [InitialOffset](const auto &P) {
6506 return std::get<1>(P.value()) ==
6507 int64_t(P.index()) + InitialOffset;
6508 });
6509 // Fill the SortedIndices array only if it looks worthwhile to sort the
6510 // pointers.
6511 if (!AnyConsecutive)
6512 return false;
6513 }
6514 }
6515 stable_sort(Range&: Base.second, C: [&](const auto &V1, const auto &V2) {
6516 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
6517 });
6518 }
6519
6520 for (auto &T : Bases)
6521 for (const auto &Vec : T.second)
6522 for (const auto &P : Vec)
6523 SortedIndices.push_back(Elt: std::get<2>(t: P));
6524
6525 assert(SortedIndices.size() == VL.size() &&
6526 "Expected SortedIndices to be the size of VL");
6527 return true;
6528}
6529
6530std::optional<BoUpSLP::OrdersType>
6531BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
6532 assert(TE.isGather() && "Expected gather node only.");
6533 Type *ScalarTy = TE.Scalars[0]->getType();
6534
6535 SmallVector<Value *> Ptrs;
6536 Ptrs.reserve(N: TE.Scalars.size());
6537 SmallVector<BasicBlock *> BBs;
6538 BBs.reserve(N: TE.Scalars.size());
6539 for (Value *V : TE.Scalars) {
6540 auto *L = dyn_cast<LoadInst>(Val: V);
6541 if (!L || !L->isSimple())
6542 return std::nullopt;
6543 Ptrs.push_back(Elt: L->getPointerOperand());
6544 BBs.push_back(Elt: L->getParent());
6545 }
6546
6547 BoUpSLP::OrdersType Order;
6548 if (!LoadEntriesToVectorize.contains(key: TE.Idx) &&
6549 clusterSortPtrAccesses(VL: Ptrs, BBs, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
6550 return std::move(Order);
6551 return std::nullopt;
6552}
6553
6554/// Check if two insertelement instructions are from the same buildvector.
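/// For example (illustrative), in the chain
///   %v0 = insertelement <2 x float> poison, float %a, i32 0
///   %v1 = insertelement <2 x float> %v0, float %b, i32 1
/// both instructions belong to the same buildvector sequence, assuming
/// GetBaseOperand returns the vector operand (operand 0) of each insert.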
6555static bool areTwoInsertFromSameBuildVector(
6556 InsertElementInst *VU, InsertElementInst *V,
6557 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
6558 // Instructions must be from the same basic blocks.
6559 if (VU->getParent() != V->getParent())
6560 return false;
6561 // Checks if 2 insertelements are from the same buildvector.
6562 if (VU->getType() != V->getType())
6563 return false;
6564 // Multiple used inserts are separate nodes.
6565 if (!VU->hasOneUse() && !V->hasOneUse())
6566 return false;
6567 auto *IE1 = VU;
6568 auto *IE2 = V;
6569 std::optional<unsigned> Idx1 = getElementIndex(Inst: IE1);
6570 std::optional<unsigned> Idx2 = getElementIndex(Inst: IE2);
6571 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
6572 return false;
6573 // Go through the vector operand of insertelement instructions trying to find
6574 // either VU as the original vector for IE2 or V as the original vector for
6575 // IE1.
6576 SmallBitVector ReusedIdx(
6577 cast<VectorType>(Val: VU->getType())->getElementCount().getKnownMinValue());
6578 bool IsReusedIdx = false;
6579 do {
6580 if (IE2 == VU && !IE1)
6581 return VU->hasOneUse();
6582 if (IE1 == V && !IE2)
6583 return V->hasOneUse();
6584 if (IE1 && IE1 != V) {
6585 unsigned Idx1 = getElementIndex(Inst: IE1).value_or(u&: *Idx2);
6586 IsReusedIdx |= ReusedIdx.test(Idx: Idx1);
6587 ReusedIdx.set(Idx1);
6588 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
6589 IE1 = nullptr;
6590 else
6591 IE1 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE1));
6592 }
6593 if (IE2 && IE2 != VU) {
6594 unsigned Idx2 = getElementIndex(Inst: IE2).value_or(u&: *Idx1);
6595 IsReusedIdx |= ReusedIdx.test(Idx: Idx2);
6596 ReusedIdx.set(Idx2);
6597 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
6598 IE2 = nullptr;
6599 else
6600 IE2 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE2));
6601 }
6602 } while (!IsReusedIdx && (IE1 || IE2));
6603 return false;
6604}
6605
6606/// Checks if the specified instruction \p I is an alternate operation for
6607/// the given \p MainOp and \p AltOp instructions.
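/// For example (illustrative), in an alternating add/sub node with
/// MainOp == add and AltOp == sub, every sub instruction in the bundle is an
/// alternate operation and every add is a main operation.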
6608static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
6609 Instruction *AltOp,
6610 const TargetLibraryInfo &TLI);
6611
6612std::optional<BoUpSLP::OrdersType>
6613BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
6614 bool IgnoreReorder) {
6615 // No need to reorder if we need to shuffle reuses; the node still has to
6616 // be shuffled.
6617 if (!TE.ReuseShuffleIndices.empty()) {
6618 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
6619 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
6620 "Reshuffling scalars not yet supported for nodes with padding");
6621
6622 if (isSplat(VL: TE.Scalars))
6623 return std::nullopt;
6624 // Check if reuse shuffle indices can be improved by reordering.
6625 // For this, check that the reuse mask is "clustered", i.e. each scalar value
6626 // is used once in each submask of size <number_of_scalars>.
6627 // Example: 4 scalar values.
6628 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
6629 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
6630 // element 3 is used twice in the second submask.
6631 unsigned Sz = TE.Scalars.size();
6632 if (TE.isGather()) {
6633 if (std::optional<OrdersType> CurrentOrder =
6634 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
6635 SmallVector<int> Mask;
6636 fixupOrderingIndices(Order: *CurrentOrder);
6637 inversePermutation(Indices: *CurrentOrder, Mask);
6638 ::addMask(Mask, SubMask: TE.ReuseShuffleIndices);
6639 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
6640 unsigned Sz = TE.Scalars.size();
6641 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
6642 for (auto [I, Idx] : enumerate(First: ArrayRef(Mask).slice(N: K * Sz, M: Sz)))
6643 if (Idx != PoisonMaskElem)
6644 Res[Idx + K * Sz] = I + K * Sz;
6645 }
6646 return std::move(Res);
6647 }
6648 }
6649 if (Sz == 2 && TE.getVectorFactor() == 4 &&
6650 ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: TE.Scalars.front()->getType(),
6651 VF: 2 * TE.getVectorFactor())) == 1)
6652 return std::nullopt;
6653 if (TE.ReuseShuffleIndices.size() % Sz != 0)
6654 return std::nullopt;
6655 if (!ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
6656 VF: Sz)) {
6657 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
6658 if (TE.ReorderIndices.empty())
6659 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
6660 else
6661 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
6662 ::addMask(Mask&: ReorderMask, SubMask: TE.ReuseShuffleIndices);
6663 unsigned VF = ReorderMask.size();
6664 OrdersType ResOrder(VF, VF);
6665 unsigned NumParts = divideCeil(Numerator: VF, Denominator: Sz);
6666 SmallBitVector UsedVals(NumParts);
6667 for (unsigned I = 0; I < VF; I += Sz) {
6668 int Val = PoisonMaskElem;
6669 unsigned UndefCnt = 0;
6670 unsigned Limit = std::min(a: Sz, b: VF - I);
6671 if (any_of(Range: ArrayRef(ReorderMask).slice(N: I, M: Limit),
6672 P: [&](int Idx) {
6673 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
6674 Val = Idx;
6675 if (Idx == PoisonMaskElem)
6676 ++UndefCnt;
6677 return Idx != PoisonMaskElem && Idx != Val;
6678 }) ||
6679 Val >= static_cast<int>(NumParts) || UsedVals.test(Idx: Val) ||
6680 UndefCnt > Sz / 2)
6681 return std::nullopt;
6682 UsedVals.set(Val);
6683 for (unsigned K = 0; K < NumParts; ++K) {
6684 unsigned Idx = Val + Sz * K;
6685 if (Idx < VF && I + K < VF)
6686 ResOrder[Idx] = I + K;
6687 }
6688 }
6689 return std::move(ResOrder);
6690 }
6691 unsigned VF = TE.getVectorFactor();
6692 // Try to build the correct order for extractelement instructions.
6693 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
6694 TE.ReuseShuffleIndices.end());
6695 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
6696 all_of(Range: TE.Scalars, P: [Sz](Value *V) {
6697 if (isa<PoisonValue>(Val: V))
6698 return true;
6699 std::optional<unsigned> Idx = getExtractIndex(E: cast<Instruction>(Val: V));
6700 return Idx && *Idx < Sz;
6701 })) {
6702 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
6703 "by BinaryOperator and CastInst.");
6704 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
6705 if (TE.ReorderIndices.empty())
6706 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
6707 else
6708 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
6709 for (unsigned I = 0; I < VF; ++I) {
6710 int &Idx = ReusedMask[I];
6711 if (Idx == PoisonMaskElem)
6712 continue;
6713 Value *V = TE.Scalars[ReorderMask[Idx]];
6714 std::optional<unsigned> EI = getExtractIndex(E: cast<Instruction>(Val: V));
6715 Idx = std::distance(first: ReorderMask.begin(), last: find(Range&: ReorderMask, Val: *EI));
6716 }
6717 }
6718 // Build the order of VF size; reuse shuffles need to be reordered, as they
6719 // are always of VF size.
6720 OrdersType ResOrder(VF);
6721 std::iota(first: ResOrder.begin(), last: ResOrder.end(), value: 0);
6722 auto *It = ResOrder.begin();
6723 for (unsigned K = 0; K < VF; K += Sz) {
6724 OrdersType CurrentOrder(TE.ReorderIndices);
6725 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(N: K, M: Sz)};
6726 if (SubMask.front() == PoisonMaskElem)
6727 std::iota(first: SubMask.begin(), last: SubMask.end(), value: 0);
6728 reorderOrder(Order&: CurrentOrder, Mask: SubMask);
6729 transform(Range&: CurrentOrder, d_first: It, F: [K](unsigned Pos) { return Pos + K; });
6730 std::advance(i&: It, n: Sz);
6731 }
6732 if (TE.isGather() && all_of(Range: enumerate(First&: ResOrder), P: [](const auto &Data) {
6733 return Data.index() == Data.value();
6734 }))
6735 return std::nullopt; // No need to reorder.
6736 return std::move(ResOrder);
6737 }
6738 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
6739 (!TE.UserTreeIndex ||
6740 !Instruction::isBinaryOp(Opcode: TE.UserTreeIndex.UserTE->getOpcode())) &&
6741 (TE.ReorderIndices.empty() || isReverseOrder(Order: TE.ReorderIndices)))
6742 return std::nullopt;
6743 if (TE.State == TreeEntry::SplitVectorize ||
6744 ((TE.State == TreeEntry::Vectorize ||
6745 TE.State == TreeEntry::StridedVectorize ||
6746 TE.State == TreeEntry::CompressVectorize) &&
6747 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: TE.getMainOp()) ||
6748 (TopToBottom && isa<StoreInst, InsertElementInst>(Val: TE.getMainOp()))))) {
6749 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
6750 "Alternate instructions are only supported by "
6751 "BinaryOperator and CastInst.");
6752 return TE.ReorderIndices;
6753 }
6754 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
6755 TE.isAltShuffle()) {
6756 assert(TE.ReuseShuffleIndices.empty() &&
6757 "ReuseShuffleIndices should be "
6758 "empty for alternate instructions.");
6759 SmallVector<int> Mask;
6760 TE.buildAltOpShuffleMask(
6761 IsAltOp: [&](Instruction *I) {
6762 assert(TE.getMatchingMainOpOrAltOp(I) &&
6763 "Unexpected main/alternate opcode");
6764 return isAlternateInstruction(I, MainOp: TE.getMainOp(), AltOp: TE.getAltOp(), TLI: *TLI);
6765 },
6766 Mask);
6767 const int VF = TE.getVectorFactor();
6768 OrdersType ResOrder(VF, VF);
6769 for (unsigned I : seq<unsigned>(Size: VF)) {
6770 if (Mask[I] == PoisonMaskElem)
6771 continue;
6772 ResOrder[Mask[I] % VF] = I;
6773 }
6774 return std::move(ResOrder);
6775 }
6776 if (!TE.ReorderIndices.empty())
6777 return TE.ReorderIndices;
6778 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
6779 if (!TE.ReorderIndices.empty())
6780 return TE.ReorderIndices;
6781
6782 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
6783 for (auto [I, V] : zip(t&: UserBVHead, u: TE.Scalars)) {
6784 if (isa<Constant>(Val: V) || !V->hasNUsesOrMore(N: 1))
6785 continue;
6786 auto *II = dyn_cast<InsertElementInst>(Val: *V->user_begin());
6787 if (!II)
6788 continue;
6789 Instruction *BVHead = nullptr;
6790 BasicBlock *BB = II->getParent();
6791 while (II && II->hasOneUse() && II->getParent() == BB) {
6792 BVHead = II;
6793 II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
6794 }
6795 I = BVHead;
6796 }
6797
6798 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
6799 assert(BB1 != BB2 && "Expected different basic blocks.");
6800 if (!DT->isReachableFromEntry(A: BB1))
6801 return false;
6802 if (!DT->isReachableFromEntry(A: BB2))
6803 return true;
6804 auto *NodeA = DT->getNode(BB: BB1);
6805 auto *NodeB = DT->getNode(BB: BB2);
6806 assert(NodeA && "Should only process reachable instructions");
6807 assert(NodeB && "Should only process reachable instructions");
6808 assert((NodeA == NodeB) ==
6809 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
6810 "Different nodes should have different DFS numbers");
6811 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
6812 };
6813 auto PHICompare = [&](unsigned I1, unsigned I2) {
6814 Value *V1 = TE.Scalars[I1];
6815 Value *V2 = TE.Scalars[I2];
6816 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
6817 return false;
6818 if (isa<PoisonValue>(Val: V1))
6819 return true;
6820 if (isa<PoisonValue>(Val: V2))
6821 return false;
6822 if (V1->getNumUses() < V2->getNumUses())
6823 return true;
6824 if (V1->getNumUses() > V2->getNumUses())
6825 return false;
6826 auto *FirstUserOfPhi1 = cast<Instruction>(Val: *V1->user_begin());
6827 auto *FirstUserOfPhi2 = cast<Instruction>(Val: *V2->user_begin());
6828 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
6829 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
6830 FirstUserOfPhi2->getParent());
6831 auto *IE1 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi1);
6832 auto *IE2 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi2);
6833 auto *EE1 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi1);
6834 auto *EE2 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi2);
6835 if (IE1 && !IE2)
6836 return true;
6837 if (!IE1 && IE2)
6838 return false;
6839 if (IE1 && IE2) {
6840 if (UserBVHead[I1] && !UserBVHead[I2])
6841 return true;
6842 if (!UserBVHead[I1])
6843 return false;
6844 if (UserBVHead[I1] == UserBVHead[I2])
6845 return getElementIndex(Inst: IE1) < getElementIndex(Inst: IE2);
6846 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
6847 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
6848 UserBVHead[I2]->getParent());
6849 return UserBVHead[I1]->comesBefore(Other: UserBVHead[I2]);
6850 }
6851 if (EE1 && !EE2)
6852 return true;
6853 if (!EE1 && EE2)
6854 return false;
6855 if (EE1 && EE2) {
6856 auto *Inst1 = dyn_cast<Instruction>(Val: EE1->getOperand(i_nocapture: 0));
6857 auto *Inst2 = dyn_cast<Instruction>(Val: EE2->getOperand(i_nocapture: 0));
6858 auto *P1 = dyn_cast<Argument>(Val: EE1->getOperand(i_nocapture: 0));
6859 auto *P2 = dyn_cast<Argument>(Val: EE2->getOperand(i_nocapture: 0));
6860 if (!Inst2 && !P2)
6861 return Inst1 || P1;
6862 if (EE1->getOperand(i_nocapture: 0) == EE2->getOperand(i_nocapture: 0))
6863 return getElementIndex(Inst: EE1) < getElementIndex(Inst: EE2);
6864 if (!Inst1 && Inst2)
6865 return false;
6866 if (Inst1 && Inst2) {
6867 if (Inst1->getParent() != Inst2->getParent())
6868 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
6869 return Inst1->comesBefore(Other: Inst2);
6870 }
6871 if (!P1 && P2)
6872 return false;
6873 assert(P1 && P2 &&
6874 "Expected either instructions or arguments vector operands.");
6875 return P1->getArgNo() < P2->getArgNo();
6876 }
6877 return false;
6878 };
6879 OrdersType Phis(TE.Scalars.size());
6880 std::iota(first: Phis.begin(), last: Phis.end(), value: 0);
6881 stable_sort(Range&: Phis, C: PHICompare);
6882 if (isIdentityOrder(Order: Phis))
6883 return std::nullopt; // No need to reorder.
6884 return std::move(Phis);
6885 }
6886 if (TE.isGather() &&
6887 (!TE.hasState() || !TE.isAltShuffle() ||
6888 ScalarsInSplitNodes.contains(Val: TE.getMainOp())) &&
6889 allSameType(VL: TE.Scalars)) {
6890 // TODO: add analysis of other gather nodes with extractelement
6891 // instructions and other values/instructions, not only undefs.
6892 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
6893 (all_of(Range: TE.Scalars, P: IsaPred<UndefValue, ExtractElementInst>) &&
6894 any_of(Range: TE.Scalars, P: IsaPred<ExtractElementInst>))) &&
6895 all_of(Range: TE.Scalars, P: [](Value *V) {
6896 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
6897 return !EE || isa<FixedVectorType>(Val: EE->getVectorOperandType());
6898 })) {
6899 // Check that gather of extractelements can be represented as
6900 // just a shuffle of a single vector.
6901 OrdersType CurrentOrder;
6902 bool Reuse =
6903 canReuseExtract(VL: TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
6904 if (Reuse || !CurrentOrder.empty())
6905 return std::move(CurrentOrder);
6906 }
6907 // If the gather node is <undef, v, .., poison> and
6908 // insertelement poison, v, 0 [+ permute]
6909 // is cheaper than
6910 // insertelement poison, v, n - try to reorder.
6911 // If rotating the whole graph, exclude the permute cost, the whole graph
6912 // might be transformed.
6913 int Sz = TE.Scalars.size();
6914 if (isSplat(VL: TE.Scalars) && !allConstant(VL: TE.Scalars) &&
6915 count_if(Range: TE.Scalars, P: IsaPred<UndefValue>) == Sz - 1) {
6916 const auto *It = find_if_not(Range: TE.Scalars, P: isConstant);
6917 if (It == TE.Scalars.begin())
6918 return OrdersType();
6919 auto *Ty = getWidenedType(ScalarTy: TE.Scalars.front()->getType(), VF: Sz);
6920 if (It != TE.Scalars.end()) {
6921 OrdersType Order(Sz, Sz);
6922 unsigned Idx = std::distance(first: TE.Scalars.begin(), last: It);
6923 Order[Idx] = 0;
6924 fixupOrderingIndices(Order);
6925 SmallVector<int> Mask;
6926 inversePermutation(Indices: Order, Mask);
6927 InstructionCost PermuteCost =
6928 TopToBottom
6929 ? 0
6930 : ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: Ty, Mask);
6931 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
6932 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: 0,
6933 Op0: PoisonValue::get(T: Ty), Op1: *It);
6934 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
6935 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: Idx,
6936 Op0: PoisonValue::get(T: Ty), Op1: *It);
6937 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
6938 OrdersType Order(Sz, Sz);
6939 Order[Idx] = 0;
6940 return std::move(Order);
6941 }
6942 }
6943 }
6944 if (isSplat(VL: TE.Scalars))
6945 return std::nullopt;
6946 if (TE.Scalars.size() >= 3)
6947 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
6948 return Order;
    // Check if the order of the vectorized loads can be included. For masked
    // gathers do extra analysis later, so include such nodes into a special
    // list.
6951 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
6952 SmallVector<Value *> PointerOps;
6953 OrdersType CurrentOrder;
6954 LoadsState Res = canVectorizeLoads(VL: TE.Scalars, VL0: TE.Scalars.front(),
6955 Order&: CurrentOrder, PointerOps);
6956 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
6957 Res == LoadsState::CompressVectorize)
6958 return std::move(CurrentOrder);
6959 }
    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been audited for correctness with non-power-of-two vectors.
6962 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI))
6963 if (std::optional<OrdersType> CurrentOrder =
6964 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
6965 return CurrentOrder;
6966 }
6967 return std::nullopt;
6968}
6969
6970/// Checks if the given mask is a "clustered" mask with the same clusters of
6971/// size \p Sz, which are not identity submasks.
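/// E.g., for \p Sz == 2 the mask <1, 0, 1, 0, 1, 0> is the repeated
/// non-identity cluster <1, 0>, while <0, 1, 2, 3> is rejected because its
/// first cluster <0, 1> is an identity submask.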
6972static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
6973 unsigned Sz) {
6974 ArrayRef<int> FirstCluster = Mask.slice(N: 0, M: Sz);
6975 if (ShuffleVectorInst::isIdentityMask(Mask: FirstCluster, NumSrcElts: Sz))
6976 return false;
6977 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
6978 ArrayRef<int> Cluster = Mask.slice(N: I, M: Sz);
6979 if (Cluster != FirstCluster)
6980 return false;
6981 }
6982 return true;
6983}
6984
6985void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
6986 // Reorder reuses mask.
6987 reorderReuses(Reuses&: TE.ReuseShuffleIndices, Mask);
6988 const unsigned Sz = TE.Scalars.size();
  // For vectorized nodes and non-clustered reuses no need to do anything else.
6990 if (!TE.isGather() ||
6991 !ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
6992 VF: Sz) ||
6993 !isRepeatedNonIdentityClusteredMask(Mask: TE.ReuseShuffleIndices, Sz))
6994 return;
6995 SmallVector<int> NewMask;
6996 inversePermutation(Indices: TE.ReorderIndices, Mask&: NewMask);
6997 addMask(Mask&: NewMask, SubMask: TE.ReuseShuffleIndices);
6998 // Clear reorder since it is going to be applied to the new mask.
6999 TE.ReorderIndices.clear();
7000 // Try to improve gathered nodes with clustered reuses, if possible.
7001 ArrayRef<int> Slice = ArrayRef(NewMask).slice(N: 0, M: Sz);
7002 SmallVector<unsigned> NewOrder(Slice);
7003 inversePermutation(Indices: NewOrder, Mask&: NewMask);
7004 reorderScalars(Scalars&: TE.Scalars, Mask: NewMask);
7005 // Fill the reuses mask with the identity submasks.
7006 for (auto *It = TE.ReuseShuffleIndices.begin(),
7007 *End = TE.ReuseShuffleIndices.end();
7008 It != End; std::advance(i&: It, n: Sz))
7009 std::iota(first: It, last: std::next(x: It, n: Sz), value: 0);
7010}
7011
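/// Merges \p SecondaryOrder into \p Order: slots of \p Order that are still
/// unset (equal to the order size) are filled either from \p SecondaryOrder,
/// or with the identity index when no secondary order is given, but only if
/// the chosen index is not already used by \p Order.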
7012static void combineOrders(MutableArrayRef<unsigned> Order,
7013 ArrayRef<unsigned> SecondaryOrder) {
7014 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
7015 "Expected same size of orders");
7016 size_t Sz = Order.size();
7017 SmallBitVector UsedIndices(Sz);
7018 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz)) {
7019 if (Order[Idx] != Sz)
7020 UsedIndices.set(Order[Idx]);
7021 }
7022 if (SecondaryOrder.empty()) {
7023 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
7024 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
7025 Order[Idx] = Idx;
7026 } else {
7027 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
7028 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
7029 !UsedIndices.test(Idx: SecondaryOrder[Idx]))
7030 Order[Idx] = SecondaryOrder[Idx];
7031 }
7032}
7033
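/// Heuristically decides whether reordering the whole graph is worthwhile.
/// Small graphs are always considered profitable; for larger graphs rooted at
/// a store, phi or tiny icmp/ptrtoint node the check returns false when the
/// graph is essentially a set of phi nodes (possibly combined with geps or
/// binary operators) feeding the root, or when a phi node has too many
/// operands, since reordering such graphs tends not to pay off.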
7034bool BoUpSLP::isProfitableToReorder() const {
7035 constexpr unsigned TinyVF = 2;
7036 constexpr unsigned TinyTree = 10;
7037 constexpr unsigned PhiOpsLimit = 12;
7038 constexpr unsigned GatherLoadsLimit = 2;
7039 if (VectorizableTree.size() <= TinyTree)
7040 return true;
7041 if (VectorizableTree.front()->hasState() &&
7042 !VectorizableTree.front()->isGather() &&
7043 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
7044 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
7045 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
7046 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
7047 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
7048 VectorizableTree.front()->ReorderIndices.empty()) {
    // Check if the tree has only a single store and a single (unordered) load
    // node, while the other nodes are phis or geps/binops combined with phis,
    // and/or a single gather load node.
7052 bool HasPhis = false;
7053 if (VectorizableTree.front()->getOpcode() == Instruction::PHI &&
7054 VectorizableTree.front()->Scalars.size() == TinyVF &&
7055 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
7056 return false;
7057 bool HasLoad = true;
7058 unsigned GatherLoads = 0;
7059 for (const std::unique_ptr<TreeEntry> &TE :
7060 ArrayRef(VectorizableTree).drop_front()) {
7061 if (!TE->hasState()) {
7062 if (all_of(Range&: TE->Scalars, P: IsaPred<Constant, PHINode>) ||
7063 all_of(Range&: TE->Scalars, P: IsaPred<BinaryOperator, PHINode>))
7064 continue;
7065 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7066 any_of(Range&: TE->Scalars, P: IsaPred<PHINode, GEPOperator>))
7067 continue;
7068 return true;
7069 }
7070 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
7071 if (!TE->isGather()) {
7072 HasLoad = false;
7073 continue;
7074 }
7075 if (HasLoad)
7076 return true;
7077 ++GatherLoads;
7078 if (GatherLoads >= GatherLoadsLimit)
7079 return true;
7080 }
7081 if (TE->getOpcode() == Instruction::GetElementPtr ||
7082 Instruction::isBinaryOp(Opcode: TE->getOpcode()))
7083 continue;
7084 if (TE->getOpcode() != Instruction::PHI)
7085 return true;
7086 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
7087 TE->getNumOperands() > PhiOpsLimit)
7088 return false;
7089 HasPhis = true;
7090 }
7091 return !HasPhis;
7092 }
7093 return true;
7094}
7095
7096void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
7097 ArrayRef<int> MaskOrder) {
7098 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
7099 SmallVector<int> NewMask(getVectorFactor());
7100 SmallVector<int> NewMaskOrder(getVectorFactor());
7101 std::iota(first: NewMask.begin(), last: NewMask.end(), value: 0);
7102 std::iota(first: NewMaskOrder.begin(), last: NewMaskOrder.end(), value: 0);
7103 if (Idx == 0) {
7104 copy(Range&: Mask, Out: NewMask.begin());
7105 copy(Range&: MaskOrder, Out: NewMaskOrder.begin());
7106 } else {
7107 assert(Idx == 1 && "Expected either 0 or 1 index.");
7108 unsigned Offset = CombinedEntriesWithIndices.back().second;
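    // The second operand of a split node occupies the upper lanes, so shift
    // both the mask and the mask order by the recorded lane offset.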
7109 for (unsigned I : seq<unsigned>(Size: Mask.size())) {
7110 NewMask[I + Offset] = Mask[I] + Offset;
7111 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
7112 }
7113 }
7114 reorderScalars(Scalars, Mask: NewMask);
7115 reorderOrder(Order&: ReorderIndices, Mask: NewMaskOrder, /*BottomOrder=*/true);
7116 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(Order: ReorderIndices))
7117 ReorderIndices.clear();
7118}
7119
7120void BoUpSLP::reorderTopToBottom() {
7121 // Maps VF to the graph nodes.
7122 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
7123 // ExtractElement gather nodes which can be vectorized and need to handle
7124 // their ordering.
7125 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
7126
  // Phi nodes can have a preferred ordering based on their result users.
7128 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
7129
7130 // AltShuffles can also have a preferred ordering that leads to fewer
7131 // instructions, e.g., the addsub instruction in x86.
7132 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
7133
7134 // Maps a TreeEntry to the reorder indices of external users.
7135 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
7136 ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores, loads, extracts + some gathering of
  // extracts.
7140 for_each(Range&: VectorizableTree, F: [&, &TTIRef = *TTI](
7141 const std::unique_ptr<TreeEntry> &TE) {
7142 // Look for external users that will probably be vectorized.
7143 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
7144 findExternalStoreUsersReorderIndices(TE: TE.get());
7145 if (!ExternalUserReorderIndices.empty()) {
7146 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
7147 ExternalUserReorderMap.try_emplace(Key: TE.get(),
7148 Args: std::move(ExternalUserReorderIndices));
7149 }
7150
7151 // Patterns like [fadd,fsub] can be combined into a single instruction in
7152 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
7153 // to take into account their order when looking for the most used order.
7154 if (TE->hasState() && TE->isAltShuffle() &&
7155 TE->State != TreeEntry::SplitVectorize) {
7156 Type *ScalarTy = TE->Scalars[0]->getType();
7157 VectorType *VecTy = getWidenedType(ScalarTy, VF: TE->Scalars.size());
7158 unsigned Opcode0 = TE->getOpcode();
7159 unsigned Opcode1 = TE->getAltOpcode();
7160 SmallBitVector OpcodeMask(
7161 getAltInstrMask(VL: TE->Scalars, ScalarTy, Opcode0, Opcode1));
7162 // If this pattern is supported by the target then we consider the order.
7163 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
7164 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
7165 AltShufflesToOrders.try_emplace(Key: TE.get(), Args: OrdersType());
7166 }
7167 // TODO: Check the reverse order too.
7168 }
7169
7170 bool IgnoreReorder =
7171 !UserIgnoreList && VectorizableTree.front()->hasState() &&
7172 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
7173 VectorizableTree.front()->getOpcode() == Instruction::Store);
7174 if (std::optional<OrdersType> CurrentOrder =
7175 getReorderingData(TE: *TE, /*TopToBottom=*/true, IgnoreReorder)) {
      // Do not include ordering for nodes used in the alt opcode
      // vectorization; it is better to reorder them during the bottom-to-top
      // stage. If we follow the order here, it causes reordering of the whole
      // graph, though actually it is profitable just to reorder the subgraph
      // that starts from the alternate opcode vectorization node. Such nodes
      // already end up with a shuffle instruction and it is enough to change
      // this shuffle rather than rotate the scalars for the whole graph.
7183 unsigned Cnt = 0;
7184 const TreeEntry *UserTE = TE.get();
7185 while (UserTE && Cnt < RecursionMaxDepth) {
7186 if (!UserTE->UserTreeIndex)
7187 break;
7188 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
7189 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
7190 UserTE->UserTreeIndex.UserTE->Idx != 0)
7191 return;
7192 UserTE = UserTE->UserTreeIndex.UserTE;
7193 ++Cnt;
7194 }
7195 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
7196 if (!(TE->State == TreeEntry::Vectorize ||
7197 TE->State == TreeEntry::StridedVectorize ||
7198 TE->State == TreeEntry::SplitVectorize ||
7199 TE->State == TreeEntry::CompressVectorize) ||
7200 !TE->ReuseShuffleIndices.empty())
7201 GathersToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
7202 if (TE->State == TreeEntry::Vectorize &&
7203 TE->getOpcode() == Instruction::PHI)
7204 PhisToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
7205 }
7206 });
7207
7208 // Reorder the graph nodes according to their vectorization factor.
7209 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
7210 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
7211 auto It = VFToOrderedEntries.find(Val: VF);
7212 if (It == VFToOrderedEntries.end())
7213 continue;
    // Try to find the most profitable order. We are just looking for the most
    // used order and reorder the scalar elements in the nodes according to
    // this most used order.
7217 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
7218 // Delete VF entry upon exit.
7219 auto Cleanup = make_scope_exit(F: [&]() { VFToOrderedEntries.erase(I: It); });
7220
7221 // All operands are reordered and used only in this node - propagate the
7222 // most used order to the user node.
7223 MapVector<OrdersType, unsigned,
7224 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
7225 OrdersUses;
7226 for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes; we still need to extend and to use a
      // shuffle, just merge the reordering shuffle and the reuse shuffle.
7229 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(Val: OpTE) &&
7230 OpTE->State != TreeEntry::SplitVectorize)
7231 continue;
7232 // Count number of orders uses.
7233 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
7234 &PhisToOrders]() -> const OrdersType & {
7235 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
7236 auto It = GathersToOrders.find(Val: OpTE);
7237 if (It != GathersToOrders.end())
7238 return It->second;
7239 }
7240 if (OpTE->hasState() && OpTE->isAltShuffle()) {
7241 auto It = AltShufflesToOrders.find(Val: OpTE);
7242 if (It != AltShufflesToOrders.end())
7243 return It->second;
7244 }
7245 if (OpTE->State == TreeEntry::Vectorize &&
7246 OpTE->getOpcode() == Instruction::PHI) {
7247 auto It = PhisToOrders.find(Val: OpTE);
7248 if (It != PhisToOrders.end())
7249 return It->second;
7250 }
7251 return OpTE->ReorderIndices;
7252 }();
7253 // First consider the order of the external scalar users.
7254 auto It = ExternalUserReorderMap.find(Val: OpTE);
7255 if (It != ExternalUserReorderMap.end()) {
7256 const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars - use the natural
        // order; it is an attempt to reorder a node with reused scalars but
        // with external uses.
7260 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
7261 OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0)).first->second +=
7262 ExternalUserReorderIndices.size();
7263 } else {
7264 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
7265 ++OrdersUses.insert(KV: std::make_pair(x: ExtOrder, y: 0)).first->second;
7266 }
7267 // No other useful reorder data in this entry.
7268 if (Order.empty())
7269 continue;
7270 }
7271 // Stores actually store the mask, not the order, need to invert.
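      // E.g., the stored mask <2, 0, 1> corresponds to the order <1, 2, 0>.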
7272 if (OpTE->State == TreeEntry::Vectorize &&
7273 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
7274 assert(!OpTE->isAltShuffle() &&
7275 "Alternate instructions are only supported by BinaryOperator "
7276 "and CastInst.");
7277 SmallVector<int> Mask;
7278 inversePermutation(Indices: Order, Mask);
7279 unsigned E = Order.size();
7280 OrdersType CurrentOrder(E, E);
7281 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
7282 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
7283 });
7284 fixupOrderingIndices(Order: CurrentOrder);
7285 ++OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second;
7286 } else {
7287 ++OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second;
7288 }
7289 }
7290 if (OrdersUses.empty())
7291 continue;
7292 // Choose the most used order.
7293 unsigned IdentityCnt = 0;
7294 unsigned FilledIdentityCnt = 0;
7295 OrdersType IdentityOrder(VF, VF);
7296 for (auto &Pair : OrdersUses) {
7297 if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) {
7298 if (!Pair.first.empty())
7299 FilledIdentityCnt += Pair.second;
7300 IdentityCnt += Pair.second;
7301 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
7302 }
7303 }
7304 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
7305 unsigned Cnt = IdentityCnt;
7306 for (auto &Pair : OrdersUses) {
      // Prefer the identity order. But if a filled (non-empty) identity order
      // is found with the same number of uses as the new candidate order, we
      // can choose this candidate order.
7310 if (Cnt < Pair.second ||
7311 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
7312 Cnt == Pair.second && !BestOrder.empty() &&
7313 isIdentityOrder(Order: BestOrder))) {
7314 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
7315 BestOrder = Pair.first;
7316 Cnt = Pair.second;
7317 } else {
7318 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
7319 }
7320 }
7321 // Set order of the user node.
7322 if (isIdentityOrder(Order: BestOrder))
7323 continue;
7324 fixupOrderingIndices(Order: BestOrder);
7325 SmallVector<int> Mask;
7326 inversePermutation(Indices: BestOrder, Mask);
7327 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
7328 unsigned E = BestOrder.size();
7329 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
7330 return I < E ? static_cast<int>(I) : PoisonMaskElem;
7331 });
7332 // Do an actual reordering, if profitable.
7333 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7334 // Just do the reordering for the nodes with the given VF.
7335 if (TE->Scalars.size() != VF) {
7336 if (TE->ReuseShuffleIndices.size() == VF) {
7337 assert(TE->State != TreeEntry::SplitVectorize &&
7338 "Split vectorized not expected.");
7339 // Need to reorder the reuses masks of the operands with smaller VF to
7340 // be able to find the match between the graph nodes and scalar
7341 // operands of the given node during vectorization/cost estimation.
7342 assert(
7343 (!TE->UserTreeIndex ||
7344 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
7345 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
7346 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
7347 "All users must be of VF size.");
7348 if (SLPReVec) {
7349 assert(SLPReVec && "Only supported by REVEC.");
7350 // ShuffleVectorInst does not do reorderOperands (and it should not
7351 // because ShuffleVectorInst supports only a limited set of
7352 // patterns). Only do reorderNodeWithReuses if the user is not
7353 // ShuffleVectorInst.
7354 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
7355 isa<ShuffleVectorInst>(Val: TE->UserTreeIndex.UserTE->getMainOp()))
7356 continue;
7357 }
7358 // Update ordering of the operands with the smaller VF than the given
7359 // one.
7360 reorderNodeWithReuses(TE&: *TE, Mask);
7361 // Update orders in user split vectorize nodes.
7362 if (TE->UserTreeIndex &&
7363 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
7364 TE->UserTreeIndex.UserTE->reorderSplitNode(
7365 Idx: TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
7366 }
7367 continue;
7368 }
7369 if ((TE->State == TreeEntry::SplitVectorize &&
7370 TE->ReuseShuffleIndices.empty()) ||
7371 ((TE->State == TreeEntry::Vectorize ||
7372 TE->State == TreeEntry::StridedVectorize ||
7373 TE->State == TreeEntry::CompressVectorize) &&
7374 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
7375 InsertElementInst>(Val: TE->getMainOp()) ||
7376 (SLPReVec && isa<ShuffleVectorInst>(Val: TE->getMainOp()))))) {
7377 assert(
7378 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
7379 TE->ReuseShuffleIndices.empty())) &&
7380 "Alternate instructions are only supported by BinaryOperator "
7381 "and CastInst.");
7382 // Build correct orders for extract{element,value}, loads,
7383 // stores and alternate (split) nodes.
7384 reorderOrder(Order&: TE->ReorderIndices, Mask);
7385 if (isa<InsertElementInst, StoreInst>(Val: TE->getMainOp()))
7386 TE->reorderOperands(Mask);
7387 } else {
7388 // Reorder the node and its operands.
7389 TE->reorderOperands(Mask);
7390 assert(TE->ReorderIndices.empty() &&
7391 "Expected empty reorder sequence.");
7392 reorderScalars(Scalars&: TE->Scalars, Mask);
7393 }
7394 if (!TE->ReuseShuffleIndices.empty()) {
7395 // Apply reversed order to keep the original ordering of the reused
7396 // elements to avoid extra reorder indices shuffling.
7397 OrdersType CurrentOrder;
7398 reorderOrder(Order&: CurrentOrder, Mask: MaskOrder);
7399 SmallVector<int> NewReuses;
7400 inversePermutation(Indices: CurrentOrder, Mask&: NewReuses);
7401 addMask(Mask&: NewReuses, SubMask: TE->ReuseShuffleIndices);
7402 TE->ReuseShuffleIndices.swap(RHS&: NewReuses);
7403 } else if (TE->UserTreeIndex &&
7404 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
7405 // Update orders in user split vectorize nodes.
7406 TE->UserTreeIndex.UserTE->reorderSplitNode(Idx: TE->UserTreeIndex.EdgeIdx,
7407 Mask, MaskOrder);
7408 }
7409 }
7410}
7411
7412void BoUpSLP::buildReorderableOperands(
7413 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
7414 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
7415 SmallVectorImpl<TreeEntry *> &GatherOps) {
7416 for (unsigned I : seq<unsigned>(Size: UserTE->getNumOperands())) {
7417 if (any_of(Range&: Edges, P: [I](const std::pair<unsigned, TreeEntry *> &OpData) {
7418 return OpData.first == I &&
7419 (OpData.second->State == TreeEntry::Vectorize ||
7420 OpData.second->State == TreeEntry::StridedVectorize ||
7421 OpData.second->State == TreeEntry::CompressVectorize ||
7422 OpData.second->State == TreeEntry::SplitVectorize);
7423 }))
7424 continue;
7425 // Do not request operands, if they do not exist.
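    // E.g., the pointer operand of vectorized loads and stores and the
    // destination vector of an insertelement do not get their own reorderable
    // tree entries.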
7426 if (UserTE->hasState()) {
7427 if (UserTE->getOpcode() == Instruction::ExtractElement ||
7428 UserTE->getOpcode() == Instruction::ExtractValue)
7429 continue;
7430 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
7431 continue;
7432 if (UserTE->getOpcode() == Instruction::Store &&
7433 UserTE->State == TreeEntry::Vectorize && I == 1)
7434 continue;
7435 if (UserTE->getOpcode() == Instruction::Load &&
7436 (UserTE->State == TreeEntry::Vectorize ||
7437 UserTE->State == TreeEntry::StridedVectorize ||
7438 UserTE->State == TreeEntry::CompressVectorize))
7439 continue;
7440 }
7441 TreeEntry *TE = getOperandEntry(E: UserTE, Idx: I);
7442 assert(TE && "Expected operand entry.");
7443 if (!TE->isGather()) {
7444 // Add the node to the list of the ordered nodes with the identity
7445 // order.
7446 Edges.emplace_back(Args&: I, Args&: TE);
7447 // Add ScatterVectorize nodes to the list of operands, where just
7448 // reordering of the scalars is required. Similar to the gathers, so
7449 // simply add to the list of gathered ops.
7450 // If there are reused scalars, process this node as a regular vectorize
7451 // node, just reorder reuses mask.
7452 if (TE->State == TreeEntry::ScatterVectorize &&
7453 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
7454 GatherOps.push_back(Elt: TE);
7455 continue;
7456 }
7457 if (ReorderableGathers.contains(Ptr: TE))
7458 GatherOps.push_back(Elt: TE);
7459 }
7460}
7461
7462void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
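  // Order queue entries by the index of their user node (falling back to
  // their own index), so that operands of the same user node are popped from
  // the queue consecutively and can be processed together.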
7463 struct TreeEntryCompare {
7464 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
7465 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
7466 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
7467 return LHS->Idx < RHS->Idx;
7468 }
7469 };
7470 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
7471 DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads, extracts without alternate operands
  // + some gathering of extracts.
7475 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
7476 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
7477 if (TE->State != TreeEntry::Vectorize &&
7478 TE->State != TreeEntry::StridedVectorize &&
7479 TE->State != TreeEntry::CompressVectorize &&
7480 TE->State != TreeEntry::SplitVectorize)
7481 NonVectorized.insert(Ptr: TE.get());
7482 if (std::optional<OrdersType> CurrentOrder =
7483 getReorderingData(TE: *TE, /*TopToBottom=*/false, IgnoreReorder)) {
7484 Queue.push(x: TE.get());
7485 if (!(TE->State == TreeEntry::Vectorize ||
7486 TE->State == TreeEntry::StridedVectorize ||
7487 TE->State == TreeEntry::CompressVectorize ||
7488 TE->State == TreeEntry::SplitVectorize) ||
7489 !TE->ReuseShuffleIndices.empty())
7490 GathersToOrders.insert(V: TE.get());
7491 }
7492 }
7493
  // 1. Propagate the order to the graph nodes which use only reordered nodes.
  // I.e., if the node has operands that are reordered, try to keep at least
  // one operand in the natural order and reorder the others + reorder the
  // user node itself.
7498 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
7499 while (!Queue.empty()) {
7500 // 1. Filter out only reordered nodes.
7501 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
7502 TreeEntry *TE = Queue.top();
7503 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
7504 Queue.pop();
7505 SmallVector<TreeEntry *> OrderedOps(1, TE);
7506 while (!Queue.empty()) {
7507 TE = Queue.top();
7508 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
7509 break;
7510 Queue.pop();
7511 OrderedOps.push_back(Elt: TE);
7512 }
7513 for (TreeEntry *TE : OrderedOps) {
7514 if (!(TE->State == TreeEntry::Vectorize ||
7515 TE->State == TreeEntry::StridedVectorize ||
7516 TE->State == TreeEntry::CompressVectorize ||
7517 TE->State == TreeEntry::SplitVectorize ||
7518 (TE->isGather() && GathersToOrders.contains(V: TE))) ||
7519 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
7520 !Visited.insert(Ptr: TE).second)
7521 continue;
      // Build a map between user nodes and their operand orders to speed up
      // the search. The graph currently does not provide this dependency
      // directly.
7524 Users.first = TE->UserTreeIndex.UserTE;
7525 Users.second.emplace_back(Args&: TE->UserTreeIndex.EdgeIdx, Args&: TE);
7526 }
7527 if (Users.first) {
7528 auto &Data = Users;
7529 if (Data.first->State == TreeEntry::SplitVectorize) {
7530 assert(
7531 Data.second.size() <= 2 &&
7532 "Expected not greater than 2 operands for split vectorize node.");
7533 if (any_of(Range&: Data.second,
7534 P: [](const auto &Op) { return !Op.second->UserTreeIndex; }))
7535 continue;
7536 // Update orders in user split vectorize nodes.
7537 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
7538 "Expected exactly 2 entries.");
7539 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
7540 TreeEntry &OpTE = *VectorizableTree[P.first];
7541 OrdersType Order = OpTE.ReorderIndices;
7542 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
7543 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
7544 continue;
7545 const auto BestOrder =
7546 getReorderingData(TE: OpTE, /*TopToBottom=*/false, IgnoreReorder);
7547 if (!BestOrder || BestOrder->empty() || isIdentityOrder(Order: *BestOrder))
7548 continue;
7549 Order = *BestOrder;
7550 }
7551 fixupOrderingIndices(Order);
7552 SmallVector<int> Mask;
7553 inversePermutation(Indices: Order, Mask);
7554 const unsigned E = Order.size();
7555 SmallVector<int> MaskOrder(E, PoisonMaskElem);
7556 transform(Range&: Order, d_first: MaskOrder.begin(), F: [E](unsigned I) {
7557 return I < E ? static_cast<int>(I) : PoisonMaskElem;
7558 });
7559 Data.first->reorderSplitNode(Idx: P.second ? 1 : 0, Mask, MaskOrder);
7560 // Clear ordering of the operand.
7561 if (!OpTE.ReorderIndices.empty()) {
7562 OpTE.ReorderIndices.clear();
7563 } else if (!OpTE.ReuseShuffleIndices.empty()) {
7564 reorderReuses(Reuses&: OpTE.ReuseShuffleIndices, Mask);
7565 } else {
7566 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
7567 reorderScalars(Scalars&: OpTE.Scalars, Mask);
7568 }
7569 }
7570 if (Data.first->ReuseShuffleIndices.empty() &&
7571 !Data.first->ReorderIndices.empty()) {
7572 // Insert user node to the list to try to sink reordering deeper in
7573 // the graph.
7574 Queue.push(x: Data.first);
7575 }
7576 continue;
7577 }
7578 // Check that operands are used only in the User node.
7579 SmallVector<TreeEntry *> GatherOps;
7580 buildReorderableOperands(UserTE: Data.first, Edges&: Data.second, ReorderableGathers: NonVectorized,
7581 GatherOps);
7582 // All operands are reordered and used only in this node - propagate the
7583 // most used order to the user node.
7584 MapVector<OrdersType, unsigned,
7585 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
7586 OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though it might not be
      // profitable.
7590 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
7591 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
7592 for (const auto &Op : Data.second) {
7593 TreeEntry *OpTE = Op.second;
7594 if (!VisitedOps.insert(Ptr: OpTE).second)
7595 continue;
7596 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(V: OpTE))
7597 continue;
7598 const auto Order = [&]() -> const OrdersType {
7599 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
7600 return getReorderingData(TE: *OpTE, /*TopToBottom=*/false,
7601 IgnoreReorder)
7602 .value_or(u: OrdersType(1));
7603 return OpTE->ReorderIndices;
7604 }();
7605 // The order is partially ordered, skip it in favor of fully non-ordered
7606 // orders.
7607 if (Order.size() == 1)
7608 continue;
7609
        // Check that the reordering does not increase the number of shuffles,
        // i.e. same-values nodes have the same parents or their parents have
        // the same parents.
7612 if (!Order.empty() && !isIdentityOrder(Order)) {
7613 Value *Root = OpTE->hasState()
7614 ? OpTE->getMainOp()
7615 : *find_if_not(Range&: OpTE->Scalars, P: isConstant);
7616 auto GetSameNodesUsers = [&](Value *Root) {
7617 SmallSetVector<TreeEntry *, 4> Res;
7618 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Val: Root)) {
7619 if (TE != OpTE && TE->UserTreeIndex &&
7620 TE->getVectorFactor() == OpTE->getVectorFactor() &&
7621 TE->Scalars.size() == OpTE->Scalars.size() &&
7622 ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) ||
7623 (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars))))
7624 Res.insert(X: TE->UserTreeIndex.UserTE);
7625 }
7626 for (const TreeEntry *TE : getTreeEntries(V: Root)) {
7627 if (TE != OpTE && TE->UserTreeIndex &&
7628 TE->getVectorFactor() == OpTE->getVectorFactor() &&
7629 TE->Scalars.size() == OpTE->Scalars.size() &&
7630 ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) ||
7631 (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars))))
7632 Res.insert(X: TE->UserTreeIndex.UserTE);
7633 }
7634 return Res.takeVector();
7635 };
7636 auto GetNumOperands = [](const TreeEntry *TE) {
7637 if (TE->State == TreeEntry::SplitVectorize)
7638 return TE->getNumOperands();
7639 if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI)
7640 return CI->arg_size();
7641 return TE->getNumOperands();
7642 };
7643 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
7644 const TreeEntry *TE) {
7645 Intrinsic::ID ID = Intrinsic::not_intrinsic;
7646 if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI)
7647 ID = getVectorIntrinsicIDForCall(CI, TLI);
7648 for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(TE))) {
7649 if (ID != Intrinsic::not_intrinsic &&
7650 isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI))
7651 continue;
7652 const TreeEntry *Op = getOperandEntry(E: TE, Idx);
7653 if (Op->isGather() && Op->hasState()) {
7654 const TreeEntry *VecOp =
7655 getSameValuesTreeEntry(V: Op->getMainOp(), VL: Op->Scalars);
7656 if (VecOp)
7657 Op = VecOp;
7658 }
7659 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
7660 return false;
7661 }
7662 return true;
7663 };
7664 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
7665 if (!Users.empty() && !all_of(Range&: Users, P: [&](TreeEntry *UTE) {
7666 if (!RevisitedOps.insert(Ptr: UTE).second)
7667 return false;
7668 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
7669 !UTE->ReuseShuffleIndices.empty() ||
7670 (UTE->UserTreeIndex &&
7671 UTE->UserTreeIndex.UserTE == Data.first) ||
7672 (Data.first->UserTreeIndex &&
7673 Data.first->UserTreeIndex.UserTE == UTE) ||
7674 (IgnoreReorder && UTE->UserTreeIndex &&
7675 UTE->UserTreeIndex.UserTE->Idx == 0) ||
7676 NodeShouldBeReorderedWithOperands(UTE);
7677 }))
7678 continue;
7679 for (TreeEntry *UTE : Users) {
7680 Intrinsic::ID ID = Intrinsic::not_intrinsic;
7681 if (auto *CI = dyn_cast<CallInst>(Val: UTE->getMainOp()); CI)
7682 ID = getVectorIntrinsicIDForCall(CI, TLI);
7683 for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(UTE))) {
7684 if (ID != Intrinsic::not_intrinsic &&
7685 isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI))
7686 continue;
7687 const TreeEntry *Op = getOperandEntry(E: UTE, Idx);
7688 Visited.erase(Ptr: Op);
7689 Queue.push(x: const_cast<TreeEntry *>(Op));
7690 }
7691 }
7692 }
7693 unsigned NumOps = count_if(
7694 Range&: Data.second, P: [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
7695 return P.second == OpTE;
7696 });
7697 // Stores actually store the mask, not the order, need to invert.
7698 if (OpTE->State == TreeEntry::Vectorize &&
7699 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
7700 assert(!OpTE->isAltShuffle() &&
7701 "Alternate instructions are only supported by BinaryOperator "
7702 "and CastInst.");
7703 SmallVector<int> Mask;
7704 inversePermutation(Indices: Order, Mask);
7705 unsigned E = Order.size();
7706 OrdersType CurrentOrder(E, E);
7707 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
7708 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
7709 });
7710 fixupOrderingIndices(Order: CurrentOrder);
7711 OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second +=
7712 NumOps;
7713 } else {
7714 OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second += NumOps;
7715 }
7716 auto Res = OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0));
7717 const auto AllowsReordering = [&](const TreeEntry *TE) {
7718 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
7719 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
7720 (IgnoreReorder && TE->Idx == 0))
7721 return true;
7722 if (TE->isGather()) {
7723 if (GathersToOrders.contains(V: TE))
7724 return !getReorderingData(TE: *TE, /*TopToBottom=*/false,
7725 IgnoreReorder)
7726 .value_or(u: OrdersType(1))
7727 .empty();
7728 return true;
7729 }
7730 return false;
7731 };
7732 if (OpTE->UserTreeIndex) {
7733 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
7734 if (!VisitedUsers.insert(Ptr: UserTE).second)
7735 continue;
7736 // May reorder user node if it requires reordering, has reused
7737 // scalars, is an alternate op vectorize node or its op nodes require
7738 // reordering.
7739 if (AllowsReordering(UserTE))
7740 continue;
        // Check if the users allow reordering.
        // Currently we look up just 1 level of operands to avoid an increase
        // in compile time.
        // It is profitable to reorder if definitely more operands allow
        // reordering than operands with the natural order.
7746 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users.second;
7747 if (static_cast<unsigned>(count_if(
7748 Range&: Ops, P: [UserTE, &AllowsReordering](
7749 const std::pair<unsigned, TreeEntry *> &Op) {
7750 return AllowsReordering(Op.second) &&
7751 Op.second->UserTreeIndex.UserTE == UserTE;
7752 })) <= Ops.size() / 2)
7753 ++Res.first->second;
7754 }
7755 }
7756 if (OrdersUses.empty()) {
7757 Visited.insert_range(R: llvm::make_second_range(c&: Data.second));
7758 continue;
7759 }
7760 // Choose the most used order.
7761 unsigned IdentityCnt = 0;
7762 unsigned VF = Data.second.front().second->getVectorFactor();
7763 OrdersType IdentityOrder(VF, VF);
7764 for (auto &Pair : OrdersUses) {
7765 if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) {
7766 IdentityCnt += Pair.second;
7767 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
7768 }
7769 }
7770 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
7771 unsigned Cnt = IdentityCnt;
7772 for (auto &Pair : OrdersUses) {
        // Prefer the identity order. But if a filled (non-empty) identity
        // order is found with the same number of uses as the new candidate
        // order, we can choose this candidate order.
7776 if (Cnt < Pair.second) {
7777 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
7778 BestOrder = Pair.first;
7779 Cnt = Pair.second;
7780 } else {
7781 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
7782 }
7783 }
7784 // Set order of the user node.
7785 if (isIdentityOrder(Order: BestOrder)) {
7786 Visited.insert_range(R: llvm::make_second_range(c&: Data.second));
7787 continue;
7788 }
7789 fixupOrderingIndices(Order: BestOrder);
7790 // Erase operands from OrderedEntries list and adjust their orders.
7791 VisitedOps.clear();
7792 SmallVector<int> Mask;
7793 inversePermutation(Indices: BestOrder, Mask);
7794 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
7795 unsigned E = BestOrder.size();
7796 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
7797 return I < E ? static_cast<int>(I) : PoisonMaskElem;
7798 });
7799 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
7800 TreeEntry *TE = Op.second;
7801 if (!VisitedOps.insert(Ptr: TE).second)
7802 continue;
7803 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
7804 reorderNodeWithReuses(TE&: *TE, Mask);
7805 continue;
7806 }
7807 // Gathers are processed separately.
7808 if (TE->State != TreeEntry::Vectorize &&
7809 TE->State != TreeEntry::StridedVectorize &&
7810 TE->State != TreeEntry::CompressVectorize &&
7811 TE->State != TreeEntry::SplitVectorize &&
7812 (TE->State != TreeEntry::ScatterVectorize ||
7813 TE->ReorderIndices.empty()))
7814 continue;
7815 assert((BestOrder.size() == TE->ReorderIndices.size() ||
7816 TE->ReorderIndices.empty()) &&
7817 "Non-matching sizes of user/operand entries.");
7818 reorderOrder(Order&: TE->ReorderIndices, Mask);
7819 if (IgnoreReorder && TE == VectorizableTree.front().get())
7820 IgnoreReorder = false;
7821 }
7822 // For gathers just need to reorder its scalars.
7823 for (TreeEntry *Gather : GatherOps) {
7824 assert(Gather->ReorderIndices.empty() &&
7825 "Unexpected reordering of gathers.");
7826 if (!Gather->ReuseShuffleIndices.empty()) {
7827 // Just reorder reuses indices.
7828 reorderReuses(Reuses&: Gather->ReuseShuffleIndices, Mask);
7829 continue;
7830 }
7831 reorderScalars(Scalars&: Gather->Scalars, Mask);
7832 Visited.insert(Ptr: Gather);
7833 }
7834 // Reorder operands of the user node and set the ordering for the user
7835 // node itself.
7836 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
7837 return TE.isAltShuffle() &&
7838 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
7839 TE.ReorderIndices.empty());
7840 };
7841 if (Data.first->State != TreeEntry::Vectorize ||
7842 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
7843 Val: Data.first->getMainOp()) ||
7844 IsNotProfitableAltCodeNode(*Data.first))
7845 Data.first->reorderOperands(Mask);
7846 if (!isa<InsertElementInst, StoreInst>(Val: Data.first->getMainOp()) ||
7847 IsNotProfitableAltCodeNode(*Data.first) ||
7848 Data.first->State == TreeEntry::StridedVectorize ||
7849 Data.first->State == TreeEntry::CompressVectorize) {
7850 reorderScalars(Scalars&: Data.first->Scalars, Mask);
7851 reorderOrder(Order&: Data.first->ReorderIndices, Mask: MaskOrder,
7852 /*BottomOrder=*/true);
7853 if (Data.first->ReuseShuffleIndices.empty() &&
7854 !Data.first->ReorderIndices.empty() &&
7855 !IsNotProfitableAltCodeNode(*Data.first)) {
7856 // Insert user node to the list to try to sink reordering deeper in
7857 // the graph.
7858 Queue.push(x: Data.first);
7859 }
7860 } else {
7861 reorderOrder(Order&: Data.first->ReorderIndices, Mask);
7862 }
7863 }
7864 }
7865 // If the reordering is unnecessary, just remove the reorder.
7866 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
7867 VectorizableTree.front()->ReuseShuffleIndices.empty())
7868 VectorizableTree.front()->ReorderIndices.clear();
7869}
7870
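/// Returns the instruction, if any, that acts as the root scalar of \p Entry:
/// for reversed strided loads and stores this is the scalar indexed by the
/// front of the reorder indices, otherwise the first scalar of the entry.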
7871Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
7872 if ((Entry.getOpcode() == Instruction::Store ||
7873 Entry.getOpcode() == Instruction::Load) &&
7874 Entry.State == TreeEntry::StridedVectorize &&
7875 !Entry.ReorderIndices.empty() && isReverseOrder(Order: Entry.ReorderIndices))
7876 return dyn_cast<Instruction>(Val: Entry.Scalars[Entry.ReorderIndices.front()]);
7877 return dyn_cast<Instruction>(Val: Entry.Scalars.front());
7878}
7879
7880void BoUpSLP::buildExternalUses(
7881 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
7882 DenseMap<Value *, unsigned> ScalarToExtUses;
7883 // Collect the values that we need to extract from the tree.
7884 for (auto &TEPtr : VectorizableTree) {
7885 TreeEntry *Entry = TEPtr.get();
7886
7887 // No need to handle users of gathered values.
7888 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
7889 continue;
7890
7891 // For each lane:
7892 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
7893 Value *Scalar = Entry->Scalars[Lane];
7894 if (!isa<Instruction>(Val: Scalar))
7895 continue;
      // All uses have already been replaced? No need to do it again.
7897 auto It = ScalarToExtUses.find(Val: Scalar);
7898 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
7899 continue;
7900
7901 // Check if the scalar is externally used as an extra arg.
7902 const auto ExtI = ExternallyUsedValues.find(V: Scalar);
7903 if (ExtI != ExternallyUsedValues.end()) {
7904 int FoundLane = Entry->findLaneForValue(V: Scalar);
7905 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
7906 << FoundLane << " from " << *Scalar << ".\n");
7907 ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size());
7908 ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: *Entry, Args&: FoundLane);
7909 continue;
7910 }
7911 for (User *U : Scalar->users()) {
7912 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
7913
7914 Instruction *UserInst = dyn_cast<Instruction>(Val: U);
7915 if (!UserInst || isDeleted(I: UserInst))
7916 continue;
7917
7918 // Ignore users in the user ignore list.
7919 if (UserIgnoreList && UserIgnoreList->contains(V: UserInst))
7920 continue;
7921
7922 // Skip in-tree scalars that become vectors
7923 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(V: U);
7924 !UseEntries.empty()) {
7925 // Some in-tree scalars will remain as scalar in vectorized
7926 // instructions. If that is the case, the one in FoundLane will
7927 // be used.
7928 if (all_of(Range&: UseEntries, P: [&](TreeEntry *UseEntry) {
7929 return UseEntry->State == TreeEntry::ScatterVectorize ||
7930 !doesInTreeUserNeedToExtract(
7931 Scalar, UserInst: getRootEntryInstruction(Entry: *UseEntry), TLI,
7932 TTI);
7933 })) {
7934 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
7935 << ".\n");
7936 assert(none_of(UseEntries,
7937 [](TreeEntry *UseEntry) {
7938 return UseEntry->isGather();
7939 }) &&
7940 "Bad state");
7941 continue;
7942 }
7943 U = nullptr;
7944 if (It != ScalarToExtUses.end()) {
7945 ExternalUses[It->second].User = nullptr;
7946 break;
7947 }
7948 }
7949
7950 if (U && Scalar->hasNUsesOrMore(N: UsesLimit))
7951 U = nullptr;
7952 int FoundLane = Entry->findLaneForValue(V: Scalar);
7953 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
7954 << " from lane " << FoundLane << " from " << *Scalar
7955 << ".\n");
7956 It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
7957 ExternalUses.emplace_back(Args&: Scalar, Args&: U, Args&: *Entry, Args&: FoundLane);
7958 if (!U)
7959 break;
7960 }
7961 }
7962 }
7963}
7964
7965SmallVector<SmallVector<StoreInst *>>
7966BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
7967 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
7968 SmallVector<StoreInst *>, 8>
7969 PtrToStoresMap;
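  // Stores are bucketed by (parent block, stored value type, underlying
  // pointer object), so only stores that could possibly form a single vector
  // store end up in the same bucket.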
7970 for (unsigned Lane : seq<unsigned>(Begin: 0, End: TE->Scalars.size())) {
7971 Value *V = TE->Scalars[Lane];
7972 // Don't iterate over the users of constant data.
7973 if (!isa<Instruction>(Val: V))
7974 continue;
7975 // To save compilation time we don't visit if we have too many users.
7976 if (V->hasNUsesOrMore(N: UsesLimit))
7977 break;
7978
7979 // Collect stores per pointer object.
7980 for (User *U : V->users()) {
7981 auto *SI = dyn_cast<StoreInst>(Val: U);
7982 // Test whether we can handle the store. V might be a global, which could
7983 // be used in a different function.
7984 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
7985 !isValidElementType(Ty: SI->getValueOperand()->getType()))
7986 continue;
      // Skip the store if it has already been vectorized.
7988 if (isVectorized(V: U))
7989 continue;
7990
7991 Value *Ptr =
7992 getUnderlyingObject(V: SI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
7993 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
7994 SI->getValueOperand()->getType(), Ptr}];
7995 // For now just keep one store per pointer object per lane.
7996 // TODO: Extend this to support multiple stores per pointer per lane
7997 if (StoresVec.size() > Lane)
7998 continue;
7999 if (!StoresVec.empty()) {
8000 std::optional<int64_t> Diff = getPointersDiff(
8001 ElemTyA: SI->getValueOperand()->getType(), PtrA: SI->getPointerOperand(),
8002 ElemTyB: SI->getValueOperand()->getType(),
8003 PtrB: StoresVec.front()->getPointerOperand(), DL: *DL, SE&: *SE,
8004 /*StrictCheck=*/true);
8005 // We failed to compare the pointers so just abandon this store.
8006 if (!Diff)
8007 continue;
8008 }
8009 StoresVec.push_back(Elt: SI);
8010 }
8011 }
8012 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
8013 unsigned I = 0;
8014 for (auto &P : PtrToStoresMap) {
8015 Res[I].swap(RHS&: P.second);
8016 ++I;
8017 }
8018 return Res;
8019}
8020
8021bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
8022 OrdersType &ReorderIndices) const {
  // We check whether the stores in StoresVec can form a vector by sorting
  // them and checking whether they are consecutive.
8025
8026 // To avoid calling getPointersDiff() while sorting we create a vector of
8027 // pairs {store, offset from first} and sort this instead.
8028 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
8029 StoreInst *S0 = StoresVec[0];
8030 StoreOffsetVec.emplace_back(Args: 0, Args: 0);
8031 Type *S0Ty = S0->getValueOperand()->getType();
8032 Value *S0Ptr = S0->getPointerOperand();
8033 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoresVec.size())) {
8034 StoreInst *SI = StoresVec[Idx];
8035 std::optional<int64_t> Diff =
8036 getPointersDiff(ElemTyA: S0Ty, PtrA: S0Ptr, ElemTyB: SI->getValueOperand()->getType(),
8037 PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
8038 /*StrictCheck=*/true);
8039 StoreOffsetVec.emplace_back(Args&: *Diff, Args&: Idx);
8040 }
8041
8042 // Check if the stores are consecutive by checking if their difference is 1.
8043 if (StoreOffsetVec.size() != StoresVec.size())
8044 return false;
8045 sort(C&: StoreOffsetVec, Comp: llvm::less_first());
8046 unsigned Idx = 0;
8047 int64_t PrevDist = 0;
8048 for (const auto &P : StoreOffsetVec) {
8049 if (Idx > 0 && P.first != PrevDist + 1)
8050 return false;
8051 PrevDist = P.first;
8052 ++Idx;
8053 }
8054
8055 // Calculate the shuffle indices according to their offset against the sorted
8056 // StoreOffsetVec.
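  // E.g., stores whose offsets from the first store are {0, 2, 1, 3} get the
  // reorder indices <0, 2, 1, 3>, i.e. each store is assigned its rank in the
  // sorted offset sequence (which, for consecutive stores, equals its offset
  // from the first store).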
8057 ReorderIndices.assign(NumElts: StoresVec.size(), Elt: 0);
8058 bool IsIdentity = true;
8059 for (auto [I, P] : enumerate(First&: StoreOffsetVec)) {
8060 ReorderIndices[P.second] = I;
8061 IsIdentity &= P.second == I;
8062 }
8063 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
8064 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
8065 // same convention here.
8066 if (IsIdentity)
8067 ReorderIndices.clear();
8068
8069 return true;
8070}
8071
8072#ifndef NDEBUG
8073LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
8074 for (unsigned Idx : Order)
8075 dbgs() << Idx << ", ";
8076 dbgs() << "\n";
8077}
8078#endif
8079
8080SmallVector<BoUpSLP::OrdersType, 1>
8081BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
8082 unsigned NumLanes = TE->Scalars.size();
8083
8084 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
8085
8086 // Holds the reorder indices for each candidate store vector that is a user of
8087 // the current TreeEntry.
8088 SmallVector<OrdersType, 1> ExternalReorderIndices;
8089
8090 // Now inspect the stores collected per pointer and look for vectorization
8091 // candidates. For each candidate calculate the reorder index vector and push
8092 // it into `ExternalReorderIndices`
8093 for (ArrayRef<StoreInst *> StoresVec : Stores) {
8094 // If we have fewer than NumLanes stores, then we can't form a vector.
8095 if (StoresVec.size() != NumLanes)
8096 continue;
8097
8098 // If the stores are not consecutive then abandon this StoresVec.
8099 OrdersType ReorderIndices;
8100 if (!canFormVector(StoresVec, ReorderIndices))
8101 continue;
8102
8103 // We now know that the scalars in StoresVec can form a vector instruction,
8104 // so set the reorder indices.
8105 ExternalReorderIndices.push_back(Elt: ReorderIndices);
8106 }
8107 return ExternalReorderIndices;
8108}
8109
8110void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
8111 const SmallDenseSet<Value *> &UserIgnoreLst) {
8112 deleteTree();
8113 UserIgnoreList = &UserIgnoreLst;
8114 if (!allSameType(VL: Roots))
8115 return;
8116 buildTreeRec(Roots, Depth: 0, EI: EdgeInfo());
8117}
8118
8119void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
8120 deleteTree();
8121 if (!allSameType(VL: Roots))
8122 return;
8123 buildTreeRec(Roots, Depth: 0, EI: EdgeInfo());
8124}
8125
8126/// Tries to find subvector of loads and builds new vector of only loads if can
8127/// be profitable.
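/// Loads are first clustered by parent block, type and underlying pointer
/// object, keeping at most one load per constant pointer distance, and each
/// cluster is then merged into an existing \p GatheredLoads group when a
/// constant pointer distance to that group can be computed (otherwise a new
/// group is started).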
8128static void gatherPossiblyVectorizableLoads(
8129 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
8130 ScalarEvolution &SE, const TargetTransformInfo &TTI,
8131 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
8132 bool AddNew = true) {
8133 if (VL.empty())
8134 return;
8135 Type *ScalarTy = getValueType(V: VL.front());
8136 if (!isValidElementType(Ty: ScalarTy))
8137 return;
8138 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
8139 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
8140 for (Value *V : VL) {
8141 auto *LI = dyn_cast<LoadInst>(Val: V);
8142 if (!LI)
8143 continue;
8144 if (R.isDeleted(I: LI) || R.isVectorized(V: LI) || !LI->isSimple())
8145 continue;
8146 bool IsFound = false;
8147 for (auto [Map, Data] : zip(t&: ClusteredDistToLoad, u&: ClusteredLoads)) {
8148 assert(LI->getParent() == Data.front().first->getParent() &&
8149 LI->getType() == Data.front().first->getType() &&
8150 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
8151 getUnderlyingObject(Data.front().first->getPointerOperand(),
8152 RecursionMaxDepth) &&
8153 "Expected loads with the same type, same parent and same "
8154 "underlying pointer.");
8155 std::optional<int64_t> Dist = getPointersDiff(
8156 ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(), ElemTyB: Data.front().first->getType(),
8157 PtrB: Data.front().first->getPointerOperand(), DL, SE,
8158 /*StrictCheck=*/true);
8159 if (!Dist)
8160 continue;
8161 auto It = Map.find(Val: *Dist);
8162 if (It != Map.end() && It->second != LI)
8163 continue;
8164 if (It == Map.end()) {
8165 Data.emplace_back(Args&: LI, Args&: *Dist);
8166 Map.try_emplace(Key: *Dist, Args&: LI);
8167 }
8168 IsFound = true;
8169 break;
8170 }
8171 if (!IsFound) {
8172 ClusteredLoads.emplace_back().emplace_back(Args&: LI, Args: 0);
8173 ClusteredDistToLoad.emplace_back().try_emplace(Key: 0, Args&: LI);
8174 }
8175 }
8176 auto FindMatchingLoads =
8177 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
8178 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
8179 &GatheredLoads,
8180 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
8181 int64_t &Offset, unsigned &Start) {
8182 if (Loads.empty())
8183 return GatheredLoads.end();
8184 LoadInst *LI = Loads.front().first;
8185 for (auto [Idx, Data] : enumerate(First&: GatheredLoads)) {
8186 if (Idx < Start)
8187 continue;
8188 ToAdd.clear();
8189 if (LI->getParent() != Data.front().first->getParent() ||
8190 LI->getType() != Data.front().first->getType())
8191 continue;
8192 std::optional<int64_t> Dist =
8193 getPointersDiff(ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(),
8194 ElemTyB: Data.front().first->getType(),
8195 PtrB: Data.front().first->getPointerOperand(), DL, SE,
8196 /*StrictCheck=*/true);
8197 if (!Dist)
8198 continue;
8199 SmallSet<int64_t, 4> DataDists;
8200 SmallPtrSet<LoadInst *, 4> DataLoads;
8201 for (std::pair<LoadInst *, int64_t> P : Data) {
8202 DataDists.insert(V: P.second);
8203 DataLoads.insert(Ptr: P.first);
8204 }
8205 // Found matching gathered loads - check if all loads are unique or
8206 // can be effectively vectorized.
8207 unsigned NumUniques = 0;
8208 for (auto [Cnt, Pair] : enumerate(First&: Loads)) {
8209 bool Used = DataLoads.contains(Ptr: Pair.first);
8210 if (!Used && !DataDists.contains(V: *Dist + Pair.second)) {
8211 ++NumUniques;
8212 ToAdd.insert(X: Cnt);
8213 } else if (Used) {
8214 Repeated.insert(X: Cnt);
8215 }
8216 }
8217 if (NumUniques > 0 &&
8218 (Loads.size() == NumUniques ||
8219 (Loads.size() - NumUniques >= 2 &&
8220 Loads.size() - NumUniques >= Loads.size() / 2 &&
8221 (has_single_bit(Value: Data.size() + NumUniques) ||
8222 bit_ceil(Value: Data.size()) <
8223 bit_ceil(Value: Data.size() + NumUniques))))) {
8224 Offset = *Dist;
8225 Start = Idx + 1;
8226 return std::next(x: GatheredLoads.begin(), n: Idx);
8227 }
8228 }
8229 ToAdd.clear();
8230 return GatheredLoads.end();
8231 };
8232 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
8233 unsigned Start = 0;
8234 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
8235 int64_t Offset = 0;
8236 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
8237 Offset, Start);
8238 while (It != GatheredLoads.end()) {
8239 assert(!LocalToAdd.empty() && "Expected some elements to add.");
8240 for (unsigned Idx : LocalToAdd)
8241 It->emplace_back(Args: Data[Idx].first, Args: Data[Idx].second + Offset);
8242 ToAdd.insert_range(R&: LocalToAdd);
8243 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
8244 Start);
8245 }
8246 if (any_of(Range: seq<unsigned>(Size: Data.size()), P: [&](unsigned Idx) {
8247 return !ToAdd.contains(key: Idx) && !Repeated.contains(key: Idx);
8248 })) {
8249 auto AddNewLoads =
8250 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
8251 for (unsigned Idx : seq<unsigned>(Size: Data.size())) {
8252 if (ToAdd.contains(key: Idx) || Repeated.contains(key: Idx))
8253 continue;
8254 Loads.push_back(Elt: Data[Idx]);
8255 }
8256 };
8257 if (!AddNew) {
8258 LoadInst *LI = Data.front().first;
8259 It = find_if(
8260 Range&: GatheredLoads, P: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
8261 return PD.front().first->getParent() == LI->getParent() &&
8262 PD.front().first->getType() == LI->getType();
8263 });
8264 while (It != GatheredLoads.end()) {
8265 AddNewLoads(*It);
8266 It = std::find_if(
8267 first: std::next(x: It), last: GatheredLoads.end(),
8268 pred: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
8269 return PD.front().first->getParent() == LI->getParent() &&
8270 PD.front().first->getType() == LI->getType();
8271 });
8272 }
8273 }
8274 GatheredLoads.emplace_back().append(in_start: Data.begin(), in_end: Data.end());
8275 AddNewLoads(GatheredLoads.emplace_back());
8276 }
8277 }
8278}
8279
8280void BoUpSLP::tryToVectorizeGatheredLoads(
8281 const SmallMapVector<
8282 std::tuple<BasicBlock *, Value *, Type *>,
8283 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
8284 &GatheredLoads) {
8285 GatheredLoadsEntriesFirst = VectorizableTree.size();
8286
8287 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
8288 LoadEntriesToVectorize.size());
8289 for (auto [Idx, Set] : zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize))
8290 Set.insert_range(R&: VectorizableTree[Idx]->Scalars);
8291
8292 // Sort loads by distance.
8293 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
8294 const std::pair<LoadInst *, int64_t> &L2) {
8295 return L1.second > L2.second;
8296 };
8297
8298 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
8299 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
8300 Loads.size());
8301 Align Alignment = computeCommonAlignment<LoadInst>(VL: Values);
8302 auto *Ty = getWidenedType(ScalarTy: Loads.front()->getType(), VF: Loads.size());
8303 return TTI->isLegalMaskedGather(DataType: Ty, Alignment) &&
8304 !TTI->forceScalarizeMaskedGather(Type: Ty, Alignment);
8305 };
8306
8307 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
8308 BoUpSLP::ValueSet &VectorizedLoads,
8309 SmallVectorImpl<LoadInst *> &NonVectorized,
8310 bool Final, unsigned MaxVF) {
8311 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
8312 unsigned StartIdx = 0;
8313 SmallVector<int> CandidateVFs;
8314 if (VectorizeNonPowerOf2 && has_single_bit(Value: MaxVF + 1))
8315 CandidateVFs.push_back(Elt: MaxVF);
8316 for (int NumElts = getFloorFullVectorNumberOfElements(
8317 TTI: *TTI, Ty: Loads.front()->getType(), Sz: MaxVF);
8318 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
8319 TTI: *TTI, Ty: Loads.front()->getType(), Sz: NumElts - 1)) {
8320 CandidateVFs.push_back(Elt: NumElts);
8321 if (VectorizeNonPowerOf2 && NumElts > 2)
8322 CandidateVFs.push_back(Elt: NumElts - 1);
8323 }
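    // For illustration (assumed full-register widths): with MaxVF = 16 and
    // only power-of-2 vectorization, the candidate VFs collected above are
    // {16, 8, 4, 2}; with non-power-of-2 vectorization enabled, {15, 7, 3}
    // are tried as well.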
8324
8325 if (Final && CandidateVFs.empty())
8326 return Results;
8327
8328 unsigned BestVF = Final ? CandidateVFs.back() : 0;
8329 for (unsigned NumElts : CandidateVFs) {
8330 if (Final && NumElts > BestVF)
8331 continue;
8332 SmallVector<unsigned> MaskedGatherVectorized;
8333      for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E; ++Cnt) {
8335 ArrayRef<LoadInst *> Slice =
8336 ArrayRef(Loads).slice(N: Cnt, M: std::min(a: NumElts, b: E - Cnt));
8337 if (VectorizedLoads.count(Ptr: Slice.front()) ||
8338 VectorizedLoads.count(Ptr: Slice.back()) ||
8339 areKnownNonVectorizableLoads(VL: Slice))
8340 continue;
8341        // Check if it is profitable to try vectorizing gathered loads. It is
8342        // profitable if we have more than 3 consecutive loads or if we have
8343        // fewer, but all their users are vectorized or deleted.
8344 bool AllowToVectorize = false;
8345        // Check if it is profitable to vectorize 2-element loads.
8346 if (NumElts == 2) {
8347 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
8348 ElementTy: Slice.front()->getType(), NumElements: ElementCount::getFixed(MinVal: NumElts));
8349 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
8350 for (LoadInst *LI : Slice) {
8351 // If single use/user - allow to vectorize.
8352 if (LI->hasOneUse())
8353 continue;
8354              // 1. The number of uses equals the number of users.
8355              // 2. All users are deleted.
8356              // 3. Load broadcasts are not legal or the load is not used as
8357              // a broadcast.
8358 if (static_cast<unsigned int>(std::distance(
8359 first: LI->user_begin(), last: LI->user_end())) != LI->getNumUses())
8360 return false;
8361 if (!IsLegalBroadcastLoad)
8362 continue;
8363 if (LI->hasNUsesOrMore(N: UsesLimit))
8364 return false;
8365 for (User *U : LI->users()) {
8366 if (auto *UI = dyn_cast<Instruction>(Val: U); UI && isDeleted(I: UI))
8367 continue;
8368 for (const TreeEntry *UTE : getTreeEntries(V: U)) {
8369 for (int I : seq<int>(Size: UTE->getNumOperands())) {
8370 if (all_of(Range: UTE->getOperand(OpIdx: I), P: [LI](Value *V) {
8371 return V == LI || isa<PoisonValue>(Val: V);
8372 }))
8373 // Found legal broadcast - do not vectorize.
8374 return false;
8375 }
8376 }
8377 }
8378 }
8379 return true;
8380 };
8381 AllowToVectorize = CheckIfAllowed(Slice);
8382 } else {
8383 AllowToVectorize =
8384 (NumElts >= 3 ||
8385 any_of(Range: ValueToGatherNodes.at(Val: Slice.front()),
8386 P: [=](const TreeEntry *TE) {
8387 return TE->Scalars.size() == 2 &&
8388 ((TE->Scalars.front() == Slice.front() &&
8389 TE->Scalars.back() == Slice.back()) ||
8390 (TE->Scalars.front() == Slice.back() &&
8391 TE->Scalars.back() == Slice.front()));
8392 })) &&
8393 hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(),
8394 Sz: Slice.size());
8395 }
8396 if (AllowToVectorize) {
8397 SmallVector<Value *> PointerOps;
8398 OrdersType CurrentOrder;
8399 // Try to build vector load.
8400 ArrayRef<Value *> Values(
8401 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
8402 LoadsState LS = canVectorizeLoads(VL: Values, VL0: Slice.front(), Order&: CurrentOrder,
8403 PointerOps, BestVF: &BestVF);
8404 if (LS != LoadsState::Gather ||
8405 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
8406 if (LS == LoadsState::ScatterVectorize) {
8407 if (MaskedGatherVectorized.empty() ||
8408 Cnt >= MaskedGatherVectorized.back() + NumElts)
8409 MaskedGatherVectorized.push_back(Elt: Cnt);
8410 continue;
8411 }
8412 if (LS != LoadsState::Gather) {
8413 Results.emplace_back(Args&: Values, Args&: LS);
8414 VectorizedLoads.insert_range(R&: Slice);
8415 // If we vectorized initial block, no need to try to vectorize it
8416 // again.
8417 if (Cnt == StartIdx)
8418 StartIdx += NumElts;
8419 }
8420 // Check if the whole array was vectorized already - exit.
8421 if (StartIdx >= Loads.size())
8422 break;
8423 // Erase last masked gather candidate, if another candidate within
8424 // the range is found to be better.
8425 if (!MaskedGatherVectorized.empty() &&
8426 Cnt < MaskedGatherVectorized.back() + NumElts)
8427 MaskedGatherVectorized.pop_back();
8428 Cnt += NumElts - 1;
8429 continue;
8430 }
8431 }
8432 if (!AllowToVectorize || BestVF == 0)
8433 registerNonVectorizableLoads(VL: Slice);
8434 }
8435      // Mark masked gather candidates as vectorized, if any.
8436 for (unsigned Cnt : MaskedGatherVectorized) {
8437 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
8438 N: Cnt, M: std::min<unsigned>(a: NumElts, b: Loads.size() - Cnt));
8439 ArrayRef<Value *> Values(
8440 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
8441 Results.emplace_back(Args&: Values, Args: LoadsState::ScatterVectorize);
8442 VectorizedLoads.insert_range(R&: Slice);
8443 // If we vectorized initial block, no need to try to vectorize it again.
8444 if (Cnt == StartIdx)
8445 StartIdx += NumElts;
8446 }
8447 }
8448 for (LoadInst *LI : Loads) {
8449 if (!VectorizedLoads.contains(Ptr: LI))
8450 NonVectorized.push_back(Elt: LI);
8451 }
8452 return Results;
8453 };
8454 auto ProcessGatheredLoads =
8455 [&, &TTI = *TTI](
8456 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
8457 bool Final = false) {
8458 SmallVector<LoadInst *> NonVectorized;
8459 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
8460 GatheredLoads) {
8461 if (LoadsDists.size() <= 1) {
8462 NonVectorized.push_back(Elt: LoadsDists.back().first);
8463 continue;
8464 }
8465 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
8466 LoadsDists);
8467 SmallVector<LoadInst *> OriginalLoads(make_first_range(c&: LoadsDists));
8468 stable_sort(Range&: LocalLoadsDists, C: LoadSorter);
8469 SmallVector<LoadInst *> Loads;
8470 unsigned MaxConsecutiveDistance = 0;
8471 unsigned CurrentConsecutiveDist = 1;
8472 int64_t LastDist = LocalLoadsDists.front().second;
8473 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
8474 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
8475 if (isVectorized(V: L.first))
8476 continue;
8477 assert(LastDist >= L.second &&
8478 "Expected first distance always not less than second");
8479 if (static_cast<uint64_t>(LastDist - L.second) ==
8480 CurrentConsecutiveDist) {
8481 ++CurrentConsecutiveDist;
8482 MaxConsecutiveDistance =
8483 std::max(a: MaxConsecutiveDistance, b: CurrentConsecutiveDist);
8484 Loads.push_back(Elt: L.first);
8485 continue;
8486 }
8487 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
8488 !Loads.empty())
8489 Loads.pop_back();
8490 CurrentConsecutiveDist = 1;
8491 LastDist = L.second;
8492 Loads.push_back(Elt: L.first);
8493 }
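          // E.g. (illustrative distances): after the descending sort, the
          // distances {7, 6, 5, 3, 2} form a consecutive run {7, 6, 5}
          // (MaxConsecutiveDistance == 3) followed by a second run {3, 2}.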
8494 if (Loads.size() <= 1)
8495 continue;
8496 if (AllowMaskedGather)
8497 MaxConsecutiveDistance = Loads.size();
8498 else if (MaxConsecutiveDistance < 2)
8499 continue;
8500 BoUpSLP::ValueSet VectorizedLoads;
8501 SmallVector<LoadInst *> SortedNonVectorized;
8502 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
8503 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
8504 Final, MaxConsecutiveDistance);
8505 if (!Results.empty() && !SortedNonVectorized.empty() &&
8506 OriginalLoads.size() == Loads.size() &&
8507 MaxConsecutiveDistance == Loads.size() &&
8508 all_of(Range&: Results,
8509 P: [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
8510 return P.second == LoadsState::ScatterVectorize;
8511 })) {
8512 VectorizedLoads.clear();
8513 SmallVector<LoadInst *> UnsortedNonVectorized;
8514 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
8515 UnsortedResults =
8516 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
8517 UnsortedNonVectorized, Final,
8518 OriginalLoads.size());
8519 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
8520 SortedNonVectorized.swap(RHS&: UnsortedNonVectorized);
8521 Results.swap(RHS&: UnsortedResults);
8522 }
8523 }
8524 for (auto [Slice, _] : Results) {
8525 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
8526 << Slice.size() << ")\n");
8527 if (any_of(Range&: Slice, P: [&](Value *V) { return isVectorized(V); })) {
8528 for (Value *L : Slice)
8529 if (!isVectorized(V: L))
8530 SortedNonVectorized.push_back(Elt: cast<LoadInst>(Val: L));
8531 continue;
8532 }
8533
8534            // Select the maximum VF as the maximum of the user gather nodes'
8535            // sizes and the distance between scalar loads in these nodes.
8536 unsigned MaxVF = Slice.size();
8537 unsigned UserMaxVF = 0;
8538 unsigned InterleaveFactor = 0;
8539 if (MaxVF == 2) {
8540 UserMaxVF = MaxVF;
8541 } else {
8542              // Find the distance between segments of the interleaved loads.
8543 std::optional<unsigned> InterleavedLoadsDistance = 0;
8544 unsigned Order = 0;
8545 std::optional<unsigned> CommonVF = 0;
8546 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
8547 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
8548 for (auto [Idx, V] : enumerate(First&: Slice)) {
8549 for (const TreeEntry *E : ValueToGatherNodes.at(Val: V)) {
8550 UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: E->Scalars.size());
8551 unsigned Pos =
8552 EntryToPosition.try_emplace(Key: E, Args&: Idx).first->second;
8553 UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: Idx - Pos + 1);
8554 if (CommonVF) {
8555 if (*CommonVF == 0) {
8556 CommonVF = E->Scalars.size();
8557 continue;
8558 }
8559 if (*CommonVF != E->Scalars.size())
8560 CommonVF.reset();
8561 }
8562                  // Check if the load is part of an interleaved load.
8563 if (Pos != Idx && InterleavedLoadsDistance) {
8564 if (!DeinterleavedNodes.contains(Ptr: E) &&
8565 any_of(Range: E->Scalars, P: [&, Slice = Slice](Value *V) {
8566 if (isa<Constant>(Val: V))
8567 return false;
8568 if (isVectorized(V))
8569 return true;
8570 const auto &Nodes = ValueToGatherNodes.at(Val: V);
8571 return (Nodes.size() != 1 || !Nodes.contains(key: E)) &&
8572 !is_contained(Range: Slice, Element: V);
8573 })) {
8574 InterleavedLoadsDistance.reset();
8575 continue;
8576 }
8577 DeinterleavedNodes.insert(Ptr: E);
8578 if (*InterleavedLoadsDistance == 0) {
8579 InterleavedLoadsDistance = Idx - Pos;
8580 continue;
8581 }
8582 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
8583 (Idx - Pos) / *InterleavedLoadsDistance < Order)
8584 InterleavedLoadsDistance.reset();
8585 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(u: 1);
8586 }
8587 }
8588 }
8589 DeinterleavedNodes.clear();
8590              // Check if the large load represents an interleaved load operation.
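              // Illustrative example (assumed gather nodes): consecutive
              // loads a[0..3] feeding two 2-element gather nodes
              // {a[0], a[2]} and {a[1], a[3]} give
              // InterleavedLoadsDistance == 2 and CommonVF == 2, i.e. a
              // 2-way interleaved access that is vectorized with
              // UserMaxVF == 4 if the target supports it.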
8591 if (InterleavedLoadsDistance.value_or(u: 0) > 1 &&
8592 CommonVF.value_or(u: 0) != 0) {
8593 InterleaveFactor = bit_ceil(Value: *InterleavedLoadsDistance);
8594 unsigned VF = *CommonVF;
8595 OrdersType Order;
8596 SmallVector<Value *> PointerOps;
8597 // Segmented load detected - vectorize at maximum vector factor.
8598 if (InterleaveFactor <= Slice.size() &&
8599 TTI.isLegalInterleavedAccessType(
8600 VTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF),
8601 Factor: InterleaveFactor,
8602 Alignment: cast<LoadInst>(Val: Slice.front())->getAlign(),
8603 AddrSpace: cast<LoadInst>(Val: Slice.front())
8604 ->getPointerAddressSpace()) &&
8605 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order,
8606 PointerOps) == LoadsState::Vectorize) {
8607 UserMaxVF = InterleaveFactor * VF;
8608 } else {
8609 InterleaveFactor = 0;
8610 }
8611 }
8612 // Cannot represent the loads as consecutive vectorizable nodes -
8613 // just exit.
8614 unsigned ConsecutiveNodesSize = 0;
8615 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
8616 any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize),
8617 P: [&, Slice = Slice](const auto &P) {
8618 const auto *It = find_if(Slice, [&](Value *V) {
8619 return std::get<1>(P).contains(V);
8620 });
8621 if (It == Slice.end())
8622 return false;
8623 const TreeEntry &TE =
8624 *VectorizableTree[std::get<0>(P)];
8625 ArrayRef<Value *> VL = TE.Scalars;
8626 OrdersType Order;
8627 SmallVector<Value *> PointerOps;
8628 LoadsState State = canVectorizeLoads(
8629 VL, VL0: VL.front(), Order, PointerOps);
8630 if (State == LoadsState::ScatterVectorize ||
8631 State == LoadsState::CompressVectorize)
8632 return false;
8633 ConsecutiveNodesSize += VL.size();
8634 unsigned Start = std::distance(Slice.begin(), It);
8635 unsigned Sz = Slice.size() - Start;
8636 return Sz < VL.size() ||
8637 Slice.slice(std::distance(Slice.begin(), It),
8638 VL.size()) != VL;
8639 }))
8640 continue;
8641 // Try to build long masked gather loads.
8642 UserMaxVF = bit_ceil(Value: UserMaxVF);
8643 if (InterleaveFactor == 0 &&
8644 any_of(Range: seq<unsigned>(Size: Slice.size() / UserMaxVF),
8645 P: [&, Slice = Slice](unsigned Idx) {
8646 OrdersType Order;
8647 SmallVector<Value *> PointerOps;
8648 return canVectorizeLoads(
8649 VL: Slice.slice(N: Idx * UserMaxVF, M: UserMaxVF),
8650 VL0: Slice[Idx * UserMaxVF], Order,
8651 PointerOps) ==
8652 LoadsState::ScatterVectorize;
8653 }))
8654 UserMaxVF = MaxVF;
8655 if (Slice.size() != ConsecutiveNodesSize)
8656 MaxVF = std::min<unsigned>(a: MaxVF, b: UserMaxVF);
8657 }
8658 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
8659 bool IsVectorized = true;
8660 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
8661 ArrayRef<Value *> SubSlice =
8662 Slice.slice(N: I, M: std::min(a: VF, b: E - I));
8663 if (isVectorized(V: SubSlice.front()))
8664 continue;
8665                // Check if the subslice belongs to a to-be-vectorized entry
8666                // that is not equal to this entry.
8667 if (any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize),
8668 P: [&](const auto &P) {
8669 return !SubSlice.equals(
8670 RHS: VectorizableTree[std::get<0>(P)]
8671 ->Scalars) &&
8672 set_is_subset(SubSlice, std::get<1>(P));
8673 }))
8674 continue;
8675 unsigned Sz = VectorizableTree.size();
8676 buildTreeRec(Roots: SubSlice, Depth: 0, EI: EdgeInfo(), InterleaveFactor);
8677 if (Sz == VectorizableTree.size()) {
8678 IsVectorized = false;
8679 // Try non-interleaved vectorization with smaller vector
8680 // factor.
8681 if (InterleaveFactor > 0) {
8682 VF = 2 * (MaxVF / InterleaveFactor);
8683 InterleaveFactor = 0;
8684 }
8685 continue;
8686 }
8687 }
8688 if (IsVectorized)
8689 break;
8690 }
8691 }
8692 NonVectorized.append(RHS: SortedNonVectorized);
8693 }
8694 return NonVectorized;
8695 };
8696 for (const auto &GLs : GatheredLoads) {
8697 const auto &Ref = GLs.second;
8698 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
8699 if (!Ref.empty() && !NonVectorized.empty() &&
8700 std::accumulate(
8701 first: Ref.begin(), last: Ref.end(), init: 0u,
8702 binary_op: [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
8703 -> unsigned { return S + LoadsDists.size(); }) !=
8704 NonVectorized.size() &&
8705 IsMaskedGatherSupported(NonVectorized)) {
8706 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
8707 FinalGatheredLoads;
8708 for (LoadInst *LI : NonVectorized) {
8709        // Reinsert non-vectorized loads into other lists of loads with the same
8710        // base pointers.
8711 gatherPossiblyVectorizableLoads(R: *this, VL: LI, DL: *DL, SE&: *SE, TTI: *TTI,
8712 GatheredLoads&: FinalGatheredLoads,
8713 /*AddNew=*/false);
8714 }
8715 // Final attempt to vectorize non-vectorized loads.
8716 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
8717 }
8718 }
8719 // Try to vectorize postponed load entries, previously marked as gathered.
8720 for (unsigned Idx : LoadEntriesToVectorize) {
8721 const TreeEntry &E = *VectorizableTree[Idx];
8722 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
8723 // Avoid reordering, if possible.
8724 if (!E.ReorderIndices.empty()) {
8725 // Build a mask out of the reorder indices and reorder scalars per this
8726 // mask.
8727 SmallVector<int> ReorderMask;
8728 inversePermutation(Indices: E.ReorderIndices, Mask&: ReorderMask);
8729 reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
8730 }
8731 buildTreeRec(Roots: GatheredScalars, Depth: 0, EI: EdgeInfo());
8732 }
8733  // If no new entries were created, there are no gathered-load entries to
8734  // handle.
8735 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
8736 VectorizableTree.size())
8737 GatheredLoadsEntriesFirst.reset();
8738}
8739
8740/// Generates a key/subkey pair for the given value to provide effective
8741/// sorting of the values and better detection of vectorizable value
8742/// sequences. The keys can be used for sorting the values themselves (keys)
8743/// and within value subgroups (subkeys).
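/// For example (illustrative): two simple loads of the same type share the
/// same key (derived from the Load opcode and type) and get subkeys from
/// \p LoadsSubkeyGenerator (typically based on pointer distance), while an
/// sdiv by a non-constant divisor receives a per-instruction subkey and is
/// kept apart.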
8744static std::pair<size_t, size_t> generateKeySubkey(
8745 Value *V, const TargetLibraryInfo *TLI,
8746 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
8747 bool AllowAlternate) {
8748 hash_code Key = hash_value(value: V->getValueID() + 2);
8749 hash_code SubKey = hash_value(value: 0);
8750 // Sort the loads by the distance between the pointers.
8751 if (auto *LI = dyn_cast<LoadInst>(Val: V)) {
8752 Key = hash_combine(args: LI->getType(), args: hash_value(value: Instruction::Load), args: Key);
8753 if (LI->isSimple())
8754 SubKey = hash_value(code: LoadsSubkeyGenerator(Key, LI));
8755 else
8756 Key = SubKey = hash_value(ptr: LI);
8757 } else if (isVectorLikeInstWithConstOps(V)) {
8758 // Sort extracts by the vector operands.
8759 if (isa<ExtractElementInst, UndefValue>(Val: V))
8760 Key = hash_value(value: Value::UndefValueVal + 1);
8761 if (auto *EI = dyn_cast<ExtractElementInst>(Val: V)) {
8762 if (!isUndefVector(V: EI->getVectorOperand()).all() &&
8763 !isa<UndefValue>(Val: EI->getIndexOperand()))
8764 SubKey = hash_value(ptr: EI->getVectorOperand());
8765 }
8766 } else if (auto *I = dyn_cast<Instruction>(Val: V)) {
8767 // Sort other instructions just by the opcodes except for CMPInst.
8768 // For CMP also sort by the predicate kind.
8769 if ((isa<BinaryOperator, CastInst>(Val: I)) &&
8770 isValidForAlternation(Opcode: I->getOpcode())) {
8771 if (AllowAlternate)
8772 Key = hash_value(value: isa<BinaryOperator>(Val: I) ? 1 : 0);
8773 else
8774 Key = hash_combine(args: hash_value(value: I->getOpcode()), args: Key);
8775 SubKey = hash_combine(
8776 args: hash_value(value: I->getOpcode()), args: hash_value(ptr: I->getType()),
8777 args: hash_value(ptr: isa<BinaryOperator>(Val: I)
8778 ? I->getType()
8779 : cast<CastInst>(Val: I)->getOperand(i_nocapture: 0)->getType()));
8780 // For casts, look through the only operand to improve compile time.
8781 if (isa<CastInst>(Val: I)) {
8782 std::pair<size_t, size_t> OpVals =
8783 generateKeySubkey(V: I->getOperand(i: 0), TLI, LoadsSubkeyGenerator,
8784 /*AllowAlternate=*/true);
8785 Key = hash_combine(args: OpVals.first, args: Key);
8786 SubKey = hash_combine(args: OpVals.first, args: SubKey);
8787 }
8788 } else if (auto *CI = dyn_cast<CmpInst>(Val: I)) {
8789 CmpInst::Predicate Pred = CI->getPredicate();
8790 if (CI->isCommutative())
8791 Pred = std::min(a: Pred, b: CmpInst::getInversePredicate(pred: Pred));
8792 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(pred: Pred);
8793 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: Pred),
8794 args: hash_value(value: SwapPred),
8795 args: hash_value(ptr: CI->getOperand(i_nocapture: 0)->getType()));
8796 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
8797 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI);
8798 if (isTriviallyVectorizable(ID)) {
8799 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: ID));
8800 } else if (!VFDatabase(*Call).getMappings(CI: *Call).empty()) {
8801 SubKey = hash_combine(args: hash_value(value: I->getOpcode()),
8802 args: hash_value(ptr: Call->getCalledFunction()));
8803 } else {
8804 Key = hash_combine(args: hash_value(ptr: Call), args: Key);
8805 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(ptr: Call));
8806 }
8807 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
8808 SubKey = hash_combine(args: hash_value(value: Op.Begin), args: hash_value(value: Op.End),
8809 args: hash_value(ptr: Op.Tag), args: SubKey);
8810 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
8811 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Val: Gep->getOperand(i_nocapture: 1)))
8812 SubKey = hash_value(ptr: Gep->getPointerOperand());
8813 else
8814 SubKey = hash_value(ptr: Gep);
8815 } else if (BinaryOperator::isIntDivRem(Opcode: I->getOpcode()) &&
8816 !isa<ConstantInt>(Val: I->getOperand(i: 1))) {
8817 // Do not try to vectorize instructions with potentially high cost.
8818 SubKey = hash_value(ptr: I);
8819 } else {
8820 SubKey = hash_value(value: I->getOpcode());
8821 }
8822 Key = hash_combine(args: hash_value(ptr: I->getParent()), args: Key);
8823 }
8824 return std::make_pair(x&: Key, y&: SubKey);
8825}
8826
8827/// Checks if the specified instruction \p I is a main operation for the given
8828/// \p MainOp and \p AltOp instructions.
8829static bool isMainInstruction(Instruction *I, Instruction *MainOp,
8830 Instruction *AltOp, const TargetLibraryInfo &TLI);
8831
8832bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
8833 ArrayRef<Value *> VL) const {
8834 Type *ScalarTy = S.getMainOp()->getType();
8835 unsigned Opcode0 = S.getOpcode();
8836 unsigned Opcode1 = S.getAltOpcode();
8837 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
8838 // If this pattern is supported by the target then consider it profitable.
8839 if (TTI->isLegalAltInstr(VecTy: getWidenedType(ScalarTy, VF: VL.size()), Opcode0,
8840 Opcode1, OpcodeMask))
8841 return true;
8842 SmallVector<ValueList> Operands;
8843 for (unsigned I : seq<unsigned>(Size: S.getMainOp()->getNumOperands())) {
8844 Operands.emplace_back();
8845 // Prepare the operand vector.
8846 for (Value *V : VL) {
8847 if (isa<PoisonValue>(Val: V)) {
8848 Operands.back().push_back(
8849 Elt: PoisonValue::get(T: S.getMainOp()->getOperand(i: I)->getType()));
8850 continue;
8851 }
8852 Operands.back().push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
8853 }
8854 }
8855 if (Operands.size() == 2) {
8856    // Try to find the best operand candidates.
8857 for (unsigned I : seq<unsigned>(Begin: 0, End: VL.size() - 1)) {
8858 SmallVector<std::pair<Value *, Value *>> Candidates(3);
8859 Candidates[0] = std::make_pair(x&: Operands[0][I], y&: Operands[0][I + 1]);
8860 Candidates[1] = std::make_pair(x&: Operands[0][I], y&: Operands[1][I + 1]);
8861 Candidates[2] = std::make_pair(x&: Operands[1][I], y&: Operands[0][I + 1]);
8862 std::optional<int> Res = findBestRootPair(Candidates);
8863 switch (Res.value_or(u: 0)) {
8864 case 0:
8865 break;
8866 case 1:
8867 std::swap(a&: Operands[0][I + 1], b&: Operands[1][I + 1]);
8868 break;
8869 case 2:
8870 std::swap(a&: Operands[0][I], b&: Operands[1][I]);
8871 break;
8872 default:
8873 llvm_unreachable("Unexpected index.");
8874 }
8875 }
8876 }
8877 DenseSet<unsigned> UniqueOpcodes;
8878 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
8879 unsigned NonInstCnt = 0;
8880  // Estimate the number of instructions required for the vectorized node and
8881  // for the buildvector node.
8882 unsigned UndefCnt = 0;
8883  // Count the number of extra shuffles required for the vector nodes.
8884 unsigned ExtraShuffleInsts = 0;
8885  // Check that the operands do not contain the same values and form either a
8886  // perfect diamond match or a shuffled match.
8887 if (Operands.size() == 2) {
8888 // Do not count same operands twice.
8889 if (Operands.front() == Operands.back()) {
8890 Operands.erase(CI: Operands.begin());
8891 } else if (!allConstant(VL: Operands.front()) &&
8892 all_of(Range&: Operands.front(), P: [&](Value *V) {
8893 return is_contained(Range&: Operands.back(), Element: V);
8894 })) {
8895 Operands.erase(CI: Operands.begin());
8896 ++ExtraShuffleInsts;
8897 }
8898 }
8899 const Loop *L = LI->getLoopFor(BB: S.getMainOp()->getParent());
8900  // Vectorize the node if:
8901  // 1. At least a single operand is constant or a splat.
8902  // 2. Operands have many loop invariants (while the instructions themselves
8903  // are not loop invariant).
8904  // 3. At least a single unique operand is expected to be vectorized.
8905 return none_of(Range&: Operands,
8906 P: [&](ArrayRef<Value *> Op) {
8907 if (allConstant(VL: Op) ||
8908 (!isSplat(VL: Op) && allSameBlock(VL: Op) && allSameType(VL: Op) &&
8909 getSameOpcode(VL: Op, TLI: *TLI)))
8910 return false;
8911 DenseMap<Value *, unsigned> Uniques;
8912 for (Value *V : Op) {
8913 if (isa<Constant, ExtractElementInst>(Val: V) ||
8914 isVectorized(V) || (L && L->isLoopInvariant(V))) {
8915 if (isa<UndefValue>(Val: V))
8916 ++UndefCnt;
8917 continue;
8918 }
8919 auto Res = Uniques.try_emplace(Key: V, Args: 0);
8920 // Found first duplicate - need to add shuffle.
8921 if (!Res.second && Res.first->second == 1)
8922 ++ExtraShuffleInsts;
8923 ++Res.first->getSecond();
8924 if (auto *I = dyn_cast<Instruction>(Val: V))
8925 UniqueOpcodes.insert(V: I->getOpcode());
8926 else if (Res.second)
8927 ++NonInstCnt;
8928 }
8929 return none_of(Range&: Uniques, P: [&](const auto &P) {
8930 return P.first->hasNUsesOrMore(P.second + 1) &&
8931 none_of(P.first->users(), [&](User *U) {
8932 return isVectorized(V: U) || Uniques.contains(Val: U);
8933 });
8934 });
8935 }) ||
8936         // Do not vectorize the node if the estimated number of vector
8937         // instructions exceeds the estimated number of buildvector
8938         // instructions. The number of vector operations is the number of
8939         // vector instructions plus the vector instructions for the operands
8940         // (buildvectors); the buildvector estimate is number_of_operands * number_of_scalars.
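         // For instance (illustrative numbers): with 4 scalars of 2 operands
         // each, the buildvector estimate is 2 * 4 = 8, so this fallback check
         // accepts the node only if UniqueOpcodes + NonInstCnt +
         // ExtraShuffleInsts + 3 (main + alt + shuffle) stays below 8 and
         // undefs do not dominate the operands.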
8941 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
8942 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
8943 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
8944}
8945
8946/// Builds the argument types vector for the given call instruction with the
8947/// given \p ID for the specified vector factor.
8948static SmallVector<Type *>
8949buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
8950 const unsigned VF, unsigned MinBW,
8951 const TargetTransformInfo *TTI) {
8952 SmallVector<Type *> ArgTys;
8953 for (auto [Idx, Arg] : enumerate(First: CI->args())) {
8954 if (ID != Intrinsic::not_intrinsic) {
8955 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI)) {
8956 ArgTys.push_back(Elt: Arg->getType());
8957 continue;
8958 }
8959 if (MinBW > 0) {
8960 ArgTys.push_back(
8961 Elt: getWidenedType(ScalarTy: IntegerType::get(C&: CI->getContext(), NumBits: MinBW), VF));
8962 continue;
8963 }
8964 }
8965 ArgTys.push_back(Elt: getWidenedType(ScalarTy: Arg->getType(), VF));
8966 }
8967 return ArgTys;
8968}
8969
8970/// Calculates the costs of the vectorized intrinsic call (if possible) and the
8971/// vectorized library function call (if possible). Returns an invalid cost for
8972/// the corresponding call if it cannot be vectorized or will be scalarized.
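/// For example (hypothetical scenario): for a bundle of 4 calls to
/// llvm.fabs.f32, the cost of the llvm.fabs.v4f32 intrinsic is computed and,
/// if TLI provides a vector library mapping, the cost of that library call is
/// returned as the second member; the caller treats the bundle as
/// vectorizable if either cost is valid.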
8973static std::pair<InstructionCost, InstructionCost>
8974getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
8975 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8976 ArrayRef<Type *> ArgTys) {
8977 auto Shape = VFShape::get(FTy: CI->getFunctionType(),
8978 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
8979 HasGlobalPred: false /*HasGlobalPred*/);
8980 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
8981 auto LibCost = InstructionCost::getInvalid();
8982 if (!CI->isNoBuiltin() && VecFunc) {
8983 // Calculate the cost of the vector library call.
8984 // If the corresponding vector call is cheaper, return its cost.
8985 LibCost =
8986 TTI->getCallInstrCost(F: nullptr, RetTy: VecTy, Tys: ArgTys, CostKind: TTI::TCK_RecipThroughput);
8987 }
8988 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8989
8990 // Calculate the cost of the vector intrinsic call.
8991 FastMathFlags FMF;
8992 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: CI))
8993 FMF = FPCI->getFastMathFlags();
8994 const InstructionCost ScalarLimit = 10000;
8995 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
8996 LibCost.isValid() ? LibCost : ScalarLimit);
8997 auto IntrinsicCost =
8998 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind: TTI::TCK_RecipThroughput);
8999 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
9000 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
9001 IntrinsicCost = InstructionCost::getInvalid();
9002
9003 return {IntrinsicCost, LibCost};
9004}
9005
9006BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
9007 const InstructionsState &S, ArrayRef<Value *> VL,
9008 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
9009 SmallVectorImpl<Value *> &PointerOps) {
9010 assert(S.getMainOp() &&
9011 "Expected instructions with same/alternate opcodes only.");
9012
9013 unsigned ShuffleOrOp =
9014 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9015 Instruction *VL0 = S.getMainOp();
9016 switch (ShuffleOrOp) {
9017 case Instruction::PHI: {
9018 // Too many operands - gather, most probably won't be vectorized.
9019 if (VL0->getNumOperands() > MaxPHINumOperands)
9020 return TreeEntry::NeedToGather;
9021 // Check for terminator values (e.g. invoke).
9022 for (Value *V : VL) {
9023 auto *PHI = dyn_cast<PHINode>(Val: V);
9024 if (!PHI)
9025 continue;
9026 for (Value *Incoming : PHI->incoming_values()) {
9027 Instruction *Term = dyn_cast<Instruction>(Val: Incoming);
9028 if (Term && Term->isTerminator()) {
9029 LLVM_DEBUG(dbgs()
9030 << "SLP: Need to swizzle PHINodes (terminator use).\n");
9031 return TreeEntry::NeedToGather;
9032 }
9033 }
9034 }
9035
9036 return TreeEntry::Vectorize;
9037 }
9038 case Instruction::ExtractElement:
9039 if (any_of(Range&: VL, P: [&](Value *V) {
9040 auto *EI = dyn_cast<ExtractElementInst>(Val: V);
9041 if (!EI)
9042 return true;
9043 return isVectorized(V: EI->getOperand(i_nocapture: 0));
9044 }))
9045 return TreeEntry::NeedToGather;
9046 [[fallthrough]];
9047 case Instruction::ExtractValue: {
9048 bool Reuse = canReuseExtract(VL, CurrentOrder);
9049 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
9050 // non-full registers).
9051 if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: VL0->getType(), Sz: VL.size()))
9052 return TreeEntry::NeedToGather;
9053 if (Reuse || !CurrentOrder.empty())
9054 return TreeEntry::Vectorize;
9055 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
9056 return TreeEntry::NeedToGather;
9057 }
9058 case Instruction::InsertElement: {
9059 // Check that we have a buildvector and not a shuffle of 2 or more
9060 // different vectors.
9061 ValueSet SourceVectors;
9062 for (Value *V : VL) {
9063 SourceVectors.insert(Ptr: cast<Instruction>(Val: V)->getOperand(i: 0));
9064 assert(getElementIndex(V) != std::nullopt &&
9065 "Non-constant or undef index?");
9066 }
9067
9068 if (count_if(Range&: VL, P: [&SourceVectors](Value *V) {
9069 return !SourceVectors.contains(Ptr: V);
9070 }) >= 2) {
9071 // Found 2nd source vector - cancel.
9072 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9073 "different source vectors.\n");
9074 return TreeEntry::NeedToGather;
9075 }
9076
9077 if (any_of(Range&: VL, P: [&SourceVectors](Value *V) {
9078 // The last InsertElement can have multiple uses.
9079 return SourceVectors.contains(Ptr: V) && !V->hasOneUse();
9080 })) {
9081 assert(SLPReVec && "Only supported by REVEC.");
9082 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
9083 "multiple uses.\n");
9084 return TreeEntry::NeedToGather;
9085 }
9086
9087 return TreeEntry::Vectorize;
9088 }
9089 case Instruction::Load: {
9090 // Check that a vectorized load would load the same memory as a scalar
9091 // load. For example, we don't want to vectorize loads that are smaller
9092 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
9093 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
9094 // from such a struct, we read/write packed bits disagreeing with the
9095 // unvectorized version.
9096 auto IsGatheredNode = [&]() {
9097 if (!GatheredLoadsEntriesFirst)
9098 return false;
9099 return all_of(Range&: VL, P: [&](Value *V) {
9100 if (isa<PoisonValue>(Val: V))
9101 return true;
9102 return any_of(Range: getTreeEntries(V), P: [&](const TreeEntry *TE) {
9103 return TE->Idx >= *GatheredLoadsEntriesFirst;
9104 });
9105 });
9106 };
9107 switch (canVectorizeLoads(VL, VL0, Order&: CurrentOrder, PointerOps)) {
9108 case LoadsState::Vectorize:
9109 return TreeEntry::Vectorize;
9110 case LoadsState::CompressVectorize:
9111 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9112 // Delay slow vectorized nodes for better vectorization attempts.
9113 LoadEntriesToVectorize.insert(X: VectorizableTree.size());
9114 return TreeEntry::NeedToGather;
9115 }
9116 return IsGatheredNode() ? TreeEntry::NeedToGather
9117 : TreeEntry::CompressVectorize;
9118 case LoadsState::ScatterVectorize:
9119 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
9120 // Delay slow vectorized nodes for better vectorization attempts.
9121 LoadEntriesToVectorize.insert(X: VectorizableTree.size());
9122 return TreeEntry::NeedToGather;
9123 }
9124 return IsGatheredNode() ? TreeEntry::NeedToGather
9125 : TreeEntry::ScatterVectorize;
9126 case LoadsState::StridedVectorize:
9127 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
9128 // Delay slow vectorized nodes for better vectorization attempts.
9129 LoadEntriesToVectorize.insert(X: VectorizableTree.size());
9130 return TreeEntry::NeedToGather;
9131 }
9132 return IsGatheredNode() ? TreeEntry::NeedToGather
9133 : TreeEntry::StridedVectorize;
9134 case LoadsState::Gather:
9135#ifndef NDEBUG
9136 Type *ScalarTy = VL0->getType();
9137 if (DL->getTypeSizeInBits(ScalarTy) !=
9138 DL->getTypeAllocSizeInBits(ScalarTy))
9139 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
9140 else if (any_of(VL, [](Value *V) {
9141 auto *LI = dyn_cast<LoadInst>(V);
9142 return !LI || !LI->isSimple();
9143 }))
9144 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
9145 else
9146 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
9147#endif // NDEBUG
9148 registerNonVectorizableLoads(VL);
9149 return TreeEntry::NeedToGather;
9150 }
9151 llvm_unreachable("Unexpected state of loads");
9152 }
9153 case Instruction::ZExt:
9154 case Instruction::SExt:
9155 case Instruction::FPToUI:
9156 case Instruction::FPToSI:
9157 case Instruction::FPExt:
9158 case Instruction::PtrToInt:
9159 case Instruction::IntToPtr:
9160 case Instruction::SIToFP:
9161 case Instruction::UIToFP:
9162 case Instruction::Trunc:
9163 case Instruction::FPTrunc:
9164 case Instruction::BitCast: {
9165 Type *SrcTy = VL0->getOperand(i: 0)->getType();
9166 for (Value *V : VL) {
9167 if (isa<PoisonValue>(Val: V))
9168 continue;
9169 Type *Ty = cast<Instruction>(Val: V)->getOperand(i: 0)->getType();
9170 if (Ty != SrcTy || !isValidElementType(Ty)) {
9171 LLVM_DEBUG(
9172 dbgs() << "SLP: Gathering casts with different src types.\n");
9173 return TreeEntry::NeedToGather;
9174 }
9175 }
9176 return TreeEntry::Vectorize;
9177 }
9178 case Instruction::ICmp:
9179 case Instruction::FCmp: {
9180 // Check that all of the compares have the same predicate.
9181 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
9182 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(pred: P0);
9183 Type *ComparedTy = VL0->getOperand(i: 0)->getType();
9184 for (Value *V : VL) {
9185 if (isa<PoisonValue>(Val: V))
9186 continue;
9187 auto *Cmp = cast<CmpInst>(Val: V);
9188 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
9189 Cmp->getOperand(i_nocapture: 0)->getType() != ComparedTy) {
9190 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
9191 return TreeEntry::NeedToGather;
9192 }
9193 }
9194 return TreeEntry::Vectorize;
9195 }
9196 case Instruction::Select:
9197 case Instruction::FNeg:
9198 case Instruction::Add:
9199 case Instruction::FAdd:
9200 case Instruction::Sub:
9201 case Instruction::FSub:
9202 case Instruction::Mul:
9203 case Instruction::FMul:
9204 case Instruction::UDiv:
9205 case Instruction::SDiv:
9206 case Instruction::FDiv:
9207 case Instruction::URem:
9208 case Instruction::SRem:
9209 case Instruction::FRem:
9210 case Instruction::Shl:
9211 case Instruction::LShr:
9212 case Instruction::AShr:
9213 case Instruction::And:
9214 case Instruction::Or:
9215 case Instruction::Xor:
9216 case Instruction::Freeze:
9217 if (S.getMainOp()->getType()->isFloatingPointTy() &&
9218 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) {
9219 auto *I = dyn_cast<Instruction>(Val: V);
9220 return I && I->isBinaryOp() && !I->isFast();
9221 }))
9222 return TreeEntry::NeedToGather;
9223 return TreeEntry::Vectorize;
9224 case Instruction::GetElementPtr: {
9225 // We don't combine GEPs with complicated (nested) indexing.
9226 for (Value *V : VL) {
9227 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
9228 if (!I)
9229 continue;
9230 if (I->getNumOperands() != 2) {
9231 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
9232 return TreeEntry::NeedToGather;
9233 }
9234 }
9235
9236 // We can't combine several GEPs into one vector if they operate on
9237 // different types.
9238 Type *Ty0 = cast<GEPOperator>(Val: VL0)->getSourceElementType();
9239 for (Value *V : VL) {
9240 auto *GEP = dyn_cast<GEPOperator>(Val: V);
9241 if (!GEP)
9242 continue;
9243 Type *CurTy = GEP->getSourceElementType();
9244 if (Ty0 != CurTy) {
9245 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
9246 return TreeEntry::NeedToGather;
9247 }
9248 }
9249
9250 // We don't combine GEPs with non-constant indexes.
9251 Type *Ty1 = VL0->getOperand(i: 1)->getType();
9252 for (Value *V : VL) {
9253 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
9254 if (!I)
9255 continue;
9256 auto *Op = I->getOperand(i_nocapture: 1);
9257 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
9258 (Op->getType() != Ty1 &&
9259 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
9260 Op->getType()->getScalarSizeInBits() >
9261 DL->getIndexSizeInBits(
9262 AS: V->getType()->getPointerAddressSpace())))) {
9263 LLVM_DEBUG(
9264 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
9265 return TreeEntry::NeedToGather;
9266 }
9267 }
9268
9269 return TreeEntry::Vectorize;
9270 }
9271 case Instruction::Store: {
9272 // Check if the stores are consecutive or if we need to swizzle them.
9273 llvm::Type *ScalarTy = cast<StoreInst>(Val: VL0)->getValueOperand()->getType();
9274 // Avoid types that are padded when being allocated as scalars, while
9275 // being packed together in a vector (such as i1).
9276 if (DL->getTypeSizeInBits(Ty: ScalarTy) !=
9277 DL->getTypeAllocSizeInBits(Ty: ScalarTy)) {
9278 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
9279 return TreeEntry::NeedToGather;
9280 }
9281 // Make sure all stores in the bundle are simple - we can't vectorize
9282 // atomic or volatile stores.
9283 for (Value *V : VL) {
9284 auto *SI = cast<StoreInst>(Val: V);
9285 if (!SI->isSimple()) {
9286 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
9287 return TreeEntry::NeedToGather;
9288 }
9289 PointerOps.push_back(Elt: SI->getPointerOperand());
9290 }
9291
9292 // Check the order of pointer operands.
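    // E.g. (illustrative): stores to p+2, p+0, p+3, p+1 sort to p+0 .. p+3,
    // so the distance between the first and last pointer is 3 == VL.size() - 1
    // and the bundle is treated as consecutive.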
9293 if (llvm::sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: CurrentOrder)) {
9294 Value *Ptr0;
9295 Value *PtrN;
9296 if (CurrentOrder.empty()) {
9297 Ptr0 = PointerOps.front();
9298 PtrN = PointerOps.back();
9299 } else {
9300 Ptr0 = PointerOps[CurrentOrder.front()];
9301 PtrN = PointerOps[CurrentOrder.back()];
9302 }
9303 std::optional<int64_t> Dist =
9304 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
9305 // Check that the sorted pointer operands are consecutive.
9306 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
9307 return TreeEntry::Vectorize;
9308 }
9309
9310 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
9311 return TreeEntry::NeedToGather;
9312 }
9313 case Instruction::Call: {
9314 if (S.getMainOp()->getType()->isFloatingPointTy() &&
9315 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) {
9316 auto *I = dyn_cast<Instruction>(Val: V);
9317 return I && !I->isFast();
9318 }))
9319 return TreeEntry::NeedToGather;
9320 // Check if the calls are all to the same vectorizable intrinsic or
9321 // library function.
9322 CallInst *CI = cast<CallInst>(Val: VL0);
9323 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9324
9325 VFShape Shape = VFShape::get(
9326 FTy: CI->getFunctionType(),
9327 EC: ElementCount::getFixed(MinVal: static_cast<unsigned int>(VL.size())),
9328 HasGlobalPred: false /*HasGlobalPred*/);
9329 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9330
9331 if (!VecFunc && !isTriviallyVectorizable(ID)) {
9332 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
9333 return TreeEntry::NeedToGather;
9334 }
9335 Function *F = CI->getCalledFunction();
9336 unsigned NumArgs = CI->arg_size();
9337 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
9338 for (unsigned J = 0; J != NumArgs; ++J)
9339 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI))
9340 ScalarArgs[J] = CI->getArgOperand(i: J);
9341 for (Value *V : VL) {
9342 CallInst *CI2 = dyn_cast<CallInst>(Val: V);
9343 if (!CI2 || CI2->getCalledFunction() != F ||
9344 getVectorIntrinsicIDForCall(CI: CI2, TLI) != ID ||
9345 (VecFunc &&
9346 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
9347 !CI->hasIdenticalOperandBundleSchema(Other: *CI2)) {
9348 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
9349 << "\n");
9350 return TreeEntry::NeedToGather;
9351 }
9352      // Some intrinsics have scalar arguments, and those arguments must be the
9353      // same across the bundle for the calls to be vectorized.
9354 for (unsigned J = 0; J != NumArgs; ++J) {
9355 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI)) {
9356 Value *A1J = CI2->getArgOperand(i: J);
9357 if (ScalarArgs[J] != A1J) {
9358 LLVM_DEBUG(dbgs()
9359 << "SLP: mismatched arguments in call:" << *CI
9360 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
9361 return TreeEntry::NeedToGather;
9362 }
9363 }
9364 }
9365 // Verify that the bundle operands are identical between the two calls.
9366 if (CI->hasOperandBundles() &&
9367 !std::equal(first1: CI->op_begin() + CI->getBundleOperandsStartIndex(),
9368 last1: CI->op_begin() + CI->getBundleOperandsEndIndex(),
9369 first2: CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
9370 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
9371 << "!=" << *V << '\n');
9372 return TreeEntry::NeedToGather;
9373 }
9374 }
9375 SmallVector<Type *> ArgTys =
9376 buildIntrinsicArgTypes(CI, ID, VF: VL.size(), MinBW: 0, TTI);
9377 auto *VecTy = getWidenedType(ScalarTy: S.getMainOp()->getType(), VF: VL.size());
9378 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9379 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
9380 return TreeEntry::NeedToGather;
9381
9382 return TreeEntry::Vectorize;
9383 }
9384 case Instruction::ShuffleVector: {
9385 if (!S.isAltShuffle()) {
9386      // REVEC can support non-alternate shuffles.
9387 if (SLPReVec && getShufflevectorNumGroups(VL))
9388 return TreeEntry::Vectorize;
9389      // If this is not an alternate sequence of opcodes, like add-sub,
9390      // then do not vectorize this instruction.
9391 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
9392 return TreeEntry::NeedToGather;
9393 }
9394 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
9395 LLVM_DEBUG(
9396 dbgs()
9397 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
9398 "the whole alt sequence is not profitable.\n");
9399 return TreeEntry::NeedToGather;
9400 }
9401
9402 return TreeEntry::Vectorize;
9403 }
9404 default:
9405 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
9406 return TreeEntry::NeedToGather;
9407 }
9408}
9409
9410namespace {
9411/// Allows correct handling of the phi node operands based on the \p Main
9412/// PHINode's order of incoming basic blocks/values.
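/// For example (illustrative IR): for \p Main
///   %p = phi i32 [%a, %bb0], [%b, %bb1]
/// and a second phi
///   %q = phi i32 [%d, %bb1], [%c, %bb0],
/// buildOperands() lines the incoming values up by \p Main's block order,
/// yielding operands {%a, %c} for %bb0 and {%b, %d} for %bb1.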
9413class PHIHandler {
9414 DominatorTree &DT;
9415 PHINode *Main = nullptr;
9416 SmallVector<Value *> Phis;
9417 SmallVector<SmallVector<Value *>> Operands;
9418
9419public:
9420 PHIHandler() = delete;
9421 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
9422 : DT(DT), Main(Main), Phis(Phis),
9423 Operands(Main->getNumIncomingValues(),
9424 SmallVector<Value *>(Phis.size(), nullptr)) {}
9425 void buildOperands() {
9426 constexpr unsigned FastLimit = 4;
9427 if (Main->getNumIncomingValues() <= FastLimit) {
9428 for (unsigned I : seq<unsigned>(Begin: 0, End: Main->getNumIncomingValues())) {
9429 BasicBlock *InBB = Main->getIncomingBlock(i: I);
9430 if (!DT.isReachableFromEntry(A: InBB)) {
9431 Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
9432 continue;
9433 }
9434 // Prepare the operand vector.
9435 for (auto [Idx, V] : enumerate(First&: Phis)) {
9436 auto *P = dyn_cast<PHINode>(Val: V);
9437 if (!P) {
9438 assert(isa<PoisonValue>(V) &&
9439 "Expected isa instruction or poison value.");
9440 Operands[I][Idx] = V;
9441 continue;
9442 }
9443 if (P->getIncomingBlock(i: I) == InBB)
9444 Operands[I][Idx] = P->getIncomingValue(i: I);
9445 else
9446 Operands[I][Idx] = P->getIncomingValueForBlock(BB: InBB);
9447 }
9448 }
9449 return;
9450 }
9451    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
9453 for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues())) {
9454 BasicBlock *InBB = Main->getIncomingBlock(i: I);
9455 if (!DT.isReachableFromEntry(A: InBB)) {
9456 Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
9457 continue;
9458 }
9459 Blocks.try_emplace(Key: InBB).first->second.push_back(Elt: I);
9460 }
9461 for (auto [Idx, V] : enumerate(First&: Phis)) {
9462 if (isa<PoisonValue>(Val: V)) {
9463 for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues()))
9464 Operands[I][Idx] = V;
9465 continue;
9466 }
9467 auto *P = cast<PHINode>(Val: V);
9468 for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
9469 BasicBlock *InBB = P->getIncomingBlock(i: I);
9470 if (InBB == Main->getIncomingBlock(i: I)) {
9471 if (isa_and_nonnull<PoisonValue>(Val: Operands[I][Idx]))
9472 continue;
9473 Operands[I][Idx] = P->getIncomingValue(i: I);
9474 continue;
9475 }
9476 auto *It = Blocks.find(Key: InBB);
9477 if (It == Blocks.end())
9478 continue;
9479 Operands[It->second.front()][Idx] = P->getIncomingValue(i: I);
9480 }
9481 }
9482 for (const auto &P : Blocks) {
9483 ArrayRef<unsigned> IncomingValues = P.second;
9484 if (IncomingValues.size() <= 1)
9485 continue;
9486 unsigned BasicI = IncomingValues.front();
9487 for (unsigned I : IncomingValues.drop_front()) {
9488 assert(all_of(enumerate(Operands[I]),
9489 [&](const auto &Data) {
9490 return !Data.value() ||
9491 Data.value() == Operands[BasicI][Data.index()];
9492 }) &&
9493 "Expected empty operands list.");
9494 Operands[I] = Operands[BasicI];
9495 }
9496 }
9497 }
9498 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
9499};
9500} // namespace
9501
9502/// Returns the main/alternate instructions for the given \p VL. Unlike
9503/// getSameOpcode, it supports non-compatible instructions for better
9504/// SplitVectorize node support.
9505/// \returns the first main/alt instructions if \p VL contains only poisons and
9506/// instructions with exactly 2 opcodes. Returns a pair of nullptrs otherwise.
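/// For example (illustrative): {add, poison, sub, add} from a single block
/// yields {add, sub}, while {add, sub, mul} or instructions from different
/// blocks yield {nullptr, nullptr}.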
9507static std::pair<Instruction *, Instruction *>
9508getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
9509 Instruction *MainOp = nullptr;
9510 Instruction *AltOp = nullptr;
9511 for (Value *V : VL) {
9512 if (isa<PoisonValue>(Val: V))
9513 continue;
9514 auto *I = dyn_cast<Instruction>(Val: V);
9515 if (!I)
9516 return {};
9517 if (!MainOp) {
9518 MainOp = I;
9519 continue;
9520 }
9521 if (MainOp->getOpcode() == I->getOpcode()) {
9522 if (I->getParent() != MainOp->getParent())
9523 return {};
9524 continue;
9525 }
9526 if (!AltOp) {
9527 AltOp = I;
9528 continue;
9529 }
9530 if (AltOp->getOpcode() == I->getOpcode()) {
9531 if (I->getParent() != AltOp->getParent())
9532 return {};
9533 continue;
9534 }
9535 return {};
9536 }
9537 if (!AltOp)
9538 return {};
9539 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
9540 "Expected different main and alt instructions.");
9541 return std::make_pair(x&: MainOp, y&: AltOp);
9542}
9543
9544/// Checks that every instruction appears once in the list and if not, packs
9545/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
9546/// unique scalars is extended by poison values to the whole register size.
9547///
9548/// \returns false if \p VL could not be uniquified, in which case \p VL is
9549/// unchanged and \p ReuseShuffleIndices is empty.
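/// For example (illustrative): VL = {a, b, a, b} is packed into {a, b} with
/// \p ReuseShuffleIndices = {0, 1, 0, 1}; with \p TryPad, 8 scalars with only
/// 3 unique instructions may instead become {a, b, c, poison} on a target
/// where 4 elements form a full register.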
9550static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
9551 SmallVectorImpl<int> &ReuseShuffleIndices,
9552 const TargetTransformInfo &TTI,
9553 const TargetLibraryInfo &TLI,
9554 const InstructionsState &S,
9555 const BoUpSLP::EdgeInfo &UserTreeIdx,
9556 bool TryPad = false) {
9557 // Check that every instruction appears once in this bundle.
9558 SmallVector<Value *> UniqueValues;
9559 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
9560 for (Value *V : VL) {
9561 if (isConstant(V)) {
9562 // Constants are always considered distinct, even if the same constant
9563 // appears multiple times in VL.
9564 ReuseShuffleIndices.emplace_back(
9565 Args: isa<PoisonValue>(Val: V) ? PoisonMaskElem : UniqueValues.size());
9566 UniqueValues.emplace_back(Args&: V);
9567 continue;
9568 }
9569 auto Res = UniquePositions.try_emplace(Key: V, Args: UniqueValues.size());
9570 ReuseShuffleIndices.emplace_back(Args&: Res.first->second);
9571 if (Res.second)
9572 UniqueValues.emplace_back(Args&: V);
9573 }
9574
9575 // Easy case: VL has unique values and a "natural" size
9576 size_t NumUniqueScalarValues = UniqueValues.size();
9577 bool IsFullVectors = hasFullVectorsOrPowerOf2(
9578 TTI, Ty: getValueType(V: UniqueValues.front()), Sz: NumUniqueScalarValues);
9579 if (NumUniqueScalarValues == VL.size() &&
9580 (VectorizeNonPowerOf2 || IsFullVectors)) {
9581 ReuseShuffleIndices.clear();
9582 return true;
9583 }
9584
9585  // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
9586 if ((UserTreeIdx.UserTE &&
9587 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
9588 !hasFullVectorsOrPowerOf2(TTI, Ty: getValueType(V: VL.front()), Sz: VL.size())) {
9589 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
9590 "for nodes with padding.\n");
9591 ReuseShuffleIndices.clear();
9592 return false;
9593 }
9594
9595 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
9596 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
9597 (UniquePositions.size() == 1 && all_of(Range&: UniqueValues, P: [](Value *V) {
9598 return isa<UndefValue>(Val: V) || !isConstant(V);
9599 }))) {
9600 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
9601 S.getMainOp()->isSafeToRemove() &&
9602 all_of(Range&: UniqueValues, P: IsaPred<Instruction, PoisonValue>)) {
9603 // Find the number of elements, which forms full vectors.
9604 unsigned PWSz = getFullVectorNumberOfElements(
9605 TTI, Ty: UniqueValues.front()->getType(), Sz: UniqueValues.size());
9606 PWSz = std::min<unsigned>(a: PWSz, b: VL.size());
9607 if (PWSz == VL.size()) {
9608 // We ended up with the same size after removing duplicates and
9609 // upgrading the resulting vector size to a "nice size". Just keep
9610 // the initial VL then.
9611 ReuseShuffleIndices.clear();
9612 } else {
9613 // Pad unique values with poison to grow the vector to a "nice" size
9614 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
9615 UniqueValues.end());
9616 PaddedUniqueValues.append(
9617 NumInputs: PWSz - UniqueValues.size(),
9618 Elt: PoisonValue::get(T: UniqueValues.front()->getType()));
9619        // Check that the operations extended with poisons are still valid for
9620        // vectorization (div/rem are not allowed).
9621 if (!getSameOpcode(VL: PaddedUniqueValues, TLI).valid()) {
9622 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
9623 ReuseShuffleIndices.clear();
9624 return false;
9625 }
9626 VL = std::move(PaddedUniqueValues);
9627 }
9628 return true;
9629 }
9630 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
9631 ReuseShuffleIndices.clear();
9632 return false;
9633 }
9634 VL = std::move(UniqueValues);
9635 return true;
9636}
9637
9638bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
9639 const InstructionsState &LocalState,
9640 SmallVectorImpl<Value *> &Op1,
9641 SmallVectorImpl<Value *> &Op2,
9642 OrdersType &ReorderIndices) const {
9643 constexpr unsigned SmallNodeSize = 4;
9644 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
9645 !SplitAlternateInstructions)
9646 return false;
9647
9648 // Check if this is a duplicate of another split entry.
9649 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
9650 << ".\n");
9651 for (TreeEntry *E : getSplitTreeEntries(V: LocalState.getMainOp())) {
9652 if (E->isSame(VL)) {
9653 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
9654 << *LocalState.getMainOp() << ".\n");
9655 return false;
9656 }
9657 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
9658 if (all_of(Range&: VL, P: [&](Value *V) {
9659 return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V);
9660 })) {
9661 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
9662 return false;
9663 }
9664 }
9665
9666 ReorderIndices.assign(NumElts: VL.size(), Elt: VL.size());
9667 SmallBitVector Op1Indices(VL.size());
9668 for (auto [Idx, V] : enumerate(First&: VL)) {
9669 auto *I = dyn_cast<Instruction>(Val: V);
9670 if (!I) {
9671 Op1.push_back(Elt: V);
9672 Op1Indices.set(Idx);
9673 continue;
9674 }
9675 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
9676 isMainInstruction(I, MainOp: LocalState.getMainOp(), AltOp: LocalState.getAltOp(),
9677 TLI: *TLI)) ||
9678 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
9679 !isAlternateInstruction(I, MainOp: LocalState.getMainOp(),
9680 AltOp: LocalState.getAltOp(), TLI: *TLI))) {
9681 Op1.push_back(Elt: V);
9682 Op1Indices.set(Idx);
9683 continue;
9684 }
9685 Op2.push_back(Elt: V);
9686 }
9687 Type *ScalarTy = getValueType(V: VL.front());
9688 VectorType *VecTy = getWidenedType(ScalarTy, VF: VL.size());
9689 unsigned Opcode0 = LocalState.getOpcode();
9690 unsigned Opcode1 = LocalState.getAltOpcode();
9691 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
9692 // Enable the split node only if the scalars do not form a legal alternate
9693 // instruction (like X86 addsub).
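// E.g., a bundle that alternates fadd/fsub across lanes can often be lowered
// as a single addsub-style instruction on X86, so it is better kept as one
// alternate node than split into separate fadd and fsub nodes.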
9694 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
9695 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
9696 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
9697 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
9698 !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op1.front()->getType(), Sz: Op1.size()) ||
9699 !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op2.front()->getType(), Sz: Op2.size()))
9700 return false;
9701 // Build the reorder indices: all Op1 lanes come first, followed by all Op2 lanes.
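// Illustrative example (hypothetical 8-lane bundle alternating add/sub):
// Op1 holds the adds from lanes {0, 2, 4, 6} and Op2 the subs from lanes
// {1, 3, 5, 7}, so ReorderIndices becomes {0, 2, 4, 6, 1, 3, 5, 7}.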
9702 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
9703 for (unsigned Idx : seq<unsigned>(Size: VL.size())) {
9704 if (Op1Indices.test(Idx)) {
9705 ReorderIndices[Op1Cnt] = Idx;
9706 ++Op1Cnt;
9707 } else {
9708 ReorderIndices[Op2Cnt] = Idx;
9709 ++Op2Cnt;
9710 }
9711 }
9712 if (isIdentityOrder(Order: ReorderIndices))
9713 ReorderIndices.clear();
9714 SmallVector<int> Mask;
9715 if (!ReorderIndices.empty())
9716 inversePermutation(Indices: ReorderIndices, Mask);
9717 unsigned NumParts = TTI->getNumberOfParts(Tp: VecTy);
9718 VectorType *Op1VecTy = getWidenedType(ScalarTy, VF: Op1.size());
9719 VectorType *Op2VecTy = getWidenedType(ScalarTy, VF: Op2.size());
9720 // Check for non-profitable single-register ops, which are better represented
9721 // as alternate ops.
9722 if (NumParts >= VL.size())
9723 return false;
9724 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
9725 InstructionCost InsertCost = ::getShuffleCost(
9726 TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind: Kind, Index: Op1.size(), SubTp: Op2VecTy);
9727 FixedVectorType *SubVecTy =
9728 getWidenedType(ScalarTy, VF: std::max(a: Op1.size(), b: Op2.size()));
9729 InstructionCost NewShuffleCost =
9730 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: SubVecTy, Mask, CostKind: Kind);
9731 if (!LocalState.isCmpOp() && NumParts <= 1 &&
9732 (Mask.empty() || InsertCost >= NewShuffleCost))
9733 return false;
9734 if ((LocalState.getMainOp()->isBinaryOp() &&
9735 LocalState.getAltOp()->isBinaryOp() &&
9736 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
9737 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
9738 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
9739 (LocalState.getMainOp()->isUnaryOp() &&
9740 LocalState.getAltOp()->isUnaryOp())) {
9741 InstructionCost OriginalVecOpsCost =
9742 TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: VecTy, CostKind: Kind) +
9743 TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: VecTy, CostKind: Kind);
9744 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
9745 for (unsigned Idx : seq<unsigned>(Size: VL.size())) {
9746 if (isa<PoisonValue>(Val: VL[Idx]))
9747 continue;
9748 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
9749 }
9750 InstructionCost OriginalCost =
9751 OriginalVecOpsCost + ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc,
9752 Tp: VecTy, Mask: OriginalMask, CostKind: Kind);
9753 InstructionCost NewVecOpsCost =
9754 TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: Op1VecTy, CostKind: Kind) +
9755 TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: Op2VecTy, CostKind: Kind);
9756 InstructionCost NewCost =
9757 NewVecOpsCost + InsertCost +
9758 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
9759 VectorizableTree.front()->getOpcode() == Instruction::Store
9760 ? NewShuffleCost
9761 : 0);
9762 // If splitting is not profitable, exit.
9763 if (NewCost >= OriginalCost)
9764 return false;
9765 }
9766 return true;
9767}
9768
9769namespace {
9770/// Class accepts incoming list of values and generates the list of values
9771/// for scheduling and list of operands for the new nodes.
9772class InstructionsCompatibilityAnalysis {
9773 DominatorTree &DT;
9774 const DataLayout &DL;
9775 const TargetTransformInfo &TTI;
9776 const TargetLibraryInfo &TLI;
9777
9778 /// Builds operands for the original instructions.
9779 void
9780 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
9781 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
9782
9783 unsigned ShuffleOrOp =
9784 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
9785 Instruction *VL0 = S.getMainOp();
9786
9787 switch (ShuffleOrOp) {
9788 case Instruction::PHI: {
9789 auto *PH = cast<PHINode>(Val: VL0);
9790
9791 // Keeps the reordered operands to avoid code duplication.
9792 PHIHandler Handler(DT, PH, VL);
9793 Handler.buildOperands();
9794 Operands.assign(NumElts: PH->getNumOperands(), Elt: {});
9795 for (unsigned I : seq<unsigned>(Size: PH->getNumOperands()))
9796 Operands[I].assign(in_start: Handler.getOperands(I).begin(),
9797 in_end: Handler.getOperands(I).end());
9798 return;
9799 }
9800 case Instruction::ExtractValue:
9801 case Instruction::ExtractElement:
9802 // This is a special case, as it does not gather, but at the same time
9803 // we are not extending buildTreeRec() towards the operands.
9804 Operands.assign(NumElts: 1, Elt: {VL.size(), VL0->getOperand(i: 0)});
9805 return;
9806 case Instruction::InsertElement:
9807 Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr});
9808 for (auto [Idx, V] : enumerate(First&: VL)) {
9809 auto *IE = cast<InsertElementInst>(Val: V);
9810 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
9811 Ops[Idx] = IE->getOperand(i_nocapture: OpIdx);
9812 }
9813 return;
9814 case Instruction::Load:
9815 Operands.assign(
9816 NumElts: 1, Elt: {VL.size(),
9817 PoisonValue::get(T: cast<LoadInst>(Val: VL0)->getPointerOperandType())});
9818 for (auto [V, Op] : zip(t&: VL, u&: Operands.back())) {
9819 auto *LI = dyn_cast<LoadInst>(Val: V);
9820 if (!LI)
9821 continue;
9822 Op = LI->getPointerOperand();
9823 }
9824 return;
9825 case Instruction::ZExt:
9826 case Instruction::SExt:
9827 case Instruction::FPToUI:
9828 case Instruction::FPToSI:
9829 case Instruction::FPExt:
9830 case Instruction::PtrToInt:
9831 case Instruction::IntToPtr:
9832 case Instruction::SIToFP:
9833 case Instruction::UIToFP:
9834 case Instruction::Trunc:
9835 case Instruction::FPTrunc:
9836 case Instruction::BitCast:
9837 case Instruction::ICmp:
9838 case Instruction::FCmp:
9839 case Instruction::Select:
9840 case Instruction::FNeg:
9841 case Instruction::Add:
9842 case Instruction::FAdd:
9843 case Instruction::Sub:
9844 case Instruction::FSub:
9845 case Instruction::Mul:
9846 case Instruction::FMul:
9847 case Instruction::UDiv:
9848 case Instruction::SDiv:
9849 case Instruction::FDiv:
9850 case Instruction::URem:
9851 case Instruction::SRem:
9852 case Instruction::FRem:
9853 case Instruction::Shl:
9854 case Instruction::LShr:
9855 case Instruction::AShr:
9856 case Instruction::And:
9857 case Instruction::Or:
9858 case Instruction::Xor:
9859 case Instruction::Freeze:
9860 case Instruction::Store:
9861 case Instruction::ShuffleVector:
9862 Operands.assign(NumElts: VL0->getNumOperands(), Elt: {VL.size(), nullptr});
9863 for (auto [Idx, V] : enumerate(First&: VL)) {
9864 auto *I = dyn_cast<Instruction>(Val: V);
9865 if (!I) {
9866 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
9867 Ops[Idx] = PoisonValue::get(T: VL0->getOperand(i: OpIdx)->getType());
9868 continue;
9869 }
9870 auto [Op, ConvertedOps] = convertTo(I, S);
9871 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
9872 Ops[Idx] = ConvertedOps[OpIdx];
9873 }
9874 return;
9875 case Instruction::GetElementPtr: {
9876 Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr});
9877 // Need to cast all indices to the same type before vectorization to
9878 // avoid a crash.
9879 // Required to be able to find correct matches between different gather
9880 // nodes and reuse the vectorized values rather than trying to gather them
9881 // again.
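// E.g., if one GEP in the bundle indexes with i32 and another with i64, the
// DataLayout index type is chosen and the constant indices are folded into
// it, so all lanes end up with index operands of the same type.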
9882 const unsigned IndexIdx = 1;
9883 Type *VL0Ty = VL0->getOperand(i: IndexIdx)->getType();
9884 Type *Ty =
9885 all_of(Range&: VL,
9886 P: [&](Value *V) {
9887 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
9888 return !GEP || VL0Ty == GEP->getOperand(i_nocapture: IndexIdx)->getType();
9889 })
9890 ? VL0Ty
9891 : DL.getIndexType(PtrTy: cast<GetElementPtrInst>(Val: VL0)
9892 ->getPointerOperandType()
9893 ->getScalarType());
9894 for (auto [Idx, V] : enumerate(First&: VL)) {
9895 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
9896 if (!GEP) {
9897 Operands[0][Idx] = V;
9898 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
9899 continue;
9900 }
9901 Operands[0][Idx] = GEP->getPointerOperand();
9902 auto *Op = GEP->getOperand(i_nocapture: IndexIdx);
9903 auto *CI = dyn_cast<ConstantInt>(Val: Op);
9904 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
9905 C: CI, DestTy: Ty, IsSigned: CI->getValue().isSignBitSet(), DL)
9906 : Op;
9907 }
9908 return;
9909 }
9910 case Instruction::Call: {
9911 auto *CI = cast<CallInst>(Val: VL0);
9912 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI: &TLI);
9913 for (unsigned Idx : seq<unsigned>(Size: CI->arg_size())) {
9914 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI: &TTI))
9915 continue;
9916 auto &Ops = Operands.emplace_back();
9917 for (Value *V : VL) {
9918 auto *I = dyn_cast<Instruction>(Val: V);
9919 Ops.push_back(Elt: I ? I->getOperand(i: Idx)
9920 : PoisonValue::get(T: VL0->getOperand(i: Idx)->getType()));
9921 }
9922 }
9923 return;
9924 }
9925 default:
9926 break;
9927 }
9928 llvm_unreachable("Unexpected vectorization of the instructions.");
9929 }
9930
9931public:
9932 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
9933 const TargetTransformInfo &TTI,
9934 const TargetLibraryInfo &TLI)
9935 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
9936
9937 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
9938 ArrayRef<Value *> VL) {
9939 assert(S && "Invalid state!");
9940 SmallVector<BoUpSLP::ValueList> Operands;
9941 buildOriginalOperands(S, VL, Operands);
9942 return Operands;
9943 }
9944};
9945} // namespace
9946
9947BoUpSLP::ScalarsVectorizationLegality
9948BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
9949 const EdgeInfo &UserTreeIdx) const {
9950 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
9951
9952 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
9953
9954 // Don't go into catchswitch blocks, which can happen with PHIs.
9955 // Such blocks can only have PHIs and the catchswitch. There is no
9956 // place to insert a shuffle if we need to, so just avoid that issue.
9957 if (S && isa<CatchSwitchInst>(Val: S.getMainOp()->getParent()->getTerminator())) {
9958 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
9959 // Do not try to pack to avoid extra instructions here.
9960 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
9961 /*TryToFindDuplicates=*/false);
9962 }
9963
9964 // Check if this is a duplicate of another entry.
9965 if (S) {
9966 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
9967 for (TreeEntry *E : getTreeEntries(V: S.getMainOp())) {
9968 if (E->isSame(VL)) {
9969 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
9970 << ".\n");
9971 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
9972 }
9973 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
9974 if (all_of(Range&: VL, P: [&](Value *V) {
9975 return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V);
9976 })) {
9977 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
9978 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
9979 }
9980 }
9981 }
9982
9983 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
9984 // a load), in which case peek through to include it in the tree, without
9985 // ballooning over-budget.
9986 if (Depth >= RecursionMaxDepth &&
9987 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
9988 (match(V: S.getMainOp(), P: m_Load(Op: m_Value())) ||
9989 all_of(Range&: VL, P: [&S](const Value *I) {
9990 return match(V: I,
9991 P: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_OneUse(SubPattern: m_Load(Op: m_Value()))))) &&
9992 cast<Instruction>(Val: I)->getOpcode() == S.getOpcode();
9993 })))) {
9994 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
9995 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
9996 }
9997
9998 // Don't handle scalable vectors.
9999 if (S && S.getOpcode() == Instruction::ExtractElement &&
10000 isa<ScalableVectorType>(
10001 Val: cast<ExtractElementInst>(Val: S.getMainOp())->getVectorOperandType())) {
10002 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
10003 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
10004 }
10005
10006 // Don't handle vectors.
10007 if (!SLPReVec && getValueType(V: VL.front())->isVectorTy()) {
10008 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
10009 // Do not try to pack to avoid extra instructions here.
10010 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
10011 /*TryToFindDuplicates=*/false);
10012 }
10013
10014 // If all of the operands are identical or constant, we have a simple solution.
10015 // If we deal with insert/extract instructions, they all must have constant
10016 // indices, otherwise we should gather them, not try to vectorize.
10017 // If this is an alternate op node with 2 elements and gathered operands, do
10018 // not vectorize.
10019 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
10020 if (!S || !S.isAltShuffle() || VL.size() > 2)
10021 return false;
10022 if (VectorizableTree.size() < MinTreeSize)
10023 return false;
10024 if (Depth >= RecursionMaxDepth - 1)
10025 return true;
10026 // Check if all operands are extracts, part of vector node or can build a
10027 // regular vectorize node.
10028 SmallVector<unsigned, 8> InstsCount;
10029 for (Value *V : VL) {
10030 auto *I = cast<Instruction>(Val: V);
10031 InstsCount.push_back(Elt: count_if(Range: I->operand_values(), P: [](Value *Op) {
10032 return isa<Instruction>(Val: Op) || isVectorLikeInstWithConstOps(V: Op);
10033 }));
10034 }
10035 bool IsCommutative =
10036 isCommutative(I: S.getMainOp()) || isCommutative(I: S.getAltOp());
10037 if ((IsCommutative &&
10038 std::accumulate(first: InstsCount.begin(), last: InstsCount.end(), init: 0) < 2) ||
10039 (!IsCommutative &&
10040 all_of(Range&: InstsCount, P: [](unsigned ICnt) { return ICnt < 2; })))
10041 return true;
10042 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
10043 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
10044 auto *I1 = cast<Instruction>(Val: VL.front());
10045 auto *I2 = cast<Instruction>(Val: VL.back());
10046 for (int Op : seq<int>(Size: S.getMainOp()->getNumOperands()))
10047 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
10048 Args: I2->getOperand(i: Op));
10049 if (static_cast<unsigned>(count_if(
10050 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
10051 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
10052 })) >= S.getMainOp()->getNumOperands() / 2)
10053 return false;
10054 if (S.getMainOp()->getNumOperands() > 2)
10055 return true;
10056 if (IsCommutative) {
10057 // Check permuted operands.
10058 Candidates.clear();
10059 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
10060 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
10061 Args: I2->getOperand(i: (Op + 1) % E));
10062 if (any_of(
10063 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
10064 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
10065 }))
10066 return false;
10067 }
10068 return true;
10069 };
10070 SmallVector<unsigned> SortedIndices;
10071 BasicBlock *BB = nullptr;
10072 bool IsScatterVectorizeUserTE =
10073 UserTreeIdx.UserTE &&
10074 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
10075 bool AreAllSameBlock = S && allSameBlock(VL);
10076 bool AreScatterAllGEPSameBlock =
10077 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
10078 VL.size() > 2 &&
10079 all_of(Range&: VL,
10080 P: [&BB](Value *V) {
10081 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
10082 if (!I)
10083 return doesNotNeedToBeScheduled(V);
10084 if (!BB)
10085 BB = I->getParent();
10086 return BB == I->getParent() && I->getNumOperands() == 2;
10087 }) &&
10088 BB &&
10089 sortPtrAccesses(VL, ElemTy: UserTreeIdx.UserTE->getMainOp()->getType(), DL: *DL, SE&: *SE,
10090 SortedIndices));
10091 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
10092 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
10093 (S &&
10094 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
10095 Val: S.getMainOp()) &&
10096 !all_of(Range&: VL, P: isVectorLikeInstWithConstOps)) ||
10097 NotProfitableForVectorization(VL)) {
10098 if (!S) {
10099 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
10100 "C,S,B,O, small shuffle. \n");
10101 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
10102 /*TryToFindDuplicates=*/true,
10103 /*TrySplitVectorize=*/true);
10104 }
10105 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
10106 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
10107 }
10108
10109 // Don't vectorize ephemeral values.
10110 if (S && !EphValues.empty()) {
10111 for (Value *V : VL) {
10112 if (EphValues.count(Ptr: V)) {
10113 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
10114 << ") is ephemeral.\n");
10115 // Do not try to pack to avoid extra instructions here.
10116 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
10117 /*TryToFindDuplicates=*/false);
10118 }
10119 }
10120 }
10121
10122 // We now know that this is a vector of instructions of the same type from
10123 // the same block.
10124
10125 // Check that none of the instructions in the bundle are already in the tree,
10126 // and that vectorizing the bundle as a small alternate node is still
10127 // profitable.
10128 if (S && S.isAltShuffle()) {
10129 auto GetNumVectorizedExtracted = [&]() {
10130 APInt Extracted = APInt::getZero(numBits: VL.size());
10131 APInt Vectorized = APInt::getAllOnes(numBits: VL.size());
10132 for (auto [Idx, V] : enumerate(First&: VL)) {
10133 auto *I = dyn_cast<Instruction>(Val: V);
10134 if (!I || doesNotNeedToBeScheduled(V: I) ||
10135 all_of(Range: I->operands(), P: [&](const Use &U) {
10136 return isa<ExtractElementInst>(Val: U.get());
10137 }))
10138 continue;
10139 if (isVectorized(V: I))
10140 Vectorized.clearBit(BitPosition: Idx);
10141 else if (!I->hasOneUser() && !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList))
10142 Extracted.setBit(Idx);
10143 }
10144 return std::make_pair(x&: Vectorized, y&: Extracted);
10145 };
10146 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
10147 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10148 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
10149 if (!Vectorized.isAllOnes() && !PreferScalarize) {
10150 // Rough cost estimation to check whether the vector code (+ potential
10151 // extracts) is more profitable than the scalar code + buildvector.
10152 Type *ScalarTy = VL.front()->getType();
10153 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
10154 InstructionCost VectorizeCostEstimate =
10155 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: {}, CostKind: Kind) +
10156 ::getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Extracted,
10157 /*Insert=*/false, /*Extract=*/true, CostKind: Kind);
10158 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
10159 TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Vectorized,
10160 /*Insert=*/true, /*Extract=*/false, CostKind: Kind, /*ForPoisonSrc=*/false);
10161 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
10162 }
10163 if (PreferScalarize) {
10164 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
10165 "node is not profitable.\n");
10166 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
10167 }
10168 }
10169
10170 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
10171 if (UserIgnoreList && !UserIgnoreList->empty()) {
10172 for (Value *V : VL) {
10173 if (UserIgnoreList->contains(V)) {
10174 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
10175 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
10176 }
10177 }
10178 }
10179
10180 // Special processing for sorted pointers for a ScatterVectorize node with
10181 // constant indices only.
10182 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
10183 assert(VL.front()->getType()->isPointerTy() &&
10184 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
10185 "Expected pointers only.");
10186 // Reset S to make it GetElementPtr kind of node.
10187 const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
10188 assert(It != VL.end() && "Expected at least one GEP.");
10189 S = getSameOpcode(VL: *It, TLI: *TLI);
10190 }
10191
10192 // Check that all of the users of the scalars that we want to vectorize are
10193 // schedulable.
10194 Instruction *VL0 = S.getMainOp();
10195 BB = VL0->getParent();
10196
10197 if (S &&
10198 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator()) ||
10199 !DT->isReachableFromEntry(A: BB))) {
10200 // Don't go into unreachable blocks. They may contain instructions with
10201 // dependency cycles which confuse the final scheduling.
10202 // Do not vectorize EH and non-returning blocks, not profitable in most
10203 // cases.
10204 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
10205 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
10206 }
10207 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
10208}
10209
10210void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
10211 const EdgeInfo &UserTreeIdx,
10212 unsigned InterleaveFactor) {
10213 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
10214
10215 SmallVector<int> ReuseShuffleIndices;
10216 SmallVector<Value *> VL(VLRef);
10217
10218 // Tries to build split node.
10219 auto TrySplitNode = [&](const InstructionsState &LocalState) {
10220 SmallVector<Value *> Op1, Op2;
10221 OrdersType ReorderIndices;
10222 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
10223 return false;
10224
10225 SmallVector<Value *> NewVL(VL.size());
10226 copy(Range&: Op1, Out: NewVL.begin());
10227 copy(Range&: Op2, Out: std::next(x: NewVL.begin(), n: Op1.size()));
10228 auto Invalid = ScheduleBundle::invalid();
10229 auto *TE = newTreeEntry(VL, EntryState: TreeEntry::SplitVectorize, Bundle&: Invalid, S: LocalState,
10230 UserTreeIdx, ReuseShuffleIndices: {}, ReorderIndices);
10231 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
10232 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
10233 InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI);
10234 if (S && (isa<LoadInst>(Val: S.getMainOp()) ||
10235 getSameValuesTreeEntry(V: S.getMainOp(), VL: Op, /*SameVF=*/true))) {
10236 // Build gather node for loads, they will be gathered later.
10237 TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(),
10238 Args: Idx == 0 ? 0 : Op1.size());
10239 (void)newTreeEntry(VL: Op, EntryState: TreeEntry::NeedToGather, Bundle&: Invalid, S, UserTreeIdx: {TE, Idx});
10240 } else {
10241 TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(),
10242 Args: Idx == 0 ? 0 : Op1.size());
10243 buildTreeRec(VLRef: Op, Depth, UserTreeIdx: {TE, Idx});
10244 }
10245 };
10246 AddNode(Op1, 0);
10247 AddNode(Op2, 1);
10248 return true;
10249 };
10250
10251 ScalarsVectorizationLegality Legality =
10252 getScalarsVectorizationLegality(VL, Depth, UserTreeIdx);
10253 const InstructionsState &S = Legality.getInstructionsState();
10254 if (!Legality.isLegal()) {
10255 if (Legality.trySplitVectorize()) {
10256 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
10257 // Last chance to try to vectorize alternate node.
10258 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
10259 return;
10260 }
10261 if (Legality.tryToFindDuplicates())
10262 tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S, UserTreeIdx);
10263
10264 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
10265 return;
10266 }
10267
10268 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
10269 if (S.isAltShuffle() && TrySplitNode(S))
10270 return;
10271
10272 // Check that every instruction appears once in this bundle.
10273 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S, UserTreeIdx,
10274 /*TryPad=*/true)) {
10275 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
10276 return;
10277 }
10278
10279 // Perform specific checks for each particular instruction kind.
10280 bool IsScatterVectorizeUserTE =
10281 UserTreeIdx.UserTE &&
10282 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
10283 OrdersType CurrentOrder;
10284 SmallVector<Value *> PointerOps;
10285 TreeEntry::EntryState State = getScalarsVectorizationState(
10286 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
10287 if (State == TreeEntry::NeedToGather) {
10288 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
10289 return;
10290 }
10291
10292 Instruction *VL0 = S.getMainOp();
10293 BasicBlock *BB = VL0->getParent();
10294 auto &BSRef = BlocksSchedules[BB];
10295 if (!BSRef)
10296 BSRef = std::make_unique<BlockScheduling>(args&: BB);
10297
10298 BlockScheduling &BS = *BSRef;
10299
10300 SetVector<Value *> UniqueValues(llvm::from_range, VL);
10301 std::optional<ScheduleBundle *> BundlePtr =
10302 BS.tryScheduleBundle(VL: UniqueValues.getArrayRef(), SLP: this, S);
10303#ifdef EXPENSIVE_CHECKS
10304 // Make sure we didn't break any internal invariants
10305 BS.verify();
10306#endif
10307 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
10308 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
10309 // Last chance to try to vectorize alternate node.
10310 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
10311 return;
10312 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
10313 NonScheduledFirst.insert(Ptr: VL.front());
10314 if (S.getOpcode() == Instruction::Load &&
10315 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
10316 registerNonVectorizableLoads(VL: ArrayRef(VL));
10317 return;
10318 }
10319 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
10320 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
10321 ScheduleBundle Empty;
10322 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
10323 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
10324
10325 unsigned ShuffleOrOp =
10326 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10327 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
10328 // Postpone building tree nodes for PHI operands.
10329 SmallVector<unsigned> PHIOps;
10330 for (unsigned I : seq<unsigned>(Operands.size())) {
10331 ArrayRef<Value *> Op = Operands[I];
10332 if (Op.empty())
10333 continue;
10334 InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI);
10335 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
10336 buildTreeRec(VLRef: Op, Depth: Depth + 1, UserTreeIdx: {TE, I});
10337 else
10338 PHIOps.push_back(Elt: I);
10339 }
10340 for (unsigned I : PHIOps)
10341 buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
10342 };
10343 switch (ShuffleOrOp) {
10344 case Instruction::PHI: {
10345 TreeEntry *TE =
10346 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
10347 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
10348 TE->dump());
10349
10350 TE->setOperands(Operands);
10351 CreateOperandNodes(TE, Operands);
10352 return;
10353 }
10354 case Instruction::ExtractValue:
10355 case Instruction::ExtractElement: {
10356 if (CurrentOrder.empty()) {
10357 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
10358 } else {
10359 LLVM_DEBUG({
10360 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
10361 "with order";
10362 for (unsigned Idx : CurrentOrder)
10363 dbgs() << " " << Idx;
10364 dbgs() << "\n";
10365 });
10366 fixupOrderingIndices(Order: CurrentOrder);
10367 }
10368 // Create the tree entry for the extract sequence, recording the computed
10369 // extract order, if any.
10370 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10371 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
10372 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
10373 "(ExtractValueInst/ExtractElementInst).\n";
10374 TE->dump());
10375 // This is a special case, as it does not gather, but at the same time
10376 // we are not extending buildTreeRec() towards the operands.
10377 TE->setOperands(Operands);
10378 return;
10379 }
10380 case Instruction::InsertElement: {
10381 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
10382
10383 auto OrdCompare = [](const std::pair<int, int> &P1,
10384 const std::pair<int, int> &P2) {
10385 return P1.first > P2.first;
10386 };
10387 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
10388 decltype(OrdCompare)>
10389 Indices(OrdCompare);
10390 for (int I = 0, E = VL.size(); I < E; ++I) {
10391 unsigned Idx = *getElementIndex(Inst: VL[I]);
10392 Indices.emplace(args&: Idx, args&: I);
10393 }
10394 OrdersType CurrentOrder(VL.size(), VL.size());
10395 bool IsIdentity = true;
10396 for (int I = 0, E = VL.size(); I < E; ++I) {
10397 CurrentOrder[Indices.top().second] = I;
10398 IsIdentity &= Indices.top().second == I;
10399 Indices.pop();
10400 }
10401 if (IsIdentity)
10402 CurrentOrder.clear();
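    // Illustrative example (hypothetical bundle): if the inserts in VL write
    // vector elements {2, 0, 3, 1} (in VL order), CurrentOrder becomes
    // {2, 0, 3, 1}; if they write elements in increasing order, the order is
    // the identity and is dropped.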
10403 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10404 ReuseShuffleIndices: {}, ReorderIndices: CurrentOrder);
10405 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
10406 TE->dump());
10407
10408 TE->setOperands(Operands);
10409 buildTreeRec(VLRef: TE->getOperand(OpIdx: 1), Depth: Depth + 1, UserTreeIdx: {TE, 1});
10410 return;
10411 }
10412 case Instruction::Load: {
10413 // Check that a vectorized load would load the same memory as a scalar
10414 // load. For example, we don't want to vectorize loads that are smaller
10415 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
10416 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10417 // from such a struct, we read/write packed bits disagreeing with the
10418 // unvectorized version.
10419 TreeEntry *TE = nullptr;
10420 fixupOrderingIndices(Order: CurrentOrder);
10421 switch (State) {
10422 case TreeEntry::Vectorize:
10423 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10424 ReuseShuffleIndices, ReorderIndices: CurrentOrder, InterleaveFactor);
10425 if (CurrentOrder.empty())
10426 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
10427 TE->dump());
10428 else
10429 LLVM_DEBUG(dbgs()
10430 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
10431 TE->dump());
10432 break;
10433 case TreeEntry::CompressVectorize:
10434 // Vectorizing non-consecutive loads with a (possibly masked) load + compress.
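// E.g., loads of a[0], a[2], a[3] and a[5] may be emitted as one wide
// (possibly masked) load of a[0..5] followed by a shuffle/compress that
// selects lanes {0, 2, 3, 5}.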
10435 TE = newTreeEntry(VL, EntryState: TreeEntry::CompressVectorize, Bundle, S,
10436 UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
10437 LLVM_DEBUG(
10438 dbgs()
10439 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
10440 TE->dump());
10441 break;
10442 case TreeEntry::StridedVectorize:
10443 // Vectorizing non-consecutive loads as strided loads.
10444 TE = newTreeEntry(VL, EntryState: TreeEntry::StridedVectorize, Bundle, S,
10445 UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
10446 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
10447 TE->dump());
10448 break;
10449 case TreeEntry::ScatterVectorize:
10450 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
10451 TE = newTreeEntry(VL, EntryState: TreeEntry::ScatterVectorize, Bundle, S,
10452 UserTreeIdx, ReuseShuffleIndices);
10453 LLVM_DEBUG(
10454 dbgs()
10455 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
10456 TE->dump());
10457 break;
10458 case TreeEntry::CombinedVectorize:
10459 case TreeEntry::SplitVectorize:
10460 case TreeEntry::NeedToGather:
10461 llvm_unreachable("Unexpected loads state.");
10462 }
10463 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
10464 assert(Operands.size() == 1 && "Expected a single operand only");
10465 SmallVector<int> Mask;
10466 inversePermutation(Indices: CurrentOrder, Mask);
10467 reorderScalars(Scalars&: Operands.front(), Mask);
10468 }
10469 TE->setOperands(Operands);
10470 if (State == TreeEntry::ScatterVectorize)
10471 buildTreeRec(VLRef: PointerOps, Depth: Depth + 1, UserTreeIdx: {TE, 0});
10472 return;
10473 }
10474 case Instruction::ZExt:
10475 case Instruction::SExt:
10476 case Instruction::FPToUI:
10477 case Instruction::FPToSI:
10478 case Instruction::FPExt:
10479 case Instruction::PtrToInt:
10480 case Instruction::IntToPtr:
10481 case Instruction::SIToFP:
10482 case Instruction::UIToFP:
10483 case Instruction::Trunc:
10484 case Instruction::FPTrunc:
10485 case Instruction::BitCast: {
10486 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
10487 u: std::make_pair(x: std::numeric_limits<unsigned>::min(),
10488 y: std::numeric_limits<unsigned>::max()));
10489 if (ShuffleOrOp == Instruction::ZExt ||
10490 ShuffleOrOp == Instruction::SExt) {
10491 CastMaxMinBWSizes = std::make_pair(
10492 x: std::max<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
10493 b: PrevMaxBW),
10494 y: std::min<unsigned>(
10495 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
10496 b: PrevMinBW));
10497 } else if (ShuffleOrOp == Instruction::Trunc) {
10498 CastMaxMinBWSizes = std::make_pair(
10499 x: std::max<unsigned>(
10500 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
10501 b: PrevMaxBW),
10502 y: std::min<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
10503 b: PrevMinBW));
10504 }
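    // Illustrative example (assumed types): a bundle of zext i8 -> i32 records
    // {MaxBW, MinBW} = {32, 8}; a later trunc i64 -> i16 bundle widens this to
    // {64, 8}. These bounds are consulted by the later bitwidth-minimization
    // analysis.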
10505 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10506 ReuseShuffleIndices);
10507 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
10508 TE->dump());
10509
10510 TE->setOperands(Operands);
10511 for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
10512 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
10513 if (ShuffleOrOp == Instruction::Trunc) {
10514 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
10515 } else if (ShuffleOrOp == Instruction::SIToFP ||
10516 ShuffleOrOp == Instruction::UIToFP) {
10517 unsigned NumSignBits =
10518 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
10519 if (auto *OpI = dyn_cast<Instruction>(Val: VL0->getOperand(i: 0))) {
10520 APInt Mask = DB->getDemandedBits(I: OpI);
10521 NumSignBits = std::max(a: NumSignBits, b: Mask.countl_zero());
10522 }
10523 if (NumSignBits * 2 >=
10524 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
10525 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
10526 }
10527 return;
10528 }
10529 case Instruction::ICmp:
10530 case Instruction::FCmp: {
10531 // Check that all of the compares have the same predicate.
10532 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
10533 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10534 ReuseShuffleIndices);
10535 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
10536 TE->dump());
10537
10538 VLOperands Ops(VL, Operands, S, *this);
10539 if (cast<CmpInst>(Val: VL0)->isCommutative()) {
10540 // Commutative predicate - collect + sort operands of the instructions
10541 // so that each side is more likely to have the same opcode.
10542 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
10543 "Commutative Predicate mismatch");
10544 Ops.reorder();
10545 Operands.front() = Ops.getVL(OpIdx: 0);
10546 Operands.back() = Ops.getVL(OpIdx: 1);
10547 } else {
10548 // Collect operands - commute if it uses the swapped predicate.
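        // E.g., if P0 is slt and this lane is written as icmp sgt %a, %b, its
        // operands are stored as (%b, %a) so that it reads as icmp slt %b, %a,
        // matching the rest of the bundle.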
10549 for (auto [Idx, V] : enumerate(First&: VL)) {
10550 if (isa<PoisonValue>(Val: V))
10551 continue;
10552 auto *Cmp = cast<CmpInst>(Val: V);
10553 if (Cmp->getPredicate() != P0)
10554 std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
10555 }
10556 }
10557 TE->setOperands(Operands);
10558 buildTreeRec(VLRef: Operands.front(), Depth: Depth + 1, UserTreeIdx: {TE, 0});
10559 buildTreeRec(VLRef: Operands.back(), Depth: Depth + 1, UserTreeIdx: {TE, 1});
10560 if (ShuffleOrOp == Instruction::ICmp) {
10561 unsigned NumSignBits0 =
10562 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
10563 if (NumSignBits0 * 2 >=
10564 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
10565 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
10566 unsigned NumSignBits1 =
10567 ComputeNumSignBits(Op: VL0->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT);
10568 if (NumSignBits1 * 2 >=
10569 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 1)->getType()))
10570 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 1)->Idx);
10571 }
10572 return;
10573 }
10574 case Instruction::Select:
10575 case Instruction::FNeg:
10576 case Instruction::Add:
10577 case Instruction::FAdd:
10578 case Instruction::Sub:
10579 case Instruction::FSub:
10580 case Instruction::Mul:
10581 case Instruction::FMul:
10582 case Instruction::UDiv:
10583 case Instruction::SDiv:
10584 case Instruction::FDiv:
10585 case Instruction::URem:
10586 case Instruction::SRem:
10587 case Instruction::FRem:
10588 case Instruction::Shl:
10589 case Instruction::LShr:
10590 case Instruction::AShr:
10591 case Instruction::And:
10592 case Instruction::Or:
10593 case Instruction::Xor:
10594 case Instruction::Freeze: {
10595 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10596 ReuseShuffleIndices);
10597 LLVM_DEBUG(
10598 dbgs() << "SLP: added a new TreeEntry "
10599 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
10600 TE->dump());
10601
10602 if (isa<BinaryOperator>(Val: VL0) && isCommutative(I: VL0)) {
10603 VLOperands Ops(VL, Operands, S, *this);
10604 Ops.reorder();
10605 Operands[0] = Ops.getVL(OpIdx: 0);
10606 Operands[1] = Ops.getVL(OpIdx: 1);
10607 }
10608 TE->setOperands(Operands);
10609 for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
10610 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
10611 return;
10612 }
10613 case Instruction::GetElementPtr: {
10614 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10615 ReuseShuffleIndices);
10616 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
10617 TE->dump());
10618 TE->setOperands(Operands);
10619
10620 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
10621 buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
10622 return;
10623 }
10624 case Instruction::Store: {
10625 bool Consecutive = CurrentOrder.empty();
10626 if (!Consecutive)
10627 fixupOrderingIndices(Order: CurrentOrder);
10628 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10629 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
10630 if (Consecutive)
10631 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
10632 TE->dump());
10633 else
10634 LLVM_DEBUG(
10635 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
10636 TE->dump());
10637 TE->setOperands(Operands);
10638 buildTreeRec(VLRef: TE->getOperand(OpIdx: 0), Depth: Depth + 1, UserTreeIdx: {TE, 0});
10639 return;
10640 }
10641 case Instruction::Call: {
10642 // Check if the calls are all to the same vectorizable intrinsic or
10643 // library function.
10644 CallInst *CI = cast<CallInst>(Val: VL0);
10645 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10646
10647 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10648 ReuseShuffleIndices);
10649 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
10650 TE->dump());
10651 if (isCommutative(I: VL0)) {
10652 VLOperands Ops(VL, Operands, S, *this);
10653 Ops.reorder();
10654 Operands[0] = Ops.getVL(OpIdx: 0);
10655 Operands[1] = Ops.getVL(OpIdx: 1);
10656 }
10657 TE->setOperands(Operands);
10658 for (unsigned I : seq<unsigned>(Size: CI->arg_size())) {
10659 // For scalar operands there is no need to create an entry since we do not
10660 // vectorize them.
10661 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI))
10662 continue;
10663 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
10664 }
10665 return;
10666 }
10667 case Instruction::ShuffleVector: {
10668 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
10669 ReuseShuffleIndices);
10670 if (S.isAltShuffle()) {
10671 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
10672 TE->dump());
10673 } else {
10674 assert(SLPReVec && "Only supported by REVEC.");
10675 LLVM_DEBUG(
10676 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
10677 TE->dump());
10678 }
10679
10680 // Reorder operands if reordering would enable vectorization.
10681 auto *CI = dyn_cast<CmpInst>(Val: VL0);
10682 if (CI && any_of(Range&: VL, P: [](Value *V) {
10683 return !isa<PoisonValue>(Val: V) && !cast<CmpInst>(Val: V)->isCommutative();
10684 })) {
10685 auto *MainCI = cast<CmpInst>(Val: S.getMainOp());
10686 auto *AltCI = cast<CmpInst>(Val: S.getAltOp());
10687 CmpInst::Predicate MainP = MainCI->getPredicate();
10688 CmpInst::Predicate AltP = AltCI->getPredicate();
10689 assert(MainP != AltP &&
10690 "Expected different main/alternate predicates.");
10691 // Collect operands - commute if it uses the swapped predicate or
10692 // alternate operation.
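        // E.g., if the main predicate is slt and a lane classified as the main
        // opcode is written as icmp sgt %a, %b, its operands are stored as
        // (%b, %a) so that every main lane reads as an slt comparison.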
10693 for (auto [Idx, V] : enumerate(First&: VL)) {
10694 if (isa<PoisonValue>(Val: V))
10695 continue;
10696 auto *Cmp = cast<CmpInst>(Val: V);
10697
10698 if (isAlternateInstruction(I: Cmp, MainOp: MainCI, AltOp: AltCI, TLI: *TLI)) {
10699 if (AltP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
10700 std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
10701 } else {
10702 if (MainP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
10703 std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
10704 }
10705 }
10706 TE->setOperands(Operands);
10707 buildTreeRec(VLRef: Operands.front(), Depth: Depth + 1, UserTreeIdx: {TE, 0});
10708 buildTreeRec(VLRef: Operands.back(), Depth: Depth + 1, UserTreeIdx: {TE, 1});
10709 return;
10710 }
10711
10712 if (isa<BinaryOperator>(Val: VL0) || CI) {
10713 VLOperands Ops(VL, Operands, S, *this);
10714 Ops.reorder();
10715 Operands[0] = Ops.getVL(OpIdx: 0);
10716 Operands[1] = Ops.getVL(OpIdx: 1);
10717 }
10718 TE->setOperands(Operands);
10719 for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
10720 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
10721 return;
10722 }
10723 default:
10724 break;
10725 }
10726 llvm_unreachable("Unexpected vectorization of the instructions.");
10727}
10728
10729unsigned BoUpSLP::canMapToVector(Type *T) const {
10730 unsigned N = 1;
10731 Type *EltTy = T;
10732
10733 while (isa<StructType, ArrayType, FixedVectorType>(Val: EltTy)) {
10734 if (EltTy->isEmptyTy())
10735 return 0;
10736 if (auto *ST = dyn_cast<StructType>(Val: EltTy)) {
10737 // Check that the struct is homogeneous.
10738 for (const auto *Ty : ST->elements())
10739 if (Ty != *ST->element_begin())
10740 return 0;
10741 N *= ST->getNumElements();
10742 EltTy = *ST->element_begin();
10743 } else if (auto *AT = dyn_cast<ArrayType>(Val: EltTy)) {
10744 N *= AT->getNumElements();
10745 EltTy = AT->getElementType();
10746 } else {
10747 auto *VT = cast<FixedVectorType>(Val: EltTy);
10748 N *= VT->getNumElements();
10749 EltTy = VT->getElementType();
10750 }
10751 }
10752
10753 if (!isValidElementType(Ty: EltTy))
10754 return 0;
10755 size_t VTSize = DL->getTypeStoreSizeInBits(Ty: getWidenedType(ScalarTy: EltTy, VF: N));
10756 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
10757 VTSize != DL->getTypeStoreSizeInBits(Ty: T))
10758 return 0;
10759 return N;
10760}
10761
10762bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
10763 SmallVectorImpl<unsigned> &CurrentOrder,
10764 bool ResizeAllowed) const {
10765 const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst, ExtractValueInst>);
10766 assert(It != VL.end() && "Expected at least one extract instruction.");
10767 auto *E0 = cast<Instruction>(Val: *It);
10768 assert(
10769 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
10770 "Invalid opcode");
10771 // Check if all of the extracts come from the same vector and from the
10772 // correct offset.
10773 Value *Vec = E0->getOperand(i: 0);
10774
10775 CurrentOrder.clear();
10776
10777 // We have to extract from a vector/aggregate with the same number of elements.
10778 unsigned NElts;
10779 if (E0->getOpcode() == Instruction::ExtractValue) {
10780 NElts = canMapToVector(T: Vec->getType());
10781 if (!NElts)
10782 return false;
10783 // Check if the load can be rewritten as a load of a vector.
10784 LoadInst *LI = dyn_cast<LoadInst>(Val: Vec);
10785 if (!LI || !LI->isSimple() || !LI->hasNUses(N: VL.size()))
10786 return false;
10787 } else {
10788 NElts = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
10789 }
10790
10791 unsigned E = VL.size();
10792 if (!ResizeAllowed && NElts != E)
10793 return false;
10794 SmallVector<int> Indices(E, PoisonMaskElem);
10795 unsigned MinIdx = NElts, MaxIdx = 0;
10796 for (auto [I, V] : enumerate(First&: VL)) {
10797 auto *Inst = dyn_cast<Instruction>(Val: V);
10798 if (!Inst)
10799 continue;
10800 if (Inst->getOperand(i: 0) != Vec)
10801 return false;
10802 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst))
10803 if (isa<UndefValue>(Val: EE->getIndexOperand()))
10804 continue;
10805 std::optional<unsigned> Idx = getExtractIndex(E: Inst);
10806 if (!Idx)
10807 return false;
10808 const unsigned ExtIdx = *Idx;
10809 if (ExtIdx >= NElts)
10810 continue;
10811 Indices[I] = ExtIdx;
10812 if (MinIdx > ExtIdx)
10813 MinIdx = ExtIdx;
10814 if (MaxIdx < ExtIdx)
10815 MaxIdx = ExtIdx;
10816 }
10817 if (MaxIdx - MinIdx + 1 > E)
10818 return false;
10819 if (MaxIdx + 1 <= E)
10820 MinIdx = 0;
10821
10822 // Check that all of the indices extract from the correct offset.
10823 bool ShouldKeepOrder = true;
10824 // Assign to all items the initial value E so we can check if the extract
10825 // instruction index was used already.
10826 // Also, later we can check that all the indices are used and we have a
10827 // consecutive access in the extract instructions, by checking that no
10828 // element of CurrentOrder still has the value E.
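// Illustrative example (hypothetical extracts): for rebased extract indices
// {2, 0, 1, 3} this produces CurrentOrder = {1, 2, 0, 3} and returns false;
// for {0, 1, 2, 3} the order is the identity, CurrentOrder is cleared and
// the function returns true.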
10829 CurrentOrder.assign(NumElts: E, Elt: E);
10830 for (unsigned I = 0; I < E; ++I) {
10831 if (Indices[I] == PoisonMaskElem)
10832 continue;
10833 const unsigned ExtIdx = Indices[I] - MinIdx;
10834 if (CurrentOrder[ExtIdx] != E) {
10835 CurrentOrder.clear();
10836 return false;
10837 }
10838 ShouldKeepOrder &= ExtIdx == I;
10839 CurrentOrder[ExtIdx] = I;
10840 }
10841 if (ShouldKeepOrder)
10842 CurrentOrder.clear();
10843
10844 return ShouldKeepOrder;
10845}
10846
10847bool BoUpSLP::areAllUsersVectorized(
10848 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
10849 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(V: I))) ||
10850 all_of(Range: I->users(), P: [this](User *U) {
10851 return isVectorized(V: U) || isVectorLikeInstWithConstOps(V: U) ||
10852 (isa<ExtractElementInst>(Val: U) && MustGather.contains(Ptr: U));
10853 });
10854}
10855
10856void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
10857 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
10858 SmallVectorImpl<Value *> *OpScalars,
10859 SmallVectorImpl<Value *> *AltScalars) const {
10860 unsigned Sz = Scalars.size();
10861 Mask.assign(NumElts: Sz, Elt: PoisonMaskElem);
10862 SmallVector<int> OrderMask;
10863 if (!ReorderIndices.empty())
10864 inversePermutation(Indices: ReorderIndices, Mask&: OrderMask);
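  // Illustrative example (hypothetical bundle): for Scalars = {add0, sub0,
  // add1, sub1} with no reordering, where IsAltOp matches the subs, the
  // resulting Mask is {0, 5, 2, 7}: even lanes come from the main (add)
  // vector and odd lanes from the alternate (sub) vector.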
10865 for (unsigned I = 0; I < Sz; ++I) {
10866 unsigned Idx = I;
10867 if (!ReorderIndices.empty())
10868 Idx = OrderMask[I];
10869 if (isa<PoisonValue>(Val: Scalars[Idx]))
10870 continue;
10871 auto *OpInst = cast<Instruction>(Val: Scalars[Idx]);
10872 if (IsAltOp(OpInst)) {
10873 Mask[I] = Sz + Idx;
10874 if (AltScalars)
10875 AltScalars->push_back(Elt: OpInst);
10876 } else {
10877 Mask[I] = Idx;
10878 if (OpScalars)
10879 OpScalars->push_back(Elt: OpInst);
10880 }
10881 }
10882 if (!ReuseShuffleIndices.empty()) {
10883 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
10884 transform(Range: ReuseShuffleIndices, d_first: NewMask.begin(), F: [&Mask](int Idx) {
10885 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
10886 });
10887 Mask.swap(RHS&: NewMask);
10888 }
10889}
10890
10891static bool isMainInstruction(Instruction *I, Instruction *MainOp,
10892 Instruction *AltOp,
10893 const TargetLibraryInfo &TLI) {
10894 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
10895}
10896
10897static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
10898 Instruction *AltOp,
10899 const TargetLibraryInfo &TLI) {
10900 if (auto *MainCI = dyn_cast<CmpInst>(Val: MainOp)) {
10901 auto *AltCI = cast<CmpInst>(Val: AltOp);
10902 CmpInst::Predicate MainP = MainCI->getPredicate();
10903 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
10904 assert(MainP != AltP && "Expected different main/alternate predicates.");
10905 auto *CI = cast<CmpInst>(Val: I);
10906 if (isCmpSameOrSwapped(BaseCI: MainCI, CI, TLI))
10907 return false;
10908 if (isCmpSameOrSwapped(BaseCI: AltCI, CI, TLI))
10909 return true;
10910 CmpInst::Predicate P = CI->getPredicate();
10911 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(pred: P);
10912
10913 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
10914 "CmpInst expected to match either main or alternate predicate or "
10915 "their swap.");
10916 return MainP != P && MainP != SwappedP;
10917 }
10918 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
10919}
10920
10921TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
10922 assert(!Ops.empty());
10923 const auto *Op0 = Ops.front();
10924
10925 const bool IsConstant = all_of(Range&: Ops, P: [](Value *V) {
10926 // TODO: We should allow undef elements here
10927 return isConstant(V) && !isa<UndefValue>(Val: V);
10928 });
10929 const bool IsUniform = all_of(Range&: Ops, P: [=](Value *V) {
10930 // TODO: We should allow undef elements here
10931 return V == Op0;
10932 });
10933 const bool IsPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
10934 // TODO: We should allow undef elements here
10935 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
10936 return CI->getValue().isPowerOf2();
10937 return false;
10938 });
10939 const bool IsNegatedPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
10940 // TODO: We should allow undef elements here
10941 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
10942 return CI->getValue().isNegatedPowerOf2();
10943 return false;
10944 });
10945
10946 TTI::OperandValueKind VK = TTI::OK_AnyValue;
10947 if (IsConstant && IsUniform)
10948 VK = TTI::OK_UniformConstantValue;
10949 else if (IsConstant)
10950 VK = TTI::OK_NonUniformConstantValue;
10951 else if (IsUniform)
10952 VK = TTI::OK_UniformValue;
10953
10954 TTI::OperandValueProperties VP = TTI::OP_None;
10955 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
10956 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
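  // E.g., if every operand is the constant 8, the result is
  // {OK_UniformConstantValue, OP_PowerOf2}; for distinct non-constant operands
  // it is {OK_AnyValue, OP_None}.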
10957
10958 return {.Kind: VK, .Properties: VP};
10959}
10960
10961namespace {
10962/// The base class for shuffle instruction emission and shuffle cost estimation.
10963class BaseShuffleAnalysis {
10964protected:
10965 Type *ScalarTy = nullptr;
10966
10967 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
10968
10969 /// V is expected to be a vectorized value.
10970 /// When REVEC is disabled, there is no difference between VF and
10971 /// VNumElements.
10972 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
10973 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
10974 /// of 8.
10975 unsigned getVF(Value *V) const {
10976 assert(V && "V cannot be nullptr");
10977 assert(isa<FixedVectorType>(V->getType()) &&
10978 "V does not have FixedVectorType");
10979 assert(ScalarTy && "ScalarTy cannot be nullptr");
10980 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
10981 unsigned VNumElements =
10982 cast<FixedVectorType>(Val: V->getType())->getNumElements();
10983 assert(VNumElements > ScalarTyNumElements &&
10984 "the number of elements of V is not large enough");
10985 assert(VNumElements % ScalarTyNumElements == 0 &&
10986 "the number of elements of V is not a vectorized value");
10987 return VNumElements / ScalarTyNumElements;
10988 }
10989
10990 /// Checks if the mask is an identity mask.
10991 /// \param IsStrict if it is true, the function returns false if the mask
10992 /// size does not match the vector size.
10993 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
10994 bool IsStrict) {
10995 int Limit = Mask.size();
10996 int VF = VecTy->getNumElements();
10997 int Index = -1;
10998 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Limit))
10999 return true;
11000 if (!IsStrict) {
11001 // Consider extract subvector starting from index 0.
11002 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
11003 Index == 0)
11004 return true;
11005 // All VF-size submasks are identity (e.g.
11006 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
11007 if (Limit % VF == 0 && all_of(Range: seq<int>(Begin: 0, End: Limit / VF), P: [=](int Idx) {
11008 ArrayRef<int> Slice = Mask.slice(N: Idx * VF, M: VF);
11009 return all_of(Range&: Slice, P: [](int I) { return I == PoisonMaskElem; }) ||
11010 ShuffleVectorInst::isIdentityMask(Mask: Slice, NumSrcElts: VF);
11011 }))
11012 return true;
11013 }
11014 return false;
11015 }
11016
11017 /// Tries to combine 2 different masks into single one.
11018 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
11019 /// change the size of the vector, \p LocalVF is the original size of the
11020 /// shuffled vector.
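  /// For example (illustrative), with \p LocalVF 2, \p Mask <1, 0> and
  /// \p ExtMask <0, 1, 2, 3>, the resulting combined mask is <1, 0, 1, 0>.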
11021 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
11022 ArrayRef<int> ExtMask) {
11023 unsigned VF = Mask.size();
11024 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11025 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11026 if (ExtMask[I] == PoisonMaskElem)
11027 continue;
11028 int MaskedIdx = Mask[ExtMask[I] % VF];
11029 NewMask[I] =
11030 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
11031 }
11032 Mask.swap(RHS&: NewMask);
11033 }
11034
11035 /// Looks through shuffles trying to reduce final number of shuffles in the
11036 /// code. The function looks through the previously emitted shuffle
11037 /// instructions and properly marks indices in the mask as undef.
11038 /// For example, given the code
11039 /// \code
11040 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11041 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11042 /// \endcode
11043 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
11044 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
11045 /// <0, 1, 2, 3> for the shuffle.
11046 /// If 2 operands are of different size, the smallest one will be resized and
11047 /// the mask recalculated properly.
11048 /// For example, given the code
11049 /// \code
11050 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11051 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11052 /// \endcode
  /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>,
  /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// So, it tries to transform permutations into a simple vector merge, if
  /// possible.
11058 /// \param V The input vector which must be shuffled using the given \p Mask.
11059 /// If the better candidate is found, \p V is set to this best candidate
11060 /// vector.
11061 /// \param Mask The input mask for the shuffle. If the best candidate is found
11062 /// during looking-through-shuffles attempt, it is updated accordingly.
11063 /// \param SinglePermute true if the shuffle operation is originally a
11064 /// single-value-permutation. In this case the look-through-shuffles procedure
11065 /// may look for resizing shuffles as the best candidates.
  /// \return true if the shuffle results in a non-resizing identity shuffle
  /// (and thus can be ignored), false otherwise.
11068 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
11069 bool SinglePermute) {
11070 Value *Op = V;
11071 ShuffleVectorInst *IdentityOp = nullptr;
11072 SmallVector<int> IdentityMask;
11073 while (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Op)) {
11074 // Exit if not a fixed vector type or changing size shuffle.
11075 auto *SVTy = dyn_cast<FixedVectorType>(Val: SV->getType());
11076 if (!SVTy)
11077 break;
11078 // Remember the identity or broadcast mask, if it is not a resizing
11079 // shuffle. If no better candidates are found, this Op and Mask will be
11080 // used in the final shuffle.
11081 if (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/false)) {
11082 if (!IdentityOp || !SinglePermute ||
11083 (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/true) &&
11084 !ShuffleVectorInst::isZeroEltSplatMask(Mask: IdentityMask,
11085 NumSrcElts: IdentityMask.size()))) {
11086 IdentityOp = SV;
          // Store the current mask in IdentityMask so that we do not lose
          // this info later if IdentityOp is selected as the best candidate
          // for the permutation.
11090 IdentityMask.assign(RHS: Mask);
11091 }
11092 }
11093 // Remember the broadcast mask. If no better candidates are found, this Op
11094 // and Mask will be used in the final shuffle.
      // A zero splat can be used as an identity too, since it might be used
      // with the mask <0, 1, 2, ...>, i.e. an identity mask without extra
      // reshuffling.
      // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>,
      // which is expensive, and the analysis finds out that the source vector
      // is just a broadcast, this original mask can be transformed into the
      // identity mask <0, 1, 2, 3>.
11101 // \code
      // %0 = shuffle %v, poison, zeroinitializer
11103 // %res = shuffle %0, poison, <3, 1, 2, 0>
11104 // \endcode
11105 // may be transformed to
11106 // \code
      // %0 = shuffle %v, poison, zeroinitializer
11108 // %res = shuffle %0, poison, <0, 1, 2, 3>
11109 // \endcode
11110 if (SV->isZeroEltSplat()) {
11111 IdentityOp = SV;
11112 IdentityMask.assign(RHS: Mask);
11113 }
11114 int LocalVF = Mask.size();
11115 if (auto *SVOpTy =
11116 dyn_cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType()))
11117 LocalVF = SVOpTy->getNumElements();
11118 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
11119 for (auto [Idx, I] : enumerate(First&: Mask)) {
11120 if (I == PoisonMaskElem ||
11121 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
11122 continue;
11123 ExtMask[Idx] = SV->getMaskValue(Elt: I);
11124 }
11125 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
11126 V: SV->getOperand(i_nocapture: 0),
11127 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::FirstArg))
11128 .all();
11129 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
11130 V: SV->getOperand(i_nocapture: 1),
11131 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::SecondArg))
11132 .all();
11133 if (!IsOp1Undef && !IsOp2Undef) {
11134 // Update mask and mark undef elems.
11135 for (int &I : Mask) {
11136 if (I == PoisonMaskElem)
11137 continue;
11138 if (SV->getMaskValue(Elt: I % SV->getShuffleMask().size()) ==
11139 PoisonMaskElem)
11140 I = PoisonMaskElem;
11141 }
11142 break;
11143 }
11144 SmallVector<int> ShuffleMask(SV->getShuffleMask());
11145 combineMasks(LocalVF, Mask&: ShuffleMask, ExtMask: Mask);
11146 Mask.swap(RHS&: ShuffleMask);
11147 if (IsOp2Undef)
11148 Op = SV->getOperand(i_nocapture: 0);
11149 else
11150 Op = SV->getOperand(i_nocapture: 1);
11151 }
11152 if (auto *OpTy = dyn_cast<FixedVectorType>(Val: Op->getType());
11153 !OpTy || !isIdentityMask(Mask, VecTy: OpTy, IsStrict: SinglePermute) ||
11154 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())) {
11155 if (IdentityOp) {
11156 V = IdentityOp;
11157 assert(Mask.size() == IdentityMask.size() &&
11158 "Expected masks of same sizes.");
11159 // Clear known poison elements.
11160 for (auto [I, Idx] : enumerate(First&: Mask))
11161 if (Idx == PoisonMaskElem)
11162 IdentityMask[I] = PoisonMaskElem;
11163 Mask.swap(RHS&: IdentityMask);
11164 auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: V);
11165 return SinglePermute &&
11166 (isIdentityMask(Mask, VecTy: cast<FixedVectorType>(Val: V->getType()),
11167 /*IsStrict=*/true) ||
11168 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
11169 Shuffle->isZeroEltSplat() &&
11170 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size()) &&
11171 all_of(Range: enumerate(First&: Mask), P: [&](const auto &P) {
11172 return P.value() == PoisonMaskElem ||
11173 Shuffle->getShuffleMask()[P.index()] == 0;
11174 })));
11175 }
11176 V = Op;
11177 return false;
11178 }
11179 V = Op;
11180 return true;
11181 }
11182
  /// Smart shuffle instruction emission: walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
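  /// A rough sketch of the interface \p Builder is expected to provide, as
  /// used below (T is the result type; ShuffleCostBuilder further down is one
  /// concrete, cost-modelling implementation):
  /// \code
  ///   struct SomeShuffleBuilder {
  ///     void resizeToMatch(Value *&V1, Value *&V2);
  ///     T createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask);
  ///     T createShuffleVector(Value *V1, ArrayRef<int> Mask);
  ///     T createIdentity(Value *V);
  ///     T createPoison(Type *EltTy, unsigned VF);
  ///   };
  /// \endcode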
11186 template <typename T, typename ShuffleBuilderTy>
11187 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
11188 ShuffleBuilderTy &Builder, Type *ScalarTy) {
11189 assert(V1 && "Expected at least one vector value.");
11190 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
11191 SmallVector<int> NewMask(Mask);
11192 if (ScalarTyNumElements != 1) {
11193 assert(SLPReVec && "FixedVectorType is not expected.");
11194 transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask&: NewMask);
11195 Mask = NewMask;
11196 }
11197 if (V2)
11198 Builder.resizeToMatch(V1, V2);
11199 int VF = Mask.size();
11200 if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
11201 VF = FTy->getNumElements();
11202 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
11203 V: V2, UseMask: buildUseMask(VF, Mask, MaskArg: UseMask::SecondArg))
11204 .all()) {
11205 // Peek through shuffles.
11206 Value *Op1 = V1;
11207 Value *Op2 = V2;
11208 int VF =
11209 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
11210 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
11211 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
11212 for (int I = 0, E = Mask.size(); I < E; ++I) {
11213 if (Mask[I] < VF)
11214 CombinedMask1[I] = Mask[I];
11215 else
11216 CombinedMask2[I] = Mask[I] - VF;
11217 }
11218 Value *PrevOp1;
11219 Value *PrevOp2;
11220 do {
11221 PrevOp1 = Op1;
11222 PrevOp2 = Op2;
11223 (void)peekThroughShuffles(V&: Op1, Mask&: CombinedMask1, /*SinglePermute=*/false);
11224 (void)peekThroughShuffles(V&: Op2, Mask&: CombinedMask2, /*SinglePermute=*/false);
11225 // Check if we have 2 resizing shuffles - need to peek through operands
11226 // again.
11227 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Val: Op1))
11228 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Val: Op2)) {
11229 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
11230 for (auto [Idx, I] : enumerate(First&: CombinedMask1)) {
11231 if (I == PoisonMaskElem)
11232 continue;
11233 ExtMask1[Idx] = SV1->getMaskValue(Elt: I);
11234 }
11235 SmallBitVector UseMask1 = buildUseMask(
11236 VF: cast<FixedVectorType>(Val: SV1->getOperand(i_nocapture: 1)->getType())
11237 ->getNumElements(),
11238 Mask: ExtMask1, MaskArg: UseMask::SecondArg);
11239 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
11240 for (auto [Idx, I] : enumerate(First&: CombinedMask2)) {
11241 if (I == PoisonMaskElem)
11242 continue;
11243 ExtMask2[Idx] = SV2->getMaskValue(Elt: I);
11244 }
11245 SmallBitVector UseMask2 = buildUseMask(
11246 VF: cast<FixedVectorType>(Val: SV2->getOperand(i_nocapture: 1)->getType())
11247 ->getNumElements(),
11248 Mask: ExtMask2, MaskArg: UseMask::SecondArg);
11249 if (SV1->getOperand(i_nocapture: 0)->getType() ==
11250 SV2->getOperand(i_nocapture: 0)->getType() &&
11251 SV1->getOperand(i_nocapture: 0)->getType() != SV1->getType() &&
11252 isUndefVector(V: SV1->getOperand(i_nocapture: 1), UseMask: UseMask1).all() &&
11253 isUndefVector(V: SV2->getOperand(i_nocapture: 1), UseMask: UseMask2).all()) {
11254 Op1 = SV1->getOperand(i_nocapture: 0);
11255 Op2 = SV2->getOperand(i_nocapture: 0);
11256 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
11257 int LocalVF = ShuffleMask1.size();
11258 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op1->getType()))
11259 LocalVF = FTy->getNumElements();
11260 combineMasks(LocalVF, Mask&: ShuffleMask1, ExtMask: CombinedMask1);
11261 CombinedMask1.swap(RHS&: ShuffleMask1);
11262 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
11263 LocalVF = ShuffleMask2.size();
11264 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op2->getType()))
11265 LocalVF = FTy->getNumElements();
11266 combineMasks(LocalVF, Mask&: ShuffleMask2, ExtMask: CombinedMask2);
11267 CombinedMask2.swap(RHS&: ShuffleMask2);
11268 }
11269 }
11270 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
11271 Builder.resizeToMatch(Op1, Op2);
11272 VF = std::max(a: cast<VectorType>(Val: Op1->getType())
11273 ->getElementCount()
11274 .getKnownMinValue(),
11275 b: cast<VectorType>(Val: Op2->getType())
11276 ->getElementCount()
11277 .getKnownMinValue());
11278 for (int I = 0, E = Mask.size(); I < E; ++I) {
11279 if (CombinedMask2[I] != PoisonMaskElem) {
11280 assert(CombinedMask1[I] == PoisonMaskElem &&
11281 "Expected undefined mask element");
11282 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
11283 }
11284 }
11285 if (Op1 == Op2 &&
11286 (ShuffleVectorInst::isIdentityMask(Mask: CombinedMask1, NumSrcElts: VF) ||
11287 (ShuffleVectorInst::isZeroEltSplatMask(Mask: CombinedMask1, NumSrcElts: VF) &&
11288 isa<ShuffleVectorInst>(Val: Op1) &&
11289 cast<ShuffleVectorInst>(Val: Op1)->getShuffleMask() ==
11290 ArrayRef(CombinedMask1))))
11291 return Builder.createIdentity(Op1);
11292 return Builder.createShuffleVector(
11293 Op1, Op1 == Op2 ? PoisonValue::get(T: Op1->getType()) : Op2,
11294 CombinedMask1);
11295 }
11296 if (isa<PoisonValue>(Val: V1))
11297 return Builder.createPoison(
11298 cast<VectorType>(Val: V1->getType())->getElementType(), Mask.size());
11299 bool IsIdentity = peekThroughShuffles(V&: V1, Mask&: NewMask, /*SinglePermute=*/true);
11300 assert(V1 && "Expected non-null value after looking through shuffles.");
11301
11302 if (!IsIdentity)
11303 return Builder.createShuffleVector(V1, NewMask);
11304 return Builder.createIdentity(V1);
11305 }
11306
  /// Transforms the mask \p CommonMask per the given \p Mask, so that after
  /// the shuffle emission every used lane refers to the emitted result by its
  /// own index.
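  /// E.g. if \p Mask == <2, poison, 0, poison>, lanes 0 and 2 are now
  /// produced by the just-emitted shuffle, so CommonMask becomes
  /// <0, x, 2, x>, where x stands for whatever those lanes held before.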
11309 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11310 ArrayRef<int> Mask) {
11311 for (unsigned I : seq<unsigned>(Size: CommonMask.size()))
11312 if (Mask[I] != PoisonMaskElem)
11313 CommonMask[I] = I;
11314 }
11315};
11316} // namespace
11317
/// Calculate the scalar and the vector costs from vectorizing a set of GEPs.
11319static std::pair<InstructionCost, InstructionCost>
11320getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
11321 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
11322 Type *ScalarTy, VectorType *VecTy) {
11323 InstructionCost ScalarCost = 0;
11324 InstructionCost VecCost = 0;
11325 // Here we differentiate two cases: (1) when Ptrs represent a regular
11326 // vectorization tree node (as they are pointer arguments of scattered
11327 // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as a plain wide unit-stride load/store since all the
11329 // loads/stores are known to be from/to adjacent locations.
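  // Illustrative example: the GEPs feeding four consecutive i32 loads that
  // are folded into a single wide load fall under case (2); the pointer
  // operands of loads that become a masked gather fall under case (1).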
11330 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
11331 // Case 2: estimate costs for pointer related costs when vectorizing to
11332 // a wide load/store.
11333 // Scalar cost is estimated as a set of pointers with known relationship
11334 // between them.
11335 // For vector code we will use BasePtr as argument for the wide load/store
11336 // but we also need to account all the instructions which are going to
11337 // stay in vectorized code due to uses outside of these scalar
11338 // loads/stores.
11339 ScalarCost = TTI.getPointersChainCost(
11340 Ptrs, Base: BasePtr, Info: TTI::PointersChainInfo::getUnitStride(), AccessTy: ScalarTy,
11341 CostKind);
11342
11343 SmallVector<const Value *> PtrsRetainedInVecCode;
11344 for (Value *V : Ptrs) {
11345 if (V == BasePtr) {
11346 PtrsRetainedInVecCode.push_back(Elt: V);
11347 continue;
11348 }
11349 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
      // For simplicity, assume Ptr stays in vectorized code if it's not a
      // GEP instruction. We don't care, since its cost is considered free.
11352 // TODO: We should check for any uses outside of vectorizable tree
11353 // rather than just single use.
11354 if (!Ptr || !Ptr->hasOneUse())
11355 PtrsRetainedInVecCode.push_back(Elt: V);
11356 }
11357
11358 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
11359 // If all pointers stay in vectorized code then we don't have
11360 // any savings on that.
11361 return std::make_pair(x: TTI::TCC_Free, y: TTI::TCC_Free);
11362 }
11363 VecCost = TTI.getPointersChainCost(Ptrs: PtrsRetainedInVecCode, Base: BasePtr,
11364 Info: TTI::PointersChainInfo::getKnownStride(),
11365 AccessTy: VecTy, CostKind);
11366 } else {
11367 // Case 1: Ptrs are the arguments of loads that we are going to transform
11368 // into masked gather load intrinsic.
11369 // All the scalar GEPs will be removed as a result of vectorization.
11370 // For any external uses of some lanes extract element instructions will
11371 // be generated (which cost is estimated separately).
11372 TTI::PointersChainInfo PtrsInfo =
11373 all_of(Range&: Ptrs,
11374 P: [](const Value *V) {
11375 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
11376 return Ptr && !Ptr->hasAllConstantIndices();
11377 })
11378 ? TTI::PointersChainInfo::getUnknownStride()
11379 : TTI::PointersChainInfo::getKnownStride();
11380
11381 ScalarCost =
11382 TTI.getPointersChainCost(Ptrs, Base: BasePtr, Info: PtrsInfo, AccessTy: ScalarTy, CostKind);
11383 auto *BaseGEP = dyn_cast<GEPOperator>(Val: BasePtr);
11384 if (!BaseGEP) {
11385 auto *It = find_if(Range&: Ptrs, P: IsaPred<GEPOperator>);
11386 if (It != Ptrs.end())
11387 BaseGEP = cast<GEPOperator>(Val: *It);
11388 }
11389 if (BaseGEP) {
11390 SmallVector<const Value *> Indices(BaseGEP->indices());
11391 VecCost = TTI.getGEPCost(PointeeType: BaseGEP->getSourceElementType(),
11392 Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: VecTy,
11393 CostKind);
11394 }
11395 }
11396
11397 return std::make_pair(x&: ScalarCost, y&: VecCost);
11398}
11399
11400void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
11401 assert(TE.isGather() && TE.ReorderIndices.empty() &&
11402 "Expected gather node without reordering.");
11403 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
11404 SmallSet<size_t, 2> LoadKeyUsed;
11405
  // Do not reorder nodes if the node is small (just 2 elements), all-constant,
  // or if all instructions already have the same opcode.
11408 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
11409 all_of(Range&: TE.Scalars, P: isConstant))
11410 return;
11411
11412 if (any_of(Range: seq<unsigned>(Size: TE.Idx), P: [&](unsigned Idx) {
11413 return VectorizableTree[Idx]->isSame(VL: TE.Scalars);
11414 }))
11415 return;
11416
11417 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
11418 Key = hash_combine(args: hash_value(ptr: LI->getParent()), args: Key);
11419 Value *Ptr =
11420 getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
11421 if (LoadKeyUsed.contains(V: Key)) {
11422 auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr));
11423 if (LIt != LoadsMap.end()) {
11424 for (LoadInst *RLI : LIt->second) {
11425 if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
11426 ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL: *DL, SE&: *SE,
11427 /*StrictCheck=*/true))
11428 return hash_value(ptr: RLI->getPointerOperand());
11429 }
11430 for (LoadInst *RLI : LIt->second) {
11431 if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
11432 Ptr2: LI->getPointerOperand(), TLI: *TLI)) {
11433 hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
11434 return SubKey;
11435 }
11436 }
11437 if (LIt->second.size() > 2) {
11438 hash_code SubKey =
11439 hash_value(ptr: LIt->second.back()->getPointerOperand());
11440 return SubKey;
11441 }
11442 }
11443 }
11444 LoadKeyUsed.insert(V: Key);
11445 LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr)).first->second.push_back(Elt: LI);
11446 return hash_value(ptr: LI->getPointerOperand());
11447 };
11448 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
11449 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
11450 bool IsOrdered = true;
11451 unsigned NumInstructions = 0;
11452 // Try to "cluster" scalar instructions, to be able to build extra vectorized
11453 // nodes.
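  // Illustrative example: for a gather like <a0, b0, a1, b1> (hypothetical
  // scalars), where the a* and b* values hash to different (Key, Idx) pairs,
  // the node may be reordered to <a0, a1, b0, b1> so that each group can
  // later be built as its own subvector (recorded in SubVectors below).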
11454 for (auto [I, V] : enumerate(First&: TE.Scalars)) {
11455 size_t Key = 1, Idx = 1;
11456 if (auto *Inst = dyn_cast<Instruction>(Val: V);
11457 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(Val: V) &&
11458 !isDeleted(I: Inst) && !isVectorized(V)) {
11459 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
11460 /*AllowAlternate=*/false);
11461 ++NumInstructions;
11462 }
11463 auto &Container = SortedValues[Key];
11464 if (IsOrdered && !KeyToIndex.contains(Val: V) &&
11465 !(isa<Constant, ExtractElementInst>(Val: V) ||
11466 isVectorLikeInstWithConstOps(V)) &&
11467 ((Container.contains(Key: Idx) &&
11468 KeyToIndex.at(Val: Container[Idx].back()).back() != I - 1) ||
11469 (!Container.empty() && !Container.contains(Key: Idx) &&
11470 KeyToIndex.at(Val: Container.back().second.back()).back() != I - 1)))
11471 IsOrdered = false;
11472 auto &KTI = KeyToIndex[V];
11473 if (KTI.empty())
11474 Container[Idx].push_back(Elt: V);
11475 KTI.push_back(Elt: I);
11476 }
11477 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
11478 APInt DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size());
11479 if (!IsOrdered && NumInstructions > 1) {
11480 unsigned Cnt = 0;
11481 TE.ReorderIndices.resize(N: TE.Scalars.size(), NV: TE.Scalars.size());
11482 for (const auto &D : SortedValues) {
11483 for (const auto &P : D.second) {
11484 unsigned Sz = 0;
11485 for (Value *V : P.second) {
11486 ArrayRef<unsigned> Indices = KeyToIndex.at(Val: V);
11487 for (auto [K, Idx] : enumerate(First&: Indices)) {
11488 TE.ReorderIndices[Cnt + K] = Idx;
11489 TE.Scalars[Cnt + K] = V;
11490 }
11491 Sz += Indices.size();
11492 Cnt += Indices.size();
11493 }
11494 if (Sz > 1 && isa<Instruction>(Val: P.second.front())) {
11495 const unsigned SubVF = getFloorFullVectorNumberOfElements(
11496 TTI: *TTI, Ty: TE.Scalars.front()->getType(), Sz);
11497 SubVectors.emplace_back(Args: Cnt - Sz, Args: SubVF);
11498 for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt - Sz + SubVF))
11499 DemandedElts.clearBit(BitPosition: I);
11500 } else if (!P.second.empty() && isConstant(V: P.second.front())) {
11501 for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt))
11502 DemandedElts.clearBit(BitPosition: I);
11503 }
11504 }
11505 }
11506 }
11507 // Reuses always require shuffles, so consider it as profitable.
11508 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
11509 return;
11510 // Do simple cost estimation.
11511 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11512 InstructionCost Cost = 0;
11513 auto *ScalarTy = TE.Scalars.front()->getType();
11514 auto *VecTy = getWidenedType(ScalarTy, VF: TE.Scalars.size());
11515 for (auto [Idx, Sz] : SubVectors) {
11516 Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind,
11517 Index: Idx, SubTp: getWidenedType(ScalarTy, VF: Sz));
11518 }
11519 Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
11520 /*Insert=*/true,
11521 /*Extract=*/false, CostKind);
11522 int Sz = TE.Scalars.size();
11523 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
11524 TE.ReorderIndices.end());
11525 for (unsigned I : seq<unsigned>(Size: Sz)) {
11526 Value *V = TE.getOrdered(Idx: I);
11527 if (isa<PoisonValue>(Val: V)) {
11528 ReorderMask[I] = PoisonMaskElem;
11529 } else if (isConstant(V) || DemandedElts[I]) {
11530 ReorderMask[I] = I + TE.ReorderIndices.size();
11531 }
11532 }
11533 Cost += ::getShuffleCost(TTI: *TTI,
11534 Kind: any_of(Range&: ReorderMask, P: [&](int I) { return I >= Sz; })
11535 ? TTI::SK_PermuteTwoSrc
11536 : TTI::SK_PermuteSingleSrc,
11537 Tp: VecTy, Mask: ReorderMask);
11538 DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size());
11539 ReorderMask.assign(NumElts: Sz, Elt: PoisonMaskElem);
11540 for (unsigned I : seq<unsigned>(Size: Sz)) {
11541 Value *V = TE.getOrdered(Idx: I);
11542 if (isConstant(V)) {
11543 DemandedElts.clearBit(BitPosition: I);
11544 if (!isa<PoisonValue>(Val: V))
11545 ReorderMask[I] = I;
11546 } else {
11547 ReorderMask[I] = I + Sz;
11548 }
11549 }
11550 InstructionCost BVCost =
11551 getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
11552 /*Insert=*/true, /*Extract=*/false, CostKind);
11553 if (!DemandedElts.isAllOnes())
11554 BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: ReorderMask);
11555 if (Cost >= BVCost) {
11556 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
11557 reorderScalars(Scalars&: TE.Scalars, Mask);
11558 TE.ReorderIndices.clear();
11559 }
11560}
11561
11562void BoUpSLP::transformNodes() {
11563 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11564 BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on, and turn it off when done.
  class GraphTransformModeRAII {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAII(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAII() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // Operands are profitable if they are:
  // 1. At least one constant
  // or
  // 2. Splats
  // or
  // 3. A good vectorization opportunity, i.e. they may generate vector
  //    nodes and reduce the cost of the graph (see the example below).
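  // E.g. for the hypothetical instruction pair (x + 1, x + 2) the first
  // operands are the same value and the second operands are constants, so
  // every operand pair passes; for unrelated non-constant operands the
  // look-ahead heuristic (findBestRootPair) decides.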
11583 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
11584 const InstructionsState &S) {
11585 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11586 for (unsigned Op : seq<unsigned>(Size: S.getMainOp()->getNumOperands()))
11587 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
11588 Args: I2->getOperand(i: Op));
11589 return all_of(
11590 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11591 return all_of(Range&: Cand,
11592 P: [](const std::pair<Value *, Value *> &P) {
11593 return isa<Constant>(Val: P.first) ||
11594 isa<Constant>(Val: P.second) || P.first == P.second;
11595 }) ||
11596 findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplatLoads);
11597 });
11598 };
11599
11600 // Try to reorder gather nodes for better vectorization opportunities.
11601 for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) {
11602 TreeEntry &E = *VectorizableTree[Idx];
11603 if (E.isGather())
11604 reorderGatherNode(TE&: E);
11605 }
11606
  // Better to use the full gathered-loads analysis if there are only 2
  // gathered load nodes, each having fewer than 16 elements.
11609 constexpr unsigned VFLimit = 16;
11610 bool ForceLoadGather =
11611 count_if(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
11612 return TE->isGather() && TE->hasState() &&
11613 TE->getOpcode() == Instruction::Load &&
11614 TE->getVectorFactor() < VFLimit;
11615 }) == 2;
11616
  // Checks if the scalars are used in another node.
11618 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
11619 function_ref<bool(Value *)> CheckContainer) {
11620 return TE->isSame(VL) || all_of(Range&: VL, P: [&](Value *V) {
11621 if (isa<PoisonValue>(Val: V))
11622 return true;
11623 auto *I = dyn_cast<Instruction>(Val: V);
11624 if (!I)
11625 return false;
11626 return is_contained(Range: TE->Scalars, Element: I) || CheckContainer(I);
11627 });
11628 };
11629 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
11630 if (E.hasState()) {
11631 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: E.getMainOp());
11632 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
11633 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
11634 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
11635 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
11636 return is_contained(Range&: TEs, Element: TE);
11637 });
11638 });
11639 }))
11640 return true;
11642 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: E.getMainOp());
11643 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
11644 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
11645 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
11646 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
11647 return is_contained(Range&: TEs, Element: TE);
11648 });
11649 });
11650 }))
11651 return true;
11652 } else {
      // Check if the gather node is a full copy of a split node.
11654 auto *It = find_if(Range: E.Scalars, P: IsaPred<Instruction>);
11655 if (It != E.Scalars.end()) {
11656 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: *It);
11657 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
11658 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
11659 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
11660 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
11661 return is_contained(Range&: TEs, Element: TE);
11662 });
11663 });
11664 }))
11665 return true;
11666 }
11667 }
11668 return false;
11669 };
  // The tree may grow here, so iterate over the nodes built before.
11671 for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) {
11672 TreeEntry &E = *VectorizableTree[Idx];
11673 if (E.isGather()) {
11674 ArrayRef<Value *> VL = E.Scalars;
11675 const unsigned Sz = getVectorElementSize(V: VL.front());
11676 unsigned MinVF = getMinVF(Sz: 2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2 elements), for
      // nodes with the same opcode and same parent block, or for all-constant
      // nodes.
11679 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(key: Idx) ||
11680 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            // We use allSameOpcode instead of isAltShuffle because we don't
            // want to use interchangeable instructions here.
11683 !allSameOpcode(VL) || !allSameBlock(VL)) ||
11684 allConstant(VL) || isSplat(VL))
11685 continue;
11686 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
11687 continue;
11688 // Check if the node is a copy of other vector nodes.
11689 if (CheckForSameVectorNodes(E))
11690 continue;
11691 // Try to find vectorizable sequences and transform them into a series of
11692 // insertvector instructions.
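      // Illustrative example: in a 6-element gather where elements 0..3 form
      // a vectorizable slice, that slice is built as its own subtree (via
      // buildTreeRec below) and later combined back into the full vector,
      // while the remaining elements either form a smaller slice or stay
      // gathered.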
11693 unsigned StartIdx = 0;
11694 unsigned End = VL.size();
11695 for (unsigned VF = getFloorFullVectorNumberOfElements(
11696 TTI: *TTI, Ty: VL.front()->getType(), Sz: VL.size() - 1);
11697 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
11698 TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) {
11699 if (StartIdx + VF > End)
11700 continue;
11701 SmallVector<std::pair<unsigned, unsigned>> Slices;
11702 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
11703 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
11704 // If any instruction is vectorized already - do not try again.
11705 // Reuse the existing node, if it fully matches the slice.
11706 if (isVectorized(V: Slice.front()) &&
11707 !getSameValuesTreeEntry(V: Slice.front(), VL: Slice, /*SameVF=*/true))
11708 continue;
          // Constants are already handled effectively - skip.
11710 if (allConstant(VL: Slice))
11711 continue;
          // Do not try to vectorize small splats (smaller than a vector
          // register and with only a single non-undef element).
11714 bool IsSplat = isSplat(VL: Slice);
11715 bool IsTwoRegisterSplat = true;
11716 if (IsSplat && VF == 2) {
11717 unsigned NumRegs2VF = ::getNumberOfParts(
11718 TTI: *TTI, VecTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF: 2 * VF));
11719 IsTwoRegisterSplat = NumRegs2VF == 2;
11720 }
11721 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
11722 count(Range&: Slice, Element: Slice.front()) ==
11723 static_cast<long>(isa<UndefValue>(Val: Slice.front()) ? VF - 1
11724 : 1)) {
11725 if (IsSplat)
11726 continue;
11727 InstructionsState S = getSameOpcode(VL: Slice, TLI: *TLI);
11728 if (!S || !allSameOpcode(VL: Slice) || !allSameBlock(VL: Slice) ||
11729 (S.getOpcode() == Instruction::Load &&
11730 areKnownNonVectorizableLoads(VL: Slice)) ||
11731 (S.getOpcode() != Instruction::Load &&
11732 !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(), Sz: VF)))
11733 continue;
11734 if (VF == 2) {
11735 // Try to vectorize reduced values or if all users are vectorized.
11736 // For expensive instructions extra extracts might be profitable.
11737 if ((!UserIgnoreList || E.Idx != 0) &&
11738 TTI->getInstructionCost(U: S.getMainOp(), CostKind) <
11739 TTI::TCC_Expensive &&
11740 !all_of(Range&: Slice, P: [&](Value *V) {
11741 if (isa<PoisonValue>(Val: V))
11742 return true;
11743 return areAllUsersVectorized(I: cast<Instruction>(Val: V),
11744 VectorizedVals: UserIgnoreList);
11745 }))
11746 continue;
11747 if (S.getOpcode() == Instruction::Load) {
11748 OrdersType Order;
11749 SmallVector<Value *> PointerOps;
11750 LoadsState Res =
11751 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps);
11752 // Do not vectorize gathers.
11753 if (Res == LoadsState::ScatterVectorize ||
11754 Res == LoadsState::Gather) {
11755 if (Res == LoadsState::Gather) {
11756 registerNonVectorizableLoads(VL: Slice);
                  // If vectorizing reductions and the scalars come from the
                  // root node - mark them as non-vectorizable reduction
                  // values.
11759 if (UserIgnoreList && E.Idx == 0)
11760 analyzedReductionVals(VL: Slice);
11761 }
11762 continue;
11763 }
11764 } else if (S.getOpcode() == Instruction::ExtractElement ||
11765 (TTI->getInstructionCost(U: S.getMainOp(), CostKind) <
11766 TTI::TCC_Expensive &&
11767 !CheckOperandsProfitability(
11768 S.getMainOp(),
11769 cast<Instruction>(Val: *find_if(Range: reverse(C&: Slice),
11770 P: IsaPred<Instruction>)),
11771 S))) {
              // Do not vectorize extractelements (handled effectively
              // already). Do not vectorize non-profitable instructions (with
              // low cost and non-vectorizable operands).
11775 continue;
11776 }
11777 }
11778 }
11779 Slices.emplace_back(Args&: Cnt, Args: Slice.size());
11780 }
11781 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
11782 E.CombinedEntriesWithIndices.emplace_back(Args&: Idx, Args&: Cnt);
11783 if (StartIdx == Cnt)
11784 StartIdx = Cnt + Sz;
11785 if (End == Cnt + Sz)
11786 End = Cnt;
11787 };
11788 for (auto [Cnt, Sz] : Slices) {
11789 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: Sz);
11790 const TreeEntry *SameTE = nullptr;
11791 if (const auto *It = find_if(Range&: Slice, P: IsaPred<Instruction>);
11792 It != Slice.end()) {
11793 // If any instruction is vectorized already - do not try again.
11794 SameTE = getSameValuesTreeEntry(V: *It, VL: Slice);
11795 }
11796 unsigned PrevSize = VectorizableTree.size();
11797 [[maybe_unused]] unsigned PrevEntriesSize =
11798 LoadEntriesToVectorize.size();
11799 buildTreeRec(VLRef: Slice, Depth: 0, UserTreeIdx: EdgeInfo(&E, UINT_MAX));
11800 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
11801 VectorizableTree[PrevSize]->isGather() &&
11802 VectorizableTree[PrevSize]->hasState() &&
11803 VectorizableTree[PrevSize]->getOpcode() !=
11804 Instruction::ExtractElement &&
11805 !isSplat(VL: Slice)) {
11806 if (UserIgnoreList && E.Idx == 0 && VF == 2)
11807 analyzedReductionVals(VL: Slice);
11808 VectorizableTree.pop_back();
11809 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
11810 "LoadEntriesToVectorize expected to remain the same");
11811 continue;
11812 }
11813 AddCombinedNode(PrevSize, Cnt, Sz);
11814 }
11815 }
11816 // Restore ordering, if no extra vectorization happened.
11817 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
11818 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
11819 reorderScalars(Scalars&: E.Scalars, Mask);
11820 E.ReorderIndices.clear();
11821 }
11822 }
11823 if (!E.hasState())
11824 continue;
11825 switch (E.getOpcode()) {
11826 case Instruction::Load: {
11827 // No need to reorder masked gather loads, just reorder the scalar
11828 // operands.
11829 if (E.State != TreeEntry::Vectorize)
11830 break;
11831 Type *ScalarTy = E.getMainOp()->getType();
11832 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
11833 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E.Scalars);
      // Check if it is profitable to represent a consecutive load + reverse
      // as a strided load with stride -1.
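      // Illustrative example: loads of a[3], a[2], a[1], a[0] (for some
      // array a) would normally become a consecutive <4 x ...> load plus a
      // reverse shuffle; if the target supports strided accesses and it is
      // cheaper, the node is turned into a single strided load with stride -1
      // instead.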
11836 if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) &&
11837 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
11838 SmallVector<int> Mask;
11839 inversePermutation(Indices: E.ReorderIndices, Mask);
11840 auto *BaseLI = cast<LoadInst>(Val: E.Scalars.back());
11841 InstructionCost OriginalVecCost =
11842 TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: BaseLI->getAlign(),
11843 AddressSpace: BaseLI->getPointerAddressSpace(), CostKind,
11844 OpdInfo: TTI::OperandValueInfo()) +
11845 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
11846 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
11847 Opcode: Instruction::Load, DataTy: VecTy, Ptr: BaseLI->getPointerOperand(),
11848 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: BaseLI);
11849 if (StridedCost < OriginalVecCost)
11850 // Strided load is more profitable than consecutive load + reverse -
11851 // transform the node to strided load.
11852 E.State = TreeEntry::StridedVectorize;
11853 }
11854 break;
11855 }
11856 case Instruction::Store: {
11857 Type *ScalarTy =
11858 cast<StoreInst>(Val: E.getMainOp())->getValueOperand()->getType();
11859 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
11860 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E.Scalars);
      // Check if it is profitable to represent a consecutive store + reverse
      // as a strided store with stride -1.
11863 if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) &&
11864 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
11865 SmallVector<int> Mask;
11866 inversePermutation(Indices: E.ReorderIndices, Mask);
11867 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.back());
11868 InstructionCost OriginalVecCost =
11869 TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
11870 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind,
11871 OpdInfo: TTI::OperandValueInfo()) +
11872 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
11873 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
11874 Opcode: Instruction::Store, DataTy: VecTy, Ptr: BaseSI->getPointerOperand(),
11875 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: BaseSI);
11876 if (StridedCost < OriginalVecCost)
11877 // Strided store is more profitable than reverse + consecutive store -
11878 // transform the node to strided store.
11879 E.State = TreeEntry::StridedVectorize;
11880 } else if (!E.ReorderIndices.empty()) {
11881 // Check for interleaved stores.
11882 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
11883 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.front());
11884 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
11885 if (Mask.size() < 4)
11886 return 0u;
11887 for (unsigned Factor : seq<unsigned>(Begin: 2, End: Mask.size() / 2 + 1)) {
11888 if (ShuffleVectorInst::isInterleaveMask(
11889 Mask, Factor, NumInputElts: VecTy->getElementCount().getFixedValue()) &&
11890 TTI.isLegalInterleavedAccessType(
11891 VTy: VecTy, Factor, Alignment: BaseSI->getAlign(),
11892 AddrSpace: BaseSI->getPointerAddressSpace()))
11893 return Factor;
11894 }
11895
11896 return 0u;
11897 };
11898 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
11899 unsigned InterleaveFactor = IsInterleaveMask(Mask);
11900 if (InterleaveFactor != 0)
11901 E.setInterleave(InterleaveFactor);
11902 }
11903 break;
11904 }
11905 case Instruction::Select: {
11906 if (E.State != TreeEntry::Vectorize)
11907 break;
11908 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: E.Scalars);
11909 if (MinMaxID == Intrinsic::not_intrinsic)
11910 break;
11911 // This node is a minmax node.
11912 E.CombinedOp = TreeEntry::MinMax;
11913 TreeEntry *CondEntry = getOperandEntry(E: &E, Idx: 0);
11914 if (SelectOnly && CondEntry->UserTreeIndex &&
11915 CondEntry->State == TreeEntry::Vectorize) {
11916 // The condition node is part of the combined minmax node.
11917 CondEntry->State = TreeEntry::CombinedVectorize;
11918 }
11919 break;
11920 }
11921 default:
11922 break;
11923 }
11924 }
11925
11926 if (LoadEntriesToVectorize.empty()) {
11927 // Single load node - exit.
11928 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
11929 VectorizableTree.front()->getOpcode() == Instruction::Load)
11930 return;
11931 // Small graph with small VF - exit.
11932 constexpr unsigned SmallTree = 3;
11933 constexpr unsigned SmallVF = 2;
11934 if ((VectorizableTree.size() <= SmallTree &&
11935 VectorizableTree.front()->Scalars.size() == SmallVF) ||
11936 (VectorizableTree.size() <= 2 && UserIgnoreList))
11937 return;
11938
11939 if (VectorizableTree.front()->isNonPowOf2Vec() &&
11940 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
11941 getCanonicalGraphSize() <= SmallTree &&
11942 count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()),
11943 P: [](const std::unique_ptr<TreeEntry> &TE) {
11944 return TE->isGather() && TE->hasState() &&
11945 TE->getOpcode() == Instruction::Load &&
11946 !allSameBlock(VL: TE->Scalars);
11947 }) == 1)
11948 return;
11949 }
11950
11951 // A list of loads to be gathered during the vectorization process. We can
11952 // try to vectorize them at the end, if profitable.
11953 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
11954 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
11955 GatheredLoads;
11956
11957 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11958 TreeEntry &E = *TE;
11959 if (E.isGather() &&
11960 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
11961 (!E.hasState() && any_of(Range&: E.Scalars,
11962 P: [&](Value *V) {
11963 return isa<LoadInst>(Val: V) &&
11964 !isVectorized(V) &&
11965 !isDeleted(I: cast<Instruction>(Val: V));
11966 }))) &&
11967 !isSplat(VL: E.Scalars)) {
11968 for (Value *V : E.Scalars) {
11969 auto *LI = dyn_cast<LoadInst>(Val: V);
11970 if (!LI)
11971 continue;
11972 if (isDeleted(I: LI) || isVectorized(V: LI) || !LI->isSimple())
11973 continue;
11974 gatherPossiblyVectorizableLoads(
11975 R: *this, VL: V, DL: *DL, SE&: *SE, TTI: *TTI,
11976 GatheredLoads&: GatheredLoads[std::make_tuple(
11977 args: LI->getParent(),
11978 args: getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth),
11979 args: LI->getType())]);
11980 }
11981 }
11982 }
11983 // Try to vectorize gathered loads if this is not just a gather of loads.
11984 if (!GatheredLoads.empty())
11985 tryToVectorizeGatheredLoads(GatheredLoads);
11986}
11987
/// Merges shuffle masks and emits the final shuffle instruction, if required.
/// It supports shuffling of 2 input vectors. It implements lazy shuffle
/// emission: the actual shuffle instruction is generated only if it is really
/// required. Otherwise, the shuffle instruction emission is delayed until the
/// end of the process, to reduce the number of emitted instructions and to
/// simplify further analysis/transformations.
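/// Rough flow, as implemented below: the source vectors/tree entries are
/// accumulated in InVectors and their combined permutation in CommonMask; the
/// (virtual) shuffle is only costed via createShuffle when a non-matching node
/// arrives or when the estimation is finalized.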
11994class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
11995 bool IsFinalized = false;
11996 SmallVector<int> CommonMask;
11997 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
11998 const TargetTransformInfo &TTI;
11999 InstructionCost Cost = 0;
12000 SmallDenseSet<Value *> VectorizedVals;
12001 BoUpSLP &R;
12002 SmallPtrSetImpl<Value *> &CheckedExtracts;
12003 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, we are still trying to estimate the cost for the same nodes
  /// and can delay the actual cost estimation (virtual shuffle instruction
  /// emission). This may help to better estimate the cost if the same nodes
  /// must be permuted and allows moving most of the long shuffle cost
  /// estimation to TTI.
12008 bool SameNodesEstimated = true;
12009
12010 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
12011 if (Ty->getScalarType()->isPointerTy()) {
12012 Constant *Res = ConstantExpr::getIntToPtr(
12013 C: ConstantInt::getAllOnesValue(
12014 Ty: IntegerType::get(C&: Ty->getContext(),
12015 NumBits: DL.getTypeStoreSizeInBits(Ty: Ty->getScalarType()))),
12016 Ty: Ty->getScalarType());
12017 if (auto *VTy = dyn_cast<VectorType>(Val: Ty))
12018 Res = ConstantVector::getSplat(EC: VTy->getElementCount(), Elt: Res);
12019 return Res;
12020 }
12021 return Constant::getAllOnesValue(Ty);
12022 }
12023
12024 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
12025 if ((!Root && allConstant(VL)) || all_of(Range&: VL, P: IsaPred<UndefValue>))
12026 return TTI::TCC_Free;
12027 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
12028 InstructionCost GatherCost = 0;
12029 SmallVector<Value *> Gathers(VL);
12030 if (!Root && isSplat(VL)) {
      // Found a broadcast of a single scalar - calculate the cost as
      // the broadcast.
12033 const auto *It = find_if_not(Range&: VL, P: IsaPred<UndefValue>);
12034 assert(It != VL.end() && "Expected at least one non-undef value.");
12035 // Add broadcast for non-identity shuffle only.
12036 bool NeedShuffle =
12037 count(Range&: VL, Element: *It) > 1 &&
12038 (VL.front() != *It || !all_of(Range: VL.drop_front(), P: IsaPred<UndefValue>));
12039 if (!NeedShuffle) {
12040 if (isa<FixedVectorType>(Val: ScalarTy)) {
12041 assert(SLPReVec && "FixedVectorType is not expected.");
12042 return TTI.getShuffleCost(
12043 Kind: TTI::SK_InsertSubvector, DstTy: VecTy, SrcTy: VecTy, Mask: {}, CostKind,
12044 Index: std::distance(first: VL.begin(), last: It) * getNumElements(Ty: ScalarTy),
12045 SubTp: cast<FixedVectorType>(Val: ScalarTy));
12046 }
12047 return TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy,
12048 CostKind, Index: std::distance(first: VL.begin(), last: It),
12049 Op0: PoisonValue::get(T: VecTy), Op1: *It);
12050 }
12051
12052 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
12053 transform(Range&: VL, d_first: ShuffleMask.begin(), F: [](Value *V) {
12054 return isa<PoisonValue>(Val: V) ? PoisonMaskElem : 0;
12055 });
12056 InstructionCost InsertCost =
12057 TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: 0,
12058 Op0: PoisonValue::get(T: VecTy), Op1: *It);
12059 return InsertCost + ::getShuffleCost(TTI,
12060 Kind: TargetTransformInfo::SK_Broadcast,
12061 Tp: VecTy, Mask: ShuffleMask, CostKind,
12062 /*Index=*/0, /*SubTp=*/nullptr,
12063 /*Args=*/*It);
12064 }
12065 return GatherCost +
12066 (all_of(Range&: Gathers, P: IsaPred<UndefValue>)
12067 ? TTI::TCC_Free
12068 : R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root && VL.equals(RHS: Gathers),
12069 ScalarTy));
12070 };
12071
12072 /// Compute the cost of creating a vector containing the extracted values from
12073 /// \p VL.
12074 InstructionCost
12075 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
12076 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
12077 unsigned NumParts) {
12078 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
12079 unsigned NumElts =
12080 std::accumulate(first: VL.begin(), last: VL.end(), init: 0, binary_op: [](unsigned Sz, Value *V) {
12081 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
12082 if (!EE)
12083 return Sz;
12084 auto *VecTy = dyn_cast<FixedVectorType>(Val: EE->getVectorOperandType());
12085 if (!VecTy)
12086 return Sz;
12087 return std::max(a: Sz, b: VecTy->getNumElements());
12088 });
12089 // FIXME: this must be moved to TTI for better estimation.
12090 unsigned EltsPerVector = getPartNumElems(Size: VL.size(), NumParts);
12091 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
12092 SmallVectorImpl<unsigned> &Indices,
12093 SmallVectorImpl<unsigned> &SubVecSizes)
12094 -> std::optional<TTI::ShuffleKind> {
12095 if (NumElts <= EltsPerVector)
12096 return std::nullopt;
12097 int OffsetReg0 =
12098 alignDown(Value: std::accumulate(first: Mask.begin(), last: Mask.end(), INT_MAX,
12099 binary_op: [](int S, int I) {
12100 if (I == PoisonMaskElem)
12101 return S;
12102 return std::min(a: S, b: I);
12103 }),
12104 Align: EltsPerVector);
12105 int OffsetReg1 = OffsetReg0;
12106 DenseSet<int> RegIndices;
      // Check whether we are trying to permute the same single or two input
      // vectors.
12108 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
12109 int FirstRegId = -1;
12110 Indices.assign(NumElts: 1, Elt: OffsetReg0);
12111 for (auto [Pos, I] : enumerate(First&: Mask)) {
12112 if (I == PoisonMaskElem)
12113 continue;
12114 int Idx = I - OffsetReg0;
12115 int RegId =
12116 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
12117 if (FirstRegId < 0)
12118 FirstRegId = RegId;
12119 RegIndices.insert(V: RegId);
12120 if (RegIndices.size() > 2)
12121 return std::nullopt;
12122 if (RegIndices.size() == 2) {
12123 ShuffleKind = TTI::SK_PermuteTwoSrc;
12124 if (Indices.size() == 1) {
12125 OffsetReg1 = alignDown(
12126 Value: std::accumulate(
12127 first: std::next(x: Mask.begin(), n: Pos), last: Mask.end(), INT_MAX,
12128 binary_op: [&](int S, int I) {
12129 if (I == PoisonMaskElem)
12130 return S;
12131 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
12132 ((I - OffsetReg0) % NumElts) / EltsPerVector;
12133 if (RegId == FirstRegId)
12134 return S;
12135 return std::min(a: S, b: I);
12136 }),
12137 Align: EltsPerVector);
12138 unsigned Index = OffsetReg1 % NumElts;
12139 Indices.push_back(Elt: Index);
12140 SubVecSizes.push_back(Elt: std::min(a: NumElts - Index, b: EltsPerVector));
12141 }
12142 Idx = I - OffsetReg1;
12143 }
12144 I = (Idx % NumElts) % EltsPerVector +
12145 (RegId == FirstRegId ? 0 : EltsPerVector);
12146 }
12147 return ShuffleKind;
12148 };
12149 InstructionCost Cost = 0;
12150
12151 // Process extracts in blocks of EltsPerVector to check if the source vector
12152 // operand can be re-used directly. If not, add the cost of creating a
12153 // shuffle to extract the values into a vector register.
12154 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
12155 if (!ShuffleKinds[Part])
12156 continue;
12157 ArrayRef<int> MaskSlice = Mask.slice(
12158 N: Part * EltsPerVector, M: getNumElems(Size: Mask.size(), PartNumElems: EltsPerVector, Part));
12159 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
12160 copy(Range&: MaskSlice, Out: SubMask.begin());
12161 SmallVector<unsigned, 2> Indices;
12162 SmallVector<unsigned, 2> SubVecSizes;
12163 std::optional<TTI::ShuffleKind> RegShuffleKind =
12164 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
12165 if (!RegShuffleKind) {
12166 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
12167 !ShuffleVectorInst::isIdentityMask(
12168 Mask: MaskSlice, NumSrcElts: std::max<unsigned>(a: NumElts, b: MaskSlice.size())))
12169 Cost +=
12170 ::getShuffleCost(TTI, Kind: *ShuffleKinds[Part],
12171 Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: MaskSlice);
12172 continue;
12173 }
12174 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
12175 !ShuffleVectorInst::isIdentityMask(Mask: SubMask, NumSrcElts: EltsPerVector)) {
12176 Cost +=
12177 ::getShuffleCost(TTI, Kind: *RegShuffleKind,
12178 Tp: getWidenedType(ScalarTy, VF: EltsPerVector), Mask: SubMask);
12179 }
12180 const unsigned BaseVF = getFullVectorNumberOfElements(
12181 TTI: *R.TTI, Ty: VL.front()->getType(), Sz: alignTo(Value: NumElts, Align: EltsPerVector));
12182 for (const auto [Idx, SubVecSize] : zip(t&: Indices, u&: SubVecSizes)) {
12183 assert((Idx + SubVecSize) <= BaseVF &&
12184 "SK_ExtractSubvector index out of range");
12185 Cost += ::getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector,
12186 Tp: getWidenedType(ScalarTy, VF: BaseVF), Mask: {}, CostKind,
12187 Index: Idx, SubTp: getWidenedType(ScalarTy, VF: SubVecSize));
12188 }
      // Second attempt: check if just a permute is estimated to be cheaper
      // than the subvector extract.
12191 SubMask.assign(NumElts, Elt: PoisonMaskElem);
12192 copy(Range&: MaskSlice, Out: SubMask.begin());
12193 InstructionCost OriginalCost = ::getShuffleCost(
12194 TTI, Kind: *ShuffleKinds[Part], Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: SubMask);
12195 if (OriginalCost < Cost)
12196 Cost = OriginalCost;
12197 }
12198 return Cost;
12199 }
  /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the
  /// given mask \p Mask and register number \p Part, which includes
  /// \p SliceSize elements.
12203 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
12204 ArrayRef<int> Mask, unsigned Part,
12205 unsigned SliceSize) {
12206 if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are being reshuffled.
      // If we already requested the cost of reshuffling E1 and E2 before,
      // there is no need to estimate another cost with the sub-Mask; instead,
      // include this sub-Mask into the CommonMask to estimate it later and
      // avoid double cost estimation.
12212 if ((InVectors.size() == 2 &&
12213 cast<const TreeEntry *>(Val&: InVectors.front()) == &E1 &&
12214 cast<const TreeEntry *>(Val&: InVectors.back()) == E2) ||
12215 (!E2 && cast<const TreeEntry *>(Val&: InVectors.front()) == &E1)) {
12216 unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part);
12217 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
12218 [](int Idx) { return Idx == PoisonMaskElem; }) &&
12219 "Expected all poisoned elements.");
12220 ArrayRef<int> SubMask = ArrayRef(Mask).slice(N: Part * SliceSize, M: Limit);
12221 copy(Range&: SubMask, Out: std::next(x: CommonMask.begin(), n: SliceSize * Part));
12222 return;
12223 }
12224 // Found non-matching nodes - need to estimate the cost for the matched
12225 // and transform mask.
12226 Cost += createShuffle(P1: InVectors.front(),
12227 P2: InVectors.size() == 1 ? nullptr : InVectors.back(),
12228 Mask: CommonMask);
12229 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12230 } else if (InVectors.size() == 2) {
12231 Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
12232 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12233 }
12234 SameNodesEstimated = false;
12235 if (!E2 && InVectors.size() == 1) {
12236 unsigned VF = E1.getVectorFactor();
12237 if (Value *V1 = dyn_cast<Value *>(Val&: InVectors.front())) {
12238 VF = std::max(a: VF, b: getVF(V: V1));
12239 } else {
12240 const auto *E = cast<const TreeEntry *>(Val&: InVectors.front());
12241 VF = std::max(a: VF, b: E->getVectorFactor());
12242 }
12243 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12244 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
12245 CommonMask[Idx] = Mask[Idx] + VF;
12246 Cost += createShuffle(P1: InVectors.front(), P2: &E1, Mask: CommonMask);
12247 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12248 } else {
12249 auto P = InVectors.front();
12250 Cost += createShuffle(P1: &E1, P2: E2, Mask);
12251 unsigned VF = Mask.size();
12252 if (Value *V1 = dyn_cast<Value *>(Val&: P)) {
12253 VF = std::max(a: VF,
12254 b: getNumElements(Ty: V1->getType()));
12255 } else {
12256 const auto *E = cast<const TreeEntry *>(Val&: P);
12257 VF = std::max(a: VF, b: E->getVectorFactor());
12258 }
12259 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12260 if (Mask[Idx] != PoisonMaskElem)
12261 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
12262 Cost += createShuffle(P1: P, P2: InVectors.front(), Mask: CommonMask);
12263 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12264 }
12265 }
12266
12267 class ShuffleCostBuilder {
12268 const TargetTransformInfo &TTI;
12269
12270 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
12271 int Index = -1;
12272 return Mask.empty() ||
12273 (VF == Mask.size() &&
12274 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) ||
12275 (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
12276 Index == 0);
12277 }
12278
12279 public:
12280 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
12281 ~ShuffleCostBuilder() = default;
12282 InstructionCost createShuffleVector(Value *V1, Value *,
12283 ArrayRef<int> Mask) const {
12284 // Empty mask or identity mask are free.
12285 unsigned VF =
12286 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
12287 if (isEmptyOrIdentity(Mask, VF))
12288 return TTI::TCC_Free;
12289 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
12290 Tp: cast<VectorType>(Val: V1->getType()), Mask);
12291 }
12292 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
12293 // Empty mask or identity mask are free.
12294 unsigned VF =
12295 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
12296 if (isEmptyOrIdentity(Mask, VF))
12297 return TTI::TCC_Free;
12298 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc,
12299 Tp: cast<VectorType>(Val: V1->getType()), Mask);
12300 }
12301 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
12302 InstructionCost createPoison(Type *Ty, unsigned VF) const {
12303 return TTI::TCC_Free;
12304 }
12305 void resizeToMatch(Value *&, Value *&) const {}
12306 };
12307
  /// Smart shuffle instruction emission: walks through shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
12311 InstructionCost
12312 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
12313 const PointerUnion<Value *, const TreeEntry *> &P2,
12314 ArrayRef<int> Mask) {
12315 ShuffleCostBuilder Builder(TTI);
12316 SmallVector<int> CommonMask(Mask);
12317 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
12318 unsigned CommonVF = Mask.size();
12319 InstructionCost ExtraCost = 0;
12320 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
12321 unsigned VF) -> InstructionCost {
12322 if (E.isGather() && allConstant(VL: E.Scalars))
12323 return TTI::TCC_Free;
12324 Type *EScalarTy = E.Scalars.front()->getType();
12325 bool IsSigned = true;
12326 if (auto It = R.MinBWs.find(Val: &E); It != R.MinBWs.end()) {
12327 EScalarTy = IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
12328 IsSigned = It->second.second;
12329 }
12330 if (EScalarTy != ScalarTy) {
12331 unsigned CastOpcode = Instruction::Trunc;
12332 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
12333 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
12334 if (DstSz > SrcSz)
12335 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
12336 return TTI.getCastInstrCost(Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF),
12337 Src: getWidenedType(ScalarTy: EScalarTy, VF),
12338 CCH: TTI::CastContextHint::None, CostKind);
12339 }
12340 return TTI::TCC_Free;
12341 };
12342 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
12343 if (isa<Constant>(Val: V))
12344 return TTI::TCC_Free;
12345 auto *VecTy = cast<VectorType>(Val: V->getType());
12346 Type *EScalarTy = VecTy->getElementType();
12347 if (EScalarTy != ScalarTy) {
12348 bool IsSigned = !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL));
12349 unsigned CastOpcode = Instruction::Trunc;
12350 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
12351 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
12352 if (DstSz > SrcSz)
12353 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
12354 return TTI.getCastInstrCost(
12355 Opcode: CastOpcode, Dst: VectorType::get(ElementType: ScalarTy, EC: VecTy->getElementCount()),
12356 Src: VecTy, CCH: TTI::CastContextHint::None, CostKind);
12357 }
12358 return TTI::TCC_Free;
12359 };
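// Illustrative example (assumed bit widths): if a node was demoted to i16 in
// MinBWs while ScalarTy is i32, the lambdas above charge a sext/zext from
// <VF x i16> to <VF x i32> (or a trunc in the opposite direction); matching
// types, or all-constant gather nodes, add no extra cost.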
12360 if (!V1 && !V2 && !P2.isNull()) {
12361 // Shuffle 2 entry nodes.
12362 const TreeEntry *E = cast<const TreeEntry *>(Val: P1);
12363 unsigned VF = E->getVectorFactor();
12364 const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2);
12365 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
12366 assert(all_of(Mask,
12367 [=](int Idx) {
12368 return Idx < 2 * static_cast<int>(CommonVF);
12369 }) &&
12370 "All elements in mask must be less than 2 * CommonVF.");
12371 if (E->Scalars.size() == E2->Scalars.size()) {
12372 SmallVector<int> EMask = E->getCommonMask();
12373 SmallVector<int> E2Mask = E2->getCommonMask();
12374 if (!EMask.empty() || !E2Mask.empty()) {
12375 for (int &Idx : CommonMask) {
12376 if (Idx == PoisonMaskElem)
12377 continue;
12378 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
12379 Idx = EMask[Idx];
12380 else if (Idx >= static_cast<int>(CommonVF))
12381 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
12382 E->Scalars.size();
12383 }
12384 }
12385 CommonVF = E->Scalars.size();
12386 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
12387 GetNodeMinBWAffectedCost(*E2, CommonVF);
12388 } else {
12389 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
12390 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
12391 }
12392 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
12393 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
12394 } else if (!V1 && P2.isNull()) {
12395 // Shuffle single entry node.
12396 const TreeEntry *E = cast<const TreeEntry *>(Val: P1);
12397 unsigned VF = E->getVectorFactor();
12398 CommonVF = VF;
12399 assert(
12400 all_of(Mask,
12401 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
12402 "All elements in mask must be less than CommonVF.");
12403 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
12404 SmallVector<int> EMask = E->getCommonMask();
12405 assert(!EMask.empty() && "Expected non-empty common mask.");
12406 for (int &Idx : CommonMask) {
12407 if (Idx != PoisonMaskElem)
12408 Idx = EMask[Idx];
12409 }
12410 CommonVF = E->Scalars.size();
12411 } else if (unsigned Factor = E->getInterleaveFactor();
12412 Factor > 0 && E->Scalars.size() != Mask.size() &&
12413 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask: CommonMask,
12414 Factor)) {
12415 // Deinterleaved nodes are free.
12416 std::iota(first: CommonMask.begin(), last: CommonMask.end(), value: 0);
12417 }
12418 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
12419 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
12420 // Not identity/broadcast? Try to see if the original vector is better.
12421 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
12422 CommonVF == CommonMask.size() &&
12423 any_of(Range: enumerate(First&: CommonMask),
12424 P: [](const auto &&P) {
12425 return P.value() != PoisonMaskElem &&
12426 static_cast<unsigned>(P.value()) != P.index();
12427 }) &&
12428 any_of(Range&: CommonMask,
12429 P: [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
12430 SmallVector<int> ReorderMask;
12431 inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
12432 ::addMask(Mask&: CommonMask, SubMask: ReorderMask);
12433 }
12434 } else if (V1 && P2.isNull()) {
12435 // Shuffle single vector.
12436 ExtraCost += GetValueMinBWAffectedCost(V1);
12437 CommonVF = getVF(V: V1);
12438 assert(
12439 all_of(Mask,
12440 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
12441 "All elements in mask must be less than CommonVF.");
12442 } else if (V1 && !V2) {
12443 // Shuffle vector and tree node.
12444 unsigned VF = getVF(V: V1);
12445 const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2);
12446 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
12447 assert(all_of(Mask,
12448 [=](int Idx) {
12449 return Idx < 2 * static_cast<int>(CommonVF);
12450 }) &&
12451 "All elements in mask must be less than 2 * CommonVF.");
12452 if (E2->Scalars.size() == VF && VF != CommonVF) {
12453 SmallVector<int> E2Mask = E2->getCommonMask();
12454 assert(!E2Mask.empty() && "Expected non-empty common mask.");
12455 for (int &Idx : CommonMask) {
12456 if (Idx == PoisonMaskElem)
12457 continue;
12458 if (Idx >= static_cast<int>(CommonVF))
12459 Idx = E2Mask[Idx - CommonVF] + VF;
12460 }
12461 CommonVF = VF;
12462 }
12463 ExtraCost += GetValueMinBWAffectedCost(V1);
12464 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
12465 ExtraCost += GetNodeMinBWAffectedCost(
12466 *E2, std::min(a: CommonVF, b: E2->getVectorFactor()));
12467 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
12468 } else if (!V1 && V2) {
12469 // Shuffle vector and tree node.
12470 unsigned VF = getVF(V: V2);
12471 const TreeEntry *E1 = cast<const TreeEntry *>(Val: P1);
12472 CommonVF = std::max(a: VF, b: E1->getVectorFactor());
12473 assert(all_of(Mask,
12474 [=](int Idx) {
12475 return Idx < 2 * static_cast<int>(CommonVF);
12476 }) &&
12477 "All elements in mask must be less than 2 * CommonVF.");
12478 if (E1->Scalars.size() == VF && VF != CommonVF) {
12479 SmallVector<int> E1Mask = E1->getCommonMask();
12480 assert(!E1Mask.empty() && "Expected non-empty common mask.");
12481 for (int &Idx : CommonMask) {
12482 if (Idx == PoisonMaskElem)
12483 continue;
12484 if (Idx >= static_cast<int>(CommonVF))
12485 Idx = E1Mask[Idx - CommonVF] + VF;
12486 else
12487 Idx = E1Mask[Idx];
12488 }
12489 CommonVF = VF;
12490 }
12491 ExtraCost += GetNodeMinBWAffectedCost(
12492 *E1, std::min(a: CommonVF, b: E1->getVectorFactor()));
12493 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
12494 ExtraCost += GetValueMinBWAffectedCost(V2);
12495 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
12496 } else {
12497 assert(V1 && V2 && "Expected both vectors.");
12498 unsigned VF = getVF(V: V1);
12499 CommonVF = std::max(a: VF, b: getVF(V: V2));
12500 assert(all_of(Mask,
12501 [=](int Idx) {
12502 return Idx < 2 * static_cast<int>(CommonVF);
12503 }) &&
12504 "All elements in mask must be less than 2 * CommonVF.");
12505 ExtraCost +=
12506 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
12507 if (V1->getType() != V2->getType()) {
12508 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
12509 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
12510 } else {
12511 if (cast<VectorType>(Val: V1->getType())->getElementType() != ScalarTy)
12512 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
12513 if (cast<VectorType>(Val: V2->getType())->getElementType() != ScalarTy)
12514 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
12515 }
12516 }
12517 InVectors.front() =
12518 Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
12519 if (InVectors.size() == 2)
12520 InVectors.pop_back();
12521 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
12522 V1, V2, Mask: CommonMask, Builder, ScalarTy);
12523 }
12524
12525public:
12526 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
12527 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
12528 SmallPtrSetImpl<Value *> &CheckedExtracts)
12529 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
12530 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
12531 CheckedExtracts(CheckedExtracts) {}
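/// Illustrative note for adjustExtracts below (simplified, %v is a
/// hypothetical source vector): for a gather of
/// {extractelement %v, 0, extractelement %v, 1}, extracts that become dead
/// after vectorization get their extraction cost credited back, and the
/// gather itself may then be modeled as a shuffle of %v rather than as a
/// rebuilt vector.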
12532 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
12533 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
12534 unsigned NumParts, bool &UseVecBaseAsInput) {
12535 UseVecBaseAsInput = false;
12536 if (Mask.empty())
12537 return nullptr;
12538 Value *VecBase = nullptr;
12539 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
12540 if (!E->ReorderIndices.empty()) {
12541 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
12542 E->ReorderIndices.end());
12543 reorderScalars(Scalars&: VL, Mask: ReorderMask);
12544 }
12545 // Check if this can be considered reused, i.e. the same extractelements
12546 // were already vectorized.
12547 bool PrevNodeFound = any_of(
12548 Range: ArrayRef(R.VectorizableTree).take_front(N: E->Idx),
12549 P: [&](const std::unique_ptr<TreeEntry> &TE) {
12550 return ((TE->hasState() && !TE->isAltShuffle() &&
12551 TE->getOpcode() == Instruction::ExtractElement) ||
12552 TE->isGather()) &&
12553 all_of(Range: enumerate(First&: TE->Scalars), P: [&](auto &&Data) {
12554 return VL.size() > Data.index() &&
12555 (Mask[Data.index()] == PoisonMaskElem ||
12556 isa<UndefValue>(VL[Data.index()]) ||
12557 Data.value() == VL[Data.index()]);
12558 });
12559 });
12560 SmallPtrSet<Value *, 4> UniqueBases;
12561 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
12562 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
12563 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
12564 unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
12565 ArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
12566 for (auto [I, V] :
12567 enumerate(First: ArrayRef(VL).slice(N: Part * SliceSize, M: Limit))) {
12568 // Ignore non-extractelement scalars.
12569 if (isa<UndefValue>(Val: V) ||
12570 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
12571 continue;
12572 // If all users of the instruction are going to be vectorized and this
12573 // instruction itself is not going to be vectorized, consider this
12574 // instruction as dead and remove its cost from the final cost of the
12575 // vectorized tree.
12576 // Also, avoid adjusting the cost for extractelements with multiple uses
12577 // in different graph entries.
12578 auto *EE = cast<ExtractElementInst>(Val: V);
12579 VecBase = EE->getVectorOperand();
12580 UniqueBases.insert(Ptr: VecBase);
12581 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
12582 if (!CheckedExtracts.insert(Ptr: V).second ||
12583 !R.areAllUsersVectorized(I: cast<Instruction>(Val: V), VectorizedVals: &VectorizedVals) ||
12584 any_of(Range: EE->users(),
12585 P: [&](User *U) {
12586 return isa<GetElementPtrInst>(Val: U) &&
12587 !R.areAllUsersVectorized(I: cast<Instruction>(Val: U),
12588 VectorizedVals: &VectorizedVals);
12589 }) ||
12590 (!VEs.empty() && !is_contained(Range&: VEs, Element: E)))
12591 continue;
12592 std::optional<unsigned> EEIdx = getExtractIndex(E: EE);
12593 if (!EEIdx)
12594 continue;
12595 unsigned Idx = *EEIdx;
12596 // Take credit for the instruction that will become dead.
12597 if (EE->hasOneUse() || !PrevNodeFound) {
12598 Instruction *Ext = EE->user_back();
12599 if (isa<SExtInst, ZExtInst>(Val: Ext) &&
12600 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
12601 // Use getExtractWithExtendCost() to calculate the cost of
12602 // extractelement/ext pair.
12603 Cost -= TTI.getExtractWithExtendCost(
12604 Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: EE->getVectorOperandType(),
12605 Index: Idx, CostKind);
12606 // Add back the cost of s|zext which is subtracted separately.
12607 Cost += TTI.getCastInstrCost(
12608 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: EE->getType(),
12609 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
12610 continue;
12611 }
12612 }
12613 APInt &DemandedElts =
12614 VectorOpsToExtracts
12615 .try_emplace(Key: VecBase,
12616 Args: APInt::getZero(numBits: getNumElements(Ty: VecBase->getType())))
12617 .first->getSecond();
12618 DemandedElts.setBit(Idx);
12619 }
12620 }
12621 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
12622 Cost -= TTI.getScalarizationOverhead(Ty: cast<VectorType>(Val: Vec->getType()),
12623 DemandedElts, /*Insert=*/false,
12624 /*Extract=*/true, CostKind);
12625 // Check that the gather of extractelements can be represented as just
12626 // a shuffle of one or two vectors from which the scalars are extracted.
12627 // Found a bunch of extractelement instructions that must be gathered
12628 // into a vector and can be represented as a permutation of elements of
12629 // a single input vector or of 2 input vectors.
12630 // Skipped for reuse if the same extractelements were already vectorized.
12631 if (!PrevNodeFound)
12632 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
12633 InVectors.assign(NumElts: 1, Elt: E);
12634 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
12635 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12636 SameNodesEstimated = false;
12637 if (NumParts != 1 && UniqueBases.size() != 1) {
12638 UseVecBaseAsInput = true;
12639 VecBase =
12640 Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
12641 }
12642 return VecBase;
12643 }
12644 /// Checks if the specified entry \p E needs to be delayed because of its
12645 /// dependency nodes.
12646 std::optional<InstructionCost>
12647 needToDelay(const TreeEntry *,
12648 ArrayRef<SmallVector<const TreeEntry *>>) const {
12649 // No need to delay the cost estimation during analysis.
12650 return std::nullopt;
12651 }
12652 /// Reset the builder to handle a perfect diamond match.
12653 void resetForSameNode() {
12654 IsFinalized = false;
12655 CommonMask.clear();
12656 InVectors.clear();
12657 Cost = 0;
12658 VectorizedVals.clear();
12659 SameNodesEstimated = true;
12660 }
12661 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
12662 if (&E1 == &E2) {
12663 assert(all_of(Mask,
12664 [&](int Idx) {
12665 return Idx < static_cast<int>(E1.getVectorFactor());
12666 }) &&
12667 "Expected single vector shuffle mask.");
12668 add(E1, Mask);
12669 return;
12670 }
12671 if (InVectors.empty()) {
12672 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
12673 InVectors.assign(IL: {&E1, &E2});
12674 return;
12675 }
12676 assert(!CommonMask.empty() && "Expected non-empty common mask.");
12677 auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
12678 unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size());
12679 unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
12680 const auto *It =
12681 find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; });
12682 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
12683 estimateNodesPermuteCost(E1, E2: &E2, Mask, Part, SliceSize);
12684 }
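// Illustrative example for the part selection in the add overloads (assumed
// register shape): with a 16-element mask split into NumParts == 2 parts,
// SliceSize == 8; if the first non-poison mask element sits at position 11,
// the shuffle is attributed to Part == 1 of the mask.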
12685 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
12686 if (InVectors.empty()) {
12687 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
12688 InVectors.assign(NumElts: 1, Elt: &E1);
12689 return;
12690 }
12691 assert(!CommonMask.empty() && "Expected non-empty common mask.");
12692 auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
12693 unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size());
12694 unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
12695 const auto *It =
12696 find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; });
12697 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
12698 estimateNodesPermuteCost(E1, E2: nullptr, Mask, Part, SliceSize);
12699 if (!SameNodesEstimated && InVectors.size() == 1)
12700 InVectors.emplace_back(Args: &E1);
12701 }
12702 /// Adds 2 input vectors and the mask for their shuffling.
12703 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
12704 // This may be reached only when shuffling 2 vectors of extractelements,
12705 // which were already handled in adjustExtracts.
12706 assert(InVectors.size() == 1 &&
12707 all_of(enumerate(CommonMask),
12708 [&](auto P) {
12709 if (P.value() == PoisonMaskElem)
12710 return Mask[P.index()] == PoisonMaskElem;
12711 auto *EI = cast<ExtractElementInst>(
12712 cast<const TreeEntry *>(InVectors.front())
12713 ->getOrdered(P.index()));
12714 return EI->getVectorOperand() == V1 ||
12715 EI->getVectorOperand() == V2;
12716 }) &&
12717 "Expected extractelement vectors.");
12718 }
12719 /// Adds one more input vector and the mask for the shuffling.
12720 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
12721 if (InVectors.empty()) {
12722 assert(CommonMask.empty() && !ForExtracts &&
12723 "Expected empty input mask/vectors.");
12724 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
12725 InVectors.assign(NumElts: 1, Elt: V1);
12726 return;
12727 }
12728 if (ForExtracts) {
12729 // No need to add vectors here; they were already handled in adjustExtracts.
12730 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
12731 !CommonMask.empty() &&
12732 all_of(enumerate(CommonMask),
12733 [&](auto P) {
12734 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
12735 ->getOrdered(P.index());
12736 if (P.value() == PoisonMaskElem)
12737 return P.value() == Mask[P.index()] ||
12738 isa<UndefValue>(Scalar);
12739 if (isa<Constant>(V1))
12740 return true;
12741 auto *EI = cast<ExtractElementInst>(Scalar);
12742 return EI->getVectorOperand() == V1;
12743 }) &&
12744 "Expected only tree entry for extractelement vectors.");
12745 return;
12746 }
12747 assert(!InVectors.empty() && !CommonMask.empty() &&
12748 "Expected only tree entries from extracts/reused buildvectors.");
12749 unsigned VF = getVF(V: V1);
12750 if (InVectors.size() == 2) {
12751 Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
12752 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12753 VF = std::max<unsigned>(a: VF, b: CommonMask.size());
12754 } else if (const auto *InTE =
12755 InVectors.front().dyn_cast<const TreeEntry *>()) {
12756 VF = std::max(a: VF, b: InTE->getVectorFactor());
12757 } else {
12758 VF = std::max(
12759 a: VF, b: cast<FixedVectorType>(Val: cast<Value *>(Val&: InVectors.front())->getType())
12760 ->getNumElements());
12761 }
12762 InVectors.push_back(Elt: V1);
12763 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12764 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
12765 CommonMask[Idx] = Mask[Idx] + VF;
12766 }
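// Illustrative example for the add(V1, Mask) overload above (assumed sizes):
// if the already accumulated input has VF == 8 and the new vector contributes
// lanes {0, 1}, those entries become {8, 9} in CommonMask, i.e. they are
// rebased past the first input so a two-source shuffle can address both.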
12767 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
12768 Value *Root = nullptr) {
12769 Cost += getBuildVectorCost(VL, Root);
12770 if (!Root) {
12771 // FIXME: Need to find a way to avoid use of getNullValue here.
12772 SmallVector<Constant *> Vals;
12773 unsigned VF = VL.size();
12774 if (MaskVF != 0)
12775 VF = std::min(a: VF, b: MaskVF);
12776 Type *VLScalarTy = VL.front()->getType();
12777 for (Value *V : VL.take_front(N: VF)) {
12778 Type *ScalarTy = VLScalarTy->getScalarType();
12779 if (isa<PoisonValue>(Val: V)) {
12780 Vals.push_back(Elt: PoisonValue::get(T: ScalarTy));
12781 continue;
12782 }
12783 if (isa<UndefValue>(Val: V)) {
12784 Vals.push_back(Elt: UndefValue::get(T: ScalarTy));
12785 continue;
12786 }
12787 Vals.push_back(Elt: Constant::getNullValue(Ty: ScalarTy));
12788 }
12789 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: VLScalarTy)) {
12790 assert(SLPReVec && "FixedVectorType is not expected.");
12791 // When REVEC is enabled, we need to expand vector types into scalar
12792 // types.
12793 Vals = replicateMask(Val: Vals, VF: VecTy->getNumElements());
12794 }
12795 return ConstantVector::get(V: Vals);
12796 }
12797 return ConstantVector::getSplat(
12798 EC: ElementCount::getFixed(
12799 MinVal: cast<FixedVectorType>(Val: Root->getType())->getNumElements()),
12800 Elt: getAllOnesValue(DL: *R.DL, Ty: ScalarTy->getScalarType()));
12801 }
12802 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
12803 /// Finalize emission of the shuffles.
12804 InstructionCost finalize(
12805 ArrayRef<int> ExtMask,
12806 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
12807 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
12808 function_ref<void(Value *&, SmallVectorImpl<int> &,
12809 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
12810 Action = {}) {
12811 IsFinalized = true;
12812 if (Action) {
12813 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
12814 if (InVectors.size() == 2)
12815 Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
12816 else
12817 Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
12818 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12819 assert(VF > 0 &&
12820 "Expected vector length for the final value before action.");
12821 Value *V = cast<Value *>(Val: Vec);
12822 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
12823 Cost += createShuffle(P1: V1, P2: V2, Mask);
12824 return V1;
12825 });
12826 InVectors.front() = V;
12827 }
12828 if (!SubVectors.empty()) {
12829 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
12830 if (InVectors.size() == 2)
12831 Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
12832 else
12833 Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
12834 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12835 // Add subvectors permutation cost.
12836 if (!SubVectorsMask.empty()) {
12837 assert(SubVectorsMask.size() <= CommonMask.size() &&
12838 "Expected same size of masks for subvectors and common mask.");
12839 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
12840 copy(Range&: SubVectorsMask, Out: SVMask.begin());
12841 for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) {
12842 if (I2 != PoisonMaskElem) {
12843 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
12844 I1 = I2 + CommonMask.size();
12845 }
12846 }
12847 Cost += ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
12848 Tp: getWidenedType(ScalarTy, VF: CommonMask.size()),
12849 Mask: SVMask, CostKind);
12850 }
12851 for (auto [E, Idx] : SubVectors) {
12852 Type *EScalarTy = E->Scalars.front()->getType();
12853 bool IsSigned = true;
12854 if (auto It = R.MinBWs.find(Val: E); It != R.MinBWs.end()) {
12855 EScalarTy =
12856 IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
12857 IsSigned = It->second.second;
12858 }
12859 if (ScalarTy != EScalarTy) {
12860 unsigned CastOpcode = Instruction::Trunc;
12861 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
12862 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
12863 if (DstSz > SrcSz)
12864 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
12865 Cost += TTI.getCastInstrCost(
12866 Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF: E->getVectorFactor()),
12867 Src: getWidenedType(ScalarTy: EScalarTy, VF: E->getVectorFactor()),
12868 CCH: TTI::CastContextHint::Normal, CostKind);
12869 }
12870 Cost += ::getShuffleCost(
12871 TTI, Kind: TTI::SK_InsertSubvector,
12872 Tp: getWidenedType(ScalarTy, VF: CommonMask.size()), Mask: {}, CostKind, Index: Idx,
12873 SubTp: getWidenedType(ScalarTy, VF: E->getVectorFactor()));
12874 if (!CommonMask.empty()) {
12875 std::iota(first: std::next(x: CommonMask.begin(), n: Idx),
12876 last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()),
12877 value: Idx);
12878 }
12879 }
12880 }
12881
12882 if (!ExtMask.empty()) {
12883 if (CommonMask.empty()) {
12884 CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
12885 } else {
12886 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12887 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12888 if (ExtMask[I] == PoisonMaskElem)
12889 continue;
12890 NewMask[I] = CommonMask[ExtMask[I]];
12891 }
12892 CommonMask.swap(RHS&: NewMask);
12893 }
12894 }
12895 if (CommonMask.empty()) {
12896 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
12897 return Cost;
12898 }
12899 return Cost +
12900 createShuffle(P1: InVectors.front(),
12901 P2: InVectors.size() == 2 ? InVectors.back() : nullptr,
12902 Mask: CommonMask);
12903 }
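// Illustrative note on the ExtMask composition in finalize above (assumed
// values): with CommonMask == {2, 0, 1, 3} and ExtMask == {1, 3}, the
// resulting mask is {CommonMask[1], CommonMask[3]} == {0, 3}, i.e. ExtMask
// selects lanes of the already shuffled value.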
12904
12905 ~ShuffleCostEstimator() {
12906 assert((IsFinalized || CommonMask.empty()) &&
12907 "Shuffle construction must be finalized.");
12908 }
12909};
12910
12911const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
12912 unsigned Idx) const {
12913 TreeEntry *Op = OperandsToTreeEntry.at(Val: {E, Idx});
12914 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
12915 return Op;
12916}
12917
12918TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
12919 if (TE.State == TreeEntry::ScatterVectorize ||
12920 TE.State == TreeEntry::StridedVectorize)
12921 return TTI::CastContextHint::GatherScatter;
12922 if (TE.State == TreeEntry::CompressVectorize)
12923 return TTI::CastContextHint::Masked;
12924 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
12925 !TE.isAltShuffle()) {
12926 if (TE.ReorderIndices.empty())
12927 return TTI::CastContextHint::Normal;
12928 SmallVector<int> Mask;
12929 inversePermutation(Indices: TE.ReorderIndices, Mask);
12930 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts: Mask.size()))
12931 return TTI::CastContextHint::Reversed;
12932 }
12933 return TTI::CastContextHint::None;
12934}
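// Illustrative example (assumed order): a vectorized load whose ReorderIndices
// are {3, 2, 1, 0} yields a reverse mask after inversePermutation, so the cast
// context hint is Reversed; scatter/strided loads report GatherScatter above.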
12935
12936InstructionCost
12937BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
12938 SmallPtrSetImpl<Value *> &CheckedExtracts) {
12939 ArrayRef<Value *> VL = E->Scalars;
12940
12941 Type *ScalarTy = getValueType(V: VL[0]);
12942 if (!isValidElementType(Ty: ScalarTy))
12943 return InstructionCost::getInvalid();
12944 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12945
12946 // If we have computed a smaller type for the expression, update VecTy so
12947 // that the costs will be accurate.
12948 auto It = MinBWs.find(Val: E);
12949 Type *OrigScalarTy = ScalarTy;
12950 if (It != MinBWs.end()) {
12951 auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy);
12952 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
12953 if (VecTy)
12954 ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements());
12955 }
12956 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
12957 unsigned EntryVF = E->getVectorFactor();
12958 auto *FinalVecTy = getWidenedType(ScalarTy, VF: EntryVF);
12959
12960 if (E->isGather()) {
12961 if (allConstant(VL))
12962 return 0;
12963 if (isa<InsertElementInst>(Val: VL[0]))
12964 return InstructionCost::getInvalid();
12965 if (isa<CmpInst>(Val: VL.front()))
12966 ScalarTy = VL.front()->getType();
12967 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
12968 E, ScalarTy, Params&: *TTI, Params&: VectorizedVals, Params&: *this, Params&: CheckedExtracts);
12969 }
12970 if (E->State == TreeEntry::SplitVectorize) {
12971 assert(E->CombinedEntriesWithIndices.size() == 2 &&
12972 "Expected exactly 2 combined entries.");
12973 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
12974 InstructionCost VectorCost = 0;
12975 if (E->ReorderIndices.empty()) {
12976 VectorCost = ::getShuffleCost(
12977 TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: FinalVecTy, Mask: {}, CostKind,
12978 Index: E->CombinedEntriesWithIndices.back().second,
12979 SubTp: getWidenedType(
12980 ScalarTy,
12981 VF: VectorizableTree[E->CombinedEntriesWithIndices.back().first]
12982 ->getVectorFactor()));
12983 } else {
12984 unsigned CommonVF =
12985 std::max(a: VectorizableTree[E->CombinedEntriesWithIndices.front().first]
12986 ->getVectorFactor(),
12987 b: VectorizableTree[E->CombinedEntriesWithIndices.back().first]
12988 ->getVectorFactor());
12989 VectorCost = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc,
12990 Tp: getWidenedType(ScalarTy, VF: CommonVF),
12991 Mask: E->getSplitMask(), CostKind);
12992 }
12993 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
12994 return VectorCost;
12995 }
12996 InstructionCost CommonCost = 0;
12997 SmallVector<int> Mask;
12998 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
12999 (E->State != TreeEntry::StridedVectorize ||
13000 !isReverseOrder(Order: E->ReorderIndices))) {
13001 SmallVector<int> NewMask;
13002 if (E->getOpcode() == Instruction::Store) {
13003 // For stores the order is actually a mask.
13004 NewMask.resize(N: E->ReorderIndices.size());
13005 copy(Range: E->ReorderIndices, Out: NewMask.begin());
13006 } else {
13007 inversePermutation(Indices: E->ReorderIndices, Mask&: NewMask);
13008 }
13009 ::addMask(Mask, SubMask: NewMask);
13010 }
13011 if (!E->ReuseShuffleIndices.empty())
13012 ::addMask(Mask, SubMask: E->ReuseShuffleIndices);
13013 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
13014 CommonCost =
13015 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
13016 assert((E->State == TreeEntry::Vectorize ||
13017 E->State == TreeEntry::ScatterVectorize ||
13018 E->State == TreeEntry::StridedVectorize ||
13019 E->State == TreeEntry::CompressVectorize) &&
13020 "Unhandled state");
13021 assert(E->getOpcode() &&
13022 ((allSameType(VL) && allSameBlock(VL)) ||
13023 (E->getOpcode() == Instruction::GetElementPtr &&
13024 E->getMainOp()->getType()->isPointerTy())) &&
13025 "Invalid VL");
13026 Instruction *VL0 = E->getMainOp();
13027 unsigned ShuffleOrOp =
13028 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
13029 if (E->CombinedOp != TreeEntry::NotCombinedOp)
13030 ShuffleOrOp = E->CombinedOp;
13031 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
13032 const unsigned Sz = UniqueValues.size();
13033 SmallBitVector UsedScalars(Sz, false);
13034 for (unsigned I = 0; I < Sz; ++I) {
13035 if (isa<Instruction>(Val: UniqueValues[I]) &&
13036 getTreeEntries(V: UniqueValues[I]).front() == E)
13037 continue;
13038 UsedScalars.set(I);
13039 }
13040 auto GetCastContextHint = [&](Value *V) {
13041 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
13042 return getCastContextHint(TE: *OpTEs.front());
13043 InstructionsState SrcState = getSameOpcode(VL: E->getOperand(OpIdx: 0), TLI: *TLI);
13044 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
13045 !SrcState.isAltShuffle())
13046 return TTI::CastContextHint::GatherScatter;
13047 return TTI::CastContextHint::None;
13048 };
13049 auto GetCostDiff =
13050 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
13051 function_ref<InstructionCost(InstructionCost)> VectorCost) {
13052 // Calculate the cost of this instruction.
13053 InstructionCost ScalarCost = 0;
13054 if (isa<CastInst, CallInst>(Val: VL0)) {
13055 // For some of the instructions there is no need to calculate the cost
13056 // for each particular instruction; we can use the cost of a single
13057 // instruction multiplied by the total number of scalar instructions.
13058 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
13059 } else {
13060 for (unsigned I = 0; I < Sz; ++I) {
13061 if (UsedScalars.test(Idx: I))
13062 continue;
13063 ScalarCost += ScalarEltCost(I);
13064 }
13065 }
13066
13067 InstructionCost VecCost = VectorCost(CommonCost);
13068 // Check if the current node must be resized, in case the parent node
13069 // has not been resized.
13070 if (It != MinBWs.end() && !UnaryInstruction::isCast(Opcode: E->getOpcode()) &&
13071 E->Idx != 0 &&
13072 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
13073 const EdgeInfo &EI = E->UserTreeIndex;
13074 if (!EI.UserTE->hasState() ||
13075 EI.UserTE->getOpcode() != Instruction::Select ||
13076 EI.EdgeIdx != 0) {
13077 auto UserBWIt = MinBWs.find(Val: EI.UserTE);
13078 Type *UserScalarTy =
13079 (EI.UserTE->isGather() ||
13080 EI.UserTE->State == TreeEntry::SplitVectorize)
13081 ? EI.UserTE->Scalars.front()->getType()
13082 : EI.UserTE->getOperand(OpIdx: EI.EdgeIdx).front()->getType();
13083 if (UserBWIt != MinBWs.end())
13084 UserScalarTy = IntegerType::get(C&: ScalarTy->getContext(),
13085 NumBits: UserBWIt->second.first);
13086 if (ScalarTy != UserScalarTy) {
13087 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
13088 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: UserScalarTy);
13089 unsigned VecOpcode;
13090 auto *UserVecTy = getWidenedType(ScalarTy: UserScalarTy, VF: E->Scalars.size());
13091 if (BWSz > SrcBWSz)
13092 VecOpcode = Instruction::Trunc;
13093 else
13094 VecOpcode =
13095 It->second.second ? Instruction::SExt : Instruction::ZExt;
13096 TTI::CastContextHint CCH = GetCastContextHint(VL0);
13097 VecCost += TTI->getCastInstrCost(Opcode: VecOpcode, Dst: UserVecTy, Src: VecTy, CCH,
13098 CostKind);
13099 }
13100 }
13101 }
13102 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
13103 ScalarCost, "Calculated costs for Tree"));
13104 return VecCost - ScalarCost;
13105 };
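// Illustrative note on the GetCostDiff helper above (assumed numbers): if the
// scalar instructions cost 4 in total and the vector form costs 1 plus a
// CommonCost of 1 for reordering, the returned difference is (1 + 1) - 4 ==
// -2, i.e. vectorizing this node is estimated to save 2.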
13106 // Calculate the cost difference from vectorizing a set of GEPs.
13107 // A negative value means vectorizing is profitable.
13108 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
13109 assert((E->State == TreeEntry::Vectorize ||
13110 E->State == TreeEntry::StridedVectorize ||
13111 E->State == TreeEntry::CompressVectorize) &&
13112 "Entry state expected to be Vectorize, StridedVectorize or "
13113 "CompressVectorize here.");
13114 InstructionCost ScalarCost = 0;
13115 InstructionCost VecCost = 0;
13116 std::tie(args&: ScalarCost, args&: VecCost) = getGEPCosts(
13117 TTI: *TTI, Ptrs, BasePtr, Opcode: E->getOpcode(), CostKind, ScalarTy: OrigScalarTy, VecTy);
13118 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
13119 "Calculated GEPs cost for Tree"));
13120
13121 return VecCost - ScalarCost;
13122 };
13123
13124 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
13125 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: VI ? VI : VL);
13126 if (MinMaxID == Intrinsic::not_intrinsic)
13127 return InstructionCost::getInvalid();
13128 Type *CanonicalType = Ty;
13129 if (CanonicalType->isPtrOrPtrVectorTy())
13130 CanonicalType = CanonicalType->getWithNewType(EltTy: IntegerType::get(
13131 C&: CanonicalType->getContext(),
13132 NumBits: DL->getTypeSizeInBits(Ty: CanonicalType->getScalarType())));
13133
13134 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
13135 {CanonicalType, CanonicalType});
13136 InstructionCost IntrinsicCost =
13137 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
13138 // If the selects are the only users of the compares, the compares will
13139 // be dead and we can subtract their cost.
13140 if (VI && SelectOnly) {
13141 assert((!Ty->isVectorTy() || SLPReVec) &&
13142 "Expected only for scalar type.");
13143 auto *CI = cast<CmpInst>(Val: VI->getOperand(i: 0));
13144 IntrinsicCost -= TTI->getCmpSelInstrCost(
13145 Opcode: CI->getOpcode(), ValTy: Ty, CondTy: Builder.getInt1Ty(), VecPred: CI->getPredicate(),
13146 CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
13147 Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CI);
13148 }
13149 return IntrinsicCost;
13150 };
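// Illustrative example for GetMinMaxCost above (simplified): a pattern such as
//   %c = icmp slt i32 %a, %b
//   %s = select i1 %c, i32 %a, i32 %b
// maps to the smin intrinsic; when the select is the compare's only user, the
// compare's cost is subtracted because it dies together with the select.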
13151 switch (ShuffleOrOp) {
13152 case Instruction::PHI: {
13153 // Count reused scalars.
13154 InstructionCost ScalarCost = 0;
13155 SmallPtrSet<const TreeEntry *, 4> CountedOps;
13156 for (Value *V : UniqueValues) {
13157 auto *PHI = dyn_cast<PHINode>(Val: V);
13158 if (!PHI)
13159 continue;
13160
13161 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
13162 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
13163 Value *Op = PHI->getIncomingValue(i: I);
13164 Operands[I] = Op;
13165 }
13166 if (const TreeEntry *OpTE =
13167 getSameValuesTreeEntry(V: Operands.front(), VL: Operands))
13168 if (CountedOps.insert(Ptr: OpTE).second &&
13169 !OpTE->ReuseShuffleIndices.empty())
13170 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
13171 OpTE->Scalars.size());
13172 }
13173
13174 return CommonCost - ScalarCost;
13175 }
13176 case Instruction::ExtractValue:
13177 case Instruction::ExtractElement: {
13178 APInt DemandedElts;
13179 VectorType *SrcVecTy = nullptr;
13180 auto GetScalarCost = [&](unsigned Idx) {
13181 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
13182 return InstructionCost(TTI::TCC_Free);
13183
13184 auto *I = cast<Instruction>(Val: UniqueValues[Idx]);
13185 if (!SrcVecTy) {
13186 if (ShuffleOrOp == Instruction::ExtractElement) {
13187 auto *EE = cast<ExtractElementInst>(Val: I);
13188 SrcVecTy = EE->getVectorOperandType();
13189 } else {
13190 auto *EV = cast<ExtractValueInst>(Val: I);
13191 Type *AggregateTy = EV->getAggregateOperand()->getType();
13192 unsigned NumElts;
13193 if (auto *ATy = dyn_cast<ArrayType>(Val: AggregateTy))
13194 NumElts = ATy->getNumElements();
13195 else
13196 NumElts = AggregateTy->getStructNumElements();
13197 SrcVecTy = getWidenedType(ScalarTy: OrigScalarTy, VF: NumElts);
13198 }
13199 }
13200 if (I->hasOneUse()) {
13201 Instruction *Ext = I->user_back();
13202 if ((isa<SExtInst>(Val: Ext) || isa<ZExtInst>(Val: Ext)) &&
13203 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
13204 // Use getExtractWithExtendCost() to calculate the cost of
13205 // extractelement/ext pair.
13206 InstructionCost Cost = TTI->getExtractWithExtendCost(
13207 Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: SrcVecTy, Index: *getExtractIndex(E: I),
13208 CostKind);
13209 // Subtract the cost of s|zext, which is accounted for separately.
13210 Cost -= TTI->getCastInstrCost(
13211 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: I->getType(),
13212 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
13213 return Cost;
13214 }
13215 }
13216 if (DemandedElts.isZero())
13217 DemandedElts = APInt::getZero(numBits: getNumElements(Ty: SrcVecTy));
13218 DemandedElts.setBit(*getExtractIndex(E: I));
13219 return InstructionCost(TTI::TCC_Free);
13220 };
13221 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
13222 return CommonCost - (DemandedElts.isZero()
13223 ? TTI::TCC_Free
13224 : TTI.getScalarizationOverhead(
13225 Ty: SrcVecTy, DemandedElts, /*Insert=*/false,
13226 /*Extract=*/true, CostKind));
13227 };
13228 return GetCostDiff(GetScalarCost, GetVectorCost);
13229 }
13230 case Instruction::InsertElement: {
13231 assert(E->ReuseShuffleIndices.empty() &&
13232 "Only unique insertelements are expected.");
13233 auto *SrcVecTy = cast<FixedVectorType>(Val: VL0->getType());
13234 unsigned const NumElts = SrcVecTy->getNumElements();
13235 unsigned const NumScalars = VL.size();
13236
13237 unsigned NumOfParts = ::getNumberOfParts(TTI: *TTI, VecTy: SrcVecTy);
13238
13239 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
13240 unsigned OffsetBeg = *getElementIndex(Inst: VL.front());
13241 unsigned OffsetEnd = OffsetBeg;
13242 InsertMask[OffsetBeg] = 0;
13243 for (auto [I, V] : enumerate(First: VL.drop_front())) {
13244 unsigned Idx = *getElementIndex(Inst: V);
13245 if (OffsetBeg > Idx)
13246 OffsetBeg = Idx;
13247 else if (OffsetEnd < Idx)
13248 OffsetEnd = Idx;
13249 InsertMask[Idx] = I + 1;
13250 }
13251 unsigned VecScalarsSz = PowerOf2Ceil(A: NumElts);
13252 if (NumOfParts > 0 && NumOfParts < NumElts)
13253 VecScalarsSz = PowerOf2Ceil(A: (NumElts + NumOfParts - 1) / NumOfParts);
13254 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
13255 VecScalarsSz;
13256 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
13257 unsigned InsertVecSz = std::min<unsigned>(
13258 a: PowerOf2Ceil(A: OffsetEnd - OffsetBeg + 1),
13259 b: ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
13260 bool IsWholeSubvector =
13261 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
13262 // Check if we can safely insert a subvector. If it is not possible, just
13263 // generate a whole-sized vector and shuffle the source vector and the new
13264 // subvector.
13265 if (OffsetBeg + InsertVecSz > VecSz) {
13266 // Align OffsetBeg to generate correct mask.
13267 OffsetBeg = alignDown(Value: OffsetBeg, Align: VecSz, Skew: Offset);
13268 InsertVecSz = VecSz;
13269 }
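// Illustrative example (assumed indices): inserting scalars at positions
// {2, 3, 4, 5} of an 8-wide destination gives OffsetBeg == 2, OffsetEnd == 5,
// VecScalarsSz == 8, VecSz == 8, Offset == 0 and InsertVecSz == 4 (rounded up
// to a power of two), i.e. the inserts are modeled as a 4-wide subvector.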
13270
13271 APInt DemandedElts = APInt::getZero(numBits: NumElts);
13272 // TODO: Add support for Instruction::InsertValue.
13273 SmallVector<int> Mask;
13274 if (!E->ReorderIndices.empty()) {
13275 inversePermutation(Indices: E->ReorderIndices, Mask);
13276 Mask.append(NumInputs: InsertVecSz - Mask.size(), Elt: PoisonMaskElem);
13277 } else {
13278 Mask.assign(NumElts: VecSz, Elt: PoisonMaskElem);
13279 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: InsertVecSz), value: 0);
13280 }
13281 bool IsIdentity = true;
13282 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
13283 Mask.swap(RHS&: PrevMask);
13284 for (unsigned I = 0; I < NumScalars; ++I) {
13285 unsigned InsertIdx = *getElementIndex(Inst: VL[PrevMask[I]]);
13286 DemandedElts.setBit(InsertIdx);
13287 IsIdentity &= InsertIdx - OffsetBeg == I;
13288 Mask[InsertIdx - OffsetBeg] = I;
13289 }
13290 assert(Offset < NumElts && "Failed to find vector index offset");
13291
13292 InstructionCost Cost = 0;
13293 Cost -=
13294 getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: SrcVecTy, DemandedElts,
13295 /*Insert*/ true, /*Extract*/ false, CostKind);
13296
13297 // First cost - resize to the actual vector size if this is not an
13298 // identity shuffle or the vector needs to be shifted.
13299 // Do not calculate the cost if the actual size is the register size and
13300 // we can merge this shuffle with the following SK_Select.
13301 auto *InsertVecTy = getWidenedType(ScalarTy, VF: InsertVecSz);
13302 if (!IsIdentity)
13303 Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc,
13304 Tp: InsertVecTy, Mask);
13305 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: [E](Value *V) {
13306 return !is_contained(Range: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
13307 }));
13308 // Second cost - permutation with a subvector, if some elements are from
13309 // the initial vector or a subvector is being inserted.
13310 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
13311 // subvector of ActualVecTy.
13312 SmallBitVector InMask =
13313 isUndefVector(V: FirstInsert->getOperand(i: 0),
13314 UseMask: buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask));
13315 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
13316 if (InsertVecSz != VecSz) {
13317 auto *ActualVecTy = getWidenedType(ScalarTy, VF: VecSz);
13318 Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: ActualVecTy, Mask: {},
13319 CostKind, Index: OffsetBeg - Offset, SubTp: InsertVecTy);
13320 } else {
13321 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
13322 Mask[I] = InMask.test(Idx: I) ? PoisonMaskElem : I;
13323 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
13324 I <= End; ++I)
13325 if (Mask[I] != PoisonMaskElem)
13326 Mask[I] = I + VecSz;
13327 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
13328 Mask[I] =
13329 ((I >= InMask.size()) || InMask.test(Idx: I)) ? PoisonMaskElem : I;
13330 Cost +=
13331 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: InsertVecTy, Mask);
13332 }
13333 }
13334 return Cost;
13335 }
13336 case Instruction::ZExt:
13337 case Instruction::SExt:
13338 case Instruction::FPToUI:
13339 case Instruction::FPToSI:
13340 case Instruction::FPExt:
13341 case Instruction::PtrToInt:
13342 case Instruction::IntToPtr:
13343 case Instruction::SIToFP:
13344 case Instruction::UIToFP:
13345 case Instruction::Trunc:
13346 case Instruction::FPTrunc:
13347 case Instruction::BitCast: {
13348 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
13349 Type *SrcScalarTy = VL0->getOperand(i: 0)->getType();
13350 auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size());
13351 unsigned Opcode = ShuffleOrOp;
13352 unsigned VecOpcode = Opcode;
13353 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
13354 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
13355 // Check if the values are candidates to demote.
13356 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy->getScalarType());
13357 if (SrcIt != MinBWs.end()) {
13358 SrcBWSz = SrcIt->second.first;
13359 unsigned SrcScalarTyNumElements = getNumElements(Ty: SrcScalarTy);
13360 SrcScalarTy = IntegerType::get(C&: F->getContext(), NumBits: SrcBWSz);
13361 SrcVecTy =
13362 getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size() * SrcScalarTyNumElements);
13363 }
13364 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType());
13365 if (BWSz == SrcBWSz) {
13366 VecOpcode = Instruction::BitCast;
13367 } else if (BWSz < SrcBWSz) {
13368 VecOpcode = Instruction::Trunc;
13369 } else if (It != MinBWs.end()) {
13370 assert(BWSz > SrcBWSz && "Invalid cast!");
13371 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
13372 } else if (SrcIt != MinBWs.end()) {
13373 assert(BWSz > SrcBWSz && "Invalid cast!");
13374 VecOpcode =
13375 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
13376 }
13377 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
13378 !SrcIt->second.second) {
13379 VecOpcode = Instruction::UIToFP;
13380 }
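// Illustrative example for the VecOpcode selection above (assumed bit widths):
// for a zext from i8 to i32 where MinBWs shows only 8 bits of the result are
// needed, BWSz == SrcBWSz == 8 and the vector cast degenerates to a free
// bitcast; if 16 bits are still needed, it becomes a zext/sext to i16 instead.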
13381 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
13382 assert(Idx == 0 && "Expected 0 index only");
13383 return TTI->getCastInstrCost(Opcode, Dst: VL0->getType(),
13384 Src: VL0->getOperand(i: 0)->getType(),
13385 CCH: TTI::getCastContextHint(I: VL0), CostKind, I: VL0);
13386 };
13387 auto GetVectorCost = [=](InstructionCost CommonCost) {
13388 // Do not count cost here if minimum bitwidth is in effect and it is just
13389 // a bitcast (here it is just a noop).
13390 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
13391 return CommonCost;
13392 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
13393 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(i: 0));
13394
13395 bool IsArithmeticExtendedReduction =
13396 E->Idx == 0 && UserIgnoreList &&
13397 all_of(Range: *UserIgnoreList, P: [](Value *V) {
13398 auto *I = cast<Instruction>(Val: V);
13399 return is_contained(Set: {Instruction::Add, Instruction::FAdd,
13400 Instruction::Mul, Instruction::FMul,
13401 Instruction::And, Instruction::Or,
13402 Instruction::Xor},
13403 Element: I->getOpcode());
13404 });
13405 if (IsArithmeticExtendedReduction &&
13406 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
13407 return CommonCost;
13408 return CommonCost +
13409 TTI->getCastInstrCost(Opcode: VecOpcode, Dst: VecTy, Src: SrcVecTy, CCH, CostKind,
13410 I: VecOpcode == Opcode ? VI : nullptr);
13411 };
13412 return GetCostDiff(GetScalarCost, GetVectorCost);
13413 }
13414 case Instruction::FCmp:
13415 case Instruction::ICmp:
13416 case Instruction::Select: {
13417 CmpPredicate VecPred, SwappedVecPred;
13418 auto MatchCmp = m_Cmp(Pred&: VecPred, L: m_Value(), R: m_Value());
13419 if (match(V: VL0, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) ||
13420 match(V: VL0, P: MatchCmp))
13421 SwappedVecPred = CmpInst::getSwappedPredicate(pred: VecPred);
13422 else
13423 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
13424 ? CmpInst::BAD_FCMP_PREDICATE
13425 : CmpInst::BAD_ICMP_PREDICATE;
13426 auto GetScalarCost = [&](unsigned Idx) {
13427 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
13428 return InstructionCost(TTI::TCC_Free);
13429
13430 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
13431 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
13432 ? CmpInst::BAD_FCMP_PREDICATE
13433 : CmpInst::BAD_ICMP_PREDICATE;
13434 auto MatchCmp = m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value());
13435 if ((!match(V: VI, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) &&
13436 !match(V: VI, P: MatchCmp)) ||
13437 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
13438 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
13439 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
13440 ? CmpInst::BAD_FCMP_PREDICATE
13441 : CmpInst::BAD_ICMP_PREDICATE;
13442
13443 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
13444 Opcode: E->getOpcode(), ValTy: OrigScalarTy, CondTy: Builder.getInt1Ty(), VecPred: CurrentPred,
13445 CostKind, Op1Info: getOperandInfo(Ops: VI->getOperand(i: 0)),
13446 Op2Info: getOperandInfo(Ops: VI->getOperand(i: 1)), I: VI);
13447 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
13448 if (IntrinsicCost.isValid())
13449 ScalarCost = IntrinsicCost;
13450
13451 return ScalarCost;
13452 };
13453 auto GetVectorCost = [&](InstructionCost CommonCost) {
13454 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
13455
13456 InstructionCost VecCost =
13457 TTI->getCmpSelInstrCost(Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred,
13458 CostKind, Op1Info: getOperandInfo(Ops: E->getOperand(OpIdx: 0)),
13459 Op2Info: getOperandInfo(Ops: E->getOperand(OpIdx: 1)), I: VL0);
13460 if (auto *SI = dyn_cast<SelectInst>(Val: VL0)) {
13461 auto *CondType =
13462 getWidenedType(ScalarTy: SI->getCondition()->getType(), VF: VL.size());
13463 unsigned CondNumElements = CondType->getNumElements();
13464 unsigned VecTyNumElements = getNumElements(Ty: VecTy);
13465 assert(VecTyNumElements >= CondNumElements &&
13466 VecTyNumElements % CondNumElements == 0 &&
13467 "Cannot vectorize Instruction::Select");
13468 if (CondNumElements != VecTyNumElements) {
13469 // When the return type is i1 but the source is a fixed vector type, we
13470 // need to duplicate the condition value.
13471 VecCost += ::getShuffleCost(
13472 TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: CondType,
13473 Mask: createReplicatedMask(ReplicationFactor: VecTyNumElements / CondNumElements,
13474 VF: CondNumElements));
13475 }
13476 }
13477 return VecCost + CommonCost;
13478 };
13479 return GetCostDiff(GetScalarCost, GetVectorCost);
13480 }
13481 case TreeEntry::MinMax: {
13482 auto GetScalarCost = [&](unsigned Idx) {
13483 return GetMinMaxCost(OrigScalarTy);
13484 };
13485 auto GetVectorCost = [&](InstructionCost CommonCost) {
13486 InstructionCost VecCost = GetMinMaxCost(VecTy);
13487 return VecCost + CommonCost;
13488 };
13489 return GetCostDiff(GetScalarCost, GetVectorCost);
13490 }
13491 case Instruction::FNeg:
13492 case Instruction::Add:
13493 case Instruction::FAdd:
13494 case Instruction::Sub:
13495 case Instruction::FSub:
13496 case Instruction::Mul:
13497 case Instruction::FMul:
13498 case Instruction::UDiv:
13499 case Instruction::SDiv:
13500 case Instruction::FDiv:
13501 case Instruction::URem:
13502 case Instruction::SRem:
13503 case Instruction::FRem:
13504 case Instruction::Shl:
13505 case Instruction::LShr:
13506 case Instruction::AShr:
13507 case Instruction::And:
13508 case Instruction::Or:
13509 case Instruction::Xor: {
13510 auto GetScalarCost = [&](unsigned Idx) {
13511 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
13512 return InstructionCost(TTI::TCC_Free);
13513
13514 // We cannot retrieve the operand from UniqueValues[Idx] because an
13515 // interchangeable instruction may be used. The order and the actual
13516 // operand might differ from what is retrieved from UniqueValues[Idx].
13517 Value *Op1 = E->getOperand(OpIdx: 0)[Idx];
13518 Value *Op2;
13519 SmallVector<const Value *, 2> Operands(1, Op1);
13520 if (isa<UnaryOperator>(Val: UniqueValues[Idx])) {
13521 Op2 = Op1;
13522 } else {
13523 Op2 = E->getOperand(OpIdx: 1)[Idx];
13524 Operands.push_back(Elt: Op2);
13525 }
13526 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(V: Op1);
13527 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(V: Op2);
13528 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: OrigScalarTy, CostKind,
13529 Opd1Info: Op1Info, Opd2Info: Op2Info, Args: Operands);
13530 };
13531 auto GetVectorCost = [=](InstructionCost CommonCost) {
13532 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
13533 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
13534 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
13535 if (all_of(Range&: Ops, P: [&](Value *Op) {
13536 auto *CI = dyn_cast<ConstantInt>(Val: Op);
13537 return CI && CI->getValue().countr_one() >= It->second.first;
13538 }))
13539 return CommonCost;
13540 }
13541 }
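// Illustrative example (assumed constants): if the expression was demoted to
// i16 and an operand of the 'and' is a constant such as 0xFFFF (at least 16
// trailing ones), the mask cannot change the demoted bits, so only CommonCost
// is charged for the vector 'and'.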
13542 unsigned OpIdx = isa<UnaryOperator>(Val: VL0) ? 0 : 1;
13543 TTI::OperandValueInfo Op1Info = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
13544 TTI::OperandValueInfo Op2Info = getOperandInfo(Ops: E->getOperand(OpIdx));
13545 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: VecTy, CostKind, Opd1Info: Op1Info,
13546 Opd2Info: Op2Info, Args: {}, CxtI: nullptr, TLibInfo: TLI) +
13547 CommonCost;
13548 };
13549 return GetCostDiff(GetScalarCost, GetVectorCost);
13550 }
13551 case Instruction::GetElementPtr: {
13552 return CommonCost + GetGEPCostDiff(VL, VL0);
13553 }
13554 case Instruction::Load: {
13555 auto GetScalarCost = [&](unsigned Idx) {
13556 auto *VI = cast<LoadInst>(Val: UniqueValues[Idx]);
13557 return TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: OrigScalarTy,
13558 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
13559 CostKind, OpdInfo: TTI::OperandValueInfo(), I: VI);
13560 };
13561 auto *LI0 = cast<LoadInst>(Val: VL0);
13562 auto GetVectorCost = [&](InstructionCost CommonCost) {
13563 InstructionCost VecLdCost;
13564 switch (E->State) {
13565 case TreeEntry::Vectorize:
13566 if (unsigned Factor = E->getInterleaveFactor()) {
13567 VecLdCost = TTI->getInterleavedMemoryOpCost(
13568 Opcode: Instruction::Load, VecTy, Factor, Indices: {}, Alignment: LI0->getAlign(),
13569 AddressSpace: LI0->getPointerAddressSpace(), CostKind);
13570
13571 } else {
13572 VecLdCost = TTI->getMemoryOpCost(
13573 Opcode: Instruction::Load, Src: VecTy, Alignment: LI0->getAlign(),
13574 AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
13575 }
13576 break;
13577 case TreeEntry::StridedVectorize: {
13578 Align CommonAlignment =
13579 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
13580 VecLdCost = TTI->getStridedMemoryOpCost(
13581 Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(),
13582 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
13583 break;
13584 }
13585 case TreeEntry::CompressVectorize: {
13586 bool IsMasked;
13587 unsigned InterleaveFactor;
13588 SmallVector<int> CompressMask;
13589 VectorType *LoadVecTy;
13590 SmallVector<Value *> Scalars(VL);
13591 if (!E->ReorderIndices.empty()) {
13592 SmallVector<int> Mask(E->ReorderIndices.begin(),
13593 E->ReorderIndices.end());
13594 reorderScalars(Scalars, Mask);
13595 }
13596 SmallVector<Value *> PointerOps(Scalars.size());
13597 for (auto [I, V] : enumerate(First&: Scalars))
13598 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
13599 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
13600 VL: Scalars, PointerOps, Order: E->ReorderIndices, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT,
13601 TLI: *TLI, AreAllUsersVectorized: [](Value *) { return true; }, IsMasked, InterleaveFactor,
13602 CompressMask, LoadVecTy);
13603 assert(IsVectorized && "Failed to vectorize load");
13604 CompressEntryToData.try_emplace(Key: E, Args&: CompressMask, Args&: LoadVecTy,
13605 Args&: InterleaveFactor, Args&: IsMasked);
13606 Align CommonAlignment = LI0->getAlign();
13607 if (InterleaveFactor) {
13608 VecLdCost = TTI->getInterleavedMemoryOpCost(
13609 Opcode: Instruction::Load, VecTy: LoadVecTy, Factor: InterleaveFactor, Indices: {},
13610 Alignment: CommonAlignment, AddressSpace: LI0->getPointerAddressSpace(), CostKind);
13611 } else if (IsMasked) {
13612 VecLdCost = TTI->getMaskedMemoryOpCost(
13613 Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
13614 AddressSpace: LI0->getPointerAddressSpace(), CostKind);
13615 // TODO: include this cost into CommonCost.
13616 VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
13617 Tp: LoadVecTy, Mask: CompressMask, CostKind);
13618 } else {
13619 VecLdCost = TTI->getMemoryOpCost(
13620 Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
13621 AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
13622 // TODO: include this cost into CommonCost.
13623 VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
13624 Tp: LoadVecTy, Mask: CompressMask, CostKind);
13625 }
13626 break;
13627 }
13628 case TreeEntry::ScatterVectorize: {
13629 Align CommonAlignment =
13630 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
13631 VecLdCost = TTI->getGatherScatterOpCost(
13632 Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(),
13633 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
13634 break;
13635 }
13636 case TreeEntry::CombinedVectorize:
13637 case TreeEntry::SplitVectorize:
13638 case TreeEntry::NeedToGather:
13639 llvm_unreachable("Unexpected vectorization state.");
13640 }
13641 return VecLdCost + CommonCost;
13642 };
13643
13644 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
// If this node generates a masked gather load, it is not a terminal node;
// hence the address operand cost is estimated separately.
13647 if (E->State == TreeEntry::ScatterVectorize)
13648 return Cost;
13649
13650 // Estimate cost of GEPs since this tree node is a terminator.
13651 SmallVector<Value *> PointerOps(VL.size());
13652 for (auto [I, V] : enumerate(First&: VL))
13653 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
13654 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
13655 }
13656 case Instruction::Store: {
13657 bool IsReorder = !E->ReorderIndices.empty();
13658 auto GetScalarCost = [=](unsigned Idx) {
13659 auto *VI = cast<StoreInst>(Val: VL[Idx]);
13660 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: VI->getValueOperand());
13661 return TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: OrigScalarTy,
13662 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
13663 CostKind, OpdInfo: OpInfo, I: VI);
13664 };
13665 auto *BaseSI =
13666 cast<StoreInst>(Val: IsReorder ? VL[E->ReorderIndices.front()] : VL0);
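// The vectorized store is either strided, interleaved, or a plain
// consecutive vector store, depending on the node state and interleave
// factor.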
13667 auto GetVectorCost = [=](InstructionCost CommonCost) {
13668 // We know that we can merge the stores. Calculate the cost.
13669 InstructionCost VecStCost;
13670 if (E->State == TreeEntry::StridedVectorize) {
13671 Align CommonAlignment =
13672 computeCommonAlignment<StoreInst>(VL: UniqueValues.getArrayRef());
13673 VecStCost = TTI->getStridedMemoryOpCost(
13674 Opcode: Instruction::Store, DataTy: VecTy, Ptr: BaseSI->getPointerOperand(),
13675 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
13676 } else {
13677 assert(E->State == TreeEntry::Vectorize &&
13678 "Expected either strided or consecutive stores.");
13679 if (unsigned Factor = E->getInterleaveFactor()) {
13680 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
13681 "No reused shuffles expected");
13682 CommonCost = 0;
13683 VecStCost = TTI->getInterleavedMemoryOpCost(
13684 Opcode: Instruction::Store, VecTy, Factor, Indices: {}, Alignment: BaseSI->getAlign(),
13685 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind);
13686 } else {
13687 TTI::OperandValueInfo OpInfo = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
13688 VecStCost = TTI->getMemoryOpCost(
13689 Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
13690 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind, OpdInfo: OpInfo);
13691 }
13692 }
13693 return VecStCost + CommonCost;
13694 };
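// Collect the pointer operands in vectorized order (applying the reorder
// indices, if any) so the GEP cost difference is computed against the
// vectorized store layout.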
13695 SmallVector<Value *> PointerOps(VL.size());
13696 for (auto [I, V] : enumerate(First&: VL)) {
13697 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
13698 PointerOps[Idx] = cast<StoreInst>(Val: V)->getPointerOperand();
13699 }
13700
13701 return GetCostDiff(GetScalarCost, GetVectorCost) +
13702 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
13703 }
13704 case Instruction::Call: {
13705 auto GetScalarCost = [&](unsigned Idx) {
13706 auto *CI = cast<CallInst>(Val: UniqueValues[Idx]);
13707 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13708 if (ID != Intrinsic::not_intrinsic) {
13709 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
13710 return TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
13711 }
13712 return TTI->getCallInstrCost(F: CI->getCalledFunction(),
13713 RetTy: CI->getFunctionType()->getReturnType(),
13714 Tys: CI->getFunctionType()->params(), CostKind);
13715 };
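// For the vector form, take the cheaper of a vector intrinsic and a vector
// library call, as computed by getVectorCallCosts.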
13716 auto GetVectorCost = [=](InstructionCost CommonCost) {
13717 auto *CI = cast<CallInst>(Val: VL0);
13718 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13719 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
13720 CI, ID, VF: VecTy->getNumElements(),
13721 MinBW: It != MinBWs.end() ? It->second.first : 0, TTI);
13722 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
13723 return std::min(a: VecCallCosts.first, b: VecCallCosts.second) + CommonCost;
13724 };
13725 return GetCostDiff(GetScalarCost, GetVectorCost);
13726 }
13727 case Instruction::ShuffleVector: {
13728 if (!SLPReVec || E->isAltShuffle())
13729 assert(E->isAltShuffle() &&
13730 ((Instruction::isBinaryOp(E->getOpcode()) &&
13731 Instruction::isBinaryOp(E->getAltOpcode())) ||
13732 (Instruction::isCast(E->getOpcode()) &&
13733 Instruction::isCast(E->getAltOpcode())) ||
13734 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13735 "Invalid Shuffle Vector Operand");
13736 // Try to find the previous shuffle node with the same operands and same
13737 // main/alternate ops.
13738 auto TryFindNodeWithEqualOperands = [=]() {
13739 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13740 if (TE.get() == E)
13741 break;
13742 if (TE->hasState() && TE->isAltShuffle() &&
13743 ((TE->getOpcode() == E->getOpcode() &&
13744 TE->getAltOpcode() == E->getAltOpcode()) ||
13745 (TE->getOpcode() == E->getAltOpcode() &&
13746 TE->getAltOpcode() == E->getOpcode())) &&
13747 TE->hasEqualOperands(TE: *E))
13748 return true;
13749 }
13750 return false;
13751 };
13752 auto GetScalarCost = [&](unsigned Idx) {
13753 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
13754 return InstructionCost(TTI::TCC_Free);
13755
13756 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
13757 assert(E->getMatchingMainOpOrAltOp(VI) &&
13758 "Unexpected main/alternate opcode");
13759 (void)E;
13760 return TTI->getInstructionCost(U: VI, CostKind);
13761 };
// Need to clear CommonCost since the final shuffle cost is included in the
// vector cost.
13764 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
13765 // VecCost is equal to sum of the cost of creating 2 vectors
13766 // and the cost of creating shuffle.
13767 InstructionCost VecCost = 0;
13768 if (TryFindNodeWithEqualOperands()) {
13769 LLVM_DEBUG({
13770 dbgs() << "SLP: diamond match for alternate node found.\n";
13771 E->dump();
13772 });
// No need to add new vector costs here since we're going to reuse the same
// main/alternate vector ops and just do different shuffling.
13775 } else if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
13776 VecCost =
13777 TTIRef.getArithmeticInstrCost(Opcode: E->getOpcode(), Ty: VecTy, CostKind);
13778 VecCost +=
13779 TTIRef.getArithmeticInstrCost(Opcode: E->getAltOpcode(), Ty: VecTy, CostKind);
13780 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
13781 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
VecCost = TTIRef.getCmpSelInstrCost(
    E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
    {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
    VL0);
VecCost += TTIRef.getCmpSelInstrCost(
    E->getOpcode(), VecTy, MaskTy,
    cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
    {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
    E->getAltOp());
13791 } else {
13792 Type *SrcSclTy = E->getMainOp()->getOperand(i: 0)->getType();
13793 auto *SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
13794 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
13795 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
13796 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
13797 unsigned SrcBWSz =
13798 DL->getTypeSizeInBits(Ty: E->getMainOp()->getOperand(i: 0)->getType());
13799 if (SrcIt != MinBWs.end()) {
13800 SrcBWSz = SrcIt->second.first;
13801 SrcSclTy = IntegerType::get(C&: SrcSclTy->getContext(), NumBits: SrcBWSz);
13802 SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
13803 }
13804 if (BWSz <= SrcBWSz) {
13805 if (BWSz < SrcBWSz)
13806 VecCost =
13807 TTIRef.getCastInstrCost(Opcode: Instruction::Trunc, Dst: VecTy, Src: SrcTy,
13808 CCH: TTI::CastContextHint::None, CostKind);
13809 LLVM_DEBUG({
13810 dbgs()
13811 << "SLP: alternate extension, which should be truncated.\n";
13812 E->dump();
13813 });
13814 return VecCost;
13815 }
13816 }
13817 VecCost = TTIRef.getCastInstrCost(Opcode: E->getOpcode(), Dst: VecTy, Src: SrcTy,
13818 CCH: TTI::CastContextHint::None, CostKind);
13819 VecCost +=
13820 TTIRef.getCastInstrCost(Opcode: E->getAltOpcode(), Dst: VecTy, Src: SrcTy,
13821 CCH: TTI::CastContextHint::None, CostKind);
13822 }
13823 SmallVector<int> Mask;
13824 E->buildAltOpShuffleMask(
13825 IsAltOp: [&](Instruction *I) {
13826 assert(E->getMatchingMainOpOrAltOp(I) &&
13827 "Unexpected main/alternate opcode");
13828 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
13829 TLI: *TLI);
13830 },
13831 Mask);
13832 VecCost += ::getShuffleCost(TTI: TTIRef, Kind: TargetTransformInfo::SK_PermuteTwoSrc,
13833 Tp: FinalVecTy, Mask, CostKind);
// Patterns like [fadd,fsub] can be combined into a single instruction on
// x86. Reordering them into [fsub,fadd] blocks this pattern. So we need to
// take their order into account when looking for the most used order.
13838 unsigned Opcode0 = E->getOpcode();
13839 unsigned Opcode1 = E->getAltOpcode();
13840 SmallBitVector OpcodeMask(
13841 getAltInstrMask(VL: E->Scalars, ScalarTy, Opcode0, Opcode1));
13842 // If this pattern is supported by the target then we consider the
13843 // order.
13844 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
13845 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
13846 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
13847 return AltVecCost < VecCost ? AltVecCost : VecCost;
13848 }
13849 // TODO: Check the reverse order too.
13850 return VecCost;
13851 };
13852 if (SLPReVec && !E->isAltShuffle())
13853 return GetCostDiff(
13854 GetScalarCost, [&](InstructionCost) -> InstructionCost {
// If a group uses its mask in order, the shufflevector can be eliminated by
// instcombine and the cost is 0.
13857 assert(isa<ShuffleVectorInst>(VL.front()) &&
13858 "Not supported shufflevector usage.");
13859 auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
13860 unsigned SVNumElements =
13861 cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())
13862 ->getNumElements();
13863 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
13864 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
13865 ArrayRef<Value *> Group = VL.slice(N: I, M: GroupSize);
13866 int NextIndex = 0;
13867 if (!all_of(Range&: Group, P: [&](Value *V) {
13868 assert(isa<ShuffleVectorInst>(V) &&
13869 "Not supported shufflevector usage.");
13870 auto *SV = cast<ShuffleVectorInst>(Val: V);
13871 int Index;
13872 [[maybe_unused]] bool IsExtractSubvectorMask =
13873 SV->isExtractSubvectorMask(Index);
13874 assert(IsExtractSubvectorMask &&
13875 "Not supported shufflevector usage.");
13876 if (NextIndex != Index)
13877 return false;
13878 NextIndex += SV->getShuffleMask().size();
13879 return true;
13880 }))
13881 return ::getShuffleCost(
13882 TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: VecTy,
13883 Mask: calculateShufflevectorMask(VL: E->Scalars));
13884 }
13885 return TTI::TCC_Free;
13886 });
13887 return GetCostDiff(GetScalarCost, GetVectorCost);
13888 }
13889 case Instruction::Freeze:
13890 return CommonCost;
13891 default:
13892 llvm_unreachable("Unknown instruction");
13893 }
13894}
13895
13896bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
13897 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
<< VectorizableTree.size() << " is fully vectorizable.\n");
13899
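// A gather node is considered vectorizable here if it has no ephemeral
// values and is either all-constant, a splat, small enough, a shuffle of
// extractelements, or contains (vectorizable) loads.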
13900 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
13901 SmallVector<int> Mask;
13902 return TE->isGather() &&
13903 !any_of(Range: TE->Scalars,
13904 P: [this](Value *V) { return EphValues.contains(Ptr: V); }) &&
13905 (allConstant(VL: TE->Scalars) || isSplat(VL: TE->Scalars) ||
13906 TE->Scalars.size() < Limit ||
13907 (((TE->hasState() &&
13908 TE->getOpcode() == Instruction::ExtractElement) ||
13909 all_of(Range: TE->Scalars, P: IsaPred<ExtractElementInst, UndefValue>)) &&
13910 isFixedVectorShuffle(VL: TE->Scalars, Mask, AC)) ||
13911 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
13912 !TE->isAltShuffle()) ||
13913 any_of(Range: TE->Scalars, P: IsaPred<LoadInst>));
13914 };
13915
13916 // We only handle trees of heights 1 and 2.
13917 if (VectorizableTree.size() == 1 &&
13918 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
13919 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
13920 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
13921 (ForReduction &&
13922 AreVectorizableGathers(VectorizableTree[0].get(),
13923 VectorizableTree[0]->Scalars.size()) &&
13924 VectorizableTree[0]->getVectorFactor() > 2)))
13925 return true;
13926
13927 if (VectorizableTree.size() != 2)
13928 return false;
13929
// Handle splat and all-constant stores. Also try to vectorize tiny trees
// whose second node is a gather with fewer scalar operands than the initial
// tree element (it may be profitable to shuffle the second gather), or whose
// scalars are extractelements that form a shuffle.
13934 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
13935 AreVectorizableGathers(VectorizableTree[1].get(),
13936 VectorizableTree[0]->Scalars.size()))
13937 return true;
13938
13939 // Gathering cost would be too much for tiny trees.
13940 if (VectorizableTree[0]->isGather() ||
13941 (VectorizableTree[1]->isGather() &&
13942 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
13943 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
13944 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
13945 return false;
13946
13947 return true;
13948}
13949
13950static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
13951 TargetTransformInfo *TTI,
13952 bool MustMatchOrInst) {
13953 // Look past the root to find a source value. Arbitrarily follow the
13954 // path through operand 0 of any 'or'. Also, peek through optional
13955 // shift-left-by-multiple-of-8-bits.
13956 Value *ZextLoad = Root;
13957 const APInt *ShAmtC;
13958 bool FoundOr = false;
13959 while (!isa<ConstantExpr>(Val: ZextLoad) &&
13960 (match(V: ZextLoad, P: m_Or(L: m_Value(), R: m_Value())) ||
13961 (match(V: ZextLoad, P: m_Shl(L: m_Value(), R: m_APInt(Res&: ShAmtC))) &&
13962 ShAmtC->urem(RHS: 8) == 0))) {
13963 auto *BinOp = cast<BinaryOperator>(Val: ZextLoad);
13964 ZextLoad = BinOp->getOperand(i_nocapture: 0);
13965 if (BinOp->getOpcode() == Instruction::Or)
13966 FoundOr = true;
13967 }
13968 // Check if the input is an extended load of the required or/shift expression.
13969 Value *Load;
13970 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
13971 !match(V: ZextLoad, P: m_ZExt(Op: m_Value(V&: Load))) || !isa<LoadInst>(Val: Load))
13972 return false;
13973
13974 // Require that the total load bit width is a legal integer type.
13975 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
13976 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
13977 Type *SrcTy = Load->getType();
13978 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
13979 if (!TTI->isTypeLegal(Ty: IntegerType::get(C&: Root->getContext(), NumBits: LoadBitWidth)))
13980 return false;
13981
13982 // Everything matched - assume that we can fold the whole sequence using
13983 // load combining.
13984 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
13985 << *(cast<Instruction>(Root)) << "\n");
13986
13987 return true;
13988}
13989
13990bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
13991 if (RdxKind != RecurKind::Or)
13992 return false;
13993
13994 unsigned NumElts = VectorizableTree[0]->Scalars.size();
13995 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
13996 return isLoadCombineCandidateImpl(Root: FirstReduced, NumElts, TTI,
13997 /* MatchOr */ MustMatchOrInst: false);
13998}
13999
14000bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
14001 // Peek through a final sequence of stores and check if all operations are
14002 // likely to be load-combined.
14003 unsigned NumElts = Stores.size();
14004 for (Value *Scalar : Stores) {
14005 Value *X;
14006 if (!match(V: Scalar, P: m_Store(ValueOp: m_Value(V&: X), PointerOp: m_Value())) ||
14007 !isLoadCombineCandidateImpl(Root: X, NumElts, TTI, /* MatchOr */ MustMatchOrInst: true))
14008 return false;
14009 }
14010 return true;
14011}
14012
14013bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
14014 if (!DebugCounter::shouldExecute(CounterName: VectorizedGraphs))
14015 return true;
14016
14017 // Graph is empty - do nothing.
14018 if (VectorizableTree.empty()) {
14019 assert(ExternalUses.empty() && "We shouldn't have any external users");
14020
14021 return true;
14022 }
14023
14024 // No need to vectorize inserts of gathered values.
14025 if (VectorizableTree.size() == 2 &&
14026 isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
14027 VectorizableTree[1]->isGather() &&
14028 (VectorizableTree[1]->getVectorFactor() <= 2 ||
14029 !(isSplat(VL: VectorizableTree[1]->Scalars) ||
14030 allConstant(VL: VectorizableTree[1]->Scalars))))
14031 return true;
14032
// If the graph includes only PHI nodes and gathers, it is definitely not
// profitable for vectorization and we can skip it, provided the cost
// threshold is the default. The cost of vectorized PHI nodes is almost always
// 0 plus the cost of gathers/buildvectors.
14037 constexpr int Limit = 4;
14038 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
14039 !VectorizableTree.empty() &&
14040 all_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
14041 return (TE->isGather() &&
14042 (!TE->hasState() ||
14043 TE->getOpcode() != Instruction::ExtractElement) &&
14044 count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <= Limit) ||
14045 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
14046 }))
14047 return true;
14048
14049 // Do not vectorize small tree of phis only, if all vector phis are also
14050 // gathered.
14051 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
14052 VectorizableTree.size() <= Limit &&
14053 all_of(Range: VectorizableTree,
14054 P: [&](const std::unique_ptr<TreeEntry> &TE) {
14055 return (TE->isGather() &&
14056 (!TE->hasState() ||
14057 TE->getOpcode() != Instruction::ExtractElement) &&
14058 count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <=
14059 Limit) ||
14060 (TE->hasState() &&
14061 (TE->getOpcode() == Instruction::InsertElement ||
14062 (TE->getOpcode() == Instruction::PHI &&
14063 all_of(Range&: TE->Scalars, P: [&](Value *V) {
14064 return isa<PoisonValue>(Val: V) || MustGather.contains(Ptr: V);
14065 }))));
14066 }) &&
14067 any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
14068 return TE->State == TreeEntry::Vectorize &&
14069 TE->getOpcode() == Instruction::PHI;
14070 }))
14071 return true;
14072
14073 // If the tree contains only phis, buildvectors, split nodes and
14074 // small nodes with reuses, we can skip it.
14075 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
14076 all_of(Range: VectorizableTree, P: [](const std::unique_ptr<TreeEntry> &TE) {
14077 return TE->State == TreeEntry::SplitVectorize ||
14078 (TE->isGather() &&
14079 none_of(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>)) ||
14080 (TE->hasState() && (TE->getOpcode() == Instruction::PHI ||
14081 (!TE->ReuseShuffleIndices.empty() &&
14082 TE->Scalars.size() == 2)));
14083 }))
14084 return true;
14085
14086 // We can vectorize the tree if its size is greater than or equal to the
14087 // minimum size specified by the MinTreeSize command line option.
14088 if (VectorizableTree.size() >= MinTreeSize)
14089 return false;
14090
14091 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
14092 // can vectorize it if we can prove it fully vectorizable.
14093 if (isFullyVectorizableTinyTree(ForReduction))
14094 return false;
14095
14096 // Check if any of the gather node forms an insertelement buildvector
14097 // somewhere.
14098 bool IsAllowedSingleBVNode =
14099 VectorizableTree.size() > 1 ||
14100 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
14101 !VectorizableTree.front()->isAltShuffle() &&
14102 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
14103 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
14104 allSameBlock(VL: VectorizableTree.front()->Scalars));
14105 if (any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
14106 return TE->isGather() && all_of(Range&: TE->Scalars, P: [&](Value *V) {
14107 return isa<ExtractElementInst, Constant>(Val: V) ||
14108 (IsAllowedSingleBVNode &&
14109 !V->hasNUsesOrMore(N: UsesLimit) &&
14110 any_of(Range: V->users(), P: IsaPred<InsertElementInst>));
14111 });
14112 }))
14113 return false;
14114
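// If the last node is an alternate-opcode gather whose buildvector (element
// insertion) overhead alone already exceeds the cost threshold, consider the
// tree worth vectorizing.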
14115 if (VectorizableTree.back()->isGather() &&
14116 VectorizableTree.back()->hasState() &&
14117 VectorizableTree.back()->isAltShuffle() &&
14118 VectorizableTree.back()->getVectorFactor() > 2 &&
14119 allSameBlock(VL: VectorizableTree.back()->Scalars) &&
14120 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
14121 TTI->getScalarizationOverhead(
14122 Ty: getWidenedType(ScalarTy: VectorizableTree.back()->Scalars.front()->getType(),
14123 VF: VectorizableTree.back()->getVectorFactor()),
14124 DemandedElts: APInt::getAllOnes(numBits: VectorizableTree.back()->getVectorFactor()),
14125 /*Insert=*/true, /*Extract=*/false,
14126 CostKind: TTI::TCK_RecipThroughput) > -SLPCostThreshold)
14127 return false;
14128
14129 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
14130 // vectorizable.
14131 return true;
14132}
14133
14134bool BoUpSLP::isTreeNotExtendable() const {
14135 if (getCanonicalGraphSize() != getTreeSize()) {
14136 constexpr unsigned SmallTree = 3;
14137 if (VectorizableTree.front()->isNonPowOf2Vec() &&
14138 getCanonicalGraphSize() <= SmallTree &&
14139 count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()),
14140 P: [](const std::unique_ptr<TreeEntry> &TE) {
14141 return TE->isGather() && TE->hasState() &&
14142 TE->getOpcode() == Instruction::Load &&
14143 !allSameBlock(VL: TE->Scalars);
14144 }) == 1)
14145 return true;
14146 return false;
14147 }
14148 bool Res = false;
14149 for (unsigned Idx : seq<unsigned>(Size: getTreeSize())) {
14150 TreeEntry &E = *VectorizableTree[Idx];
14151 if (E.State == TreeEntry::SplitVectorize)
14152 return false;
14153 if (!E.isGather())
14154 continue;
14155 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
14156 (!E.hasState() &&
14157 all_of(Range&: E.Scalars, P: IsaPred<ExtractElementInst, LoadInst>)) ||
14158 (isa<ExtractElementInst>(Val: E.Scalars.front()) &&
14159 getSameOpcode(VL: ArrayRef(E.Scalars).drop_front(), TLI: *TLI).valid()))
14160 return false;
14161 if (isSplat(VL: E.Scalars) || allConstant(VL: E.Scalars))
14162 continue;
14163 Res = true;
14164 }
14165 return Res;
14166}
14167
14168InstructionCost BoUpSLP::getSpillCost() {
14169 // Walk from the bottom of the tree to the top, tracking which values are
14170 // live. When we see a call instruction that is not part of our tree,
14171 // query TTI to see if there is a cost to keeping values live over it
14172 // (for example, if spills and fills are required).
14173
14174 const TreeEntry *Root = VectorizableTree.front().get();
14175 if (Root->isGather())
14176 return 0;
14177
14178 InstructionCost Cost = 0;
14179 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
14180 EntriesToOperands;
14181 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
14182 SmallPtrSet<const Instruction *, 8> LastInstructions;
14183 for (const auto &TEPtr : VectorizableTree) {
14184 if (!TEPtr->isGather()) {
14185 Instruction *LastInst = &getLastInstructionInBundle(E: TEPtr.get());
14186 EntriesToLastInstruction.try_emplace(Key: TEPtr.get(), Args&: LastInst);
14187 LastInstructions.insert(Ptr: LastInst);
14188 }
14189 if (TEPtr->UserTreeIndex)
14190 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(Elt: TEPtr.get());
14191 }
14192
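// Returns true for intrinsics that are not expected to be lowered to an
// actual call (assume-like intrinsics, or intrinsics cheaper than a call)
// and therefore should not trigger spill costs.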
14193 auto NoCallIntrinsic = [this](const Instruction *I) {
14194 const auto *II = dyn_cast<IntrinsicInst>(Val: I);
14195 if (!II)
14196 return false;
14197 if (II->isAssumeLikeIntrinsic())
14198 return true;
14199 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
14200 InstructionCost IntrCost =
14201 TTI->getIntrinsicInstrCost(ICA, CostKind: TTI::TCK_RecipThroughput);
14202 InstructionCost CallCost = TTI->getCallInstrCost(
14203 F: nullptr, RetTy: II->getType(), Tys: ICA.getArgTypes(), CostKind: TTI::TCK_RecipThroughput);
14204 return IntrCost < CallCost;
14205 };
14206
// Maps the last instruction in an entry to the last instruction of one of
// its operand entries and a flag. If the flag is true, there are no calls in
// between these instructions.
14210 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
14211 CheckedInstructions;
14212 unsigned Budget = 0;
14213 const unsigned BudgetLimit =
14214 ScheduleRegionSizeBudget / VectorizableTree.size();
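// Scans the instructions between First and Last (which must be in the same
// block) and returns true if no non-vectorized, spill-inducing call is found
// and the scan stays within the budget; results are memoized in
// CheckedInstructions.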
14215 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
14216 const Instruction *Last) {
14217 assert(First->getParent() == Last->getParent() &&
14218 "Expected instructions in same block.");
14219 if (auto It = CheckedInstructions.find(Val: Last);
14220 It != CheckedInstructions.end()) {
14221 const Instruction *Checked = It->second.getPointer();
14222 if (Checked == First || Checked->comesBefore(Other: First))
14223 return It->second.getInt() != 0;
14224 Last = Checked;
14225 } else if (Last == First || Last->comesBefore(Other: First)) {
14226 return true;
14227 }
14228 BasicBlock::const_reverse_iterator InstIt =
14229 ++First->getIterator().getReverse(),
14230 PrevInstIt =
14231 Last->getIterator().getReverse();
14232 SmallVector<const Instruction *> LastInstsInRange;
14233 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
14234 // Debug information does not impact spill cost.
14235 // Vectorized calls, represented as vector intrinsics, do not impact spill
14236 // cost.
14237 if (const auto *CB = dyn_cast<CallBase>(Val: &*PrevInstIt);
14238 CB && !NoCallIntrinsic(CB) && !isVectorized(V: CB)) {
14239 for (const Instruction *LastInst : LastInstsInRange)
14240 CheckedInstructions.try_emplace(Key: LastInst, Args: &*PrevInstIt, Args: 0);
14241 return false;
14242 }
14243 if (LastInstructions.contains(Ptr: &*PrevInstIt))
14244 LastInstsInRange.push_back(Elt: &*PrevInstIt);
14245
14246 ++PrevInstIt;
14247 ++Budget;
14248 }
14249 for (const Instruction *LastInst : LastInstsInRange)
14250 CheckedInstructions.try_emplace(
14251 Key: LastInst, Args: PrevInstIt == InstIt ? First : &*PrevInstIt,
14252 Args: Budget <= BudgetLimit ? 1 : 0);
14253 return Budget <= BudgetLimit;
14254 };
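// Adds the cost of keeping the operand entry's vector value live across a
// call (and, for REVEC, subtracts the cost of the now-dead scalar vectors).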
14255 auto AddCosts = [&](const TreeEntry *Op) {
14256 Type *ScalarTy = Op->Scalars.front()->getType();
14257 auto It = MinBWs.find(Val: Op);
14258 if (It != MinBWs.end())
14259 ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
14260 auto *VecTy = getWidenedType(ScalarTy, VF: Op->getVectorFactor());
14261 Cost += TTI->getCostOfKeepingLiveOverCall(Tys: VecTy);
14262 if (ScalarTy->isVectorTy()) {
14263 // Handle revec dead vector instructions.
14264 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(Tys: ScalarTy);
14265 }
14266 };
// Memoize the relationship between blocks, i.e. whether there is (at least
// one) non-vectorized call between them. This allows skipping the analysis
// of the same block paths multiple times.
14270 SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
14271 ParentOpParentToPreds;
14272 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
14273 BasicBlock *OpParent) {
14274 auto Key = std::make_pair(x&: Root, y&: OpParent);
14275 if (auto It = ParentOpParentToPreds.find(Val: Key);
14276 It != ParentOpParentToPreds.end())
14277 return It->second;
14278 SmallVector<BasicBlock *> Worklist;
14279 if (Pred)
14280 Worklist.push_back(Elt: Pred);
14281 else
14282 Worklist.append(in_start: pred_begin(BB: Root), in_end: pred_end(BB: Root));
14283 SmallPtrSet<const BasicBlock *, 16> Visited;
14284 SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
14285 ParentsPairsToAdd;
14286 bool Res = false;
14287 auto Cleanup = make_scope_exit(F: [&]() {
14288 for (const auto &KeyPair : ParentsPairsToAdd) {
14289 assert(!ParentOpParentToPreds.contains(KeyPair) &&
14290 "Should not have been added before.");
14291 ParentOpParentToPreds.try_emplace(Key: KeyPair, Args&: Res);
14292 }
14293 });
14294 while (!Worklist.empty()) {
14295 BasicBlock *BB = Worklist.pop_back_val();
14296 if (BB == OpParent || !Visited.insert(Ptr: BB).second)
14297 continue;
14298 auto Pair = std::make_pair(x&: BB, y&: OpParent);
14299 if (auto It = ParentOpParentToPreds.find(Val: Pair);
14300 It != ParentOpParentToPreds.end()) {
14301 Res = It->second;
14302 return Res;
14303 }
14304 ParentsPairsToAdd.insert(V: Pair);
14305 unsigned BlockSize = BB->size();
14306 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
14307 return Res;
14308 Budget += BlockSize;
14309 if (Budget > BudgetLimit)
14310 return Res;
14311 if (!isa<CatchSwitchInst>(Val: BB->getTerminator()) &&
14312 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
14313 BB->getTerminator()))
14314 return Res;
14315 Worklist.append(in_start: pred_begin(BB), in_end: pred_end(BB));
14316 }
14317 Res = true;
14318 return Res;
14319 };
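// Walk the tree from the root and, for each vectorized entry, check whether
// a non-vectorized call may occur between the entry and each of its
// operands; if so, add the spill cost for keeping that operand's vector
// value live.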
14320 SmallVector<const TreeEntry *> LiveEntries(1, Root);
14321 while (!LiveEntries.empty()) {
14322 const TreeEntry *Entry = LiveEntries.pop_back_val();
14323 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Val: Entry);
14324 if (Operands.empty())
14325 continue;
14326 Instruction *LastInst = EntriesToLastInstruction.at(Val: Entry);
14327 BasicBlock *Parent = LastInst->getParent();
14328 for (const TreeEntry *Op : Operands) {
14329 if (!Op->isGather())
14330 LiveEntries.push_back(Elt: Op);
14331 if (Entry->State == TreeEntry::SplitVectorize ||
14332 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
14333 (Op->isGather() && allConstant(VL: Op->Scalars)))
14334 continue;
14335 Budget = 0;
14336 BasicBlock *Pred = nullptr;
14337 if (auto *Phi = dyn_cast<PHINode>(Val: Entry->getMainOp()))
14338 Pred = Phi->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx);
14339 BasicBlock *OpParent;
14340 Instruction *OpLastInst;
14341 if (Op->isGather()) {
14342 assert(Entry->getOpcode() == Instruction::PHI &&
14343 "Expected phi node only.");
14344 OpParent = cast<PHINode>(Val: Entry->getMainOp())
14345 ->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx);
14346 OpLastInst = OpParent->getTerminator();
14347 for (Value *V : Op->Scalars) {
14348 auto *Inst = dyn_cast<Instruction>(Val: V);
14349 if (!Inst)
14350 continue;
14351 if (isVectorized(V)) {
14352 OpParent = Inst->getParent();
14353 OpLastInst = Inst;
14354 break;
14355 }
14356 }
14357 } else {
14358 OpLastInst = EntriesToLastInstruction.at(Val: Op);
14359 OpParent = OpLastInst->getParent();
14360 }
14361 // Check the call instructions within the same basic blocks.
14362 if (OpParent == Parent) {
14363 if (Entry->getOpcode() == Instruction::PHI) {
14364 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
14365 AddCosts(Op);
14366 continue;
14367 }
14368 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
14369 AddCosts(Op);
14370 continue;
14371 }
14372 // Check for call instruction in between blocks.
14373 // 1. Check entry's block to the head.
14374 if (Entry->getOpcode() != Instruction::PHI &&
14375 !CheckForNonVecCallsInSameBlock(
14376 &*LastInst->getParent()->getFirstNonPHIOrDbgOrAlloca(),
14377 LastInst)) {
14378 AddCosts(Op);
14379 continue;
14380 }
14381 // 2. Check op's block from the end.
14382 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
14383 OpParent->getTerminator())) {
14384 AddCosts(Op);
14385 continue;
14386 }
14387 // 3. Check the predecessors of entry's block till op's block.
14388 if (!CheckPredecessors(Parent, Pred, OpParent)) {
14389 AddCosts(Op);
14390 continue;
14391 }
14392 }
14393 }
14394
14395 return Cost;
14396}
14397
/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
/// the buildvector sequence.
14400static bool isFirstInsertElement(const InsertElementInst *IE1,
14401 const InsertElementInst *IE2) {
14402 if (IE1 == IE2)
14403 return false;
14404 const auto *I1 = IE1;
14405 const auto *I2 = IE2;
14406 const InsertElementInst *PrevI1;
14407 const InsertElementInst *PrevI2;
14408 unsigned Idx1 = *getElementIndex(Inst: IE1);
14409 unsigned Idx2 = *getElementIndex(Inst: IE2);
14410 do {
14411 if (I2 == IE1)
14412 return true;
14413 if (I1 == IE2)
14414 return false;
14415 PrevI1 = I1;
14416 PrevI2 = I2;
14417 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
14418 getElementIndex(Inst: I1).value_or(u&: Idx2) != Idx2)
14419 I1 = dyn_cast<InsertElementInst>(Val: I1->getOperand(i_nocapture: 0));
14420 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
14421 getElementIndex(Inst: I2).value_or(u&: Idx1) != Idx1)
14422 I2 = dyn_cast<InsertElementInst>(Val: I2->getOperand(i_nocapture: 0));
14423 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
14424 llvm_unreachable("Two different buildvectors not expected.");
14425}
14426
14427namespace {
/// Returns the incoming Value * if the requested type is Value * too, or a
/// default value otherwise.
14430struct ValueSelect {
14431 template <typename U>
14432 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
14433 return V;
14434 }
14435 template <typename U>
14436 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
14437 return U();
14438 }
14439};
14440} // namespace
14441
14442/// Does the analysis of the provided shuffle masks and performs the requested
14443/// actions on the vectors with the given shuffle masks. It tries to do it in
14444/// several steps.
/// 1. If the Base vector is not an undef vector, resize the very first mask
/// to have a common VF and perform the action for 2 input vectors (including
/// the non-undef Base). Other shuffle masks are combined with the result of
/// the first stage and processed as a shuffle of 2 elements.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
/// the action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combining the masks properly between the steps.
14454template <typename T>
14455static T *performExtractsShuffleAction(
14456 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
14457 function_ref<unsigned(T *)> GetVF,
14458 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
14459 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
14460 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
14461 SmallVector<int> Mask(ShuffleMask.begin()->second);
14462 auto VMIt = std::next(ShuffleMask.begin());
14463 T *Prev = nullptr;
14464 SmallBitVector UseMask =
14465 buildUseMask(VF: Mask.size(), Mask, MaskArg: UseMask::UndefsAsMask);
14466 SmallBitVector IsBaseUndef = isUndefVector(V: Base, UseMask);
14467 if (!IsBaseUndef.all()) {
14468 // Base is not undef, need to combine it with the next subvectors.
14469 std::pair<T *, bool> Res =
14470 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
14471 SmallBitVector IsBasePoison = isUndefVector<true>(V: Base, UseMask);
14472 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
14473 if (Mask[Idx] == PoisonMaskElem)
14474 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
14475 else
14476 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
14477 }
14478 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
14479 assert((!V || GetVF(V) == Mask.size()) &&
14480 "Expected base vector of VF number of elements.");
14481 Prev = Action(Mask, {nullptr, Res.first});
14482 } else if (ShuffleMask.size() == 1) {
// Base is undef and only 1 vector is shuffled - perform the action only for
// a single vector, if the mask is not the identity mask.
14485 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
14486 /*ForSingleMask=*/true);
14487 if (Res.second)
14488 // Identity mask is found.
14489 Prev = Res.first;
14490 else
14491 Prev = Action(Mask, {ShuffleMask.begin()->first});
14492 } else {
14493 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
14494 // shuffles step by step, combining shuffle between the steps.
14495 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
14496 unsigned Vec2VF = GetVF(VMIt->first);
14497 if (Vec1VF == Vec2VF) {
// No need to resize the input vectors since they are of the same size; we
// can shuffle them directly.
14500 ArrayRef<int> SecMask = VMIt->second;
14501 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
14502 if (SecMask[I] != PoisonMaskElem) {
14503 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
14504 Mask[I] = SecMask[I] + Vec1VF;
14505 }
14506 }
14507 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
14508 } else {
14509 // Vectors of different sizes - resize and reshuffle.
14510 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
14511 /*ForSingleMask=*/false);
14512 std::pair<T *, bool> Res2 =
14513 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
14514 ArrayRef<int> SecMask = VMIt->second;
14515 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
14516 if (Mask[I] != PoisonMaskElem) {
14517 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
14518 if (Res1.second)
14519 Mask[I] = I;
14520 } else if (SecMask[I] != PoisonMaskElem) {
14521 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
14522 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
14523 }
14524 }
14525 Prev = Action(Mask, {Res1.first, Res2.first});
14526 }
14527 VMIt = std::next(VMIt);
14528 }
14529 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
14530 // Perform requested actions for the remaining masks/vectors.
14531 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
14532 // Shuffle other input vectors, if any.
14533 std::pair<T *, bool> Res =
14534 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
14535 ArrayRef<int> SecMask = VMIt->second;
14536 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
14537 if (SecMask[I] != PoisonMaskElem) {
14538 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
14539 "Multiple uses of scalars.");
14540 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
14541 } else if (Mask[I] != PoisonMaskElem) {
14542 Mask[I] = I;
14543 }
14544 }
14545 Prev = Action(Mask, {Prev, Res.first});
14546 }
14547 return Prev;
14548}
14549
14550namespace {
14551/// Data type for handling buildvector sequences with the reused scalars from
14552/// other tree entries.
14553template <typename T> struct ShuffledInsertData {
14554 /// List of insertelements to be replaced by shuffles.
14555 SmallVector<InsertElementInst *> InsertElements;
14556 /// The parent vectors and shuffle mask for the given list of inserts.
14557 MapVector<T, SmallVector<int>> ValueMasks;
14558};
14559} // namespace
14560
14561InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
14562 InstructionCost ReductionCost) {
14563 InstructionCost Cost = ReductionCost;
14564 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
14565 << VectorizableTree.size() << ".\n");
14566
14567 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
14568
14569 SmallPtrSet<Value *, 4> CheckedExtracts;
14570 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
14571 TreeEntry &TE = *VectorizableTree[I];
// No need to count the cost for combined entries; they are combined, so
// just skip their cost.
14574 if (TE.State == TreeEntry::CombinedVectorize) {
14575 LLVM_DEBUG(
14576 dbgs() << "SLP: Skipping cost for combined node that starts with "
14577 << *TE.Scalars[0] << ".\n";
14578 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
14579 continue;
14580 }
14581 if (TE.hasState() &&
14582 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
14583 if (const TreeEntry *E =
14584 getSameValuesTreeEntry(V: TE.getMainOp(), VL: TE.Scalars);
14585 E && E->getVectorFactor() == TE.getVectorFactor()) {
14586 // Some gather nodes might be absolutely the same as some vectorizable
14587 // nodes after reordering, need to handle it.
14588 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
14589 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
14590 << "SLP: Current total cost = " << Cost << "\n");
14591 continue;
14592 }
14593 }
14594
// Exclude the cost of gather load nodes which are not used. These nodes
// were built as part of the final attempt to vectorize gathered loads.
14597 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
14598 "Expected gather nodes with users only.");
14599
14600 InstructionCost C = getEntryCost(E: &TE, VectorizedVals, CheckedExtracts);
14601 Cost += C;
14602 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
14603 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
14604 << "SLP: Current total cost = " << Cost << "\n");
14605 }
14606
14607 if (Cost >= -SLPCostThreshold &&
14608 none_of(Range&: ExternalUses, P: [](const ExternalUser &EU) {
14609 return isa_and_nonnull<InsertElementInst>(Val: EU.User);
14610 }))
14611 return Cost;
14612
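// Estimate the cost of extracting the externally used scalars from the
// vectorized tree, folding insertelement users into final shuffles where
// possible.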
14613 SmallPtrSet<Value *, 16> ExtractCostCalculated;
14614 InstructionCost ExtractCost = 0;
14615 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
14616 SmallVector<APInt> DemandedElts;
14617 SmallDenseSet<Value *, 4> UsedInserts;
14618 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
14619 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
14620 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
14621 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
// Keep track of each {Scalar, Index, User} tuple.
// On AArch64, this helps fuse the mov instruction associated with an
// extractelement with an fmul in the backend, so that the extractelement is
// free.
14625 SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
14626 for (ExternalUser &EU : ExternalUses) {
14627 ScalarUserAndIdx.emplace_back(Args&: EU.Scalar, Args&: EU.User, Args&: EU.Lane);
14628 }
14629 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
14630 for (ExternalUser &EU : ExternalUses) {
14631 // Uses by ephemeral values are free (because the ephemeral value will be
14632 // removed prior to code generation, and so the extraction will be
14633 // removed as well).
14634 if (EphValues.count(Ptr: EU.User))
14635 continue;
14636
// Check if the scalar for the given user, or for all users, is already
// accounted for.
14638 if (!CheckedScalarUser.insert(V: std::make_pair(x&: EU.Scalar, y&: EU.User)).second ||
14639 (EU.User &&
14640 CheckedScalarUser.contains(V: std::make_pair(x&: EU.Scalar, y: nullptr))))
14641 continue;
14642
// Skip users in unreachable blocks or in EH pads (rarely executed), or in
// blocks terminated with an unreachable instruction.
14645 if (BasicBlock *UserParent =
14646 EU.User ? cast<Instruction>(Val: EU.User)->getParent() : nullptr;
14647 UserParent &&
14648 (!DT->isReachableFromEntry(A: UserParent) || UserParent->isEHPad() ||
14649 isa_and_present<UnreachableInst>(Val: UserParent->getTerminator())))
14650 continue;
14651
14652 // We only add extract cost once for the same scalar.
14653 if (!isa_and_nonnull<InsertElementInst>(Val: EU.User) &&
14654 !ExtractCostCalculated.insert(Ptr: EU.Scalar).second)
14655 continue;
14656
// No extract cost for a vector "scalar" if REVEC is disabled.
14658 if (!SLPReVec && isa<FixedVectorType>(Val: EU.Scalar->getType()))
14659 continue;
14660
// If the found user is an insertelement, do not calculate the extract cost
// but try to detect it as a final shuffled/identity match.
// TODO: what if a user is an insertvalue when REVEC is enabled?
14664 if (auto *VU = dyn_cast_or_null<InsertElementInst>(Val: EU.User);
14665 VU && VU->getOperand(i_nocapture: 1) == EU.Scalar) {
14666 if (auto *FTy = dyn_cast<FixedVectorType>(Val: VU->getType())) {
14667 if (!UsedInserts.insert(V: VU).second)
14668 continue;
14669 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
14670 if (InsertIdx) {
14671 const TreeEntry *ScalarTE = &EU.E;
14672 auto *It = find_if(
14673 Range&: ShuffledInserts,
14674 P: [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
14675 // Checks if 2 insertelements are from the same buildvector.
14676 InsertElementInst *VecInsert = Data.InsertElements.front();
14677 return areTwoInsertFromSameBuildVector(
14678 VU, V: VecInsert, GetBaseOperand: [this](InsertElementInst *II) -> Value * {
14679 Value *Op0 = II->getOperand(i_nocapture: 0);
14680 if (isVectorized(V: II) && !isVectorized(V: Op0))
14681 return nullptr;
14682 return Op0;
14683 });
14684 });
14685 int VecId = -1;
14686 if (It == ShuffledInserts.end()) {
14687 auto &Data = ShuffledInserts.emplace_back();
14688 Data.InsertElements.emplace_back(Args&: VU);
14689 DemandedElts.push_back(Elt: APInt::getZero(numBits: FTy->getNumElements()));
14690 VecId = ShuffledInserts.size() - 1;
14691 auto It = MinBWs.find(Val: ScalarTE);
14692 if (It != MinBWs.end() &&
14693 VectorCasts
14694 .insert(V: std::make_pair(x&: ScalarTE, y: FTy->getElementType()))
14695 .second) {
14696 unsigned BWSz = It->second.first;
14697 unsigned DstBWSz = DL->getTypeSizeInBits(Ty: FTy->getElementType());
14698 unsigned VecOpcode;
14699 if (DstBWSz < BWSz)
14700 VecOpcode = Instruction::Trunc;
14701 else
14702 VecOpcode =
14703 It->second.second ? Instruction::SExt : Instruction::ZExt;
14704 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14705 InstructionCost C = TTI->getCastInstrCost(
14706 Opcode: VecOpcode, Dst: FTy,
14707 Src: getWidenedType(ScalarTy: IntegerType::get(C&: FTy->getContext(), NumBits: BWSz),
14708 VF: FTy->getNumElements()),
14709 CCH: TTI::CastContextHint::None, CostKind);
14710 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
14711 << " for extending externally used vector with "
14712 "non-equal minimum bitwidth.\n");
14713 Cost += C;
14714 }
14715 } else {
14716 if (isFirstInsertElement(IE1: VU, IE2: It->InsertElements.front()))
14717 It->InsertElements.front() = VU;
14718 VecId = std::distance(first: ShuffledInserts.begin(), last: It);
14719 }
14720 int InIdx = *InsertIdx;
14721 SmallVectorImpl<int> &Mask =
14722 ShuffledInserts[VecId].ValueMasks[ScalarTE];
14723 if (Mask.empty())
14724 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
14725 Mask[InIdx] = EU.Lane;
14726 DemandedElts[VecId].setBit(InIdx);
14727 continue;
14728 }
14729 }
14730 }
14731
14732 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14733 // If we plan to rewrite the tree in a smaller type, we will need to sign
14734 // extend the extracted value back to the original type. Here, we account
14735 // for the extract and the added cost of the sign extend if needed.
14736 InstructionCost ExtraCost = TTI::TCC_Free;
14737 auto *ScalarTy = EU.Scalar->getType();
14738 auto *VecTy = getWidenedType(ScalarTy, VF: BundleWidth);
14739 const TreeEntry *Entry = &EU.E;
14740 auto It = MinBWs.find(Val: Entry);
14741 if (It != MinBWs.end()) {
14742 Type *MinTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
14743 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy))
14744 MinTy = getWidenedType(ScalarTy: MinTy, VF: VecTy->getNumElements());
14745 unsigned Extend = isKnownNonNegative(V: EU.Scalar, SQ: SimplifyQuery(*DL))
14746 ? Instruction::ZExt
14747 : Instruction::SExt;
14748 VecTy = getWidenedType(ScalarTy: MinTy, VF: BundleWidth);
14749 ExtraCost =
14750 getExtractWithExtendCost(TTI: *TTI, Opcode: Extend, Dst: ScalarTy, VecTy, Index: EU.Lane);
14751 } else {
14752 ExtraCost =
14753 getVectorInstrCost(TTI: *TTI, ScalarTy, Opcode: Instruction::ExtractElement, Val: VecTy,
14754 CostKind, Index: EU.Lane, Scalar: EU.Scalar, ScalarUserAndIdx);
14755 }
14756 // Leave the scalar instructions as is if they are cheaper than extracts.
14757 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
14758 Entry->getOpcode() == Instruction::Load) {
// Checks if the user of the external scalar is a phi in a loop body.
14760 auto IsPhiInLoop = [&](const ExternalUser &U) {
14761 if (auto *Phi = dyn_cast_if_present<PHINode>(Val: U.User)) {
14762 auto *I = cast<Instruction>(Val: U.Scalar);
14763 const Loop *L = LI->getLoopFor(BB: Phi->getParent());
14764 return L && (Phi->getParent() == I->getParent() ||
14765 L == LI->getLoopFor(BB: I->getParent()));
14766 }
14767 return false;
14768 };
14769 if (!ValueToExtUses) {
14770 ValueToExtUses.emplace();
14771 for (const auto &P : enumerate(First&: ExternalUses)) {
14772 // Ignore phis in loops.
14773 if (IsPhiInLoop(P.value()))
14774 continue;
14775
14776 ValueToExtUses->try_emplace(Key: P.value().Scalar, Args: P.index());
14777 }
14778 }
// Can use the original instruction if no operands are vectorized or they
// are already marked as externally used.
14781 auto *Inst = cast<Instruction>(Val: EU.Scalar);
14782 InstructionCost ScalarCost = TTI->getInstructionCost(U: Inst, CostKind);
14783 auto OperandIsScalar = [&](Value *V) {
14784 if (!isVectorized(V)) {
// Some extractelements might not be vectorized, but instead
// transformed into a shuffle and removed from the function;
// consider that here.
14788 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V))
14789 return !EE->hasOneUse() || !MustGather.contains(Ptr: EE);
14790 return true;
14791 }
14792 return ValueToExtUses->contains(Val: V);
14793 };
14794 bool CanBeUsedAsScalar = all_of(Range: Inst->operands(), P: OperandIsScalar);
14795 bool CanBeUsedAsScalarCast = false;
14796 if (auto *CI = dyn_cast<CastInst>(Val: Inst); CI && !CanBeUsedAsScalar) {
14797 if (auto *Op = dyn_cast<Instruction>(Val: CI->getOperand(i_nocapture: 0));
14798 Op && all_of(Range: Op->operands(), P: OperandIsScalar)) {
14799 InstructionCost OpCost =
14800 (isVectorized(V: Op) && !ValueToExtUses->contains(Val: Op))
14801 ? TTI->getInstructionCost(U: Op, CostKind)
14802 : 0;
14803 if (ScalarCost + OpCost <= ExtraCost) {
14804 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
14805 ScalarCost += OpCost;
14806 }
14807 }
14808 }
14809 if (CanBeUsedAsScalar) {
14810 bool KeepScalar = ScalarCost <= ExtraCost;
// Try to keep the original scalar if the user is a phi node from the same
// block as the currently vectorized root phis. This keeps better ordering
// info for the PHIs being vectorized.
14814 bool IsProfitablePHIUser =
14815 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
14816 VectorizableTree.front()->Scalars.size() > 2)) &&
14817 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
14818 !Inst->hasNUsesOrMore(N: UsesLimit) &&
14819 none_of(Range: Inst->users(),
14820 P: [&](User *U) {
14821 auto *PHIUser = dyn_cast<PHINode>(Val: U);
14822 return (!PHIUser ||
14823 PHIUser->getParent() !=
14824 cast<Instruction>(
14825 Val: VectorizableTree.front()->getMainOp())
14826 ->getParent()) &&
14827 !isVectorized(V: U);
14828 }) &&
14829 count_if(Range: Entry->Scalars, P: [&](Value *V) {
14830 return ValueToExtUses->contains(Val: V);
14831 }) <= 2;
14832 if (IsProfitablePHIUser) {
14833 KeepScalar = true;
14834 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
14835 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
14836 (!GatheredLoadsEntriesFirst.has_value() ||
14837 Entry->Idx < *GatheredLoadsEntriesFirst)) {
14838 unsigned ScalarUsesCount = count_if(Range: Entry->Scalars, P: [&](Value *V) {
14839 return ValueToExtUses->contains(Val: V);
14840 });
14841 auto It = ExtractsCount.find(Val: Entry);
14842 if (It != ExtractsCount.end()) {
14843 assert(ScalarUsesCount >= It->getSecond().size() &&
14844 "Expected total number of external uses not less than "
14845 "number of scalar uses.");
14846 ScalarUsesCount -= It->getSecond().size();
14847 }
// Keep the original scalar if the number of externally used instructions
// in the same entry is not a power of 2. It may help to do some extra
// vectorization for now.
14851 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(Value: ScalarUsesCount);
14852 }
14853 if (KeepScalar) {
14854 ExternalUsesAsOriginalScalar.insert(Ptr: EU.Scalar);
14855 for (Value *V : Inst->operands()) {
14856 auto It = ValueToExtUses->find(Val: V);
14857 if (It != ValueToExtUses->end()) {
14858 // Replace all uses to avoid compiler crash.
14859 ExternalUses[It->second].User = nullptr;
14860 }
14861 }
14862 ExtraCost = ScalarCost;
14863 if (!IsPhiInLoop(EU))
14864 ExtractsCount[Entry].insert(V: Inst);
14865 if (CanBeUsedAsScalarCast) {
14866 ScalarOpsFromCasts.insert(Ptr: Inst->getOperand(i: 0));
14867 // Update the users of the operands of the cast operand to avoid
14868 // compiler crash.
14869 if (auto *IOp = dyn_cast<Instruction>(Val: Inst->getOperand(i: 0))) {
14870 for (Value *V : IOp->operands()) {
14871 auto It = ValueToExtUses->find(Val: V);
14872 if (It != ValueToExtUses->end()) {
14873 // Replace all uses to avoid compiler crash.
14874 ExternalUses[It->second].User = nullptr;
14875 }
14876 }
14877 }
14878 }
14879 }
14880 }
14881 }
14882
14883 ExtractCost += ExtraCost;
14884 }
// Insert externals for the extracts of operands of casts to be emitted as
// scalars instead of extractelement.
14887 for (Value *V : ScalarOpsFromCasts) {
14888 ExternalUsesAsOriginalScalar.insert(Ptr: V);
14889 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
14890 ExternalUses.emplace_back(Args&: V, Args: nullptr, Args&: *TEs.front(),
14891 Args: TEs.front()->findLaneForValue(V));
14892 }
14893 }
14894 // Add reduced value cost, if resized.
14895 if (!VectorizedVals.empty()) {
14896 const TreeEntry &Root = *VectorizableTree.front();
14897 auto BWIt = MinBWs.find(Val: &Root);
14898 if (BWIt != MinBWs.end()) {
14899 Type *DstTy = Root.Scalars.front()->getType();
14900 unsigned OriginalSz = DL->getTypeSizeInBits(Ty: DstTy->getScalarType());
14901 unsigned SrcSz =
14902 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
14903 if (OriginalSz != SrcSz) {
14904 unsigned Opcode = Instruction::Trunc;
14905 if (OriginalSz > SrcSz)
14906 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
14907 Type *SrcTy = IntegerType::get(C&: DstTy->getContext(), NumBits: SrcSz);
14908 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DstTy)) {
14909 assert(SLPReVec && "Only supported by REVEC.");
14910 SrcTy = getWidenedType(ScalarTy: SrcTy, VF: VecTy->getNumElements());
14911 }
14912 Cost += TTI->getCastInstrCost(Opcode, Dst: DstTy, Src: SrcTy,
14913 CCH: TTI::CastContextHint::None,
14914 CostKind: TTI::TCK_RecipThroughput);
14915 }
14916 }
14917 }
14918
14919 Cost += ExtractCost;
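// ResizeToVF estimates the cost of resizing/reshuffling a tree entry's
// vector to the VF required by the insertelement users (shaped like the
// ResizeAction callback of performExtractsShuffleAction above).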
14920 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
14921 bool ForSingleMask) {
14922 InstructionCost C = 0;
14923 unsigned VF = Mask.size();
14924 unsigned VecVF = TE->getVectorFactor();
14925 bool HasLargeIndex =
14926 any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); });
14927 if ((VF != VecVF && HasLargeIndex) ||
14928 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) {
14929
14930 if (HasLargeIndex) {
14931 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
14932 std::copy(first: Mask.begin(), last: std::next(x: Mask.begin(), n: std::min(a: VF, b: VecVF)),
14933 result: OrigMask.begin());
14934 C = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
14935 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF),
14936 Mask: OrigMask);
14937 LLVM_DEBUG(
14938 dbgs() << "SLP: Adding cost " << C
14939 << " for final shuffle of insertelement external users.\n";
14940 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
14941 Cost += C;
14942 return std::make_pair(x&: TE, y: true);
14943 }
14944
14945 if (!ForSingleMask) {
14946 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14947 for (unsigned I = 0; I < VF; ++I) {
14948 if (Mask[I] != PoisonMaskElem)
14949 ResizeMask[Mask[I]] = Mask[I];
14950 }
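        // Illustrative example: for Mask = {1, 0, PoisonMaskElem, 3} and
        // VF = 4, ResizeMask becomes {0, 1, PoisonMaskElem, 3}, which is an
        // identity mask (poison lanes ignored), so no extra shuffle cost is
        // added.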
14951 if (!ShuffleVectorInst::isIdentityMask(Mask: ResizeMask, NumSrcElts: VF))
14952 C = ::getShuffleCost(
14953 TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
14954 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF), Mask: ResizeMask);
14955 LLVM_DEBUG(
14956 dbgs() << "SLP: Adding cost " << C
14957 << " for final shuffle of insertelement external users.\n";
14958 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
14959
14960 Cost += C;
14961 }
14962 }
14963 return std::make_pair(x&: TE, y: false);
14964 };
14965 // Calculate the cost of the reshuffled vectors, if any.
14966 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
14967 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(i_nocapture: 0);
14968 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
14969 unsigned VF = 0;
14970 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
14971 ArrayRef<const TreeEntry *> TEs) {
14972 assert((TEs.size() == 1 || TEs.size() == 2) &&
14973 "Expected exactly 1 or 2 tree entries.");
14974 if (TEs.size() == 1) {
14975 if (VF == 0)
14976 VF = TEs.front()->getVectorFactor();
14977 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
14978 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF) &&
14979 !all_of(Range: enumerate(First&: Mask), P: [=](const auto &Data) {
14980 return Data.value() == PoisonMaskElem ||
14981 (Data.index() < VF &&
14982 static_cast<int>(Data.index()) == Data.value());
14983 })) {
14984 InstructionCost C =
14985 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FTy, Mask);
14986 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
14987 << " for final shuffle of insertelement "
14988 "external users.\n";
14989 TEs.front()->dump();
14990 dbgs() << "SLP: Current total cost = " << Cost << "\n");
14991 Cost += C;
14992 }
14993 } else {
14994 if (VF == 0) {
14995 if (TEs.front() &&
14996 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
14997 VF = TEs.front()->getVectorFactor();
14998 else
14999 VF = Mask.size();
15000 }
15001 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
15002 InstructionCost C =
15003 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: FTy, Mask);
15004 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
15005 << " for final shuffle of vector node and external "
15006 "insertelement users.\n";
15007 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
15008 dbgs() << "SLP: Current total cost = " << Cost << "\n");
15009 Cost += C;
15010 }
15011 VF = Mask.size();
15012 return TEs.back();
15013 };
15014 (void)performExtractsShuffleAction<const TreeEntry>(
15015 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()), Base,
15016 GetVF: [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeAction: ResizeToVF,
15017 Action: EstimateShufflesCost);
15018 InstructionCost InsertCost = TTI->getScalarizationOverhead(
15019 Ty: cast<FixedVectorType>(
15020 Val: ShuffledInserts[I].InsertElements.front()->getType()),
15021 DemandedElts: DemandedElts[I],
15022 /*Insert*/ true, /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
15023 Cost -= InsertCost;
15024 }
15025
15026 // Add the cost for reduced value resize (if required).
15027 if (ReductionBitWidth != 0) {
15028 assert(UserIgnoreList && "Expected reduction tree.");
15029 const TreeEntry &E = *VectorizableTree.front();
15030 auto It = MinBWs.find(Val: &E);
15031 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
15032 unsigned SrcSize = It->second.first;
15033 unsigned DstSize = ReductionBitWidth;
15034 unsigned Opcode = Instruction::Trunc;
15035 if (SrcSize < DstSize) {
15036 bool IsArithmeticExtendedReduction =
15037 all_of(Range: *UserIgnoreList, P: [](Value *V) {
15038 auto *I = cast<Instruction>(Val: V);
15039 return is_contained(Set: {Instruction::Add, Instruction::FAdd,
15040 Instruction::Mul, Instruction::FMul,
15041 Instruction::And, Instruction::Or,
15042 Instruction::Xor},
15043 Element: I->getOpcode());
15044 });
15045 if (IsArithmeticExtendedReduction)
15046 Opcode =
15047 Instruction::BitCast; // Handle it by getExtendedReductionCost
15048 else
15049 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15050 }
15051 if (Opcode != Instruction::BitCast) {
15052 auto *SrcVecTy =
15053 getWidenedType(ScalarTy: Builder.getIntNTy(N: SrcSize), VF: E.getVectorFactor());
15054 auto *DstVecTy =
15055 getWidenedType(ScalarTy: Builder.getIntNTy(N: DstSize), VF: E.getVectorFactor());
15056 TTI::CastContextHint CCH = getCastContextHint(TE: E);
15057 InstructionCost CastCost;
15058 switch (E.getOpcode()) {
15059 case Instruction::SExt:
15060 case Instruction::ZExt:
15061 case Instruction::Trunc: {
15062 const TreeEntry *OpTE = getOperandEntry(E: &E, Idx: 0);
15063 CCH = getCastContextHint(TE: *OpTE);
15064 break;
15065 }
15066 default:
15067 break;
15068 }
15069 CastCost += TTI->getCastInstrCost(Opcode, Dst: DstVecTy, Src: SrcVecTy, CCH,
15070 CostKind: TTI::TCK_RecipThroughput);
15071 Cost += CastCost;
15072 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
15073 << " for final resize for reduction from " << SrcVecTy
15074 << " to " << DstVecTy << "\n";
15075 dbgs() << "SLP: Current total cost = " << Cost << "\n");
15076 }
15077 }
15078 }
15079
15080 std::optional<InstructionCost> SpillCost;
15081 if (Cost < -SLPCostThreshold) {
15082 SpillCost = getSpillCost();
15083 Cost += *SpillCost;
15084 }
15085#ifndef NDEBUG
15086 SmallString<256> Str;
15087 {
15088 raw_svector_ostream OS(Str);
15089 OS << "SLP: Spill Cost = ";
15090 if (SpillCost)
15091 OS << *SpillCost;
15092 else
15093 OS << "<skipped>";
15094 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
15095 << "SLP: Total Cost = " << Cost << ".\n";
15096 }
15097 LLVM_DEBUG(dbgs() << Str);
15098 if (ViewSLPTree)
15099 ViewGraph(this, "SLP" + F->getName(), false, Str);
15100#endif
15101
15102 return Cost;
15103}
15104
/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and gathers such instructions into a bunch, which is
/// highly likely to be detected as a shuffle of one or two input vectors. If
/// the attempt is successful, the matched scalars are replaced by poison
/// values in \p VL for future analysis.
15110std::optional<TTI::ShuffleKind>
15111BoUpSLP::tryToGatherSingleRegisterExtractElements(
15112 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
15113 // Scan list of gathered scalars for extractelements that can be represented
15114 // as shuffles.
15115 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
15116 SmallVector<int> UndefVectorExtracts;
15117 for (int I = 0, E = VL.size(); I < E; ++I) {
15118 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
15119 if (!EI) {
15120 if (isa<UndefValue>(Val: VL[I]))
15121 UndefVectorExtracts.push_back(Elt: I);
15122 continue;
15123 }
15124 auto *VecTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
15125 if (!VecTy || !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()))
15126 continue;
15127 std::optional<unsigned> Idx = getExtractIndex(E: EI);
15128 // Undefined index.
15129 if (!Idx) {
15130 UndefVectorExtracts.push_back(Elt: I);
15131 continue;
15132 }
15133 if (Idx >= VecTy->getNumElements()) {
15134 UndefVectorExtracts.push_back(Elt: I);
15135 continue;
15136 }
15137 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
15138 ExtractMask.reset(Idx: *Idx);
15139 if (isUndefVector(V: EI->getVectorOperand(), UseMask: ExtractMask).all()) {
15140 UndefVectorExtracts.push_back(Elt: I);
15141 continue;
15142 }
15143 VectorOpToIdx[EI->getVectorOperand()].push_back(Elt: I);
15144 }
15145 // Sort the vector operands by the maximum number of uses in extractelements.
15146 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
15147 VectorOpToIdx.takeVector();
15148 stable_sort(Range&: Vectors, C: [](const auto &P1, const auto &P2) {
15149 return P1.second.size() > P2.second.size();
15150 });
15151 // Find the best pair of the vectors or a single vector.
15152 const int UndefSz = UndefVectorExtracts.size();
15153 unsigned SingleMax = 0;
15154 unsigned PairMax = 0;
15155 if (!Vectors.empty()) {
15156 SingleMax = Vectors.front().second.size() + UndefSz;
15157 if (Vectors.size() > 1) {
15158 auto *ItNext = std::next(x: Vectors.begin());
15159 PairMax = SingleMax + ItNext->second.size();
15160 }
15161 }
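  // Illustrative example: for VL = {extractelement %a, 0; extractelement %a,
  // 1; extractelement %b, 0; poison}, SingleMax = 2 + 1 = 3 (two extracts
  // from %a plus one undef lane) and PairMax = 3 + 1 = 4, so the pair
  // {%a, %b} is preferred below.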
15162 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
15163 return std::nullopt;
  // Check whether it is better to perform a shuffle of 2 vectors or just of a
  // single vector.
15166 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
15167 SmallVector<Value *> GatheredExtracts(
15168 VL.size(), PoisonValue::get(T: VL.front()->getType()));
15169 if (SingleMax >= PairMax && SingleMax) {
15170 for (int Idx : Vectors.front().second)
15171 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
15172 } else if (!Vectors.empty()) {
    for (unsigned VecIdx : {0, 1})
      for (int Idx : Vectors[VecIdx].second)
        std::swap(GatheredExtracts[Idx], VL[Idx]);
15176 }
15177 // Add extracts from undefs too.
15178 for (int Idx : UndefVectorExtracts)
15179 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
  // Check that the gather of extractelements can be represented as just a
  // shuffle of one or two of the vectors the scalars are extracted from.
15182 std::optional<TTI::ShuffleKind> Res =
15183 isFixedVectorShuffle(VL: GatheredExtracts, Mask, AC);
15184 if (!Res || all_of(Range&: Mask, P: [](int Idx) { return Idx == PoisonMaskElem; })) {
15185 // TODO: try to check other subsets if possible.
15186 // Restore the original VL if attempt was not successful.
15187 copy(Range&: SavedVL, Out: VL.begin());
15188 return std::nullopt;
15189 }
15190 // Restore unused scalars from mask, if some of the extractelements were not
15191 // selected for shuffle.
15192 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
15193 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(Val: GatheredExtracts[I]) &&
15194 isa<UndefValue>(Val: GatheredExtracts[I])) {
15195 std::swap(a&: VL[I], b&: GatheredExtracts[I]);
15196 continue;
15197 }
15198 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
15199 if (!EI || !isa<FixedVectorType>(Val: EI->getVectorOperandType()) ||
15200 !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()) ||
15201 is_contained(Range&: UndefVectorExtracts, Element: I))
15202 continue;
15203 }
15204 return Res;
15205}
15206
/// Same as tryToGatherSingleRegisterExtractElements, but splits \p VL into
/// \p NumParts slices (one per SLP register) and tries to match each slice as
/// a shuffle of one or two input vectors. The matched scalars are replaced by
/// poison values in \p VL for future analysis.
15212SmallVector<std::optional<TTI::ShuffleKind>>
15213BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
15214 SmallVectorImpl<int> &Mask,
15215 unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
15217 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
15218 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
15219 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
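  // Illustrative example: with VL.size() == 8 and NumParts == 2, each part
  // analyzes its own 4-element slice of VL and records a shuffle kind for that
  // slice (or std::nullopt if no extractelement-based shuffle was found).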
15220 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
15221 // Scan list of gathered scalars for extractelements that can be represented
15222 // as shuffles.
15223 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
15224 N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part));
15225 SmallVector<int> SubMask;
15226 std::optional<TTI::ShuffleKind> Res =
15227 tryToGatherSingleRegisterExtractElements(VL: SubVL, Mask&: SubMask);
15228 ShufflesRes[Part] = Res;
15229 copy(Range&: SubMask, Out: std::next(x: Mask.begin(), n: Part * SliceSize));
15230 }
15231 if (none_of(Range&: ShufflesRes, P: [](const std::optional<TTI::ShuffleKind> &Res) {
15232 return Res.has_value();
15233 }))
15234 ShufflesRes.clear();
15235 return ShufflesRes;
15236}
15237
15238std::optional<TargetTransformInfo::ShuffleKind>
15239BoUpSLP::isGatherShuffledSingleRegisterEntry(
15240 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
15241 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
15242 Entries.clear();
15243 // TODO: currently checking only for Scalars in the tree entry, need to count
15244 // reused elements too for better cost estimation.
15245 auto GetUserEntry = [&](const TreeEntry *TE) {
15246 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
15247 TE = TE->UserTreeIndex.UserTE;
15248 if (TE == VectorizableTree.front().get())
15249 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
15250 return TE->UserTreeIndex;
15251 };
15252 auto HasGatherUser = [&](const TreeEntry *TE) {
15253 while (TE->Idx != 0 && TE->UserTreeIndex) {
15254 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
15255 return true;
15256 TE = TE->UserTreeIndex.UserTE;
15257 }
15258 return false;
15259 };
15260 const EdgeInfo TEUseEI = GetUserEntry(TE);
15261 if (!TEUseEI)
15262 return std::nullopt;
15263 const Instruction *TEInsertPt = &getLastInstructionInBundle(E: TEUseEI.UserTE);
15264 const BasicBlock *TEInsertBlock = nullptr;
15265 // Main node of PHI entries keeps the correct order of operands/incoming
15266 // blocks.
15267 if (auto *PHI = dyn_cast<PHINode>(Val: TEUseEI.UserTE->getMainOp());
15268 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
15269 TEInsertBlock = PHI->getIncomingBlock(i: TEUseEI.EdgeIdx);
15270 TEInsertPt = TEInsertBlock->getTerminator();
15271 } else {
15272 TEInsertBlock = TEInsertPt->getParent();
15273 }
15274 if (!DT->isReachableFromEntry(A: TEInsertBlock))
15275 return std::nullopt;
15276 auto *NodeUI = DT->getNode(BB: TEInsertBlock);
15277 assert(NodeUI && "Should only process reachable instructions");
15278 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
15279 auto CheckOrdering = [&](const Instruction *InsertPt) {
    // Argument InsertPt is an instruction where the vector code for some other
    // tree entry (one that shares one or more scalars with TE) is going to be
    // generated. This lambda returns true if the insertion point of the vector
    // code for TE dominates that point (otherwise the dependency is the other
    // way around). The other node is not limited to being of a gather kind.
    // Gather nodes are not scheduled and their vector code is inserted before
    // their first user. If the user is a PHI, that point is supposed to be at
    // the end of a predecessor block. Otherwise it is the last instruction
    // among the scalars of the user node. So, instead of checking the
    // dependency between the instructions themselves, we check the dependency
    // between their insertion points for vector code (since each scalar
    // instruction ends up as a lane of a vector instruction).
15292 const BasicBlock *InsertBlock = InsertPt->getParent();
15293 auto *NodeEUI = DT->getNode(BB: InsertBlock);
15294 if (!NodeEUI)
15295 return false;
15296 assert((NodeUI == NodeEUI) ==
15297 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
15298 "Different nodes should have different DFS numbers");
15299 // Check the order of the gather nodes users.
15300 if (TEInsertPt->getParent() != InsertBlock &&
15301 (DT->dominates(A: NodeUI, B: NodeEUI) || !DT->dominates(A: NodeEUI, B: NodeUI)))
15302 return false;
15303 if (TEInsertPt->getParent() == InsertBlock &&
15304 TEInsertPt->comesBefore(Other: InsertPt))
15305 return false;
15306 return true;
15307 };
  // Find all tree entries used by the gathered values. If no common entries
  // are found, this is not a shuffle.
  // Here we build a set of tree nodes for each gathered value and try to
  // find the intersection between these sets. If we have at least one common
  // tree node for each gathered value, we have just a permutation of a
  // single vector. If we have 2 different sets, we're in a situation where we
  // have a permutation of 2 input vectors.
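  // Illustrative example: for VL = {a, b, c, d}, if {a, b} belong to tree
  // entry TE1 and {c, d} to tree entry TE2, UsedTEs ends up with two sets
  // ({TE1} and {TE2}) and the gather is modelled as a two-source permutation
  // of the vectors built for TE1 and TE2.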
15315 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
15316 SmallDenseMap<Value *, int> UsedValuesEntry;
15317 SmallPtrSet<const Value *, 16> VisitedValue;
15318 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
15319 // The node is reused - exit.
15320 if ((TEPtr->getVectorFactor() != VL.size() &&
15321 TEPtr->Scalars.size() != VL.size()) ||
15322 (!TEPtr->isSame(VL) && !TEPtr->isSame(VL: TE->Scalars)))
15323 return false;
15324 UsedTEs.clear();
15325 UsedTEs.emplace_back().insert(Ptr: TEPtr);
15326 for (Value *V : VL) {
15327 if (isConstant(V))
15328 continue;
15329 UsedValuesEntry.try_emplace(Key: V, Args: 0);
15330 }
15331 return true;
15332 };
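  // CheckParentNodes: walk up the user chains of the two user nodes; at the
  // first common ancestor, compare the operand (edge) indices through which
  // each side is reached and report whether User1's side hangs off a smaller
  // operand index (i.e. is expected to be emitted first).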
15333 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
15334 unsigned EdgeIdx) {
15335 const TreeEntry *Ptr1 = User1;
15336 const TreeEntry *Ptr2 = User2;
15337 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
15338 while (Ptr2) {
15339 PtrToIdx.try_emplace(Key: Ptr2, Args&: EdgeIdx);
15340 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
15341 Ptr2 = Ptr2->UserTreeIndex.UserTE;
15342 }
15343 while (Ptr1) {
15344 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
15345 Ptr1 = Ptr1->UserTreeIndex.UserTE;
15346 if (auto It = PtrToIdx.find(Val: Ptr1); It != PtrToIdx.end())
15347 return Idx < It->second;
15348 }
15349 return false;
15350 };
15351 for (Value *V : VL) {
15352 if (isConstant(V) || !VisitedValue.insert(Ptr: V).second)
15353 continue;
15354 // Build a list of tree entries where V is used.
15355 SmallPtrSet<const TreeEntry *, 4> VToTEs;
15356 for (const TreeEntry *TEPtr : ValueToGatherNodes.lookup(Val: V)) {
15357 if (TEPtr == TE || TEPtr->Idx == 0)
15358 continue;
15359 assert(any_of(TEPtr->Scalars,
15360 [&](Value *V) { return GatheredScalars.contains(V); }) &&
15361 "Must contain at least single gathered value.");
15362 assert(TEPtr->UserTreeIndex &&
15363 "Expected only single user of a gather node.");
15364 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
15365
15366 PHINode *UserPHI = UseEI.UserTE->State != TreeEntry::SplitVectorize
15367 ? dyn_cast<PHINode>(Val: UseEI.UserTE->getMainOp())
15368 : nullptr;
15369 Instruction *InsertPt =
15370 UserPHI ? UserPHI->getIncomingBlock(i: UseEI.EdgeIdx)->getTerminator()
15371 : &getLastInstructionInBundle(E: UseEI.UserTE);
15372 if (TEInsertPt == InsertPt) {
15373 // Check nodes, which might be emitted first.
15374 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
15375 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
15376 TEUseEI.UserTE->isAltShuffle()) &&
15377 all_of(Range&: TEUseEI.UserTE->Scalars, P: isUsedOutsideBlock)) {
15378 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
15379 (UseEI.UserTE->getOpcode() == Instruction::PHI &&
15380 !UseEI.UserTE->isAltShuffle()) ||
15381 !all_of(Range&: UseEI.UserTE->Scalars, P: isUsedOutsideBlock))
15382 continue;
15383 }
15384
        // If the schedulable insertion point is used in multiple entries,
        // just exit: there is no known ordering at this point, it becomes
        // available only after real scheduling.
15388 if (!doesNotNeedToBeScheduled(V: InsertPt) &&
15389 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
15390 continue;
15391 // If the users are the PHI nodes with the same incoming blocks - skip.
15392 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
15393 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
15394 UseEI.UserTE->State == TreeEntry::Vectorize &&
15395 UseEI.UserTE->getOpcode() == Instruction::PHI &&
15396 TEUseEI.UserTE != UseEI.UserTE)
15397 continue;
15398 // If 2 gathers are operands of the same entry (regardless of whether
15399 // user is PHI or else), compare operands indices, use the earlier one
15400 // as the base.
15401 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
15402 continue;
15403 // If the user instruction is used for some reason in different
15404 // vectorized nodes - make it depend on index.
15405 if (TEUseEI.UserTE != UseEI.UserTE &&
15406 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
15407 HasGatherUser(TEUseEI.UserTE)))
15408 continue;
15409 // If the user node is the operand of the other user node - skip.
15410 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
15411 continue;
15412 }
15413
15414 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
15415 TEUseEI.UserTE->doesNotNeedToSchedule() !=
15416 UseEI.UserTE->doesNotNeedToSchedule() &&
15417 is_contained(Range&: UseEI.UserTE->Scalars, Element: TEInsertPt))
15418 continue;
15419 // Check if the user node of the TE comes after user node of TEPtr,
15420 // otherwise TEPtr depends on TE.
15421 if ((TEInsertBlock != InsertPt->getParent() ||
15422 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
15423 !CheckOrdering(InsertPt))
15424 continue;
15425 // The node is reused - exit.
15426 if (CheckAndUseSameNode(TEPtr))
15427 break;
15428 VToTEs.insert(Ptr: TEPtr);
15429 }
15430 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
15431 const auto *It = find_if(
15432 Range&: VTEs, P: [&](const TreeEntry *MTE) { return MTE != TEUseEI.UserTE; });
15433 if (It != VTEs.end()) {
15434 const TreeEntry *VTE = *It;
15435 if (none_of(Range: TE->CombinedEntriesWithIndices,
15436 P: [&](const auto &P) { return P.first == VTE->Idx; })) {
15437 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
15438 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
15439 continue;
15440 }
15441 // The node is reused - exit.
15442 if (CheckAndUseSameNode(VTE))
15443 break;
15444 VToTEs.insert(Ptr: VTE);
15445 }
15446 }
15447 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
15448 const TreeEntry *VTE = VTEs.front();
15449 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(u: 0) &&
15450 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
15451 VTEs = VTEs.drop_front();
15452 // Iterate through all vectorized nodes.
15453 const auto *MIt = find_if(Range&: VTEs, P: [](const TreeEntry *MTE) {
15454 return MTE->State == TreeEntry::Vectorize;
15455 });
15456 if (MIt == VTEs.end())
15457 continue;
15458 VTE = *MIt;
15459 }
15460 if (none_of(Range: TE->CombinedEntriesWithIndices,
15461 P: [&](const auto &P) { return P.first == VTE->Idx; })) {
15462 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
15463 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
15464 continue;
15465 }
15466 // The node is reused - exit.
15467 if (CheckAndUseSameNode(VTE))
15468 break;
15469 VToTEs.insert(Ptr: VTE);
15470 }
15471 if (VToTEs.empty())
15472 continue;
15473 if (UsedTEs.empty()) {
15474 // The first iteration, just insert the list of nodes to vector.
15475 UsedTEs.push_back(Elt: VToTEs);
15476 UsedValuesEntry.try_emplace(Key: V, Args: 0);
15477 } else {
      // Need to check if there are any previously used tree nodes which use V.
      // If there are no such nodes, consider that we have one more input
      // vector.
15481 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
15482 unsigned Idx = 0;
15483 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
15484 // Do we have a non-empty intersection of previously listed tree entries
15485 // and tree entries using current V?
15486 set_intersect(S1&: VToTEs, S2: Set);
15487 if (!VToTEs.empty()) {
15488 // Yes, write the new subset and continue analysis for the next
15489 // scalar.
15490 Set.swap(RHS&: VToTEs);
15491 break;
15492 }
15493 VToTEs = SavedVToTEs;
15494 ++Idx;
15495 }
15496 // No non-empty intersection found - need to add a second set of possible
15497 // source vectors.
15498 if (Idx == UsedTEs.size()) {
        // If the number of input vectors is greater than 2, this is not a
        // permutation; fall back to the regular gather.
15501 // TODO: support multiple reshuffled nodes.
15502 if (UsedTEs.size() == 2)
15503 continue;
15504 UsedTEs.push_back(Elt: SavedVToTEs);
15505 Idx = UsedTEs.size() - 1;
15506 }
15507 UsedValuesEntry.try_emplace(Key: V, Args&: Idx);
15508 }
15509 }
15510
15511 if (UsedTEs.empty()) {
15512 Entries.clear();
15513 return std::nullopt;
15514 }
15515
15516 unsigned VF = 0;
15517 if (UsedTEs.size() == 1) {
15518 // Keep the order to avoid non-determinism.
15519 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
15520 UsedTEs.front().end());
15521 sort(C&: FirstEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
15522 return TE1->Idx < TE2->Idx;
15523 });
15524 // Try to find the perfect match in another gather node at first.
15525 auto *It = find_if(Range&: FirstEntries, P: [=](const TreeEntry *EntryPtr) {
15526 return EntryPtr->isSame(VL) || EntryPtr->isSame(VL: TE->Scalars);
15527 });
15528 if (It != FirstEntries.end() &&
15529 ((*It)->getVectorFactor() == VL.size() ||
15530 ((*It)->getVectorFactor() == TE->Scalars.size() &&
15531 TE->ReuseShuffleIndices.size() == VL.size() &&
15532 (*It)->isSame(VL: TE->Scalars)))) {
15533 Entries.push_back(Elt: *It);
15534 if ((*It)->getVectorFactor() == VL.size()) {
15535 std::iota(first: std::next(x: Mask.begin(), n: Part * VL.size()),
15536 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: 0);
15537 } else {
15538 SmallVector<int> CommonMask = TE->getCommonMask();
15539 copy(Range&: CommonMask, Out: Mask.begin());
15540 }
15541 // Clear undef scalars.
15542 for (unsigned I : seq<unsigned>(Size: VL.size()))
15543 if (isa<PoisonValue>(Val: VL[I]))
15544 Mask[Part * VL.size() + I] = PoisonMaskElem;
15545 return TargetTransformInfo::SK_PermuteSingleSrc;
15546 }
    // No perfect match, just shuffle, so choose the first entry from the
    // tree.
15549 Entries.push_back(Elt: FirstEntries.front());
15550 // Update mapping between values and corresponding tree entries.
15551 for (auto &P : UsedValuesEntry)
15552 P.second = 0;
15553 VF = FirstEntries.front()->getVectorFactor();
15554 } else {
15555 // Try to find nodes with the same vector factor.
15556 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
15557 // Keep the order of tree nodes to avoid non-determinism.
15558 DenseMap<int, const TreeEntry *> VFToTE;
15559 for (const TreeEntry *TE : UsedTEs.front()) {
15560 unsigned VF = TE->getVectorFactor();
15561 auto It = VFToTE.find(Val: VF);
15562 if (It != VFToTE.end()) {
15563 if (It->second->Idx > TE->Idx)
15564 It->getSecond() = TE;
15565 continue;
15566 }
15567 VFToTE.try_emplace(Key: VF, Args&: TE);
15568 }
15569 // Same, keep the order to avoid non-determinism.
15570 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
15571 UsedTEs.back().end());
15572 sort(C&: SecondEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
15573 return TE1->Idx < TE2->Idx;
15574 });
15575 for (const TreeEntry *TE : SecondEntries) {
15576 auto It = VFToTE.find(Val: TE->getVectorFactor());
15577 if (It != VFToTE.end()) {
15578 VF = It->first;
15579 Entries.push_back(Elt: It->second);
15580 Entries.push_back(Elt: TE);
15581 break;
15582 }
15583 }
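    // Illustrative example: if the first set contains entries with vector
    // factors {4, 8} and the second set contains an entry with vector factor
    // 8, the two 8-wide entries are paired, so a single two-source shuffle of
    // equally sized vectors can be used.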
15584 // No 2 source vectors with the same vector factor - just choose 2 with max
15585 // index.
15586 if (Entries.empty()) {
15587 Entries.push_back(Elt: *llvm::max_element(
15588 Range&: UsedTEs.front(), C: [](const TreeEntry *TE1, const TreeEntry *TE2) {
15589 return TE1->Idx < TE2->Idx;
15590 }));
15591 Entries.push_back(Elt: SecondEntries.front());
15592 VF = std::max(a: Entries.front()->getVectorFactor(),
15593 b: Entries.back()->getVectorFactor());
15594 } else {
15595 VF = Entries.front()->getVectorFactor();
15596 }
15597 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
15598 for (const TreeEntry *E : Entries)
15599 ValuesToEntries.emplace_back().insert(I: E->Scalars.begin(),
15600 E: E->Scalars.end());
15601 // Update mapping between values and corresponding tree entries.
15602 for (auto &P : UsedValuesEntry) {
15603 for (unsigned Idx : seq<unsigned>(Size: ValuesToEntries.size()))
15604 if (ValuesToEntries[Idx].contains(Ptr: P.first)) {
15605 P.second = Idx;
15606 break;
15607 }
15608 }
15609 }
15610
15611 bool IsSplatOrUndefs = isSplat(VL) || all_of(Range&: VL, P: IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible, i.e. there is a high chance that they
  // can be vectorized together.
15614 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
15615 auto *PHI = cast<PHINode>(Val: V);
15616 auto *PHI1 = cast<PHINode>(Val: V1);
    // Check that all incoming values are compatible/from the same parent (if
    // they are instructions).
    // The incoming values are compatible if they all are constants, or
    // instructions with the same/alternate opcodes from the same basic block.
15621 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
15622 Value *In = PHI->getIncomingValue(i: I);
15623 Value *In1 = PHI1->getIncomingValue(i: I);
15624 if (isConstant(V: In) && isConstant(V: In1))
15625 continue;
15626 if (!getSameOpcode(VL: {In, In1}, TLI: *TLI))
15627 return false;
15628 if (cast<Instruction>(Val: In)->getParent() !=
15629 cast<Instruction>(Val: In1)->getParent())
15630 return false;
15631 }
15632 return true;
15633 };
  // Check if the value can be ignored during analysis for shuffled gathers.
  // We suppose it is better to ignore instructions which do not form splats,
  // are not vectorized/not extractelements (these instructions will be handled
  // by extractelements processing) or may form a vector node in the future.
15638 auto MightBeIgnored = [=](Value *V) {
15639 auto *I = dyn_cast<Instruction>(Val: V);
15640 return I && !IsSplatOrUndefs && !isVectorized(V: I) &&
15641 !isVectorLikeInstWithConstOps(V: I) &&
15642 !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList) && isSimple(I);
15643 };
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V. It is possible if they have the same/alternate
  // opcode and the same parent basic block.
15647 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
15648 Value *V1 = VL[Idx];
15649 bool UsedInSameVTE = false;
15650 auto It = UsedValuesEntry.find(Val: V1);
15651 if (It != UsedValuesEntry.end())
15652 UsedInSameVTE = It->second == UsedValuesEntry.find(Val: V)->second;
15653 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
15654 getSameOpcode(VL: {V, V1}, TLI: *TLI) &&
15655 cast<Instruction>(Val: V)->getParent() ==
15656 cast<Instruction>(Val: V1)->getParent() &&
15657 (!isa<PHINode>(Val: V1) || AreCompatiblePHIs(V, V1));
15658 };
15659 // Build a shuffle mask for better cost estimation and vector emission.
15660 SmallBitVector UsedIdxs(Entries.size());
15661 SmallVector<std::pair<unsigned, int>> EntryLanes;
15662 for (int I = 0, E = VL.size(); I < E; ++I) {
15663 Value *V = VL[I];
15664 auto It = UsedValuesEntry.find(Val: V);
15665 if (It == UsedValuesEntry.end())
15666 continue;
    // Do not try to shuffle scalars if they are constants, or instructions
    // that can be vectorized as a result of subsequent buildvector
    // vectorization.
15670 if (isConstant(V) || (MightBeIgnored(V) &&
15671 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
15672 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
15673 continue;
15674 unsigned Idx = It->second;
15675 EntryLanes.emplace_back(Args&: Idx, Args&: I);
15676 UsedIdxs.set(Idx);
15677 }
15678 // Iterate through all shuffled scalars and select entries, which can be used
15679 // for final shuffle.
15680 SmallVector<const TreeEntry *> TempEntries;
15681 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
15682 if (!UsedIdxs.test(Idx: I))
15683 continue;
15684 // Fix the entry number for the given scalar. If it is the first entry, set
15685 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
15686 // These indices are used when calculating final shuffle mask as the vector
15687 // offset.
15688 for (std::pair<unsigned, int> &Pair : EntryLanes)
15689 if (Pair.first == I)
15690 Pair.first = TempEntries.size();
15691 TempEntries.push_back(Elt: Entries[I]);
15692 }
15693 Entries.swap(RHS&: TempEntries);
15694 if (EntryLanes.size() == Entries.size() &&
15695 !VL.equals(RHS: ArrayRef(TE->Scalars)
15696 .slice(N: Part * VL.size(),
15697 M: std::min<int>(a: VL.size(), b: TE->Scalars.size())))) {
    // We may have only 1 or 2 entries here. If the number of scalars is equal
    // to the number of entries, there is no need to do the analysis, it is not
    // very profitable. Since VL is not the same as TE->Scalars, it means we
    // already have some shuffles before. Cut off the unprofitable case.
15702 Entries.clear();
15703 return std::nullopt;
15704 }
15705 // Build the final mask, check for the identity shuffle, if possible.
15706 bool IsIdentity = Entries.size() == 1;
15707 // Pair.first is the offset to the vector, while Pair.second is the index of
15708 // scalar in the list.
15709 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
15710 unsigned Idx = Part * VL.size() + Pair.second;
15711 Mask[Idx] =
15712 Pair.first * VF +
15713 (ForOrder ? std::distance(
15714 first: Entries[Pair.first]->Scalars.begin(),
15715 last: find(Range: Entries[Pair.first]->Scalars, Val: VL[Pair.second]))
15716 : Entries[Pair.first]->findLaneForValue(V: VL[Pair.second]));
15717 IsIdentity &= Mask[Idx] == Pair.second;
15718 }
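  // Illustrative example: with two selected entries of vector factor 4, a pair
  // (1, 2) for scalar VL[2] found at lane 3 of the second entry yields
  // Mask[Part * VL.size() + 2] = 1 * 4 + 3 = 7.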
15719 if (ForOrder || IsIdentity || Entries.empty()) {
15720 switch (Entries.size()) {
15721 case 1:
15722 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
15723 return TargetTransformInfo::SK_PermuteSingleSrc;
15724 break;
15725 case 2:
15726 if (EntryLanes.size() > 2 || VL.size() <= 2)
15727 return TargetTransformInfo::SK_PermuteTwoSrc;
15728 break;
15729 default:
15730 break;
15731 }
15732 } else if (!isa<VectorType>(Val: VL.front()->getType()) &&
15733 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Do the cost estimation to check whether a shuffle is more beneficial
    // than a buildvector.
15735 SmallVector<int> SubMask(std::next(x: Mask.begin(), n: Part * VL.size()),
15736 std::next(x: Mask.begin(), n: (Part + 1) * VL.size()));
15737 int MinElement = SubMask.front(), MaxElement = SubMask.front();
15738 for (int Idx : SubMask) {
15739 if (Idx == PoisonMaskElem)
15740 continue;
15741 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
15742 MinElement = Idx;
15743 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
15744 MaxElement = Idx;
15745 }
15746 assert(MaxElement >= 0 && MinElement >= 0 &&
15747 MaxElement % VF >= MinElement % VF &&
15748 "Expected at least single element.");
15749 unsigned NewVF = std::max<unsigned>(
15750 a: VL.size(), b: getFullVectorNumberOfElements(TTI: *TTI, Ty: VL.front()->getType(),
15751 Sz: (MaxElement % VF) -
15752 (MinElement % VF) + 1));
15753 if (NewVF < VF) {
15754 for (int &Idx : SubMask) {
15755 if (Idx == PoisonMaskElem)
15756 continue;
15757 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
15758 (Idx >= static_cast<int>(VF) ? NewVF : 0);
15759 }
15760 } else {
15761 NewVF = VF;
15762 }
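    // Illustrative example: for VF == 16, VL.size() == 4, and a submask that
    // only touches lanes 8..11 of the single source, NewVF may shrink to 4
    // (assuming 4 elements fill whole registers for this type) and the submask
    // indices are rebased from 8..11 to 0..3.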
15763
15764 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15765 auto *VecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: NewVF);
15766 auto *MaskVecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: SubMask.size());
15767 auto GetShuffleCost = [&,
15768 &TTI = *TTI](ArrayRef<int> Mask,
15769 ArrayRef<const TreeEntry *> Entries,
15770 VectorType *VecTy) -> InstructionCost {
15771 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
15772 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
15773 Mask, Factor: Entries.front()->getInterleaveFactor()))
15774 return TTI::TCC_Free;
15775 return ::getShuffleCost(TTI,
15776 Kind: Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
15777 : TTI::SK_PermuteSingleSrc,
15778 Tp: VecTy, Mask, CostKind);
15779 };
15780 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
15781 InstructionCost FirstShuffleCost = 0;
15782 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
15783 if (Entries.size() == 1 || !Entries[0]->isGather()) {
15784 FirstShuffleCost = ShuffleCost;
15785 } else {
      // Transform the mask to include only the first entry.
15787 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
15788 bool IsIdentity = true;
15789 for (auto [I, Idx] : enumerate(First&: FirstMask)) {
15790 if (Idx >= static_cast<int>(NewVF)) {
15791 Idx = PoisonMaskElem;
15792 } else {
15793 DemandedElts.clearBit(BitPosition: I);
15794 if (Idx != PoisonMaskElem)
15795 IsIdentity &= static_cast<int>(I) == Idx;
15796 }
15797 }
15798 if (!IsIdentity)
15799 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
15800 FirstShuffleCost += getScalarizationOverhead(
15801 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
15802 /*Extract=*/false, CostKind);
15803 }
15804 InstructionCost SecondShuffleCost = 0;
15805 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
15806 if (Entries.size() == 1 || !Entries[1]->isGather()) {
15807 SecondShuffleCost = ShuffleCost;
15808 } else {
      // Transform the mask to include only the second entry.
15810 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
15811 bool IsIdentity = true;
15812 for (auto [I, Idx] : enumerate(First&: SecondMask)) {
15813 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
15814 Idx = PoisonMaskElem;
15815 } else {
15816 DemandedElts.clearBit(BitPosition: I);
15817 if (Idx != PoisonMaskElem) {
15818 Idx -= NewVF;
15819 IsIdentity &= static_cast<int>(I) == Idx;
15820 }
15821 }
15822 }
15823 if (!IsIdentity)
15824 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
15825 SecondShuffleCost += getScalarizationOverhead(
15826 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
15827 /*Extract=*/false, CostKind);
15828 }
15829 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
15830 for (auto [I, Idx] : enumerate(First&: SubMask))
15831 if (Idx == PoisonMaskElem)
15832 DemandedElts.clearBit(BitPosition: I);
15833 InstructionCost BuildVectorCost = getScalarizationOverhead(
15834 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
15835 /*Extract=*/false, CostKind);
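    // Three strategies are compared below: a shuffle of both entries, a
    // shuffle of only one entry plus insertion of the remaining scalars, and
    // building the whole subvector from scalars. The cheapest one wins; if the
    // buildvector is cheapest, the entries are dropped and a regular gather is
    // used instead.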
15836 const TreeEntry *BestEntry = nullptr;
15837 if (FirstShuffleCost < ShuffleCost) {
15838 std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()),
15839 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()),
15840 f: [&](int &Idx) {
15841 if (Idx >= static_cast<int>(VF))
15842 Idx = PoisonMaskElem;
15843 });
15844 BestEntry = Entries.front();
15845 ShuffleCost = FirstShuffleCost;
15846 }
15847 if (SecondShuffleCost < ShuffleCost) {
15848 std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()),
15849 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()),
15850 f: [&](int &Idx) {
15851 if (Idx < static_cast<int>(VF))
15852 Idx = PoisonMaskElem;
15853 else
15854 Idx -= VF;
15855 });
15856 BestEntry = Entries[1];
15857 ShuffleCost = SecondShuffleCost;
15858 }
15859 if (BuildVectorCost >= ShuffleCost) {
15860 if (BestEntry) {
15861 Entries.clear();
15862 Entries.push_back(Elt: BestEntry);
15863 }
15864 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
15865 : TargetTransformInfo::SK_PermuteSingleSrc;
15866 }
15867 }
15868 Entries.clear();
15869 // Clear the corresponding mask elements.
15870 std::fill(first: std::next(x: Mask.begin(), n: Part * VL.size()),
15871 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: PoisonMaskElem);
15872 return std::nullopt;
15873}
15874
15875SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
15876BoUpSLP::isGatherShuffledEntry(
15877 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
15878 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
15879 bool ForOrder) {
15880 assert(NumParts > 0 && NumParts < VL.size() &&
15881 "Expected positive number of registers.");
15882 Entries.clear();
15883 // No need to check for the topmost gather node.
15884 if (TE == VectorizableTree.front().get() &&
15885 (!GatheredLoadsEntriesFirst.has_value() ||
15886 none_of(Range: ArrayRef(VectorizableTree).drop_front(),
15887 P: [](const std::unique_ptr<TreeEntry> &TE) {
15888 return !TE->isGather();
15889 })))
15890 return {};
  // FIXME: Gathering for non-power-of-2 (non-whole-register) nodes is not
  // implemented yet.
15893 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI))
15894 return {};
15895 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
15896 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
15897 "Expected only single user of the gather node.");
15898 assert(VL.size() % NumParts == 0 &&
15899 "Number of scalars must be divisible by NumParts.");
15900 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
15901 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
15902 (TE->Idx == 0 ||
15903 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
15904 isSplat(VL: TE->Scalars) ||
15905 (TE->hasState() &&
15906 getSameValuesTreeEntry(V: TE->getMainOp(), VL: TE->Scalars))))
15907 return {};
15908 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
15909 SmallVector<std::optional<TTI::ShuffleKind>> Res;
15910 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
15911 ArrayRef<Value *> SubVL =
15912 VL.slice(N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part));
15913 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
15914 std::optional<TTI::ShuffleKind> SubRes =
15915 isGatherShuffledSingleRegisterEntry(TE, VL: SubVL, Mask, Entries&: SubEntries, Part,
15916 ForOrder);
15917 if (!SubRes)
15918 SubEntries.clear();
15919 Res.push_back(Elt: SubRes);
15920 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
15921 SubEntries.front()->getVectorFactor() == VL.size() &&
15922 (SubEntries.front()->isSame(VL: TE->Scalars) ||
15923 SubEntries.front()->isSame(VL))) {
15924 SmallVector<const TreeEntry *> LocalSubEntries;
15925 LocalSubEntries.swap(RHS&: SubEntries);
15926 Entries.clear();
15927 Res.clear();
15928 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
15929 // Clear undef scalars.
15930 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
15931 if (isa<PoisonValue>(Val: VL[I]))
15932 Mask[I] = PoisonMaskElem;
15933 Entries.emplace_back(Args: 1, Args&: LocalSubEntries.front());
15934 Res.push_back(Elt: TargetTransformInfo::SK_PermuteSingleSrc);
15935 return Res;
15936 }
15937 }
15938 if (all_of(Range&: Res,
15939 P: [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
15940 Entries.clear();
15941 return {};
15942 }
15943 return Res;
15944}
15945
15946InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
15947 Type *ScalarTy) const {
15948 const unsigned VF = VL.size();
15949 auto *VecTy = getWidenedType(ScalarTy, VF);
15950 // Find the cost of inserting/extracting values from the vector.
15951 // Check if the same elements are inserted several times and count them as
15952 // shuffle candidates.
15953 APInt DemandedElements = APInt::getZero(numBits: VF);
15954 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15955 InstructionCost Cost;
15956 auto EstimateInsertCost = [&](unsigned I, Value *V) {
15957 DemandedElements.setBit(I);
15958 if (V->getType() != ScalarTy)
15959 Cost += TTI->getCastInstrCost(Opcode: Instruction::Trunc, Dst: ScalarTy, Src: V->getType(),
15960 CCH: TTI::CastContextHint::None, CostKind);
15961 };
15962 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
15963 std::iota(first: ConstantShuffleMask.begin(), last: ConstantShuffleMask.end(), value: 0);
15964 for (auto [I, V] : enumerate(First&: VL)) {
15965 // No need to shuffle duplicates for constants.
15966 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(Val: V))
15967 continue;
15968
15969 if (isConstant(V)) {
15970 ConstantShuffleMask[I] = I + VF;
15971 continue;
15972 }
15973 EstimateInsertCost(I, V);
15974 }
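  // Illustrative example (assuming ForPoisonSrc is false): for
  // VL = {%x, 7, poison, %y}, lanes 0 and 3 are counted as scalar insertions,
  // lane 1 sets ConstantShuffleMask[1] to 1 + VF so the constant is taken from
  // a materialized constant vector by the two-source shuffle below, and the
  // poison lane is ignored.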
15975 // FIXME: add a cost for constant vector materialization.
15976 bool IsAnyNonUndefConst =
15977 any_of(Range&: VL, P: [](Value *V) { return !isa<UndefValue>(Val: V) && isConstant(V); });
15978 // 1. Shuffle input source vector and constant vector.
15979 if (!ForPoisonSrc && IsAnyNonUndefConst) {
15980 Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: VecTy,
15981 Mask: ConstantShuffleMask);
15982 }
15983
15984 // 2. Insert unique non-constants.
15985 if (!DemandedElements.isZero())
15986 Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: DemandedElements,
15987 /*Insert=*/true,
15988 /*Extract=*/false, CostKind,
15989 ForPoisonSrc: ForPoisonSrc && !IsAnyNonUndefConst, VL);
15990 return Cost;
15991}
15992
15993Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
15994 auto It = EntryToLastInstruction.find(Val: E);
15995 if (It != EntryToLastInstruction.end())
15996 return *cast<Instruction>(Val&: It->second);
15997 Instruction *Res = nullptr;
15998 // Get the basic block this bundle is in. All instructions in the bundle
15999 // should be in this block (except for extractelement-like instructions with
16000 // constant indices or gathered loads).
16001 auto *Front = E->getMainOp();
16002 auto *BB = Front->getParent();
16003 assert(((GatheredLoadsEntriesFirst.has_value() &&
16004 E->getOpcode() == Instruction::Load && E->isGather() &&
16005 E->Idx < *GatheredLoadsEntriesFirst) ||
16006 E->State == TreeEntry::SplitVectorize ||
16007 all_of(E->Scalars,
16008 [=](Value *V) -> bool {
16009 if (E->getOpcode() == Instruction::GetElementPtr &&
16010 !isa<GetElementPtrInst>(V))
16011 return true;
16012 auto *I = dyn_cast<Instruction>(V);
16013 return !I || !E->getMatchingMainOpOrAltOp(I) ||
16014 I->getParent() == BB ||
16015 isVectorLikeInstWithConstOps(I);
16016 })) &&
16017 "Expected gathered loads or GEPs or instructions from same basic "
16018 "block.");
16019
16020 auto FindLastInst = [&]() {
16021 Instruction *LastInst = Front;
16022 for (Value *V : E->Scalars) {
16023 auto *I = dyn_cast<Instruction>(Val: V);
16024 if (!I)
16025 continue;
16026 if (LastInst->getParent() == I->getParent()) {
16027 if (LastInst->comesBefore(Other: I))
16028 LastInst = I;
16029 continue;
16030 }
16031 assert(((E->getOpcode() == Instruction::GetElementPtr &&
16032 !isa<GetElementPtrInst>(I)) ||
16033 E->State == TreeEntry::SplitVectorize ||
16034 (isVectorLikeInstWithConstOps(LastInst) &&
16035 isVectorLikeInstWithConstOps(I)) ||
16036 (GatheredLoadsEntriesFirst.has_value() &&
16037 E->getOpcode() == Instruction::Load && E->isGather() &&
16038 E->Idx < *GatheredLoadsEntriesFirst)) &&
16039 "Expected vector-like or non-GEP in GEP node insts only.");
16040 if (!DT->isReachableFromEntry(A: LastInst->getParent())) {
16041 LastInst = I;
16042 continue;
16043 }
16044 if (!DT->isReachableFromEntry(A: I->getParent()))
16045 continue;
16046 auto *NodeA = DT->getNode(BB: LastInst->getParent());
16047 auto *NodeB = DT->getNode(BB: I->getParent());
16048 assert(NodeA && "Should only process reachable instructions");
16049 assert(NodeB && "Should only process reachable instructions");
16050 assert((NodeA == NodeB) ==
16051 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
16052 "Different nodes should have different DFS numbers");
16053 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
16054 LastInst = I;
16055 }
16056 BB = LastInst->getParent();
16057 return LastInst;
16058 };
16059
16060 auto FindFirstInst = [&]() {
16061 Instruction *FirstInst = Front;
16062 for (Value *V : E->Scalars) {
16063 auto *I = dyn_cast<Instruction>(Val: V);
16064 if (!I)
16065 continue;
16066 if (FirstInst->getParent() == I->getParent()) {
16067 if (I->comesBefore(Other: FirstInst))
16068 FirstInst = I;
16069 continue;
16070 }
16071 assert(((E->getOpcode() == Instruction::GetElementPtr &&
16072 !isa<GetElementPtrInst>(I)) ||
16073 (isVectorLikeInstWithConstOps(FirstInst) &&
16074 isVectorLikeInstWithConstOps(I))) &&
16075 "Expected vector-like or non-GEP in GEP node insts only.");
16076 if (!DT->isReachableFromEntry(A: FirstInst->getParent())) {
16077 FirstInst = I;
16078 continue;
16079 }
16080 if (!DT->isReachableFromEntry(A: I->getParent()))
16081 continue;
16082 auto *NodeA = DT->getNode(BB: FirstInst->getParent());
16083 auto *NodeB = DT->getNode(BB: I->getParent());
16084 assert(NodeA && "Should only process reachable instructions");
16085 assert(NodeB && "Should only process reachable instructions");
16086 assert((NodeA == NodeB) ==
16087 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
16088 "Different nodes should have different DFS numbers");
16089 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
16090 FirstInst = I;
16091 }
16092 return FirstInst;
16093 };
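  // Both helpers order instructions from different blocks by their
  // dominator-tree DFS-in numbers: a block with a larger DFS-in number is
  // treated as coming later, which serves as a proxy for program order across
  // blocks here.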
16094
16095 if (E->State == TreeEntry::SplitVectorize) {
16096 Res = FindLastInst();
16097 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V: Res); !Entries.empty()) {
16098 for (auto *E : Entries) {
16099 auto *I = dyn_cast_or_null<Instruction>(Val&: E->VectorizedValue);
16100 if (!I)
16101 I = &getLastInstructionInBundle(E);
16102 if (Res->getParent() == I->getParent() && Res->comesBefore(Other: I))
16103 Res = I;
16104 }
16105 }
16106 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
16107 return *Res;
16108 }
16109
16110 // Set insertpoint for gathered loads to the very first load.
16111 if (GatheredLoadsEntriesFirst.has_value() &&
16112 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
16113 E->getOpcode() == Instruction::Load) {
16114 Res = FindFirstInst();
16115 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
16116 return *Res;
16117 }
16118
16119 // Set the insert point to the beginning of the basic block if the entry
16120 // should not be scheduled.
16121 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
16122 if (E->isGather())
16123 return nullptr;
    // Found previously that the instruction does not need to be scheduled.
16125 const auto *It = BlocksSchedules.find(Key: BB);
16126 if (It == BlocksSchedules.end())
16127 return nullptr;
16128 for (Value *V : E->Scalars) {
16129 auto *I = dyn_cast<Instruction>(Val: V);
16130 if (!I || isa<PHINode>(Val: I) || doesNotNeedToBeScheduled(V: I))
16131 continue;
16132 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(V: I);
16133 if (Bundles.empty())
16134 continue;
16135 const auto *It = find_if(
16136 Range&: Bundles, P: [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
16137 if (It != Bundles.end())
16138 return *It;
16139 }
16140 return nullptr;
16141 };
16142 const ScheduleBundle *Bundle = FindScheduleBundle(E);
16143 if (!E->isGather() && !Bundle) {
16144 if ((E->getOpcode() == Instruction::GetElementPtr &&
16145 any_of(Range: E->Scalars,
16146 P: [](Value *V) {
16147 return !isa<GetElementPtrInst>(Val: V) && isa<Instruction>(Val: V);
16148 })) ||
16149 all_of(Range: E->Scalars, P: [](Value *V) {
16150 return isa<PoisonValue>(Val: V) ||
16151 (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V));
16152 }))
16153 Res = FindLastInst();
16154 else
16155 Res = FindFirstInst();
16156 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
16157 return *Res;
16158 }
16159
  // Find the last instruction. The common case should be that BB has been
  // scheduled, in which case the last instruction of the bundle is simply the
  // last member of the schedule bundle built for this tree entry.
16164 if (Bundle) {
16165 assert(!E->isGather() && "Gathered instructions should not be scheduled");
16166 Res = Bundle->getBundle().back()->getInst();
16167 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
16168 return *Res;
16169 }
16170
16171 // LastInst can still be null at this point if there's either not an entry
16172 // for BB in BlocksSchedules or there's no ScheduleData available for
16173 // VL.back(). This can be the case if buildTreeRec aborts for various
16174 // reasons (e.g., the maximum recursion depth is reached, the maximum region
16175 // size is reached, etc.). ScheduleData is initialized in the scheduling
16176 // "dry-run".
16177 //
16178 // If this happens, we can still find the last instruction by brute force. We
16179 // iterate forwards from Front (inclusive) until we either see all
16180 // instructions in the bundle or reach the end of the block. If Front is the
16181 // last instruction in program order, LastInst will be set to Front, and we
16182 // will visit all the remaining instructions in the block.
16183 //
16184 // One of the reasons we exit early from buildTreeRec is to place an upper
16185 // bound on compile-time. Thus, taking an additional compile-time hit here is
16186 // not ideal. However, this should be exceedingly rare since it requires that
16187 // we both exit early from buildTreeRec and that the bundle be out-of-order
16188 // (causing us to iterate all the way to the end of the block).
16189 if (!Res)
16190 Res = FindLastInst();
16191 assert(Res && "Failed to find last instruction in bundle");
16192 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
16193 return *Res;
16194}
16195
16196void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
16197 auto *Front = E->getMainOp();
16198 Instruction *LastInst = &getLastInstructionInBundle(E);
16199 assert(LastInst && "Failed to find last instruction in bundle");
16200 BasicBlock::iterator LastInstIt = LastInst->getIterator();
16201 // If the instruction is PHI, set the insert point after all the PHIs.
16202 bool IsPHI = isa<PHINode>(Val: LastInst);
16203 if (IsPHI) {
16204 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
16205 if (LastInstIt != LastInst->getParent()->end() &&
16206 LastInstIt->getParent()->isLandingPad())
16207 LastInstIt = std::next(x: LastInstIt);
16208 }
16209 if (IsPHI ||
16210 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
16211 E->doesNotNeedToSchedule()) ||
16212 (GatheredLoadsEntriesFirst.has_value() &&
16213 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
16214 E->getOpcode() == Instruction::Load)) {
16215 Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: LastInstIt);
16216 } else {
16217 // Set the insertion point after the last instruction in the bundle. Set the
16218 // debug location to Front.
16219 Builder.SetInsertPoint(
16220 TheBB: LastInst->getParent(),
16221 IP: LastInst->getNextNonDebugInstruction()->getIterator());
16222 }
16223 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
16224}
16225
16226Value *BoUpSLP::gather(
16227 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
16228 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from the current block and/or the blocks which
  // are part of the current loop. These instructions will be inserted at the
  // end to make it possible to optimize loops and hoist invariant instructions
  // out of the loop's body with better chances for success.
16233 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
16234 SmallSet<int, 4> PostponedIndices;
16235 Loop *L = LI->getLoopFor(BB: Builder.GetInsertBlock());
16236 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
16237 SmallPtrSet<BasicBlock *, 4> Visited;
16238 while (InsertBB && InsertBB != InstBB && Visited.insert(Ptr: InsertBB).second)
16239 InsertBB = InsertBB->getSinglePredecessor();
16240 return InsertBB && InsertBB == InstBB;
16241 };
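  // CheckPredecessor returns true if InstBB is InsertBB itself or is reached
  // from InsertBB by repeatedly following unique single predecessors, i.e. the
  // instruction's block lies on a straight-line CFG path leading into the
  // insertion block.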
16242 for (int I = 0, E = VL.size(); I < E; ++I) {
16243 if (auto *Inst = dyn_cast<Instruction>(Val: VL[I]))
16244 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
16245 isVectorized(V: Inst) ||
16246 (L && (!Root || L->isLoopInvariant(V: Root)) && L->contains(Inst))) &&
16247 PostponedIndices.insert(V: I).second)
16248 PostponedInsts.emplace_back(Args&: Inst, Args&: I);
16249 }
16250
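  // Creates an insertelement of scalar V into Vec at position Pos. If the
  // scalar type differs from Ty, the value is int-casted first; for
  // revectorized scalars (vectors themselves) an insertvector is used instead.
  // The created instruction is recorded for later CSE and, if the scalar is
  // part of a vectorized node, registered as an external use.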
16251 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
16252 Type *Ty) {
16253 Value *Scalar = V;
16254 if (Scalar->getType() != Ty) {
16255 assert(Scalar->getType()->isIntOrIntVectorTy() &&
16256 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
16257 Value *V = Scalar;
16258 if (auto *CI = dyn_cast<CastInst>(Val: Scalar);
16259 isa_and_nonnull<SExtInst, ZExtInst>(Val: CI)) {
16260 Value *Op = CI->getOperand(i_nocapture: 0);
16261 if (auto *IOp = dyn_cast<Instruction>(Val: Op);
16262 !IOp || !(isDeleted(I: IOp) || isVectorized(V: IOp)))
16263 V = Op;
16264 }
16265 Scalar = Builder.CreateIntCast(
16266 V, DestTy: Ty, isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
16267 }
16268
16269 Instruction *InsElt;
16270 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Scalar->getType())) {
16271 assert(SLPReVec && "FixedVectorType is not expected.");
16272 Vec =
16273 createInsertVector(Builder, Vec, V: Scalar, Index: Pos * getNumElements(Ty: VecTy));
16274 auto *II = dyn_cast<IntrinsicInst>(Val: Vec);
16275 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
16276 return Vec;
16277 InsElt = II;
16278 } else {
16279 Vec = Builder.CreateInsertElement(Vec, NewElt: Scalar, Idx: Builder.getInt32(C: Pos));
16280 InsElt = dyn_cast<InsertElementInst>(Val: Vec);
16281 if (!InsElt)
16282 return Vec;
16283 }
16284 GatherShuffleExtractSeq.insert(X: InsElt);
16285 CSEBlocks.insert(V: InsElt->getParent());
16286 // Add to our 'need-to-extract' list.
16287 if (isa<Instruction>(Val: V)) {
16288 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V); !Entries.empty()) {
16289 // Find which lane we need to extract.
16290 User *UserOp = nullptr;
16291 if (Scalar != V) {
16292 if (auto *SI = dyn_cast<Instruction>(Val: Scalar))
16293 UserOp = SI;
16294 } else {
16295 UserOp = InsElt;
16296 }
16297 if (UserOp) {
16298 unsigned FoundLane = Entries.front()->findLaneForValue(V);
16299 ExternalUses.emplace_back(Args&: V, Args&: UserOp, Args&: *Entries.front(), Args&: FoundLane);
16300 }
16301 }
16302 }
16303 return Vec;
16304 };
16305 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
16306 Value *Vec = PoisonValue::get(T: VecTy);
16307 SmallVector<int> NonConsts;
16308 SmallVector<int> Mask(VL.size());
16309 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
16310 Value *OriginalRoot = Root;
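  // If the root is a single-source shuffle of a vector of the expected type,
  // look through it: gather on top of its source operand and adopt its mask.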
16311 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Val: Root);
16312 SV && isa<PoisonValue>(Val: SV->getOperand(i_nocapture: 1)) &&
16313 SV->getOperand(i_nocapture: 0)->getType() == VecTy) {
16314 Root = SV->getOperand(i_nocapture: 0);
16315 Mask.assign(in_start: SV->getShuffleMask().begin(), in_end: SV->getShuffleMask().end());
16316 }
  // Insert constant values first.
16318 for (int I = 0, E = VL.size(); I < E; ++I) {
16319 if (PostponedIndices.contains(V: I))
16320 continue;
16321 if (!isConstant(V: VL[I])) {
16322 NonConsts.push_back(Elt: I);
16323 continue;
16324 }
16325 if (isa<PoisonValue>(Val: VL[I]))
16326 continue;
16327 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
16328 Mask[I] = I + E;
16329 }
16330 if (Root) {
16331 if (isa<PoisonValue>(Val: Vec)) {
16332 Vec = OriginalRoot;
16333 } else {
16334 Vec = CreateShuffle(Root, Vec, Mask);
16335 if (auto *OI = dyn_cast<Instruction>(Val: OriginalRoot);
16336 OI && OI->use_empty() &&
16337 none_of(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
16338 return TE->VectorizedValue == OI;
16339 }))
16340 eraseInstruction(I: OI);
16341 }
16342 }
16343 // Insert non-constant values.
16344 for (int I : NonConsts)
16345 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions which are/may be part of the loop at the end, to make
  // it possible to hoist non-loop-based instructions.
16348 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
16349 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
16350
16351 return Vec;
16352}
16353
/// Merges shuffle masks and emits the final shuffle instruction, if required.
/// It supports shuffling of 2 input vectors. It implements lazy shuffle
/// emission: the actual shuffle instruction is generated only if it is really
/// required. Otherwise, the shuffle instruction emission is delayed until the
/// end of the process, to reduce the number of emitted instructions and to
/// simplify further analysis/transformations.
/// The class will also look through previously emitted shuffle instructions
/// and properly mark indices in the mask as undef.
16362/// For example, given the code
16363/// \code
16364/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
16365/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
16366/// \endcode
/// and we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
16368/// look through %s1 and %s2 and emit
16369/// \code
16370/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
16371/// \endcode
16372/// instead.
/// If the 2 operands are of different sizes, the smaller one will be resized
/// and the mask recalculated accordingly.
16375/// For example, given the code
16376/// \code
16377/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
16378/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
16379/// \endcode
/// and we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
16381/// look through %s1 and %s2 and emit
16382/// \code
16383/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
16384/// \endcode
16385/// instead.
16386class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
16387 bool IsFinalized = false;
16388 /// Combined mask for all applied operands and masks. It is built during
16389 /// analysis and actual emission of shuffle vector instructions.
16390 SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at most 2
  /// operands. If a 3rd one is going to be added, the first 2 are combined into
  /// a shuffle with the \p CommonMask mask, the first operand is set to the
  /// resulting shuffle and the second operand is set to the newly added
  /// operand. \p CommonMask is transformed accordingly after that.
16396 SmallVector<Value *, 2> InVectors;
16397 IRBuilderBase &Builder;
16398 BoUpSLP &R;
16399
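  /// Helper that emits the actual shufflevector (and cast) instructions for
  /// BaseShuffleAnalysis::createShuffle and records them for later CSE.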
16400 class ShuffleIRBuilder {
16401 IRBuilderBase &Builder;
16402 /// Holds all of the instructions that we gathered.
16403 SetVector<Instruction *> &GatherShuffleExtractSeq;
16404 /// A list of blocks that we are going to CSE.
16405 DenseSet<BasicBlock *> &CSEBlocks;
16406 /// Data layout.
16407 const DataLayout &DL;
16408
16409 public:
16410 ShuffleIRBuilder(IRBuilderBase &Builder,
16411 SetVector<Instruction *> &GatherShuffleExtractSeq,
16412 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
16413 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
16414 CSEBlocks(CSEBlocks), DL(DL) {}
16415 ~ShuffleIRBuilder() = default;
16416 /// Creates shufflevector for the 2 operands with the given mask.
16417 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
16418 if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
16422 if (V1->getType() != V2->getType()) {
16423 if (cast<VectorType>(Val: V2->getType())
16424 ->getElementType()
16425 ->getIntegerBitWidth() < cast<VectorType>(Val: V1->getType())
16426 ->getElementType()
16427 ->getIntegerBitWidth())
16428 V2 = Builder.CreateIntCast(
16429 V: V2, DestTy: V1->getType(), isSigned: !isKnownNonNegative(V: V2, SQ: SimplifyQuery(DL)));
16430 else
16431 V1 = Builder.CreateIntCast(
16432 V: V1, DestTy: V2->getType(), isSigned: !isKnownNonNegative(V: V1, SQ: SimplifyQuery(DL)));
16433 }
16434 }
16435 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
16436 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
16437 GatherShuffleExtractSeq.insert(X: I);
16438 CSEBlocks.insert(V: I->getParent());
16439 }
16440 return Vec;
16441 }
    /// Creates a permutation of the single vector operand with the given
    /// mask, if it is not an identity mask.
16444 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
16445 if (Mask.empty())
16446 return V1;
16447 unsigned VF = Mask.size();
16448 unsigned LocalVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
16449 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))
16450 return V1;
16451 Value *Vec = Builder.CreateShuffleVector(V: V1, Mask);
16452 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
16453 GatherShuffleExtractSeq.insert(X: I);
16454 CSEBlocks.insert(V: I->getParent());
16455 }
16456 return Vec;
16457 }
16458 Value *createIdentity(Value *V) { return V; }
16459 Value *createPoison(Type *Ty, unsigned VF) {
16460 return PoisonValue::get(T: getWidenedType(ScalarTy: Ty, VF));
16461 }
    /// Resizes the 2 input vectors to match in size, if they are not equal
    /// yet. The smaller vector is resized to the size of the larger vector.
16464 void resizeToMatch(Value *&V1, Value *&V2) {
16465 if (V1->getType() == V2->getType())
16466 return;
16467 int V1VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
16468 int V2VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
16469 int VF = std::max(a: V1VF, b: V2VF);
16470 int MinVF = std::min(a: V1VF, b: V2VF);
16471 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
16472 std::iota(first: IdentityMask.begin(), last: std::next(x: IdentityMask.begin(), n: MinVF),
16473 value: 0);
16474 Value *&Op = MinVF == V1VF ? V1 : V2;
16475 Op = Builder.CreateShuffleVector(V: Op, Mask: IdentityMask);
16476 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
16477 GatherShuffleExtractSeq.insert(X: I);
16478 CSEBlocks.insert(V: I->getParent());
16479 }
16480 if (MinVF == V1VF)
16481 V1 = Op;
16482 else
16483 V2 = Op;
16484 }
16485 };
16486
  /// Smart shuffle instruction emission: walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
16490 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
16491 assert(V1 && "Expected at least one vector value.");
16492 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
16493 R.CSEBlocks, *R.DL);
16494 return BaseShuffleAnalysis::createShuffle<Value *>(
16495 V1, V2, Mask, Builder&: ShuffleBuilder, ScalarTy);
16496 }
16497
16498 /// Cast value \p V to the vector type with the same number of elements, but
16499 /// the base type \p ScalarTy.
16500 Value *castToScalarTyElem(Value *V,
16501 std::optional<bool> IsSigned = std::nullopt) {
16502 auto *VecTy = cast<VectorType>(Val: V->getType());
16503 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
16504 if (VecTy->getElementType() == ScalarTy->getScalarType())
16505 return V;
16506 return Builder.CreateIntCast(
16507 V, DestTy: VectorType::get(ElementType: ScalarTy->getScalarType(), EC: VecTy->getElementCount()),
16508 isSigned: IsSigned.value_or(u: !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL))));
16509 }
16510
16511public:
16512 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
16513 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
16514
16515 /// Adjusts extractelements after reusing them.
16516 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
16517 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
16518 unsigned NumParts, bool &UseVecBaseAsInput) {
16519 UseVecBaseAsInput = false;
16520 SmallPtrSet<Value *, 4> UniqueBases;
16521 Value *VecBase = nullptr;
16522 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
16523 if (!E->ReorderIndices.empty()) {
16524 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
16525 E->ReorderIndices.end());
16526 reorderScalars(Scalars&: VL, Mask: ReorderMask);
16527 }
16528 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
16529 int Idx = Mask[I];
16530 if (Idx == PoisonMaskElem)
16531 continue;
16532 auto *EI = cast<ExtractElementInst>(Val: VL[I]);
16533 VecBase = EI->getVectorOperand();
16534 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecBase); !TEs.empty())
16535 VecBase = TEs.front()->VectorizedValue;
16536 assert(VecBase && "Expected vectorized value.");
16537 UniqueBases.insert(Ptr: VecBase);
      // If the only use is vectorized - we can delete the extractelement
      // itself.
16540 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(Ptr: EI) ||
16541 (NumParts != 1 && count(Range&: VL, Element: EI) > 1) ||
16542 any_of(Range: EI->users(), P: [&](User *U) {
16543 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(V: U);
16544 return UTEs.empty() || UTEs.size() > 1 ||
16545 (isa<GetElementPtrInst>(Val: U) &&
16546 !R.areAllUsersVectorized(I: cast<Instruction>(Val: U))) ||
16547 (!UTEs.empty() &&
16548 count_if(Range&: R.VectorizableTree,
16549 P: [&](const std::unique_ptr<TreeEntry> &TE) {
16550 return TE->UserTreeIndex.UserTE ==
16551 UTEs.front() &&
16552 is_contained(Range&: VL, Element: EI);
16553 }) != 1);
16554 }))
16555 continue;
16556 R.eraseInstruction(I: EI);
16557 }
16558 if (NumParts == 1 || UniqueBases.size() == 1) {
16559 assert(VecBase && "Expected vectorized value.");
16560 return castToScalarTyElem(V: VecBase);
16561 }
16562 UseVecBaseAsInput = true;
16563 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
16564 for (auto [I, Idx] : enumerate(First&: Mask))
16565 if (Idx != PoisonMaskElem)
16566 Idx = I;
16567 };
    // Perform a multi-register vector shuffle, joining the parts into a single
    // virtual long vector.
    // Need to shuffle each part independently and then insert all these parts
    // into a long virtual vector register, forming the original vector.
16572 Value *Vec = nullptr;
16573 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
16574 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
16575 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
16576 unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
16577 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(N: Part * SliceSize, M: Limit);
16578 MutableArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
16579 constexpr int MaxBases = 2;
16580 SmallVector<Value *, MaxBases> Bases(MaxBases);
16581 auto VLMask = zip(t&: SubVL, u&: SubMask);
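      // Width of the widest source vector among the extracts in this slice;
      // it determines which of the (at most 2) bases each lane belongs to.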
16582 const unsigned VF = std::accumulate(
16583 first: VLMask.begin(), last: VLMask.end(), init: 0U, binary_op: [&](unsigned S, const auto &D) {
16584 if (std::get<1>(D) == PoisonMaskElem)
16585 return S;
16586 Value *VecOp =
16587 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
16588 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp);
16589 !TEs.empty())
16590 VecOp = TEs.front()->VectorizedValue;
16591 assert(VecOp && "Expected vectorized value.");
16592 const unsigned Size =
16593 cast<FixedVectorType>(Val: VecOp->getType())->getNumElements();
16594 return std::max(a: S, b: Size);
16595 });
16596 for (const auto [V, I] : VLMask) {
16597 if (I == PoisonMaskElem)
16598 continue;
16599 Value *VecOp = cast<ExtractElementInst>(Val: V)->getVectorOperand();
16600 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp); !TEs.empty())
16601 VecOp = TEs.front()->VectorizedValue;
16602 assert(VecOp && "Expected vectorized value.");
16603 VecOp = castToScalarTyElem(V: VecOp);
16604 Bases[I / VF] = VecOp;
16605 }
16606 if (!Bases.front())
16607 continue;
16608 Value *SubVec;
16609 if (Bases.back()) {
16610 SubVec = createShuffle(V1: Bases.front(), V2: Bases.back(), Mask: SubMask);
16611 TransformToIdentity(SubMask);
16612 } else {
16613 SubVec = Bases.front();
16614 }
16615 if (!Vec) {
16616 Vec = SubVec;
16617 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
16618 [&](unsigned P) {
16619 ArrayRef<int> SubMask =
16620 Mask.slice(P * SliceSize,
16621 getNumElems(Mask.size(),
16622 SliceSize, P));
16623 return all_of(SubMask, [](int Idx) {
16624 return Idx == PoisonMaskElem;
16625 });
16626 })) &&
16627 "Expected first part or all previous parts masked.");
16628 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
16629 } else {
16630 unsigned NewVF =
16631 cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
16632 if (Vec->getType() != SubVec->getType()) {
16633 unsigned SubVecVF =
16634 cast<FixedVectorType>(Val: SubVec->getType())->getNumElements();
16635 NewVF = std::max(a: NewVF, b: SubVecVF);
16636 }
16637 // Adjust SubMask.
16638 for (int &Idx : SubMask)
16639 if (Idx != PoisonMaskElem)
16640 Idx += NewVF;
16641 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
16642 Vec = createShuffle(V1: Vec, V2: SubVec, Mask: VecMask);
16643 TransformToIdentity(VecMask);
16644 }
16645 }
16646 copy(Range&: VecMask, Out: Mask.begin());
16647 return Vec;
16648 }
16649 /// Checks if the specified entry \p E needs to be delayed because of its
16650 /// dependency nodes.
16651 std::optional<Value *>
16652 needToDelay(const TreeEntry *E,
16653 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
16654 // No need to delay emission if all deps are ready.
16655 if (all_of(Range&: Deps, P: [](ArrayRef<const TreeEntry *> TEs) {
16656 return all_of(
16657 Range&: TEs, P: [](const TreeEntry *TE) { return TE->VectorizedValue; });
16658 }))
16659 return std::nullopt;
16660 // Postpone gather emission, will be emitted after the end of the
16661 // process to keep correct order.
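    // Return a placeholder value (a load of the widened type from a poison
    // pointer); it stands in for the gather and is expected to be replaced
    // once the delayed entries have been vectorized.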
16662 auto *ResVecTy = getWidenedType(ScalarTy, VF: E->getVectorFactor());
16663 return Builder.CreateAlignedLoad(
16664 Ty: ResVecTy,
16665 Ptr: PoisonValue::get(T: PointerType::getUnqual(C&: ScalarTy->getContext())),
16666 Align: MaybeAlign());
16667 }
16668 /// Reset the builder to handle perfect diamond match.
16669 void resetForSameNode() {
16670 IsFinalized = false;
16671 CommonMask.clear();
16672 InVectors.clear();
16673 }
  /// Adds 2 input vectors (in the form of tree entries) and the mask for their
16675 /// shuffling.
16676 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
16677 Value *V1 = E1.VectorizedValue;
16678 if (V1->getType()->isIntOrIntVectorTy())
16679 V1 = castToScalarTyElem(V: V1, IsSigned: any_of(Range: E1.Scalars, P: [&](Value *V) {
16680 if (isa<PoisonValue>(Val: V))
16681 return false;
16682 return !isKnownNonNegative(
16683 V, SQ: SimplifyQuery(*R.DL));
16684 }));
16685 Value *V2 = E2.VectorizedValue;
16686 if (V2->getType()->isIntOrIntVectorTy())
16687 V2 = castToScalarTyElem(V: V2, IsSigned: any_of(Range: E2.Scalars, P: [&](Value *V) {
16688 if (isa<PoisonValue>(Val: V))
16689 return false;
16690 return !isKnownNonNegative(
16691 V, SQ: SimplifyQuery(*R.DL));
16692 }));
16693 add(V1, V2, Mask);
16694 }
  /// Adds a single input vector (in the form of a tree entry) and the mask
  /// for its shuffling.
16697 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
16698 Value *V1 = E1.VectorizedValue;
16699 if (V1->getType()->isIntOrIntVectorTy())
16700 V1 = castToScalarTyElem(V: V1, IsSigned: any_of(Range: E1.Scalars, P: [&](Value *V) {
16701 if (isa<PoisonValue>(Val: V))
16702 return false;
16703 return !isKnownNonNegative(
16704 V, SQ: SimplifyQuery(*R.DL));
16705 }));
16706 add(V1, Mask);
16707 }
16708 /// Adds 2 input vectors and the mask for their shuffling.
16709 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
16710 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
16711 assert(isa<FixedVectorType>(V1->getType()) &&
16712 isa<FixedVectorType>(V2->getType()) &&
16713 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
16714 V1 = castToScalarTyElem(V: V1);
16715 V2 = castToScalarTyElem(V: V2);
16716 if (InVectors.empty()) {
16717 InVectors.push_back(Elt: V1);
16718 InVectors.push_back(Elt: V2);
16719 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
16720 return;
16721 }
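    // There are already accumulated operands: fold them into a single vector
    // (normalizing its width to the mask size if needed), shuffle the new pair
    // together and keep it as the second operand, pointing the affected lanes
    // of CommonMask at it.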
16722 Value *Vec = InVectors.front();
16723 if (InVectors.size() == 2) {
16724 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
16725 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
16726 } else if (cast<FixedVectorType>(Val: Vec->getType())->getNumElements() !=
16727 Mask.size()) {
16728 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
16729 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
16730 }
16731 V1 = createShuffle(V1, V2, Mask);
16732 unsigned VF = std::max(a: getVF(V: V1), b: getVF(V: Vec));
16733 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
16734 if (Mask[Idx] != PoisonMaskElem)
16735 CommonMask[Idx] = Idx + VF;
16736 InVectors.front() = Vec;
16737 if (InVectors.size() == 2)
16738 InVectors.back() = V1;
16739 else
16740 InVectors.push_back(Elt: V1);
16741 }
  /// Adds one more input vector and the mask for its shuffling.
16743 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
16744 assert(isa<FixedVectorType>(V1->getType()) &&
16745 "castToScalarTyElem expects V1 to be FixedVectorType");
16746 V1 = castToScalarTyElem(V: V1);
16747 if (InVectors.empty()) {
16748 InVectors.push_back(Elt: V1);
16749 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
16750 return;
16751 }
16752 const auto *It = find(Range&: InVectors, Val: V1);
16753 if (It == InVectors.end()) {
16754 if (InVectors.size() == 2 ||
16755 InVectors.front()->getType() != V1->getType()) {
16756 Value *V = InVectors.front();
16757 if (InVectors.size() == 2) {
16758 V = createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
16759 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
16760 } else if (cast<FixedVectorType>(Val: V->getType())->getNumElements() !=
16761 CommonMask.size()) {
16762 V = createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
16763 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
16764 }
16765 unsigned VF = std::max(a: CommonMask.size(), b: Mask.size());
16766 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
16767 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
16768 CommonMask[Idx] = V->getType() != V1->getType()
16769 ? Idx + VF
16770 : Mask[Idx] + getVF(V: V1);
16771 if (V->getType() != V1->getType())
16772 V1 = createShuffle(V1, V2: nullptr, Mask);
16773 InVectors.front() = V;
16774 if (InVectors.size() == 2)
16775 InVectors.back() = V1;
16776 else
16777 InVectors.push_back(Elt: V1);
16778 return;
16779 }
16780 // Check if second vector is required if the used elements are already
16781 // used from the first one.
16782 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
16783 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
16784 InVectors.push_back(Elt: V1);
16785 break;
16786 }
16787 }
16788 unsigned VF = 0;
16789 for (Value *V : InVectors)
16790 VF = std::max(a: VF, b: getVF(V));
16791 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
16792 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
16793 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
16794 }
  /// Adds one more input vector and the mask for its shuffling.
16796 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
16797 SmallVector<int> NewMask;
16798 inversePermutation(Indices: Order, Mask&: NewMask);
16799 add(V1, Mask: NewMask);
16800 }
16801 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
16802 Value *Root = nullptr) {
16803 return R.gather(VL, Root, ScalarTy,
16804 CreateShuffle: [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
16805 return createShuffle(V1, V2, Mask);
16806 });
16807 }
16808 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
16809 /// Finalize emission of the shuffles.
  /// \param Action the action (if any) to be performed before the final
  /// application of the \p ExtMask mask.
16812 Value *finalize(
16813 ArrayRef<int> ExtMask,
16814 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
16815 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
16816 function_ref<void(Value *&, SmallVectorImpl<int> &,
16817 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
16818 Action = {}) {
16819 IsFinalized = true;
16820 if (Action) {
16821 Value *Vec = InVectors.front();
16822 if (InVectors.size() == 2) {
16823 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
16824 InVectors.pop_back();
16825 } else {
16826 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
16827 }
16828 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
16829 assert(VF > 0 &&
16830 "Expected vector length for the final value before action.");
16831 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
16832 if (VecVF < VF) {
16833 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16834 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
16835 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: ResizeMask);
16836 }
16837 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
16838 return createShuffle(V1, V2, Mask);
16839 });
16840 InVectors.front() = Vec;
16841 }
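    // Insert the requested subvectors into the partially built vector and
    // update the masks so later shuffles reference the combined result.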
16842 if (!SubVectors.empty()) {
16843 Value *Vec = InVectors.front();
16844 if (InVectors.size() == 2) {
16845 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
16846 InVectors.pop_back();
16847 } else {
16848 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
16849 }
16850 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
16851 auto CreateSubVectors = [&](Value *Vec,
16852 SmallVectorImpl<int> &CommonMask) {
16853 for (auto [E, Idx] : SubVectors) {
16854 Value *V = E->VectorizedValue;
16855 if (V->getType()->isIntOrIntVectorTy())
16856 V = castToScalarTyElem(V, IsSigned: any_of(Range: E->Scalars, P: [&](Value *V) {
16857 if (isa<PoisonValue>(Val: V))
16858 return false;
16859 return !isKnownNonNegative(
16860 V, SQ: SimplifyQuery(*R.DL));
16861 }));
16862 unsigned InsertionIndex = Idx * getNumElements(Ty: ScalarTy);
16863 Vec = createInsertVector(
16864 Builder, Vec, V, Index: InsertionIndex,
16865 Generator: std::bind(f: &ShuffleInstructionBuilder::createShuffle, args: this, args: _1, args: _2,
16866 args: _3));
16867 if (!CommonMask.empty()) {
16868 std::iota(first: std::next(x: CommonMask.begin(), n: Idx),
16869 last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()),
16870 value: Idx);
16871 }
16872 }
16873 return Vec;
16874 };
16875 if (SubVectorsMask.empty()) {
16876 Vec = CreateSubVectors(Vec, CommonMask);
16877 } else {
16878 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
16879 copy(Range&: SubVectorsMask, Out: SVMask.begin());
16880 for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) {
16881 if (I2 != PoisonMaskElem) {
16882 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
16883 I1 = I2 + CommonMask.size();
16884 }
16885 }
16886 Value *InsertVec =
16887 CreateSubVectors(PoisonValue::get(T: Vec->getType()), CommonMask);
16888 Vec = createShuffle(V1: InsertVec, V2: Vec, Mask: SVMask);
16889 transformMaskAfterShuffle(CommonMask, Mask: SVMask);
16890 }
16891 InVectors.front() = Vec;
16892 }
16893
16894 if (!ExtMask.empty()) {
16895 if (CommonMask.empty()) {
16896 CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
16897 } else {
16898 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
16899 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
16900 if (ExtMask[I] == PoisonMaskElem)
16901 continue;
16902 NewMask[I] = CommonMask[ExtMask[I]];
16903 }
16904 CommonMask.swap(RHS&: NewMask);
16905 }
16906 }
16907 if (CommonMask.empty()) {
16908 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
16909 return InVectors.front();
16910 }
16911 if (InVectors.size() == 2)
16912 return createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
16913 return createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
16914 }
16915
16916 ~ShuffleInstructionBuilder() {
16917 assert((IsFinalized || CommonMask.empty()) &&
16918 "Shuffle construction must be finalized.");
16919 }
16920};
16921
16922Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
16923 return vectorizeTree(E: getOperandEntry(E, Idx: NodeIdx));
16924}
16925
16926template <typename BVTy, typename ResTy, typename... Args>
16927ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
16928 Args &...Params) {
16929 assert(E->isGather() && "Expected gather node.");
16930 unsigned VF = E->getVectorFactor();
16931
16932 bool NeedFreeze = false;
16933 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
16934 // Clear values, to be replaced by insertvector instructions.
16935 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
16936 for_each(MutableArrayRef(GatheredScalars)
16937 .slice(N: Idx, M: VectorizableTree[EIdx]->getVectorFactor()),
16938 [&](Value *&V) { V = PoisonValue::get(T: V->getType()); });
16939 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
16940 E->CombinedEntriesWithIndices.size());
16941 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
16942 [&](const auto &P) {
16943 return std::make_pair(VectorizableTree[P.first].get(), P.second);
16944 });
16945 // Build a mask out of the reorder indices and reorder scalars per this
16946 // mask.
16947 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
16948 E->ReorderIndices.end());
16949 if (!ReorderMask.empty())
16950 reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
16951 SmallVector<int> SubVectorsMask;
16952 inversePermutation(Indices: E->ReorderIndices, Mask&: SubVectorsMask);
16953 // Transform non-clustered elements in the mask to poison (-1).
16954 // "Clustered" operations will be reordered using this mask later.
16955 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
16956 for (unsigned I : seq<unsigned>(Size: GatheredScalars.size()))
16957 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
16958 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
16959 } else {
16960 SubVectorsMask.clear();
16961 }
16962 SmallVector<Value *> StoredGS(GatheredScalars);
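  // Checks whether the gathered scalars are a splat (containing non-poison
  // undefs) that can be served by reusing lanes of an already available input
  // vector of width InputVF; if so, rewrites the I-th slice of Mask in place
  // and returns true so the caller can treat the input as used in the
  // expression.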
16963 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
16964 unsigned I, unsigned SliceSize,
16965 bool IsNotPoisonous) {
16966 if (!isSplat(VL: E->Scalars) || none_of(E->Scalars, [](Value *V) {
16967 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
16968 }))
16969 return false;
16970 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
16971 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
16972 if (UserTE->getNumOperands() != 2)
16973 return false;
16974 if (!IsNotPoisonous) {
16975 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(N: UserTE->Idx + 1),
16976 [=](const std::unique_ptr<TreeEntry> &TE) {
16977 return TE->UserTreeIndex.UserTE == UserTE &&
16978 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
16979 });
16980 if (It == VectorizableTree.end())
16981 return false;
16982 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
16983 if (!(*It)->ReorderIndices.empty()) {
16984 inversePermutation((*It)->ReorderIndices, ReorderMask);
16985 reorderScalars(Scalars&: GS, Mask: ReorderMask);
16986 }
16987 if (!all_of(zip(t&: GatheredScalars, u&: GS), [&](const auto &P) {
16988 Value *V0 = std::get<0>(P);
16989 Value *V1 = std::get<1>(P);
16990 return !isa<UndefValue>(Val: V0) || isa<PoisonValue>(Val: V0) ||
16991 (isa<UndefValue>(Val: V0) && !isa<PoisonValue>(Val: V0) &&
16992 is_contained(Range: E->Scalars, Element: V1));
16993 }))
16994 return false;
16995 }
16996 int Idx;
16997 if ((Mask.size() < InputVF &&
16998 ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: InputVF, Index&: Idx) &&
16999 Idx == 0) ||
17000 (Mask.size() == InputVF &&
17001 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))) {
17002 std::iota(
17003 first: std::next(x: Mask.begin(), n: I * SliceSize),
17004 last: std::next(x: Mask.begin(),
17005 n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
17006 value: 0);
17007 } else {
17008 unsigned IVal =
17009 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
17010 std::fill(
17011 first: std::next(x: Mask.begin(), n: I * SliceSize),
17012 last: std::next(x: Mask.begin(),
17013 n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
17014 value: IVal);
17015 }
17016 return true;
17017 };
17018 BVTy ShuffleBuilder(ScalarTy, Params...);
17019 ResTy Res = ResTy();
17020 SmallVector<int> Mask;
17021 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
17022 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
17023 Value *ExtractVecBase = nullptr;
17024 bool UseVecBaseAsInput = false;
17025 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
17026 SmallVector<SmallVector<const TreeEntry *>> Entries;
17027 Type *OrigScalarTy = GatheredScalars.front()->getType();
17028 auto *VecTy = getWidenedType(ScalarTy, VF: GatheredScalars.size());
17029 unsigned NumParts = ::getNumberOfParts(TTI: *TTI, VecTy, Limit: GatheredScalars.size());
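  // If the gather contains defined values, try to reuse existing
  // extractelements and/or previously vectorized tree entries instead of
  // building the vector purely from scratch.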
17030 if (!all_of(Range&: GatheredScalars, P: IsaPred<UndefValue>)) {
17031 // Check for gathered extracts.
17032 bool Resized = false;
17033 ExtractShuffles =
17034 tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
17035 if (!ExtractShuffles.empty()) {
17036 SmallVector<const TreeEntry *> ExtractEntries;
17037 for (auto [Idx, I] : enumerate(First&: ExtractMask)) {
17038 if (I == PoisonMaskElem)
17039 continue;
17040 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
17041 V: cast<ExtractElementInst>(Val: StoredGS[Idx])->getVectorOperand());
17042 !TEs.empty())
17043 ExtractEntries.append(in_start: TEs.begin(), in_end: TEs.end());
17044 }
17045 if (std::optional<ResTy> Delayed =
17046 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
17047 // Delay emission of gathers which are not ready yet.
17048 PostponedGathers.insert(X: E);
17049 // Postpone gather emission, will be emitted after the end of the
17050 // process to keep correct order.
17051 return *Delayed;
17052 }
17053 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
17054 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
17055 ExtractVecBase = VecBase;
17056 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(Val: VecBase->getType()))
17057 if (VF == VecBaseTy->getNumElements() &&
17058 GatheredScalars.size() != VF) {
17059 Resized = true;
17060 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
17061 Elt: PoisonValue::get(T: OrigScalarTy));
17062 NumParts =
17063 ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF), Limit: VF);
17064 }
17065 }
17066 }
    // Gather extracts after we check for fully matched gathers only.
17068 if (!ExtractShuffles.empty() || !E->hasState() ||
17069 E->getOpcode() != Instruction::Load ||
17070 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
17071 any_of(Range: E->Scalars, P: IsaPred<LoadInst>)) &&
17072 any_of(E->Scalars,
17073 [this](Value *V) {
17074 return isa<LoadInst>(Val: V) && isVectorized(V);
17075 })) ||
17076 (E->hasState() && E->isAltShuffle()) ||
17077 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
17078 isSplat(VL: E->Scalars) ||
17079 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
17080 GatherShuffles =
17081 isGatherShuffledEntry(TE: E, VL: GatheredScalars, Mask, Entries, NumParts);
17082 }
17083 if (!GatherShuffles.empty()) {
17084 if (std::optional<ResTy> Delayed =
17085 ShuffleBuilder.needToDelay(E, Entries)) {
17086 // Delay emission of gathers which are not ready yet.
17087 PostponedGathers.insert(X: E);
17088 // Postpone gather emission, will be emitted after the end of the
17089 // process to keep correct order.
17090 return *Delayed;
17091 }
17092 if (GatherShuffles.size() == 1 &&
17093 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
17094 Entries.front().front()->isSame(VL: E->Scalars)) {
17095 // Perfect match in the graph, will reuse the previously vectorized
17096 // node. Cost is 0.
17097 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
17098 << shortBundleName(E->Scalars, E->Idx) << ".\n");
17099 // Restore the mask for previous partially matched values.
17100 Mask.resize(N: E->Scalars.size());
17101 const TreeEntry *FrontTE = Entries.front().front();
17102 if (FrontTE->ReorderIndices.empty() &&
17103 ((FrontTE->ReuseShuffleIndices.empty() &&
17104 E->Scalars.size() == FrontTE->Scalars.size()) ||
17105 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
17106 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
17107 } else {
17108 for (auto [I, V] : enumerate(First: E->Scalars)) {
17109 if (isa<PoisonValue>(Val: V)) {
17110 Mask[I] = PoisonMaskElem;
17111 continue;
17112 }
17113 Mask[I] = FrontTE->findLaneForValue(V);
17114 }
17115 }
17116 // Reset the builder(s) to correctly handle perfect diamond matched
17117 // nodes.
17118 ShuffleBuilder.resetForSameNode();
17119 ShuffleBuilder.add(*FrontTE, Mask);
17120 // Full matched entry found, no need to insert subvectors.
17121 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
17122 return Res;
17123 }
17124 if (!Resized) {
17125 if (GatheredScalars.size() != VF &&
17126 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
17127 return any_of(TEs, [&](const TreeEntry *TE) {
17128 return TE->getVectorFactor() == VF;
17129 });
17130 }))
17131 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
17132 Elt: PoisonValue::get(T: OrigScalarTy));
17133 }
17134 // Remove shuffled elements from list of gathers.
17135 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
17136 if (Mask[I] != PoisonMaskElem)
17137 GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
17138 }
17139 }
17140 }
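  // Packs the given scalars for a single gather: deduplicates repeated values,
  // detects splats (so a broadcast can be emitted), handles undef lanes
  // (possibly requiring a trailing freeze) and fills ReuseMask so the original
  // lane order can be restored with a shuffle.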
17141 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
17142 SmallVectorImpl<int> &ReuseMask,
17143 bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
17146 bool IsSplat = IsRootPoison && isSplat(VL: Scalars) &&
17147 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
17148 Scalars.append(NumInputs: VF - Scalars.size(), Elt: PoisonValue::get(T: OrigScalarTy));
17149 SmallVector<int> UndefPos;
17150 DenseMap<Value *, unsigned> UniquePositions;
17151 // Gather unique non-const values and all constant values.
17152 // For repeated values, just shuffle them.
17153 int NumNonConsts = 0;
17154 int SinglePos = 0;
17155 for (auto [I, V] : enumerate(First&: Scalars)) {
17156 if (isa<UndefValue>(Val: V)) {
17157 if (!isa<PoisonValue>(Val: V)) {
17158 ReuseMask[I] = I;
17159 UndefPos.push_back(Elt: I);
17160 }
17161 continue;
17162 }
17163 if (isConstant(V)) {
17164 ReuseMask[I] = I;
17165 continue;
17166 }
17167 ++NumNonConsts;
17168 SinglePos = I;
17169 Value *OrigV = V;
17170 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
17171 if (IsSplat) {
17172 Scalars.front() = OrigV;
17173 ReuseMask[I] = 0;
17174 } else {
17175 const auto Res = UniquePositions.try_emplace(Key: OrigV, Args&: I);
17176 Scalars[Res.first->second] = OrigV;
17177 ReuseMask[I] = Res.first->second;
17178 }
17179 }
17180 if (NumNonConsts == 1) {
17181 // Restore single insert element.
17182 if (IsSplat) {
17183 ReuseMask.assign(NumElts: VF, Elt: PoisonMaskElem);
17184 std::swap(a&: Scalars.front(), b&: Scalars[SinglePos]);
17185 if (!UndefPos.empty() && UndefPos.front() == 0)
17186 Scalars.front() = UndefValue::get(T: OrigScalarTy);
17187 }
17188 ReuseMask[SinglePos] = SinglePos;
17189 } else if (!UndefPos.empty() && IsSplat) {
17190 // For undef values, try to replace them with the simple broadcast.
17191 // We can do it if the broadcasted value is guaranteed to be
17192 // non-poisonous, or by freezing the incoming scalar value first.
17193 auto *It = find_if(Scalars, [this, E](Value *V) {
17194 return !isa<UndefValue>(Val: V) &&
17195 (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
17196 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
17197 // Check if the value already used in the same operation in
17198 // one of the nodes already.
17199 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
17200 is_contained(Range&: E->UserTreeIndex.UserTE->Scalars,
17201 Element: U.getUser());
17202 })));
17203 });
17204 if (It != Scalars.end()) {
17205 // Replace undefs by the non-poisoned scalars and emit broadcast.
17206 int Pos = std::distance(Scalars.begin(), It);
17207 for (int I : UndefPos) {
17208 // Set the undef position to the non-poisoned scalar.
17209 ReuseMask[I] = Pos;
          // Replace the undef with poison; in the mask it is already replaced
          // by the non-poisoned scalar.
17212 if (I != Pos)
17213 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
17214 }
17215 } else {
        // Replace undefs with poison, emit the broadcast and then emit a
        // freeze.
17218 for (int I : UndefPos) {
17219 ReuseMask[I] = PoisonMaskElem;
17220 if (isa<UndefValue>(Val: Scalars[I]))
17221 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
17222 }
17223 NeedFreeze = true;
17224 }
17225 }
17226 };
17227 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
17228 bool IsNonPoisoned = true;
17229 bool IsUsedInExpr = true;
17230 Value *Vec1 = nullptr;
17231 if (!ExtractShuffles.empty()) {
      // A gather of extractelements can be represented as just a shuffle of
      // the one or two vectors the scalars are extracted from.
      // Find the input vectors.
17235 Value *Vec2 = nullptr;
17236 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
17237 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
17238 ExtractMask[I] = PoisonMaskElem;
17239 }
17240 if (UseVecBaseAsInput) {
17241 Vec1 = ExtractVecBase;
17242 } else {
17243 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
17244 if (ExtractMask[I] == PoisonMaskElem)
17245 continue;
17246 if (isa<UndefValue>(Val: StoredGS[I]))
17247 continue;
17248 auto *EI = cast<ExtractElementInst>(Val: StoredGS[I]);
17249 Value *VecOp = EI->getVectorOperand();
17250 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: VecOp);
17251 !TEs.empty() && TEs.front()->VectorizedValue)
17252 VecOp = TEs.front()->VectorizedValue;
17253 if (!Vec1) {
17254 Vec1 = VecOp;
17255 } else if (Vec1 != VecOp) {
17256 assert((!Vec2 || Vec2 == VecOp) &&
17257 "Expected only 1 or 2 vectors shuffle.");
17258 Vec2 = VecOp;
17259 }
17260 }
17261 }
17262 if (Vec2) {
17263 IsUsedInExpr = false;
17264 IsNonPoisoned &= isGuaranteedNotToBePoison(V: Vec1, AC) &&
17265 isGuaranteedNotToBePoison(V: Vec2, AC);
17266 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
17267 } else if (Vec1) {
17268 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(V: Vec1, AC);
17269 IsUsedInExpr &= FindReusedSplat(
17270 ExtractMask,
17271 cast<FixedVectorType>(Val: Vec1->getType())->getNumElements(), 0,
17272 ExtractMask.size(), IsNotPoisonedVec);
17273 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
17274 IsNonPoisoned &= IsNotPoisonedVec;
17275 } else {
17276 IsUsedInExpr = false;
17277 ShuffleBuilder.add(PoisonValue::get(T: VecTy), ExtractMask,
17278 /*ForExtracts=*/true);
17279 }
17280 }
17281 if (!GatherShuffles.empty()) {
17282 unsigned SliceSize =
17283 getPartNumElems(Size: E->Scalars.size(),
17284 NumParts: ::getNumberOfParts(TTI: *TTI, VecTy, Limit: E->Scalars.size()));
17285 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
17286 for (const auto [I, TEs] : enumerate(First&: Entries)) {
17287 if (TEs.empty()) {
17288 assert(!GatherShuffles[I] &&
17289 "No shuffles with empty entries list expected.");
17290 continue;
17291 }
17292 assert((TEs.size() == 1 || TEs.size() == 2) &&
17293 "Expected shuffle of 1 or 2 entries.");
17294 unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I);
17295 auto SubMask = ArrayRef(Mask).slice(N: I * SliceSize, M: Limit);
17296 VecMask.assign(NumElts: VecMask.size(), Elt: PoisonMaskElem);
17297 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: I * SliceSize));
17298 if (TEs.size() == 1) {
17299 bool IsNotPoisonedVec =
17300 TEs.front()->VectorizedValue
17301 ? isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC)
17302 : true;
17303 IsUsedInExpr &=
17304 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
17305 SliceSize, IsNotPoisonedVec);
17306 ShuffleBuilder.add(*TEs.front(), VecMask);
17307 IsNonPoisoned &= IsNotPoisonedVec;
17308 } else {
17309 IsUsedInExpr = false;
17310 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
17311 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
17312 IsNonPoisoned &=
17313 isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC) &&
17314 isGuaranteedNotToBePoison(V: TEs.back()->VectorizedValue, AC);
17315 }
17316 }
17317 }
    // Try to figure out the best way to combine the values: build a shuffle
    // and insert elements, or just build several shuffles.
    // Insert non-constant scalars.
17321 SmallVector<Value *> NonConstants(GatheredScalars);
17322 int EMSz = ExtractMask.size();
17323 int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if we currently
    // have a single permutation and more than 1 scalar constant.
17326 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
17327 bool IsIdentityShuffle =
17328 ((UseVecBaseAsInput ||
17329 all_of(ExtractShuffles,
17330 [](const std::optional<TTI::ShuffleKind> &SK) {
17331 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
17332 TTI::SK_PermuteSingleSrc;
17333 })) &&
17334 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
17335 ShuffleVectorInst::isIdentityMask(Mask: ExtractMask, NumSrcElts: EMSz)) ||
17336 (!GatherShuffles.empty() &&
17337 all_of(GatherShuffles,
17338 [](const std::optional<TTI::ShuffleKind> &SK) {
17339 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
17340 TTI::SK_PermuteSingleSrc;
17341 }) &&
17342 none_of(Mask, [&](int I) { return I >= MSz; }) &&
17343 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: MSz));
17344 bool EnoughConstsForShuffle =
17345 IsSingleShuffle &&
17346 (none_of(GatheredScalars,
17347 [](Value *V) {
17348 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
17349 }) ||
17350 any_of(GatheredScalars,
17351 [](Value *V) {
17352 return isa<Constant>(Val: V) && !isa<UndefValue>(Val: V);
17353 })) &&
17354 (!IsIdentityShuffle ||
17355 (GatheredScalars.size() == 2 &&
17356 any_of(GatheredScalars,
17357 [](Value *V) { return !isa<UndefValue>(Val: V); })) ||
17358 count_if(GatheredScalars, [](Value *V) {
17359 return isa<Constant>(Val: V) && !isa<PoisonValue>(Val: V);
17360 }) > 1);
    // The NonConstants array contains just the non-constant values, while
    // GatheredScalars contains only constants, used to build the final vector
    // and then shuffle.
17363 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
17364 if (EnoughConstsForShuffle && isa<Constant>(Val: GatheredScalars[I]))
17365 NonConstants[I] = PoisonValue::get(T: OrigScalarTy);
17366 else
17367 GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
17368 }
17369 // Generate constants for final shuffle and build a mask for them.
17370 if (!all_of(Range&: GatheredScalars, P: IsaPred<PoisonValue>)) {
17371 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
17372 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
17373 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
17374 ShuffleBuilder.add(BV, BVMask);
17375 }
17376 if (all_of(NonConstants, [=](Value *V) {
17377 return isa<PoisonValue>(Val: V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(Val: V));
17380 }))
17381 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
17382 SubVectorsMask);
17383 else
17384 Res = ShuffleBuilder.finalize(
17385 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
17386 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
17387 bool IsSplat = isSplat(VL: NonConstants);
17388 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
17389 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
17390 auto CheckIfSplatIsProfitable = [&]() {
17391 // Estimate the cost of splatting + shuffle and compare with
17392 // insert + shuffle.
17393 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17394 Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>);
17395 if (isa<ExtractElementInst>(Val: V) || isVectorized(V))
17396 return false;
17397 InstructionCost SplatCost = TTI->getVectorInstrCost(
17398 Opcode: Instruction::InsertElement, Val: VecTy, CostKind, /*Index=*/0,
17399 Op0: PoisonValue::get(T: VecTy), Op1: V);
17400 SmallVector<int> NewMask(Mask.begin(), Mask.end());
17401 for (auto [Idx, I] : enumerate(First&: BVMask))
17402 if (I != PoisonMaskElem)
17403 NewMask[Idx] = Mask.size();
17404 SplatCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy,
17405 Mask: NewMask, CostKind);
17406 InstructionCost BVCost = TTI->getVectorInstrCost(
17407 Instruction::InsertElement, VecTy, CostKind,
17408 *find_if(Mask, [](int I) { return I != PoisonMaskElem; }),
17409 Vec, V);
17410 // Shuffle required?
17411 if (count(Range&: BVMask, Element: PoisonMaskElem) <
17412 static_cast<int>(BVMask.size() - 1)) {
17413 SmallVector<int> NewMask(Mask.begin(), Mask.end());
17414 for (auto [Idx, I] : enumerate(First&: BVMask))
17415 if (I != PoisonMaskElem)
17416 NewMask[Idx] = I;
17417 BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
17418 Tp: VecTy, Mask: NewMask, CostKind);
17419 }
17420 return SplatCost <= BVCost;
17421 };
17422 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
17423 for (auto [Idx, I] : enumerate(First&: BVMask))
17424 if (I != PoisonMaskElem)
17425 Mask[Idx] = I;
17426 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
17427 } else {
17428 Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>);
17429 SmallVector<Value *> Values(NonConstants.size(),
17430 PoisonValue::get(T: ScalarTy));
17431 Values[0] = V;
17432 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
17433 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
17434 transform(BVMask, SplatMask.begin(), [](int I) {
17435 return I == PoisonMaskElem ? PoisonMaskElem : 0;
17436 });
17437 if (!ShuffleVectorInst::isIdentityMask(Mask: SplatMask, NumSrcElts: VF))
17438 BV = CreateShuffle(BV, nullptr, SplatMask);
17439 for (auto [Idx, I] : enumerate(First&: BVMask))
17440 if (I != PoisonMaskElem)
17441 Mask[Idx] = BVMask.size() + Idx;
17442 Vec = CreateShuffle(Vec, BV, Mask);
17443 for (auto [Idx, I] : enumerate(First&: Mask))
17444 if (I != PoisonMaskElem)
17445 Mask[Idx] = Idx;
17446 }
17447 });
17448 } else if (!allConstant(VL: GatheredScalars)) {
17449 // Gather unique scalars and all constants.
17450 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
17451 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
17452 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
17453 ShuffleBuilder.add(BV, ReuseMask);
17454 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
17455 SubVectorsMask);
17456 } else {
17457 // Gather all constants.
17458 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
17459 for (auto [I, V] : enumerate(First&: GatheredScalars)) {
17460 if (!isa<PoisonValue>(Val: V))
17461 Mask[I] = I;
17462 }
17463 Value *BV = ShuffleBuilder.gather(GatheredScalars);
17464 ShuffleBuilder.add(BV, Mask);
17465 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
17466 SubVectorsMask);
17467 }
17468
17469 if (NeedFreeze)
17470 Res = ShuffleBuilder.createFreeze(Res);
17471 return Res;
17472}
17473
17474Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
17475 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
17476 (void)vectorizeTree(E: VectorizableTree[EIdx].get());
17477 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
17478 Params&: Builder, Params&: *this);
17479}
17480
/// \returns \p Inst after propagating metadata from \p VL, only for
/// instructions in \p VL.
17483static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
17484 SmallVector<Value *> Insts;
17485 for (Value *V : VL)
17486 if (isa<Instruction>(Val: V))
17487 Insts.push_back(Elt: V);
17488 return llvm::propagateMetadata(I: Inst, VL: Insts);
17489}
17490
17491static DebugLoc getDebugLocFromPHI(PHINode &PN) {
17492 if (DebugLoc DL = PN.getDebugLoc())
17493 return DL;
17494 return DebugLoc::getUnknown();
17495}
17496
17497Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
17498 IRBuilderBase::InsertPointGuard Guard(Builder);
17499
17500 Value *V = E->Scalars.front();
17501 Type *ScalarTy = V->getType();
17502 if (!isa<CmpInst>(Val: V))
17503 ScalarTy = getValueType(V);
17504 auto It = MinBWs.find(Val: E);
17505 if (It != MinBWs.end()) {
17506 auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy);
17507 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
17508 if (VecTy)
17509 ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements());
17510 }
17511 if (E->VectorizedValue)
17512 return E->VectorizedValue;
17513 auto *VecTy = getWidenedType(ScalarTy, VF: E->Scalars.size());
17514 if (E->isGather()) {
17515 // Set insert point for non-reduction initial nodes.
17516 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
17517 setInsertPointAfterBundle(E);
17518 Value *Vec = createBuildVector(E, ScalarTy);
17519 E->VectorizedValue = Vec;
17520 return Vec;
17521 }
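  // Split nodes are vectorized as two independent parts, which are then
  // recombined: either by inserting the second part into a widened copy of the
  // first one (no reordering) or by shuffling both parts with the split mask.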
17522 if (E->State == TreeEntry::SplitVectorize) {
17523 assert(E->CombinedEntriesWithIndices.size() == 2 &&
17524 "Expected exactly 2 combined entries.");
17525 setInsertPointAfterBundle(E);
17526 TreeEntry &OpTE1 =
17527 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
17528 assert(OpTE1.isSame(
17529 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
17530 "Expected same first part of scalars.");
17531 Value *Op1 = vectorizeTree(E: &OpTE1);
17532 TreeEntry &OpTE2 =
17533 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
17534 assert(
17535 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
17536 "Expected same second part of scalars.");
17537 Value *Op2 = vectorizeTree(E: &OpTE2);
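    // Returns true if the vectorized operand must be sign-extended when cast
    // to the minimized bitwidth type: use the recorded signedness if available,
    // otherwise check whether any non-poison scalar may be negative.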
17538 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
17539 bool IsSigned = false;
17540 auto It = MinBWs.find(Val: OpE);
17541 if (It != MinBWs.end())
17542 IsSigned = It->second.second;
17543 else
17544 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
        if (isa<PoisonValue>(Val: R))
17546 return false;
17547 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
17548 });
17549 return IsSigned;
17550 };
17551 if (cast<VectorType>(Val: Op1->getType())->getElementType() !=
17552 ScalarTy->getScalarType()) {
17553 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
17554 Op1 = Builder.CreateIntCast(
17555 V: Op1,
17556 DestTy: getWidenedType(
17557 ScalarTy,
17558 VF: cast<FixedVectorType>(Val: Op1->getType())->getNumElements()),
17559 isSigned: GetOperandSignedness(&OpTE1));
17560 }
17561 if (cast<VectorType>(Val: Op2->getType())->getElementType() !=
17562 ScalarTy->getScalarType()) {
17563 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
17564 Op2 = Builder.CreateIntCast(
17565 V: Op2,
17566 DestTy: getWidenedType(
17567 ScalarTy,
17568 VF: cast<FixedVectorType>(Val: Op2->getType())->getNumElements()),
17569 isSigned: GetOperandSignedness(&OpTE2));
17570 }
17571 if (E->ReorderIndices.empty()) {
17572 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
17573 std::iota(
17574 first: Mask.begin(),
17575 last: std::next(x: Mask.begin(), n: E->CombinedEntriesWithIndices.back().second),
17576 value: 0);
17577 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
17578 if (ScalarTyNumElements != 1) {
17579 assert(SLPReVec && "Only supported by REVEC.");
17580 transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask);
17581 }
17582 Value *Vec = Builder.CreateShuffleVector(V: Op1, Mask);
17583 Vec = createInsertVector(Builder, Vec, V: Op2,
17584 Index: E->CombinedEntriesWithIndices.back().second *
17585 ScalarTyNumElements);
17586 E->VectorizedValue = Vec;
17587 return Vec;
17588 }
17589 unsigned CommonVF =
17590 std::max(a: OpTE1.getVectorFactor(), b: OpTE2.getVectorFactor());
17591 if (getNumElements(Ty: Op1->getType()) != CommonVF) {
17592 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
17593 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: OpTE1.getVectorFactor()),
17594 value: 0);
17595 Op1 = Builder.CreateShuffleVector(V: Op1, Mask);
17596 }
17597 if (getNumElements(Ty: Op2->getType()) != CommonVF) {
17598 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
17599 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: OpTE2.getVectorFactor()),
17600 value: 0);
17601 Op2 = Builder.CreateShuffleVector(V: Op2, Mask);
17602 }
17603 Value *Vec = Builder.CreateShuffleVector(V1: Op1, V2: Op2, Mask: E->getSplitMask());
17604 E->VectorizedValue = Vec;
17605 return Vec;
17606 }
17607
17608 bool IsReverseOrder =
17609 !E->ReorderIndices.empty() && isReverseOrder(Order: E->ReorderIndices);
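// FinalShuffle applies the node's reordering and reuse shuffles (and any
// combined sub-vector entries) to the just-built vector before it is
// recorded as the vectorized value.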
17610 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
17611 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
17612 if (E->getOpcode() == Instruction::Store &&
17613 E->State == TreeEntry::Vectorize) {
17614 ArrayRef<int> Mask =
17615 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
17616 E->ReorderIndices.size());
17617 ShuffleBuilder.add(V1: V, Mask);
17618 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
17619 E->State == TreeEntry::CompressVectorize) {
17620 ShuffleBuilder.addOrdered(V1: V, Order: {});
17621 } else {
17622 ShuffleBuilder.addOrdered(V1: V, Order: E->ReorderIndices);
17623 }
17624 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
17625 E->CombinedEntriesWithIndices.size());
17626 transform(
17627 Range: E->CombinedEntriesWithIndices, d_first: SubVectors.begin(), F: [&](const auto &P) {
17628 return std::make_pair(VectorizableTree[P.first].get(), P.second);
17629 });
17630 assert(
17631 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
17632 "Expected either combined subnodes or reordering");
17633 return ShuffleBuilder.finalize(ExtMask: E->ReuseShuffleIndices, SubVectors, SubVectorsMask: {});
17634 };
17635
17636 assert(!E->isGather() && "Unhandled state");
17637 unsigned ShuffleOrOp =
17638 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
17639 Instruction *VL0 = E->getMainOp();
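// Determine whether an operand whose bitwidth was minimized has to be
// treated as signed when it is widened back with an int cast.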
17640 auto GetOperandSignedness = [&](unsigned Idx) {
17641 const TreeEntry *OpE = getOperandEntry(E, Idx);
17642 bool IsSigned = false;
17643 auto It = MinBWs.find(Val: OpE);
17644 if (It != MinBWs.end())
17645 IsSigned = It->second.second;
17646 else
17647 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
17648 if (isa<PoisonValue>(Val: R))
17649 return false;
17650 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
17651 });
17652 return IsSigned;
17653 };
17654 switch (ShuffleOrOp) {
17655 case Instruction::PHI: {
17656 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
17657 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
17658 "PHI reordering is free.");
17659 auto *PH = cast<PHINode>(Val: VL0);
17660 Builder.SetInsertPoint(TheBB: PH->getParent(),
17661 IP: PH->getParent()->getFirstNonPHIIt());
17662 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
17663 PHINode *NewPhi = Builder.CreatePHI(Ty: VecTy, NumReservedValues: PH->getNumIncomingValues());
17664 Value *V = NewPhi;
17665
17666 // Adjust the insertion point once all PHIs have been generated.
17667 Builder.SetInsertPoint(TheBB: PH->getParent(),
17668 IP: PH->getParent()->getFirstInsertionPt());
17669 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
17670
17671 V = FinalShuffle(V, E);
17672
17673 E->VectorizedValue = V;
17674 // If the PHI node is fully emitted, exit.
17675 if (NewPhi->getNumIncomingValues() != 0)
17676 return NewPhi;
17677
17678 // PHINodes may have multiple entries from the same block. We want to
17679 // visit every block once.
17680 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
17681
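// Vectorize the incoming values block by block; repeated predecessor
// blocks reuse the vector operand that was already emitted for them.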
17682 for (unsigned I : seq<unsigned>(Size: PH->getNumIncomingValues())) {
17683 BasicBlock *IBB = PH->getIncomingBlock(i: I);
17684
17685 // Stop emission if all incoming values are generated.
17686 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
17687 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
17688 return NewPhi;
17689 }
17690
17691 if (!VisitedBBs.insert(Ptr: IBB).second) {
17692 Value *VecOp = NewPhi->getIncomingValueForBlock(BB: IBB);
17693 NewPhi->addIncoming(V: VecOp, BB: IBB);
17694 TreeEntry *OpTE = getOperandEntry(E, Idx: I);
17695 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
17696 OpTE->VectorizedValue = VecOp;
17697 continue;
17698 }
17699
17700 Builder.SetInsertPoint(IBB->getTerminator());
17701 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
17702 Value *Vec = vectorizeOperand(E, NodeIdx: I);
17703 if (VecTy != Vec->getType()) {
17704 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
17705 MinBWs.contains(getOperandEntry(E, I))) &&
17706 "Expected item in MinBWs.");
17707 Vec = Builder.CreateIntCast(V: Vec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
17708 }
17709 NewPhi->addIncoming(V: Vec, BB: IBB);
17710 }
17711
17712 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
17713 "Invalid number of incoming values");
17714 assert(E->VectorizedValue && "Expected vectorized value.");
17715 return E->VectorizedValue;
17716 }
17717
17718 case Instruction::ExtractElement: {
17719 Value *V = E->getSingleOperand(OpIdx: 0);
17720 setInsertPointAfterBundle(E);
17721 V = FinalShuffle(V, E);
17722 E->VectorizedValue = V;
17723 return V;
17724 }
17725 case Instruction::ExtractValue: {
17726 auto *LI = cast<LoadInst>(Val: E->getSingleOperand(OpIdx: 0));
17727 Builder.SetInsertPoint(LI);
17728 Value *Ptr = LI->getPointerOperand();
17729 LoadInst *V = Builder.CreateAlignedLoad(Ty: VecTy, Ptr, Align: LI->getAlign());
17730 Value *NewV = ::propagateMetadata(Inst: V, VL: E->Scalars);
17731 NewV = FinalShuffle(NewV, E);
17732 E->VectorizedValue = NewV;
17733 return NewV;
17734 }
17735 case Instruction::InsertElement: {
17736 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
17737 Builder.SetInsertPoint(cast<Instruction>(Val: E->Scalars.back()));
17738 Value *V = vectorizeOperand(E, NodeIdx: 1);
17739 ArrayRef<Value *> Op = E->getOperand(OpIdx: 1);
17740 Type *ScalarTy = Op.front()->getType();
17741 if (cast<VectorType>(Val: V->getType())->getElementType() != ScalarTy) {
17742 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
17743 std::pair<unsigned, bool> Res = MinBWs.lookup(Val: getOperandEntry(E, Idx: 1));
17744 assert(Res.first > 0 && "Expected item in MinBWs.");
17745 V = Builder.CreateIntCast(
17746 V,
17747 DestTy: getWidenedType(
17748 ScalarTy,
17749 VF: cast<FixedVectorType>(Val: V->getType())->getNumElements()),
17750 isSigned: Res.second);
17751 }
17752
17753 // Create InsertVector shuffle if necessary
17754 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range&: E->Scalars, P: [E](Value *V) {
17755 return !is_contained(Range&: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
17756 }));
17757 const unsigned NumElts =
17758 cast<FixedVectorType>(Val: FirstInsert->getType())->getNumElements();
17759 const unsigned NumScalars = E->Scalars.size();
17760
17761 unsigned Offset = *getElementIndex(Inst: VL0);
17762 assert(Offset < NumElts && "Failed to find vector index offset");
17763
17764 // Create shuffle to resize vector
17765 SmallVector<int> Mask;
17766 if (!E->ReorderIndices.empty()) {
17767 inversePermutation(Indices: E->ReorderIndices, Mask);
17768 Mask.append(NumInputs: NumElts - NumScalars, Elt: PoisonMaskElem);
17769 } else {
17770 Mask.assign(NumElts, Elt: PoisonMaskElem);
17771 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: NumScalars), value: 0);
17772 }
17773 // Create InsertVector shuffle if necessary
17774 bool IsIdentity = true;
17775 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
17776 Mask.swap(RHS&: PrevMask);
17777 for (unsigned I = 0; I < NumScalars; ++I) {
17778 Value *Scalar = E->Scalars[PrevMask[I]];
17779 unsigned InsertIdx = *getElementIndex(Inst: Scalar);
17780 IsIdentity &= InsertIdx - Offset == I;
17781 Mask[InsertIdx - Offset] = I;
17782 }
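// A shuffle is required only if the scalars do not form an identity
// insertion or do not fill the whole destination vector.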
17783 if (!IsIdentity || NumElts != NumScalars) {
17784 Value *V2 = nullptr;
17785 bool IsVNonPoisonous =
17786 !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
17787 SmallVector<int> InsertMask(Mask);
17788 if (NumElts != NumScalars && Offset == 0) {
17789 // Follow all insert element instructions from the current buildvector
17790 // sequence.
17791 InsertElementInst *Ins = cast<InsertElementInst>(Val: VL0);
17792 do {
17793 std::optional<unsigned> InsertIdx = getElementIndex(Inst: Ins);
17794 if (!InsertIdx)
17795 break;
17796 if (InsertMask[*InsertIdx] == PoisonMaskElem)
17797 InsertMask[*InsertIdx] = *InsertIdx;
17798 if (!Ins->hasOneUse())
17799 break;
17800 Ins = dyn_cast_or_null<InsertElementInst>(
17801 Val: Ins->getUniqueUndroppableUser());
17802 } while (Ins);
17803 SmallBitVector UseMask =
17804 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
17805 SmallBitVector IsFirstPoison =
17806 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
17807 SmallBitVector IsFirstUndef =
17808 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
17809 if (!IsFirstPoison.all()) {
17810 unsigned Idx = 0;
17811 for (unsigned I = 0; I < NumElts; I++) {
17812 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I) &&
17813 IsFirstUndef.test(Idx: I)) {
17814 if (IsVNonPoisonous) {
17815 InsertMask[I] = I < NumScalars ? I : 0;
17816 continue;
17817 }
17818 if (!V2)
17819 V2 = UndefValue::get(T: V->getType());
17820 if (Idx >= NumScalars)
17821 Idx = NumScalars - 1;
17822 InsertMask[I] = NumScalars + Idx;
17823 ++Idx;
17824 } else if (InsertMask[I] != PoisonMaskElem &&
17825 Mask[I] == PoisonMaskElem) {
17826 InsertMask[I] = PoisonMaskElem;
17827 }
17828 }
17829 } else {
17830 InsertMask = Mask;
17831 }
17832 }
17833 if (!V2)
17834 V2 = PoisonValue::get(T: V->getType());
17835 V = Builder.CreateShuffleVector(V1: V, V2, Mask: InsertMask);
17836 if (auto *I = dyn_cast<Instruction>(Val: V)) {
17837 GatherShuffleExtractSeq.insert(X: I);
17838 CSEBlocks.insert(V: I->getParent());
17839 }
17840 }
17841
17842 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
17843 for (unsigned I = 0; I < NumElts; I++) {
17844 if (Mask[I] != PoisonMaskElem)
17845 InsertMask[Offset + I] = I;
17846 }
17847 SmallBitVector UseMask =
17848 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
17849 SmallBitVector IsFirstUndef =
17850 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
17851 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
17852 NumElts != NumScalars) {
17853 if (IsFirstUndef.all()) {
17854 if (!ShuffleVectorInst::isIdentityMask(Mask: InsertMask, NumSrcElts: NumElts)) {
17855 SmallBitVector IsFirstPoison =
17856 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
17857 if (!IsFirstPoison.all()) {
17858 for (unsigned I = 0; I < NumElts; I++) {
17859 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I))
17860 InsertMask[I] = I + NumElts;
17861 }
17862 }
17863 V = Builder.CreateShuffleVector(
17864 V1: V,
17865 V2: IsFirstPoison.all() ? PoisonValue::get(T: V->getType())
17866 : FirstInsert->getOperand(i: 0),
17867 Mask: InsertMask, Name: cast<Instruction>(Val: E->Scalars.back())->getName());
17868 if (auto *I = dyn_cast<Instruction>(Val: V)) {
17869 GatherShuffleExtractSeq.insert(X: I);
17870 CSEBlocks.insert(V: I->getParent());
17871 }
17872 }
17873 } else {
17874 SmallBitVector IsFirstPoison =
17875 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
17876 for (unsigned I = 0; I < NumElts; I++) {
17877 if (InsertMask[I] == PoisonMaskElem)
17878 InsertMask[I] = IsFirstPoison.test(Idx: I) ? PoisonMaskElem : I;
17879 else
17880 InsertMask[I] += NumElts;
17881 }
17882 V = Builder.CreateShuffleVector(
17883 V1: FirstInsert->getOperand(i: 0), V2: V, Mask: InsertMask,
17884 Name: cast<Instruction>(Val: E->Scalars.back())->getName());
17885 if (auto *I = dyn_cast<Instruction>(Val: V)) {
17886 GatherShuffleExtractSeq.insert(X: I);
17887 CSEBlocks.insert(V: I->getParent());
17888 }
17889 }
17890 }
17891
17892 ++NumVectorInstructions;
17893 E->VectorizedValue = V;
17894 return V;
17895 }
17896 case Instruction::ZExt:
17897 case Instruction::SExt:
17898 case Instruction::FPToUI:
17899 case Instruction::FPToSI:
17900 case Instruction::FPExt:
17901 case Instruction::PtrToInt:
17902 case Instruction::IntToPtr:
17903 case Instruction::SIToFP:
17904 case Instruction::UIToFP:
17905 case Instruction::Trunc:
17906 case Instruction::FPTrunc:
17907 case Instruction::BitCast: {
17908 setInsertPointAfterBundle(E);
17909
17910 Value *InVec = vectorizeOperand(E, NodeIdx: 0);
17911
17912 auto *CI = cast<CastInst>(Val: VL0);
17913 Instruction::CastOps VecOpcode = CI->getOpcode();
17914 Type *SrcScalarTy = cast<VectorType>(Val: InVec->getType())->getElementType();
17915 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
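// If either the source or the destination was demoted to a narrower
// integer type, the cast opcode may have to be adjusted (bitcast, trunc
// or an appropriate extension) to match the actual bitwidths.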
17916 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
17917 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
17918 SrcScalarTy != CI->getOperand(i_nocapture: 0)->getType()->getScalarType())) {
17919 // Check if the values are candidates to demote.
17920 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
17921 if (SrcIt != MinBWs.end())
17922 SrcBWSz = SrcIt->second.first;
17923 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType());
17924 if (BWSz == SrcBWSz) {
17925 VecOpcode = Instruction::BitCast;
17926 } else if (BWSz < SrcBWSz) {
17927 VecOpcode = Instruction::Trunc;
17928 } else if (It != MinBWs.end()) {
17929 assert(BWSz > SrcBWSz && "Invalid cast!");
17930 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
17931 } else if (SrcIt != MinBWs.end()) {
17932 assert(BWSz > SrcBWSz && "Invalid cast!");
17933 VecOpcode =
17934 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
17935 }
17936 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
17937 !SrcIt->second.second) {
17938 VecOpcode = Instruction::UIToFP;
17939 }
17940 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
17941 ? InVec
17942 : Builder.CreateCast(Op: VecOpcode, V: InVec, DestTy: VecTy);
17943 V = FinalShuffle(V, E);
17944
17945 E->VectorizedValue = V;
17946 ++NumVectorInstructions;
17947 return V;
17948 }
17949 case Instruction::FCmp:
17950 case Instruction::ICmp: {
17951 setInsertPointAfterBundle(E);
17952
17953 Value *L = vectorizeOperand(E, NodeIdx: 0);
17954 Value *R = vectorizeOperand(E, NodeIdx: 1);
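// If the operands were demoted to different integer widths, widen the
// narrower operand so both sides of the compare have the same type.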
17955 if (L->getType() != R->getType()) {
17956 assert((getOperandEntry(E, 0)->isGather() ||
17957 getOperandEntry(E, 1)->isGather() ||
17958 MinBWs.contains(getOperandEntry(E, 0)) ||
17959 MinBWs.contains(getOperandEntry(E, 1))) &&
17960 "Expected item in MinBWs.");
17961 if (cast<VectorType>(Val: L->getType())
17962 ->getElementType()
17963 ->getIntegerBitWidth() < cast<VectorType>(Val: R->getType())
17964 ->getElementType()
17965 ->getIntegerBitWidth()) {
17966 Type *CastTy = R->getType();
17967 L = Builder.CreateIntCast(V: L, DestTy: CastTy, isSigned: GetOperandSignedness(0));
17968 } else {
17969 Type *CastTy = L->getType();
17970 R = Builder.CreateIntCast(V: R, DestTy: CastTy, isSigned: GetOperandSignedness(1));
17971 }
17972 }
17973
17974 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
17975 Value *V = Builder.CreateCmp(Pred: P0, LHS: L, RHS: R);
17976 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
17977 if (auto *ICmp = dyn_cast<ICmpInst>(Val: V); ICmp && It == MinBWs.end())
17978 ICmp->setSameSign(/*B=*/false);
17979 // Do not cast for cmps.
17980 VecTy = cast<FixedVectorType>(Val: V->getType());
17981 V = FinalShuffle(V, E);
17982
17983 E->VectorizedValue = V;
17984 ++NumVectorInstructions;
17985 return V;
17986 }
17987 case Instruction::Select: {
17988 setInsertPointAfterBundle(E);
17989
17990 Value *Cond = vectorizeOperand(E, NodeIdx: 0);
17991 Value *True = vectorizeOperand(E, NodeIdx: 1);
17992 Value *False = vectorizeOperand(E, NodeIdx: 2);
17993 if (True->getType() != VecTy || False->getType() != VecTy) {
17994 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
17995 getOperandEntry(E, 2)->isGather() ||
17996 MinBWs.contains(getOperandEntry(E, 1)) ||
17997 MinBWs.contains(getOperandEntry(E, 2))) &&
17998 "Expected item in MinBWs.");
17999 if (True->getType() != VecTy)
18000 True = Builder.CreateIntCast(V: True, DestTy: VecTy, isSigned: GetOperandSignedness(1));
18001 if (False->getType() != VecTy)
18002 False = Builder.CreateIntCast(V: False, DestTy: VecTy, isSigned: GetOperandSignedness(2));
18003 }
18004
18005 unsigned CondNumElements = getNumElements(Ty: Cond->getType());
18006 unsigned TrueNumElements = getNumElements(Ty: True->getType());
18007 assert(TrueNumElements >= CondNumElements &&
18008 TrueNumElements % CondNumElements == 0 &&
18009 "Cannot vectorize Instruction::Select");
18010 assert(TrueNumElements == getNumElements(False->getType()) &&
18011 "Cannot vectorize Instruction::Select");
18012 if (CondNumElements != TrueNumElements) {
18013 // When the return type is i1 but the source is a fixed vector type, we
18014 // need to duplicate the condition value.
18015 Cond = Builder.CreateShuffleVector(
18016 V: Cond, Mask: createReplicatedMask(ReplicationFactor: TrueNumElements / CondNumElements,
18017 VF: CondNumElements));
18018 }
18019 assert(getNumElements(Cond->getType()) == TrueNumElements &&
18020 "Cannot vectorize Instruction::Select");
18021 Value *V = Builder.CreateSelect(C: Cond, True, False);
18022 V = FinalShuffle(V, E);
18023
18024 E->VectorizedValue = V;
18025 ++NumVectorInstructions;
18026 return V;
18027 }
18028 case Instruction::FNeg: {
18029 setInsertPointAfterBundle(E);
18030
18031 Value *Op = vectorizeOperand(E, NodeIdx: 0);
18032
18033 Value *V = Builder.CreateUnOp(
18034 Opc: static_cast<Instruction::UnaryOps>(E->getOpcode()), V: Op);
18035 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
18036 if (auto *I = dyn_cast<Instruction>(Val: V))
18037 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
18038
18039 V = FinalShuffle(V, E);
18040
18041 E->VectorizedValue = V;
18042 ++NumVectorInstructions;
18043
18044 return V;
18045 }
18046 case Instruction::Freeze: {
18047 setInsertPointAfterBundle(E);
18048
18049 Value *Op = vectorizeOperand(E, NodeIdx: 0);
18050
18051 if (Op->getType() != VecTy) {
18052 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
18053 MinBWs.contains(getOperandEntry(E, 0))) &&
18054 "Expected item in MinBWs.");
18055 Op = Builder.CreateIntCast(V: Op, DestTy: VecTy, isSigned: GetOperandSignedness(0));
18056 }
18057 Value *V = Builder.CreateFreeze(V: Op);
18058 V = FinalShuffle(V, E);
18059
18060 E->VectorizedValue = V;
18061 ++NumVectorInstructions;
18062
18063 return V;
18064 }
18065 case Instruction::Add:
18066 case Instruction::FAdd:
18067 case Instruction::Sub:
18068 case Instruction::FSub:
18069 case Instruction::Mul:
18070 case Instruction::FMul:
18071 case Instruction::UDiv:
18072 case Instruction::SDiv:
18073 case Instruction::FDiv:
18074 case Instruction::URem:
18075 case Instruction::SRem:
18076 case Instruction::FRem:
18077 case Instruction::Shl:
18078 case Instruction::LShr:
18079 case Instruction::AShr:
18080 case Instruction::And:
18081 case Instruction::Or:
18082 case Instruction::Xor: {
18083 setInsertPointAfterBundle(E);
18084
18085 Value *LHS = vectorizeOperand(E, NodeIdx: 0);
18086 Value *RHS = vectorizeOperand(E, NodeIdx: 1);
18087 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
18088 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
18089 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
18090 if (all_of(Range&: Ops, P: [&](Value *Op) {
18091 auto *CI = dyn_cast<ConstantInt>(Val: Op);
18092 return CI && CI->getValue().countr_one() >= It->second.first;
18093 })) {
18094 Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
18095 E->VectorizedValue = V;
18096 ++NumVectorInstructions;
18097 return V;
18098 }
18099 }
18100 }
18101 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
18102 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
18103 getOperandEntry(E, 1)->isGather() ||
18104 MinBWs.contains(getOperandEntry(E, 0)) ||
18105 MinBWs.contains(getOperandEntry(E, 1))) &&
18106 "Expected item in MinBWs.");
18107 if (LHS->getType() != VecTy)
18108 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: GetOperandSignedness(0));
18109 if (RHS->getType() != VecTy)
18110 RHS = Builder.CreateIntCast(V: RHS, DestTy: VecTy, isSigned: GetOperandSignedness(1));
18111 }
18112
18113 Value *V = Builder.CreateBinOp(
18114 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
18115 RHS);
18116 propagateIRFlags(I: V, VL: E->Scalars, OpValue: nullptr, IncludeWrapFlags: It == MinBWs.end());
18117 if (auto *I = dyn_cast<Instruction>(Val: V)) {
18118 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
18119 // Drop nuw flags for abs(sub(commutative), true).
18120 if (!MinBWs.contains(Val: E) && ShuffleOrOp == Instruction::Sub &&
18121 any_of(Range&: E->Scalars, P: [](Value *V) {
18122 return isa<PoisonValue>(Val: V) || isCommutative(I: cast<Instruction>(Val: V));
18123 }))
18124 I->setHasNoUnsignedWrap(/*b=*/false);
18125 }
18126
18127 V = FinalShuffle(V, E);
18128
18129 E->VectorizedValue = V;
18130 ++NumVectorInstructions;
18131
18132 return V;
18133 }
18134 case Instruction::Load: {
18135 // Loads are inserted at the head of the tree because we don't want to
18136 // sink them all the way down past store instructions.
18137 setInsertPointAfterBundle(E);
18138
18139 LoadInst *LI = cast<LoadInst>(Val: VL0);
18140 Instruction *NewLI;
18141 Value *PO = LI->getPointerOperand();
18142 if (E->State == TreeEntry::Vectorize) {
18143 NewLI = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: PO, Align: LI->getAlign());
18144 } else if (E->State == TreeEntry::CompressVectorize) {
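// Compressed loads: load one wide (possibly masked) contiguous vector and
// shuffle the needed elements into their positions via the compress mask.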
18145 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
18146 CompressEntryToData.at(Val: E);
18147 Align CommonAlignment = LI->getAlign();
18148 if (IsMasked) {
18149 unsigned VF = getNumElements(Ty: LoadVecTy);
18150 SmallVector<Constant *> MaskValues(
18151 VF / getNumElements(Ty: LI->getType()),
18152 ConstantInt::getFalse(Context&: VecTy->getContext()));
18153 for (int I : CompressMask)
18154 MaskValues[I] = ConstantInt::getTrue(Context&: VecTy->getContext());
18155 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) {
18156 assert(SLPReVec && "Only supported by REVEC.");
18157 MaskValues = replicateMask(Val: MaskValues, VF: VecTy->getNumElements());
18158 }
18159 Constant *MaskValue = ConstantVector::get(V: MaskValues);
18160 NewLI = Builder.CreateMaskedLoad(Ty: LoadVecTy, Ptr: PO, Alignment: CommonAlignment,
18161 Mask: MaskValue);
18162 } else {
18163 NewLI = Builder.CreateAlignedLoad(Ty: LoadVecTy, Ptr: PO, Align: CommonAlignment);
18164 }
18165 NewLI = ::propagateMetadata(Inst: NewLI, VL: E->Scalars);
18166 // TODO: include this cost into CommonCost.
18167 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) {
18168 assert(SLPReVec && "FixedVectorType is not expected.");
18169 transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(),
18170 Mask&: CompressMask);
18171 }
18172 NewLI =
18173 cast<Instruction>(Val: Builder.CreateShuffleVector(V: NewLI, Mask: CompressMask));
18174 } else if (E->State == TreeEntry::StridedVectorize) {
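// Strided loads are emitted as an experimental_vp_strided_load intrinsic,
// using either a constant stride computed from the pointer difference or
// a stride computed at runtime.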
18175 Value *Ptr0 = cast<LoadInst>(Val: E->Scalars.front())->getPointerOperand();
18176 Value *PtrN = cast<LoadInst>(Val: E->Scalars.back())->getPointerOperand();
18177 PO = IsReverseOrder ? PtrN : Ptr0;
18178 std::optional<int64_t> Diff = getPointersDiff(
18179 ElemTyA: VL0->getType(), PtrA: Ptr0, ElemTyB: VL0->getType(), PtrB: PtrN, DL: *DL, SE&: *SE);
18180 Type *StrideTy = DL->getIndexType(PtrTy: PO->getType());
18181 Value *StrideVal;
18182 if (Diff) {
18183 int64_t Stride =
18184 *Diff / (static_cast<int64_t>(E->Scalars.size()) - 1);
18185 StrideVal =
18186 ConstantInt::get(Ty: StrideTy, V: (IsReverseOrder ? -1 : 1) * Stride *
18187 DL->getTypeAllocSize(Ty: ScalarTy));
18188 } else {
18189 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
18190 transform(Range&: E->Scalars, d_first: PointerOps.begin(), F: [](Value *V) {
18191 return cast<LoadInst>(Val: V)->getPointerOperand();
18192 });
18193 OrdersType Order;
18194 std::optional<Value *> Stride =
18195 calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order,
18196 Inst: &*Builder.GetInsertPoint());
18197 Value *NewStride =
18198 Builder.CreateIntCast(V: *Stride, DestTy: StrideTy, /*isSigned=*/true);
18199 StrideVal = Builder.CreateMul(
18200 LHS: NewStride,
18201 RHS: ConstantInt::get(
18202 Ty: StrideTy,
18203 V: (IsReverseOrder ? -1 : 1) *
18204 static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))));
18205 }
18206 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
18207 auto *Inst = Builder.CreateIntrinsic(
18208 ID: Intrinsic::experimental_vp_strided_load,
18209 Types: {VecTy, PO->getType(), StrideTy},
18210 Args: {PO, StrideVal, Builder.getAllOnesMask(NumElts: VecTy->getElementCount()),
18211 Builder.getInt32(C: E->Scalars.size())});
18212 Inst->addParamAttr(
18213 /*ArgNo=*/0,
18214 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
18215 NewLI = Inst;
18216 } else {
18217 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
18218 Value *VecPtr = vectorizeOperand(E, NodeIdx: 0);
18219 if (isa<FixedVectorType>(Val: ScalarTy)) {
18220 assert(SLPReVec && "FixedVectorType is not expected.");
18221 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We
18222 // need to expand VecPtr if ScalarTy is a vector type.
18223 unsigned ScalarTyNumElements =
18224 cast<FixedVectorType>(Val: ScalarTy)->getNumElements();
18225 unsigned VecTyNumElements =
18226 cast<FixedVectorType>(Val: VecTy)->getNumElements();
18227 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
18228 "Cannot expand getelementptr.");
18229 unsigned VF = VecTyNumElements / ScalarTyNumElements;
18230 SmallVector<Constant *> Indices(VecTyNumElements);
18231 transform(Range: seq(Size: VecTyNumElements), d_first: Indices.begin(), F: [=](unsigned I) {
18232 return Builder.getInt64(C: I % ScalarTyNumElements);
18233 });
18234 VecPtr = Builder.CreateGEP(
18235 Ty: VecTy->getElementType(),
18236 Ptr: Builder.CreateShuffleVector(
18237 V: VecPtr, Mask: createReplicatedMask(ReplicationFactor: ScalarTyNumElements, VF)),
18238 IdxList: ConstantVector::get(V: Indices));
18239 }
18240 // Use the minimum alignment of the gathered loads.
18241 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
18242 NewLI = Builder.CreateMaskedGather(Ty: VecTy, Ptrs: VecPtr, Alignment: CommonAlignment);
18243 }
18244 Value *V = E->State == TreeEntry::CompressVectorize
18245 ? NewLI
18246 : ::propagateMetadata(Inst: NewLI, VL: E->Scalars);
18247
18248 V = FinalShuffle(V, E);
18249 E->VectorizedValue = V;
18250 ++NumVectorInstructions;
18251 return V;
18252 }
18253 case Instruction::Store: {
18254 auto *SI = cast<StoreInst>(Val: VL0);
18255
18256 setInsertPointAfterBundle(E);
18257
18258 Value *VecValue = vectorizeOperand(E, NodeIdx: 0);
18259 if (VecValue->getType() != VecTy)
18260 VecValue =
18261 Builder.CreateIntCast(V: VecValue, DestTy: VecTy, isSigned: GetOperandSignedness(0));
18262 VecValue = FinalShuffle(VecValue, E);
18263
18264 Value *Ptr = SI->getPointerOperand();
18265 Instruction *ST;
18266 if (E->State == TreeEntry::Vectorize) {
18267 ST = Builder.CreateAlignedStore(Val: VecValue, Ptr, Align: SI->getAlign());
18268 } else {
18269 assert(E->State == TreeEntry::StridedVectorize &&
18270 "Expected either strided or consecutive stores.");
18271 if (!E->ReorderIndices.empty()) {
18272 SI = cast<StoreInst>(Val: E->Scalars[E->ReorderIndices.front()]);
18273 Ptr = SI->getPointerOperand();
18274 }
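// Strided stores are emitted as an experimental_vp_strided_store intrinsic
// with a constant negative element-size stride, so the lanes are stored at
// decreasing addresses.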
18275 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E->Scalars);
18276 Type *StrideTy = DL->getIndexType(PtrTy: SI->getPointerOperandType());
18277 auto *Inst = Builder.CreateIntrinsic(
18278 ID: Intrinsic::experimental_vp_strided_store,
18279 Types: {VecTy, Ptr->getType(), StrideTy},
18280 Args: {VecValue, Ptr,
18281 ConstantInt::get(
18282 Ty: StrideTy, V: -static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))),
18283 Builder.getAllOnesMask(NumElts: VecTy->getElementCount()),
18284 Builder.getInt32(C: E->Scalars.size())});
18285 Inst->addParamAttr(
18286 /*ArgNo=*/1,
18287 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
18288 ST = Inst;
18289 }
18290
18291 Value *V = ::propagateMetadata(Inst: ST, VL: E->Scalars);
18292
18293 E->VectorizedValue = V;
18294 ++NumVectorInstructions;
18295 return V;
18296 }
18297 case Instruction::GetElementPtr: {
18298 auto *GEP0 = cast<GetElementPtrInst>(Val: VL0);
18299 setInsertPointAfterBundle(E);
18300
18301 Value *Op0 = vectorizeOperand(E, NodeIdx: 0);
18302
18303 SmallVector<Value *> OpVecs;
18304 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
18305 Value *OpVec = vectorizeOperand(E, NodeIdx: J);
18306 OpVecs.push_back(Elt: OpVec);
18307 }
18308
18309 Value *V = Builder.CreateGEP(Ty: GEP0->getSourceElementType(), Ptr: Op0, IdxList: OpVecs);
18310 if (Instruction *I = dyn_cast<GetElementPtrInst>(Val: V)) {
18311 SmallVector<Value *> GEPs;
18312 for (Value *V : E->Scalars) {
18313 if (isa<GetElementPtrInst>(Val: V))
18314 GEPs.push_back(Elt: V);
18315 }
18316 V = ::propagateMetadata(Inst: I, VL: GEPs);
18317 }
18318
18319 V = FinalShuffle(V, E);
18320
18321 E->VectorizedValue = V;
18322 ++NumVectorInstructions;
18323
18324 return V;
18325 }
18326 case Instruction::Call: {
18327 CallInst *CI = cast<CallInst>(Val: VL0);
18328 setInsertPointAfterBundle(E);
18329
18330 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
18331
18332 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
18333 CI, ID, VF: VecTy->getNumElements(),
18334 MinBW: It != MinBWs.end() ? It->second.first : 0, TTI);
18335 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
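// Prefer the vector intrinsic when it is not more expensive than the
// corresponding vector library call.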
18336 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
18337 VecCallCosts.first <= VecCallCosts.second;
18338
18339 Value *ScalarArg = nullptr;
18340 SmallVector<Value *> OpVecs;
18341 SmallVector<Type *, 2> TysForDecl;
18342 // Add return type if intrinsic is overloaded on it.
18343 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: -1, TTI))
18344 TysForDecl.push_back(Elt: VecTy);
18345 auto *CEI = cast<CallInst>(Val: VL0);
18346 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
18347 // Some intrinsics have scalar arguments. This argument should not be
18348 // vectorized.
18349 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI)) {
18350 ScalarArg = CEI->getArgOperand(i: I);
18351 // If the bitwidth of the abs intrinsic was reduced, its second argument
18352 // must be set to false (do not return poison if the value is signed min).
18353 if (ID == Intrinsic::abs && It != MinBWs.end() &&
18354 It->second.first < DL->getTypeSizeInBits(Ty: CEI->getType()))
18355 ScalarArg = Builder.getFalse();
18356 OpVecs.push_back(Elt: ScalarArg);
18357 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI))
18358 TysForDecl.push_back(Elt: ScalarArg->getType());
18359 continue;
18360 }
18361
18362 Value *OpVec = vectorizeOperand(E, NodeIdx: I);
18363 ScalarArg = CEI->getArgOperand(i: I);
18364 if (cast<VectorType>(Val: OpVec->getType())->getElementType() !=
18365 ScalarArg->getType()->getScalarType() &&
18366 It == MinBWs.end()) {
18367 auto *CastTy =
18368 getWidenedType(ScalarTy: ScalarArg->getType(), VF: VecTy->getNumElements());
18369 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: CastTy, isSigned: GetOperandSignedness(I));
18370 } else if (It != MinBWs.end()) {
18371 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
18372 }
18373 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
18374 OpVecs.push_back(Elt: OpVec);
18375 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI))
18376 TysForDecl.push_back(Elt: OpVec->getType());
18377 }
18378
18379 Function *CF;
18380 if (!UseIntrinsic) {
18381 VFShape Shape =
18382 VFShape::get(FTy: CI->getFunctionType(),
18383 EC: ElementCount::getFixed(
18384 MinVal: static_cast<unsigned>(VecTy->getNumElements())),
18385 HasGlobalPred: false /*HasGlobalPred*/);
18386 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
18387 } else {
18388 CF = Intrinsic::getOrInsertDeclaration(M: F->getParent(), id: ID, Tys: TysForDecl);
18389 }
18390
18391 SmallVector<OperandBundleDef, 1> OpBundles;
18392 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
18393 Value *V = Builder.CreateCall(Callee: CF, Args: OpVecs, OpBundles);
18394
18395 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
18396 V = FinalShuffle(V, E);
18397
18398 E->VectorizedValue = V;
18399 ++NumVectorInstructions;
18400 return V;
18401 }
18402 case Instruction::ShuffleVector: {
18403 Value *V;
18404 if (SLPReVec && !E->isAltShuffle()) {
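// REVEC: the scalars are shufflevector instructions themselves; compose
// their masks into a single shuffle of the vectorized source operand.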
18405 setInsertPointAfterBundle(E);
18406 Value *Src = vectorizeOperand(E, NodeIdx: 0);
18407 SmallVector<int> ThisMask(calculateShufflevectorMask(VL: E->Scalars));
18408 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Val: Src)) {
18409 SmallVector<int> NewMask(ThisMask.size());
18410 transform(Range&: ThisMask, d_first: NewMask.begin(), F: [&SVSrc](int Mask) {
18411 return SVSrc->getShuffleMask()[Mask];
18412 });
18413 V = Builder.CreateShuffleVector(V1: SVSrc->getOperand(i_nocapture: 0),
18414 V2: SVSrc->getOperand(i_nocapture: 1), Mask: NewMask);
18415 } else {
18416 V = Builder.CreateShuffleVector(V: Src, Mask: ThisMask);
18417 }
18418 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
18419 if (auto *I = dyn_cast<Instruction>(Val: V))
18420 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
18421 V = FinalShuffle(V, E);
18422 } else {
18423 assert(E->isAltShuffle() &&
18424 ((Instruction::isBinaryOp(E->getOpcode()) &&
18425 Instruction::isBinaryOp(E->getAltOpcode())) ||
18426 (Instruction::isCast(E->getOpcode()) &&
18427 Instruction::isCast(E->getAltOpcode())) ||
18428 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
18429 "Invalid Shuffle Vector Operand");
18430
18431 Value *LHS = nullptr, *RHS = nullptr;
18432 if (Instruction::isBinaryOp(Opcode: E->getOpcode()) || isa<CmpInst>(Val: VL0)) {
18433 setInsertPointAfterBundle(E);
18434 LHS = vectorizeOperand(E, NodeIdx: 0);
18435 RHS = vectorizeOperand(E, NodeIdx: 1);
18436 } else {
18437 setInsertPointAfterBundle(E);
18438 LHS = vectorizeOperand(E, NodeIdx: 0);
18439 }
18440 if (LHS && RHS &&
18441 ((Instruction::isBinaryOp(Opcode: E->getOpcode()) &&
18442 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
18443 (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()))) {
18444 assert((It != MinBWs.end() ||
18445 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
18446 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
18447 MinBWs.contains(getOperandEntry(E, 0)) ||
18448 MinBWs.contains(getOperandEntry(E, 1))) &&
18449 "Expected item in MinBWs.");
18450 Type *CastTy = VecTy;
18451 if (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()) {
18452 if (cast<VectorType>(Val: LHS->getType())
18453 ->getElementType()
18454 ->getIntegerBitWidth() < cast<VectorType>(Val: RHS->getType())
18455 ->getElementType()
18456 ->getIntegerBitWidth())
18457 CastTy = RHS->getType();
18458 else
18459 CastTy = LHS->getType();
18460 }
18461 if (LHS->getType() != CastTy)
18462 LHS = Builder.CreateIntCast(V: LHS, DestTy: CastTy, isSigned: GetOperandSignedness(0));
18463 if (RHS->getType() != CastTy)
18464 RHS = Builder.CreateIntCast(V: RHS, DestTy: CastTy, isSigned: GetOperandSignedness(1));
18465 }
18466
18467 Value *V0, *V1;
18468 if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
18469 V0 = Builder.CreateBinOp(
18470 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
18471 V1 = Builder.CreateBinOp(
18472 Opc: static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
18473 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
18474 V0 = Builder.CreateCmp(Pred: CI0->getPredicate(), LHS, RHS);
18475 auto *AltCI = cast<CmpInst>(Val: E->getAltOp());
18476 CmpInst::Predicate AltPred = AltCI->getPredicate();
18477 V1 = Builder.CreateCmp(Pred: AltPred, LHS, RHS);
18478 } else {
18479 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
18480 unsigned SrcBWSz = DL->getTypeSizeInBits(
18481 Ty: cast<VectorType>(Val: LHS->getType())->getElementType());
18482 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
18483 if (BWSz <= SrcBWSz) {
18484 if (BWSz < SrcBWSz)
18485 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: It->second.first);
18486 assert(LHS->getType() == VecTy &&
18487 "Expected same type as operand.");
18488 if (auto *I = dyn_cast<Instruction>(Val: LHS))
18489 LHS = ::propagateMetadata(Inst: I, VL: E->Scalars);
18490 LHS = FinalShuffle(LHS, E);
18491 E->VectorizedValue = LHS;
18492 ++NumVectorInstructions;
18493 return LHS;
18494 }
18495 }
18496 V0 = Builder.CreateCast(
18497 Op: static_cast<Instruction::CastOps>(E->getOpcode()), V: LHS, DestTy: VecTy);
18498 V1 = Builder.CreateCast(
18499 Op: static_cast<Instruction::CastOps>(E->getAltOpcode()), V: LHS, DestTy: VecTy);
18500 }
18501 // Add V0 and V1 to later analysis to try to find and remove matching
18502 // instruction, if any.
18503 for (Value *V : {V0, V1}) {
18504 if (auto *I = dyn_cast<Instruction>(Val: V)) {
18505 GatherShuffleExtractSeq.insert(X: I);
18506 CSEBlocks.insert(V: I->getParent());
18507 }
18508 }
18509
18510 // Create shuffle to take alternate operations from the vector.
18511 // Also, gather up main and alt scalar ops to propagate IR flags to
18512 // each vector operation.
18513 ValueList OpScalars, AltScalars;
18514 SmallVector<int> Mask;
18515 E->buildAltOpShuffleMask(
18516 IsAltOp: [E, this](Instruction *I) {
18517 assert(E->getMatchingMainOpOrAltOp(I) &&
18518 "Unexpected main/alternate opcode");
18519 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
18520 TLI: *TLI);
18521 },
18522 Mask, OpScalars: &OpScalars, AltScalars: &AltScalars);
18523
18524 propagateIRFlags(I: V0, VL: OpScalars, OpValue: E->getMainOp(), IncludeWrapFlags: It == MinBWs.end());
18525 propagateIRFlags(I: V1, VL: AltScalars, OpValue: E->getAltOp(), IncludeWrapFlags: It == MinBWs.end());
18526 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
18527 // Drop nuw flags for abs(sub(commutative), true).
18528 if (auto *I = dyn_cast<Instruction>(Val: Vec);
18529 I && Opcode == Instruction::Sub && !MinBWs.contains(Val: E) &&
18530 any_of(Range&: E->Scalars, P: [](Value *V) {
18531 if (isa<PoisonValue>(Val: V))
18532 return false;
18533 auto *IV = cast<Instruction>(Val: V);
18534 return IV->getOpcode() == Instruction::Sub && isCommutative(I: IV);
18535 }))
18536 I->setHasNoUnsignedWrap(/*b=*/false);
18537 };
18538 DropNuwFlag(V0, E->getOpcode());
18539 DropNuwFlag(V1, E->getAltOpcode());
18540
18541 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
18542 assert(SLPReVec && "FixedVectorType is not expected.");
18543 transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(), Mask);
18544 }
18545 V = Builder.CreateShuffleVector(V1: V0, V2: V1, Mask);
18546 if (auto *I = dyn_cast<Instruction>(Val: V)) {
18547 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
18548 GatherShuffleExtractSeq.insert(X: I);
18549 CSEBlocks.insert(V: I->getParent());
18550 }
18551 }
18552
18553 E->VectorizedValue = V;
18554 ++NumVectorInstructions;
18555
18556 return V;
18557 }
18558 default:
18559 llvm_unreachable("unknown inst");
18560 }
18561 return nullptr;
18562}
18563
18564Value *BoUpSLP::vectorizeTree() {
18565 ExtraValueToDebugLocsMap ExternallyUsedValues;
18566 return vectorizeTree(ExternallyUsedValues);
18567}
18568
18569Value *BoUpSLP::vectorizeTree(
18570 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
18571 Instruction *ReductionRoot,
18572 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
18573 // Clear the Entry-to-LastInstruction table. It can be affected by
18574 // scheduling and needs to be rebuilt.
18575 EntryToLastInstruction.clear();
18576 // All blocks must be scheduled before any instructions are inserted.
18577 for (auto &BSIter : BlocksSchedules)
18578 scheduleBlock(BS: BSIter.second.get());
18579 // Cache last instructions for the nodes to avoid side effects, which may
18580 // appear during vectorization, like extra uses, etc.
18581 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
18582 if (TE->isGather())
18583 continue;
18584 (void)getLastInstructionInBundle(E: TE.get());
18585 }
18586
18587 if (ReductionRoot)
18588 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
18589 IP: ReductionRoot->getIterator());
18590 else
18591 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
18592
18593 // Vectorize gather operands of the nodes with the external uses only.
18594 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
18595 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
18596 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
18597 TE->UserTreeIndex.UserTE->hasState() &&
18598 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
18599 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
18600 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
18601 all_of(Range&: TE->UserTreeIndex.UserTE->Scalars,
18602 P: [](Value *V) { return isUsedOutsideBlock(V); })) {
18603 Instruction &LastInst =
18604 getLastInstructionInBundle(E: TE->UserTreeIndex.UserTE);
18605 GatherEntries.emplace_back(Args: TE.get(), Args: &LastInst);
18606 }
18607 }
18608 for (auto &Entry : GatherEntries) {
18609 IRBuilderBase::InsertPointGuard Guard(Builder);
18610 Builder.SetInsertPoint(Entry.second);
18611 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
18612 (void)vectorizeTree(E: Entry.first);
18613 }
18614 // Emit gathered loads first to emit better code for the users of those
18615 // gathered loads.
18616 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
18617 if (GatheredLoadsEntriesFirst.has_value() &&
18618 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
18619 (!TE->isGather() || TE->UserTreeIndex)) {
18620 assert((TE->UserTreeIndex ||
18621 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
18622 "Expected gathered load node.");
18623 (void)vectorizeTree(E: TE.get());
18624 }
18625 }
18626 (void)vectorizeTree(E: VectorizableTree[0].get());
18627 // Run through the list of postponed gathers and emit them, replacing the temp
18628 // emitted allocas with actual vector instructions.
18629 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
18630 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
18631 for (const TreeEntry *E : PostponedNodes) {
18632 auto *TE = const_cast<TreeEntry *>(E);
18633 auto *PrevVec = cast<Instruction>(Val&: TE->VectorizedValue);
18634 TE->VectorizedValue = nullptr;
18635 auto *UserI = cast<Instruction>(Val&: TE->UserTreeIndex.UserTE->VectorizedValue);
18636 // If the user is a PHI node, its vector code has to be inserted right
18637 // before the block terminator. Since the node was delayed, there were some
18638 // unresolved dependencies at the moment the stub instruction was emitted.
18639 // If any of these dependencies turn out to be an operand of another PHI
18640 // coming from this same block, the position of the stub instruction becomes
18641 // invalid, because the source vector that is supposed to feed this gather
18642 // node was inserted at the end of the block [after the stub instruction].
18643 // So we need to adjust the insertion point again to the end of the block.
18644 if (isa<PHINode>(Val: UserI)) {
18645 // Insert before all users.
18646 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
18647 for (User *U : PrevVec->users()) {
18648 if (U == UserI)
18649 continue;
18650 auto *UI = dyn_cast<Instruction>(Val: U);
18651 if (!UI || isa<PHINode>(Val: UI) || UI->getParent() != InsertPt->getParent())
18652 continue;
18653 if (UI->comesBefore(Other: InsertPt))
18654 InsertPt = UI;
18655 }
18656 Builder.SetInsertPoint(InsertPt);
18657 } else {
18658 Builder.SetInsertPoint(PrevVec);
18659 }
18660 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
18661 Value *Vec = vectorizeTree(E: TE);
18662 if (auto *VecI = dyn_cast<Instruction>(Val: Vec);
18663 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
18664 Builder.GetInsertPoint()->comesBefore(Other: VecI))
18665 VecI->moveBeforePreserving(BB&: *Builder.GetInsertBlock(),
18666 I: Builder.GetInsertPoint());
18667 if (Vec->getType() != PrevVec->getType()) {
18668 assert(Vec->getType()->isIntOrIntVectorTy() &&
18669 PrevVec->getType()->isIntOrIntVectorTy() &&
18670 "Expected integer vector types only.");
18671 std::optional<bool> IsSigned;
18672 for (Value *V : TE->Scalars) {
18673 if (isVectorized(V)) {
18674 for (const TreeEntry *MNTE : getTreeEntries(V)) {
18675 auto It = MinBWs.find(Val: MNTE);
18676 if (It != MinBWs.end()) {
18677 IsSigned = IsSigned.value_or(u: false) || It->second.second;
18678 if (*IsSigned)
18679 break;
18680 }
18681 }
18682 if (IsSigned.value_or(u: false))
18683 break;
18684 // Scan through gather nodes.
18685 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
18686 auto It = MinBWs.find(Val: BVE);
18687 if (It != MinBWs.end()) {
18688 IsSigned = IsSigned.value_or(u: false) || It->second.second;
18689 if (*IsSigned)
18690 break;
18691 }
18692 }
18693 if (IsSigned.value_or(u: false))
18694 break;
18695 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V)) {
18696 IsSigned =
18697 IsSigned.value_or(u: false) ||
18698 !isKnownNonNegative(V: EE->getVectorOperand(), SQ: SimplifyQuery(*DL));
18699 continue;
18700 }
18701 if (IsSigned.value_or(u: false))
18702 break;
18703 }
18704 }
18705 if (IsSigned.value_or(u: false)) {
18706 // Final attempt - check user node.
18707 auto It = MinBWs.find(Val: TE->UserTreeIndex.UserTE);
18708 if (It != MinBWs.end())
18709 IsSigned = It->second.second;
18710 }
18711 assert(IsSigned &&
18712 "Expected user node or perfect diamond match in MinBWs.");
18713 Vec = Builder.CreateIntCast(V: Vec, DestTy: PrevVec->getType(), isSigned: *IsSigned);
18714 }
18715 PrevVec->replaceAllUsesWith(V: Vec);
18716 PostponedValues.try_emplace(Key: Vec).first->second.push_back(Elt: TE);
18717 // Replace the stub vector node if it was already used for one of the
18718 // buildvector nodes.
18719 auto It = PostponedValues.find(Val: PrevVec);
18720 if (It != PostponedValues.end()) {
18721 for (TreeEntry *VTE : It->getSecond())
18722 VTE->VectorizedValue = Vec;
18723 }
18724 eraseInstruction(I: PrevVec);
18725 }
18726
18727 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
18728 << " values.\n");
18729
18730 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
18731 // Maps vector instruction to original insertelement instruction
18732 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
18733 // Maps extract Scalar to the corresponding extractelement instruction in the
18734 // basic block. Only one extractelement per block should be emitted.
18735 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
18736 ScalarToEEs;
18737 SmallDenseSet<Value *, 4> UsedInserts;
18738 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
18739 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
18740 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
18741 // Extract all of the elements with the external uses.
18742 for (const auto &ExternalUse : ExternalUses) {
18743 Value *Scalar = ExternalUse.Scalar;
18744 llvm::User *User = ExternalUse.User;
18745
18746 // Skip users that we have already replaced (RAUW). This happens when one
18747 // instruction has multiple uses of the same value.
18748 if (User && !is_contained(Range: Scalar->users(), Element: User))
18749 continue;
18750 const TreeEntry *E = &ExternalUse.E;
18751 assert(E && "Invalid scalar");
18752 assert(!E->isGather() && "Extracting from a gather list");
18753 // Non-instruction pointers are not deleted, just skip them.
18754 if (E->getOpcode() == Instruction::GetElementPtr &&
18755 !isa<GetElementPtrInst>(Val: Scalar))
18756 continue;
18757
18758 Value *Vec = E->VectorizedValue;
18759 assert(Vec && "Can't find vectorizable value");
18760
18761 Value *Lane = Builder.getInt32(C: ExternalUse.Lane);
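// Emit (or reuse) an extractelement for the external use of Scalar and,
// if the value was demoted, cast it back to the original scalar type.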
18762 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
18763 if (Scalar->getType() != Vec->getType()) {
18764 Value *Ex = nullptr;
18765 Value *ExV = nullptr;
18766 auto *Inst = dyn_cast<Instruction>(Val: Scalar);
18767 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Ptr: Inst);
18768 auto It = ScalarToEEs.find(Val: Scalar);
18769 if (It != ScalarToEEs.end()) {
18770 // No need to emit many extracts; just move the single existing one into
18771 // the current block.
18772 auto EEIt = It->second.find(Val: ReplaceInst ? Inst->getParent()
18773 : Builder.GetInsertBlock());
18774 if (EEIt != It->second.end()) {
18775 Value *PrevV = EEIt->second.first;
18776 if (auto *I = dyn_cast<Instruction>(Val: PrevV);
18777 I && !ReplaceInst &&
18778 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
18779 Builder.GetInsertPoint()->comesBefore(Other: I)) {
18780 I->moveBefore(BB&: *Builder.GetInsertPoint()->getParent(),
18781 I: Builder.GetInsertPoint());
18782 if (auto *CI = dyn_cast<Instruction>(Val: EEIt->second.second))
18783 CI->moveAfter(MovePos: I);
18784 }
18785 Ex = PrevV;
18786 ExV = EEIt->second.second ? EEIt->second.second : Ex;
18787 }
18788 }
18789 if (!Ex) {
18790 // "Reuse" the existing extract to improve final codegen.
18791 if (ReplaceInst) {
18792 // Leave the instruction as is if extracting is cheaper and all
18793 // operands are scalar.
18794 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst)) {
18795 IgnoredExtracts.insert(V: EE);
18796 Ex = EE;
18797 } else {
18798 auto *CloneInst = Inst->clone();
18799 CloneInst->insertBefore(InsertPos: Inst->getIterator());
18800 if (Inst->hasName())
18801 CloneInst->takeName(V: Inst);
18802 Ex = CloneInst;
18803 }
18804 } else if (auto *ES = dyn_cast<ExtractElementInst>(Val: Scalar);
18805 ES && isa<Instruction>(Val: Vec)) {
18806 Value *V = ES->getVectorOperand();
18807 auto *IVec = cast<Instruction>(Val: Vec);
18808 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
18809 V = ETEs.front()->VectorizedValue;
18810 if (auto *IV = dyn_cast<Instruction>(Val: V);
18811 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
18812 IV->comesBefore(Other: IVec))
18813 Ex = Builder.CreateExtractElement(Vec: V, Idx: ES->getIndexOperand());
18814 else
18815 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
18816 } else if (auto *VecTy =
18817 dyn_cast<FixedVectorType>(Val: Scalar->getType())) {
18818 assert(SLPReVec && "FixedVectorType is not expected.");
18819 unsigned VecTyNumElements = VecTy->getNumElements();
18820 // When REVEC is enabled, we need to extract a vector.
18821 // Note: The element size of Scalar may be different from the
18822 // element size of Vec.
18823 Ex = createExtractVector(Builder, Vec, SubVecVF: VecTyNumElements,
18824 Index: ExternalUse.Lane * VecTyNumElements);
18825 } else {
18826 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
18827 }
18828 // If necessary, sign-extend or zero-extend ScalarRoot
18829 // to the larger type.
18830 ExV = Ex;
18831 if (Scalar->getType() != Ex->getType())
18832 ExV = Builder.CreateIntCast(
18833 V: Ex, DestTy: Scalar->getType(),
18834 isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
18835 auto *I = dyn_cast<Instruction>(Val: Ex);
18836 ScalarToEEs[Scalar].try_emplace(Key: I ? I->getParent()
18837 : &F->getEntryBlock(),
18838 Args: std::make_pair(x&: Ex, y&: ExV));
18839 }
18840 // The then-branch of the previous if may produce constants, since
18841 // operand 0 might be a constant.
18842 if (auto *ExI = dyn_cast<Instruction>(Val: Ex);
18843 ExI && !isa<PHINode>(Val: ExI) && !mayHaveNonDefUseDependency(I: *ExI)) {
18844 GatherShuffleExtractSeq.insert(X: ExI);
18845 CSEBlocks.insert(V: ExI->getParent());
18846 }
18847 return ExV;
18848 }
18849 assert(isa<FixedVectorType>(Scalar->getType()) &&
18850 isa<InsertElementInst>(Scalar) &&
18851 "In-tree scalar of vector type is not insertelement?");
18852 auto *IE = cast<InsertElementInst>(Val: Scalar);
18853 VectorToInsertElement.try_emplace(Key: Vec, Args&: IE);
18854 return Vec;
18855 };
18856 // If User == nullptr, the Scalar remains as a scalar in the vectorized
18857 // instructions or is used as an extra argument. Generate an ExtractElement
18858 // instruction and update the record for this scalar in ExternallyUsedValues.
18859 if (!User) {
18860 if (!ScalarsWithNullptrUser.insert(V: Scalar).second)
18861 continue;
18862 assert(
18863 (ExternallyUsedValues.count(Scalar) ||
18864 Scalar->hasNUsesOrMore(UsesLimit) ||
18865 ExternalUsesAsOriginalScalar.contains(Scalar) ||
18866 any_of(
18867 Scalar->users(),
18868 [&, TTI = TTI](llvm::User *U) {
18869 if (ExternalUsesAsOriginalScalar.contains(U))
18870 return true;
18871 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
18872 return !UseEntries.empty() &&
18873 (E->State == TreeEntry::Vectorize ||
18874 E->State == TreeEntry::StridedVectorize ||
18875 E->State == TreeEntry::CompressVectorize) &&
18876 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
18877 return (UseEntry->State == TreeEntry::Vectorize ||
18878 UseEntry->State ==
18879 TreeEntry::StridedVectorize ||
18880 UseEntry->State ==
18881 TreeEntry::CompressVectorize) &&
18882 doesInTreeUserNeedToExtract(
18883 Scalar, getRootEntryInstruction(*UseEntry),
18884 TLI, TTI);
18885 });
18886 })) &&
18887 "Scalar with nullptr User must be registered in "
18888 "ExternallyUsedValues map or remain as scalar in vectorized "
18889 "instructions");
18890 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
18891 if (auto *PHI = dyn_cast<PHINode>(Val: VecI)) {
18892 if (PHI->getParent()->isLandingPad())
18893 Builder.SetInsertPoint(
18894 TheBB: PHI->getParent(),
18895 IP: std::next(
18896 x: PHI->getParent()->getLandingPadInst()->getIterator()));
18897 else
18898 Builder.SetInsertPoint(TheBB: PHI->getParent(),
18899 IP: PHI->getParent()->getFirstNonPHIIt());
18900 } else {
18901 Builder.SetInsertPoint(TheBB: VecI->getParent(),
18902 IP: std::next(x: VecI->getIterator()));
18903 }
18904 } else {
18905 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
18906 }
18907 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
18908 // Required to update internally referenced instructions.
18909 if (Scalar != NewInst) {
18910 assert((!isa<ExtractElementInst>(Scalar) ||
18911 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
18912 "Extractelements should not be replaced.");
18913 Scalar->replaceAllUsesWith(V: NewInst);
18914 }
18915 continue;
18916 }
18917
18918 if (auto *VU = dyn_cast<InsertElementInst>(Val: User);
18919 VU && VU->getOperand(i_nocapture: 1) == Scalar) {
18920 // Skip if the scalar is another vector op or Vec is not an instruction.
18921 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Val: Vec)) {
18922 if (auto *FTy = dyn_cast<FixedVectorType>(Val: User->getType())) {
18923 if (!UsedInserts.insert(V: VU).second)
18924 continue;
18925 // Need to use original vector, if the root is truncated.
18926 auto BWIt = MinBWs.find(Val: E);
18927 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
18928 auto *ScalarTy = FTy->getElementType();
18929 auto Key = std::make_pair(x&: Vec, y&: ScalarTy);
18930 auto VecIt = VectorCasts.find(Val: Key);
18931 if (VecIt == VectorCasts.end()) {
18932 IRBuilderBase::InsertPointGuard Guard(Builder);
18933 if (auto *IVec = dyn_cast<PHINode>(Val: Vec)) {
18934 if (IVec->getParent()->isLandingPad())
18935 Builder.SetInsertPoint(TheBB: IVec->getParent(),
18936 IP: std::next(x: IVec->getParent()
18937 ->getLandingPadInst()
18938 ->getIterator()));
18939 else
18940 Builder.SetInsertPoint(
18941 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
18942 } else if (auto *IVec = dyn_cast<Instruction>(Val: Vec)) {
18943 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
18944 }
18945 Vec = Builder.CreateIntCast(
18946 V: Vec,
18947 DestTy: getWidenedType(
18948 ScalarTy,
18949 VF: cast<FixedVectorType>(Val: Vec->getType())->getNumElements()),
18950 isSigned: BWIt->second.second);
18951 VectorCasts.try_emplace(Key, Args&: Vec);
18952 } else {
18953 Vec = VecIt->second;
18954 }
18955 }
18956
18957 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
18958 if (InsertIdx) {
18959 auto *It = find_if(
18960 Range&: ShuffledInserts, P: [VU](const ShuffledInsertData<Value *> &Data) {
18961 // Checks if 2 insertelements are from the same buildvector.
18962 InsertElementInst *VecInsert = Data.InsertElements.front();
18963 return areTwoInsertFromSameBuildVector(
18964 VU, V: VecInsert,
18965 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); });
18966 });
18967 unsigned Idx = *InsertIdx;
18968 if (It == ShuffledInserts.end()) {
18969 (void)ShuffledInserts.emplace_back();
18970 It = std::next(x: ShuffledInserts.begin(),
18971 n: ShuffledInserts.size() - 1);
18972 }
18973 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
18974 if (Mask.empty())
18975 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
18976 Mask[Idx] = ExternalUse.Lane;
18977 It->InsertElements.push_back(Elt: cast<InsertElementInst>(Val: User));
18978 continue;
18979 }
18980 }
18981 }
18982 }
18983
18984 // Generate extracts for out-of-tree users.
18985 // Find the insertion point for the extractelement lane.
18986 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
18987 if (PHINode *PH = dyn_cast<PHINode>(Val: User)) {
18988 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
18989 if (PH->getIncomingValue(i: I) == Scalar) {
18990 Instruction *IncomingTerminator =
18991 PH->getIncomingBlock(i: I)->getTerminator();
18992 if (isa<CatchSwitchInst>(Val: IncomingTerminator)) {
18993 Builder.SetInsertPoint(TheBB: VecI->getParent(),
18994 IP: std::next(x: VecI->getIterator()));
18995 } else {
18996 Builder.SetInsertPoint(PH->getIncomingBlock(i: I)->getTerminator());
18997 }
18998 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
18999 PH->setOperand(i_nocapture: I, Val_nocapture: NewInst);
19000 }
19001 }
19002 } else {
19003 Builder.SetInsertPoint(cast<Instruction>(Val: User));
19004 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
19005 User->replaceUsesOfWith(From: Scalar, To: NewInst);
19006 }
19007 } else {
19008 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
19009 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
19010 User->replaceUsesOfWith(From: Scalar, To: NewInst);
19011 }
19012
19013 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
19014 }
19015
19016 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
19017 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
19018 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
19019 int VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
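    // Illustrative example (values not taken from any particular input): with
    // VF == 4 and Mask == {0, 5, 2, 7}, the loop below produces
    // CombinedMask1 == {0, P, 2, P} and CombinedMask2 == {P, 1, P, 3}, where
    // P stands for PoisonMaskElem, so every lane is routed to exactly one of
    // the two sources.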
19020 for (int I = 0, E = Mask.size(); I < E; ++I) {
19021 if (Mask[I] < VF)
19022 CombinedMask1[I] = Mask[I];
19023 else
19024 CombinedMask2[I] = Mask[I] - VF;
19025 }
19026 ShuffleInstructionBuilder ShuffleBuilder(
19027 cast<VectorType>(Val: V1->getType())->getElementType(), Builder, *this);
19028 ShuffleBuilder.add(V1, Mask: CombinedMask1);
19029 if (V2)
19030 ShuffleBuilder.add(V1: V2, Mask: CombinedMask2);
19031 return ShuffleBuilder.finalize(ExtMask: {}, SubVectors: {}, SubVectorsMask: {});
19032 };
19033
19034 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
19035 bool ForSingleMask) {
19036 unsigned VF = Mask.size();
19037 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
19038 if (VF != VecVF) {
19039 if (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
19040 Vec = CreateShuffle(Vec, nullptr, Mask);
19041 return std::make_pair(x&: Vec, y: true);
19042 }
19043 if (!ForSingleMask) {
19044 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
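        // Illustrative example (not from any particular input): with VF == 3
        // and Mask == {2, P, 0} (P == PoisonMaskElem), the loop below builds
        // ResizeMask == {0, P, 2}, an identity mask restricted to the used
        // lanes, so the shuffle only resizes Vec to VF elements without
        // moving the referenced elements.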
19045 for (unsigned I = 0; I < VF; ++I) {
19046 if (Mask[I] != PoisonMaskElem)
19047 ResizeMask[Mask[I]] = Mask[I];
19048 }
19049 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
19050 }
19051 }
19052
19053 return std::make_pair(x&: Vec, y: false);
19054 };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
19057 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
19058 // Find the first and the last instruction in the list of insertelements.
19059 sort(C&: ShuffledInserts[I].InsertElements, Comp: isFirstInsertElement);
19060 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
19061 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
19062 Builder.SetInsertPoint(LastInsert);
19063 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
19064 Value *NewInst = performExtractsShuffleAction<Value>(
19065 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()),
19066 Base: FirstInsert->getOperand(i_nocapture: 0),
19067 GetVF: [](Value *Vec) {
19068 return cast<VectorType>(Val: Vec->getType())
19069 ->getElementCount()
19070 .getKnownMinValue();
19071 },
19072 ResizeAction: ResizeToVF,
19073 Action: [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
19074 ArrayRef<Value *> Vals) {
19075 assert((Vals.size() == 1 || Vals.size() == 2) &&
19076 "Expected exactly 1 or 2 input values.");
19077 if (Vals.size() == 1) {
19078 // Do not create shuffle if the mask is a simple identity
19079 // non-resizing mask.
19080 if (Mask.size() != cast<FixedVectorType>(Val: Vals.front()->getType())
19081 ->getNumElements() ||
19082 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
19083 return CreateShuffle(Vals.front(), nullptr, Mask);
19084 return Vals.front();
19085 }
19086 return CreateShuffle(Vals.front() ? Vals.front()
19087 : FirstInsert->getOperand(i_nocapture: 0),
19088 Vals.back(), Mask);
19089 });
19090 auto It = ShuffledInserts[I].InsertElements.rbegin();
19091 // Rebuild buildvector chain.
19092 InsertElementInst *II = nullptr;
19093 if (It != ShuffledInserts[I].InsertElements.rend())
19094 II = *It;
19095 SmallVector<Instruction *> Inserts;
19096 while (It != ShuffledInserts[I].InsertElements.rend()) {
19097 assert(II && "Must be an insertelement instruction.");
19098 if (*It == II)
19099 ++It;
19100 else
19101 Inserts.push_back(Elt: cast<Instruction>(Val: II));
19102 II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
19103 }
19104 for (Instruction *II : reverse(C&: Inserts)) {
19105 II->replaceUsesOfWith(From: II->getOperand(i: 0), To: NewInst);
19106 if (auto *NewI = dyn_cast<Instruction>(Val: NewInst))
19107 if (II->getParent() == NewI->getParent() && II->comesBefore(Other: NewI))
19108 II->moveAfter(MovePos: NewI);
19109 NewInst = II;
19110 }
19111 LastInsert->replaceAllUsesWith(V: NewInst);
19112 for (InsertElementInst *IE : reverse(C&: ShuffledInserts[I].InsertElements)) {
19113 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 0),
19114 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 0)->getType()));
19115 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 1),
19116 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 1)->getType()));
19117 eraseInstruction(I: IE);
19118 }
19119 CSEBlocks.insert(V: LastInsert->getParent());
19120 }
19121
19122 SmallVector<Instruction *> RemovedInsts;
19123 // For each vectorized value:
19124 for (auto &TEPtr : VectorizableTree) {
19125 TreeEntry *Entry = TEPtr.get();
19126
19127 // No need to handle users of gathered values.
19128 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize)
19129 continue;
19130
19131 assert(Entry->VectorizedValue && "Can't find vectorizable value");
19132
19133 // For each lane:
19134 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
19135 Value *Scalar = Entry->Scalars[Lane];
19136
19137 if (Entry->getOpcode() == Instruction::GetElementPtr &&
19138 !isa<GetElementPtrInst>(Val: Scalar))
19139 continue;
19140 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Scalar);
19141 EE && IgnoredExtracts.contains(V: EE))
19142 continue;
19143 if (isa<PoisonValue>(Val: Scalar))
19144 continue;
19145#ifndef NDEBUG
19146 Type *Ty = Scalar->getType();
19147 if (!Ty->isVoidTy()) {
19148 for (User *U : Scalar->users()) {
19149 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
19150
19151 // It is legal to delete users in the ignorelist.
19152 assert((isVectorized(U) ||
19153 (UserIgnoreList && UserIgnoreList->contains(U)) ||
19154 (isa_and_nonnull<Instruction>(U) &&
19155 isDeleted(cast<Instruction>(U)))) &&
19156 "Deleting out-of-tree value");
19157 }
19158 }
19159#endif
19160 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
19161 auto *I = cast<Instruction>(Val: Scalar);
19162 RemovedInsts.push_back(Elt: I);
19163 }
19164 }
19165
19166 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
19167 // new vector instruction.
19168 if (auto *V = dyn_cast<Instruction>(Val&: VectorizableTree[0]->VectorizedValue))
19169 V->mergeDIAssignID(SourceInstructions: RemovedInsts);
19170
19171 // Clear up reduction references, if any.
19172 if (UserIgnoreList) {
19173 for (Instruction *I : RemovedInsts) {
19174 const TreeEntry *IE = getTreeEntries(V: I).front();
19175 if (IE->Idx != 0 &&
19176 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
19177 (ValueToGatherNodes.lookup(Val: I).contains(
19178 key: VectorizableTree.front().get()) ||
19179 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
19180 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
19181 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
19182 IE->UserTreeIndex &&
19183 is_contained(Range&: VectorizableTree.front()->Scalars, Element: I)) &&
19184 !(GatheredLoadsEntriesFirst.has_value() &&
19185 IE->Idx >= *GatheredLoadsEntriesFirst &&
19186 VectorizableTree.front()->isGather() &&
19187 is_contained(Range&: VectorizableTree.front()->Scalars, Element: I)))
19188 continue;
19189 SmallVector<SelectInst *> LogicalOpSelects;
19190 I->replaceUsesWithIf(New: PoisonValue::get(T: I->getType()), ShouldReplace: [&](Use &U) {
      // Do not replace the condition operand of a logical and/or that is
      // expressed in select form (select <cond>, X, Y).
19192 bool IsPoisoningLogicalOp = isa<SelectInst>(Val: U.getUser()) &&
19193 (match(V: U.getUser(), P: m_LogicalAnd()) ||
19194 match(V: U.getUser(), P: m_LogicalOr())) &&
19195 U.getOperandNo() == 0;
19196 if (IsPoisoningLogicalOp) {
19197 LogicalOpSelects.push_back(Elt: cast<SelectInst>(Val: U.getUser()));
19198 return false;
19199 }
19200 return UserIgnoreList->contains(V: U.getUser());
19201 });
19202 // Replace conditions of the poisoning logical ops with the non-poison
19203 // constant value.
19204 for (SelectInst *SI : LogicalOpSelects)
19205 SI->setCondition(Constant::getNullValue(Ty: SI->getCondition()->getType()));
19206 }
19207 }
  // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
  // cache correctness.
  // NOTE: removeInstructionsAndOperands only marks the instructions for
  // deletion - they are not actually deleted until later.
19212 removeInstructionsAndOperands(DeadVals: ArrayRef(RemovedInsts), VectorValuesAndScales);
19213
19214 Builder.ClearInsertionPoint();
19215 InstrElementSize.clear();
19216
19217 const TreeEntry &RootTE = *VectorizableTree.front();
19218 Value *Vec = RootTE.VectorizedValue;
19219 if (auto It = MinBWs.find(Val: &RootTE); ReductionBitWidth != 0 &&
19220 It != MinBWs.end() &&
19221 ReductionBitWidth != It->second.first) {
19222 IRBuilder<>::InsertPointGuard Guard(Builder);
19223 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
19224 IP: ReductionRoot->getIterator());
19225 Vec = Builder.CreateIntCast(
19226 V: Vec,
19227 DestTy: VectorType::get(ElementType: Builder.getIntNTy(N: ReductionBitWidth),
19228 EC: cast<VectorType>(Val: Vec->getType())->getElementCount()),
19229 isSigned: It->second.second);
19230 }
19231 return Vec;
19232}
19233
19234void BoUpSLP::optimizeGatherSequence() {
19235 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
19236 << " gather sequences instructions.\n");
19237 // LICM InsertElementInst sequences.
19238 for (Instruction *I : GatherShuffleExtractSeq) {
19239 if (isDeleted(I))
19240 continue;
19241
19242 // Check if this block is inside a loop.
19243 Loop *L = LI->getLoopFor(BB: I->getParent());
19244 if (!L)
19245 continue;
19246
19247 // Check if it has a preheader.
19248 BasicBlock *PreHeader = L->getLoopPreheader();
19249 if (!PreHeader)
19250 continue;
19251
19252 // If the vector or the element that we insert into it are
19253 // instructions that are defined in this basic block then we can't
19254 // hoist this instruction.
19255 if (any_of(Range: I->operands(), P: [L](Value *V) {
19256 auto *OpI = dyn_cast<Instruction>(Val: V);
19257 return OpI && L->contains(Inst: OpI);
19258 }))
19259 continue;
19260
19261 // We can hoist this instruction. Move it to the pre-header.
19262 I->moveBefore(InsertPos: PreHeader->getTerminator()->getIterator());
19263 CSEBlocks.insert(V: PreHeader);
19264 }
19265
19266 // Make a list of all reachable blocks in our CSE queue.
19267 SmallVector<const DomTreeNode *, 8> CSEWorkList;
19268 CSEWorkList.reserve(N: CSEBlocks.size());
19269 for (BasicBlock *BB : CSEBlocks)
19270 if (DomTreeNode *N = DT->getNode(BB)) {
19271 assert(DT->isReachableFromEntry(N));
19272 CSEWorkList.push_back(Elt: N);
19273 }
19274
19275 // Sort blocks by domination. This ensures we visit a block after all blocks
19276 // dominating it are visited.
19277 llvm::sort(C&: CSEWorkList, Comp: [](const DomTreeNode *A, const DomTreeNode *B) {
19278 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
19279 "Different nodes should have different DFS numbers");
19280 return A->getDFSNumIn() < B->getDFSNumIn();
19281 });
19282
  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles, one is less defined if it has the same vector
  // operands and each of its mask indices is either the same as in the other
  // shuffle or undef. E.g. shuffle %0, poison, <0, 0, 0, undef> is less
  // defined than shuffle %0, poison, <0, 0, 0, 0>.
19288 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
19289 Instruction *I2,
19290 SmallVectorImpl<int> &NewMask) {
19291 if (I1->getType() != I2->getType())
19292 return false;
19293 auto *SI1 = dyn_cast<ShuffleVectorInst>(Val: I1);
19294 auto *SI2 = dyn_cast<ShuffleVectorInst>(Val: I2);
19295 if (!SI1 || !SI2)
19296 return I1->isIdenticalTo(I: I2);
19297 if (SI1->isIdenticalTo(I: SI2))
19298 return true;
19299 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
19300 if (SI1->getOperand(i_nocapture: I) != SI2->getOperand(i_nocapture: I))
19301 return false;
19302 // Check if the second instruction is more defined than the first one.
19303 NewMask.assign(in_start: SI2->getShuffleMask().begin(), in_end: SI2->getShuffleMask().end());
19304 ArrayRef<int> SM1 = SI1->getShuffleMask();
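    // Illustrative example (masks invented for exposition): with
    // SM1 == {0, P, 2, P} and NewMask (SI2's mask) == {0, 1, P, P}, the loop
    // below accepts the pair and fills NewMask in to {0, 1, 2, P}; any
    // disagreement in a defined position would reject the pair instead.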
19305 // Count trailing undefs in the mask to check the final number of used
19306 // registers.
19307 unsigned LastUndefsCnt = 0;
19308 for (int I = 0, E = NewMask.size(); I < E; ++I) {
19309 if (SM1[I] == PoisonMaskElem)
19310 ++LastUndefsCnt;
19311 else
19312 LastUndefsCnt = 0;
19313 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
19314 NewMask[I] != SM1[I])
19315 return false;
19316 if (NewMask[I] == PoisonMaskElem)
19317 NewMask[I] = SM1[I];
19318 }
19319 // Check if the last undefs actually change the final number of used vector
19320 // registers.
19321 return SM1.size() - LastUndefsCnt > 1 &&
19322 ::getNumberOfParts(TTI: *TTI, VecTy: SI1->getType()) ==
19323 ::getNumberOfParts(
19324 TTI: *TTI, VecTy: getWidenedType(ScalarTy: SI1->getType()->getElementType(),
19325 VF: SM1.size() - LastUndefsCnt));
19326 };
19327 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
19328 // instructions. TODO: We can further optimize this scan if we split the
19329 // instructions into different buckets based on the insert lane.
19330 SmallVector<Instruction *, 16> Visited;
19331 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
19332 assert(*I &&
19333 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
19334 "Worklist not sorted properly!");
19335 BasicBlock *BB = (*I)->getBlock();
19336 // For all instructions in blocks containing gather sequences:
19337 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
19338 if (isDeleted(I: &In))
19339 continue;
19340 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(Val: &In) &&
19341 !GatherShuffleExtractSeq.contains(key: &In))
19342 continue;
19343
19344 // Check if we can replace this instruction with any of the
19345 // visited instructions.
19346 bool Replaced = false;
19347 for (Instruction *&V : Visited) {
19348 SmallVector<int> NewMask;
19349 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
19350 DT->dominates(A: V->getParent(), B: In.getParent())) {
19351 In.replaceAllUsesWith(V);
19352 eraseInstruction(I: &In);
19353 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: V))
19354 if (!NewMask.empty())
19355 SI->setShuffleMask(NewMask);
19356 Replaced = true;
19357 break;
19358 }
19359 if (isa<ShuffleVectorInst>(Val: In) && isa<ShuffleVectorInst>(Val: V) &&
19360 GatherShuffleExtractSeq.contains(key: V) &&
19361 IsIdenticalOrLessDefined(V, &In, NewMask) &&
19362 DT->dominates(A: In.getParent(), B: V->getParent())) {
19363 In.moveAfter(MovePos: V);
19364 V->replaceAllUsesWith(V: &In);
19365 eraseInstruction(I: V);
19366 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: &In))
19367 if (!NewMask.empty())
19368 SI->setShuffleMask(NewMask);
19369 V = &In;
19370 Replaced = true;
19371 break;
19372 }
19373 }
19374 if (!Replaced) {
19375 assert(!is_contained(Visited, &In));
19376 Visited.push_back(Elt: &In);
19377 }
19378 }
19379 }
19380 CSEBlocks.clear();
19381 GatherShuffleExtractSeq.clear();
19382}
19383
19384BoUpSLP::ScheduleBundle &
19385BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
19386 auto &BundlePtr =
19387 ScheduledBundlesList.emplace_back(Args: std::make_unique<ScheduleBundle>());
19388 for (Value *V : VL) {
19389 if (doesNotNeedToBeScheduled(V))
19390 continue;
19391 ScheduleData *BundleMember = getScheduleData(V);
19392 assert(BundleMember && "no ScheduleData for bundle member "
19393 "(maybe not in same basic block)");
19394 // Group the instructions to a bundle.
19395 BundlePtr->add(SD: BundleMember);
19396 ScheduledBundles.try_emplace(Key: cast<Instruction>(Val: V))
19397 .first->getSecond()
19398 .push_back(Elt: BundlePtr.get());
19399 }
19400 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
19401 return *BundlePtr;
19402}
19403
// Groups the instructions into a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
19406std::optional<BoUpSLP::ScheduleBundle *>
19407BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
19408 const InstructionsState &S) {
19409 // No need to schedule PHIs, insertelement, extractelement and extractvalue
19410 // instructions.
19411 if (isa<PHINode>(Val: S.getMainOp()) ||
19412 isVectorLikeInstWithConstOps(V: S.getMainOp()) || doesNotNeedToSchedule(VL))
19413 return nullptr;
19414
19415 // Initialize the instruction bundle.
19416 Instruction *OldScheduleEnd = ScheduleEnd;
19417 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
19418
19419 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
19420 // The scheduling region got new instructions at the lower end (or it is a
19421 // new region for the first bundle). This makes it necessary to
19422 // recalculate all dependencies.
19423 // It is seldom that this needs to be done a second time after adding the
19424 // initial bundle to the region.
19425 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
19426 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
19427 if (ScheduleData *SD = getScheduleData(I))
19428 SD->clearDependencies();
19429 }
19430 ReSchedule = true;
19431 }
19432 if (Bundle && !Bundle.getBundle().empty()) {
19433 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
19434 << BB->getName() << "\n");
19435 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP);
19436 }
19437
19438 if (ReSchedule) {
19439 resetSchedule();
19440 initialFillReadyList(ReadyList&: ReadyInsts);
19441 }
19442
    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it. Note that it is important
    // that we don't "schedule" the bundle yet.
19447 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
19448 !ReadyInsts.empty()) {
19449 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
19450 assert(Picked->isReady() && "must be ready to schedule");
19451 schedule(Data: Picked, ReadyList&: ReadyInsts);
19452 if (Picked == &Bundle)
19453 break;
19454 }
19455 };
19456
19457 // Make sure that the scheduling region contains all
19458 // instructions of the bundle.
19459 for (Value *V : VL) {
19460 if (doesNotNeedToBeScheduled(V))
19461 continue;
19462 if (!extendSchedulingRegion(V, S)) {
      // The scheduling region got new instructions at the lower end (or it
      // is a new region for the first bundle). This makes it necessary to
      // recalculate all dependencies.
      // Otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instructions in the wrong order during the
      // actual scheduling.
19469 ScheduleBundle Invalid = ScheduleBundle::invalid();
19470 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
19471 return std::nullopt;
19472 }
19473 }
19474
19475 bool ReSchedule = false;
19476 for (Value *V : VL) {
19477 if (doesNotNeedToBeScheduled(V))
19478 continue;
19479 ScheduleData *BundleMember = getScheduleData(V);
19480 assert(BundleMember &&
19481 "no ScheduleData for bundle member (maybe not in same basic block)");
19482
19483 // Make sure we don't leave the pieces of the bundle in the ready list when
19484 // whole bundle might not be ready.
19485 ReadyInsts.remove(X: BundleMember);
19486 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
19487 !Bundles.empty()) {
19488 for (ScheduleBundle *B : Bundles)
19489 ReadyInsts.remove(X: B);
19490 }
19491
19492 if (!BundleMember->isScheduled())
19493 continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
19497 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
19498 << " was already scheduled\n");
19499 ReSchedule = true;
19500 }
19501
19502 ScheduleBundle &Bundle = buildBundle(VL);
19503 TryScheduleBundleImpl(ReSchedule, Bundle);
19504 if (!Bundle.isReady()) {
19505 for (ScheduleData *BD : Bundle.getBundle()) {
19506 if (BD->isReady()) {
19507 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: BD->getInst());
19508 if (Bundles.empty()) {
19509 ReadyInsts.insert(X: BD);
19510 continue;
19511 }
19512 for (ScheduleBundle *B : Bundles)
19513 if (B->isReady())
19514 ReadyInsts.insert(X: B);
19515 }
19516 }
19517 ScheduledBundlesList.pop_back();
19518 for (Value *V : VL) {
19519 if (doesNotNeedToBeScheduled(V))
19520 continue;
19521 ScheduledBundles.find(Val: cast<Instruction>(Val: V))->getSecond().pop_back();
19522 }
19523 return std::nullopt;
19524 }
19525 return &Bundle;
19526}
19527
19528BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
19529 // Allocate a new ScheduleData for the instruction.
19530 if (ChunkPos >= ChunkSize) {
19531 ScheduleDataChunks.push_back(Elt: std::make_unique<ScheduleData[]>(num: ChunkSize));
19532 ChunkPos = 0;
19533 }
19534 return &(ScheduleDataChunks.back()[ChunkPos++]);
19535}
19536
19537bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
19538 Value *V, const InstructionsState &S) {
19539 Instruction *I = dyn_cast<Instruction>(Val: V);
19540 assert(I && "bundle member must be an instruction");
19541 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
19542 !doesNotNeedToBeScheduled(I) &&
19543 "phi nodes/insertelements/extractelements/extractvalues don't need to "
19544 "be scheduled");
19545 if (getScheduleData(I))
19546 return true;
19547 if (!ScheduleStart) {
19548 // It's the first instruction in the new region.
19549 initScheduleData(FromI: I, ToI: I->getNextNode(), PrevLoadStore: nullptr, NextLoadStore: nullptr);
19550 ScheduleStart = I;
19551 ScheduleEnd = I->getNextNode();
19552 assert(ScheduleEnd && "tried to vectorize a terminator?");
19553 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
19554 return true;
19555 }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that it is not
  // counted against the budget. Otherwise debug info could affect codegen.
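  // For example (illustrative): if the new instruction I lies above
  // ScheduleStart, either the upward scan reaches I or the downward scan hits
  // the block end first, and ScheduleStart is moved up to I; if I lies below
  // ScheduleEnd, the downward scan reaches it and ScheduleEnd is moved past I.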
19560 BasicBlock::reverse_iterator UpIter =
19561 ++ScheduleStart->getIterator().getReverse();
19562 BasicBlock::reverse_iterator UpperEnd = BB->rend();
19563 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
19564 BasicBlock::iterator LowerEnd = BB->end();
19565 auto IsAssumeLikeIntr = [](const Instruction &I) {
19566 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
19567 return II->isAssumeLikeIntrinsic();
19568 return false;
19569 };
19570 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
19571 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
19572 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
19573 &*DownIter != I) {
19574 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
19575 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
19576 return false;
19577 }
19578
19579 ++UpIter;
19580 ++DownIter;
19581
19582 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
19583 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
19584 }
19585 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
19586 assert(I->getParent() == ScheduleStart->getParent() &&
19587 "Instruction is in wrong basic block.");
19588 initScheduleData(FromI: I, ToI: ScheduleStart, PrevLoadStore: nullptr, NextLoadStore: FirstLoadStoreInRegion);
19589 ScheduleStart = I;
19590 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
19591 << "\n");
19592 return true;
19593 }
19594 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
19595 "Expected to reach top of the basic block or instruction down the "
19596 "lower end.");
19597 assert(I->getParent() == ScheduleEnd->getParent() &&
19598 "Instruction is in wrong basic block.");
19599 initScheduleData(FromI: ScheduleEnd, ToI: I->getNextNode(), PrevLoadStore: LastLoadStoreInRegion,
19600 NextLoadStore: nullptr);
19601 ScheduleEnd = I->getNextNode();
19602 assert(ScheduleEnd && "tried to vectorize a terminator?");
19603 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
19604 return true;
19605}
19606
19607void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
19608 Instruction *ToI,
19609 ScheduleData *PrevLoadStore,
19610 ScheduleData *NextLoadStore) {
19611 ScheduleData *CurrentLoadStore = PrevLoadStore;
19612 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
19613 // No need to allocate data for non-schedulable instructions.
19614 if (doesNotNeedToBeScheduled(V: I))
19615 continue;
19616 ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
19617 if (!SD) {
19618 SD = allocateScheduleDataChunks();
19619 ScheduleDataMap[I] = SD;
19620 }
19621 assert(!isInSchedulingRegion(SD) &&
19622 "new ScheduleData already in scheduling region");
19623 SD->init(BlockSchedulingRegionID: SchedulingRegionID, I);
19624
19625 if (I->mayReadOrWriteMemory() &&
19626 (!isa<IntrinsicInst>(Val: I) ||
19627 (cast<IntrinsicInst>(Val: I)->getIntrinsicID() != Intrinsic::sideeffect &&
19628 cast<IntrinsicInst>(Val: I)->getIntrinsicID() !=
19629 Intrinsic::pseudoprobe))) {
19630 // Update the linked list of memory accessing instructions.
19631 if (CurrentLoadStore) {
19632 CurrentLoadStore->setNextLoadStore(SD);
19633 } else {
19634 FirstLoadStoreInRegion = SD;
19635 }
19636 CurrentLoadStore = SD;
19637 }
19638
19639 if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
19640 match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
19641 RegionHasStackSave = true;
19642 }
19643 if (NextLoadStore) {
19644 if (CurrentLoadStore)
19645 CurrentLoadStore->setNextLoadStore(NextLoadStore);
19646 } else {
19647 LastLoadStoreInRegion = CurrentLoadStore;
19648 }
19649}
19650
19651void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle,
19652 bool InsertInReadyList,
19653 BoUpSLP *SLP) {
19654 SmallVector<ScheduleData *> WorkList;
19655 auto ProcessNode = [&](ScheduleData *BundleMember) {
19656 if (BundleMember->hasValidDependencies())
19657 return;
19658 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
19659 BundleMember->initDependencies();
19660 BundleMember->resetUnscheduledDeps();
19661 // Handle def-use chain dependencies.
19662 for (User *U : BundleMember->getInst()->users()) {
19663 if (ScheduleData *UseSD = getScheduleData(V: U)) {
19664 BundleMember->incDependencies();
19665 if (!UseSD->isScheduled())
19666 BundleMember->incrementUnscheduledDeps(Incr: 1);
19667 WorkList.push_back(Elt: UseSD);
19668 }
19669 }
19670
19671 auto MakeControlDependent = [&](Instruction *I) {
19672 auto *DepDest = getScheduleData(I);
19673 assert(DepDest && "must be in schedule window");
19674 DepDest->addControlDependency(Dep: BundleMember);
19675 BundleMember->incDependencies();
19676 if (!DepDest->isScheduled())
19677 BundleMember->incrementUnscheduledDeps(Incr: 1);
19678 WorkList.push_back(Elt: DepDest);
19679 };
19680
    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control dependent on any early exit or non-willreturn call
    // which precedes it.
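    // For instance (illustrative), if this bundle member is a call that may
    // not return, a later store or potentially faulting load must not be
    // hoisted above it, so a control dependency is recorded for each such
    // instruction below.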
19684 if (!isGuaranteedToTransferExecutionToSuccessor(I: BundleMember->getInst())) {
19685 for (Instruction *I = BundleMember->getInst()->getNextNode();
19686 I != ScheduleEnd; I = I->getNextNode()) {
19687 if (isSafeToSpeculativelyExecute(I, CtxI: &*BB->begin(), AC: SLP->AC))
19688 continue;
19689
19690 // Add the dependency
19691 MakeControlDependent(I);
19692
19693 if (!isGuaranteedToTransferExecutionToSuccessor(I))
19694 // Everything past here must be control dependent on I.
19695 break;
19696 }
19697 }
19698
19699 if (RegionHasStackSave) {
      // If we have an inalloca alloca instruction, it needs to be scheduled
      // after any preceding stacksave. We also need to prevent any alloca
      // from reordering above a preceding stackrestore.
19703 if (match(V: BundleMember->getInst(), P: m_Intrinsic<Intrinsic::stacksave>()) ||
19704 match(V: BundleMember->getInst(),
19705 P: m_Intrinsic<Intrinsic::stackrestore>())) {
19706 for (Instruction *I = BundleMember->getInst()->getNextNode();
19707 I != ScheduleEnd; I = I->getNextNode()) {
19708 if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
19709 match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
            // Any allocas past here must be control dependent on I, and I
            // must be memory dependent on BundleMember->Inst.
19712 break;
19713
19714 if (!isa<AllocaInst>(Val: I))
19715 continue;
19716
19717 // Add the dependency
19718 MakeControlDependent(I);
19719 }
19720 }
19721
      // In addition to the cases handled just above, we need to prevent
      // allocas and loads/stores from moving below a stacksave or a
      // stackrestore. Avoiding moving allocas below a stackrestore is
      // currently thought to be conservative; moving loads/stores below a
      // stackrestore can lead to incorrect code.
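      // For example (illustrative), sinking a store to an alloca below a
      // stackrestore could make it write into stack memory that has already
      // been deallocated, so a dependency on the first stacksave/stackrestore
      // found below is added instead.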
19727 if (isa<AllocaInst>(Val: BundleMember->getInst()) ||
19728 BundleMember->getInst()->mayReadOrWriteMemory()) {
19729 for (Instruction *I = BundleMember->getInst()->getNextNode();
19730 I != ScheduleEnd; I = I->getNextNode()) {
19731 if (!match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) &&
19732 !match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
19733 continue;
19734
19735 // Add the dependency
19736 MakeControlDependent(I);
19737 break;
19738 }
19739 }
19740 }
19741
19742 // Handle the memory dependencies (if any).
19743 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
19744 if (!NextLoadStore)
19745 return;
19746 Instruction *SrcInst = BundleMember->getInst();
19747 assert(SrcInst->mayReadOrWriteMemory() &&
19748 "NextLoadStore list for non memory effecting bundle?");
19749 MemoryLocation SrcLoc = getLocation(I: SrcInst);
19750 bool SrcMayWrite = SrcInst->mayWriteToMemory();
19751 unsigned NumAliased = 0;
19752 unsigned DistToSrc = 1;
19753 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(I: SrcInst);
19754
19755 for (ScheduleData *DepDest = NextLoadStore; DepDest;
19756 DepDest = DepDest->getNextLoadStore()) {
19757 assert(isInSchedulingRegion(DepDest) && "Expected to be in region");
19758
19759 // We have two limits to reduce the complexity:
19760 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
19761 // SLP->isAliased (which is the expensive part in this loop).
19762 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
19763 // the whole loop (even if the loop is fast, it's quadratic).
19764 // It's important for the loop break condition (see below) to
19765 // check this limit even between two read-only instructions.
19766 if (DistToSrc >= MaxMemDepDistance ||
19767 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
19768 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
19769 SLP->isAliased(Loc1: SrcLoc, Inst1: SrcInst, Inst2: DepDest->getInst())))) {
19770
19771 // We increment the counter only if the locations are aliased
19772 // (instead of counting all alias checks). This gives a better
19773 // balance between reduced runtime and accurate dependencies.
19774 NumAliased++;
19775
19776 DepDest->addMemoryDependency(Dep: BundleMember);
19777 BundleMember->incDependencies();
19778 if (!DepDest->isScheduled())
19779 BundleMember->incrementUnscheduledDeps(Incr: 1);
19780 WorkList.push_back(Elt: DepDest);
19781 }
19782
19783 // Example, explaining the loop break condition: Let's assume our
19784 // starting instruction is i0 and MaxMemDepDistance = 3.
19785 //
19786 // +--------v--v--v
19787 // i0,i1,i2,i3,i4,i5,i6,i7,i8
19788 // +--------^--^--^
19789 //
19790 // MaxMemDepDistance let us stop alias-checking at i3 and we add
19791 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
19792 // Previously we already added dependencies from i3 to i6,i7,i8
19793 // (because of MaxMemDepDistance). As we added a dependency from
19794 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
19795 // and we can abort this loop at i6.
19796 if (DistToSrc >= 2 * MaxMemDepDistance)
19797 break;
19798 DistToSrc++;
19799 }
19800 };
19801
19802 WorkList.push_back(Elt: Bundle.getBundle().front());
19803 SmallPtrSet<ScheduleBundle *, 16> Visited;
19804 while (!WorkList.empty()) {
19805 ScheduleData *SD = WorkList.pop_back_val();
19806 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: SD->getInst());
19807 if (Bundles.empty()) {
19808 ProcessNode(SD);
19809 if (InsertInReadyList && SD->isReady()) {
19810 ReadyInsts.insert(X: SD);
19811 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
19812 }
19813 continue;
19814 }
19815 for (ScheduleBundle *Bundle : Bundles) {
19816 if (!Visited.insert(Ptr: Bundle).second || Bundle->hasValidDependencies())
19817 continue;
19818 assert(isInSchedulingRegion(*Bundle) &&
19819 "ScheduleData not in scheduling region");
19820 for_each(Range: Bundle->getBundle(), F: ProcessNode);
19821 }
19822 if (InsertInReadyList && SD->isReady()) {
19823 for (ScheduleBundle *Bundle : Bundles) {
19824 assert(isInSchedulingRegion(*Bundle) &&
19825 "ScheduleData not in scheduling region");
19826 if (!Bundle->isReady())
19827 continue;
19828 ReadyInsts.insert(X: Bundle);
19829 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
19830 << "\n");
19831 }
19832 }
19833 }
19834}
19835
19836void BoUpSLP::BlockScheduling::resetSchedule() {
19837 assert(ScheduleStart &&
19838 "tried to reset schedule on block which has not been scheduled");
19839 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
19840 if (ScheduleData *SD = getScheduleData(I)) {
19841 assert(isInSchedulingRegion(SD) &&
19842 "ScheduleData not in scheduling region");
19843 SD->setScheduled(/*Scheduled=*/false);
19844 SD->resetUnscheduledDeps();
19845 }
19846 for (ScheduleBundle *Bundle : getScheduleBundles(V: I)) {
19847 assert(isInSchedulingRegion(*Bundle) &&
19848 "ScheduleBundle not in scheduling region");
19849 Bundle->setScheduled(/*Scheduled=*/false);
19850 }
19851 }
19852 ReadyInsts.clear();
19853}
19854
19855void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
19856 if (!BS->ScheduleStart)
19857 return;
19858
19859 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
19860
19861 // A key point - if we got here, pre-scheduling was able to find a valid
19862 // scheduling of the sub-graph of the scheduling window which consists
19863 // of all vector bundles and their transitive users. As such, we do not
19864 // need to reschedule anything *outside of* that subgraph.
19865
19866 BS->resetSchedule();
19867
19868 // For the real scheduling we use a more sophisticated ready-list: it is
19869 // sorted by the original instruction location. This lets the final schedule
19870 // be as close as possible to the original instruction order.
19871 // WARNING: If changing this order causes a correctness issue, that means
19872 // there is some missing dependence edge in the schedule data graph.
19873 struct ScheduleDataCompare {
19874 bool operator()(const ScheduleEntity *SD1,
19875 const ScheduleEntity *SD2) const {
19876 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
19877 }
19878 };
19879 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
19880
19881 // Ensure that all dependency data is updated (for nodes in the sub-graph)
19882 // and fill the ready-list with initial instructions.
19883 int Idx = 0;
19884 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
19885 I = I->getNextNode()) {
19886 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(V: I);
19887 if (!Bundles.empty()) {
19888 for (ScheduleBundle *Bundle : Bundles) {
19889 Bundle->setSchedulingPriority(Idx++);
19890 if (!Bundle->hasValidDependencies())
19891 BS->calculateDependencies(Bundle&: *Bundle, /*InsertInReadyList=*/false, SLP: this);
19892 }
19893 continue;
19894 }
19895 if (ScheduleData *SD = BS->getScheduleData(I)) {
19896 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(V: I);
19897 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
19898 SDTEs.front()->doesNotNeedToSchedule()) &&
19899 "scheduler and vectorizer bundle mismatch");
19900 SD->setSchedulingPriority(Idx++);
19901 continue;
19902 }
19903 }
19904 BS->initialFillReadyList(ReadyList&: ReadyInsts);
19905
19906 Instruction *LastScheduledInst = BS->ScheduleEnd;
19907
19908 // Do the "real" scheduling.
19909 SmallPtrSet<Instruction *, 16> Scheduled;
19910 while (!ReadyInsts.empty()) {
19911 auto *Picked = *ReadyInsts.begin();
19912 ReadyInsts.erase(position: ReadyInsts.begin());
19913
19914 // Move the scheduled instruction(s) to their dedicated places, if not
19915 // there yet.
19916 if (auto *Bundle = dyn_cast<ScheduleBundle>(Val: Picked)) {
19917 for (const ScheduleData *BundleMember : Bundle->getBundle()) {
19918 Instruction *PickedInst = BundleMember->getInst();
19919 if (!Scheduled.insert(Ptr: PickedInst).second)
19920 continue;
19921 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
19922 PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
19923 LastScheduledInst = PickedInst;
19924 }
19925 EntryToLastInstruction.try_emplace(Key: Bundle->getTreeEntry(),
19926 Args&: LastScheduledInst);
19927 } else {
19928 auto *SD = cast<ScheduleData>(Val: Picked);
19929 Instruction *PickedInst = SD->getInst();
19930 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
19931 PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
19932 LastScheduledInst = PickedInst;
19933 }
19934 BS->schedule(Data: Picked, ReadyList&: ReadyInsts);
19935 }
19936
19937 // Check that we didn't break any of our invariants.
19938#ifdef EXPENSIVE_CHECKS
19939 BS->verify();
19940#endif
19941
19942#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
19943 // Check that all schedulable entities got scheduled
19944 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
19945 I = I->getNextNode()) {
19946 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
19947 assert(all_of(Bundles,
19948 [](const ScheduleBundle *Bundle) {
19949 return Bundle->isScheduled();
19950 }) &&
19951 "must be scheduled at this point");
19952 }
19953#endif
19954
19955 // Avoid duplicate scheduling of the block.
19956 BS->ScheduleStart = nullptr;
19957}
19958
19959unsigned BoUpSLP::getVectorElementSize(Value *V) {
19960 // If V is a store, just return the width of the stored value (or value
19961 // truncated just before storing) without traversing the expression tree.
19962 // This is the common case.
19963 if (auto *Store = dyn_cast<StoreInst>(Val: V))
19964 return DL->getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
19965
19966 if (auto *IEI = dyn_cast<InsertElementInst>(Val: V))
19967 return getVectorElementSize(V: IEI->getOperand(i_nocapture: 1));
19968
19969 auto E = InstrElementSize.find(Val: V);
19970 if (E != InstrElementSize.end())
19971 return E->second;
19972
19973 // If V is not a store, we can traverse the expression tree to find loads
19974 // that feed it. The type of the loaded value may indicate a more suitable
19975 // width than V's type. We want to base the vector element size on the width
19976 // of memory operations where possible.
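  // Illustrative example (assuming all instructions are in the same basic
  // block): for V = add i32 (zext i8 %x to i32), %c where %x is the result of
  // a "load i8", the walk below reaches the load and the function returns 8
  // rather than the 32 bits suggested by V's own type.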
19977 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
19978 SmallPtrSet<Instruction *, 16> Visited;
19979 if (auto *I = dyn_cast<Instruction>(Val: V)) {
19980 Worklist.emplace_back(Args&: I, Args: I->getParent(), Args: 0);
19981 Visited.insert(Ptr: I);
19982 }
19983
19984 // Traverse the expression tree in bottom-up order looking for loads. If we
19985 // encounter an instruction we don't yet handle, we give up.
19986 auto Width = 0u;
19987 Value *FirstNonBool = nullptr;
19988 while (!Worklist.empty()) {
19989 auto [I, Parent, Level] = Worklist.pop_back_val();
19990
19991 // We should only be looking at scalar instructions here. If the current
19992 // instruction has a vector type, skip.
19993 auto *Ty = I->getType();
19994 if (isa<VectorType>(Val: Ty))
19995 continue;
19996 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
19997 FirstNonBool = I;
19998 if (Level > RecursionMaxDepth)
19999 continue;
20000
    // If the current instruction is a load (or an extract of a scalar from a
    // vector or aggregate), update Width to reflect the width of the loaded
    // or extracted value.
20003 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: I))
20004 Width = std::max<unsigned>(a: Width, b: DL->getTypeSizeInBits(Ty));
20005
    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and it is in the same basic block as
    // the user, or the user is a PHI node, we add it to the worklist.
20010 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
20011 BinaryOperator, UnaryOperator>(Val: I)) {
20012 for (Use &U : I->operands()) {
20013 if (auto *J = dyn_cast<Instruction>(Val: U.get()))
20014 if (Visited.insert(Ptr: J).second &&
20015 (isa<PHINode>(Val: I) || J->getParent() == Parent)) {
20016 Worklist.emplace_back(Args&: J, Args: J->getParent(), Args: Level + 1);
20017 continue;
20018 }
20019 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
20020 FirstNonBool = U.get();
20021 }
20022 } else {
20023 break;
20024 }
20025 }
20026
20027 // If we didn't encounter a memory access in the expression tree, or if we
20028 // gave up for some reason, just return the width of V. Otherwise, return the
20029 // maximum width we found.
20030 if (!Width) {
20031 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
20032 V = FirstNonBool;
20033 Width = DL->getTypeSizeInBits(Ty: V->getType());
20034 }
20035
20036 for (Instruction *I : Visited)
20037 InstrElementSize[I] = Width;
20038
20039 return Width;
20040}
20041
20042bool BoUpSLP::collectValuesToDemote(
20043 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
20044 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
20045 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
20046 bool &IsProfitableToDemote, bool IsTruncRoot) const {
20047 // We can always demote constants.
20048 if (all_of(Range: E.Scalars, P: IsaPred<Constant>))
20049 return true;
20050
20051 unsigned OrigBitWidth =
20052 DL->getTypeSizeInBits(Ty: E.Scalars.front()->getType()->getScalarType());
20053 if (OrigBitWidth == BitWidth) {
20054 MaxDepthLevel = 1;
20055 return true;
20056 }
20057
20058 // Check if the node was analyzed already and must keep its original bitwidth.
20059 if (NodesToKeepBWs.contains(V: E.Idx))
20060 return false;
20061
  // If the value is not a vectorized instruction in the expression, is not
  // used by an insertelement instruction, and is not used in multiple vector
  // nodes, it cannot be demoted.
20065 bool IsSignedNode = any_of(Range: E.Scalars, P: [&](Value *R) {
20066 if (isa<PoisonValue>(Val: R))
20067 return false;
20068 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
20069 });
20070 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
20071 if (isa<PoisonValue>(Val: V))
20072 return true;
20073 if (getTreeEntries(V).size() > 1)
20074 return false;
    // For the last shuffle of sext/zext with many uses we need to check the
    // extra bit for unsigned values, otherwise we may end up with incorrect
    // casts for the reused scalars.
20078 bool IsSignedVal = !isKnownNonNegative(V, SQ: SimplifyQuery(*DL));
20079 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
20080 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
20081 if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL)))
20082 return true;
20083 }
20084 unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT);
20085 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
20086 if (IsSignedNode)
20087 ++BitWidth1;
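    // Illustrative numbers: with OrigBitWidth == 32 and NumSignBits == 25,
    // BitWidth1 starts at 32 - 25 == 7 and is bumped to 8 when the node is
    // treated as signed, so that the sign bit survives the truncation.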
20088 if (auto *I = dyn_cast<Instruction>(Val: V)) {
20089 APInt Mask = DB->getDemandedBits(I);
20090 unsigned BitWidth2 =
20091 std::max<unsigned>(a: 1, b: Mask.getBitWidth() - Mask.countl_zero());
20092 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
20093 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth2 - 1);
20094 if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL)))
20095 break;
20096 BitWidth2 *= 2;
20097 }
20098 BitWidth1 = std::min(a: BitWidth1, b: BitWidth2);
20099 }
20100 BitWidth = std::max(a: BitWidth, b: BitWidth1);
20101 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
20102 };
20103 auto FinalAnalysis = [&, TTI = TTI]() {
20104 if (!IsProfitableToDemote)
20105 return false;
20106 bool Res = all_of(
20107 Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1, args: std::ref(t&: BitWidth)));
20108 // Demote gathers.
20109 if (Res && E.isGather()) {
20110 if (E.hasState()) {
20111 if (const TreeEntry *SameTE =
20112 getSameValuesTreeEntry(V: E.getMainOp(), VL: E.Scalars);
20113 SameTE)
20114 if (collectValuesToDemote(E: *SameTE, IsProfitableToDemoteRoot, BitWidth,
20115 ToDemote, Visited, NodesToKeepBWs,
20116 MaxDepthLevel, IsProfitableToDemote,
20117 IsTruncRoot)) {
20118 ToDemote.push_back(Elt: E.Idx);
20119 return true;
20120 }
20121 }
      // Check the bases of possible extractelement instructions and the final
      // vector length.
20124 SmallPtrSet<Value *, 4> UniqueBases;
20125 for (Value *V : E.Scalars) {
20126 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
20127 if (!EE)
20128 continue;
20129 UniqueBases.insert(Ptr: EE->getVectorOperand());
20130 }
20131 const unsigned VF = E.Scalars.size();
20132 Type *OrigScalarTy = E.Scalars.front()->getType();
20133 if (UniqueBases.size() <= 2 ||
20134 ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF)) >=
20135 ::getNumberOfParts(
20136 TTI: *TTI,
20137 VecTy: getWidenedType(
20138 ScalarTy: IntegerType::get(C&: OrigScalarTy->getContext(), NumBits: BitWidth),
20139 VF))) {
20140 ToDemote.push_back(Elt: E.Idx);
20141 return true;
20142 }
20143 }
20144 return Res;
20145 };
20146 if (E.isGather() || !Visited.insert(V: &E).second ||
20147 any_of(Range: E.Scalars, P: [&](Value *V) {
20148 return !isa<PoisonValue>(Val: V) && all_of(Range: V->users(), P: [&](User *U) {
20149 return isa<InsertElementInst>(Val: U) && !isVectorized(V: U);
20150 });
20151 }))
20152 return FinalAnalysis();
20153
20154 if (any_of(Range: E.Scalars, P: [&](Value *V) {
20155 return !isa<Constant>(Val: V) && !all_of(Range: V->users(), P: [=](User *U) {
20156 return isVectorized(V: U) ||
20157 (E.Idx == 0 && UserIgnoreList &&
20158 UserIgnoreList->contains(V: U)) ||
20159 (!isa<CmpInst>(Val: U) && U->getType()->isSized() &&
20160 !U->getType()->isScalableTy() &&
20161 DL->getTypeSizeInBits(Ty: U->getType()) <= BitWidth);
20162 }) && !IsPotentiallyTruncated(V, BitWidth);
20163 }))
20164 return false;
20165
20166 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
20167 bool &NeedToExit) {
20168 NeedToExit = false;
20169 unsigned InitLevel = MaxDepthLevel;
20170 for (const TreeEntry *Op : Operands) {
20171 unsigned Level = InitLevel;
20172 if (!collectValuesToDemote(E: *Op, IsProfitableToDemoteRoot, BitWidth,
20173 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel&: Level,
20174 IsProfitableToDemote, IsTruncRoot)) {
20175 if (!IsProfitableToDemote)
20176 return false;
20177 NeedToExit = true;
20178 if (!FinalAnalysis())
20179 return false;
20180 continue;
20181 }
20182 MaxDepthLevel = std::max(a: MaxDepthLevel, b: Level);
20183 }
20184 return true;
20185 };
20186 auto AttemptCheckBitwidth =
20187 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidths < OrigBitWidth.
20189 NeedToExit = false;
20190 unsigned BestFailBitwidth = 0;
20191 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
20192 if (Checker(BitWidth, OrigBitWidth))
20193 return true;
20194 if (BestFailBitwidth == 0 && FinalAnalysis())
20195 BestFailBitwidth = BitWidth;
20196 }
20197 if (BitWidth >= OrigBitWidth) {
20198 if (BestFailBitwidth == 0) {
20199 BitWidth = OrigBitWidth;
20200 return false;
20201 }
20202 MaxDepthLevel = 1;
20203 BitWidth = BestFailBitwidth;
20204 NeedToExit = true;
20205 return true;
20206 }
20207 return false;
20208 };
20209 auto TryProcessInstruction =
20210 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
20211 function_ref<bool(unsigned, unsigned)> Checker = {}) {
20212 if (Operands.empty()) {
20213 if (!IsTruncRoot)
20214 MaxDepthLevel = 1;
20215 for (Value *V : E.Scalars)
20216 (void)IsPotentiallyTruncated(V, BitWidth);
20217 } else {
          // Several vectorized uses? Check if we can truncate it; otherwise
          // exit.
20220 if (any_of(Range: E.Scalars, P: [&](Value *V) {
20221 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
20222 }))
20223 return false;
20224 bool NeedToExit = false;
20225 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
20226 return false;
20227 if (NeedToExit)
20228 return true;
20229 if (!ProcessOperands(Operands, NeedToExit))
20230 return false;
20231 if (NeedToExit)
20232 return true;
20233 }
20234
20235 ++MaxDepthLevel;
20236 // Record the entry that we can demote.
20237 ToDemote.push_back(Elt: E.Idx);
20238 return IsProfitableToDemote;
20239 };
20240
20241 if (E.State == TreeEntry::SplitVectorize)
20242 return TryProcessInstruction(
20243 BitWidth,
20244 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
20245 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
20246
20247 switch (E.getOpcode()) {
20248
20249 // We can always demote truncations and extensions. Since truncations can
20250 // seed additional demotion, we save the truncated value.
20251 case Instruction::Trunc:
20252 if (IsProfitableToDemoteRoot)
20253 IsProfitableToDemote = true;
20254 return TryProcessInstruction(BitWidth);
20255 case Instruction::ZExt:
20256 case Instruction::SExt:
20257 IsProfitableToDemote = true;
20258 return TryProcessInstruction(BitWidth);
20259
20260 // We can demote certain binary operations if we can demote both of their
20261 // operands.
20262 case Instruction::Add:
20263 case Instruction::Sub:
20264 case Instruction::Mul:
20265 case Instruction::And:
20266 case Instruction::Or:
20267 case Instruction::Xor: {
20268 return TryProcessInstruction(
20269 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)});
20270 }
20271 case Instruction::Freeze:
20272 return TryProcessInstruction(BitWidth, getOperandEntry(E: &E, Idx: 0));
20273 case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // in-range amount, we can always perform a SHL in a smaller type.
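    // Illustrative example: if only the low 16 bits of (shl i32 %x, 5) are
    // demanded, the shift can be performed as a 16-bit shl as long as the
    // shift amount is known to be smaller than 16, which the checker below
    // verifies via the known bits of the shift amount.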
20276 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
20277 return all_of(Range: E.Scalars, P: [&](Value *V) {
20278 if (isa<PoisonValue>(Val: V))
20279 return true;
20280 auto *I = cast<Instruction>(Val: V);
20281 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
20282 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth);
20283 });
20284 };
20285 return TryProcessInstruction(
20286 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, ShlChecker);
20287 }
20288 case Instruction::LShr: {
20289 // If this is a truncate of a logical shr, we can truncate it to a smaller
20290 // lshr iff we know that the bits we would otherwise be shifting in are
20291 // already zeros.
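    // Illustrative example: (lshr i32 %x, 8) can be narrowed to a 16-bit lshr
    // if the shift amount is known to be smaller than 16 and bits 16..31 of
    // %x are known to be zero, which is exactly what the checker below tests.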
20292 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
20293 return all_of(Range: E.Scalars, P: [&](Value *V) {
20294 if (isa<PoisonValue>(Val: V))
20295 return true;
20296 auto *I = cast<Instruction>(Val: V);
20297 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
20298 APInt ShiftedBits = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
20299 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
20300 MaskedValueIsZero(V: I->getOperand(i: 0), Mask: ShiftedBits,
20301 SQ: SimplifyQuery(*DL));
20302 });
20303 };
20304 return TryProcessInstruction(
20305 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
20306 LShrChecker);
20307 }
20308 case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits from the sign bit of the
    // original type down to the sign bit of the truncated type are identical
    // (i.e., they are all sign bits).
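    // Illustrative example: an i32 ashr can be narrowed to 16 bits if the
    // shifted value has at least 17 sign bits (so the 16 dropped bits are all
    // copies of the sign bit) and the shift amount is known to be smaller
    // than 16.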
20312 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
20313 return all_of(Range: E.Scalars, P: [&](Value *V) {
20314 if (isa<PoisonValue>(Val: V))
20315 return true;
20316 auto *I = cast<Instruction>(Val: V);
20317 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
20318 unsigned ShiftedBits = OrigBitWidth - BitWidth;
20319 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
20320 ShiftedBits <
20321 ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
20322 });
20323 };
20324 return TryProcessInstruction(
20325 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
20326 AShrChecker);
20327 }
20328 case Instruction::UDiv:
20329 case Instruction::URem: {
20330 // UDiv and URem can be truncated if all the truncated bits are zero.
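    // Illustrative example: a 32-bit udiv/urem whose operands both have their
    // upper 16 bits known to be zero computes the same low 16 bits when
    // performed as a 16-bit udiv/urem.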
20331 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
20332 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
20333 return all_of(Range: E.Scalars, P: [&](Value *V) {
20334 auto *I = cast<Instruction>(Val: V);
20335 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
20336 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL)) &&
20337 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL));
20338 });
20339 };
20340 return TryProcessInstruction(
20341 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, Checker);
20342 }
20343
20344 // We can demote selects if we can demote their true and false values.
20345 case Instruction::Select: {
20346 return TryProcessInstruction(
20347 BitWidth, {getOperandEntry(E: &E, Idx: 1), getOperandEntry(E: &E, Idx: 2)});
20348 }
20349
20350 // We can demote phis if we can demote all their incoming operands.
20351 case Instruction::PHI: {
20352 const unsigned NumOps = E.getNumOperands();
20353 SmallVector<const TreeEntry *> Ops(NumOps);
20354 transform(Range: seq<unsigned>(Begin: 0, End: NumOps), d_first: Ops.begin(),
20355 F: [&](unsigned Idx) { return getOperandEntry(E: &E, Idx); });
20356
20357 return TryProcessInstruction(BitWidth, Ops);
20358 }
20359
20360 case Instruction::Call: {
20361 auto *IC = dyn_cast<IntrinsicInst>(Val: E.getMainOp());
20362 if (!IC)
20363 break;
20364 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: IC, TLI);
20365 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
20366 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
20367 break;
20368 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(E: &E, Idx: 0));
20369 function_ref<bool(unsigned, unsigned)> CallChecker;
20370 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
20371 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
20372 return all_of(Range: E.Scalars, P: [&](Value *V) {
20373 auto *I = cast<Instruction>(Val: V);
20374 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
20375 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
20376 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
20377 SQ: SimplifyQuery(*DL)) &&
20378 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL));
20379 }
20380 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
20381 "Expected min/max intrinsics only.");
20382 unsigned SignBits = OrigBitWidth - BitWidth;
20383 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
20384 unsigned Op0SignBits =
20385 ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
20386 unsigned Op1SignBits =
20387 ComputeNumSignBits(Op: I->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT);
20388 return SignBits <= Op0SignBits &&
20389 ((SignBits != Op0SignBits &&
20390 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
20391 MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
20392 SQ: SimplifyQuery(*DL))) &&
20393 SignBits <= Op1SignBits &&
20394 ((SignBits != Op1SignBits &&
20395 !isKnownNonNegative(V: I->getOperand(i: 1), SQ: SimplifyQuery(*DL))) ||
20396 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL)));
20397 });
20398 };
20399 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
20400 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
20401 return all_of(Range: E.Scalars, P: [&](Value *V) {
20402 auto *I = cast<Instruction>(Val: V);
20403 unsigned SignBits = OrigBitWidth - BitWidth;
20404 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
20405 unsigned Op0SignBits =
20406 ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
20407 return SignBits <= Op0SignBits &&
20408 ((SignBits != Op0SignBits &&
20409 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
20410 MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL)));
20411 });
20412 };
20413 if (ID != Intrinsic::abs) {
20414 Operands.push_back(Elt: getOperandEntry(E: &E, Idx: 1));
20415 CallChecker = CompChecker;
20416 } else {
20417 CallChecker = AbsChecker;
20418 }
20419 InstructionCost BestCost =
20420 std::numeric_limits<InstructionCost::CostType>::max();
20421 unsigned BestBitWidth = BitWidth;
20422 unsigned VF = E.Scalars.size();
20423 // Choose the best bitwidth based on cost estimations.
20424 auto Checker = [&](unsigned BitWidth, unsigned) {
20425 unsigned MinBW = PowerOf2Ceil(A: BitWidth);
20426 SmallVector<Type *> ArgTys =
20427 buildIntrinsicArgTypes(CI: IC, ID, VF, MinBW, TTI);
20428 auto VecCallCosts = getVectorCallCosts(
20429 CI: IC, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: IC->getContext(), NumBits: MinBW), VF),
20430 TTI, TLI, ArgTys);
20431 InstructionCost Cost = std::min(a: VecCallCosts.first, b: VecCallCosts.second);
20432 if (Cost < BestCost) {
20433 BestCost = Cost;
20434 BestBitWidth = BitWidth;
20435 }
20436 return false;
20437 };
20438 [[maybe_unused]] bool NeedToExit;
20439 (void)AttemptCheckBitwidth(Checker, NeedToExit);
20440 BitWidth = BestBitWidth;
20441 return TryProcessInstruction(BitWidth, Operands, CallChecker);
20442 }
20443
20444 // Otherwise, conservatively give up.
20445 default:
20446 break;
20447 }
20448 MaxDepthLevel = 1;
20449 return FinalAnalysis();
20450}
20451
20452static RecurKind getRdxKind(Value *V);
20453
20454void BoUpSLP::computeMinimumValueSizes() {
20455 // We only attempt to truncate integer expressions.
20456 bool IsStoreOrInsertElt =
20457 VectorizableTree.front()->hasState() &&
20458 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
20459 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
20460 if ((IsStoreOrInsertElt || UserIgnoreList) &&
20461 ExtraBitWidthNodes.size() <= 1 &&
20462 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
20463 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
20464 return;
20465
20466 unsigned NodeIdx = 0;
20467 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
20468 NodeIdx = 1;
20469
20470 // Ensure the roots of the vectorizable tree don't form a cycle.
20471 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
20472 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
20473 "Unexpected tree is graph.");
20474
  // If the first value node for a store/insertelement is a sext/zext/trunc,
  // skip it and resize to the final type.
20477 bool IsTruncRoot = false;
20478 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
20479 SmallVector<unsigned> RootDemotes;
20480 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
20481 if (NodeIdx != 0 &&
20482 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
20483 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
20484 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
20485 IsTruncRoot = true;
20486 RootDemotes.push_back(Elt: NodeIdx);
20487 IsProfitableToDemoteRoot = true;
20488 ++NodeIdx;
20489 }
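  // E.g. (sketch): for a graph seeded by
  //   store i16 %t, ptr %p   with   %t = trunc i32 %v to i16
  // the trunc node is queued in RootDemotes and the analysis continues from
  // its operand, so the whole expression may be narrowed to i16.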
20490
  // The reduction was analyzed already and found not profitable - exit.
20492 if (AnalyzedMinBWVals.contains(V: VectorizableTree[NodeIdx]->Scalars.front()))
20493 return;
20494
20495 SmallVector<unsigned> ToDemote;
20496 auto ComputeMaxBitWidth =
20497 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
20498 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
20499 ToDemote.clear();
    // If the root is a trunc and the next node is a gather/buildvector, keep
    // the trunc in scalars, which is free in most cases.
20502 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
20503 !NodesToKeepBWs.contains(V: E.Idx) &&
20504 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
20505 all_of(Range: E.Scalars, P: [&](Value *V) {
20506 return V->hasOneUse() || isa<Constant>(Val: V) ||
20507 (!V->hasNUsesOrMore(N: UsesLimit) &&
20508 none_of(Range: V->users(), P: [&](User *U) {
20509 ArrayRef<TreeEntry *> TEs = getTreeEntries(V: U);
20510 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
20511 if (TEs.empty() || is_contained(Range&: TEs, Element: UserTE))
20512 return false;
20513 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
20514 SelectInst>(Val: U) ||
20515 isa<SIToFPInst, UIToFPInst>(Val: U) ||
20516 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
20517 SelectInst>(Val: UserTE->getMainOp()) ||
20518 isa<SIToFPInst, UIToFPInst>(Val: UserTE->getMainOp()))
20519 return true;
20520 unsigned UserTESz = DL->getTypeSizeInBits(
20521 Ty: UserTE->Scalars.front()->getType());
20522 if (all_of(Range&: TEs, P: [&](const TreeEntry *TE) {
20523 auto It = MinBWs.find(Val: TE);
20524 return It != MinBWs.end() &&
20525 It->second.first > UserTESz;
20526 }))
20527 return true;
20528 return DL->getTypeSizeInBits(Ty: U->getType()) > UserTESz;
20529 }));
20530 })) {
20531 ToDemote.push_back(Elt: E.Idx);
20532 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
20533 auto It = MinBWs.find(Val: UserTE);
20534 if (It != MinBWs.end())
20535 return It->second.first;
20536 unsigned MaxBitWidth =
20537 DL->getTypeSizeInBits(Ty: UserTE->Scalars.front()->getType());
20538 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
20539 if (MaxBitWidth < 8 && MaxBitWidth > 1)
20540 MaxBitWidth = 8;
20541 return MaxBitWidth;
20542 }
20543
20544 if (!E.hasState())
20545 return 0u;
20546
20547 unsigned VF = E.getVectorFactor();
20548 Type *ScalarTy = E.Scalars.front()->getType();
20549 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
20550 auto *TreeRootIT = dyn_cast<IntegerType>(Val: ScalarTy->getScalarType());
20551 if (!TreeRootIT)
20552 return 0u;
20553
20554 if (any_of(Range: E.Scalars,
20555 P: [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
20556 return 0u;
20557
20558 unsigned NumParts = ::getNumberOfParts(
20559 TTI: *TTI, VecTy: getWidenedType(ScalarTy: TreeRootIT, VF: VF * ScalarTyNumElements));
20560
20561 // The maximum bit width required to represent all the values that can be
20562 // demoted without loss of precision. It would be safe to truncate the roots
20563 // of the expression to this width.
20564 unsigned MaxBitWidth = 1u;
20565
20566 // True if the roots can be zero-extended back to their original type,
20567 // rather than sign-extended. We know that if the leading bits are not
20568 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
20569 // True.
20570 // Determine if the sign bit of all the roots is known to be zero. If not,
20571 // IsKnownPositive is set to False.
20572 bool IsKnownPositive = !IsSignedCmp && all_of(Range: E.Scalars, P: [&](Value *R) {
20573 if (isa<PoisonValue>(Val: R))
20574 return true;
20575 KnownBits Known = computeKnownBits(V: R, DL: *DL);
20576 return Known.isNonNegative();
20577 });
20578
20579 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
20580 E.UserTreeIndex.UserTE->hasState() &&
20581 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
20582 MaxBitWidth =
20583 std::min(a: DL->getTypeSizeInBits(
20584 Ty: E.UserTreeIndex.UserTE->Scalars.front()->getType()),
20585 b: DL->getTypeSizeInBits(Ty: ScalarTy));
20586
20587 // We first check if all the bits of the roots are demanded. If they're not,
20588 // we can truncate the roots to this narrower type.
20589 for (Value *Root : E.Scalars) {
20590 if (isa<PoisonValue>(Val: Root))
20591 continue;
20592 unsigned NumSignBits = ComputeNumSignBits(Op: Root, DL: *DL, AC, CxtI: nullptr, DT);
20593 TypeSize NumTypeBits =
20594 DL->getTypeSizeInBits(Ty: Root->getType()->getScalarType());
20595 unsigned BitWidth1 = NumTypeBits - NumSignBits;
20596 // If we can't prove that the sign bit is zero, we must add one to the
20597 // maximum bit width to account for the unknown sign bit. This preserves
20598 // the existing sign bit so we can safely sign-extend the root back to the
20599 // original type. Otherwise, if we know the sign bit is zero, we will
20600 // zero-extend the root instead.
20601 //
20602 // FIXME: This is somewhat suboptimal, as there will be cases where adding
20603 // one to the maximum bit width will yield a larger-than-necessary
20604 // type. In general, we need to add an extra bit only if we can't
20605 // prove that the upper bit of the original type is equal to the
20606 // upper bit of the proposed smaller type. If these two bits are
20607 // the same (either zero or one) we know that sign-extending from
20608 // the smaller type will result in the same value. Here, since we
20609 // can't yet prove this, we are just making the proposed smaller
20610 // type larger to ensure correctness.
20611 if (!IsKnownPositive)
20612 ++BitWidth1;
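      // For instance (illustrative numbers): an i32 root with 24 known sign
      // bits gives BitWidth1 = 32 - 24 = 8; if we cannot prove it is
      // non-negative, we use 9 bits so that sign-extending from the narrow
      // type reproduces the original value.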
20613
20614 APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: Root));
20615 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
20616 MaxBitWidth =
20617 std::max<unsigned>(a: std::min(a: BitWidth1, b: BitWidth2), b: MaxBitWidth);
20618 }
20619
20620 if (MaxBitWidth < 8 && MaxBitWidth > 1)
20621 MaxBitWidth = 8;
20622
    // If the original type is large but the reduced type does not decrease the
    // number of vector register parts needed, ignore it.
20625 if (NumParts > 1 &&
20626 NumParts ==
20627 ::getNumberOfParts(
20628 TTI: *TTI, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: F->getContext(),
20629 NumBits: bit_ceil(Value: MaxBitWidth)),
20630 VF)))
20631 return 0u;
20632
20633 unsigned Opcode = E.getOpcode();
20634 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
20635 Opcode == Instruction::SExt ||
20636 Opcode == Instruction::ZExt || NumParts > 1;
20637 // Conservatively determine if we can actually truncate the roots of the
20638 // expression. Collect the values that can be demoted in ToDemote and
20639 // additional roots that require investigating in Roots.
20640 DenseSet<const TreeEntry *> Visited;
20641 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
20642 bool NeedToDemote = IsProfitableToDemote;
20643
20644 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, BitWidth&: MaxBitWidth,
20645 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
20646 IsProfitableToDemote&: NeedToDemote, IsTruncRoot) ||
20647 (MaxDepthLevel <= Limit &&
20648 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
20649 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
20650 DL->getTypeSizeInBits(Ty: TreeRootIT) /
20651 DL->getTypeSizeInBits(
20652 Ty: E.getMainOp()->getOperand(i: 0)->getType()) >
20653 2)))))
20654 return 0u;
20655 // Round MaxBitWidth up to the next power-of-two.
20656 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
20657
20658 return MaxBitWidth;
20659 };
20660
20661 // If we can truncate the root, we must collect additional values that might
20662 // be demoted as a result. That is, those seeded by truncations we will
20663 // modify.
20664 // Add reduction ops sizes, if any.
20665 if (UserIgnoreList &&
20666 isa<IntegerType>(Val: VectorizableTree.front()->Scalars.front()->getType())) {
    // Convert vector_reduce_add(ZExt(<n x i1>)) to
    // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
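    // E.g. (sketch): vector_reduce_add(zext <8 x i1> %m to <8 x i32>) counts
    // the set bits of %m and can therefore be computed as
    // zext(ctpop(bitcast <8 x i1> %m to i8)).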
20669 if (all_of(Range: *UserIgnoreList,
20670 P: [](Value *V) {
20671 return isa<PoisonValue>(Val: V) ||
20672 cast<Instruction>(Val: V)->getOpcode() == Instruction::Add;
20673 }) &&
20674 VectorizableTree.front()->State == TreeEntry::Vectorize &&
20675 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
20676 cast<CastInst>(Val: VectorizableTree.front()->getMainOp())->getSrcTy() ==
20677 Builder.getInt1Ty()) {
20678 ReductionBitWidth = 1;
20679 } else {
20680 for (Value *V : *UserIgnoreList) {
20681 if (isa<PoisonValue>(Val: V))
20682 continue;
20683 unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT);
20684 TypeSize NumTypeBits = DL->getTypeSizeInBits(Ty: V->getType());
20685 unsigned BitWidth1 = NumTypeBits - NumSignBits;
20686 if (!isKnownNonNegative(V, SQ: SimplifyQuery(*DL)))
20687 ++BitWidth1;
20688 unsigned BitWidth2 = BitWidth1;
20689 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: ::getRdxKind(V))) {
20690 APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: V));
20691 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
20692 }
20693 ReductionBitWidth =
20694 std::max(a: std::min(a: BitWidth1, b: BitWidth2), b: ReductionBitWidth);
20695 }
20696 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
20697 ReductionBitWidth = 8;
20698
20699 ReductionBitWidth = bit_ceil(Value: ReductionBitWidth);
20700 }
20701 }
20702 bool IsTopRoot = NodeIdx == 0;
20703 while (NodeIdx < VectorizableTree.size() &&
20704 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
20705 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
20706 RootDemotes.push_back(Elt: NodeIdx);
20707 ++NodeIdx;
20708 IsTruncRoot = true;
20709 }
20710 bool IsSignedCmp = false;
20711 if (UserIgnoreList && all_of(Range: *UserIgnoreList, P: [](Value *V) {
20712 return match(V, P: m_SMin(L: m_Value(), R: m_Value())) ||
20713 match(V, P: m_SMax(L: m_Value(), R: m_Value()));
20714 }))
20715 IsSignedCmp = true;
20716 while (NodeIdx < VectorizableTree.size()) {
20717 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
20718 unsigned Limit = 2;
20719 if (IsTopRoot &&
20720 ReductionBitWidth ==
20721 DL->getTypeSizeInBits(
20722 Ty: VectorizableTree.front()->Scalars.front()->getType()))
20723 Limit = 3;
20724 unsigned MaxBitWidth = ComputeMaxBitWidth(
20725 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
20726 IsTruncRoot, IsSignedCmp);
20727 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
20728 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
20729 ReductionBitWidth = bit_ceil(Value: MaxBitWidth);
20730 else if (MaxBitWidth == 0)
20731 ReductionBitWidth = 0;
20732 }
20733
20734 for (unsigned Idx : RootDemotes) {
20735 if (all_of(Range&: VectorizableTree[Idx]->Scalars, P: [&](Value *V) {
20736 uint32_t OrigBitWidth =
20737 DL->getTypeSizeInBits(Ty: V->getType()->getScalarType());
20738 if (OrigBitWidth > MaxBitWidth) {
20739 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: MaxBitWidth);
20740 return MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL));
20741 }
20742 return false;
20743 }))
20744 ToDemote.push_back(Elt: Idx);
20745 }
20746 RootDemotes.clear();
20747 IsTopRoot = false;
20748 IsProfitableToDemoteRoot = true;
20749
20750 if (ExtraBitWidthNodes.empty()) {
20751 NodeIdx = VectorizableTree.size();
20752 } else {
20753 unsigned NewIdx = 0;
20754 do {
20755 NewIdx = *ExtraBitWidthNodes.begin();
20756 ExtraBitWidthNodes.erase(I: ExtraBitWidthNodes.begin());
20757 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
20758 NodeIdx = NewIdx;
20759 IsTruncRoot =
20760 NodeIdx < VectorizableTree.size() &&
20761 VectorizableTree[NodeIdx]->UserTreeIndex &&
20762 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
20763 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
20764 Instruction::Trunc &&
20765 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
20766 IsSignedCmp =
20767 NodeIdx < VectorizableTree.size() &&
20768 VectorizableTree[NodeIdx]->UserTreeIndex &&
20769 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
20770 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
20771 Instruction::ICmp &&
20772 any_of(
20773 Range&: VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
20774 P: [&](Value *V) {
20775 auto *IC = dyn_cast<ICmpInst>(Val: V);
20776 return IC && (IC->isSigned() ||
20777 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 0),
20778 SQ: SimplifyQuery(*DL)) ||
20779 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 1),
20780 SQ: SimplifyQuery(*DL)));
20781 });
20782 }
20783
20784 // If the maximum bit width we compute is less than the width of the roots'
20785 // type, we can proceed with the narrowing. Otherwise, do nothing.
20786 if (MaxBitWidth == 0 ||
20787 MaxBitWidth >=
20788 cast<IntegerType>(Val: TreeRoot.front()->getType()->getScalarType())
20789 ->getBitWidth()) {
20790 if (UserIgnoreList)
20791 AnalyzedMinBWVals.insert_range(R&: TreeRoot);
20792 NodesToKeepBWs.insert_range(R&: ToDemote);
20793 continue;
20794 }
20795
    // Finally, map the values we can demote to the maximum bit width we
    // computed.
20798 for (unsigned Idx : ToDemote) {
20799 TreeEntry *TE = VectorizableTree[Idx].get();
20800 if (MinBWs.contains(Val: TE))
20801 continue;
20802 bool IsSigned = any_of(Range&: TE->Scalars, P: [&](Value *R) {
20803 if (isa<PoisonValue>(Val: R))
20804 return false;
20805 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
20806 });
20807 MinBWs.try_emplace(Key: TE, Args&: MaxBitWidth, Args&: IsSigned);
20808 }
20809 }
20810}
20811
PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
20813 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
20814 auto *TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
20815 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(IR&: F);
20816 auto *AA = &AM.getResult<AAManager>(IR&: F);
20817 auto *LI = &AM.getResult<LoopAnalysis>(IR&: F);
20818 auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
20819 auto *AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
20820 auto *DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
20821 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
20822
20823 bool Changed = runImpl(F, SE_: SE, TTI_: TTI, TLI_: TLI, AA_: AA, LI_: LI, DT_: DT, AC_: AC, DB_: DB, ORE_: ORE);
20824 if (!Changed)
20825 return PreservedAnalyses::all();
20826
20827 PreservedAnalyses PA;
20828 PA.preserveSet<CFGAnalyses>();
20829 return PA;
20830}
20831
20832bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
20833 TargetTransformInfo *TTI_,
20834 TargetLibraryInfo *TLI_, AAResults *AA_,
20835 LoopInfo *LI_, DominatorTree *DT_,
20836 AssumptionCache *AC_, DemandedBits *DB_,
20837 OptimizationRemarkEmitter *ORE_) {
20838 if (!RunSLPVectorization)
20839 return false;
20840 SE = SE_;
20841 TTI = TTI_;
20842 TLI = TLI_;
20843 AA = AA_;
20844 LI = LI_;
20845 DT = DT_;
20846 AC = AC_;
20847 DB = DB_;
20848 DL = &F.getDataLayout();
20849
20850 Stores.clear();
20851 GEPs.clear();
20852 bool Changed = false;
20853
20854 // If the target claims to have no vector registers don't attempt
20855 // vectorization.
20856 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true))) {
20857 LLVM_DEBUG(
20858 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
20859 return false;
20860 }
20861
20862 // Don't vectorize when the attribute NoImplicitFloat is used.
20863 if (F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
20864 return false;
20865
20866 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
20867
20868 // Use the bottom up slp vectorizer to construct chains that start with
20869 // store instructions.
20870 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
20871
20872 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
20873 // delete instructions.
20874
20875 // Update DFS numbers now so that we can use them for ordering.
20876 DT->updateDFSNumbers();
20877
20878 // Scan the blocks in the function in post order.
20879 for (auto *BB : post_order(G: &F.getEntryBlock())) {
20880 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator()))
20881 continue;
20882
20883 // Start new block - clear the list of reduction roots.
20884 R.clearReductionData();
20885 collectSeedInstructions(BB);
20886
20887 // Vectorize trees that end at stores.
20888 if (!Stores.empty()) {
20889 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
20890 << " underlying objects.\n");
20891 Changed |= vectorizeStoreChains(R);
20892 }
20893
20894 // Vectorize trees that end at reductions.
20895 Changed |= vectorizeChainsInBlock(BB, R);
20896
20897 // Vectorize the index computations of getelementptr instructions. This
20898 // is primarily intended to catch gather-like idioms ending at
20899 // non-consecutive loads.
20900 if (!GEPs.empty()) {
20901 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
20902 << " underlying objects.\n");
20903 Changed |= vectorizeGEPIndices(BB, R);
20904 }
20905 }
20906
20907 if (Changed) {
20908 R.optimizeGatherSequence();
20909 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
20910 }
20911 return Changed;
20912}
20913
20914std::optional<bool>
20915SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
20916 unsigned Idx, unsigned MinVF,
20917 unsigned &Size) {
20918 Size = 0;
20919 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
20920 << "\n");
20921 const unsigned Sz = R.getVectorElementSize(V: Chain[0]);
20922 unsigned VF = Chain.size();
20923
20924 if (!has_single_bit(Value: Sz) ||
20925 !hasFullVectorsOrPowerOf2(
20926 TTI: *TTI, Ty: cast<StoreInst>(Val: Chain.front())->getValueOperand()->getType(),
20927 Sz: VF) ||
20928 VF < 2 || VF < MinVF) {
20929 // Check if vectorizing with a non-power-of-2 VF should be considered. At
20930 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
20931 // all vector lanes are used.
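    // E.g. a chain of 7 stores may still be considered, because 7 + 1 = 8 is a
    // power of 2.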
20932 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
20933 return false;
20934 }
20935
20936 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
20937 << "\n");
20938
20939 SetVector<Value *> ValOps;
20940 for (Value *V : Chain)
20941 ValOps.insert(X: cast<StoreInst>(Val: V)->getValueOperand());
  // If the operands do not share the same/alternate opcode, or the number of
  // unique values is not a power of 2, exit.
20943 InstructionsState S = getSameOpcode(VL: ValOps.getArrayRef(), TLI: *TLI);
20944 if (all_of(Range&: ValOps, P: IsaPred<Instruction>) && ValOps.size() > 1) {
20945 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
20946 bool IsAllowedSize =
20947 hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ValOps.front()->getType(),
20948 Sz: ValOps.size()) ||
20949 (VectorizeNonPowerOf2 && has_single_bit(Value: ValOps.size() + 1));
20950 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
20951 (!S.getMainOp()->isSafeToRemove() ||
20952 any_of(Range: ValOps.getArrayRef(),
20953 P: [&](Value *V) {
20954 return !isa<ExtractElementInst>(Val: V) &&
20955 (V->getNumUses() > Chain.size() ||
20956 any_of(Range: V->users(), P: [&](User *U) {
20957 return !Stores.contains(V: U);
20958 }));
20959 }))) ||
20960 (ValOps.size() > Chain.size() / 2 && !S)) {
20961 Size = (!IsAllowedSize && S) ? 1 : 2;
20962 return false;
20963 }
20964 }
20965 if (R.isLoadCombineCandidate(Stores: Chain))
20966 return true;
20967 R.buildTree(Roots: Chain);
  // Check if the tree is tiny and the store itself or its value is not
  // vectorized.
20969 if (R.isTreeTinyAndNotFullyVectorizable()) {
20970 if (R.isGathered(V: Chain.front()) ||
20971 R.isNotScheduled(V: cast<StoreInst>(Val: Chain.front())->getValueOperand()))
20972 return std::nullopt;
20973 Size = R.getCanonicalGraphSize();
20974 return false;
20975 }
20976 if (R.isProfitableToReorder()) {
20977 R.reorderTopToBottom();
20978 R.reorderBottomToTop();
20979 }
20980 R.transformNodes();
20981 R.buildExternalUses();
20982
20983 R.computeMinimumValueSizes();
20984
20985 Size = R.getCanonicalGraphSize();
20986 if (S && S.getOpcode() == Instruction::Load)
20987 Size = 2; // cut off masked gather small trees
20988 InstructionCost Cost = R.getTreeCost();
20989
20990 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
20991 if (Cost < -SLPCostThreshold) {
20992 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
20993
20994 using namespace ore;
20995
20996 R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "StoresVectorized",
20997 cast<StoreInst>(Val: Chain[0]))
20998 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
20999 << " and with tree size "
21000 << NV("TreeSize", R.getTreeSize()));
21001
21002 R.vectorizeTree();
21003 return true;
21004 }
21005
21006 return false;
21007}
21008
21009/// Checks if the quadratic mean deviation is less than 90% of the mean size.
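/// For example (illustrative numbers): sizes {4, 4, 4, 4} have zero deviation
/// and pass the check, whereas {2, 2, 2, 16} are spread too widely and fail.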
21010static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
21011 bool First) {
21012 unsigned Num = 0;
21013 uint64_t Sum = std::accumulate(
21014 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
21015 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
21016 unsigned Size = First ? Val.first : Val.second;
21017 if (Size == 1)
21018 return V;
21019 ++Num;
21020 return V + Size;
21021 });
21022 if (Num == 0)
21023 return true;
21024 uint64_t Mean = Sum / Num;
21025 if (Mean == 0)
21026 return true;
21027 uint64_t Dev = std::accumulate(
21028 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
21029 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
21030 unsigned P = First ? Val.first : Val.second;
21031 if (P == 1)
21032 return V;
21033 return V + (P - Mean) * (P - Mean);
21034 }) /
21035 Num;
21036 return Dev * 96 / (Mean * Mean) == 0;
21037}
21038
21039namespace {
21040
21041/// A group of stores that we'll try to bundle together using vector ops.
21042/// They are ordered using the signed distance of their address operand to the
21043/// address of this group's BaseInstr.
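/// For instance (a sketch): with a base of  store i32 %x, ptr %p,  stores to
/// %p + 4 and %p - 8 bytes (one i32 above and two below) are recorded with
/// distances +1 and -2 respectively.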
21044class RelatedStoreInsts {
21045public:
21046 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
21047 : AllStores(AllStores) {
21048 reset(NewBaseInstr: BaseInstrIdx);
21049 }
21050
21051 void reset(unsigned NewBaseInstr) {
21052 assert(NewBaseInstr < AllStores.size() &&
21053 "Instruction index out of bounds");
21054 BaseInstrIdx = NewBaseInstr;
21055 Instrs.clear();
21056 insertOrLookup(InstrIdx: NewBaseInstr, PtrDist: 0);
21057 }
21058
21059 /// Tries to insert \p InstrIdx as the store with a pointer distance of
21060 /// \p PtrDist.
21061 /// Does nothing if there is already a store with that \p PtrDist.
21062 /// \returns The previously associated Instruction index, or std::nullopt
21063 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
21064 auto [It, Inserted] = Instrs.emplace(args&: PtrDist, args&: InstrIdx);
21065 return Inserted ? std::nullopt : std::make_optional(t&: It->second);
21066 }
21067
21068 using DistToInstMap = std::map<int64_t, unsigned>;
21069 const DistToInstMap &getStores() const { return Instrs; }
21070
  /// If \p SI is related to this group of stores, return the distance of its
  /// pointer operand to that of the group's BaseInstr.
21073 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
21074 ScalarEvolution &SE) const {
21075 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
21076 return getPointersDiff(
21077 ElemTyA: BaseStore.getValueOperand()->getType(), PtrA: BaseStore.getPointerOperand(),
21078 ElemTyB: SI.getValueOperand()->getType(), PtrB: SI.getPointerOperand(), DL, SE,
21079 /*StrictCheck=*/true);
21080 }
21081
21082 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
21083 /// Stores whose index is less than \p MinSafeIdx will be dropped.
21084 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
21085 int64_t DistFromCurBase) {
21086 DistToInstMap PrevSet = std::move(Instrs);
21087 reset(NewBaseInstr: NewBaseInstIdx);
21088
21089 // Re-insert stores that come after MinSafeIdx to try and vectorize them
21090 // again. Their distance will be "rebased" to use NewBaseInstIdx as
21091 // reference.
21092 for (auto [Dist, InstIdx] : PrevSet) {
21093 if (InstIdx >= MinSafeIdx)
21094 insertOrLookup(InstrIdx: InstIdx, PtrDist: Dist - DistFromCurBase);
21095 }
21096 }
21097
21098 /// Remove all stores that have been vectorized from this group.
21099 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
21100 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
21101 Range: reverse(C&: Instrs), P: [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
21102 return VectorizedStores.contains(Ptr: AllStores[DistAndIdx.second]);
21103 });
21104
21105 // Get a forward iterator pointing after the last vectorized store and erase
21106 // all stores before it so we don't try to vectorize them again.
21107 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
21108 Instrs.erase(first: Instrs.begin(), last: VectorizedStoresEnd);
21109 }
21110
21111private:
21112 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
21113 unsigned BaseInstrIdx;
21114
21115 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
21116 DistToInstMap Instrs;
21117
21118 /// Reference to all the stores in the BB being analyzed.
21119 ArrayRef<StoreInst *> AllStores;
21120};
21121
21122} // end anonymous namespace
21123
21124bool SLPVectorizerPass::vectorizeStores(
21125 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
21126 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
21127 &Visited) {
21128 // We may run into multiple chains that merge into a single chain. We mark the
21129 // stores that we vectorized so that we don't visit the same store twice.
21130 BoUpSLP::ValueSet VectorizedStores;
21131 bool Changed = false;
21132
21133 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
21134 int64_t PrevDist = -1;
21135 BoUpSLP::ValueList Operands;
21136 // Collect the chain into a list.
21137 for (auto [Idx, Data] : enumerate(First: StoreSeq)) {
21138 auto &[Dist, InstIdx] = Data;
21139 if (Operands.empty() || Dist - PrevDist == 1) {
21140 Operands.push_back(Elt: Stores[InstIdx]);
21141 PrevDist = Dist;
21142 if (Idx != StoreSeq.size() - 1)
21143 continue;
21144 }
21145 auto E = make_scope_exit(F: [&, &Dist = Dist, &InstIdx = InstIdx]() {
21146 Operands.clear();
21147 Operands.push_back(Elt: Stores[InstIdx]);
21148 PrevDist = Dist;
21149 });
21150
21151 if (Operands.size() <= 1 ||
21152 !Visited
21153 .insert(V: {Operands.front(),
21154 cast<StoreInst>(Val: Operands.front())->getValueOperand(),
21155 Operands.back(),
21156 cast<StoreInst>(Val: Operands.back())->getValueOperand(),
21157 Operands.size()})
21158 .second)
21159 continue;
21160
21161 unsigned MaxVecRegSize = R.getMaxVecRegSize();
21162 unsigned EltSize = R.getVectorElementSize(V: Operands[0]);
21163 unsigned MaxElts = llvm::bit_floor(Value: MaxVecRegSize / EltSize);
21164
21165 unsigned MaxVF =
21166 std::min(a: R.getMaximumVF(ElemWidth: EltSize, Opcode: Instruction::Store), b: MaxElts);
21167 auto *Store = cast<StoreInst>(Val: Operands[0]);
21168 Type *StoreTy = Store->getValueOperand()->getType();
21169 Type *ValueTy = StoreTy;
21170 if (auto *Trunc = dyn_cast<TruncInst>(Val: Store->getValueOperand()))
21171 ValueTy = Trunc->getSrcTy();
      // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType, but
      // getStoreMinimumVF only supports scalar types as arguments. As a
      // result, we need to use the element types of StoreTy and ValueTy to
      // retrieve the VF and then transform it back.
      // Remember: VF is defined as the number of values we want to vectorize,
      // not the number of elements in the final vector.
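      // E.g. (illustrative numbers, REVEC case): for stores of <4 x i16>, a
      // scalar-based minimum VF of 8 i16 elements corresponds to 8 / 4 = 2
      // stores.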
21178 Type *StoreScalarTy = StoreTy->getScalarType();
21179 unsigned MinVF = PowerOf2Ceil(A: TTI->getStoreMinimumVF(
21180 VF: R.getMinVF(Sz: DL->getTypeStoreSizeInBits(Ty: StoreScalarTy)), ScalarMemTy: StoreScalarTy,
21181 ScalarValTy: ValueTy->getScalarType()));
21182 MinVF /= getNumElements(Ty: StoreTy);
21183 MinVF = std::max<unsigned>(a: 2, b: MinVF);
21184
21185 if (MaxVF < MinVF) {
21186 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
21187 << ") < "
21188 << "MinVF (" << MinVF << ")\n");
21189 continue;
21190 }
21191
21192 unsigned NonPowerOf2VF = 0;
21193 if (VectorizeNonPowerOf2) {
21194 // First try vectorizing with a non-power-of-2 VF. At the moment, only
21195 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
21196 // lanes are used.
21197 unsigned CandVF = std::clamp<unsigned>(val: Operands.size(), lo: MinVF, hi: MaxVF);
21198 if (has_single_bit(Value: CandVF + 1)) {
21199 NonPowerOf2VF = CandVF;
21200 assert(NonPowerOf2VF != MaxVF &&
21201 "Non-power-of-2 VF should not be equal to MaxVF");
21202 }
21203 }
21204
21205 // MaxRegVF represents the number of instructions (scalar, or vector in
21206 // case of revec) that can be vectorized to naturally fit in a vector
21207 // register.
21208 unsigned MaxRegVF = MaxVF;
21209
21210 MaxVF = std::min<unsigned>(a: MaxVF, b: bit_floor(Value: Operands.size()));
21211 if (MaxVF < MinVF) {
21212 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
21213 << ") < "
21214 << "MinVF (" << MinVF << ")\n");
21215 continue;
21216 }
21217
21218 SmallVector<unsigned> CandidateVFs;
21219 for (unsigned VF = std::max(a: MaxVF, b: NonPowerOf2VF); VF >= MinVF;
21220 VF = divideCeil(Numerator: VF, Denominator: 2))
21221 CandidateVFs.push_back(Elt: VF);
21222
21223 unsigned End = Operands.size();
21224 unsigned Repeat = 0;
21225 constexpr unsigned MaxAttempts = 4;
21226 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
21227 for (std::pair<unsigned, unsigned> &P : RangeSizes)
21228 P.first = P.second = 1;
21229 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
21230 auto IsNotVectorized = [](bool First,
21231 const std::pair<unsigned, unsigned> &P) {
21232 return First ? P.first > 0 : P.second > 0;
21233 };
21234 auto IsVectorized = [](bool First,
21235 const std::pair<unsigned, unsigned> &P) {
21236 return First ? P.first == 0 : P.second == 0;
21237 };
21238 auto VFIsProfitable = [](bool First, unsigned Size,
21239 const std::pair<unsigned, unsigned> &P) {
21240 return First ? Size >= P.first : Size >= P.second;
21241 };
21242 auto FirstSizeSame = [](unsigned Size,
21243 const std::pair<unsigned, unsigned> &P) {
21244 return Size == P.first;
21245 };
21246 while (true) {
21247 ++Repeat;
21248 bool RepeatChanged = false;
21249 bool AnyProfitableGraph = false;
21250 for (unsigned VF : CandidateVFs) {
21251 AnyProfitableGraph = false;
21252 unsigned FirstUnvecStore =
21253 std::distance(first: RangeSizes.begin(),
21254 last: find_if(Range&: RangeSizes, P: std::bind(f&: IsNotVectorized,
21255 args: VF >= MaxRegVF, args: _1)));
21256
21257 // Form slices of size VF starting from FirstUnvecStore and try to
21258 // vectorize them.
21259 while (FirstUnvecStore < End) {
21260 unsigned FirstVecStore = std::distance(
21261 first: RangeSizes.begin(),
21262 last: find_if(Range: RangeSizes.drop_front(N: FirstUnvecStore),
21263 P: std::bind(f&: IsVectorized, args: VF >= MaxRegVF, args: _1)));
21264 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
21265 for (unsigned SliceStartIdx = FirstUnvecStore;
21266 SliceStartIdx + VF <= MaxSliceEnd;) {
21267 if (!checkTreeSizes(Sizes: RangeSizes.slice(N: SliceStartIdx, M: VF),
21268 First: VF >= MaxRegVF)) {
21269 ++SliceStartIdx;
21270 continue;
21271 }
21272 ArrayRef<Value *> Slice =
21273 ArrayRef(Operands).slice(N: SliceStartIdx, M: VF);
21274 assert(all_of(Slice,
21275 [&](Value *V) {
21276 return cast<StoreInst>(V)
21277 ->getValueOperand()
21278 ->getType() ==
21279 cast<StoreInst>(Slice.front())
21280 ->getValueOperand()
21281 ->getType();
21282 }) &&
21283 "Expected all operands of same type.");
21284 if (!NonSchedulable.empty()) {
21285 auto [NonSchedSizeMax, NonSchedSizeMin] =
21286 NonSchedulable.lookup(Val: Slice.front());
21287 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
21288 // VF is too ambitious. Try to vectorize another slice before
21289 // trying a smaller VF.
21290 SliceStartIdx += NonSchedSizeMax;
21291 continue;
21292 }
21293 }
21294 unsigned TreeSize;
21295 std::optional<bool> Res =
21296 vectorizeStoreChain(Chain: Slice, R, Idx: SliceStartIdx, MinVF, Size&: TreeSize);
21297 if (!Res) {
21298 // Update the range of non schedulable VFs for slices starting
21299 // at SliceStartIdx.
21300 NonSchedulable
21301 .try_emplace(Key: Slice.front(), Args: std::make_pair(x&: VF, y&: VF))
21302 .first->getSecond()
21303 .second = VF;
21304 } else if (*Res) {
21305 // Mark the vectorized stores so that we don't vectorize them
21306 // again.
21307 VectorizedStores.insert_range(R&: Slice);
21310 AnyProfitableGraph = RepeatChanged = Changed = true;
21311 // If we vectorized initial block, no need to try to vectorize
21312 // it again.
21313 for (std::pair<unsigned, unsigned> &P :
21314 RangeSizes.slice(N: SliceStartIdx, M: VF))
21315 P.first = P.second = 0;
21316 if (SliceStartIdx < FirstUnvecStore + MinVF) {
21317 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
21318 N: FirstUnvecStore, M: SliceStartIdx - FirstUnvecStore))
21319 P.first = P.second = 0;
21320 FirstUnvecStore = SliceStartIdx + VF;
21321 }
21322 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
21323 for (std::pair<unsigned, unsigned> &P :
21324 RangeSizes.slice(N: SliceStartIdx + VF,
21325 M: MaxSliceEnd - (SliceStartIdx + VF)))
21326 P.first = P.second = 0;
21327 if (MaxSliceEnd == End)
21328 End = SliceStartIdx;
21329 MaxSliceEnd = SliceStartIdx;
21330 }
21331 SliceStartIdx += VF;
21332 continue;
21333 }
21334 if (VF > 2 && Res &&
21335 !all_of(Range: RangeSizes.slice(N: SliceStartIdx, M: VF),
21336 P: std::bind(f&: VFIsProfitable, args: VF >= MaxRegVF, args&: TreeSize,
21337 args: _1))) {
21338 SliceStartIdx += VF;
21339 continue;
21340 }
            // For very big VFs, check that we are not rebuilding the same
            // trees, just with a larger number of elements.
21343 if (VF > MaxRegVF && TreeSize > 1 &&
21344 all_of(Range: RangeSizes.slice(N: SliceStartIdx, M: VF),
21345 P: std::bind(f&: FirstSizeSame, args&: TreeSize, args: _1))) {
21346 SliceStartIdx += VF;
21347 while (SliceStartIdx != MaxSliceEnd &&
21348 RangeSizes[SliceStartIdx].first == TreeSize)
21349 ++SliceStartIdx;
21350 continue;
21351 }
21352 if (TreeSize > 1) {
21353 for (std::pair<unsigned, unsigned> &P :
21354 RangeSizes.slice(N: SliceStartIdx, M: VF)) {
21355 if (VF >= MaxRegVF)
21356 P.second = std::max(a: P.second, b: TreeSize);
21357 else
21358 P.first = std::max(a: P.first, b: TreeSize);
21359 }
21360 }
21361 ++SliceStartIdx;
21362 AnyProfitableGraph = true;
21363 }
21364 if (FirstUnvecStore >= End)
21365 break;
21366 if (MaxSliceEnd - FirstUnvecStore < VF &&
21367 MaxSliceEnd - FirstUnvecStore >= MinVF)
21368 AnyProfitableGraph = true;
21369 FirstUnvecStore = std::distance(
21370 first: RangeSizes.begin(),
21371 last: find_if(Range: RangeSizes.drop_front(N: MaxSliceEnd),
21372 P: std::bind(f&: IsNotVectorized, args: VF >= MaxRegVF, args: _1)));
21373 }
21374 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(Value: VF))
21375 break;
21376 }
21377 // All values vectorized - exit.
21378 if (all_of(Range&: RangeSizes, P: [](const std::pair<unsigned, unsigned> &P) {
21379 return P.first == 0 && P.second == 0;
21380 }))
21381 break;
21382 // Check if tried all attempts or no need for the last attempts at all.
21383 if (Repeat >= MaxAttempts ||
21384 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
21385 break;
21386 constexpr unsigned StoresLimit = 64;
21387 const unsigned MaxTotalNum = std::min<unsigned>(
21388 a: Operands.size(),
21389 b: static_cast<unsigned>(
21390 End -
21391 std::distance(
21392 first: RangeSizes.begin(),
21393 last: find_if(Range&: RangeSizes, P: std::bind(f&: IsNotVectorized, args: true, args: _1))) +
21394 1));
21395 unsigned VF = bit_ceil(Value: CandidateVFs.front()) * 2;
21396 unsigned Limit =
21397 getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: StoreTy, Sz: MaxTotalNum);
21398 CandidateVFs.clear();
21399 if (bit_floor(Value: Limit) == VF)
21400 CandidateVFs.push_back(Elt: Limit);
21401 if (VF > MaxTotalNum || VF >= StoresLimit)
21402 break;
21403 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
21404 if (P.first != 0)
21405 P.first = std::max(a: P.second, b: P.first);
21406 }
21407 // Last attempt to vectorize max number of elements, if all previous
21408 // attempts were unsuccessful because of the cost issues.
21409 CandidateVFs.push_back(Elt: VF);
21410 }
21411 }
21412 };
21413
21414 /// Groups of stores to vectorize
21415 SmallVector<RelatedStoreInsts> SortedStores;
21416
  // Inserts the specified store SI with the given index Idx into the set of
  // stores. If a store with the same distance is already present, stop the
  // insertion and try to vectorize the stores collected so far. If some stores
  // from this sequence were not vectorized, try to vectorize them together
  // with the new store later. However, this logic is applied only to the
  // stores that come before the previous store with the same distance.
21423 // Example:
21424 // 1. store x, %p
21425 // 2. store y, %p+1
21426 // 3. store z, %p+2
21427 // 4. store a, %p
21428 // 5. store b, %p+3
21429 // - Scan this from the last to first store. The very first bunch of stores is
21430 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
21431 // vector).
21432 // - The next store in the list - #1 - has the same distance from store #5 as
21433 // the store #4.
21434 // - Try to vectorize sequence of stores 4,2,3,5.
21435 // - If all these stores are vectorized - just drop them.
21436 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
21437 // - Start new stores sequence.
21438 // The new bunch of stores is {1, {1, 0}}.
21439 // - Add the stores from previous sequence, that were not vectorized.
  // Here we consider the stores in reversed order, rather than in the order
  // they appear in the IR (Stores are already reversed, see
  // vectorizeStoreChains()).
  // Store #3 can be added - it comes after store #4, which has the same
  // distance as store #1.
  // Store #5 cannot be added - it comes before store #4.
  // This logic improves compile time: we assume that the stores after the
  // previous store with the same distance most likely have memory dependencies
  // and it is not worth spending compile time trying to vectorize them.
21448 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
21449 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
21450 std::optional<int64_t> PtrDist;
21451 auto *RelatedStores = find_if(
21452 Range&: SortedStores, P: [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
21453 PtrDist = StoreSeq.getPointerDiff(SI&: *SI, DL: *DL, SE&: *SE);
21454 return PtrDist.has_value();
21455 });
21456
21457 // We did not find a comparable store, start a new group.
21458 if (RelatedStores == SortedStores.end()) {
21459 SortedStores.emplace_back(Args&: Idx, Args&: Stores);
21460 return;
21461 }
21462
21463 // If there is already a store in the group with the same PtrDiff, try to
21464 // vectorize the existing instructions before adding the current store.
21465 // Otherwise, insert this store and keep collecting.
21466 if (std::optional<unsigned> PrevInst =
21467 RelatedStores->insertOrLookup(InstrIdx: Idx, PtrDist: *PtrDist)) {
21468 TryToVectorize(RelatedStores->getStores());
21469 RelatedStores->clearVectorizedStores(VectorizedStores);
21470 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
21471 /*NewBaseInstIdx=*/Idx,
21472 /*DistFromCurBase=*/*PtrDist);
21473 }
21474 };
21475 Type *PrevValTy = nullptr;
21476 for (auto [I, SI] : enumerate(First&: Stores)) {
21477 if (R.isDeleted(I: SI))
21478 continue;
21479 if (!PrevValTy)
21480 PrevValTy = SI->getValueOperand()->getType();
21481 // Check that we do not try to vectorize stores of different types.
21482 if (PrevValTy != SI->getValueOperand()->getType()) {
21483 for (RelatedStoreInsts &StoreSeq : SortedStores)
21484 TryToVectorize(StoreSeq.getStores());
21485 SortedStores.clear();
21486 PrevValTy = SI->getValueOperand()->getType();
21487 }
21488 FillStoresSet(I, SI);
21489 }
21490
21491 // Final vectorization attempt.
21492 for (RelatedStoreInsts &StoreSeq : SortedStores)
21493 TryToVectorize(StoreSeq.getStores());
21494
21495 return Changed;
21496}
21497
21498void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
21499 // Initialize the collections. We will make a single pass over the block.
21500 Stores.clear();
21501 GEPs.clear();
21502
21503 // Visit the store and getelementptr instructions in BB and organize them in
21504 // Stores and GEPs according to the underlying objects of their pointer
21505 // operands.
21506 for (Instruction &I : *BB) {
21507 // Ignore store instructions that are volatile or have a pointer operand
21508 // that doesn't point to a scalar type.
21509 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
21510 if (!SI->isSimple())
21511 continue;
21512 if (!isValidElementType(Ty: SI->getValueOperand()->getType()))
21513 continue;
21514 Stores[getUnderlyingObject(V: SI->getPointerOperand())].push_back(Elt: SI);
21515 }
21516
21517 // Ignore getelementptr instructions that have more than one index, a
21518 // constant index, or a pointer operand that doesn't point to a scalar
21519 // type.
21520 else if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) {
21521 if (GEP->getNumIndices() != 1)
21522 continue;
21523 Value *Idx = GEP->idx_begin()->get();
21524 if (isa<Constant>(Val: Idx))
21525 continue;
21526 if (!isValidElementType(Ty: Idx->getType()))
21527 continue;
21528 if (GEP->getType()->isVectorTy())
21529 continue;
21530 GEPs[GEP->getPointerOperand()].push_back(Elt: GEP);
21531 }
21532 }
21533}
21534
21535bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
21536 bool MaxVFOnly) {
21537 if (VL.size() < 2)
21538 return false;
21539
21540 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
21541 << VL.size() << ".\n");
21542
21543 // Check that all of the parts are instructions of the same type,
21544 // we permit an alternate opcode via InstructionsState.
21545 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
21546 if (!S)
21547 return false;
21548
21549 Instruction *I0 = S.getMainOp();
21550 // Make sure invalid types (including vector type) are rejected before
21551 // determining vectorization factor for scalar instructions.
21552 for (Value *V : VL) {
21553 Type *Ty = V->getType();
21554 if (!isa<InsertElementInst>(Val: V) && !isValidElementType(Ty)) {
21555 // NOTE: the following will give user internal llvm type name, which may
21556 // not be useful.
21557 R.getORE()->emit(RemarkBuilder: [&]() {
21558 std::string TypeStr;
21559 llvm::raw_string_ostream OS(TypeStr);
21560 Ty->print(O&: OS);
21561 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
21562 << "Cannot SLP vectorize list: type "
21563 << TypeStr + " is unsupported by vectorizer";
21564 });
21565 return false;
21566 }
21567 }
21568
21569 Type *ScalarTy = getValueType(V: VL[0]);
21570 unsigned Sz = R.getVectorElementSize(V: I0);
21571 unsigned MinVF = R.getMinVF(Sz);
21572 unsigned MaxVF = std::max<unsigned>(
21573 a: getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: ScalarTy, Sz: VL.size()), b: MinVF);
21574 MaxVF = std::min(a: R.getMaximumVF(ElemWidth: Sz, Opcode: S.getOpcode()), b: MaxVF);
21575 if (MaxVF < 2) {
21576 R.getORE()->emit(RemarkBuilder: [&]() {
21577 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
21578 << "Cannot SLP vectorize list: vectorization factor "
21579 << "less than 2 is not supported";
21580 });
21581 return false;
21582 }
21583
21584 bool Changed = false;
21585 bool CandidateFound = false;
21586 InstructionCost MinCost = SLPCostThreshold.getValue();
21587
21588 unsigned NextInst = 0, MaxInst = VL.size();
21589 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
21590 VF = getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: I0->getType(), Sz: VF - 1)) {
    // No actual vectorization should happen if the number of parts is the same
    // as the provided vectorization factor (i.e. the scalar type is used for
    // vector code during codegen).
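    // E.g. (sketch): on a target with 64-bit vector registers, a VF of 4 with
    // i64 elements is split into 4 parts, which is effectively scalar code, so
    // that VF is skipped.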
21594 auto *VecTy = getWidenedType(ScalarTy, VF);
21595 if (TTI->getNumberOfParts(Tp: VecTy) == VF)
21596 continue;
21597 for (unsigned I = NextInst; I < MaxInst; ++I) {
21598 unsigned ActualVF = std::min(a: MaxInst - I, b: VF);
21599
21600 if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ScalarTy, Sz: ActualVF))
21601 continue;
21602
21603 if (MaxVFOnly && ActualVF < MaxVF)
21604 break;
21605 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
21606 break;
21607
21608 SmallVector<Value *> Ops(ActualVF, nullptr);
21609 unsigned Idx = 0;
21610 for (Value *V : VL.drop_front(N: I)) {
21611 // Check that a previous iteration of this loop did not delete the
21612 // Value.
21613 if (auto *Inst = dyn_cast<Instruction>(Val: V);
21614 !Inst || !R.isDeleted(I: Inst)) {
21615 Ops[Idx] = V;
21616 ++Idx;
21617 if (Idx == ActualVF)
21618 break;
21619 }
21620 }
21621 // Not enough vectorizable instructions - exit.
21622 if (Idx != ActualVF)
21623 break;
21624
21625 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
21626 << "\n");
21627
21628 R.buildTree(Roots: Ops);
21629 if (R.isTreeTinyAndNotFullyVectorizable())
21630 continue;
21631 if (R.isProfitableToReorder()) {
21632 R.reorderTopToBottom();
21633 R.reorderBottomToTop(IgnoreReorder: !isa<InsertElementInst>(Val: Ops.front()));
21634 }
21635 R.transformNodes();
21636 R.buildExternalUses();
21637
21638 R.computeMinimumValueSizes();
21639 InstructionCost Cost = R.getTreeCost();
21640 CandidateFound = true;
21641 MinCost = std::min(a: MinCost, b: Cost);
21642
21643 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
21644 << " for VF=" << ActualVF << "\n");
21645 if (Cost < -SLPCostThreshold) {
21646 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
21647 R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "VectorizedList",
21648 cast<Instruction>(Val: Ops[0]))
21649 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
21650 << " and with tree size "
21651 << ore::NV("TreeSize", R.getTreeSize()));
21652
21653 R.vectorizeTree();
21654 // Move to the next bundle.
21655 I += VF - 1;
21656 NextInst = I + 1;
21657 Changed = true;
21658 }
21659 }
21660 }
21661
21662 if (!Changed && CandidateFound) {
21663 R.getORE()->emit(RemarkBuilder: [&]() {
21664 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
21665 << "List vectorization was possible but not beneficial with cost "
21666 << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Threshold", -SLPCostThreshold);
21668 });
21669 } else if (!Changed) {
21670 R.getORE()->emit(RemarkBuilder: [&]() {
21671 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
21672 << "Cannot SLP vectorize list: vectorization was impossible"
21673 << " with available vectorization factors";
21674 });
21675 }
21676 return Changed;
21677}
21678
21679bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
21680 if (!I)
21681 return false;
21682
21683 if (!isa<BinaryOperator, CmpInst>(Val: I) || isa<VectorType>(Val: I->getType()))
21684 return false;
21685
21686 Value *P = I->getParent();
21687
21688 // Vectorize in current basic block only.
21689 auto *Op0 = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
21690 auto *Op1 = dyn_cast<Instruction>(Val: I->getOperand(i: 1));
21691 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
21692 R.isDeleted(I: Op0) || R.isDeleted(I: Op1))
21693 return false;
21694
21695 // First collect all possible candidates
21696 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
21697 Candidates.emplace_back(Args&: Op0, Args&: Op1);
21698
21699 auto *A = dyn_cast<BinaryOperator>(Val: Op0);
21700 auto *B = dyn_cast<BinaryOperator>(Val: Op1);
21701 // Try to skip B.
21702 if (A && B && B->hasOneUse()) {
21703 auto *B0 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 0));
21704 auto *B1 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 1));
21705 if (B0 && B0->getParent() == P && !R.isDeleted(I: B0))
21706 Candidates.emplace_back(Args&: A, Args&: B0);
21707 if (B1 && B1->getParent() == P && !R.isDeleted(I: B1))
21708 Candidates.emplace_back(Args&: A, Args&: B1);
21709 }
21710 // Try to skip A.
21711 if (B && A && A->hasOneUse()) {
21712 auto *A0 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 0));
21713 auto *A1 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 1));
21714 if (A0 && A0->getParent() == P && !R.isDeleted(I: A0))
21715 Candidates.emplace_back(Args&: A0, Args&: B);
21716 if (A1 && A1->getParent() == P && !R.isDeleted(I: A1))
21717 Candidates.emplace_back(Args&: A1, Args&: B);
21718 }
21719
21720 if (Candidates.size() == 1)
21721 return tryToVectorizeList(VL: {Op0, Op1}, R);
21722
21723 // We have multiple options. Try to pick the single best.
21724 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
21725 if (!BestCandidate)
21726 return false;
21727 return tryToVectorizeList(
21728 VL: {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
21729}
21730
21731namespace {
21732
21733/// Model horizontal reductions.
21734///
21735/// A horizontal reduction is a tree of reduction instructions that has values
21736/// that can be put into a vector as its leaves. For example:
21737///
21738/// mul mul mul mul
21739/// \ / \ /
21740/// + +
21741/// \ /
21742/// +
21743/// This tree has "mul" as its leaf values and "+" as its reduction
21744/// instructions. A reduction can feed into a store or a binary operation
21745/// feeding a phi.
21746/// ...
21747/// \ /
21748/// +
21749/// |
21750/// phi +=
21751///
21752/// Or:
21753/// ...
21754/// \ /
21755/// +
21756/// |
21757/// *p =
21758///
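/// As a purely illustrative sketch (the value names are invented, not taken
/// from any particular test), the kind of IR this models is a chain such as:
///   %m0 = mul i32 %a0, %b0
///   %m1 = mul i32 %a1, %b1
///   %m2 = mul i32 %a2, %b2
///   %m3 = mul i32 %a3, %b3
///   %s0 = add i32 %m0, %m1
///   %s1 = add i32 %m2, %m3
///   %s2 = add i32 %s0, %s1
///   store i32 %s2, ptr %p
/// Here the "add" instructions are the reduction operations and the "mul"
/// results are the reduced values that may be gathered into a vector.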
21759class HorizontalReduction {
21760 using ReductionOpsType = SmallVector<Value *, 16>;
21761 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
21762 ReductionOpsListType ReductionOps;
21763 /// List of possibly reduced values.
21764 SmallVector<SmallVector<Value *>> ReducedVals;
21765 /// Maps reduced value to the corresponding reduction operation.
21766 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
21767 WeakTrackingVH ReductionRoot;
21768 /// The type of reduction operation.
21769 RecurKind RdxKind;
21770 /// Checks if the optimization of original scalar identity operations on
21771 /// matched horizontal reductions is enabled and allowed.
21772 bool IsSupportedHorRdxIdentityOp = false;
21773 /// Contains vector values for reduction including their scale factor and
21774 /// signedness.
21775 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
21776
21777 static bool isCmpSelMinMax(Instruction *I) {
21778 return match(V: I, P: m_Select(C: m_Cmp(), L: m_Value(), R: m_Value())) &&
21779 RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: getRdxKind(V: I));
21780 }
21781
21782 // And/or are potentially poison-safe logical patterns like:
21783 // select x, y, false
21784 // select x, true, y
21785 static bool isBoolLogicOp(Instruction *I) {
21786 return isa<SelectInst>(Val: I) &&
21787 (match(V: I, P: m_LogicalAnd()) || match(V: I, P: m_LogicalOr()));
21788 }
21789
21790 /// Checks if instruction is associative and can be vectorized.
21791 static bool isVectorizable(RecurKind Kind, Instruction *I) {
21792 if (Kind == RecurKind::None)
21793 return false;
21794
21795 // Integer ops that map to select instructions or intrinsics are fine.
21796 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
21797 isBoolLogicOp(I))
21798 return true;
21799
21800 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
21801 // FP min/max are associative except for NaN and -0.0. We do not
21802 // have to rule out -0.0 here because the intrinsic semantics do not
21803 // specify a fixed result for it.
21804 return I->getFastMathFlags().noNaNs();
21805 }
21806
21807 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
21808 return true;
21809
21810 return I->isAssociative();
21811 }
21812
21813 static Value *getRdxOperand(Instruction *I, unsigned Index) {
21814 // Poison-safe 'or' takes the form: select X, true, Y
21815 // To make that work with the normal operand processing, we skip the
21816 // true value operand.
21817 // TODO: Change the code and data structures to handle this without a hack.
21818 if (getRdxKind(V: I) == RecurKind::Or && isa<SelectInst>(Val: I) && Index == 1)
21819 return I->getOperand(i: 2);
21820 return I->getOperand(i: Index);
21821 }
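  // A small worked example (assumed IR, for illustration): for the
  // poison-safe 'or' form
  //   %rdx = select i1 %x, i1 true, i1 %y
  // getRdxOperand(%rdx, 0) returns %x and getRdxOperand(%rdx, 1) returns %y,
  // i.e. the same two operands a plain 'or %x, %y' would expose.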
21822
21823 /// Creates reduction operation with the current opcode.
21824 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
21825 Value *RHS, const Twine &Name, bool UseSelect) {
21826 Type *OpTy = LHS->getType();
21827 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
21828 switch (Kind) {
21829 case RecurKind::Or: {
21830 if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy))
21831 return Builder.CreateSelect(
21832 C: LHS, True: ConstantInt::getAllOnesValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)),
21833 False: RHS, Name);
21834 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
21835 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
21836 Name);
21837 }
21838 case RecurKind::And: {
21839 if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy))
21840 return Builder.CreateSelect(
21841 C: LHS, True: RHS,
21842 False: ConstantInt::getNullValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)), Name);
21843 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
21844 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
21845 Name);
21846 }
21847 case RecurKind::Add:
21848 case RecurKind::Mul:
21849 case RecurKind::Xor:
21850 case RecurKind::FAdd:
21851 case RecurKind::FMul: {
21852 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
21853 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
21854 Name);
21855 }
21856 case RecurKind::SMax:
21857 case RecurKind::SMin:
21858 case RecurKind::UMax:
21859 case RecurKind::UMin:
21860 if (UseSelect) {
21861 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(RK: Kind);
21862 Value *Cmp = Builder.CreateICmp(P: Pred, LHS, RHS, Name);
21863 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
21864 }
21865 [[fallthrough]];
21866 case RecurKind::FMax:
21867 case RecurKind::FMin:
21868 case RecurKind::FMaximum:
21869 case RecurKind::FMinimum:
21870 case RecurKind::FMaximumNum:
21871 case RecurKind::FMinimumNum: {
21872 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(RK: Kind);
21873 return Builder.CreateBinaryIntrinsic(ID: Id, LHS, RHS);
21874 }
21875 default:
21876 llvm_unreachable("Unknown reduction operation.");
21877 }
21878 }
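  // For illustration (assumed IR): with UseSelect == true and Kind ==
  // RecurKind::SMax the helper above emits the cmp + select form
  //   %cmp = icmp sgt i32 %lhs, %rhs
  //   %rdx = select i1 %cmp, i32 %lhs, i32 %rhs
  // whereas with UseSelect == false it emits the smax intrinsic instead.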
21879
21880 /// Creates reduction operation with the current opcode with the IR flags
21881 /// from \p ReductionOps, dropping nuw/nsw flags.
21882 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
21883 Value *RHS, const Twine &Name,
21884 const ReductionOpsListType &ReductionOps) {
21885 bool UseSelect = ReductionOps.size() == 2 ||
21886 // Logical or/and.
21887 (ReductionOps.size() == 1 &&
21888 any_of(Range: ReductionOps.front(), P: IsaPred<SelectInst>));
21889 assert((!UseSelect || ReductionOps.size() != 2 ||
21890 isa<SelectInst>(ReductionOps[1][0])) &&
21891 "Expected cmp + select pairs for reduction");
21892 Value *Op = createOp(Builder, Kind: RdxKind, LHS, RHS, Name, UseSelect);
21893 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: RdxKind)) {
21894 if (auto *Sel = dyn_cast<SelectInst>(Val: Op)) {
21895 propagateIRFlags(I: Sel->getCondition(), VL: ReductionOps[0], OpValue: nullptr,
21896 /*IncludeWrapFlags=*/false);
21897 propagateIRFlags(I: Op, VL: ReductionOps[1], OpValue: nullptr,
21898 /*IncludeWrapFlags=*/false);
21899 return Op;
21900 }
21901 }
21902 propagateIRFlags(I: Op, VL: ReductionOps[0], OpValue: nullptr, /*IncludeWrapFlags=*/false);
21903 return Op;
21904 }
21905
21906public:
21907 static RecurKind getRdxKind(Value *V) {
21908 auto *I = dyn_cast<Instruction>(Val: V);
21909 if (!I)
21910 return RecurKind::None;
21911 if (match(V: I, P: m_Add(L: m_Value(), R: m_Value())))
21912 return RecurKind::Add;
21913 if (match(V: I, P: m_Mul(L: m_Value(), R: m_Value())))
21914 return RecurKind::Mul;
21915 if (match(V: I, P: m_And(L: m_Value(), R: m_Value())) ||
21916 match(V: I, P: m_LogicalAnd(L: m_Value(), R: m_Value())))
21917 return RecurKind::And;
21918 if (match(V: I, P: m_Or(L: m_Value(), R: m_Value())) ||
21919 match(V: I, P: m_LogicalOr(L: m_Value(), R: m_Value())))
21920 return RecurKind::Or;
21921 if (match(V: I, P: m_Xor(L: m_Value(), R: m_Value())))
21922 return RecurKind::Xor;
21923 if (match(V: I, P: m_FAdd(L: m_Value(), R: m_Value())))
21924 return RecurKind::FAdd;
21925 if (match(V: I, P: m_FMul(L: m_Value(), R: m_Value())))
21926 return RecurKind::FMul;
21927
21928 if (match(V: I, P: m_Intrinsic<Intrinsic::maxnum>(Op0: m_Value(), Op1: m_Value())))
21929 return RecurKind::FMax;
21930 if (match(V: I, P: m_Intrinsic<Intrinsic::minnum>(Op0: m_Value(), Op1: m_Value())))
21931 return RecurKind::FMin;
21932
21933 if (match(V: I, P: m_FMaximum(Op0: m_Value(), Op1: m_Value())))
21934 return RecurKind::FMaximum;
21935 if (match(V: I, P: m_FMinimum(Op0: m_Value(), Op1: m_Value())))
21936 return RecurKind::FMinimum;
21937 // This matches either cmp+select or intrinsics. SLP is expected to handle
21938 // either form.
21939 // TODO: If we are canonicalizing to intrinsics, we can remove several
21940 // special-case paths that deal with selects.
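    // For instance, both of these (illustrative IR) map to RecurKind::SMin:
    //   %s1 = call i32 @llvm.smin.i32(i32 %a, i32 %b)
    // or
    //   %c  = icmp slt i32 %a, %b
    //   %s2 = select i1 %c, i32 %a, i32 %b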
21941 if (match(V: I, P: m_SMax(L: m_Value(), R: m_Value())))
21942 return RecurKind::SMax;
21943 if (match(V: I, P: m_SMin(L: m_Value(), R: m_Value())))
21944 return RecurKind::SMin;
21945 if (match(V: I, P: m_UMax(L: m_Value(), R: m_Value())))
21946 return RecurKind::UMax;
21947 if (match(V: I, P: m_UMin(L: m_Value(), R: m_Value())))
21948 return RecurKind::UMin;
21949
21950 if (auto *Select = dyn_cast<SelectInst>(Val: I)) {
21951 // Try harder: look for min/max pattern based on instructions producing
21952 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
21953 // During the intermediate stages of SLP, it's very common to have
21954 // pattern like this (since optimizeGatherSequence is run only once
21955 // at the end):
21956 // %1 = extractelement <2 x i32> %a, i32 0
21957 // %2 = extractelement <2 x i32> %a, i32 1
21958 // %cond = icmp sgt i32 %1, %2
21959 // %3 = extractelement <2 x i32> %a, i32 0
21960 // %4 = extractelement <2 x i32> %a, i32 1
21961 // %select = select i1 %cond, i32 %3, i32 %4
21962 CmpPredicate Pred;
21963 Instruction *L1;
21964 Instruction *L2;
21965
21966 Value *LHS = Select->getTrueValue();
21967 Value *RHS = Select->getFalseValue();
21968 Value *Cond = Select->getCondition();
21969
21970 // TODO: Support inverse predicates.
21971 if (match(V: Cond, P: m_Cmp(Pred, L: m_Specific(V: LHS), R: m_Instruction(I&: L2)))) {
21972 if (!isa<ExtractElementInst>(Val: RHS) ||
21973 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
21974 return RecurKind::None;
21975 } else if (match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Specific(V: RHS)))) {
21976 if (!isa<ExtractElementInst>(Val: LHS) ||
21977 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)))
21978 return RecurKind::None;
21979 } else {
21980 if (!isa<ExtractElementInst>(Val: LHS) || !isa<ExtractElementInst>(Val: RHS))
21981 return RecurKind::None;
21982 if (!match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Instruction(I&: L2))) ||
21983 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)) ||
21984 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
21985 return RecurKind::None;
21986 }
21987
21988 switch (Pred) {
21989 default:
21990 return RecurKind::None;
21991 case CmpInst::ICMP_SGT:
21992 case CmpInst::ICMP_SGE:
21993 return RecurKind::SMax;
21994 case CmpInst::ICMP_SLT:
21995 case CmpInst::ICMP_SLE:
21996 return RecurKind::SMin;
21997 case CmpInst::ICMP_UGT:
21998 case CmpInst::ICMP_UGE:
21999 return RecurKind::UMax;
22000 case CmpInst::ICMP_ULT:
22001 case CmpInst::ICMP_ULE:
22002 return RecurKind::UMin;
22003 }
22004 }
22005 return RecurKind::None;
22006 }
22007
22008 /// Get the index of the first operand.
22009 static unsigned getFirstOperandIndex(Instruction *I) {
22010 return isCmpSelMinMax(I) ? 1 : 0;
22011 }
22012
22013private:
22014 /// Total number of operands in the reduction operation.
22015 static unsigned getNumberOfOperands(Instruction *I) {
22016 return isCmpSelMinMax(I) ? 3 : 2;
22017 }
22018
22019 /// Checks if the instruction is in basic block \p BB.
22020 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
22021 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
22022 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
22023 auto *Sel = cast<SelectInst>(Val: I);
22024 auto *Cmp = dyn_cast<Instruction>(Val: Sel->getCondition());
22025 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
22026 }
22027 return I->getParent() == BB;
22028 }
22029
22030 /// Expected number of uses for reduction operations/reduced values.
22031 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
22032 if (IsCmpSelMinMax) {
22033       // The SelectInst must be used twice, while the condition op must have
22034       // a single use only.
22035 if (auto *Sel = dyn_cast<SelectInst>(Val: I))
22036 return Sel->hasNUses(N: 2) && Sel->getCondition()->hasOneUse();
22037 return I->hasNUses(N: 2);
22038 }
22039
22040 // Arithmetic reduction operation must be used once only.
22041 return I->hasOneUse();
22042 }
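  // Illustration of the use counts above (assumed IR): in a cmp + select
  // min/max chain such as
  //   %c1 = icmp sgt i32 %a, %b
  //   %m1 = select i1 %c1, i32 %a, i32 %b
  //   %c2 = icmp sgt i32 %m1, %x
  //   %m2 = select i1 %c2, i32 %m1, i32 %x
  // the inner select %m1 has exactly two uses (%c2 and %m2) while its
  // condition %c1 has a single use, which is what the check expects.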
22043
22044 /// Initializes the list of reduction operations.
22045 void initReductionOps(Instruction *I) {
22046 if (isCmpSelMinMax(I))
22047 ReductionOps.assign(NumElts: 2, Elt: ReductionOpsType());
22048 else
22049 ReductionOps.assign(NumElts: 1, Elt: ReductionOpsType());
22050 }
22051
22052 /// Add all reduction operations for the reduction instruction \p I.
22053 void addReductionOps(Instruction *I) {
22054 if (isCmpSelMinMax(I)) {
22055 ReductionOps[0].emplace_back(Args: cast<SelectInst>(Val: I)->getCondition());
22056 ReductionOps[1].emplace_back(Args&: I);
22057 } else {
22058 ReductionOps[0].emplace_back(Args&: I);
22059 }
22060 }
22061
22062 static bool isGoodForReduction(ArrayRef<Value *> Data) {
22063 int Sz = Data.size();
22064 auto *I = dyn_cast<Instruction>(Val: Data.front());
22065 return Sz > 1 || isConstant(V: Data.front()) ||
22066 (I && !isa<LoadInst>(Val: I) && isValidForAlternation(Opcode: I->getOpcode()));
22067 }
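  // Informal reading of the check above: a group is worth reducing if it has
  // more than one element, or its single element is a constant or a non-load
  // instruction whose opcode is valid for alternation; a lone load is not
  // considered good on its own.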
22068
22069public:
22070 HorizontalReduction() = default;
22071
22072 /// Try to find a reduction tree.
22073 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
22074 ScalarEvolution &SE, const DataLayout &DL,
22075 const TargetLibraryInfo &TLI) {
22076 RdxKind = HorizontalReduction::getRdxKind(V: Root);
22077 if (!isVectorizable(Kind: RdxKind, I: Root))
22078 return false;
22079
22080 // Analyze "regular" integer/FP types for reductions - no target-specific
22081 // types or pointers.
22082 Type *Ty = Root->getType();
22083 if (!isValidElementType(Ty) || Ty->isPointerTy())
22084 return false;
22085
22086     // Though the ultimate reduction may have multiple uses, its condition
22087     // must have only a single use.
22088 if (auto *Sel = dyn_cast<SelectInst>(Val: Root))
22089 if (!Sel->getCondition()->hasOneUse())
22090 return false;
22091
22092 ReductionRoot = Root;
22093
22094 // Iterate through all the operands of the possible reduction tree and
22095 // gather all the reduced values, sorting them by their value id.
22096 BasicBlock *BB = Root->getParent();
22097 bool IsCmpSelMinMax = isCmpSelMinMax(I: Root);
22098 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
22099 1, std::make_pair(x&: Root, y: 0));
22100 // Checks if the operands of the \p TreeN instruction are also reduction
22101 // operations or should be treated as reduced values or an extra argument,
22102 // which is not part of the reduction.
22103 auto CheckOperands = [&](Instruction *TreeN,
22104 SmallVectorImpl<Value *> &PossibleReducedVals,
22105 SmallVectorImpl<Instruction *> &ReductionOps,
22106 unsigned Level) {
22107 for (int I : reverse(C: seq<int>(Begin: getFirstOperandIndex(I: TreeN),
22108 End: getNumberOfOperands(I: TreeN)))) {
22109 Value *EdgeVal = getRdxOperand(I: TreeN, Index: I);
22110 ReducedValsToOps[EdgeVal].push_back(Elt: TreeN);
22111 auto *EdgeInst = dyn_cast<Instruction>(Val: EdgeVal);
22112         // If the edge is not an instruction, or it differs from the main
22113         // reduction opcode or has too many uses, treat it as a possible
22114         // reduced value. Also, do not try to reduce constant values if the
22115         // operation is not foldable.
22116 if (!EdgeInst || Level > RecursionMaxDepth ||
22117 getRdxKind(V: EdgeInst) != RdxKind ||
22118 IsCmpSelMinMax != isCmpSelMinMax(I: EdgeInst) ||
22119 !hasRequiredNumberOfUses(IsCmpSelMinMax, I: EdgeInst) ||
22120 !isVectorizable(Kind: RdxKind, I: EdgeInst) ||
22121 (R.isAnalyzedReductionRoot(I: EdgeInst) &&
22122 all_of(Range: EdgeInst->operands(), P: IsaPred<Constant>))) {
22123 PossibleReducedVals.push_back(Elt: EdgeVal);
22124 continue;
22125 }
22126 ReductionOps.push_back(Elt: EdgeInst);
22127 }
22128 };
22129     // Try to regroup the reduced values so that reducing them becomes more
22130     // profitable. Values are grouped by their value ids, instructions by their
22131     // opcode id and/or alternate opcode id, plus extra analysis is done for
22132     // loads (grouping them by the distance between pointers) and cmp
22133     // instructions (grouping them by the predicate).
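    // As a rough, hypothetical illustration of the grouping: for reduced
    // values
    //   %l0 = load i32, ptr %p
    //   %l1 = load i32, ptr %q
    //   %g  = getelementptr inbounds i32, ptr %p, i64 1
    //   %l2 = load i32, ptr %g
    // the loads %l0 and %l2, whose pointers differ by a known constant, are
    // expected to end up in the same group so they can later form a
    // consecutive vector load, while %l1 is keyed separately.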
22134 SmallMapVector<
22135 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
22136 8>
22137 PossibleReducedVals;
22138 initReductionOps(I: Root);
22139 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
22140 SmallSet<size_t, 2> LoadKeyUsed;
22141
22142 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
22143 Key = hash_combine(args: hash_value(ptr: LI->getParent()), args: Key);
22144 Value *Ptr =
22145 getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
22146 if (!LoadKeyUsed.insert(V: Key).second) {
22147 auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr));
22148 if (LIt != LoadsMap.end()) {
22149 for (LoadInst *RLI : LIt->second) {
22150 if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
22151 ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL, SE,
22152 /*StrictCheck=*/true))
22153 return hash_value(ptr: RLI->getPointerOperand());
22154 }
22155 for (LoadInst *RLI : LIt->second) {
22156 if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
22157 Ptr2: LI->getPointerOperand(), TLI)) {
22158 hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
22159 return SubKey;
22160 }
22161 }
22162 if (LIt->second.size() > 2) {
22163 hash_code SubKey =
22164 hash_value(ptr: LIt->second.back()->getPointerOperand());
22165 return SubKey;
22166 }
22167 }
22168 }
22169 LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr))
22170 .first->second.push_back(Elt: LI);
22171 return hash_value(ptr: LI->getPointerOperand());
22172 };
22173
22174 while (!Worklist.empty()) {
22175 auto [TreeN, Level] = Worklist.pop_back_val();
22176 SmallVector<Value *> PossibleRedVals;
22177 SmallVector<Instruction *> PossibleReductionOps;
22178 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
22179 addReductionOps(I: TreeN);
22180 // Add reduction values. The values are sorted for better vectorization
22181 // results.
22182 for (Value *V : PossibleRedVals) {
22183 size_t Key, Idx;
22184 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
22185 /*AllowAlternate=*/false);
22186 ++PossibleReducedVals[Key][Idx]
22187 .insert(KV: std::make_pair(x&: V, y: 0))
22188 .first->second;
22189 }
22190 for (Instruction *I : reverse(C&: PossibleReductionOps))
22191 Worklist.emplace_back(Args&: I, Args: I->getParent() == BB ? 0 : Level + 1);
22192 }
22193 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
22194     // Sort values by the total number of value kinds so that the reduction
22195     // starts from the longest possible sequences of reduced values.
22196 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
22197 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
22198 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
22199 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
22200 It != E; ++It) {
22201 PossibleRedValsVect.emplace_back();
22202 auto RedValsVect = It->second.takeVector();
22203 stable_sort(Range&: RedValsVect, C: llvm::less_second());
22204 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
22205 PossibleRedValsVect.back().append(NumInputs: Data.second, Elt: Data.first);
22206 }
22207 stable_sort(Range&: PossibleRedValsVect, C: [](const auto &P1, const auto &P2) {
22208 return P1.size() > P2.size();
22209 });
22210 int NewIdx = -1;
22211 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
22212 if (NewIdx < 0 ||
22213 (!isGoodForReduction(Data) &&
22214 (!isa<LoadInst>(Val: Data.front()) ||
22215 !isa<LoadInst>(Val: ReducedVals[NewIdx].front()) ||
22216 getUnderlyingObject(
22217 V: cast<LoadInst>(Val: Data.front())->getPointerOperand()) !=
22218 getUnderlyingObject(
22219 V: cast<LoadInst>(Val: ReducedVals[NewIdx].front())
22220 ->getPointerOperand())))) {
22221 NewIdx = ReducedVals.size();
22222 ReducedVals.emplace_back();
22223 }
22224 ReducedVals[NewIdx].append(in_start: Data.rbegin(), in_end: Data.rend());
22225 }
22226 }
22227     // Sort the reduced values by the number of values with the same/alternate
22228     // opcode and/or pointer operand.
22229 stable_sort(Range&: ReducedVals, C: [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
22230 return P1.size() > P2.size();
22231 });
22232 return true;
22233 }
22234
22235 /// Attempt to vectorize the tree found by matchAssociativeReduction.
22236 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
22237 const TargetLibraryInfo &TLI, AssumptionCache *AC) {
22238 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
22239 constexpr unsigned RegMaxNumber = 4;
22240 constexpr unsigned RedValsMaxNumber = 128;
22241 // If there are a sufficient number of reduction values, reduce
22242 // to a nearby power-of-2. We can safely generate oversized
22243 // vectors and rely on the backend to split them to legal sizes.
22244 if (unsigned NumReducedVals = std::accumulate(
22245 first: ReducedVals.begin(), last: ReducedVals.end(), init: 0,
22246 binary_op: [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
22247 if (!isGoodForReduction(Data: Vals))
22248 return Num;
22249 return Num + Vals.size();
22250 });
22251 NumReducedVals < ReductionLimit &&
22252 all_of(Range&: ReducedVals, P: [](ArrayRef<Value *> RedV) {
22253 return RedV.size() < 2 || !allConstant(VL: RedV) || !isSplat(VL: RedV);
22254 })) {
22255 for (ReductionOpsType &RdxOps : ReductionOps)
22256 for (Value *RdxOp : RdxOps)
22257 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
22258 return nullptr;
22259 }
22260
22261 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
22262 TargetFolder(DL));
22263 Builder.SetInsertPoint(cast<Instruction>(Val&: ReductionRoot));
22264
22265     // Track the reduced values in case they are replaced by extractelement
22266     // instructions because of the vectorization.
22267 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
22268 ReducedVals.front().size());
22269
22270 // The compare instruction of a min/max is the insertion point for new
22271 // instructions and may be replaced with a new compare instruction.
22272 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
22273 assert(isa<SelectInst>(RdxRootInst) &&
22274 "Expected min/max reduction to have select root instruction");
22275 Value *ScalarCond = cast<SelectInst>(Val: RdxRootInst)->getCondition();
22276 assert(isa<Instruction>(ScalarCond) &&
22277 "Expected min/max reduction to have compare condition");
22278 return cast<Instruction>(Val: ScalarCond);
22279 };
22280
22281 bool AnyBoolLogicOp = any_of(Range&: ReductionOps.back(), P: [](Value *V) {
22282 return isBoolLogicOp(I: cast<Instruction>(Val: V));
22283 });
22284 // Return new VectorizedTree, based on previous value.
22285 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
22286 if (VectorizedTree) {
22287 // Update the final value in the reduction.
22288 Builder.SetCurrentDebugLocation(
22289 cast<Instruction>(Val: ReductionOps.front().front())->getDebugLoc());
22290 if (AnyBoolLogicOp) {
22291 auto It = ReducedValsToOps.find(Val: VectorizedTree);
22292 auto It1 = ReducedValsToOps.find(Val: Res);
22293 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
22294 isGuaranteedNotToBePoison(V: VectorizedTree, AC) ||
22295 (It != ReducedValsToOps.end() &&
22296 any_of(Range&: It->getSecond(), P: [&](Instruction *I) {
22297 return isBoolLogicOp(I) &&
22298 getRdxOperand(I, Index: 0) == VectorizedTree;
22299 }))) {
22300 ;
22301 } else if (isGuaranteedNotToBePoison(V: Res, AC) ||
22302 (It1 != ReducedValsToOps.end() &&
22303 any_of(Range&: It1->getSecond(), P: [&](Instruction *I) {
22304 return isBoolLogicOp(I) && getRdxOperand(I, Index: 0) == Res;
22305 }))) {
22306 std::swap(a&: VectorizedTree, b&: Res);
22307 } else {
22308 VectorizedTree = Builder.CreateFreeze(V: VectorizedTree);
22309 }
22310 }
22311
22312 return createOp(Builder, RdxKind, LHS: VectorizedTree, RHS: Res, Name: "op.rdx",
22313 ReductionOps);
22314 }
22315 // Initialize the final value in the reduction.
22316 return Res;
22317 };
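    // Sketch of what the lambda above may emit for a boolean logic reduction
    // (assumed IR, illustrative only): when neither the running value nor the
    // new result is known to be non-poison, the running value is frozen
    // before being combined, e.g.
    //   %fr     = freeze i1 %vectorized.tree
    //   %op.rdx = select i1 %fr, i1 %res, i1 false   ; logical and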
22318 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
22319 ReductionOps.front().size());
22320 for (ReductionOpsType &RdxOps : ReductionOps)
22321 for (Value *RdxOp : RdxOps) {
22322 if (!RdxOp)
22323 continue;
22324 IgnoreList.insert(V: RdxOp);
22325 }
22326 // Intersect the fast-math-flags from all reduction operations.
22327 FastMathFlags RdxFMF;
22328 RdxFMF.set();
22329 for (Value *U : IgnoreList)
22330 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: U))
22331 RdxFMF &= FPMO->getFastMathFlags();
22332 bool IsCmpSelMinMax = isCmpSelMinMax(I: cast<Instruction>(Val&: ReductionRoot));
22333
22334 // Need to track reduced vals, they may be changed during vectorization of
22335 // subvectors.
22336 for (ArrayRef<Value *> Candidates : ReducedVals)
22337 for (Value *V : Candidates)
22338 TrackedVals.try_emplace(Key: V, Args&: V);
22339
22340 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
22341 Value *V) -> unsigned & {
22342 auto *It = MV.find(Key: V);
22343 assert(It != MV.end() && "Unable to find given key.");
22344 return It->second;
22345 };
22346
22347 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
22348     // List of the values that were reduced in other trees as part of gather
22349     // nodes and thus require an extract if fully vectorized in other trees.
22350 SmallPtrSet<Value *, 4> RequiredExtract;
22351 WeakTrackingVH VectorizedTree = nullptr;
22352 bool CheckForReusedReductionOps = false;
22353 // Try to vectorize elements based on their type.
22354 SmallVector<InstructionsState> States;
22355 for (ArrayRef<Value *> RV : ReducedVals)
22356 States.push_back(Elt: getSameOpcode(VL: RV, TLI));
22357 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
22358 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
22359 InstructionsState S = States[I];
22360 SmallVector<Value *> Candidates;
22361 Candidates.reserve(N: 2 * OrigReducedVals.size());
22362 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
22363 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
22364 Value *RdxVal = TrackedVals.at(Val: OrigReducedVals[Cnt]);
22365         // Check whether the reduction value was overridden by an extractelement
22366         // instruction because of the vectorization, and exclude it if it is not
22367         // compatible with the other values.
22368         // Also check if the instruction was folded to a constant/other value.
22369 auto *Inst = dyn_cast<Instruction>(Val: RdxVal);
22370 if ((Inst && isVectorLikeInstWithConstOps(V: Inst) &&
22371 (!S || !S.getMatchingMainOpOrAltOp(I: Inst))) ||
22372 (S && !Inst))
22373 continue;
22374 Candidates.push_back(Elt: RdxVal);
22375 TrackedToOrig.try_emplace(Key: RdxVal, Args: OrigReducedVals[Cnt]);
22376 }
22377 bool ShuffledExtracts = false;
22378 // Try to handle shuffled extractelements.
22379 if (S && S.getOpcode() == Instruction::ExtractElement &&
22380 !S.isAltShuffle() && I + 1 < E) {
22381 SmallVector<Value *> CommonCandidates(Candidates);
22382 for (Value *RV : ReducedVals[I + 1]) {
22383 Value *RdxVal = TrackedVals.at(Val: RV);
22384           // Check whether the reduction value was overridden by the
22385           // extractelement instruction because of the vectorization, and
22386           // exclude it if it is not compatible with other values.
22387 auto *Inst = dyn_cast<ExtractElementInst>(Val: RdxVal);
22388 if (!Inst)
22389 continue;
22390 CommonCandidates.push_back(Elt: RdxVal);
22391 TrackedToOrig.try_emplace(Key: RdxVal, Args&: RV);
22392 }
22393 SmallVector<int> Mask;
22394 if (isFixedVectorShuffle(VL: CommonCandidates, Mask, AC)) {
22395 ++I;
22396 Candidates.swap(RHS&: CommonCandidates);
22397 ShuffledExtracts = true;
22398 }
22399 }
22400
22401 // Emit code for constant values.
22402 if (Candidates.size() > 1 && allConstant(VL: Candidates)) {
22403 Value *Res = Candidates.front();
22404 Value *OrigV = TrackedToOrig.at(Val: Candidates.front());
22405 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
22406 for (Value *VC : ArrayRef(Candidates).drop_front()) {
22407 Res = createOp(Builder, RdxKind, LHS: Res, RHS: VC, Name: "const.rdx", ReductionOps);
22408 Value *OrigV = TrackedToOrig.at(Val: VC);
22409 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
22410 if (auto *ResI = dyn_cast<Instruction>(Val: Res))
22411 V.analyzedReductionRoot(I: ResI);
22412 }
22413 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
22414 continue;
22415 }
22416
22417 unsigned NumReducedVals = Candidates.size();
22418 if (NumReducedVals < ReductionLimit &&
22419 (NumReducedVals < 2 || !isSplat(VL: Candidates)))
22420 continue;
22421
22422 // Check if we support repeated scalar values processing (optimization of
22423 // original scalar identity operations on matched horizontal reductions).
22424 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
22425 RdxKind != RecurKind::FMul &&
22426 RdxKind != RecurKind::FMulAdd;
22427 // Gather same values.
22428 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
22429 if (IsSupportedHorRdxIdentityOp)
22430 for (Value *V : Candidates) {
22431 Value *OrigV = TrackedToOrig.at(Val: V);
22432 ++SameValuesCounter.try_emplace(Key: OrigV).first->second;
22433 }
22434       // Used to check whether the reduced values are used the same number of
22435       // times. In that case the compiler may produce better code. E.g. if the
22436       // reduced values are aabbccdd (8 values), then the first node of the tree
22437       // will be a node for 4 x abcd plus a shuffle <4 x abcd>,
22438       // <0, 0, 1, 1, 2, 2, 3, 3>, and the final reduction will be performed on
22439       // <8 x aabbccdd>. Instead, the compiler may build the <4 x abcd> tree
22440       // immediately and emit reduction(4 x abcd) * 2.
22441       // Currently this only handles add/fadd/xor; and/or/min/max do not require
22442       // this analysis, while other operations may require an extra estimation
22443       // of the profitability.
22444 bool SameScaleFactor = false;
22445 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
22446 SameValuesCounter.size() != Candidates.size();
22447 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
22448 if (OptReusedScalars) {
22449 SameScaleFactor =
22450 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
22451 RdxKind == RecurKind::Xor) &&
22452 all_of(Range: drop_begin(RangeOrContainer&: SameValuesCounter),
22453 P: [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
22454 return P.second == SameValuesCounter.front().second;
22455 });
22456 Candidates.resize(N: SameValuesCounter.size());
22457 transform(Range&: SameValuesCounter, d_first: Candidates.begin(),
22458 F: [&](const auto &P) { return TrackedVals.at(Val: P.first); });
22459 NumReducedVals = Candidates.size();
22460 // Have a reduction of the same element.
22461 if (NumReducedVals == 1) {
22462 Value *OrigV = TrackedToOrig.at(Val: Candidates.front());
22463 unsigned Cnt = At(SameValuesCounter, OrigV);
22464 Value *RedVal =
22465 emitScaleForReusedOps(VectorizedValue: Candidates.front(), Builder, Cnt);
22466 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
22467 VectorizedVals.try_emplace(Key: OrigV, Args&: Cnt);
22468 ExternallyUsedValues.insert(V: OrigV);
22469 continue;
22470 }
22471 }
22472
22473 unsigned MaxVecRegSize = V.getMaxVecRegSize();
22474 unsigned EltSize = V.getVectorElementSize(V: Candidates[0]);
22475 const unsigned MaxElts = std::clamp<unsigned>(
22476 val: llvm::bit_floor(Value: MaxVecRegSize / EltSize), lo: RedValsMaxNumber,
22477 hi: RegMaxNumber * RedValsMaxNumber);
22478
22479 unsigned ReduxWidth = NumReducedVals;
22480 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
22481 unsigned NumParts, NumRegs;
22482 Type *ScalarTy = Candidates.front()->getType();
22483 ReduxWidth =
22484 getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: ReduxWidth);
22485 VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth);
22486 NumParts = ::getNumberOfParts(TTI, VecTy: Tp);
22487 NumRegs =
22488 TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp));
22489 while (NumParts > NumRegs) {
22490 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
22491 ReduxWidth = bit_floor(Value: ReduxWidth - 1);
22492 VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth);
22493 NumParts = ::getNumberOfParts(TTI, VecTy: Tp);
22494 NumRegs =
22495 TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp));
22496 }
22497 if (NumParts > NumRegs / 2)
22498 ReduxWidth = bit_floor(Value: ReduxWidth);
22499 return ReduxWidth;
22500 };
22501 if (!VectorizeNonPowerOf2 || !has_single_bit(Value: ReduxWidth + 1))
22502 ReduxWidth = GetVectorFactor(ReduxWidth);
22503 ReduxWidth = std::min(a: ReduxWidth, b: MaxElts);
22504
22505 unsigned Start = 0;
22506 unsigned Pos = Start;
22507 // Restarts vectorization attempt with lower vector factor.
22508 unsigned PrevReduxWidth = ReduxWidth;
22509 bool CheckForReusedReductionOpsLocal = false;
22510 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
22511 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(Vals: IgnoreList);
22512 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
22513           // Check if any of the reduction ops are gathered. If so, it is worth
22514           // trying again with a smaller number of reduction ops.
22515 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
22516 }
22517 ++Pos;
22518 if (Pos < NumReducedVals - ReduxWidth + 1)
22519 return IsAnyRedOpGathered;
22520 Pos = Start;
22521 --ReduxWidth;
22522 if (ReduxWidth > 1)
22523 ReduxWidth = GetVectorFactor(ReduxWidth);
22524 return IsAnyRedOpGathered;
22525 };
22526 bool AnyVectorized = false;
22527 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
22528 while (Pos < NumReducedVals - ReduxWidth + 1 &&
22529 ReduxWidth >= ReductionLimit) {
22530 // Dependency in tree of the reduction ops - drop this attempt, try
22531 // later.
22532 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
22533 Start == 0) {
22534 CheckForReusedReductionOps = true;
22535 break;
22536 }
22537 PrevReduxWidth = ReduxWidth;
22538 ArrayRef<Value *> VL(std::next(x: Candidates.begin(), n: Pos), ReduxWidth);
22539 // Been analyzed already - skip.
22540 if (IgnoredCandidates.contains(V: std::make_pair(x&: Pos, y&: ReduxWidth)) ||
22541 (!has_single_bit(Value: ReduxWidth) &&
22542 (IgnoredCandidates.contains(
22543 V: std::make_pair(x&: Pos, y: bit_floor(Value: ReduxWidth))) ||
22544 IgnoredCandidates.contains(
22545 V: std::make_pair(x: Pos + (ReduxWidth - bit_floor(Value: ReduxWidth)),
22546 y: bit_floor(Value: ReduxWidth))))) ||
22547 V.areAnalyzedReductionVals(VL)) {
22548 (void)AdjustReducedVals(/*IgnoreVL=*/true);
22549 continue;
22550 }
22551 // Early exit if any of the reduction values were deleted during
22552 // previous vectorization attempts.
22553 if (any_of(Range&: VL, P: [&V](Value *RedVal) {
22554 auto *RedValI = dyn_cast<Instruction>(Val: RedVal);
22555 if (!RedValI)
22556 return false;
22557 return V.isDeleted(I: RedValI);
22558 }))
22559 break;
22560 V.buildTree(Roots: VL, UserIgnoreLst: IgnoreList);
22561 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
22562 if (!AdjustReducedVals())
22563 V.analyzedReductionVals(VL);
22564 continue;
22565 }
22566 if (V.isLoadCombineReductionCandidate(RdxKind)) {
22567 if (!AdjustReducedVals())
22568 V.analyzedReductionVals(VL);
22569 continue;
22570 }
22571 V.reorderTopToBottom();
22572 // No need to reorder the root node at all.
22573 V.reorderBottomToTop(/*IgnoreReorder=*/true);
22574         // Keep other extracted reduction values, if they are used in the
22575         // vectorization trees.
22576 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
22577 ExternallyUsedValues);
22578 // The reduction root is used as the insertion point for new
22579 // instructions, so set it as externally used to prevent it from being
22580 // deleted.
22581 LocalExternallyUsedValues.insert(V: ReductionRoot);
22582 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
22583 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
22584 continue;
22585 for (Value *V : ReducedVals[Cnt])
22586 if (isa<Instruction>(Val: V))
22587 LocalExternallyUsedValues.insert(V: TrackedVals[V]);
22588 }
22589 if (!IsSupportedHorRdxIdentityOp) {
22590 // Number of uses of the candidates in the vector of values.
22591 assert(SameValuesCounter.empty() &&
22592 "Reused values counter map is not empty");
22593 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
22594 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
22595 continue;
22596 Value *V = Candidates[Cnt];
22597 Value *OrigV = TrackedToOrig.at(Val: V);
22598 ++SameValuesCounter.try_emplace(Key: OrigV).first->second;
22599 }
22600 }
22601 V.transformNodes();
22602 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
22603 // Gather externally used values.
22604 SmallPtrSet<Value *, 4> Visited;
22605 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
22606 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
22607 continue;
22608 Value *RdxVal = Candidates[Cnt];
22609 if (auto It = TrackedVals.find(Val: RdxVal); It != TrackedVals.end())
22610 RdxVal = It->second;
22611 if (!Visited.insert(Ptr: RdxVal).second)
22612 continue;
22613 // Check if the scalar was vectorized as part of the vectorization
22614 // tree but not the top node.
22615 if (!VLScalars.contains(Ptr: RdxVal) && V.isVectorized(V: RdxVal)) {
22616 LocalExternallyUsedValues.insert(V: RdxVal);
22617 continue;
22618 }
22619 Value *OrigV = TrackedToOrig.at(Val: RdxVal);
22620 unsigned NumOps =
22621 VectorizedVals.lookup(Val: OrigV) + At(SameValuesCounter, OrigV);
22622 if (NumOps != ReducedValsToOps.at(Val: OrigV).size())
22623 LocalExternallyUsedValues.insert(V: RdxVal);
22624 }
22625 // Do not need the list of reused scalars in regular mode anymore.
22626 if (!IsSupportedHorRdxIdentityOp)
22627 SameValuesCounter.clear();
22628 for (Value *RdxVal : VL)
22629 if (RequiredExtract.contains(Ptr: RdxVal))
22630 LocalExternallyUsedValues.insert(V: RdxVal);
22631 V.buildExternalUses(ExternallyUsedValues: LocalExternallyUsedValues);
22632
22633 V.computeMinimumValueSizes();
22634
22635 // Estimate cost.
22636 InstructionCost ReductionCost =
22637 getReductionCost(TTI, ReducedVals: VL, IsCmpSelMinMax, FMF: RdxFMF, R: V);
22638 InstructionCost Cost = V.getTreeCost(VectorizedVals: VL, ReductionCost);
22639 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
22640 << " for reduction\n");
22641 if (!Cost.isValid())
22642 break;
22643 if (Cost >= -SLPCostThreshold) {
22644 V.getORE()->emit(RemarkBuilder: [&]() {
22645 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
22646 ReducedValsToOps.at(Val: VL[0]).front())
22647 << "Vectorizing horizontal reduction is possible "
22648 << "but not beneficial with cost " << ore::NV("Cost", Cost)
22649 << " and threshold "
22650 << ore::NV("Threshold", -SLPCostThreshold);
22651 });
22652 if (!AdjustReducedVals()) {
22653 V.analyzedReductionVals(VL);
22654 unsigned Offset = Pos == Start ? Pos : Pos - 1;
22655 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
22656 // Add subvectors of VL to the list of the analyzed values.
22657 for (unsigned VF = getFloorFullVectorNumberOfElements(
22658 TTI: *TTI, Ty: VL.front()->getType(), Sz: ReduxWidth - 1);
22659 VF >= ReductionLimit;
22660 VF = getFloorFullVectorNumberOfElements(
22661 TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) {
22662 if (has_single_bit(Value: VF) &&
22663 V.getCanonicalGraphSize() != V.getTreeSize())
22664 continue;
22665 for (unsigned Idx : seq<unsigned>(Size: ReduxWidth - VF))
22666 IgnoredCandidates.insert(V: std::make_pair(x: Offset + Idx, y&: VF));
22667 }
22668 }
22669 }
22670 continue;
22671 }
22672
22673 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
22674 << Cost << ". (HorRdx)\n");
22675 V.getORE()->emit(RemarkBuilder: [&]() {
22676 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
22677 ReducedValsToOps.at(Val: VL[0]).front())
22678 << "Vectorized horizontal reduction with cost "
22679 << ore::NV("Cost", Cost) << " and with tree size "
22680 << ore::NV("TreeSize", V.getTreeSize());
22681 });
22682
22683 Builder.setFastMathFlags(RdxFMF);
22684
22685 // Emit a reduction. If the root is a select (min/max idiom), the insert
22686 // point is the compare condition of that select.
22687 Instruction *RdxRootInst = cast<Instruction>(Val&: ReductionRoot);
22688 Instruction *InsertPt = RdxRootInst;
22689 if (IsCmpSelMinMax)
22690 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
22691
22692 // Vectorize a tree.
22693 Value *VectorizedRoot = V.vectorizeTree(
22694 ExternallyUsedValues: LocalExternallyUsedValues, ReductionRoot: InsertPt, VectorValuesAndScales);
22695 // Update TrackedToOrig mapping, since the tracked values might be
22696 // updated.
22697 for (Value *RdxVal : Candidates) {
22698 Value *OrigVal = TrackedToOrig.at(Val: RdxVal);
22699 Value *TransformedRdxVal = TrackedVals.at(Val: OrigVal);
22700 if (TransformedRdxVal != RdxVal)
22701 TrackedToOrig.try_emplace(Key: TransformedRdxVal, Args&: OrigVal);
22702 }
22703
22704 Builder.SetInsertPoint(InsertPt);
22705
22706 // To prevent poison from leaking across what used to be sequential,
22707 // safe, scalar boolean logic operations, the reduction operand must be
22708 // frozen.
22709 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(V: VectorizedRoot, AC))
22710 VectorizedRoot = Builder.CreateFreeze(V: VectorizedRoot);
22711
22712 // Emit code to correctly handle reused reduced values, if required.
22713 if (OptReusedScalars && !SameScaleFactor) {
22714 VectorizedRoot = emitReusedOps(VectorizedValue: VectorizedRoot, Builder, R&: V,
22715 SameValuesCounter, TrackedToOrig);
22716 }
22717
22718 Type *ScalarTy = VL.front()->getType();
22719 Type *VecTy = VectorizedRoot->getType();
22720 Type *RedScalarTy = VecTy->getScalarType();
22721 VectorValuesAndScales.emplace_back(
22722 Args&: VectorizedRoot,
22723 Args: OptReusedScalars && SameScaleFactor
22724 ? SameValuesCounter.front().second
22725 : 1,
22726 Args: RedScalarTy != ScalarTy->getScalarType()
22727 ? V.isSignedMinBitwidthRootNode()
22728 : true);
22729
22730 // Count vectorized reduced values to exclude them from final reduction.
22731 for (Value *RdxVal : VL) {
22732 Value *OrigV = TrackedToOrig.at(Val: RdxVal);
22733 if (IsSupportedHorRdxIdentityOp) {
22734 VectorizedVals.try_emplace(Key: OrigV, Args&: At(SameValuesCounter, OrigV));
22735 continue;
22736 }
22737 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
22738 if (!V.isVectorized(V: RdxVal))
22739 RequiredExtract.insert(Ptr: RdxVal);
22740 }
22741 Pos += ReduxWidth;
22742 Start = Pos;
22743 ReduxWidth = NumReducedVals - Pos;
22744 if (ReduxWidth > 1)
22745 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
22746 AnyVectorized = true;
22747 }
22748 if (OptReusedScalars && !AnyVectorized) {
22749 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
22750 Value *RdxVal = TrackedVals.at(Val: P.first);
22751 Value *RedVal = emitScaleForReusedOps(VectorizedValue: RdxVal, Builder, Cnt: P.second);
22752 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
22753 VectorizedVals.try_emplace(Key: P.first, Args: P.second);
22754 }
22755 continue;
22756 }
22757 }
22758 if (!VectorValuesAndScales.empty())
22759 VectorizedTree = GetNewVectorizedTree(
22760 VectorizedTree,
22761 emitReduction(Builder, TTI: *TTI, DestTy: ReductionRoot->getType()));
22762 if (VectorizedTree) {
22763 // Reorder operands of bool logical op in the natural order to avoid
22764 // possible problem with poison propagation. If not possible to reorder
22765 // (both operands are originally RHS), emit an extra freeze instruction
22766 // for the LHS operand.
22767 // I.e., if we have original code like this:
22768 // RedOp1 = select i1 ?, i1 LHS, i1 false
22769 // RedOp2 = select i1 RHS, i1 ?, i1 false
22770
22771 // Then, we swap LHS/RHS to create a new op that matches the poison
22772 // semantics of the original code.
22773
22774 // If we have original code like this and both values could be poison:
22775 // RedOp1 = select i1 ?, i1 LHS, i1 false
22776 // RedOp2 = select i1 ?, i1 RHS, i1 false
22777
22778 // Then, we must freeze LHS in the new op.
22779 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
22780 Instruction *RedOp1,
22781 Instruction *RedOp2,
22782 bool InitStep) {
22783 if (!AnyBoolLogicOp)
22784 return;
22785 if (isBoolLogicOp(I: RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
22786 getRdxOperand(I: RedOp1, Index: 0) == LHS ||
22787 isGuaranteedNotToBePoison(V: LHS, AC)))
22788 return;
22789 if (isBoolLogicOp(I: RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
22790 getRdxOperand(I: RedOp2, Index: 0) == RHS ||
22791 isGuaranteedNotToBePoison(V: RHS, AC))) {
22792 std::swap(a&: LHS, b&: RHS);
22793 return;
22794 }
22795 if (LHS != VectorizedTree)
22796 LHS = Builder.CreateFreeze(V: LHS);
22797 };
22798 // Finish the reduction.
22799 // Need to add extra arguments and not vectorized possible reduction
22800 // values.
22801 // Try to avoid dependencies between the scalar remainders after
22802 // reductions.
22803 auto FinalGen =
22804 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
22805 bool InitStep) {
22806 unsigned Sz = InstVals.size();
22807 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
22808 Sz % 2);
22809 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
22810 Instruction *RedOp = InstVals[I + 1].first;
22811 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
22812 Value *RdxVal1 = InstVals[I].second;
22813 Value *StableRdxVal1 = RdxVal1;
22814 auto It1 = TrackedVals.find(Val: RdxVal1);
22815 if (It1 != TrackedVals.end())
22816 StableRdxVal1 = It1->second;
22817 Value *RdxVal2 = InstVals[I + 1].second;
22818 Value *StableRdxVal2 = RdxVal2;
22819 auto It2 = TrackedVals.find(Val: RdxVal2);
22820 if (It2 != TrackedVals.end())
22821 StableRdxVal2 = It2->second;
22822 // To prevent poison from leaking across what used to be
22823 // sequential, safe, scalar boolean logic operations, the
22824 // reduction operand must be frozen.
22825 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
22826 RedOp, InitStep);
22827 Value *ExtraRed = createOp(Builder, RdxKind, LHS: StableRdxVal1,
22828 RHS: StableRdxVal2, Name: "op.rdx", ReductionOps);
22829 ExtraReds[I / 2] = std::make_pair(x: InstVals[I].first, y&: ExtraRed);
22830 }
22831 if (Sz % 2 == 1)
22832 ExtraReds[Sz / 2] = InstVals.back();
22833 return ExtraReds;
22834 };
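      // For example (purely illustrative): with five remaining (instruction,
      // value) pairs [P0, P1, P2, P3, P4] a single FinalGen pass produces
      // three entries: op.rdx(P0, P1), op.rdx(P2, P3), and P4 carried over
      // unchanged; the loop below repeats this until a single value is left.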
22835 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
22836 ExtraReductions.emplace_back(Args: cast<Instruction>(Val&: ReductionRoot),
22837 Args&: VectorizedTree);
22838 SmallPtrSet<Value *, 8> Visited;
22839 for (ArrayRef<Value *> Candidates : ReducedVals) {
22840 for (Value *RdxVal : Candidates) {
22841 if (!Visited.insert(Ptr: RdxVal).second)
22842 continue;
22843 unsigned NumOps = VectorizedVals.lookup(Val: RdxVal);
22844 for (Instruction *RedOp :
22845 ArrayRef(ReducedValsToOps.at(Val: RdxVal)).drop_back(N: NumOps))
22846 ExtraReductions.emplace_back(Args&: RedOp, Args&: RdxVal);
22847 }
22848 }
22849 // Iterate through all not-vectorized reduction values/extra arguments.
22850 bool InitStep = true;
22851 while (ExtraReductions.size() > 1) {
22852 SmallVector<std::pair<Instruction *, Value *>> NewReds =
22853 FinalGen(ExtraReductions, InitStep);
22854 ExtraReductions.swap(RHS&: NewReds);
22855 InitStep = false;
22856 }
22857 VectorizedTree = ExtraReductions.front().second;
22858
22859 ReductionRoot->replaceAllUsesWith(V: VectorizedTree);
22860
22861       // The original scalar reduction is expected to have no remaining
22862       // uses outside the reduction tree itself. Assert that we got this
22863       // correct, replace internal uses with poison, and mark for eventual
22864       // deletion.
22865#ifndef NDEBUG
22866 SmallSet<Value *, 4> IgnoreSet;
22867 for (ArrayRef<Value *> RdxOps : ReductionOps)
22868 IgnoreSet.insert_range(RdxOps);
22869#endif
22870 for (ArrayRef<Value *> RdxOps : ReductionOps) {
22871 for (Value *Ignore : RdxOps) {
22872 if (!Ignore)
22873 continue;
22874#ifndef NDEBUG
22875 for (auto *U : Ignore->users()) {
22876 assert(IgnoreSet.count(U) &&
22877                    "All users must be in the reduction ops list.");
22878 }
22879#endif
22880 if (!Ignore->use_empty()) {
22881 Value *P = PoisonValue::get(T: Ignore->getType());
22882 Ignore->replaceAllUsesWith(V: P);
22883 }
22884 }
22885 V.removeInstructionsAndOperands(DeadVals: RdxOps, VectorValuesAndScales);
22886 }
22887 } else if (!CheckForReusedReductionOps) {
22888 for (ReductionOpsType &RdxOps : ReductionOps)
22889 for (Value *RdxOp : RdxOps)
22890 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
22891 }
22892 return VectorizedTree;
22893 }
22894
22895private:
22896 /// Creates the reduction from the given \p Vec vector value with the given
22897 /// scale \p Scale and signedness \p IsSigned.
22898 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
22899 Value *Vec, unsigned Scale, bool IsSigned,
22900 Type *DestTy) {
22901 Value *Rdx;
22902 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DestTy)) {
22903 unsigned DestTyNumElements = getNumElements(Ty: VecTy);
22904 unsigned VF = getNumElements(Ty: Vec->getType()) / DestTyNumElements;
22905 Rdx = PoisonValue::get(
22906 T: getWidenedType(ScalarTy: Vec->getType()->getScalarType(), VF: DestTyNumElements));
22907 for (unsigned I : seq<unsigned>(Size: DestTyNumElements)) {
22908 // Do reduction for each lane.
22909 // e.g., do reduce add for
22910 // VL[0] = <4 x Ty> <a, b, c, d>
22911 // VL[1] = <4 x Ty> <e, f, g, h>
22912 // Lane[0] = <2 x Ty> <a, e>
22913 // Lane[1] = <2 x Ty> <b, f>
22914 // Lane[2] = <2 x Ty> <c, g>
22915 // Lane[3] = <2 x Ty> <d, h>
22916 // result[0] = reduce add Lane[0]
22917 // result[1] = reduce add Lane[1]
22918 // result[2] = reduce add Lane[2]
22919 // result[3] = reduce add Lane[3]
22920 SmallVector<int, 16> Mask = createStrideMask(Start: I, Stride: DestTyNumElements, VF);
22921 Value *Lane = Builder.CreateShuffleVector(V: Vec, Mask);
22922 Rdx = Builder.CreateInsertElement(
22923 Vec: Rdx, NewElt: emitReduction(VectorizedValue: Lane, Builder, TTI: &TTI, DestTy), Idx: I);
22924 }
22925 } else {
22926 Rdx = emitReduction(VectorizedValue: Vec, Builder, TTI: &TTI, DestTy);
22927 }
22928 if (Rdx->getType() != DestTy)
22929 Rdx = Builder.CreateIntCast(V: Rdx, DestTy, isSigned: IsSigned);
22930 // Improved analysis for add/fadd/xor reductions with same scale
22931 // factor for all operands of reductions. We can emit scalar ops for
22932 // them instead.
22933 if (Scale > 1)
22934 Rdx = emitScaleForReusedOps(VectorizedValue: Rdx, Builder, Cnt: Scale);
22935 return Rdx;
22936 }
22937
22938 /// Calculate the cost of a reduction.
22939 InstructionCost getReductionCost(TargetTransformInfo *TTI,
22940 ArrayRef<Value *> ReducedVals,
22941 bool IsCmpSelMinMax, FastMathFlags FMF,
22942 const BoUpSLP &R) {
22943 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
22944 Type *ScalarTy = ReducedVals.front()->getType();
22945 unsigned ReduxWidth = ReducedVals.size();
22946 FixedVectorType *VectorTy = R.getReductionType();
22947 InstructionCost VectorCost = 0, ScalarCost;
22948     // If all of the reduced values are constant, the vector cost is 0, since
22949     // the reduction value can be calculated at compile time.
22950 bool AllConsts = allConstant(VL: ReducedVals);
22951 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
22952 InstructionCost Cost = 0;
22953 // Scalar cost is repeated for N-1 elements.
22954 int Cnt = ReducedVals.size();
22955 for (Value *RdxVal : ReducedVals) {
22956 if (Cnt == 1)
22957 break;
22958 --Cnt;
22959 if (RdxVal->hasNUsesOrMore(N: IsCmpSelMinMax ? 3 : 2)) {
22960 Cost += GenCostFn();
22961 continue;
22962 }
22963 InstructionCost ScalarCost = 0;
22964 for (User *U : RdxVal->users()) {
22965 auto *RdxOp = cast<Instruction>(Val: U);
22966 if (hasRequiredNumberOfUses(IsCmpSelMinMax, I: RdxOp)) {
22967 ScalarCost += TTI->getInstructionCost(U: RdxOp, CostKind);
22968 continue;
22969 }
22970 ScalarCost = InstructionCost::getInvalid();
22971 break;
22972 }
22973 if (ScalarCost.isValid())
22974 Cost += ScalarCost;
22975 else
22976 Cost += GenCostFn();
22977 }
22978 return Cost;
22979 };
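    // Worked example with made-up costs: for four reduced values, each with a
    // single reduction-op use and a scalar op cost of 1, the lambda above
    // counts only N - 1 = 3 scalar ops (the last value is skipped), so
    // ScalarCost = 3; if the vector reduction cost were 2, this function
    // would return 2 - 3 = -1, i.e. the vector form looks profitable.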
22980     // Require the reduction cost if:
22981     // 1. This type is not a full register type and there is no other vector
22982     // with the same type in the storage (first vector with a small type).
22983     // 2. The storage does not have any vector with full vector use (first
22984     // vector with full register use).
22985 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
22986 switch (RdxKind) {
22987 case RecurKind::Add:
22988 case RecurKind::Mul:
22989 case RecurKind::Or:
22990 case RecurKind::And:
22991 case RecurKind::Xor:
22992 case RecurKind::FAdd:
22993 case RecurKind::FMul: {
22994 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
22995 if (!AllConsts) {
22996 if (DoesRequireReductionOp) {
22997 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
22998 assert(SLPReVec && "FixedVectorType is not expected.");
22999 unsigned ScalarTyNumElements = VecTy->getNumElements();
23000 for (unsigned I : seq<unsigned>(Size: ReducedVals.size())) {
23001 VectorCost += TTI->getShuffleCost(
23002 Kind: TTI::SK_PermuteSingleSrc,
23003 DstTy: FixedVectorType::get(ElementType: VecTy->getScalarType(),
23004 NumElts: ReducedVals.size()),
23005 SrcTy: VectorTy,
23006 Mask: createStrideMask(Start: I, Stride: ScalarTyNumElements, VF: ReducedVals.size()));
23007 VectorCost += TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VecTy,
23008 FMF, CostKind);
23009 }
23010 VectorCost += TTI->getScalarizationOverhead(
23011 Ty: VecTy, DemandedElts: APInt::getAllOnes(numBits: ScalarTyNumElements), /*Insert*/ true,
23012 /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
23013 } else {
23014 Type *RedTy = VectorTy->getElementType();
23015 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
23016 u: std::make_pair(x&: RedTy, y: true));
23017 if (RType == RedTy) {
23018 VectorCost = TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VectorTy,
23019 FMF, CostKind);
23020 } else {
23021 VectorCost = TTI->getExtendedReductionCost(
23022 Opcode: RdxOpcode, IsUnsigned: !IsSigned, ResTy: RedTy,
23023 Ty: getWidenedType(ScalarTy: RType, VF: ReduxWidth), FMF, CostKind);
23024 }
23025 }
23026 } else {
23027 Type *RedTy = VectorTy->getElementType();
23028 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
23029 u: std::make_pair(x&: RedTy, y: true));
23030 VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth);
23031 VectorCost +=
23032 TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: RVecTy, CostKind);
23033 if (RType != RedTy) {
23034 unsigned Opcode = Instruction::Trunc;
23035 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
23036 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
23037 VectorCost += TTI->getCastInstrCost(
23038 Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind);
23039 }
23040 }
23041 }
23042 ScalarCost = EvaluateScalarCost([&]() {
23043 return TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: ScalarTy, CostKind);
23044 });
23045 break;
23046 }
23047 case RecurKind::FMax:
23048 case RecurKind::FMin:
23049 case RecurKind::FMaximum:
23050 case RecurKind::FMinimum:
23051 case RecurKind::SMax:
23052 case RecurKind::SMin:
23053 case RecurKind::UMax:
23054 case RecurKind::UMin: {
23055 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
23056 if (!AllConsts) {
23057 if (DoesRequireReductionOp) {
23058 VectorCost = TTI->getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF, CostKind);
23059 } else {
23060         // Check if a previous reduction already exists and account for it as a
23061         // series of operations plus a single reduction.
23062 Type *RedTy = VectorTy->getElementType();
23063 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
23064 u: std::make_pair(x&: RedTy, y: true));
23065 VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth);
23066 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
23067 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
23068 if (RType != RedTy) {
23069 unsigned Opcode = Instruction::Trunc;
23070 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
23071 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
23072 VectorCost += TTI->getCastInstrCost(
23073 Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind);
23074 }
23075 }
23076 }
23077 ScalarCost = EvaluateScalarCost([&]() {
23078 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
23079 return TTI->getIntrinsicInstrCost(ICA, CostKind);
23080 });
23081 break;
23082 }
23083 default:
23084 llvm_unreachable("Expected arithmetic or min/max reduction operation");
23085 }
23086
23087 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
23088 << " for reduction of " << shortBundleName(ReducedVals)
23089 << " (It is a splitting reduction)\n");
23090 return VectorCost - ScalarCost;
23091 }
23092
23093 /// Splits the values, stored in VectorValuesAndScales, into registers/free
23094 /// sub-registers, combines them with the given reduction operation as a
23095 /// vector operation and then performs a single (small enough) reduction.
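/// For illustration only (a rough sketch of the resulting pattern, assuming an
/// integer add reduction whose values were split into two <4 x i32> parts):
///   %sum = add <4 x i32> %part0, %part1
///   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %sum)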
23096 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
23097 Type *DestTy) {
23098 Value *ReducedSubTree = nullptr;
23099 // Creates reduction and combines with the previous reduction.
23100 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
23101 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy);
23102 if (ReducedSubTree)
23103 ReducedSubTree = createOp(Builder, RdxKind, LHS: ReducedSubTree, RHS: Rdx,
23104 Name: "op.rdx", ReductionOps);
23105 else
23106 ReducedSubTree = Rdx;
23107 };
23108 if (VectorValuesAndScales.size() == 1) {
23109 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
23110 CreateSingleOp(Vec, Scale, IsSigned);
23111 return ReducedSubTree;
23112 }
23113 // Scales Vec by the given Cnt scale factor and then combines it with the
23114 // previous value of VecRes.
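// For illustration (assuming RdxKind == RecurKind::Add and Cnt == 3), a part
// <a, b, c, d> is scaled to <3*a, 3*b, 3*c, 3*d> with a single vector multiply
// before being combined.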
23115 Value *VecRes = nullptr;
23116 bool VecResSignedness = false;
23117 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
23118 Type *ScalarTy = Vec->getType()->getScalarType();
23119 // Scale Vec using given Cnt scale factor.
23120 if (Cnt > 1) {
23121 ElementCount EC = cast<VectorType>(Val: Vec->getType())->getElementCount();
23122 switch (RdxKind) {
23123 case RecurKind::Add: {
23124 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
23125 unsigned VF = getNumElements(Ty: Vec->getType());
23126 LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << " of " << Vec
23127 << ". (HorRdx)\n");
23128 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
23129 for (unsigned I : seq<unsigned>(Size: Cnt))
23130 std::iota(first: std::next(x: Mask.begin(), n: VF * I),
23131 last: std::next(x: Mask.begin(), n: VF * (I + 1)), value: 0);
23132 ++NumVectorInstructions;
23133 Vec = Builder.CreateShuffleVector(V: Vec, Mask);
23134 break;
23135 }
23136 // res = mul vv, n
23137 if (ScalarTy != DestTy->getScalarType())
23138 Vec = Builder.CreateIntCast(
23139 V: Vec, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: Vec->getType())),
23140 isSigned: IsSigned);
23141 Value *Scale = ConstantVector::getSplat(
23142 EC, Elt: ConstantInt::get(Ty: DestTy->getScalarType(), V: Cnt));
23143 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of " << Vec
23144 << ". (HorRdx)\n");
23145 ++NumVectorInstructions;
23146 Vec = Builder.CreateMul(LHS: Vec, RHS: Scale);
23147 break;
23148 }
23149 case RecurKind::Xor: {
23150 // res = n % 2 ? 0 : vv
23151 LLVM_DEBUG(dbgs()
23152 << "SLP: Xor " << Cnt << " of " << Vec << ". (HorRdx)\n");
23153 if (Cnt % 2 == 0)
23154 Vec = Constant::getNullValue(Ty: Vec->getType());
23155 break;
23156 }
23157 case RecurKind::FAdd: {
23158 // res = fmul v, n
23159 Value *Scale =
23160 ConstantVector::getSplat(EC, Elt: ConstantFP::get(Ty: ScalarTy, V: Cnt));
23161 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of " << Vec
23162 << ". (HorRdx)\n");
23163 ++NumVectorInstructions;
23164 Vec = Builder.CreateFMul(L: Vec, R: Scale);
23165 break;
23166 }
23167 case RecurKind::And:
23168 case RecurKind::Or:
23169 case RecurKind::SMax:
23170 case RecurKind::SMin:
23171 case RecurKind::UMax:
23172 case RecurKind::UMin:
23173 case RecurKind::FMax:
23174 case RecurKind::FMin:
23175 case RecurKind::FMaximum:
23176 case RecurKind::FMinimum:
23177 // res = vv
23178 break;
23179 case RecurKind::Mul:
23180 case RecurKind::FMul:
23181 case RecurKind::FMulAdd:
23182 case RecurKind::AnyOf:
23183 case RecurKind::FindFirstIVSMin:
23184 case RecurKind::FindLastIVSMax:
23185 case RecurKind::FindLastIVUMax:
23186 case RecurKind::FMaximumNum:
23187 case RecurKind::FMinimumNum:
23188 case RecurKind::None:
23189 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
23190 }
23191 }
23192 // Combine Vec with the previously accumulated VecRes.
23193 if (!VecRes) {
23194 VecRes = Vec;
23195 VecResSignedness = IsSigned;
23196 } else {
23197 ++NumVectorInstructions;
23198 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
23199 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
23200 // Handle ctpop.
23201 unsigned VecResVF = getNumElements(Ty: VecRes->getType());
23202 unsigned VecVF = getNumElements(Ty: Vec->getType());
23203 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
23204 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
23205 // Ensure that VecRes is always larger than Vec
23206 if (VecResVF < VecVF) {
23207 std::swap(a&: VecRes, b&: Vec);
23208 std::swap(a&: VecResVF, b&: VecVF);
23209 }
23210 if (VecResVF != VecVF) {
23211 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
23212 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
23213 Vec = Builder.CreateShuffleVector(V: Vec, Mask: ResizeMask);
23214 }
23215 VecRes = Builder.CreateShuffleVector(V1: VecRes, V2: Vec, Mask, Name: "rdx.op");
23216 return;
23217 }
23218 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
23219 VecRes = Builder.CreateIntCast(
23220 V: VecRes, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: VecRes->getType())),
23221 isSigned: VecResSignedness);
23222 if (ScalarTy != DestTy->getScalarType())
23223 Vec = Builder.CreateIntCast(
23224 V: Vec, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: Vec->getType())),
23225 isSigned: IsSigned);
23226 unsigned VecResVF = getNumElements(Ty: VecRes->getType());
23227 unsigned VecVF = getNumElements(Ty: Vec->getType());
23228 // Ensure that VecRes is always larger than Vec
23229 if (VecResVF < VecVF) {
23230 std::swap(a&: VecRes, b&: Vec);
23231 std::swap(a&: VecResVF, b&: VecVF);
23232 }
23233 // extract + op + insert
23234 Value *Op = VecRes;
23235 if (VecResVF != VecVF)
23236 Op = createExtractVector(Builder, Vec: VecRes, SubVecVF: VecVF, /*Index=*/0);
23237 Op = createOp(Builder, RdxKind, LHS: Op, RHS: Vec, Name: "rdx.op", ReductionOps);
23238 if (VecResVF != VecVF)
23239 Op = createInsertVector(Builder, Vec: VecRes, V: Op, /*Index=*/0);
23240 VecRes = Op;
23241 }
23242 };
23243 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
23244 CreateVecOp(Vec, Scale, IsSigned);
23245 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
23246
23247 return ReducedSubTree;
23248 }
23249
23250 /// Emit a horizontal reduction of the vectorized value.
23251 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
23252 const TargetTransformInfo *TTI, Type *DestTy) {
23253 assert(VectorizedValue && "Need to have a vectorized tree node");
23254 assert(RdxKind != RecurKind::FMulAdd &&
23255 "A call to the llvm.fmuladd intrinsic is not handled yet");
23256
23257 auto *FTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
23258 if (FTy->getScalarType() == Builder.getInt1Ty() &&
23259 RdxKind == RecurKind::Add &&
23260 DestTy->getScalarType() != FTy->getScalarType()) {
23261 // Convert vector_reduce_add(ZExt(<n x i1>)) to
23262 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
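// For example (illustrative values only), for an <8 x i1> input and an i32
// destination type, the overall pattern is:
//   %int = bitcast <8 x i1> %v to i8
//   %pop = call i8 @llvm.ctpop.i8(i8 %int)
//   %res = zext i8 %pop to i32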
23263 Value *V = Builder.CreateBitCast(
23264 V: VectorizedValue, DestTy: Builder.getIntNTy(N: FTy->getNumElements()));
23265 ++NumVectorInstructions;
23266 return Builder.CreateUnaryIntrinsic(ID: Intrinsic::ctpop, V);
23267 }
23268 ++NumVectorInstructions;
23269 return createSimpleReduction(B&: Builder, Src: VectorizedValue, RdxKind);
23270 }
23271
23272 /// Emits optimized code for a unique scalar value reused \p Cnt times.
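/// For example (illustrative), for an add reduction with \p Cnt == 3 this emits
/// a multiply of the value by 3; for a xor reduction with an even \p Cnt it
/// returns zero, since x xor x == 0.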
23273 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
23274 unsigned Cnt) {
23275 assert(IsSupportedHorRdxIdentityOp &&
23276 "The optimization of matched scalar identity horizontal reductions "
23277 "must be supported.");
23278 if (Cnt == 1)
23279 return VectorizedValue;
23280 switch (RdxKind) {
23281 case RecurKind::Add: {
23282 // res = mul vv, n
23283 Value *Scale = ConstantInt::get(Ty: VectorizedValue->getType(), V: Cnt);
23284 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
23285 << VectorizedValue << ". (HorRdx)\n");
23286 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
23287 }
23288 case RecurKind::Xor: {
23289 // res = n % 2 ? 0 : vv
23290 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
23291 << ". (HorRdx)\n");
23292 if (Cnt % 2 == 0)
23293 return Constant::getNullValue(Ty: VectorizedValue->getType());
23294 return VectorizedValue;
23295 }
23296 case RecurKind::FAdd: {
23297 // res = fmul v, n
23298 Value *Scale = ConstantFP::get(Ty: VectorizedValue->getType(), V: Cnt);
23299 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
23300 << VectorizedValue << ". (HorRdx)\n");
23301 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
23302 }
23303 case RecurKind::And:
23304 case RecurKind::Or:
23305 case RecurKind::SMax:
23306 case RecurKind::SMin:
23307 case RecurKind::UMax:
23308 case RecurKind::UMin:
23309 case RecurKind::FMax:
23310 case RecurKind::FMin:
23311 case RecurKind::FMaximum:
23312 case RecurKind::FMinimum:
23313 // res = vv
23314 return VectorizedValue;
23315 case RecurKind::Mul:
23316 case RecurKind::FMul:
23317 case RecurKind::FMulAdd:
23318 case RecurKind::AnyOf:
23319 case RecurKind::FindFirstIVSMin:
23320 case RecurKind::FindLastIVSMax:
23321 case RecurKind::FindLastIVUMax:
23322 case RecurKind::FMaximumNum:
23323 case RecurKind::FMinimumNum:
23324 case RecurKind::None:
23325 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
23326 }
23327 return nullptr;
23328 }
23329
23330 /// Emits the actual operation for the scalar identity values, found during
23331 /// horizontal reduction analysis.
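/// For example (illustrative), for an add reduction over the scalars {a, a, b}
/// the vectorized root <a, b> is multiplied by the constant vector <2, 1>.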
23332 Value *
23333 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
23334 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
23335 const DenseMap<Value *, Value *> &TrackedToOrig) {
23336 assert(IsSupportedHorRdxIdentityOp &&
23337 "The optimization of matched scalar identity horizontal reductions "
23338 "must be supported.");
23339 ArrayRef<Value *> VL = R.getRootNodeScalars();
23340 auto *VTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
23341 if (VTy->getElementType() != VL.front()->getType()) {
23342 VectorizedValue = Builder.CreateIntCast(
23343 V: VectorizedValue,
23344 DestTy: getWidenedType(ScalarTy: VL.front()->getType(), VF: VTy->getNumElements()),
23345 isSigned: R.isSignedMinBitwidthRootNode());
23346 }
23347 switch (RdxKind) {
23348 case RecurKind::Add: {
23349 // root = mul prev_root, <1, 1, n, 1>
23350 SmallVector<Constant *> Vals;
23351 for (Value *V : VL) {
23352 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
23353 Vals.push_back(Elt: ConstantInt::get(Ty: V->getType(), V: Cnt, /*IsSigned=*/false));
23354 }
23355 auto *Scale = ConstantVector::get(V: Vals);
23356 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
23357 << VectorizedValue << ". (HorRdx)\n");
23358 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
23359 }
23360 case RecurKind::And:
23361 case RecurKind::Or:
23362 // No need for multiple or/and(s).
23363 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
23364 << ". (HorRdx)\n");
23365 return VectorizedValue;
23366 case RecurKind::SMax:
23367 case RecurKind::SMin:
23368 case RecurKind::UMax:
23369 case RecurKind::UMin:
23370 case RecurKind::FMax:
23371 case RecurKind::FMin:
23372 case RecurKind::FMaximum:
23373 case RecurKind::FMinimum:
23374 // No need for multiple min/max(s) of the same value.
23375 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
23376 << ". (HorRdx)\n");
23377 return VectorizedValue;
23378 case RecurKind::Xor: {
23379 // Replace values that repeat an even number of times with 0, since
23380 // x xor x = 0.
23381 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
23382 // 7>, if the 4th and 6th elements have an even number of repeats.
23383 SmallVector<int> Mask(
23384 cast<FixedVectorType>(Val: VectorizedValue->getType())->getNumElements(),
23385 PoisonMaskElem);
23386 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
23387 bool NeedShuffle = false;
23388 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
23389 Value *V = VL[I];
23390 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
23391 if (Cnt % 2 == 0) {
23392 Mask[I] = VF;
23393 NeedShuffle = true;
23394 }
23395 }
23396 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
23397 : Mask) dbgs()
23398 << I << " ";
23399 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
23400 if (NeedShuffle)
23401 VectorizedValue = Builder.CreateShuffleVector(
23402 V1: VectorizedValue,
23403 V2: ConstantVector::getNullValue(Ty: VectorizedValue->getType()), Mask);
23404 return VectorizedValue;
23405 }
23406 case RecurKind::FAdd: {
23407 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
23408 SmallVector<Constant *> Vals;
23409 for (Value *V : VL) {
23410 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
23411 Vals.push_back(Elt: ConstantFP::get(Ty: V->getType(), V: Cnt));
23412 }
23413 auto *Scale = ConstantVector::get(V: Vals);
23414 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
23415 }
23416 case RecurKind::Mul:
23417 case RecurKind::FMul:
23418 case RecurKind::FMulAdd:
23419 case RecurKind::AnyOf:
23420 case RecurKind::FindFirstIVSMin:
23421 case RecurKind::FindLastIVSMax:
23422 case RecurKind::FindLastIVUMax:
23423 case RecurKind::FMaximumNum:
23424 case RecurKind::FMinimumNum:
23425 case RecurKind::None:
23426 llvm_unreachable("Unexpected reduction kind for reused scalars.");
23427 }
23428 return nullptr;
23429 }
23430};
23431} // end anonymous namespace
23432
23433/// Gets recurrence kind from the specified value.
23434static RecurKind getRdxKind(Value *V) {
23435 return HorizontalReduction::getRdxKind(V);
23436}
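/// \returns the total number of scalar elements in the aggregate built by
/// \p InsertInst (e.g. 4 for {<2 x float>, <2 x float>}), or std::nullopt if
/// the aggregate is not homogeneous.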
23437static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
23438 if (auto *IE = dyn_cast<InsertElementInst>(Val: InsertInst))
23439 return cast<FixedVectorType>(Val: IE->getType())->getNumElements();
23440
23441 unsigned AggregateSize = 1;
23442 auto *IV = cast<InsertValueInst>(Val: InsertInst);
23443 Type *CurrentType = IV->getType();
23444 do {
23445 if (auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
23446 for (auto *Elt : ST->elements())
23447 if (Elt != ST->getElementType(N: 0)) // check homogeneity
23448 return std::nullopt;
23449 AggregateSize *= ST->getNumElements();
23450 CurrentType = ST->getElementType(N: 0);
23451 } else if (auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
23452 AggregateSize *= AT->getNumElements();
23453 CurrentType = AT->getElementType();
23454 } else if (auto *VT = dyn_cast<FixedVectorType>(Val: CurrentType)) {
23455 AggregateSize *= VT->getNumElements();
23456 return AggregateSize;
23457 } else if (CurrentType->isSingleValueType()) {
23458 return AggregateSize;
23459 } else {
23460 return std::nullopt;
23461 }
23462 } while (true);
23463}
23464
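/// Recursively walks a chain of insertelement/insertvalue instructions ending
/// at \p LastInsertInst and records, at each flattened aggregate index, the
/// inserted scalar operand in \p BuildVectorOpds and the corresponding insert
/// instruction in \p InsertElts.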
23465static void findBuildAggregateRec(Instruction *LastInsertInst,
23466 TargetTransformInfo *TTI,
23467 SmallVectorImpl<Value *> &BuildVectorOpds,
23468 SmallVectorImpl<Value *> &InsertElts,
23469 unsigned OperandOffset, const BoUpSLP &R) {
23470 do {
23471 Value *InsertedOperand = LastInsertInst->getOperand(i: 1);
23472 std::optional<unsigned> OperandIndex =
23473 getElementIndex(Inst: LastInsertInst, Offset: OperandOffset);
23474 if (!OperandIndex || R.isDeleted(I: LastInsertInst))
23475 return;
23476 if (isa<InsertElementInst, InsertValueInst>(Val: InsertedOperand)) {
23477 findBuildAggregateRec(LastInsertInst: cast<Instruction>(Val: InsertedOperand), TTI,
23478 BuildVectorOpds, InsertElts, OperandOffset: *OperandIndex, R);
23479
23480 } else {
23481 BuildVectorOpds[*OperandIndex] = InsertedOperand;
23482 InsertElts[*OperandIndex] = LastInsertInst;
23483 }
23484 LastInsertInst = dyn_cast<Instruction>(Val: LastInsertInst->getOperand(i: 0));
23485 } while (LastInsertInst != nullptr &&
23486 isa<InsertValueInst, InsertElementInst>(Val: LastInsertInst) &&
23487 LastInsertInst->hasOneUse());
23488}
23489
23490/// Recognize construction of vectors like
23491/// %ra = insertelement <4 x float> poison, float %s0, i32 0
23492/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
23493/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
23494/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
23495/// starting from the last insertelement or insertvalue instruction.
23496///
23497/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
23498/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
23499/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
23500///
23501/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
23502///
23503/// \return true if it matches.
23504static bool findBuildAggregate(Instruction *LastInsertInst,
23505 TargetTransformInfo *TTI,
23506 SmallVectorImpl<Value *> &BuildVectorOpds,
23507 SmallVectorImpl<Value *> &InsertElts,
23508 const BoUpSLP &R) {
23509
23510 assert((isa<InsertElementInst>(LastInsertInst) ||
23511 isa<InsertValueInst>(LastInsertInst)) &&
23512 "Expected insertelement or insertvalue instruction!");
23513
23514 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
23515 "Expected empty result vectors!");
23516
23517 std::optional<unsigned> AggregateSize = getAggregateSize(InsertInst: LastInsertInst);
23518 if (!AggregateSize)
23519 return false;
23520 BuildVectorOpds.resize(N: *AggregateSize);
23521 InsertElts.resize(N: *AggregateSize);
23522
23523 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, OperandOffset: 0, R);
23524 llvm::erase(C&: BuildVectorOpds, V: nullptr);
23525 llvm::erase(C&: InsertElts, V: nullptr);
23526 if (BuildVectorOpds.size() >= 2)
23527 return true;
23528
23529 return false;
23530}
23531
23532/// Try and get a reduction instruction from a phi node.
23533///
23534/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
23535/// if they come from either \p ParentBB or a containing loop latch.
23536///
23537/// \returns A candidate reduction value if possible, or \code nullptr \endcode
23538/// if not possible.
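/// For example (illustrative), for a reduction phi in a single-block loop
///   %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
///   %sum.next = add i32 %sum, %x
/// the returned candidate is the %sum.next instruction.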
23539static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
23540 BasicBlock *ParentBB, LoopInfo *LI) {
23541 // There are situations where the reduction value is not dominated by the
23542 // reduction phi. Vectorizing such cases has been reported to cause
23543 // miscompiles. See PR25787.
23544 auto DominatedReduxValue = [&](Value *R) {
23545 return isa<Instruction>(Val: R) &&
23546 DT->dominates(A: P->getParent(), B: cast<Instruction>(Val: R)->getParent());
23547 };
23548
23549 Instruction *Rdx = nullptr;
23550
23551 // Return the incoming value if it comes from the same BB as the phi node.
23552 if (P->getIncomingBlock(i: 0) == ParentBB) {
23553 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
23554 } else if (P->getIncomingBlock(i: 1) == ParentBB) {
23555 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
23556 }
23557
23558 if (Rdx && DominatedReduxValue(Rdx))
23559 return Rdx;
23560
23561 // Otherwise, check whether we have a loop latch to look at.
23562 Loop *BBL = LI->getLoopFor(BB: ParentBB);
23563 if (!BBL)
23564 return nullptr;
23565 BasicBlock *BBLatch = BBL->getLoopLatch();
23566 if (!BBLatch)
23567 return nullptr;
23568
23569 // There is a loop latch, return the incoming value if it comes from
23570 // that. This reduction pattern occasionally turns up.
23571 if (P->getIncomingBlock(i: 0) == BBLatch) {
23572 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
23573 } else if (P->getIncomingBlock(i: 1) == BBLatch) {
23574 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
23575 }
23576
23577 if (Rdx && DominatedReduxValue(Rdx))
23578 return Rdx;
23579
23580 return nullptr;
23581}
23582
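/// Matches a reduction operation rooted at \p I (a binary operator or one of
/// the supported min/max intrinsic calls) and, on success, binds its two
/// operands to \p V0 and \p V1.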
23583static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
23584 if (match(V: I, P: m_BinOp(L: m_Value(V&: V0), R: m_Value(V&: V1))))
23585 return true;
23586 if (match(V: I, P: m_FMaxNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23587 return true;
23588 if (match(V: I, P: m_FMinNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23589 return true;
23590 if (match(V: I, P: m_FMaximum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23591 return true;
23592 if (match(V: I, P: m_FMinimum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23593 return true;
23594 if (match(V: I, P: m_Intrinsic<Intrinsic::smax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23595 return true;
23596 if (match(V: I, P: m_Intrinsic<Intrinsic::smin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23597 return true;
23598 if (match(V: I, P: m_Intrinsic<Intrinsic::umax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23599 return true;
23600 if (match(V: I, P: m_Intrinsic<Intrinsic::umin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
23601 return true;
23602 return false;
23603}
23604
23605/// We could have an initial reduction that is not an add.
23606/// r *= v1 + v2 + v3 + v4
23607/// In such a case start looking for a tree rooted in the first '+'.
23608/// \returns the new root if found, which may be nullptr if not an instruction.
23609static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
23610 Instruction *Root) {
23611 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
23612 isa<IntrinsicInst>(Root)) &&
23613 "Expected binop, select, or intrinsic for reduction matching");
23614 Value *LHS =
23615 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root));
23616 Value *RHS =
23617 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root) + 1);
23618 if (LHS == Phi)
23619 return dyn_cast<Instruction>(Val: RHS);
23620 if (RHS == Phi)
23621 return dyn_cast<Instruction>(Val: LHS);
23622 return nullptr;
23623}
23624
23625/// \returns the first operand of \p I that does not match \p Phi. If the
23626/// operand is not an instruction, it returns nullptr.
23627static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
23628 Value *Op0 = nullptr;
23629 Value *Op1 = nullptr;
23630 if (!matchRdxBop(I, V0&: Op0, V1&: Op1))
23631 return nullptr;
23632 return dyn_cast<Instruction>(Val: Op0 == Phi ? Op1 : Op0);
23633}
23634
23635/// \returns true if \p I is a candidate instruction for reduction vectorization.
23636static bool isReductionCandidate(Instruction *I) {
23637 bool IsSelect = match(V: I, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value()));
23638 Value *B0 = nullptr, *B1 = nullptr;
23639 bool IsBinop = matchRdxBop(I, V0&: B0, V1&: B1);
23640 return IsBinop || IsSelect;
23641}
23642
23643bool SLPVectorizerPass::vectorizeHorReduction(
23644 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
23645 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
23646 if (!ShouldVectorizeHor)
23647 return false;
23648 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Val: Root);
23649
23650 if (Root->getParent() != BB || isa<PHINode>(Val: Root))
23651 return false;
23652
23653 // If we can find a secondary reduction root, use that instead.
23654 auto SelectRoot = [&]() {
23655 if (TryOperandsAsNewSeeds && isReductionCandidate(I: Root) &&
23656 HorizontalReduction::getRdxKind(V: Root) != RecurKind::None)
23657 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(Phi: P, Root))
23658 return NewRoot;
23659 return Root;
23660 };
23661
23662 // Start the analysis from the Root instruction. If a horizontal reduction is
23663 // found, try to vectorize it. If it is not a horizontal reduction, or
23664 // vectorization is not possible or not effective, and the currently analyzed
23665 // instruction is a binary operation, try to vectorize the operands, using
23666 // pre-order DFS traversal order. If the operands were not vectorized, repeat
23667 // the same procedure considering each operand as a possible root of the
23668 // horizontal reduction.
23669 // Interrupt the process if the Root instruction itself was vectorized or all
23670 // sub-trees no higher than RecursionMaxDepth were analyzed/vectorized.
23671 // If a horizontal reduction was not matched or vectorized, we collect
23672 // instructions for possible later attempts at vectorization.
23673 std::queue<std::pair<Instruction *, unsigned>> Stack;
23674 Stack.emplace(args: SelectRoot(), args: 0);
23675 SmallPtrSet<Value *, 8> VisitedInstrs;
23676 bool Res = false;
23677 auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
23678 if (R.isAnalyzedReductionRoot(I: Inst))
23679 return nullptr;
23680 if (!isReductionCandidate(I: Inst))
23681 return nullptr;
23682 HorizontalReduction HorRdx;
23683 if (!HorRdx.matchAssociativeReduction(R, Root: Inst, SE&: *SE, DL: *DL, TLI: *TLI))
23684 return nullptr;
23685 return HorRdx.tryToReduce(V&: R, DL: *DL, TTI, TLI: *TLI, AC);
23686 };
23687 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
23688 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
23689 FutureSeed = getNonPhiOperand(I: Root, Phi: P);
23690 if (!FutureSeed)
23691 return false;
23692 }
23693 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
23694 // analysis is done separately.
23695 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Val: FutureSeed))
23696 PostponedInsts.push_back(Elt: FutureSeed);
23697 return true;
23698 };
23699
23700 while (!Stack.empty()) {
23701 Instruction *Inst;
23702 unsigned Level;
23703 std::tie(args&: Inst, args&: Level) = Stack.front();
23704 Stack.pop();
23705 // Do not try to analyze an instruction that has already been vectorized.
23706 // This may happen when we vectorize instruction operands on a previous
23707 // iteration, while the stack was populated before that happened.
23708 if (R.isDeleted(I: Inst))
23709 continue;
23710 if (Value *VectorizedV = TryToReduce(Inst)) {
23711 Res = true;
23712 if (auto *I = dyn_cast<Instruction>(Val: VectorizedV)) {
23713 // Try to find another reduction.
23714 Stack.emplace(args&: I, args&: Level);
23715 continue;
23716 }
23717 if (R.isDeleted(I: Inst))
23718 continue;
23719 } else {
23720 // We could not vectorize `Inst` so try to use it as a future seed.
23721 if (!TryAppendToPostponedInsts(Inst)) {
23722 assert(Stack.empty() && "Expected empty stack");
23723 break;
23724 }
23725 }
23726
23727 // Try to vectorize operands.
23728 // Continue analysis for the instruction from the same basic block only to
23729 // save compile time.
23730 if (++Level < RecursionMaxDepth)
23731 for (auto *Op : Inst->operand_values())
23732 if (VisitedInstrs.insert(Ptr: Op).second)
23733 if (auto *I = dyn_cast<Instruction>(Val: Op))
23734 // Do not try to vectorize CmpInst operands, this is done
23735 // separately.
23736 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(Val: I) &&
23737 !R.isDeleted(I) && I->getParent() == BB)
23738 Stack.emplace(args&: I, args&: Level);
23739 }
23740 return Res;
23741}
23742
23743bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
23744 BasicBlock *BB, BoUpSLP &R) {
23745 SmallVector<WeakTrackingVH> PostponedInsts;
23746 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
23747 Res |= tryToVectorize(Insts: PostponedInsts, R);
23748 return Res;
23749}
23750
23751bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
23752 BoUpSLP &R) {
23753 bool Res = false;
23754 for (Value *V : Insts)
23755 if (auto *Inst = dyn_cast<Instruction>(Val: V); Inst && !R.isDeleted(I: Inst))
23756 Res |= tryToVectorize(I: Inst, R);
23757 return Res;
23758}
23759
23760bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
23761 BasicBlock *BB, BoUpSLP &R,
23762 bool MaxVFOnly) {
23763 if (!R.canMapToVector(T: IVI->getType()))
23764 return false;
23765
23766 SmallVector<Value *, 16> BuildVectorOpds;
23767 SmallVector<Value *, 16> BuildVectorInsts;
23768 if (!findBuildAggregate(LastInsertInst: IVI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts, R))
23769 return false;
23770
23771 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
23772 R.getORE()->emit(RemarkBuilder: [&]() {
23773 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
23774 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
23775 "trying reduction first.";
23776 });
23777 return false;
23778 }
23779 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
23780 // Aggregate value is unlikely to be processed in vector register.
23781 return tryToVectorizeList(VL: BuildVectorOpds, R, MaxVFOnly);
23782}
23783
23784bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
23785 BasicBlock *BB, BoUpSLP &R,
23786 bool MaxVFOnly) {
23787 SmallVector<Value *, 16> BuildVectorInsts;
23788 SmallVector<Value *, 16> BuildVectorOpds;
23789 SmallVector<int> Mask;
23790 if (!findBuildAggregate(LastInsertInst: IEI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts, R) ||
23791 (all_of(Range&: BuildVectorOpds, P: IsaPred<ExtractElementInst, UndefValue>) &&
23792 isFixedVectorShuffle(VL: BuildVectorOpds, Mask, AC)))
23793 return false;
23794
23795 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
23796 R.getORE()->emit(RemarkBuilder: [&]() {
23797 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
23798 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
23799 "trying reduction first.";
23800 });
23801 return false;
23802 }
23803 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
23804 return tryToVectorizeList(VL: BuildVectorInsts, R, MaxVFOnly);
23805}
23806
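/// Sorts \p Incoming with \p Comparator, walks runs of values that
/// \p AreCompatible considers equivalent, and hands each run to
/// \p TryToVectorizeHelper; leftover candidates of the same type are retried in
/// a final attempt, optionally allowing smaller vector factors.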
23807template <typename T>
23808static bool tryToVectorizeSequence(
23809 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
23810 function_ref<bool(T *, T *)> AreCompatible,
23811 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
23812 bool MaxVFOnly, BoUpSLP &R) {
23813 bool Changed = false;
23814 // Sort by type, parent, operands.
23815 stable_sort(Incoming, Comparator);
23816
23817 // Try to vectorize elements based on their type.
23818 SmallVector<T *> Candidates;
23819 SmallVector<T *> VL;
23820 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
23821 VL.clear()) {
23822 // Look for the next elements with the same type, parent and operand
23823 // kinds.
23824 auto *I = dyn_cast<Instruction>(*IncIt);
23825 if (!I || R.isDeleted(I)) {
23826 ++IncIt;
23827 continue;
23828 }
23829 auto *SameTypeIt = IncIt;
23830 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
23831 R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
23832 AreCompatible(*SameTypeIt, *IncIt))) {
23833 auto *I = dyn_cast<Instruction>(*SameTypeIt);
23834 ++SameTypeIt;
23835 if (I && !R.isDeleted(I))
23836 VL.push_back(cast<T>(I));
23837 }
23838
23839 // Try to vectorize them.
23840 unsigned NumElts = VL.size();
23841 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
23842 << NumElts << ")\n");
23843 // The vectorization is a 3-step attempt:
23844 // 1. Try to vectorize instructions with the same/alternate opcodes with the
23845 // size of the maximal register at first.
23846 // 2. Try to vectorize remaining instructions with the same type, if
23847 // possible. This may give better vectorization results than trying to
23848 // vectorize only instructions with the same/alternate opcodes.
23849 // 3. Make a final attempt to vectorize all instructions with the
23850 // same/alternate ops only; this may result in some extra final
23851 // vectorization.
23852 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
23853 // Success, start over because instructions might have been changed.
23854 Changed = true;
23855 VL.swap(Candidates);
23856 Candidates.clear();
23857 for (T *V : VL) {
23858 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
23859 Candidates.push_back(V);
23860 }
23861 } else {
23862 /// \returns the minimum number of elements that we will attempt to
23863 /// vectorize.
23864 auto GetMinNumElements = [&R](Value *V) {
23865 unsigned EltSize = R.getVectorElementSize(V);
23866 return std::max(a: 2U, b: R.getMaxVecRegSize() / EltSize);
23867 };
23868 if (NumElts < GetMinNumElements(*IncIt) &&
23869 (Candidates.empty() ||
23870 Candidates.front()->getType() == (*IncIt)->getType())) {
23871 for (T *V : VL) {
23872 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
23873 Candidates.push_back(V);
23874 }
23875 }
23876 }
23877 // Final attempt to vectorize instructions with the same types.
23878 if (Candidates.size() > 1 &&
23879 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
23880 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
23881 // Success, start over because instructions might have been changed.
23882 Changed = true;
23883 } else if (MaxVFOnly) {
23884 // Try to vectorize using small vectors.
23885 SmallVector<T *> VL;
23886 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
23887 VL.clear()) {
23888 auto *I = dyn_cast<Instruction>(*It);
23889 if (!I || R.isDeleted(I)) {
23890 ++It;
23891 continue;
23892 }
23893 auto *SameTypeIt = It;
23894 while (SameTypeIt != End &&
23895 (!isa<Instruction>(*SameTypeIt) ||
23896 R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
23897 AreCompatible(*SameTypeIt, *It))) {
23898 auto *I = dyn_cast<Instruction>(*SameTypeIt);
23899 ++SameTypeIt;
23900 if (I && !R.isDeleted(I))
23901 VL.push_back(cast<T>(I));
23902 }
23903 unsigned NumElts = VL.size();
23904 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
23905 /*MaxVFOnly=*/false))
23906 Changed = true;
23907 It = SameTypeIt;
23908 }
23909 }
23910 Candidates.clear();
23911 }
23912
23913 // Start over at the next instruction of a different type (or the end).
23914 IncIt = SameTypeIt;
23915 }
23916 return Changed;
23917}
23918
23919/// Compare two cmp instructions. If IsCompatibility is true, the function
23920/// returns true if the 2 cmps have the same/swapped predicates and the most
23921/// compatible corresponding operands. If IsCompatibility is false, the function
23922/// implements a strict weak ordering relation between two cmp instructions,
23923/// returning true if the first instruction is "less" than the second, i.e. its
23924/// predicate is less than the predicate of the second or the operand IDs are
23925/// less than the operand IDs of the second cmp instruction.
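/// For example (illustrative), "icmp slt %a, %b" and "icmp sgt %b, %a" have
/// swapped predicates and matching swapped operands, so they are treated as
/// compatible when IsCompatibility is true.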
23926template <bool IsCompatibility>
23927static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
23928 const DominatorTree &DT) {
23929 assert(isValidElementType(V->getType()) &&
23930 isValidElementType(V2->getType()) &&
23931 "Expected valid element types only.");
23932 if (V == V2)
23933 return IsCompatibility;
23934 auto *CI1 = cast<CmpInst>(Val: V);
23935 auto *CI2 = cast<CmpInst>(Val: V2);
23936 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() <
23937 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
23938 return !IsCompatibility;
23939 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() >
23940 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
23941 return false;
23942 if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() <
23943 CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits())
23944 return !IsCompatibility;
23945 if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() >
23946 CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits())
23947 return false;
23948 CmpInst::Predicate Pred1 = CI1->getPredicate();
23949 CmpInst::Predicate Pred2 = CI2->getPredicate();
23950 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(pred: Pred1);
23951 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(pred: Pred2);
23952 CmpInst::Predicate BasePred1 = std::min(a: Pred1, b: SwapPred1);
23953 CmpInst::Predicate BasePred2 = std::min(a: Pred2, b: SwapPred2);
23954 if (BasePred1 < BasePred2)
23955 return !IsCompatibility;
23956 if (BasePred1 > BasePred2)
23957 return false;
23958 // Compare operands.
23959 bool CI1Preds = Pred1 == BasePred1;
23960 bool CI2Preds = Pred2 == BasePred1;
23961 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
23962 auto *Op1 = CI1->getOperand(i_nocapture: CI1Preds ? I : E - I - 1);
23963 auto *Op2 = CI2->getOperand(i_nocapture: CI2Preds ? I : E - I - 1);
23964 if (Op1 == Op2)
23965 continue;
23966 if (Op1->getValueID() < Op2->getValueID())
23967 return !IsCompatibility;
23968 if (Op1->getValueID() > Op2->getValueID())
23969 return false;
23970 if (auto *I1 = dyn_cast<Instruction>(Val: Op1))
23971 if (auto *I2 = dyn_cast<Instruction>(Val: Op2)) {
23972 if (IsCompatibility) {
23973 if (I1->getParent() != I2->getParent())
23974 return false;
23975 } else {
23976 // Try to compare nodes with same parent.
23977 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(BB: I1->getParent());
23978 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(BB: I2->getParent());
23979 if (!NodeI1)
23980 return NodeI2 != nullptr;
23981 if (!NodeI2)
23982 return false;
23983 assert((NodeI1 == NodeI2) ==
23984 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
23985 "Different nodes should have different DFS numbers");
23986 if (NodeI1 != NodeI2)
23987 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
23988 }
23989 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI);
23990 if (S && (IsCompatibility || !S.isAltShuffle()))
23991 continue;
23992 if (IsCompatibility)
23993 return false;
23994 if (I1->getOpcode() != I2->getOpcode())
23995 return I1->getOpcode() < I2->getOpcode();
23996 }
23997 }
23998 return IsCompatibility;
23999}
24000
24001template <typename ItT>
24002bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
24003 BasicBlock *BB, BoUpSLP &R) {
24004 bool Changed = false;
24005 // Try to find reductions first.
24006 for (CmpInst *I : CmpInsts) {
24007 if (R.isDeleted(I))
24008 continue;
24009 for (Value *Op : I->operands())
24010 if (auto *RootOp = dyn_cast<Instruction>(Val: Op)) {
24011 Changed |= vectorizeRootInstruction(P: nullptr, Root: RootOp, BB, R);
24012 if (R.isDeleted(I))
24013 break;
24014 }
24015 }
24016 // Try to vectorize operands as vector bundles.
24017 for (CmpInst *I : CmpInsts) {
24018 if (R.isDeleted(I))
24019 continue;
24020 Changed |= tryToVectorize(I, R);
24021 }
24022 // Try to vectorize list of compares.
24023 // Sort by type, compare predicate, etc.
24024 auto CompareSorter = [&](Value *V, Value *V2) {
24025 if (V == V2)
24026 return false;
24027 return compareCmp<false>(V, V2, TLI&: *TLI, DT: *DT);
24028 };
24029
24030 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
24031 if (V1 == V2)
24032 return true;
24033 return compareCmp<true>(V: V1, V2, TLI&: *TLI, DT: *DT);
24034 };
24035
24036 SmallVector<Value *> Vals;
24037 for (Instruction *V : CmpInsts)
24038 if (!R.isDeleted(I: V) && isValidElementType(Ty: getValueType(V)))
24039 Vals.push_back(Elt: V);
24040 if (Vals.size() <= 1)
24041 return Changed;
24042 Changed |= tryToVectorizeSequence<Value>(
24043 Vals, CompareSorter, AreCompatibleCompares,
24044 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
24045 // Exclude possible reductions from other blocks.
24046 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
24047 return any_of(V->users(), [V](User *U) {
24048 auto *Select = dyn_cast<SelectInst>(Val: U);
24049 return Select &&
24050 Select->getParent() != cast<Instruction>(Val: V)->getParent();
24051 });
24052 });
24053 if (ArePossiblyReducedInOtherBlock)
24054 return false;
24055 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
24056 },
24057 /*MaxVFOnly=*/true, R);
24058 return Changed;
24059}
24060
24061bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
24062 BasicBlock *BB, BoUpSLP &R) {
24063 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
24064 "This function only accepts Insert instructions");
24065 bool OpsChanged = false;
24066 SmallVector<WeakTrackingVH> PostponedInsts;
24067 for (auto *I : reverse(C&: Instructions)) {
24068 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
24069 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
24070 continue;
24071 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
24072 OpsChanged |=
24073 vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/true);
24074 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
24075 OpsChanged |=
24076 vectorizeInsertElementInst(IEI: LastInsertElem, BB, R, /*MaxVFOnly=*/true);
24077 }
24078 // pass2 - try to vectorize reductions only
24079 if (R.isDeleted(I))
24080 continue;
24081 OpsChanged |= vectorizeHorReduction(P: nullptr, Root: I, BB, R, PostponedInsts);
24082 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
24083 continue;
24084 // pass3 - try to match and vectorize a buildvector sequence.
24085 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
24086 OpsChanged |=
24087 vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/false);
24088 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
24089 OpsChanged |= vectorizeInsertElementInst(IEI: LastInsertElem, BB, R,
24090 /*MaxVFOnly=*/false);
24091 }
24092 }
24093 // Now try to vectorize postponed instructions.
24094 OpsChanged |= tryToVectorize(Insts: PostponedInsts, R);
24095
24096 Instructions.clear();
24097 return OpsChanged;
24098}
24099
24100bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
24101 bool Changed = false;
24102 SmallVector<Value *, 4> Incoming;
24103 SmallPtrSet<Value *, 16> VisitedInstrs;
24104 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
24105 // node. This makes it easier to identify the chains that can be
24106 // vectorized in a better way.
24107 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
24108 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
24109 assert(isValidElementType(V1->getType()) &&
24110 isValidElementType(V2->getType()) &&
24111 "Expected vectorizable types only.");
24112 if (V1 == V2)
24113 return false;
24114 // It is fine to compare type IDs here, since we expect only vectorizable
24115 // types, like ints, floats and pointers; we don't care about other types.
24116 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
24117 return true;
24118 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
24119 return false;
24120 if (V1->getType()->getScalarSizeInBits() <
24121 V2->getType()->getScalarSizeInBits())
24122 return true;
24123 if (V1->getType()->getScalarSizeInBits() >
24124 V2->getType()->getScalarSizeInBits())
24125 return false;
24126 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
24127 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
24128 if (Opcodes1.size() < Opcodes2.size())
24129 return true;
24130 if (Opcodes1.size() > Opcodes2.size())
24131 return false;
24132 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
24133 {
24134 // Instructions come first.
24135 auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]);
24136 auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I]);
24137 if (I1 && I2) {
24138 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
24139 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
24140 if (!NodeI1)
24141 return NodeI2 != nullptr;
24142 if (!NodeI2)
24143 return false;
24144 assert((NodeI1 == NodeI2) ==
24145 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
24146 "Different nodes should have different DFS numbers");
24147 if (NodeI1 != NodeI2)
24148 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
24149 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
24150 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
24151 const auto *E1 = dyn_cast<ExtractElementInst>(Val: I1);
24152 const auto *E2 = dyn_cast<ExtractElementInst>(Val: I2);
24153 if (!E1 || !E2)
24154 continue;
24155
24156 // Sort on ExtractElementInsts primarily by vector operands. Prefer
24157 // program order of the vector operands.
24158 const auto *V1 = dyn_cast<Instruction>(Val: E1->getVectorOperand());
24159 const auto *V2 = dyn_cast<Instruction>(Val: E2->getVectorOperand());
24160 if (V1 != V2) {
24161 if (V1 && !V2)
24162 return true;
24163 if (!V1 && V2)
24164 return false;
24165 DomTreeNodeBase<BasicBlock> *NodeI1 =
24166 DT->getNode(BB: V1->getParent());
24167 DomTreeNodeBase<BasicBlock> *NodeI2 =
24168 DT->getNode(BB: V2->getParent());
24169 if (!NodeI1)
24170 return NodeI2 != nullptr;
24171 if (!NodeI2)
24172 return false;
24173 assert((NodeI1 == NodeI2) ==
24174 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
24175 "Different nodes should have different DFS numbers");
24176 if (NodeI1 != NodeI2)
24177 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
24178 return V1->comesBefore(Other: V2);
24179 }
24180 // If we have the same vector operand, try to sort by constant
24181 // index.
24182 std::optional<unsigned> Id1 = getExtractIndex(E: E1);
24183 std::optional<unsigned> Id2 = getExtractIndex(E: E2);
24184 // Bring constants to the top
24185 if (Id1 && !Id2)
24186 return true;
24187 if (!Id1 && Id2)
24188 return false;
24189 // First elements come first.
24190 if (Id1 && Id2)
24191 return *Id1 < *Id2;
24192
24193 continue;
24194 }
24195 if (I1->getOpcode() == I2->getOpcode())
24196 continue;
24197 return I1->getOpcode() < I2->getOpcode();
24198 }
24199 if (I1)
24200 return true;
24201 if (I2)
24202 return false;
24203 }
24204 {
24205 // Non-undef constants come next.
24206 bool C1 = isa<Constant>(Val: Opcodes1[I]) && !isa<UndefValue>(Val: Opcodes1[I]);
24207 bool C2 = isa<Constant>(Val: Opcodes2[I]) && !isa<UndefValue>(Val: Opcodes2[I]);
24208 if (C1 && C2)
24209 continue;
24210 if (C1)
24211 return true;
24212 if (C2)
24213 return false;
24214 }
24215 bool U1 = isa<UndefValue>(Val: Opcodes1[I]);
24216 bool U2 = isa<UndefValue>(Val: Opcodes2[I]);
24217 {
24218 // Non-constant non-instructions come next.
24219 if (!U1 && !U2) {
24220 auto ValID1 = Opcodes1[I]->getValueID();
24221 auto ValID2 = Opcodes2[I]->getValueID();
24222 if (ValID1 == ValID2)
24223 continue;
24224 if (ValID1 < ValID2)
24225 return true;
24226 if (ValID1 > ValID2)
24227 return false;
24228 }
24229 if (!U1)
24230 return true;
24231 if (!U2)
24232 return false;
24233 }
24234 // Undefs come last.
24235 assert(U1 && U2 && "The only thing left should be undef & undef.");
24236 }
24237 return false;
24238 };
24239 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
24240 if (V1 == V2)
24241 return true;
24242 if (V1->getType() != V2->getType())
24243 return false;
24244 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
24245 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
24246 if (Opcodes1.size() != Opcodes2.size())
24247 return false;
24248 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
24249 // Undefs are compatible with any other value.
24250 if (isa<UndefValue>(Val: Opcodes1[I]) || isa<UndefValue>(Val: Opcodes2[I]))
24251 continue;
24252 if (auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]))
24253 if (auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I])) {
24254 if (R.isDeleted(I: I1) || R.isDeleted(I: I2))
24255 return false;
24256 if (I1->getParent() != I2->getParent())
24257 return false;
24258 if (getSameOpcode(VL: {I1, I2}, TLI: *TLI))
24259 continue;
24260 return false;
24261 }
24262 if (isa<Constant>(Val: Opcodes1[I]) && isa<Constant>(Val: Opcodes2[I]))
24263 continue;
24264 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
24265 return false;
24266 }
24267 return true;
24268 };
24269
24270 bool HaveVectorizedPhiNodes = false;
24271 do {
24272 // Collect the incoming values from the PHIs.
24273 Incoming.clear();
24274 for (Instruction &I : *BB) {
24275 auto *P = dyn_cast<PHINode>(Val: &I);
24276 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
24277 break;
24278
24279 // No need to analyze deleted, vectorized and non-vectorizable
24280 // instructions.
24281 if (!VisitedInstrs.count(Ptr: P) && !R.isDeleted(I: P) &&
24282 isValidElementType(Ty: P->getType()))
24283 Incoming.push_back(Elt: P);
24284 }
24285
24286 if (Incoming.size() <= 1)
24287 break;
24288
24289 // Find the corresponding non-phi nodes for better matching when trying to
24290 // build the tree.
24291 for (Value *V : Incoming) {
24292 SmallVectorImpl<Value *> &Opcodes =
24293 PHIToOpcodes.try_emplace(Key: V).first->getSecond();
24294 if (!Opcodes.empty())
24295 continue;
24296 SmallVector<Value *, 4> Nodes(1, V);
24297 SmallPtrSet<Value *, 4> Visited;
24298 while (!Nodes.empty()) {
24299 auto *PHI = cast<PHINode>(Val: Nodes.pop_back_val());
24300 if (!Visited.insert(Ptr: PHI).second)
24301 continue;
24302 for (Value *V : PHI->incoming_values()) {
24303 if (auto *PHI1 = dyn_cast<PHINode>(Val: (V))) {
24304 Nodes.push_back(Elt: PHI1);
24305 continue;
24306 }
24307 Opcodes.emplace_back(Args&: V);
24308 }
24309 }
24310 }
24311
24312 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
24313 Incoming, Comparator: PHICompare, AreCompatible: AreCompatiblePHIs,
24314 TryToVectorizeHelper: [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
24315 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
24316 },
24317 /*MaxVFOnly=*/true, R);
24318 Changed |= HaveVectorizedPhiNodes;
24319 if (HaveVectorizedPhiNodes && any_of(Range&: PHIToOpcodes, P: [&](const auto &P) {
24320 auto *PHI = dyn_cast<PHINode>(P.first);
24321 return !PHI || R.isDeleted(I: PHI);
24322 }))
24323 PHIToOpcodes.clear();
24324 VisitedInstrs.insert_range(R&: Incoming);
24325 } while (HaveVectorizedPhiNodes);
24326
24327 VisitedInstrs.clear();
24328
24329 InstSetVector PostProcessInserts;
24330 SmallSetVector<CmpInst *, 8> PostProcessCmps;
24331 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
24332 // also vectorizes `PostProcessCmps`.
24333 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
24334 bool Changed = vectorizeInserts(Instructions&: PostProcessInserts, BB, R);
24335 if (VectorizeCmps) {
24336 Changed |= vectorizeCmpInsts(CmpInsts: reverse(C&: PostProcessCmps), BB, R);
24337 PostProcessCmps.clear();
24338 }
24339 PostProcessInserts.clear();
24340 return Changed;
24341 };
24342 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
24343 auto IsInPostProcessInstrs = [&](Instruction *I) {
24344 if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
24345 return PostProcessCmps.contains(key: Cmp);
24346 return isa<InsertElementInst, InsertValueInst>(Val: I) &&
24347 PostProcessInserts.contains(key: I);
24348 };
24349 // Returns true if `I` is an instruction without users, like a terminator, a
24350 // store, or a function call with an ignored return value. Unused instructions
24351 // are detected based on the instruction type, except for CallInst and InvokeInst.
24352 auto HasNoUsers = [](Instruction *I) {
24353 return I->use_empty() &&
24354 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(Val: I));
24355 };
24356 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
24357 // Skip instructions with scalable type. The num of elements is unknown at
24358 // compile-time for scalable type.
24359 if (isa<ScalableVectorType>(Val: It->getType()))
24360 continue;
24361
24362 // Skip instructions marked for deletion.
24363 if (R.isDeleted(I: &*It))
24364 continue;
24365 // We may go through BB multiple times so skip the one we have checked.
24366 if (!VisitedInstrs.insert(Ptr: &*It).second) {
24367 if (HasNoUsers(&*It) &&
24368 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
24369 // We would like to start over since some instructions are deleted
24370 // and the iterator may become invalid.
24371 Changed = true;
24372 It = BB->begin();
24373 E = BB->end();
24374 }
24375 continue;
24376 }
24377
24378 // Try to vectorize reductions that use PHINodes.
24379 if (PHINode *P = dyn_cast<PHINode>(Val&: It)) {
24380 // Check that the PHI is a reduction PHI.
24381 if (P->getNumIncomingValues() == 2) {
24382 // Try to match and vectorize a horizontal reduction.
24383 Instruction *Root = getReductionInstr(DT, P, ParentBB: BB, LI);
24384 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
24385 Changed = true;
24386 It = BB->begin();
24387 E = BB->end();
24388 continue;
24389 }
24390 }
24391 // Try to vectorize the incoming values of the PHI, to catch reductions
24392 // that feed into PHIs.
24393 for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
24394 // Skip if the incoming block is the current BB for now. Also, bypass
24395 // unreachable IR for efficiency and to avoid crashing.
24396 // TODO: Collect the skipped incoming values and try to vectorize them
24397 // after processing BB.
24398 if (BB == P->getIncomingBlock(i: I) ||
24399 !DT->isReachableFromEntry(A: P->getIncomingBlock(i: I)))
24400 continue;
24401
24402 // Postponed instructions should not be vectorized here, delay their
24403 // vectorization.
24404 if (auto *PI = dyn_cast<Instruction>(Val: P->getIncomingValue(i: I));
24405 PI && !IsInPostProcessInstrs(PI)) {
24406 bool Res =
24407 vectorizeRootInstruction(P: nullptr, Root: PI, BB: P->getIncomingBlock(i: I), R);
24408 Changed |= Res;
24409 if (Res && R.isDeleted(I: P)) {
24410 It = BB->begin();
24411 E = BB->end();
24412 break;
24413 }
24414 }
24415 }
24416 continue;
24417 }
24418
24419 if (HasNoUsers(&*It)) {
24420 bool OpsChanged = false;
24421 auto *SI = dyn_cast<StoreInst>(Val&: It);
24422 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
24423 if (SI) {
24424 auto *I = Stores.find(Key: getUnderlyingObject(V: SI->getPointerOperand()));
24425 // Try to vectorize chain in store, if this is the only store to the
24426 // address in the block.
24427 // TODO: This is just a temporary solution to save compile time. Need
24428 // to investigate if we can safely turn on slp-vectorize-hor-store
24429 // instead to allow lookup for reduction chains in all non-vectorized
24430 // stores (need to check side effects and compile time).
24431 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
24432 SI->getValueOperand()->hasOneUse();
24433 }
24434 if (TryToVectorizeRoot) {
24435 for (auto *V : It->operand_values()) {
24436 // Postponed instructions should not be vectorized here, delay their
24437 // vectorization.
24438 if (auto *VI = dyn_cast<Instruction>(Val: V);
24439 VI && !IsInPostProcessInstrs(VI))
24440 // Try to match and vectorize a horizontal reduction.
24441 OpsChanged |= vectorizeRootInstruction(P: nullptr, Root: VI, BB, R);
24442 }
24443 }
24444 // Start vectorization of the post-process list of instructions from the
24445 // top-tree instructions, to try to vectorize as many instructions as
24446 // possible.
24447 OpsChanged |=
24448 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
24449 if (OpsChanged) {
24450 // We would like to start over since some instructions are deleted
24451 // and the iterator may become invalid.
24452 Changed = true;
24453 It = BB->begin();
24454 E = BB->end();
24455 continue;
24456 }
24457 }
24458
24459 if (isa<InsertElementInst, InsertValueInst>(Val: It))
24460 PostProcessInserts.insert(X: &*It);
24461 else if (isa<CmpInst>(Val: It))
24462 PostProcessCmps.insert(X: cast<CmpInst>(Val: &*It));
24463 }
24464
24465 return Changed;
24466}
24467
24468bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
24469 auto Changed = false;
24470 for (auto &Entry : GEPs) {
24471 // If the getelementptr list has fewer than two elements, there's nothing
24472 // to do.
24473 if (Entry.second.size() < 2)
24474 continue;
24475
24476 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
24477 << Entry.second.size() << ".\n");
24478
24479 // Process the GEP list in chunks suitable for the target's supported
24480 // vector size. If a vector register can't hold 1 element, we are done. We
24481 // are trying to vectorize the index computations, so the maximum number of
24482 // elements is based on the size of the index expression, rather than the
24483 // size of the GEP itself (the target's pointer size).
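// For example (illustrative), with a 256-bit maximal vector register and
// i64 GEP indices, each chunk holds at most 256 / 64 == 4 index computations.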
24484 auto *It = find_if(Range&: Entry.second, P: [&](GetElementPtrInst *GEP) {
24485 return !R.isDeleted(I: GEP);
24486 });
24487 if (It == Entry.second.end())
24488 continue;
24489 unsigned MaxVecRegSize = R.getMaxVecRegSize();
24490 unsigned EltSize = R.getVectorElementSize(V: *(*It)->idx_begin());
24491 if (MaxVecRegSize < EltSize)
24492 continue;
24493
24494 unsigned MaxElts = MaxVecRegSize / EltSize;
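    // For example (assuming nothing about any particular target), a 128-bit
    // maximum vector register and 64-bit index expressions would give
    // MaxElts = 2, so the list below is processed in chunks of two GEPs.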
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(BE - BI, MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(llvm::from_range, GEPList);

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index was optimized to a constant
      // value. If so, they are marked as deleted, so remove them from the set
      // of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1;
           ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPI);
            Candidates.remove(GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPJ);
          }
        }
      }
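      // A hypothetical example of what the pruning above removes:
      //   %g0 = getelementptr inbounds i32, ptr %base, i64 %i
      //   %g1 = getelementptr inbounds i32, ptr %base, i64 %j  ; %j = %i + 1
      // SCEV folds the difference of %g0 and %g1 to a constant, so both are
      // dropped: one index is trivially derived from the other, and
      // vectorizing the pair would not save any scalar work. GEPs sharing the
      // exact same index value are likewise deduplicated.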

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle.
      // We ensured the indices met these constraints when we originally
      // collected the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointer and value operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
            DT->getNode(I1->getParent());
        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
            DT->getNode(I2->getParent());
        assert(NodeI1 && "Should only process reachable instructions");
        assert(NodeI2 && "Should only process reachable instructions");
        assert((NodeI1 == NodeI2) ==
                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
               "Different nodes should have different DFS numbers");
        if (NodeI1 != NodeI2)
          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        return I1->getOpcode() < I2->getOpcode();
      }
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };
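  // Note: StoreSorter only needs to provide an ordering that clusters
  // likely-compatible stores next to each other; for instance, i32 stores
  // whose value operands are instructions from the same block end up
  // adjacent, while stores of a different element type sort into a separate
  // cluster. The actual compatibility test is AreCompatibleStores below.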

  auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(V1->getValueOperand()) ||
        isa<UndefValue>(V2->getValueOperand()))
      return true;
    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
        if (I1->getParent() != I2->getParent())
          return false;
        return getSameOpcode({I1, I2}, *TLI).valid();
      }
    if (isa<Constant>(V1->getValueOperand()) &&
        isa<Constant>(V2->getValueOperand()))
      return true;
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };
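  // For example, two stores whose value operands are add i32 instructions
  // defined in the same basic block are treated as compatible (getSameOpcode
  // also accepts SLP "alternate" opcode pairs), while instruction operands
  // from different blocks are rejected outright, and everything else falls
  // back to the ValueID comparison above.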

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse the stores to do bottom-to-top analysis. This is important when
    // there are several stores to the same address: in that case we need to
    // follow the store order (reversed, to meet the memory dependencies).
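    // E.g. given
    //   store i32 %x, ptr %p
    //   store i32 %y, ptr %p
    // the reversed list visits the %y store first, matching the bottom-up
    // direction of the vectorizer and respecting the write-after-write
    // dependency on %p.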
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
