1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
19#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
22#include "llvm/ADT/PriorityQueue.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
25#include "llvm/ADT/SetOperations.h"
26#include "llvm/ADT/SetVector.h"
27#include "llvm/ADT/SmallBitVector.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallString.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
33#include "llvm/ADT/iterator_range.h"
34#include "llvm/Analysis/AliasAnalysis.h"
35#include "llvm/Analysis/AssumptionCache.h"
36#include "llvm/Analysis/CodeMetrics.h"
37#include "llvm/Analysis/ConstantFolding.h"
38#include "llvm/Analysis/DemandedBits.h"
39#include "llvm/Analysis/GlobalsModRef.h"
40#include "llvm/Analysis/IVDescriptors.h"
41#include "llvm/Analysis/LoopAccessAnalysis.h"
42#include "llvm/Analysis/LoopInfo.h"
43#include "llvm/Analysis/MemoryLocation.h"
44#include "llvm/Analysis/OptimizationRemarkEmitter.h"
45#include "llvm/Analysis/ScalarEvolution.h"
46#include "llvm/Analysis/ScalarEvolutionExpressions.h"
47#include "llvm/Analysis/TargetLibraryInfo.h"
48#include "llvm/Analysis/TargetTransformInfo.h"
49#include "llvm/Analysis/ValueTracking.h"
50#include "llvm/Analysis/VectorUtils.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
56#include "llvm/IR/DerivedTypes.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
62#include "llvm/IR/Instructions.h"
63#include "llvm/IR/IntrinsicInst.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
67#include "llvm/IR/PatternMatch.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
77#include "llvm/Support/Casting.h"
78#include "llvm/Support/CommandLine.h"
79#include "llvm/Support/Compiler.h"
80#include "llvm/Support/DOTGraphTraits.h"
81#include "llvm/Support/Debug.h"
82#include "llvm/Support/ErrorHandling.h"
83#include "llvm/Support/GraphWriter.h"
84#include "llvm/Support/InstructionCost.h"
85#include "llvm/Support/KnownBits.h"
86#include "llvm/Support/MathExtras.h"
87#include "llvm/Support/raw_ostream.h"
88#include "llvm/Transforms/Utils/InjectTLIMappings.h"
89#include "llvm/Transforms/Utils/Local.h"
90#include "llvm/Transforms/Utils/LoopUtils.h"
91#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(Val: true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<bool>
117 SLPReVec("slp-revec", cl::init(Val: false), cl::Hidden,
118 cl::desc("Enable vectorization for wider vector utilization"));
119
120static cl::opt<int>
121 SLPCostThreshold("slp-threshold", cl::init(Val: 0), cl::Hidden,
122 cl::desc("Only vectorize if you gain more than this "
123 "number "));
124
125static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
126 "slp-skip-early-profitability-check", cl::init(Val: false), cl::Hidden,
127 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
128 "heuristics and makes vectorization decision via cost modeling."));
129
130static cl::opt<bool>
131ShouldVectorizeHor("slp-vectorize-hor", cl::init(Val: true), cl::Hidden,
132 cl::desc("Attempt to vectorize horizontal reductions"));
133
134static cl::opt<bool> ShouldStartVectorizeHorAtStore(
135 "slp-vectorize-hor-store", cl::init(Val: false), cl::Hidden,
136 cl::desc(
137 "Attempt to vectorize horizontal reductions feeding into a store"));
138
139// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
140// even if we match a reduction but do not vectorize in the end.
141static cl::opt<bool> AllowHorRdxIdenityOptimization(
142 "slp-optimize-identity-hor-reduction-ops", cl::init(Val: true), cl::Hidden,
143 cl::desc("Allow optimization of original scalar identity operations on "
144 "matched horizontal reductions."));
145
146static cl::opt<int>
147MaxVectorRegSizeOption("slp-max-reg-size", cl::init(Val: 128), cl::Hidden,
148 cl::desc("Attempt to vectorize for this register size in bits"));
149
150static cl::opt<unsigned>
151MaxVFOption("slp-max-vf", cl::init(Val: 0), cl::Hidden,
152 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
153
/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
158static cl::opt<int>
159ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(Val: 100000), cl::Hidden,
160 cl::desc("Limit the size of the SLP scheduling region per block"));
161
162static cl::opt<int> MinVectorRegSizeOption(
163 "slp-min-reg-size", cl::init(Val: 128), cl::Hidden,
164 cl::desc("Attempt to vectorize for this register size in bits"));
165
166static cl::opt<unsigned> RecursionMaxDepth(
167 "slp-recursion-max-depth", cl::init(Val: 12), cl::Hidden,
168 cl::desc("Limit the recursion depth when building a vectorizable tree"));
169
170static cl::opt<unsigned> MinTreeSize(
171 "slp-min-tree-size", cl::init(Val: 3), cl::Hidden,
172 cl::desc("Only vectorize small trees if they are fully vectorizable"));
173
174// The maximum depth that the look-ahead score heuristic will explore.
175// The higher this value, the higher the compilation time overhead.
176static cl::opt<int> LookAheadMaxDepth(
177 "slp-max-look-ahead-depth", cl::init(Val: 2), cl::Hidden,
178 cl::desc("The maximum look-ahead depth for operand reordering scores"));
179
// The maximum depth that the look-ahead score heuristic will explore
// when it is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation-time overhead; but unlike
// the similar limit for operand reordering, this is used less frequently, so
// the impact of a higher value is less noticeable.
185static cl::opt<int> RootLookAheadMaxDepth(
186 "slp-max-root-look-ahead-depth", cl::init(Val: 2), cl::Hidden,
187 cl::desc("The maximum look-ahead depth for searching best rooting option"));
188
189static cl::opt<unsigned> MinProfitableStridedLoads(
190 "slp-min-strided-loads", cl::init(Val: 2), cl::Hidden,
    cl::desc("The minimum number of loads that should be considered strided "
             "if the stride is > 1 or is a runtime value"));
193
194static cl::opt<unsigned> MaxProfitableLoadStride(
195 "slp-max-stride", cl::init(Val: 8), cl::Hidden,
196 cl::desc("The maximum stride, considered to be profitable."));
197
198static cl::opt<bool>
199 ViewSLPTree("view-slp-tree", cl::Hidden,
200 cl::desc("Display the SLP trees with Graphviz"));
201
202static cl::opt<bool> VectorizeNonPowerOf2(
203 "slp-vectorize-non-power-of-2", cl::init(Val: false), cl::Hidden,
204 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
205
206// Limit the number of alias checks. The limit is chosen so that
207// it has no negative effect on the llvm benchmarks.
208static const unsigned AliasedCheckLimit = 10;
209
// Limit on the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.
212static constexpr int UsesLimit = 64;
213
214// Another limit for the alias checks: The maximum distance between load/store
215// instructions where alias checks are done.
216// This limit is useful for very large basic blocks.
217static const unsigned MaxMemDepDistance = 160;
218
219/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
220/// regions to be handled.
221static const int MinScheduleRegionSize = 16;
222
223/// Maximum allowed number of operands in the PHI nodes.
224static const unsigned MaxPHINumOperands = 128;
225
226/// Predicate for the element types that the SLP vectorizer supports.
227///
/// The most important things to filter here are types which are invalid in
/// LLVM vectors. We also filter target-specific types which have absolutely no
/// meaningful vectorization path, such as x86_fp80 and ppc_f128. This just
231/// avoids spending time checking the cost model and realizing that they will
232/// be inevitably scalarized.
233static bool isValidElementType(Type *Ty) {
234 // TODO: Support ScalableVectorType.
235 if (SLPReVec && isa<FixedVectorType>(Val: Ty))
236 Ty = Ty->getScalarType();
237 return VectorType::isValidElementType(ElemTy: Ty) && !Ty->isX86_FP80Ty() &&
238 !Ty->isPPC_FP128Ty();
239}
240
241/// \returns the number of elements for Ty.
242static unsigned getNumElements(Type *Ty) {
243 assert(!isa<ScalableVectorType>(Ty) &&
244 "ScalableVectorType is not supported.");
245 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Ty))
246 return VecTy->getNumElements();
247 return 1;
248}
249
250/// \returns the vector type of ScalarTy based on vectorization factor.
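/// For illustration (values are hypothetical): getWidenedType(i32, 4) yields
/// <4 x i32>; with SLPReVec, getWidenedType(<2 x float>, 4) yields
/// <8 x float>, because the "scalar" element is itself a 2-element vector and
/// the element counts multiply.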
251static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
252 return FixedVectorType::get(ElementType: ScalarTy->getScalarType(),
253 NumElts: VF * getNumElements(Ty: ScalarTy));
254}
255
256/// \returns True if the value is a constant (but not globals/constant
257/// expressions).
258static bool isConstant(Value *V) {
259 return isa<Constant>(Val: V) && !isa<ConstantExpr, GlobalValue>(Val: V);
260}
261
/// Checks if \p V is one of the vector-like instructions, i.e. undef, an
/// insertelement/extractelement with constant indices for a fixed vector type,
/// or an extractvalue instruction.
265static bool isVectorLikeInstWithConstOps(Value *V) {
266 if (!isa<InsertElementInst, ExtractElementInst>(Val: V) &&
267 !isa<ExtractValueInst, UndefValue>(Val: V))
268 return false;
269 auto *I = dyn_cast<Instruction>(Val: V);
270 if (!I || isa<ExtractValueInst>(Val: I))
271 return true;
272 if (!isa<FixedVectorType>(Val: I->getOperand(i: 0)->getType()))
273 return false;
274 if (isa<ExtractElementInst>(Val: I))
275 return isConstant(V: I->getOperand(i: 1));
276 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
277 return isConstant(V: I->getOperand(i: 2));
278}
279
280/// Returns power-of-2 number of elements in a single register (part), given the
281/// total number of elements \p Size and number of registers (parts) \p
282/// NumParts.
283static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
284 return PowerOf2Ceil(A: divideCeil(Numerator: Size, Denominator: NumParts));
285}
286
/// Returns the correct remaining number of elements, given the total number of
/// elements \p Size, the (power-of-2) number of elements in a single register
/// \p PartNumElems, and the current register (part) \p Part.
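/// For example, with Size = 6 scalars split over NumParts = 2 registers,
/// getPartNumElems(6, 2) == 4 and this function returns 4 for Part 0 and 2 for
/// Part 1.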
290static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
291 unsigned Part) {
292 return std::min<unsigned>(a: PartNumElems, b: Size - Part * PartNumElems);
293}
294
295#if !defined(NDEBUG)
296/// Print a short descriptor of the instruction bundle suitable for debug output.
297static std::string shortBundleName(ArrayRef<Value *> VL) {
298 std::string Result;
299 raw_string_ostream OS(Result);
300 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
301 OS.flush();
302 return Result;
303}
304#endif
305
306/// \returns true if all of the instructions in \p VL are in the same block or
307/// false otherwise.
308static bool allSameBlock(ArrayRef<Value *> VL) {
309 Instruction *I0 = dyn_cast<Instruction>(Val: VL[0]);
310 if (!I0)
311 return false;
312 if (all_of(Range&: VL, P: isVectorLikeInstWithConstOps))
313 return true;
314
315 BasicBlock *BB = I0->getParent();
316 for (int I = 1, E = VL.size(); I < E; I++) {
317 auto *II = dyn_cast<Instruction>(Val: VL[I]);
318 if (!II)
319 return false;
320
321 if (BB != II->getParent())
322 return false;
323 }
324 return true;
325}
326
327/// \returns True if all of the values in \p VL are constants (but not
328/// globals/constant expressions).
329static bool allConstant(ArrayRef<Value *> VL) {
330 // Constant expressions and globals can't be vectorized like normal integer/FP
331 // constants.
332 return all_of(Range&: VL, P: isConstant);
333}
334
335/// \returns True if all of the values in \p VL are identical or some of them
336/// are UndefValue.
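/// For example, {%a, undef, %a, %a} is a splat, while {undef, undef} and
/// {%a, %b, %a, %a} are not (here %a and %b are arbitrary distinct values).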
337static bool isSplat(ArrayRef<Value *> VL) {
338 Value *FirstNonUndef = nullptr;
339 for (Value *V : VL) {
340 if (isa<UndefValue>(Val: V))
341 continue;
342 if (!FirstNonUndef) {
343 FirstNonUndef = V;
344 continue;
345 }
346 if (V != FirstNonUndef)
347 return false;
348 }
349 return FirstNonUndef != nullptr;
350}
351
352/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
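/// For illustration, a 'sub' whose only use is an equality comparison against
/// zero is treated as commutative here, e.g.:
///   %d = sub i32 %a, %b
///   %c = icmp eq i32 %d, 0
/// because swapping %a and %b does not change %c; the same applies to a 'sub'
/// that only feeds @llvm.abs under the wrap-flag conditions checked below.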
353static bool isCommutative(Instruction *I) {
354 if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
355 return Cmp->isCommutative();
356 if (auto *BO = dyn_cast<BinaryOperator>(Val: I))
357 return BO->isCommutative() ||
358 (BO->getOpcode() == Instruction::Sub &&
359 !BO->hasNUsesOrMore(N: UsesLimit) &&
360 all_of(
361 Range: BO->uses(),
362 P: [](const Use &U) {
363 // Commutative, if icmp eq/ne sub, 0
364 ICmpInst::Predicate Pred;
365 if (match(V: U.getUser(),
366 P: m_ICmp(Pred, L: m_Specific(V: U.get()), R: m_Zero())) &&
367 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
368 return true;
369 // Commutative, if abs(sub nsw, true) or abs(sub, false).
370 ConstantInt *Flag;
371 return match(V: U.getUser(),
372 P: m_Intrinsic<Intrinsic::abs>(
373 Op0: m_Specific(V: U.get()), Op1: m_ConstantInt(CI&: Flag))) &&
374 (!cast<Instruction>(Val: U.get())->hasNoSignedWrap() ||
375 Flag->isOne());
376 })) ||
377 (BO->getOpcode() == Instruction::FSub &&
378 !BO->hasNUsesOrMore(N: UsesLimit) &&
379 all_of(Range: BO->uses(), P: [](const Use &U) {
380 return match(V: U.getUser(),
381 P: m_Intrinsic<Intrinsic::fabs>(Op0: m_Specific(V: U.get())));
382 }));
383 return I->isCommutative();
384}
385
386template <typename T>
387static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
388 unsigned Offset) {
389 static_assert(std::is_same_v<T, InsertElementInst> ||
390 std::is_same_v<T, ExtractElementInst>,
391 "unsupported T");
392 int Index = Offset;
393 if (const auto *IE = dyn_cast<T>(Inst)) {
394 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
395 if (!VT)
396 return std::nullopt;
397 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
398 if (!CI)
399 return std::nullopt;
400 if (CI->getValue().uge(VT->getNumElements()))
401 return std::nullopt;
402 Index *= VT->getNumElements();
403 Index += CI->getZExtValue();
404 return Index;
405 }
406 return std::nullopt;
407}
408
/// \returns the inserting or extracting index of an InsertElement,
/// ExtractElement or InsertValue instruction, using \p Offset as the base
/// offset for the index, or std::nullopt if the index is not an immediate.
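/// For illustration, for
///   %r = insertvalue [2 x [3 x i32]] %agg, i32 %v, 1, 2
/// the returned flattened index is 1 * 3 + 2 == 5 (with the default Offset of
/// 0).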
412static std::optional<unsigned> getElementIndex(const Value *Inst,
413 unsigned Offset = 0) {
414 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
415 return Index;
416 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
417 return Index;
418
419 int Index = Offset;
420
421 const auto *IV = dyn_cast<InsertValueInst>(Val: Inst);
422 if (!IV)
423 return std::nullopt;
424
425 Type *CurrentType = IV->getType();
426 for (unsigned I : IV->indices()) {
427 if (const auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
428 Index *= ST->getNumElements();
429 CurrentType = ST->getElementType(N: I);
430 } else if (const auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
431 Index *= AT->getNumElements();
432 CurrentType = AT->getElementType();
433 } else {
434 return std::nullopt;
435 }
436 Index += I;
437 }
438 return Index;
439}
440
441namespace {
442/// Specifies the way the mask should be analyzed for undefs/poisonous elements
443/// in the shuffle mask.
444enum class UseMask {
445 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
446 ///< check for the mask elements for the first argument (mask
447 ///< indices are in range [0:VF)).
448 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
449 ///< for the mask elements for the second argument (mask indices
450 ///< are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as used, since they
               ///< will be filled in later. Non-undef elements are considered
               ///< unused, since they are already fixed in the mask.
455};
456} // namespace
457
458/// Prepares a use bitset for the given mask either for the first argument or
459/// for the second.
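/// For illustration, with VF = 4 and Mask = {0, 5, poison, 1}: FirstArg clears
/// bits 0 and 1 (those elements of the first vector are read by the mask),
/// SecondArg clears bit 1 (element 5 - VF of the second vector), and
/// UndefsAsMask clears only bit 2 (the poison mask position).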
460static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
461 UseMask MaskArg) {
462 SmallBitVector UseMask(VF, true);
463 for (auto [Idx, Value] : enumerate(First&: Mask)) {
464 if (Value == PoisonMaskElem) {
465 if (MaskArg == UseMask::UndefsAsMask)
466 UseMask.reset(Idx);
467 continue;
468 }
469 if (MaskArg == UseMask::FirstArg && Value < VF)
470 UseMask.reset(Idx: Value);
471 else if (MaskArg == UseMask::SecondArg && Value >= VF)
472 UseMask.reset(Idx: Value - VF);
473 }
474 return UseMask;
475}
476
477/// Checks if the given value is actually an undefined constant vector.
478/// Also, if the \p UseMask is not empty, tries to check if the non-masked
479/// elements actually mask the insertelement buildvector, if any.
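/// For illustration, isUndefVector(poison) and isUndefVector(undef) return
/// all-ones masks. For
///   %v = insertelement <4 x i32> poison, i32 %x, i32 1
/// with a UseMask whose bit 1 is cleared (lane 1 is actually read), the result
/// has only bit 1 cleared, i.e. every lane except lane 1 is known to be undef.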
480template <bool IsPoisonOnly = false>
481static SmallBitVector isUndefVector(const Value *V,
482 const SmallBitVector &UseMask = {}) {
483 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
484 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
485 if (isa<T>(V))
486 return Res;
487 auto *VecTy = dyn_cast<FixedVectorType>(Val: V->getType());
488 if (!VecTy)
489 return Res.reset();
490 auto *C = dyn_cast<Constant>(Val: V);
491 if (!C) {
492 if (!UseMask.empty()) {
493 const Value *Base = V;
494 while (auto *II = dyn_cast<InsertElementInst>(Val: Base)) {
495 Base = II->getOperand(i_nocapture: 0);
496 if (isa<T>(II->getOperand(i_nocapture: 1)))
497 continue;
498 std::optional<unsigned> Idx = getElementIndex(Inst: II);
499 if (!Idx) {
500 Res.reset();
501 return Res;
502 }
503 if (*Idx < UseMask.size() && !UseMask.test(Idx: *Idx))
504 Res.reset(Idx: *Idx);
505 }
506 // TODO: Add analysis for shuffles here too.
507 if (V == Base) {
508 Res.reset();
509 } else {
510 SmallBitVector SubMask(UseMask.size(), false);
511 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
512 }
513 } else {
514 Res.reset();
515 }
516 return Res;
517 }
518 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
519 if (Constant *Elem = C->getAggregateElement(Elt: I))
520 if (!isa<T>(Elem) &&
521 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(Idx: I))))
522 Res.reset(Idx: I);
523 }
524 return Res;
525}
526
527/// Checks if the vector of instructions can be represented as a shuffle, like:
528/// %x0 = extractelement <4 x i8> %x, i32 0
529/// %x3 = extractelement <4 x i8> %x, i32 3
530/// %y1 = extractelement <4 x i8> %y, i32 1
531/// %y2 = extractelement <4 x i8> %y, i32 2
532/// %x0x0 = mul i8 %x0, %x0
533/// %x3x3 = mul i8 %x3, %x3
534/// %y1y1 = mul i8 %y1, %y1
535/// %y2y2 = mul i8 %y2, %y2
536/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
537/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
538/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
539/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
540/// ret <4 x i8> %ins4
541/// can be transformed into:
542/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
543/// i32 6>
544/// %2 = mul <4 x i8> %1, %1
545/// ret <4 x i8> %2
/// Mask will contain the shuffle mask equivalent to the extracted elements.
547/// TODO: Can we split off and reuse the shuffle mask detection from
548/// ShuffleVectorInst/getShuffleCost?
549static std::optional<TargetTransformInfo::ShuffleKind>
550isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
551 const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst>);
552 if (It == VL.end())
553 return std::nullopt;
554 unsigned Size =
555 std::accumulate(first: VL.begin(), last: VL.end(), init: 0u, binary_op: [](unsigned S, Value *V) {
556 auto *EI = dyn_cast<ExtractElementInst>(Val: V);
557 if (!EI)
558 return S;
559 auto *VTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
560 if (!VTy)
561 return S;
562 return std::max(a: S, b: VTy->getNumElements());
563 });
564
565 Value *Vec1 = nullptr;
566 Value *Vec2 = nullptr;
567 bool HasNonUndefVec = any_of(Range&: VL, P: [](Value *V) {
568 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
569 if (!EE)
570 return false;
571 Value *Vec = EE->getVectorOperand();
572 if (isa<UndefValue>(Val: Vec))
573 return false;
574 return isGuaranteedNotToBePoison(V: Vec);
575 });
576 enum ShuffleMode { Unknown, Select, Permute };
577 ShuffleMode CommonShuffleMode = Unknown;
578 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
579 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
580 // Undef can be represented as an undef element in a vector.
581 if (isa<UndefValue>(Val: VL[I]))
582 continue;
583 auto *EI = cast<ExtractElementInst>(Val: VL[I]);
584 if (isa<ScalableVectorType>(Val: EI->getVectorOperandType()))
585 return std::nullopt;
586 auto *Vec = EI->getVectorOperand();
587 // We can extractelement from undef or poison vector.
588 if (isUndefVector</*isPoisonOnly=*/true>(V: Vec).all())
589 continue;
590 // All vector operands must have the same number of vector elements.
591 if (isa<UndefValue>(Val: Vec)) {
592 Mask[I] = I;
593 } else {
594 if (isa<UndefValue>(Val: EI->getIndexOperand()))
595 continue;
596 auto *Idx = dyn_cast<ConstantInt>(Val: EI->getIndexOperand());
597 if (!Idx)
598 return std::nullopt;
599 // Undefined behavior if Idx is negative or >= Size.
600 if (Idx->getValue().uge(RHS: Size))
601 continue;
602 unsigned IntIdx = Idx->getValue().getZExtValue();
603 Mask[I] = IntIdx;
604 }
605 if (isUndefVector(V: Vec).all() && HasNonUndefVec)
606 continue;
607 // For correct shuffling we have to have at most 2 different vector operands
608 // in all extractelement instructions.
609 if (!Vec1 || Vec1 == Vec) {
610 Vec1 = Vec;
611 } else if (!Vec2 || Vec2 == Vec) {
612 Vec2 = Vec;
613 Mask[I] += Size;
614 } else {
615 return std::nullopt;
616 }
617 if (CommonShuffleMode == Permute)
618 continue;
619 // If the extract index is not the same as the operation number, it is a
620 // permutation.
621 if (Mask[I] % Size != I) {
622 CommonShuffleMode = Permute;
623 continue;
624 }
625 CommonShuffleMode = Select;
626 }
627 // If we're not crossing lanes in different vectors, consider it as blending.
628 if (CommonShuffleMode == Select && Vec2)
629 return TargetTransformInfo::SK_Select;
630 // If Vec2 was never used, we have a permutation of a single vector, otherwise
631 // we have permutation of 2 vectors.
632 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
633 : TargetTransformInfo::SK_PermuteSingleSrc;
634}
635
636/// \returns True if Extract{Value,Element} instruction extracts element Idx.
637static std::optional<unsigned> getExtractIndex(Instruction *E) {
638 unsigned Opcode = E->getOpcode();
639 assert((Opcode == Instruction::ExtractElement ||
640 Opcode == Instruction::ExtractValue) &&
641 "Expected extractelement or extractvalue instruction.");
642 if (Opcode == Instruction::ExtractElement) {
643 auto *CI = dyn_cast<ConstantInt>(Val: E->getOperand(i: 1));
644 if (!CI)
645 return std::nullopt;
646 return CI->getZExtValue();
647 }
648 auto *EI = cast<ExtractValueInst>(Val: E);
649 if (EI->getNumIndices() != 1)
650 return std::nullopt;
651 return *EI->idx_begin();
652}
653
654namespace {
655
656/// Main data required for vectorization of instructions.
657struct InstructionsState {
658 /// The very first instruction in the list with the main opcode.
659 Value *OpValue = nullptr;
660
661 /// The main/alternate instruction.
662 Instruction *MainOp = nullptr;
663 Instruction *AltOp = nullptr;
664
665 /// The main/alternate opcodes for the list of instructions.
666 unsigned getOpcode() const {
667 return MainOp ? MainOp->getOpcode() : 0;
668 }
669
670 unsigned getAltOpcode() const {
671 return AltOp ? AltOp->getOpcode() : 0;
672 }
673
674 /// Some of the instructions in the list have alternate opcodes.
675 bool isAltShuffle() const { return AltOp != MainOp; }
676
677 bool isOpcodeOrAlt(Instruction *I) const {
678 unsigned CheckedOpcode = I->getOpcode();
679 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
680 }
681
682 InstructionsState() = delete;
683 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
684 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
685};
686
687} // end anonymous namespace
688
689/// Chooses the correct key for scheduling data. If \p Op has the same (or
690/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
691/// OpValue.
692static Value *isOneOf(const InstructionsState &S, Value *Op) {
693 auto *I = dyn_cast<Instruction>(Val: Op);
694 if (I && S.isOpcodeOrAlt(I))
695 return Op;
696 return S.OpValue;
697}
698
699/// \returns true if \p Opcode is allowed as part of the main/alternate
700/// instruction for SLP vectorization.
701///
702/// Example of unsupported opcode is SDIV that can potentially cause UB if the
703/// "shuffled out" lane would result in division by zero.
704static bool isValidForAlternation(unsigned Opcode) {
705 if (Instruction::isIntDivRem(Opcode))
706 return false;
707
708 return true;
709}
710
711static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
712 const TargetLibraryInfo &TLI,
713 unsigned BaseIndex = 0);
714
715/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
716/// compatible instructions or constants, or just some other regular values.
717static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
718 Value *Op1, const TargetLibraryInfo &TLI) {
719 return (isConstant(V: BaseOp0) && isConstant(V: Op0)) ||
720 (isConstant(V: BaseOp1) && isConstant(V: Op1)) ||
721 (!isa<Instruction>(Val: BaseOp0) && !isa<Instruction>(Val: Op0) &&
722 !isa<Instruction>(Val: BaseOp1) && !isa<Instruction>(Val: Op1)) ||
723 BaseOp0 == Op0 || BaseOp1 == Op1 ||
724 getSameOpcode(VL: {BaseOp0, Op0}, TLI).getOpcode() ||
725 getSameOpcode(VL: {BaseOp1, Op1}, TLI).getOpcode();
726}
727
728/// \returns true if a compare instruction \p CI has similar "look" and
729/// same predicate as \p BaseCI, "as is" or with its operands and predicate
730/// swapped, false otherwise.
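/// For example, 'icmp sgt i32 %a, %b' is treated as the same comparison as
/// 'icmp slt i32 %b, %a': the predicate and the operands are both swapped.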
731static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
732 const TargetLibraryInfo &TLI) {
733 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
734 "Assessing comparisons of different types?");
735 CmpInst::Predicate BasePred = BaseCI->getPredicate();
736 CmpInst::Predicate Pred = CI->getPredicate();
737 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(pred: Pred);
738
739 Value *BaseOp0 = BaseCI->getOperand(i_nocapture: 0);
740 Value *BaseOp1 = BaseCI->getOperand(i_nocapture: 1);
741 Value *Op0 = CI->getOperand(i_nocapture: 0);
742 Value *Op1 = CI->getOperand(i_nocapture: 1);
743
744 return (BasePred == Pred &&
745 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
746 (BasePred == SwappedPred &&
747 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0: Op1, Op1: Op0, TLI));
748}
749
/// \returns analysis of the instructions in \p VL described in
/// InstructionsState, i.e. the opcode with which we suppose the whole list
/// could be vectorized even if its structure is diverse.
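/// For illustration, for VL = {add, sub, add, sub} this returns an
/// InstructionsState whose MainOp is the first 'add' and whose AltOp is the
/// first 'sub' (an alternate-opcode bundle), whereas VL = {add, sub, mul}
/// introduces a third opcode and yields a state with null MainOp/AltOp.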
753static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
754 const TargetLibraryInfo &TLI,
755 unsigned BaseIndex) {
756 // Make sure these are all Instructions.
757 if (llvm::any_of(Range&: VL, P: [](Value *V) { return !isa<Instruction>(Val: V); }))
758 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
759
760 bool IsCastOp = isa<CastInst>(Val: VL[BaseIndex]);
761 bool IsBinOp = isa<BinaryOperator>(Val: VL[BaseIndex]);
762 bool IsCmpOp = isa<CmpInst>(Val: VL[BaseIndex]);
763 CmpInst::Predicate BasePred =
764 IsCmpOp ? cast<CmpInst>(Val: VL[BaseIndex])->getPredicate()
765 : CmpInst::BAD_ICMP_PREDICATE;
766 unsigned Opcode = cast<Instruction>(Val: VL[BaseIndex])->getOpcode();
767 unsigned AltOpcode = Opcode;
768 unsigned AltIndex = BaseIndex;
769
770 bool SwappedPredsCompatible = [&]() {
771 if (!IsCmpOp)
772 return false;
773 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
774 UniquePreds.insert(X: BasePred);
775 UniqueNonSwappedPreds.insert(X: BasePred);
776 for (Value *V : VL) {
777 auto *I = dyn_cast<CmpInst>(Val: V);
778 if (!I)
779 return false;
780 CmpInst::Predicate CurrentPred = I->getPredicate();
781 CmpInst::Predicate SwappedCurrentPred =
782 CmpInst::getSwappedPredicate(pred: CurrentPred);
783 UniqueNonSwappedPreds.insert(X: CurrentPred);
784 if (!UniquePreds.contains(key: CurrentPred) &&
785 !UniquePreds.contains(key: SwappedCurrentPred))
786 UniquePreds.insert(X: CurrentPred);
787 }
    // If the total number of distinct predicates is > 2, but only 2 remain
    // once swapped predicates are treated as equal, consider swappable
    // predicates as compatible opcodes rather than as alternates.
791 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
792 }();
793 // Check for one alternate opcode from another BinaryOperator.
794 // TODO - generalize to support all operators (types, calls etc.).
795 auto *IBase = cast<Instruction>(Val: VL[BaseIndex]);
796 Intrinsic::ID BaseID = 0;
797 SmallVector<VFInfo> BaseMappings;
798 if (auto *CallBase = dyn_cast<CallInst>(Val: IBase)) {
799 BaseID = getVectorIntrinsicIDForCall(CI: CallBase, TLI: &TLI);
800 BaseMappings = VFDatabase(*CallBase).getMappings(CI: *CallBase);
801 if (!isTriviallyVectorizable(ID: BaseID) && BaseMappings.empty())
802 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
803 }
804 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
805 auto *I = cast<Instruction>(Val: VL[Cnt]);
806 unsigned InstOpcode = I->getOpcode();
807 if (IsBinOp && isa<BinaryOperator>(Val: I)) {
808 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
809 continue;
810 if (Opcode == AltOpcode && isValidForAlternation(Opcode: InstOpcode) &&
811 isValidForAlternation(Opcode)) {
812 AltOpcode = InstOpcode;
813 AltIndex = Cnt;
814 continue;
815 }
816 } else if (IsCastOp && isa<CastInst>(Val: I)) {
817 Value *Op0 = IBase->getOperand(i: 0);
818 Type *Ty0 = Op0->getType();
819 Value *Op1 = I->getOperand(i: 0);
820 Type *Ty1 = Op1->getType();
821 if (Ty0 == Ty1) {
822 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
823 continue;
824 if (Opcode == AltOpcode) {
825 assert(isValidForAlternation(Opcode) &&
826 isValidForAlternation(InstOpcode) &&
827 "Cast isn't safe for alternation, logic needs to be updated!");
828 AltOpcode = InstOpcode;
829 AltIndex = Cnt;
830 continue;
831 }
832 }
833 } else if (auto *Inst = dyn_cast<CmpInst>(Val: VL[Cnt]); Inst && IsCmpOp) {
834 auto *BaseInst = cast<CmpInst>(Val: VL[BaseIndex]);
835 Type *Ty0 = BaseInst->getOperand(i_nocapture: 0)->getType();
836 Type *Ty1 = Inst->getOperand(i_nocapture: 0)->getType();
837 if (Ty0 == Ty1) {
838 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        // Check for compatible operands. If the corresponding operands are
        // not compatible, we need to perform alternate vectorization.
841 CmpInst::Predicate CurrentPred = Inst->getPredicate();
842 CmpInst::Predicate SwappedCurrentPred =
843 CmpInst::getSwappedPredicate(pred: CurrentPred);
844
845 if ((E == 2 || SwappedPredsCompatible) &&
846 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
847 continue;
848
849 if (isCmpSameOrSwapped(BaseCI: BaseInst, CI: Inst, TLI))
850 continue;
851 auto *AltInst = cast<CmpInst>(Val: VL[AltIndex]);
852 if (AltIndex != BaseIndex) {
853 if (isCmpSameOrSwapped(BaseCI: AltInst, CI: Inst, TLI))
854 continue;
855 } else if (BasePred != CurrentPred) {
856 assert(
857 isValidForAlternation(InstOpcode) &&
858 "CmpInst isn't safe for alternation, logic needs to be updated!");
859 AltIndex = Cnt;
860 continue;
861 }
862 CmpInst::Predicate AltPred = AltInst->getPredicate();
863 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
864 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
865 continue;
866 }
867 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
868 if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
869 if (Gep->getNumOperands() != 2 ||
870 Gep->getOperand(i_nocapture: 0)->getType() != IBase->getOperand(i: 0)->getType())
871 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
872 } else if (auto *EI = dyn_cast<ExtractElementInst>(Val: I)) {
873 if (!isVectorLikeInstWithConstOps(V: EI))
874 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
875 } else if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
876 auto *BaseLI = cast<LoadInst>(Val: IBase);
877 if (!LI->isSimple() || !BaseLI->isSimple())
878 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
879 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
880 auto *CallBase = cast<CallInst>(Val: IBase);
881 if (Call->getCalledFunction() != CallBase->getCalledFunction())
882 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
883 if (Call->hasOperandBundles() && (!CallBase->hasOperandBundles() ||
884 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
885 Call->op_begin() + Call->getBundleOperandsEndIndex(),
886 CallBase->op_begin() +
887 CallBase->getBundleOperandsStartIndex())))
888 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
889 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI: &TLI);
890 if (ID != BaseID)
891 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
892 if (!ID) {
893 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(CI: *Call);
894 if (Mappings.size() != BaseMappings.size() ||
895 Mappings.front().ISA != BaseMappings.front().ISA ||
896 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
897 Mappings.front().VectorName != BaseMappings.front().VectorName ||
898 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
899 Mappings.front().Shape.Parameters !=
900 BaseMappings.front().Shape.Parameters)
901 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
902 }
903 }
904 continue;
905 }
906 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
907 }
908
909 return InstructionsState(VL[BaseIndex], cast<Instruction>(Val: VL[BaseIndex]),
910 cast<Instruction>(Val: VL[AltIndex]));
911}
912
913/// \returns true if all of the values in \p VL have the same type or false
914/// otherwise.
915static bool allSameType(ArrayRef<Value *> VL) {
916 Type *Ty = VL.front()->getType();
917 return all_of(Range: VL.drop_front(), P: [&](Value *V) { return V->getType() == Ty; });
918}
919
/// \returns True if an in-tree use also needs an extract. This refers to a
/// possible scalar operand in a vectorized instruction.
922static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
923 TargetLibraryInfo *TLI) {
924 unsigned Opcode = UserInst->getOpcode();
925 switch (Opcode) {
926 case Instruction::Load: {
927 LoadInst *LI = cast<LoadInst>(Val: UserInst);
928 return (LI->getPointerOperand() == Scalar);
929 }
930 case Instruction::Store: {
931 StoreInst *SI = cast<StoreInst>(Val: UserInst);
932 return (SI->getPointerOperand() == Scalar);
933 }
934 case Instruction::Call: {
935 CallInst *CI = cast<CallInst>(Val: UserInst);
936 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
937 return any_of(Range: enumerate(First: CI->args()), P: [&](auto &&Arg) {
938 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
939 Arg.value().get() == Scalar;
940 });
941 }
942 default:
943 return false;
944 }
945}
946
/// \returns the AA location that is being accessed by the instruction.
948static MemoryLocation getLocation(Instruction *I) {
949 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
950 return MemoryLocation::get(SI);
951 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
952 return MemoryLocation::get(LI);
953 return MemoryLocation();
954}
955
956/// \returns True if the instruction is not a volatile or atomic load/store.
957static bool isSimple(Instruction *I) {
958 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
959 return LI->isSimple();
960 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
961 return SI->isSimple();
962 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: I))
963 return !MI->isVolatile();
964 return true;
965}
966
967/// Shuffles \p Mask in accordance with the given \p SubMask.
968/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
969/// one but two input vectors.
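/// For illustration, composing Mask = {1, 0, 3, 2} with SubMask = {2, 2, 0, 1}
/// yields {3, 3, 1, 0}: each SubMask element selects a position in the current
/// Mask.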
970static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
971 bool ExtendingManyInputs = false) {
972 if (SubMask.empty())
973 return;
974 assert(
975 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
976 // Check if input scalars were extended to match the size of other node.
977 (SubMask.size() == Mask.size() &&
978 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
979 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
980 "SubMask with many inputs support must be larger than the mask.");
981 if (Mask.empty()) {
982 Mask.append(in_start: SubMask.begin(), in_end: SubMask.end());
983 return;
984 }
985 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
986 int TermValue = std::min(a: Mask.size(), b: SubMask.size());
987 for (int I = 0, E = SubMask.size(); I < E; ++I) {
988 if (SubMask[I] == PoisonMaskElem ||
989 (!ExtendingManyInputs &&
990 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
991 continue;
992 NewMask[I] = Mask[SubMask[I]];
993 }
994 Mask.swap(RHS&: NewMask);
995}
996
/// Order may have elements assigned the special value (size) which is out of
/// bounds. Such indices only appear in places which correspond to undef values
/// (see canReuseExtract for details) and are used to avoid undef values having
/// an effect on operand ordering.
1001/// The first loop below simply finds all unused indices and then the next loop
1002/// nest assigns these indices for undef values positions.
1003/// As an example below Order has two undef positions and they have assigned
1004/// values 3 and 7 respectively:
1005/// before: 6 9 5 4 9 2 1 0
1006/// after: 6 3 5 4 7 2 1 0
1007static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1008 const unsigned Sz = Order.size();
1009 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1010 SmallBitVector MaskedIndices(Sz);
1011 for (unsigned I = 0; I < Sz; ++I) {
1012 if (Order[I] < Sz)
1013 UnusedIndices.reset(Idx: Order[I]);
1014 else
1015 MaskedIndices.set(I);
1016 }
1017 if (MaskedIndices.none())
1018 return;
1019 assert(UnusedIndices.count() == MaskedIndices.count() &&
1020 "Non-synced masked/available indices.");
1021 int Idx = UnusedIndices.find_first();
1022 int MIdx = MaskedIndices.find_first();
1023 while (MIdx >= 0) {
1024 assert(Idx >= 0 && "Indices must be synced.");
1025 Order[MIdx] = Idx;
1026 Idx = UnusedIndices.find_next(Prev: Idx);
1027 MIdx = MaskedIndices.find_next(Prev: MIdx);
1028 }
1029}
1030
1031/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1032/// Opcode1.
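/// For example, for VL = {add, sub, add, sub} and Opcode1 == Instruction::Sub
/// the returned bitset is {0, 1, 0, 1}, i.e. lanes 1 and 3 are set.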
1033SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, unsigned Opcode0,
1034 unsigned Opcode1) {
1035 SmallBitVector OpcodeMask(VL.size(), false);
1036 for (unsigned Lane : seq<unsigned>(Size: VL.size()))
1037 if (cast<Instruction>(Val: VL[Lane])->getOpcode() == Opcode1)
1038 OpcodeMask.set(Lane);
1039 return OpcodeMask;
1040}
1041
1042namespace llvm {
1043
1044static void inversePermutation(ArrayRef<unsigned> Indices,
1045 SmallVectorImpl<int> &Mask) {
1046 Mask.clear();
1047 const unsigned E = Indices.size();
1048 Mask.resize(N: E, NV: PoisonMaskElem);
1049 for (unsigned I = 0; I < E; ++I)
1050 Mask[Indices[I]] = I;
1051}
1052
1053/// Reorders the list of scalars in accordance with the given \p Mask.
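/// For example, Scalars = {%a, %b, %c} with Mask = {2, 0, 1} becomes
/// {%b, %c, %a}: each original element Scalars[I] is moved to position Mask[I].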
1054static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1055 ArrayRef<int> Mask) {
1056 assert(!Mask.empty() && "Expected non-empty mask.");
1057 SmallVector<Value *> Prev(Scalars.size(),
1058 PoisonValue::get(T: Scalars.front()->getType()));
1059 Prev.swap(RHS&: Scalars);
1060 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1061 if (Mask[I] != PoisonMaskElem)
1062 Scalars[Mask[I]] = Prev[I];
1063}
1064
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction, or it is an instruction
/// that does not read/write memory and all of its operands are either not
/// instructions, or are phi nodes, or are instructions from different blocks.
1069static bool areAllOperandsNonInsts(Value *V) {
1070 auto *I = dyn_cast<Instruction>(Val: V);
1071 if (!I)
1072 return true;
1073 return !mayHaveNonDefUseDependency(I: *I) &&
1074 all_of(Range: I->operands(), P: [I](Value *V) {
1075 auto *IO = dyn_cast<Instruction>(Val: V);
1076 if (!IO)
1077 return true;
1078 return isa<PHINode>(Val: IO) || IO->getParent() != I->getParent();
1079 });
1080}
1081
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction, or it is an instruction
/// that does not read/write memory and all of its users are phi nodes or
/// instructions from different blocks.
1086static bool isUsedOutsideBlock(Value *V) {
1087 auto *I = dyn_cast<Instruction>(Val: V);
1088 if (!I)
1089 return true;
1090 // Limits the number of uses to save compile time.
1091 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(N: UsesLimit) &&
1092 all_of(Range: I->users(), P: [I](User *U) {
1093 auto *IU = dyn_cast<Instruction>(Val: U);
1094 if (!IU)
1095 return true;
1096 return IU->getParent() != I->getParent() || isa<PHINode>(Val: IU);
1097 });
1098}
1099
1100/// Checks if the specified value does not require scheduling. It does not
1101/// require scheduling if all operands and all users do not need to be scheduled
1102/// in the current basic block.
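/// For example, an 'add' whose operands are function arguments (or are defined
/// in other blocks) and whose only user is a PHI node does not need to be
/// scheduled, while any instruction that may read or write memory always does.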
1103static bool doesNotNeedToBeScheduled(Value *V) {
1104 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1105}
1106
/// Checks if the specified array of instructions does not require scheduling.
/// This is so if either all of the instructions have operands that do not
/// require scheduling, or all of their users do not require scheduling (since
/// they are phis or are in other basic blocks).
1111static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1112 return !VL.empty() &&
1113 (all_of(Range&: VL, P: isUsedOutsideBlock) || all_of(Range&: VL, P: areAllOperandsNonInsts));
1114}
1115
1116namespace slpvectorizer {
1117
1118/// Bottom Up SLP Vectorizer.
1119class BoUpSLP {
1120 struct TreeEntry;
1121 struct ScheduleData;
1122 class ShuffleCostEstimator;
1123 class ShuffleInstructionBuilder;
1124
1125public:
  /// Tracks the state in which we can represent the loads of the given
  /// sequence.
1127 enum class LoadsState {
1128 Gather,
1129 Vectorize,
1130 ScatterVectorize,
1131 StridedVectorize
1132 };
1133
1134 using ValueList = SmallVector<Value *, 8>;
1135 using InstrList = SmallVector<Instruction *, 16>;
1136 using ValueSet = SmallPtrSet<Value *, 16>;
1137 using StoreList = SmallVector<StoreInst *, 8>;
1138 using ExtraValueToDebugLocsMap =
1139 MapVector<Value *, SmallVector<Instruction *, 2>>;
1140 using OrdersType = SmallVector<unsigned, 4>;
1141
1142 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1143 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1144 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1145 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1146 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1147 AC(AC), DB(DB), DL(DL), ORE(ORE),
1148 Builder(Se->getContext(), TargetFolder(*DL)) {
1149 CodeMetrics::collectEphemeralValues(L: F, AC, EphValues);
1150 // Use the vector register size specified by the target unless overridden
1151 // by a command-line option.
1152 // TODO: It would be better to limit the vectorization factor based on
1153 // data type rather than just register size. For example, x86 AVX has
1154 // 256-bit registers, but it does not support integer operations
1155 // at that width (that requires AVX2).
1156 if (MaxVectorRegSizeOption.getNumOccurrences())
1157 MaxVecRegSize = MaxVectorRegSizeOption;
1158 else
1159 MaxVecRegSize =
1160 TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
1161 .getFixedValue();
1162
1163 if (MinVectorRegSizeOption.getNumOccurrences())
1164 MinVecRegSize = MinVectorRegSizeOption;
1165 else
1166 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1167 }
1168
1169 /// Vectorize the tree that starts with the elements in \p VL.
1170 /// Returns the vectorized root.
1171 Value *vectorizeTree();
1172
  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
  /// generated extractvalue instructions.
  /// \param ReplacedExternals contains the list of replaced external values
  /// {scalar, replace} after emitting extractelement for external uses.
1178 Value *
1179 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1180 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1181 Instruction *ReductionRoot = nullptr);
1182
1183 /// \returns the cost incurred by unwanted spills and fills, caused by
1184 /// holding live values over call sites.
1185 InstructionCost getSpillCost() const;
1186
1187 /// \returns the vectorization cost of the subtree that starts at \p VL.
1188 /// A negative number means that this is profitable.
1189 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1190
1191 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1192 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1193 void buildTree(ArrayRef<Value *> Roots,
1194 const SmallDenseSet<Value *> &UserIgnoreLst);
1195
1196 /// Construct a vectorizable tree that starts at \p Roots.
1197 void buildTree(ArrayRef<Value *> Roots);
1198
1199 /// Returns whether the root node has in-tree uses.
1200 bool doesRootHaveInTreeUses() const {
1201 return !VectorizableTree.empty() &&
1202 !VectorizableTree.front()->UserTreeIndices.empty();
1203 }
1204
1205 /// Return the scalars of the root node.
1206 ArrayRef<Value *> getRootNodeScalars() const {
1207 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1208 return VectorizableTree.front()->Scalars;
1209 }
1210
  /// Checks if the root graph node can be emitted with narrower bitwidth at
  /// codegen and returns its signedness, if so.
1213 bool isSignedMinBitwidthRootNode() const {
1214 return MinBWs.at(Val: VectorizableTree.front().get()).second;
1215 }
1216
  /// Builds external uses of the vectorized scalars, i.e. the list of
  /// vectorized scalars to be extracted, their lanes and their scalar users.
  /// \p ExternallyUsedValues contains an additional list of external uses to
  /// handle vectorization of reductions.
1221 void
1222 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1223
1224 /// Transforms graph nodes to target specific representations, if profitable.
1225 void transformNodes();
1226
1227 /// Clear the internal data structures that are created by 'buildTree'.
1228 void deleteTree() {
1229 VectorizableTree.clear();
1230 ScalarToTreeEntry.clear();
1231 MultiNodeScalars.clear();
1232 MustGather.clear();
1233 NonScheduledFirst.clear();
1234 EntryToLastInstruction.clear();
1235 ExternalUses.clear();
1236 ExternalUsesAsGEPs.clear();
1237 for (auto &Iter : BlocksSchedules) {
1238 BlockScheduling *BS = Iter.second.get();
1239 BS->clear();
1240 }
1241 MinBWs.clear();
1242 ReductionBitWidth = 0;
1243 CastMaxMinBWSizes.reset();
1244 ExtraBitWidthNodes.clear();
1245 InstrElementSize.clear();
1246 UserIgnoreList = nullptr;
1247 PostponedGathers.clear();
1248 ValueToGatherNodes.clear();
1249 }
1250
1251 unsigned getTreeSize() const { return VectorizableTree.size(); }
1252
1253 /// Perform LICM and CSE on the newly generated gather sequences.
1254 void optimizeGatherSequence();
1255
1256 /// Checks if the specified gather tree entry \p TE can be represented as a
1257 /// shuffled vector entry + (possibly) permutation with other gathers. It
1258 /// implements the checks only for possibly ordered scalars (Loads,
1259 /// ExtractElement, ExtractValue), which can be part of the graph.
1260 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1261
  /// Sort loads into increasing pointer offsets to allow greater clustering.
1263 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1264
1265 /// Gets reordering data for the given tree entry. If the entry is vectorized
1266 /// - just return ReorderIndices, otherwise check if the scalars can be
1267 /// reordered and return the most optimal order.
  /// \return std::nullopt if the ordering is not important, an empty order if
  /// the identity order is important, or the actual order otherwise.
1270 /// \param TopToBottom If true, include the order of vectorized stores and
1271 /// insertelement nodes, otherwise skip them.
1272 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1273 bool TopToBottom);
1274
  /// Reorders the current graph to the most profitable order starting from the
  /// root node to the leaf nodes. The best order is chosen only from the nodes
  /// of the same size (vectorization factor). Smaller nodes are considered
  /// parts of a subgraph with a smaller VF and they are reordered
  /// independently. We can do this because we still need to extend smaller
  /// nodes to the wider VF and we can merge reordering shuffles with the
  /// widening shuffles.
1281 void reorderTopToBottom();
1282
  /// Reorders the current graph to the most profitable order starting from the
  /// leaves to the root. It allows rotating small subgraphs and reduces the
  /// number of reshuffles if the leaf nodes use the same order. In this case we
  /// can merge the orders and just shuffle the user node instead of shuffling
  /// its operands. Plus, even if the leaf nodes have different orders, it
  /// allows sinking the reordering in the graph closer to the root node and
  /// merging it later during analysis.
1290 void reorderBottomToTop(bool IgnoreReorder = false);
1291
1292 /// \return The vector element size in bits to use when vectorizing the
1293 /// expression tree ending at \p V. If V is a store, the size is the width of
1294 /// the stored value. Otherwise, the size is the width of the largest loaded
1295 /// value reaching V. This method is used by the vectorizer to calculate
1296 /// vectorization factors.
1297 unsigned getVectorElementSize(Value *V);
1298
1299 /// Compute the minimum type sizes required to represent the entries in a
1300 /// vectorizable tree.
1301 void computeMinimumValueSizes();
1302
1303 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1304 unsigned getMaxVecRegSize() const {
1305 return MaxVecRegSize;
1306 }
1307
1308 // \returns minimum vector register size as set by cl::opt.
1309 unsigned getMinVecRegSize() const {
1310 return MinVecRegSize;
1311 }
1312
1313 unsigned getMinVF(unsigned Sz) const {
1314 return std::max(a: 2U, b: getMinVecRegSize() / Sz);
1315 }
1316
1317 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1318 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1319 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1320 return MaxVF ? MaxVF : UINT_MAX;
1321 }
1322
1323 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1324 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1325 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1326 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1327 ///
1328 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1329 unsigned canMapToVector(Type *T) const;
1330
1331 /// \returns True if the VectorizableTree is both tiny and not fully
1332 /// vectorizable. We do not vectorize such trees.
1333 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1334
1335 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1336 /// can be load combined in the backend. Load combining may not be allowed in
1337 /// the IR optimizer, so we do not want to alter the pattern. For example,
1338 /// partially transforming a scalar bswap() pattern into vector code is
1339 /// effectively impossible for the backend to undo.
1340 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1341 /// may not be necessary.
1342 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1343
1344 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1345 /// can be load combined in the backend. Load combining may not be allowed in
1346 /// the IR optimizer, so we do not want to alter the pattern. For example,
1347 /// partially transforming a scalar bswap() pattern into vector code is
1348 /// effectively impossible for the backend to undo.
1349 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1350 /// may not be necessary.
1351 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1352
1353 /// Checks if the given array of loads can be represented as a vectorized,
1354 /// scatter or just simple gather.
1355 /// \param VL list of loads.
1356 /// \param VL0 main load value.
1357 /// \param Order returned order of load instructions.
1358 /// \param PointerOps returned list of pointer operands.
  /// \param TryRecursiveCheck used to check if a long masked gather can be
  /// represented as a series of loads/insert-subvector operations, if
  /// profitable.
1361 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1362 SmallVectorImpl<unsigned> &Order,
1363 SmallVectorImpl<Value *> &PointerOps,
1364 bool TryRecursiveCheck = true) const;
1365
1366 OptimizationRemarkEmitter *getORE() { return ORE; }
1367
1368 /// This structure holds any data we need about the edges being traversed
1369 /// during buildTree_rec(). We keep track of:
1370 /// (i) the user TreeEntry index, and
1371 /// (ii) the index of the edge.
1372 struct EdgeInfo {
1373 EdgeInfo() = default;
1374 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1375 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1376 /// The user TreeEntry.
1377 TreeEntry *UserTE = nullptr;
1378 /// The operand index of the use.
1379 unsigned EdgeIdx = UINT_MAX;
1380#ifndef NDEBUG
1381 friend inline raw_ostream &operator<<(raw_ostream &OS,
1382 const BoUpSLP::EdgeInfo &EI) {
1383 EI.dump(OS);
1384 return OS;
1385 }
1386 /// Debug print.
1387 void dump(raw_ostream &OS) const {
1388 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1389 << " EdgeIdx:" << EdgeIdx << "}";
1390 }
1391 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1392#endif
1393 bool operator == (const EdgeInfo &Other) const {
1394 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1395 }
1396 };
1397
1398 /// A helper class used for scoring candidates for two consecutive lanes.
1399 class LookAheadHeuristics {
1400 const TargetLibraryInfo &TLI;
1401 const DataLayout &DL;
1402 ScalarEvolution &SE;
1403 const BoUpSLP &R;
1404 int NumLanes; // Total number of lanes (aka vectorization factor).
1405 int MaxLevel; // The maximum recursion depth for accumulating score.
1406
1407 public:
1408 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1409 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1410 int MaxLevel)
1411 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1412 MaxLevel(MaxLevel) {}
1413
1414 // The exact values of the hard-coded scores listed here are not very
1415 // important, though better matches should receive higher scores to improve
1416 // the resulting cost. When computing the score of matching one sub-tree
1417 // with another, we are basically counting the number of values that match.
1418 // So even if all scores were set to 1, we would still get a decent result.
1419 // However, sometimes we have to break ties. For example, we may have to
1420 // choose between matching loads and matching opcodes. This is what these
1421 // scores help us with: they provide the order of preference. They also
1422 // matter when the scalar is externally used, or is used in another tree
1423 // entry node in a different lane.
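// For illustration with the constants below: when pairing a candidate against
// the previous lane, a load from a consecutive address (score 4) wins over an
// instruction that merely shares the opcode (score 2), which in turn wins over
// an alternate-opcode match (score 1).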
1424
1425 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1426 static const int ScoreConsecutiveLoads = 4;
1427 /// The same load multiple times. This should have a better score than
1428 /// `ScoreSplat` because, on x86, a 2-lane splat load can be lowered to a
1429 /// single `movddup (%reg), xmm0` with a throughput of 0.5, versus 0.5 for a
1430 /// vector load plus 1.0 for a separate broadcast.
1431 static const int ScoreSplatLoads = 3;
1432 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1433 static const int ScoreReversedLoads = 3;
1434 /// A load candidate for masked gather.
1435 static const int ScoreMaskedGatherCandidate = 1;
1436 /// ExtractElementInst from same vector and consecutive indexes.
1437 static const int ScoreConsecutiveExtracts = 4;
1438 /// ExtractElementInst from same vector and reversed indices.
1439 static const int ScoreReversedExtracts = 3;
1440 /// Constants.
1441 static const int ScoreConstants = 2;
1442 /// Instructions with the same opcode.
1443 static const int ScoreSameOpcode = 2;
1444 /// Instructions with alt opcodes (e.g, add + sub).
1445 static const int ScoreAltOpcodes = 1;
1446 /// Identical instructions (a.k.a. splat or broadcast).
1447 static const int ScoreSplat = 1;
1448 /// Matching with an undef is preferable to failing.
1449 static const int ScoreUndef = 1;
1450 /// Score for failing to find a decent match.
1451 static const int ScoreFail = 0;
1452 /// Score if all users are vectorized.
1453 static const int ScoreAllUserVectorized = 1;
1454
1455 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1456 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1457 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1458 /// MainAltOps.
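/// For example, two simple loads from adjacent addresses in the same block
/// score ScoreConsecutiveLoads, loads whose known pointer distance exceeds
/// NumLanes / 2 score ScoreMaskedGatherCandidate, and two unrelated values
/// that do not share a tree entry score ScoreFail.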
1459 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1460 ArrayRef<Value *> MainAltOps) const {
1461 if (!isValidElementType(Ty: V1->getType()) ||
1462 !isValidElementType(Ty: V2->getType()))
1463 return LookAheadHeuristics::ScoreFail;
1464
1465 if (V1 == V2) {
1466 if (isa<LoadInst>(Val: V1)) {
1467 // Returns true if the users of V1 and V2 won't need to be extracted.
1468 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1469 // Bail out if we have too many uses to save compilation time.
1470 if (V1->hasNUsesOrMore(N: UsesLimit) || V2->hasNUsesOrMore(N: UsesLimit))
1471 return false;
1472
1473 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1474 return llvm::all_of(Range: V->users(), P: [U1, U2, this](Value *U) {
1475 return U == U1 || U == U2 || R.getTreeEntry(V: U) != nullptr;
1476 });
1477 };
1478 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1479 };
1480 // A broadcast of a load can be cheaper on some targets.
1481 if (R.TTI->isLegalBroadcastLoad(ElementTy: V1->getType(),
1482 NumElements: ElementCount::getFixed(MinVal: NumLanes)) &&
1483 ((int)V1->getNumUses() == NumLanes ||
1484 AllUsersAreInternal(V1, V2)))
1485 return LookAheadHeuristics::ScoreSplatLoads;
1486 }
1487 return LookAheadHeuristics::ScoreSplat;
1488 }
1489
1490 auto CheckSameEntryOrFail = [&]() {
1491 if (const TreeEntry *TE1 = R.getTreeEntry(V: V1);
1492 TE1 && TE1 == R.getTreeEntry(V: V2))
1493 return LookAheadHeuristics::ScoreSplatLoads;
1494 return LookAheadHeuristics::ScoreFail;
1495 };
1496
1497 auto *LI1 = dyn_cast<LoadInst>(Val: V1);
1498 auto *LI2 = dyn_cast<LoadInst>(Val: V2);
1499 if (LI1 && LI2) {
1500 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1501 !LI2->isSimple())
1502 return CheckSameEntryOrFail();
1503
1504 std::optional<int> Dist = getPointersDiff(
1505 ElemTyA: LI1->getType(), PtrA: LI1->getPointerOperand(), ElemTyB: LI2->getType(),
1506 PtrB: LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1507 if (!Dist || *Dist == 0) {
1508 if (getUnderlyingObject(V: LI1->getPointerOperand()) ==
1509 getUnderlyingObject(V: LI2->getPointerOperand()) &&
1510 R.TTI->isLegalMaskedGather(
1511 DataType: getWidenedType(ScalarTy: LI1->getType(), VF: NumLanes), Alignment: LI1->getAlign()))
1512 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1513 return CheckSameEntryOrFail();
1514 }
1515 // The distance is too large; it may still be profitable to use masked
1516 // loads/gathers.
1517 if (std::abs(x: *Dist) > NumLanes / 2)
1518 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1519 // This will still detect consecutive loads, but we might have "holes"
1520 // in some cases. That is fine for non-power-of-2 vectorization and may
1521 // produce better results. It should not affect current vectorization.
1522 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1523 : LookAheadHeuristics::ScoreReversedLoads;
1524 }
1525
1526 auto *C1 = dyn_cast<Constant>(Val: V1);
1527 auto *C2 = dyn_cast<Constant>(Val: V2);
1528 if (C1 && C2)
1529 return LookAheadHeuristics::ScoreConstants;
1530
1531 // Extracts from consecutive indices of the same vector get a better
1532 // score, as the extracts could be optimized away.
1533 Value *EV1;
1534 ConstantInt *Ex1Idx;
1535 if (match(V: V1, P: m_ExtractElt(Val: m_Value(V&: EV1), Idx: m_ConstantInt(CI&: Ex1Idx)))) {
1536 // Undefs are always profitable for extractelements.
1537 // Compiler can easily combine poison and extractelement <non-poison> or
1538 // undef and extractelement <poison>. But combining undef +
1539 // extractelement <non-poison-but-may-produce-poison> requires some
1540 // extra operations.
1541 if (isa<UndefValue>(Val: V2))
1542 return (isa<PoisonValue>(Val: V2) || isUndefVector(V: EV1).all())
1543 ? LookAheadHeuristics::ScoreConsecutiveExtracts
1544 : LookAheadHeuristics::ScoreSameOpcode;
1545 Value *EV2 = nullptr;
1546 ConstantInt *Ex2Idx = nullptr;
1547 if (match(V: V2,
1548 P: m_ExtractElt(Val: m_Value(V&: EV2), Idx: m_CombineOr(L: m_ConstantInt(CI&: Ex2Idx),
1549 R: m_Undef())))) {
1550 // Undefs are always profitable for extractelements.
1551 if (!Ex2Idx)
1552 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1553 if (isUndefVector(V: EV2).all() && EV2->getType() == EV1->getType())
1554 return LookAheadHeuristics::ScoreConsecutiveExtracts;
1555 if (EV2 == EV1) {
1556 int Idx1 = Ex1Idx->getZExtValue();
1557 int Idx2 = Ex2Idx->getZExtValue();
1558 int Dist = Idx2 - Idx1;
1559 // The distance is too large; it may still be profitable to use
1560 // shuffles.
1561 if (std::abs(x: Dist) == 0)
1562 return LookAheadHeuristics::ScoreSplat;
1563 if (std::abs(x: Dist) > NumLanes / 2)
1564 return LookAheadHeuristics::ScoreSameOpcode;
1565 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
1566 : LookAheadHeuristics::ScoreReversedExtracts;
1567 }
1568 return LookAheadHeuristics::ScoreAltOpcodes;
1569 }
1570 return CheckSameEntryOrFail();
1571 }
1572
1573 auto *I1 = dyn_cast<Instruction>(Val: V1);
1574 auto *I2 = dyn_cast<Instruction>(Val: V2);
1575 if (I1 && I2) {
1576 if (I1->getParent() != I2->getParent())
1577 return CheckSameEntryOrFail();
1578 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1579 Ops.push_back(Elt: I1);
1580 Ops.push_back(Elt: I2);
1581 InstructionsState S = getSameOpcode(VL: Ops, TLI);
1582 // Note: Only consider instructions with <= 2 operands to avoid
1583 // complexity explosion.
1584 if (S.getOpcode() &&
1585 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1586 !S.isAltShuffle()) &&
1587 all_of(Range&: Ops, P: [&S](Value *V) {
1588 return cast<Instruction>(Val: V)->getNumOperands() ==
1589 S.MainOp->getNumOperands();
1590 }))
1591 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1592 : LookAheadHeuristics::ScoreSameOpcode;
1593 }
1594
1595 if (isa<UndefValue>(Val: V2))
1596 return LookAheadHeuristics::ScoreUndef;
1597
1598 return CheckSameEntryOrFail();
1599 }
1600
1601 /// Go through the operands of \p LHS and \p RHS recursively until
1602 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1603 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1604 /// of \p U1 and \p U2), except at the beginning of the recursion where
1605 /// these are set to nullptr.
1606 ///
1607 /// For example:
1608 /// \verbatim
1609 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1610 /// \ / \ / \ / \ /
1611 /// + + + +
1612 /// G1 G2 G3 G4
1613 /// \endverbatim
1614 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1615 /// each level recursively, accumulating the score. It starts from matching
1616 /// the additions at level 0, then moves on to the loads (level 1). The
1617 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1618 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1619 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1620 /// Please note that the order of the operands does not matter, as we
1621 /// evaluate the score of all profitable combinations of operands. In
1622 /// other words the score of G1 and G4 is the same as G1 and G2. This
1623 /// heuristic is based on ideas described in:
1624 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1625 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1626 /// Luís F. W. Góes
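/// For illustration, assuming MaxLevel allows recursing into the loads:
/// getScoreAtLevelRec(G1, G2) accumulates ScoreSameOpcode (2) for the two
/// additions plus ScoreConsecutiveLoads (4) for each of {A[0],A[1]} and
/// {B[0],B[1]}, i.e. 10 in total, whereas getScoreAtLevelRec(G1, G3) only
/// scores the matching additions (2).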
1627 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1628 Instruction *U2, int CurrLevel,
1629 ArrayRef<Value *> MainAltOps) const {
1630
1631 // Get the shallow score of V1 and V2.
1632 int ShallowScoreAtThisLevel =
1633 getShallowScore(V1: LHS, V2: RHS, U1, U2, MainAltOps);
1634
1635 // If reached MaxLevel,
1636 // or if V1 and V2 are not instructions,
1637 // or if they are SPLAT,
1638 // or if they are not consecutive,
1639 // or if profitable to vectorize loads or extractelements, early return
1640 // the current cost.
1641 auto *I1 = dyn_cast<Instruction>(Val: LHS);
1642 auto *I2 = dyn_cast<Instruction>(Val: RHS);
1643 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1644 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1645 (((isa<LoadInst>(Val: I1) && isa<LoadInst>(Val: I2)) ||
1646 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1647 (isa<ExtractElementInst>(Val: I1) && isa<ExtractElementInst>(Val: I2))) &&
1648 ShallowScoreAtThisLevel))
1649 return ShallowScoreAtThisLevel;
1650 assert(I1 && I2 && "Should have early exited.");
1651
1652 // Contains the I2 operand indexes that got matched with I1 operands.
1653 SmallSet<unsigned, 4> Op2Used;
1654
1655 // Recursion towards the operands of I1 and I2. We are trying all possible
1656 // operand pairs, and keeping track of the best score.
1657 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1658 OpIdx1 != NumOperands1; ++OpIdx1) {
1659 // Try to pair op1I with the best operand of I2.
1660 int MaxTmpScore = 0;
1661 unsigned MaxOpIdx2 = 0;
1662 bool FoundBest = false;
1663 // If I2 is commutative try all combinations.
1664 unsigned FromIdx = isCommutative(I: I2) ? 0 : OpIdx1;
1665 unsigned ToIdx = isCommutative(I: I2)
1666 ? I2->getNumOperands()
1667 : std::min(a: I2->getNumOperands(), b: OpIdx1 + 1);
1668 assert(FromIdx <= ToIdx && "Bad index");
1669 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1670 // Skip operands already paired with OpIdx1.
1671 if (Op2Used.count(V: OpIdx2))
1672 continue;
1673 // Recursively calculate the cost at each level
1674 int TmpScore =
1675 getScoreAtLevelRec(LHS: I1->getOperand(i: OpIdx1), RHS: I2->getOperand(i: OpIdx2),
1676 U1: I1, U2: I2, CurrLevel: CurrLevel + 1, MainAltOps: std::nullopt);
1677 // Look for the best score.
1678 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1679 TmpScore > MaxTmpScore) {
1680 MaxTmpScore = TmpScore;
1681 MaxOpIdx2 = OpIdx2;
1682 FoundBest = true;
1683 }
1684 }
1685 if (FoundBest) {
1686 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1687 Op2Used.insert(V: MaxOpIdx2);
1688 ShallowScoreAtThisLevel += MaxTmpScore;
1689 }
1690 }
1691 return ShallowScoreAtThisLevel;
1692 }
1693 };
1694 /// A helper data structure to hold the operands of a vector of instructions.
1695 /// This supports a fixed vector length for all operand vectors.
1696 class VLOperands {
1697 /// For each operand we need (i) the value, and (ii) the opcode that it
1698 /// would be attached to if the expression was in a left-linearized form.
1699 /// This is required to avoid illegal operand reordering.
1700 /// For example:
1701 /// \verbatim
1702 /// 0 Op1
1703 /// |/
1704 /// Op1 Op2 Linearized + Op2
1705 /// \ / ----------> |/
1706 /// - -
1707 ///
1708 /// Op1 - Op2 (0 + Op1) - Op2
1709 /// \endverbatim
1710 ///
1711 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1712 ///
1713 /// Another way to think of this is to track all the operations across the
1714 /// path from the operand all the way to the root of the tree and to
1715 /// calculate the operation that corresponds to this path. For example, the
1716 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1717 /// corresponding operation is a '-' (which matches the one in the
1718 /// linearized tree, as shown above).
1719 ///
1720 /// For lack of a better term, we refer to this operation as Accumulated
1721 /// Path Operation (APO).
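/// For example, for the two-lane bundle {X0 + Y0, X1 - Y1} the stored APO
/// bits are:
/// \verbatim
///            Operand 0    Operand 1
/// Lane 0:    X0 (false)   Y0 (false)   // '+' is not an inverse operation
/// Lane 1:    X1 (false)   Y1 (true)    // RHS of '-' is an inverse operation
/// \endverbatim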
1722 struct OperandData {
1723 OperandData() = default;
1724 OperandData(Value *V, bool APO, bool IsUsed)
1725 : V(V), APO(APO), IsUsed(IsUsed) {}
1726 /// The operand value.
1727 Value *V = nullptr;
1728 /// TreeEntries only allow a single opcode, or an alternate sequence of
1729 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
1730 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1731 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1732 /// (e.g., Add/Mul)
1733 bool APO = false;
1734 /// Helper data for the reordering function.
1735 bool IsUsed = false;
1736 };
1737
1738 /// During operand reordering, we are trying to select the operand at lane
1739 /// that matches best with the operand at the neighboring lane. Our
1740 /// selection is based on the type of value we are looking for. For example,
1741 /// if the neighboring lane has a load, we need to look for a load that is
1742 /// accessing a consecutive address. These strategies are summarized in the
1743 /// 'ReorderingMode' enumerator.
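/// For illustration: if the lane we start from has operands {load %p, 42},
/// the modes for operand indices 0 and 1 become Load and Constant; an
/// instruction operand would instead get Opcode, or Splat if it is unlikely
/// to match anything in the other lanes.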
1744 enum class ReorderingMode {
1745 Load, ///< Matching loads to consecutive memory addresses
1746 Opcode, ///< Matching instructions based on opcode (same or alternate)
1747 Constant, ///< Matching constants
1748 Splat, ///< Matching the same instruction multiple times (broadcast)
1749 Failed, ///< We failed to create a vectorizable group
1750 };
1751
1752 using OperandDataVec = SmallVector<OperandData, 2>;
1753
1754 /// A vector of operand vectors.
1755 SmallVector<OperandDataVec, 4> OpsVec;
1756
1757 const TargetLibraryInfo &TLI;
1758 const DataLayout &DL;
1759 ScalarEvolution &SE;
1760 const BoUpSLP &R;
1761 const Loop *L = nullptr;
1762
1763 /// \returns the operand data at \p OpIdx and \p Lane.
1764 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1765 return OpsVec[OpIdx][Lane];
1766 }
1767
1768 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1769 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1770 return OpsVec[OpIdx][Lane];
1771 }
1772
1773 /// Clears the used flag for all entries.
1774 void clearUsed() {
1775 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1776 OpIdx != NumOperands; ++OpIdx)
1777 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1778 ++Lane)
1779 OpsVec[OpIdx][Lane].IsUsed = false;
1780 }
1781
1782 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1783 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1784 std::swap(a&: OpsVec[OpIdx1][Lane], b&: OpsVec[OpIdx2][Lane]);
1785 }
1786
1787 /// \param Lane lane of the operands under analysis.
1788 /// \param OpIdx operand index in lane \p Lane for which we are looking
1789 /// for the best candidate.
1790 /// \param Idx operand index of the current candidate value.
1791 /// \returns The additional score due to possible broadcasting of the
1792 /// elements in the lane. It is more profitable to have a power-of-2 number
1793 /// of unique elements in the lane, as it will be vectorized with higher
1794 /// probability after removing duplicates. Currently the SLP vectorizer
1795 /// supports only vectorization of a power-of-2 number of unique scalars.
1796 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1797 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
1798 if (!isa<Instruction>(Val: IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1799 return 0;
1800 SmallPtrSet<Value *, 4> Uniques;
1801 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1802 if (Ln == Lane)
1803 continue;
1804 Value *OpIdxLnV = getData(OpIdx, Lane: Ln).V;
1805 if (!isa<Instruction>(Val: OpIdxLnV))
1806 return 0;
1807 Uniques.insert(Ptr: OpIdxLnV);
1808 }
1809 int UniquesCount = Uniques.size();
1810 int UniquesCntWithIdxLaneV =
1811 Uniques.contains(Ptr: IdxLaneV) ? UniquesCount : UniquesCount + 1;
1812 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1813 int UniquesCntWithOpIdxLaneV =
1814 Uniques.contains(Ptr: OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1815 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1816 return 0;
1817 return (PowerOf2Ceil(A: UniquesCntWithOpIdxLaneV) -
1818 UniquesCntWithOpIdxLaneV) -
1819 (PowerOf2Ceil(A: UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1820 }
1821
1822 /// \param Lane lane of the operands under analysis.
1823 /// \param OpIdx operand index in lane \p Lane for which we are looking
1824 /// for the best candidate.
1825 /// \param Idx operand index of the current candidate value.
1826 /// \returns The additional score for the scalar which users are all
1827 /// vectorized.
1828 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1829 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
1830 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1831 // Do not care about the number of uses of vector-like instructions
1832 // (extractelement/extractvalue with constant indices); they are extracts
1833 // themselves and already externally used. Vectorization of such
1834 // instructions does not add an extra extractelement instruction, it may
1835 // only remove one.
1836 if (isVectorLikeInstWithConstOps(V: IdxLaneV) &&
1837 isVectorLikeInstWithConstOps(V: OpIdxLaneV))
1838 return LookAheadHeuristics::ScoreAllUserVectorized;
1839 auto *IdxLaneI = dyn_cast<Instruction>(Val: IdxLaneV);
1840 if (!IdxLaneI || !isa<Instruction>(Val: OpIdxLaneV))
1841 return 0;
1842 return R.areAllUsersVectorized(I: IdxLaneI)
1843 ? LookAheadHeuristics::ScoreAllUserVectorized
1844 : 0;
1845 }
1846
1847 /// Score scaling factor for fully compatible instructions but with
1848 /// different number of external uses. Allows better selection of the
1849 /// instructions with less external uses.
1850 static const int ScoreScaleFactor = 10;
1851
1852 /// \Returns the look-ahead score, which tells us how much the sub-trees
1853 /// rooted at \p LHS and \p RHS match; the more they match, the higher the
1854 /// score. This helps break ties in an informed way when we cannot decide on
1855 /// the order of the operands by just considering the immediate
1856 /// predecessors.
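/// For illustration: a raw look-ahead score of 10 with a splat adjustment of
/// +1 becomes (10 + 1) * ScoreScaleFactor = 110, plus at most 1 extra point
/// from getExternalUseScore(), so the external-use bonus only breaks ties
/// between otherwise equally scored operands.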
1857 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1858 int Lane, unsigned OpIdx, unsigned Idx,
1859 bool &IsUsed) {
1860 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1861 LookAheadMaxDepth);
1862 // Keep track of the instruction stack as we recurse into the operands
1863 // during the look-ahead score exploration.
1864 int Score =
1865 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1866 /*CurrLevel=*/1, MainAltOps);
1867 if (Score) {
1868 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1869 if (Score <= -SplatScore) {
1870 // Set the minimum score for splat-like sequence to avoid setting
1871 // failed state.
1872 Score = 1;
1873 } else {
1874 Score += SplatScore;
1875 // Scale score to see the difference between different operands
1876 // and similar operands but all vectorized/not all vectorized
1877 // uses. It does not affect actual selection of the best
1878 // compatible operand in general, just allows to select the
1879 // operand with all vectorized uses.
1880 Score *= ScoreScaleFactor;
1881 Score += getExternalUseScore(Lane, OpIdx, Idx);
1882 IsUsed = true;
1883 }
1884 }
1885 return Score;
1886 }
1887
1888 /// Best defined scores per lanes between the passes. Used to choose the
1889 /// best operand (with the highest score) between the passes.
1890 /// The key - {Operand Index, Lane}.
1891 /// The value - the best score between the passes for the lane and the
1892 /// operand.
1893 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1894 BestScoresPerLanes;
1895
1896 // Search all operands in Ops[*][Lane] for the one that best matches
1897 // Ops[OpIdx][LastLane] and return its operand index.
1898 // If no good match can be found, return std::nullopt.
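// For illustration: if Ops[OpIdx][LastLane] is 'a', and at Lane the
// candidates are Ops[0][Lane] = 'c' and Ops[1][Lane] = 'b', then index 1 is
// returned when the look-ahead score of (a, b) beats that of (a, c), so that
// 'b' can later be swapped into position OpIdx.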
1899 std::optional<unsigned>
1900 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1901 ArrayRef<ReorderingMode> ReorderingModes,
1902 ArrayRef<Value *> MainAltOps) {
1903 unsigned NumOperands = getNumOperands();
1904
1905 // The operand of the previous lane at OpIdx.
1906 Value *OpLastLane = getData(OpIdx, Lane: LastLane).V;
1907
1908 // Our strategy mode for OpIdx.
1909 ReorderingMode RMode = ReorderingModes[OpIdx];
1910 if (RMode == ReorderingMode::Failed)
1911 return std::nullopt;
1912
1913 // The linearized opcode of the operand at OpIdx, Lane.
1914 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1915
1916 // The best operand index and its score.
1917 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1918 // are using the score to differentiate between the two.
1919 struct BestOpData {
1920 std::optional<unsigned> Idx;
1921 unsigned Score = 0;
1922 } BestOp;
1923 BestOp.Score =
1924 BestScoresPerLanes.try_emplace(Key: std::make_pair(x&: OpIdx, y&: Lane), Args: 0)
1925 .first->second;
1926
1927 // Track if the operand must be marked as used. If the operand is
1928 // explicitly set to score 1 (because of a non-power-of-2 number of unique
1929 // scalars), we may want to re-estimate the operands in later iterations.
1930 bool IsUsed = RMode == ReorderingMode::Splat ||
1931 RMode == ReorderingMode::Constant ||
1932 RMode == ReorderingMode::Load;
1933 // Iterate through all unused operands and look for the best.
1934 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1935 // Get the operand at Idx and Lane.
1936 OperandData &OpData = getData(OpIdx: Idx, Lane);
1937 Value *Op = OpData.V;
1938 bool OpAPO = OpData.APO;
1939
1940 // Skip already selected operands.
1941 if (OpData.IsUsed)
1942 continue;
1943
1944 // Skip if we are trying to move the operand to a position with a
1945 // different opcode in the linearized tree form. This would break the
1946 // semantics.
1947 if (OpAPO != OpIdxAPO)
1948 continue;
1949
1950 // Look for an operand that matches the current mode.
1951 switch (RMode) {
1952 case ReorderingMode::Load:
1953 case ReorderingMode::Opcode: {
1954 bool LeftToRight = Lane > LastLane;
1955 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1956 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1957 int Score = getLookAheadScore(LHS: OpLeft, RHS: OpRight, MainAltOps, Lane,
1958 OpIdx, Idx, IsUsed);
1959 if (Score > static_cast<int>(BestOp.Score) ||
1960 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
1961 Idx == OpIdx)) {
1962 BestOp.Idx = Idx;
1963 BestOp.Score = Score;
1964 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] = Score;
1965 }
1966 break;
1967 }
1968 case ReorderingMode::Constant:
1969 if (isa<Constant>(Val: Op) ||
1970 (!BestOp.Score && L && L->isLoopInvariant(V: Op))) {
1971 BestOp.Idx = Idx;
1972 if (isa<Constant>(Val: Op)) {
1973 BestOp.Score = LookAheadHeuristics::ScoreConstants;
1974 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
1975 LookAheadHeuristics::ScoreConstants;
1976 }
1977 if (isa<UndefValue>(Val: Op) || !isa<Constant>(Val: Op))
1978 IsUsed = false;
1979 }
1980 break;
1981 case ReorderingMode::Splat:
1982 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Val: Op))) {
1983 IsUsed = Op == OpLastLane;
1984 if (Op == OpLastLane) {
1985 BestOp.Score = LookAheadHeuristics::ScoreSplat;
1986 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
1987 LookAheadHeuristics::ScoreSplat;
1988 }
1989 BestOp.Idx = Idx;
1990 }
1991 break;
1992 case ReorderingMode::Failed:
1993 llvm_unreachable("Not expected Failed reordering mode.");
1994 }
1995 }
1996
1997 if (BestOp.Idx) {
1998 getData(OpIdx: *BestOp.Idx, Lane).IsUsed = IsUsed;
1999 return BestOp.Idx;
2000 }
2001 // If we could not find a good match return std::nullopt.
2002 return std::nullopt;
2003 }
2004
2005 /// Helper for reorderOperandVecs.
2006 /// \returns the lane that we should start reordering from. This is the one
2007 /// that has the fewest operands that can freely move about, or that is the
2008 /// least profitable to reorder because it already has the optimal set of operands.
2009 unsigned getBestLaneToStartReordering() const {
2010 unsigned Min = UINT_MAX;
2011 unsigned SameOpNumber = 0;
2012 // std::pair<unsigned, unsigned> is used to implement a simple voting
2013 // algorithm and choose either the lane with the fewest operands that can
2014 // freely move about, or the lane that is the least profitable to reorder
2015 // because it already has the optimal set of operands. The first unsigned
2016 // is a counter for voting, the second unsigned counts the lanes whose
2017 // instructions have same/alternate opcodes and the same parent basic block.
2018 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
2019 // Try to stay closer to the original results if we have multiple lanes
2020 // with the same cost. If two lanes have the same cost, use the one with
2021 // the lowest index.
2022 for (int I = getNumLanes(); I > 0; --I) {
2023 unsigned Lane = I - 1;
2024 OperandsOrderData NumFreeOpsHash =
2025 getMaxNumOperandsThatCanBeReordered(Lane);
2026 // Compare the number of operands that can move and choose the one with
2027 // the least number.
2028 if (NumFreeOpsHash.NumOfAPOs < Min) {
2029 Min = NumFreeOpsHash.NumOfAPOs;
2030 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2031 HashMap.clear();
2032 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
2033 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2034 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2035 // Select the most optimal lane in terms of number of operands that
2036 // should be moved around.
2037 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2038 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
2039 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2040 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2041 auto *It = HashMap.find(Key: NumFreeOpsHash.Hash);
2042 if (It == HashMap.end())
2043 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
2044 else
2045 ++It->second.first;
2046 }
2047 }
2048 // Select the lane with the minimum counter.
2049 unsigned BestLane = 0;
2050 unsigned CntMin = UINT_MAX;
2051 for (const auto &Data : reverse(C&: HashMap)) {
2052 if (Data.second.first < CntMin) {
2053 CntMin = Data.second.first;
2054 BestLane = Data.second.second;
2055 }
2056 }
2057 return BestLane;
2058 }
2059
2060 /// Data structure that helps to reorder operands.
2061 struct OperandsOrderData {
2062 /// The best number of operands with the same APOs, which can be
2063 /// reordered.
2064 unsigned NumOfAPOs = UINT_MAX;
2065 /// Number of operands with the same/alternate instruction opcode and
2066 /// parent.
2067 unsigned NumOpsWithSameOpcodeParent = 0;
2068 /// Hash for the actual operands ordering.
2069 /// Used to count operands, actually their position id and opcode
2070 /// value. It is used in the voting mechanism to find the lane with the
2071 /// fewest operands that can freely move about, or that is the least
2072 /// profitable to reorder because it already has the optimal set of operands.
2073 /// Could be replaced with a SmallVector<unsigned> instead, but a hash code
2074 /// is faster and requires less memory.
2075 unsigned Hash = 0;
2076 };
2077 /// \returns the maximum number of operands that are allowed to be reordered
2078 /// for \p Lane and the number of compatible instructions (with the same
2079 /// parent/opcode). This is used as a heuristic for selecting the first lane
2080 /// to start operand reordering.
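/// For example, a lane 'a + b' has both operands attached to a non-inverse
/// operation (two equal APOs), so NumOfAPOs = 2, while a lane 'a - b' has one
/// of each, so NumOfAPOs = max(1, 1) = 1 and it is preferred as the starting
/// lane, because its operands cannot be freely swapped.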
2081 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2082 unsigned CntTrue = 0;
2083 unsigned NumOperands = getNumOperands();
2084 // Operands with the same APO can be reordered. We therefore need to count
2085 // how many of them we have for each APO, like this: Cnt[APO] = x.
2086 // Since we only have two APOs, namely true and false, we can avoid using
2087 // a map. Instead we can simply count the number of operands that
2088 // correspond to one of them (in this case the 'true' APO), and calculate
2089 // the other by subtracting it from the total number of operands.
2090 // Operands with the same instruction opcode and parent are more
2091 // profitable since we don't need to move them in many cases, with a high
2092 // probability such lane already can be vectorized effectively.
2093 bool AllUndefs = true;
2094 unsigned NumOpsWithSameOpcodeParent = 0;
2095 Instruction *OpcodeI = nullptr;
2096 BasicBlock *Parent = nullptr;
2097 unsigned Hash = 0;
2098 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2099 const OperandData &OpData = getData(OpIdx, Lane);
2100 if (OpData.APO)
2101 ++CntTrue;
2102 // Use Boyer-Moore majority voting for finding the majority opcode and
2103 // the number of times it occurs.
2104 if (auto *I = dyn_cast<Instruction>(Val: OpData.V)) {
2105 if (!OpcodeI || !getSameOpcode(VL: {OpcodeI, I}, TLI).getOpcode() ||
2106 I->getParent() != Parent) {
2107 if (NumOpsWithSameOpcodeParent == 0) {
2108 NumOpsWithSameOpcodeParent = 1;
2109 OpcodeI = I;
2110 Parent = I->getParent();
2111 } else {
2112 --NumOpsWithSameOpcodeParent;
2113 }
2114 } else {
2115 ++NumOpsWithSameOpcodeParent;
2116 }
2117 }
2118 Hash = hash_combine(
2119 args: Hash, args: hash_value(value: (OpIdx + 1) * (OpData.V->getValueID() + 1)));
2120 AllUndefs = AllUndefs && isa<UndefValue>(Val: OpData.V);
2121 }
2122 if (AllUndefs)
2123 return {};
2124 OperandsOrderData Data;
2125 Data.NumOfAPOs = std::max(a: CntTrue, b: NumOperands - CntTrue);
2126 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2127 Data.Hash = Hash;
2128 return Data;
2129 }
2130
2131 /// Go through the instructions in VL and append their operands.
2132 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2133 assert(!VL.empty() && "Bad VL");
2134 assert((empty() || VL.size() == getNumLanes()) &&
2135 "Expected same number of lanes");
2136 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2137 unsigned NumOperands = cast<Instruction>(Val: VL[0])->getNumOperands();
2138 constexpr unsigned IntrinsicNumOperands = 2;
2139 if (isa<IntrinsicInst>(Val: VL[0]))
2140 NumOperands = IntrinsicNumOperands;
2141 OpsVec.resize(N: NumOperands);
2142 unsigned NumLanes = VL.size();
2143 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2144 OpsVec[OpIdx].resize(N: NumLanes);
2145 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2146 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2147 // Our tree has just 3 nodes: the root and two operands.
2148 // It is therefore trivial to get the APO. We only need to check the
2149 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2150 // RHS operand. The LHS operand of both add and sub is never attached
2151 // to an inverse operation in the linearized form, therefore its APO
2152 // is false. The RHS's APO is true only if VL[Lane] is an inverse operation.
2153
2154 // Since operand reordering is performed on groups of commutative
2155 // operations or alternating sequences (e.g., +, -), we can safely
2156 // tell the inverse operations by checking commutativity.
2157 bool IsInverseOperation = !isCommutative(I: cast<Instruction>(Val: VL[Lane]));
2158 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2159 OpsVec[OpIdx][Lane] = {cast<Instruction>(Val: VL[Lane])->getOperand(i: OpIdx),
2160 APO, false};
2161 }
2162 }
2163 }
2164
2165 /// \returns the number of operands.
2166 unsigned getNumOperands() const { return OpsVec.size(); }
2167
2168 /// \returns the number of lanes.
2169 unsigned getNumLanes() const { return OpsVec[0].size(); }
2170
2171 /// \returns the operand value at \p OpIdx and \p Lane.
2172 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2173 return getData(OpIdx, Lane).V;
2174 }
2175
2176 /// \returns true if the data structure is empty.
2177 bool empty() const { return OpsVec.empty(); }
2178
2179 /// Clears the data.
2180 void clear() { OpsVec.clear(); }
2181
2182 /// \Returns true if there are enough operands identical to \p Op to fill
2183 /// the whole vector, possibly mixed with constants or loop-invariant values.
2184 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
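/// For example, for Op == %x at lane 0 of the bundle
/// {%x + 1, 2 + %x, %x + 3, 4 + %x}, every other lane contains %x in one of
/// its operand slots, so the whole operand vector can be built as a broadcast
/// of %x blended with a vector of constants.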
2185 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2186 bool OpAPO = getData(OpIdx, Lane).APO;
2187 bool IsInvariant = L && L->isLoopInvariant(V: Op);
2188 unsigned Cnt = 0;
2189 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2190 if (Ln == Lane)
2191 continue;
2192 // This is set to true if we found a candidate for broadcast at Lane.
2193 bool FoundCandidate = false;
2194 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2195 OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
2196 if (Data.APO != OpAPO || Data.IsUsed)
2197 continue;
2198 Value *OpILane = getValue(OpIdx: OpI, Lane);
2199 bool IsConstantOp = isa<Constant>(Val: OpILane);
2200 // Consider the broadcast candidate if:
2201 // 1. Same value is found in one of the operands.
2202 if (Data.V == Op ||
2203 // 2. The operand in the given lane is not constant but there is a
2204 // constant operand in another lane (which can be moved to the
2205 // given lane). In this case we can represent it as a simple
2206 // permutation of constant and broadcast.
2207 (!IsConstantOp &&
2208 ((Lns > 2 && isa<Constant>(Val: Data.V)) ||
2209 // 2.1. If we have only 2 lanes, we need to check that the value in
2210 // the next lane does not build the same opcode sequence.
2211 (Lns == 2 &&
2212 !getSameOpcode(VL: {Op, getValue(OpIdx: (OpI + 1) % OpE, Lane: Ln)}, TLI)
2213 .getOpcode() &&
2214 isa<Constant>(Val: Data.V)))) ||
2215 // 3. The operand in the current lane is loop invariant (can be
2216 // hoisted out) and another operand is also a loop invariant
2217 // (though not a constant). In this case the whole vector can be
2218 // hoisted out.
2219 // FIXME: need to teach the cost model about this case for better
2220 // estimation.
2221 (IsInvariant && !isa<Constant>(Val: Data.V) &&
2222 !getSameOpcode(VL: {Op, Data.V}, TLI).getOpcode() &&
2223 L->isLoopInvariant(V: Data.V))) {
2224 FoundCandidate = true;
2225 Data.IsUsed = Data.V == Op;
2226 if (Data.V == Op)
2227 ++Cnt;
2228 break;
2229 }
2230 }
2231 if (!FoundCandidate)
2232 return false;
2233 }
2234 return getNumLanes() == 2 || Cnt > 1;
2235 }
2236
2237 /// Checks if there is at least single compatible operand in lanes other
2238 /// than \p Lane, compatible with the operand \p Op.
2239 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2240 bool OpAPO = getData(OpIdx, Lane).APO;
2241 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2242 if (Ln == Lane)
2243 continue;
2244 if (any_of(Range: seq<unsigned>(Size: getNumOperands()), P: [&](unsigned OpI) {
2245 const OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
2246 if (Data.APO != OpAPO || Data.IsUsed)
2247 return true;
2248 Value *OpILn = getValue(OpIdx: OpI, Lane: Ln);
2249 return (L && L->isLoopInvariant(V: OpILn)) ||
2250 (getSameOpcode(VL: {Op, OpILn}, TLI).getOpcode() &&
2251 Op->getParent() == cast<Instruction>(Val: OpILn)->getParent());
2252 }))
2253 return true;
2254 }
2255 return false;
2256 }
2257
2258 public:
2259 /// Initialize with all the operands of the instruction vector \p RootVL.
2260 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2261 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2262 L(R.LI->getLoopFor(
2263 BB: (cast<Instruction>(Val: RootVL.front())->getParent()))) {
2264 // Append all the operands of RootVL.
2265 appendOperandsOfVL(VL: RootVL);
2266 }
2267
2268 /// \Returns a value vector with the operands across all lanes for the
2269 /// operand at \p OpIdx.
2270 ValueList getVL(unsigned OpIdx) const {
2271 ValueList OpVL(OpsVec[OpIdx].size());
2272 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2273 "Expected same num of lanes across all operands");
2274 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2275 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2276 return OpVL;
2277 }
2278
2279 // Performs operand reordering for 2 or more operands.
2280 // The original operands are in OrigOps[OpIdx][Lane].
2281 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2282 void reorder() {
2283 unsigned NumOperands = getNumOperands();
2284 unsigned NumLanes = getNumLanes();
2285 // Each operand has its own mode. We are using this mode to help us select
2286 // the instructions for each lane, so that they match best with the ones
2287 // we have selected so far.
2288 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2289
2290 // This is a greedy single-pass algorithm. We are going over each lane
2291 // once and deciding on the best order right away with no back-tracking.
2292 // However, in order to increase its effectiveness, we start with the lane
2293 // that has operands that can move the least. For example, given the
2294 // following lanes:
2295 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2296 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2297 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2298 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2299 // we will start at Lane 1, since the operands of the subtraction cannot
2300 // be reordered. Then we will visit the rest of the lanes in a circular
2301 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2302
2303 // Find the first lane that we will start our search from.
2304 unsigned FirstLane = getBestLaneToStartReordering();
2305
2306 // Initialize the modes.
2307 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2308 Value *OpLane0 = getValue(OpIdx, Lane: FirstLane);
2309 // Keep track if we have instructions with all the same opcode on one
2310 // side.
2311 if (isa<LoadInst>(Val: OpLane0))
2312 ReorderingModes[OpIdx] = ReorderingMode::Load;
2313 else if (auto *OpILane0 = dyn_cast<Instruction>(Val: OpLane0)) {
2314 // Check if OpLane0 should be broadcast.
2315 if (shouldBroadcast(Op: OpLane0, OpIdx, Lane: FirstLane) ||
2316 !canBeVectorized(Op: OpILane0, OpIdx, Lane: FirstLane))
2317 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2318 else
2319 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2320 } else if (isa<Constant>(Val: OpLane0))
2321 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2322 else if (isa<Argument>(Val: OpLane0))
2323 // Our best hope is a Splat. It may save some cost in some cases.
2324 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2325 else
2326 // NOTE: This should be unreachable.
2327 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2328 }
2329
2330 // Check that we don't have the same operands. There is no need to reorder
2331 // if the operands are just a perfect or shuffled diamond match. The only
2332 // exceptions (for now) are possible broadcasts and a non-power-of-2 number
2333 // of scalars, for which we still reorder.
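// For illustration, a "diamond match": if all operand vectors draw from the
// same set of values, e.g. operand 0 = {a, b, c, d} and
// operand 1 = {b, a, d, c}, reordering cannot improve anything and would only
// add external-use cost for the shuffled uses.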
2334 auto &&SkipReordering = [this]() {
2335 SmallPtrSet<Value *, 4> UniqueValues;
2336 ArrayRef<OperandData> Op0 = OpsVec.front();
2337 for (const OperandData &Data : Op0)
2338 UniqueValues.insert(Ptr: Data.V);
2339 for (ArrayRef<OperandData> Op : drop_begin(RangeOrContainer&: OpsVec, N: 1)) {
2340 if (any_of(Range&: Op, P: [&UniqueValues](const OperandData &Data) {
2341 return !UniqueValues.contains(Ptr: Data.V);
2342 }))
2343 return false;
2344 }
2345 // TODO: Check if we can remove a check for non-power-2 number of
2346 // scalars after full support of non-power-2 vectorization.
2347 return UniqueValues.size() != 2 && isPowerOf2_32(Value: UniqueValues.size());
2348 };
2349
2350 // If the initial strategy fails for any of the operand indexes, then we
2351 // perform reordering again in a second pass. This helps avoid assigning
2352 // high priority to the failed strategy, and should improve reordering for
2353 // the non-failed operand indexes.
2354 for (int Pass = 0; Pass != 2; ++Pass) {
2355 // Check if there is no need to reorder the operands, since they form a
2356 // perfect or shuffled diamond match. We need to do this to avoid counting
2357 // extra external-use cost for shuffled matches, which may cause
2358 // regressions.
2359 if (SkipReordering())
2360 break;
2361 // Skip the second pass if the first pass did not fail.
2362 bool StrategyFailed = false;
2363 // Mark all operand data as free to use.
2364 clearUsed();
2365 // We keep the original operand order for the FirstLane, so reorder the
2366 // rest of the lanes. We are visiting the nodes in a circular fashion,
2367 // using FirstLane as the center point and increasing the radius
2368 // distance.
2369 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2370 for (unsigned I = 0; I < NumOperands; ++I)
2371 MainAltOps[I].push_back(Elt: getData(OpIdx: I, Lane: FirstLane).V);
2372
2373 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2374 // Visit the lane on the right and then the lane on the left.
2375 for (int Direction : {+1, -1}) {
2376 int Lane = FirstLane + Direction * Distance;
2377 if (Lane < 0 || Lane >= (int)NumLanes)
2378 continue;
2379 int LastLane = Lane - Direction;
2380 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2381 "Out of bounds");
2382 // Look for a good match for each operand.
2383 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2384 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2385 std::optional<unsigned> BestIdx = getBestOperand(
2386 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps: MainAltOps[OpIdx]);
2387 // By not selecting a value, we allow the operands that follow to
2388 // select a better matching value. We will get a non-null value in
2389 // the next run of getBestOperand().
2390 if (BestIdx) {
2391 // Swap the current operand with the one returned by
2392 // getBestOperand().
2393 swap(OpIdx1: OpIdx, OpIdx2: *BestIdx, Lane);
2394 } else {
2395 // Enable the second pass.
2396 StrategyFailed = true;
2397 }
2398 // Try to get the alternate opcode and follow it during analysis.
2399 if (MainAltOps[OpIdx].size() != 2) {
2400 OperandData &AltOp = getData(OpIdx, Lane);
2401 InstructionsState OpS =
2402 getSameOpcode(VL: {MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2403 if (OpS.getOpcode() && OpS.isAltShuffle())
2404 MainAltOps[OpIdx].push_back(Elt: AltOp.V);
2405 }
2406 }
2407 }
2408 }
2409 // Skip second pass if the strategy did not fail.
2410 if (!StrategyFailed)
2411 break;
2412 }
2413 }
2414
2415#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2416 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2417 switch (RMode) {
2418 case ReorderingMode::Load:
2419 return "Load";
2420 case ReorderingMode::Opcode:
2421 return "Opcode";
2422 case ReorderingMode::Constant:
2423 return "Constant";
2424 case ReorderingMode::Splat:
2425 return "Splat";
2426 case ReorderingMode::Failed:
2427 return "Failed";
2428 }
2429 llvm_unreachable("Unimplemented Reordering Type");
2430 }
2431
2432 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2433 raw_ostream &OS) {
2434 return OS << getModeStr(RMode);
2435 }
2436
2437 /// Debug print.
2438 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2439 printMode(RMode, dbgs());
2440 }
2441
2442 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2443 return printMode(RMode, OS);
2444 }
2445
2446 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2447 const unsigned Indent = 2;
2448 unsigned Cnt = 0;
2449 for (const OperandDataVec &OpDataVec : OpsVec) {
2450 OS << "Operand " << Cnt++ << "\n";
2451 for (const OperandData &OpData : OpDataVec) {
2452 OS.indent(Indent) << "{";
2453 if (Value *V = OpData.V)
2454 OS << *V;
2455 else
2456 OS << "null";
2457 OS << ", APO:" << OpData.APO << "}\n";
2458 }
2459 OS << "\n";
2460 }
2461 return OS;
2462 }
2463
2464 /// Debug print.
2465 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2466#endif
2467 };
2468
2469 /// Evaluate each pair in \p Candidates and return index into \p Candidates
2470 /// for the pair which has the highest score and is deemed to have the best
2471 /// chance to form the root of a profitable tree to vectorize. Return
2472 /// std::nullopt if no candidate scored above LookAheadHeuristics::ScoreFail.
2473 /// \param Limit The lower limit of the score that is considered good enough.
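/// For example, given Candidates = {(A[0], A[1]), (A[0], X)} where A[0] and
/// A[1] are consecutive loads and X is unrelated, the first pair scores
/// higher and index 0 is returned; if neither pair scores above \p Limit,
/// std::nullopt is returned.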
2474 std::optional<int>
2475 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2476 int Limit = LookAheadHeuristics::ScoreFail) const {
2477 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2478 RootLookAheadMaxDepth);
2479 int BestScore = Limit;
2480 std::optional<int> Index;
2481 for (int I : seq<int>(Begin: 0, End: Candidates.size())) {
2482 int Score = LookAhead.getScoreAtLevelRec(LHS: Candidates[I].first,
2483 RHS: Candidates[I].second,
2484 /*U1=*/nullptr, /*U2=*/nullptr,
2485 /*Level=*/CurrLevel: 1, MainAltOps: std::nullopt);
2486 if (Score > BestScore) {
2487 BestScore = Score;
2488 Index = I;
2489 }
2490 }
2491 return Index;
2492 }
2493
2494 /// Checks if the instruction is marked for deletion.
2495 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(V: I); }
2496
2497 /// Removes an instruction from its block and eventually deletes it.
2498 /// It's like Instruction::eraseFromParent() except that the actual deletion
2499 /// is delayed until BoUpSLP is destructed.
2500 void eraseInstruction(Instruction *I) {
2501 DeletedInstructions.insert(V: I);
2502 }
2503
2504 /// Remove instructions from the parent function and clear the operands of \p
2505 /// DeadVals instructions, marking trivially dead operands for deletion.
2506 template <typename T>
2507 void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2508 SmallVector<WeakTrackingVH> DeadInsts;
2509 for (T *V : DeadVals) {
2510 auto *I = cast<Instruction>(V);
2511 DeletedInstructions.insert(I);
2512 }
2513 DenseSet<Value *> Processed;
2514 for (T *V : DeadVals) {
2515 if (!V || !Processed.insert(V).second)
2516 continue;
2517 auto *I = cast<Instruction>(V);
2518 salvageDebugInfo(*I);
2519 SmallVector<const TreeEntry *> Entries;
2520 if (const TreeEntry *Entry = getTreeEntry(I)) {
2521 Entries.push_back(Elt: Entry);
2522 auto It = MultiNodeScalars.find(I);
2523 if (It != MultiNodeScalars.end())
2524 Entries.append(It->second.begin(), It->second.end());
2525 }
2526 for (Use &U : I->operands()) {
2527 if (auto *OpI = dyn_cast_if_present<Instruction>(Val: U.get());
2528 OpI && !DeletedInstructions.contains(V: OpI) && OpI->hasOneUser() &&
2529 wouldInstructionBeTriviallyDead(I: OpI, TLI) &&
2530 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2531 return Entry->VectorizedValue == OpI;
2532 })))
2533 DeadInsts.push_back(Elt: OpI);
2534 }
2535 I->dropAllReferences();
2536 }
2537 for (T *V : DeadVals) {
2538 auto *I = cast<Instruction>(V);
2539 if (!I->getParent())
2540 continue;
2541 assert((I->use_empty() || all_of(I->uses(),
2542 [&](Use &U) {
2543 return isDeleted(
2544 cast<Instruction>(U.getUser()));
2545 })) &&
2546 "trying to erase instruction with users.");
2547 I->removeFromParent();
2548 SE->forgetValue(V: I);
2549 }
2550 // Process the dead instruction list until empty.
2551 while (!DeadInsts.empty()) {
2552 Value *V = DeadInsts.pop_back_val();
2553 Instruction *VI = cast_or_null<Instruction>(Val: V);
2554 if (!VI || !VI->getParent())
2555 continue;
2556 assert(isInstructionTriviallyDead(VI, TLI) &&
2557 "Live instruction found in dead worklist!");
2558 assert(VI->use_empty() && "Instructions with uses are not dead.");
2559
2560 // Don't lose the debug info while deleting the instructions.
2561 salvageDebugInfo(I&: *VI);
2562
2563 // Null out all of the instruction's operands to see if any operand
2564 // becomes dead as we go.
2565 for (Use &OpU : VI->operands()) {
2566 Value *OpV = OpU.get();
2567 if (!OpV)
2568 continue;
2569 OpU.set(nullptr);
2570
2571 if (!OpV->use_empty())
2572 continue;
2573
2574 // If the operand is an instruction that became dead as we nulled out
2575 // the operand, and if it is 'trivially' dead, delete it in a future
2576 // loop iteration.
2577 if (auto *OpI = dyn_cast<Instruction>(Val: OpV))
2578 if (!DeletedInstructions.contains(V: OpI) &&
2579 isInstructionTriviallyDead(I: OpI, TLI))
2580 DeadInsts.push_back(Elt: OpI);
2581 }
2582
2583 VI->removeFromParent();
2584 DeletedInstructions.insert(V: VI);
2585 SE->forgetValue(V: VI);
2586 }
2587 }
2588
2589 /// Checks if the instruction was already analyzed for being a possible
2590 /// reduction root.
2591 bool isAnalyzedReductionRoot(Instruction *I) const {
2592 return AnalyzedReductionsRoots.count(Ptr: I);
2593 }
2594 /// Register given instruction as already analyzed for being a possible
2595 /// reduction root.
2596 void analyzedReductionRoot(Instruction *I) {
2597 AnalyzedReductionsRoots.insert(Ptr: I);
2598 }
2599 /// Checks if the provided list of reduced values was checked already for
2600 /// vectorization.
2601 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2602 return AnalyzedReductionVals.contains(V: hash_value(S: VL));
2603 }
2604 /// Adds the list of reduced values to list of already checked values for the
2605 /// vectorization.
2606 void analyzedReductionVals(ArrayRef<Value *> VL) {
2607 AnalyzedReductionVals.insert(V: hash_value(S: VL));
2608 }
2609 /// Clear the list of the analyzed reduction root instructions.
2610 void clearReductionData() {
2611 AnalyzedReductionsRoots.clear();
2612 AnalyzedReductionVals.clear();
2613 AnalyzedMinBWVals.clear();
2614 }
2615 /// Checks if the given value is gathered in one of the nodes.
2616 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2617 return any_of(Range: MustGather, P: [&](Value *V) { return Vals.contains(V); });
2618 }
2619 /// Checks if the given value is gathered in one of the nodes.
2620 bool isGathered(const Value *V) const {
2621 return MustGather.contains(Ptr: V);
2622 }
2623 /// Checks if the specified value was not scheduled.
2624 bool isNotScheduled(const Value *V) const {
2625 return NonScheduledFirst.contains(Ptr: V);
2626 }
2627
2628 /// Check if the value is vectorized in the tree.
2629 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2630
2631 ~BoUpSLP();
2632
2633private:
2634 /// Determine if a node \p E can be demoted to a smaller type with a
2635 /// truncation. We collect the entries that will be demoted in ToDemote.
2636 /// \param E Node for analysis
2637 /// \param ToDemote indices of the nodes to be demoted.
2638 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2639 unsigned &BitWidth,
2640 SmallVectorImpl<unsigned> &ToDemote,
2641 DenseSet<const TreeEntry *> &Visited,
2642 unsigned &MaxDepthLevel,
2643 bool &IsProfitableToDemote,
2644 bool IsTruncRoot) const;
2645
2646 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2647 /// reordering (i.e. the operands can be reordered because they have only one
2648 /// user and are reorderable).
2649 /// \param ReorderableGathers List of all gather nodes that require reordering
2650 /// (e.g., gather of extractelements or partially vectorizable loads).
2651 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2652 /// reordering, subset of \p NonVectorized.
2653 bool
2654 canReorderOperands(TreeEntry *UserTE,
2655 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2656 ArrayRef<TreeEntry *> ReorderableGathers,
2657 SmallVectorImpl<TreeEntry *> &GatherOps);
2658
2659 /// Checks if the given \p TE is a gather node with clustered reused scalars
2660 /// and reorders it per given \p Mask.
2661 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2662
2663 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2664 /// if any. If it is not vectorized (gather node), returns nullptr.
2665 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2666 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2667 TreeEntry *TE = nullptr;
2668 const auto *It = find_if(Range&: VL, P: [&](Value *V) {
2669 TE = getTreeEntry(V);
2670 if (TE && is_contained(Range&: TE->UserTreeIndices, Element: EdgeInfo(UserTE, OpIdx)))
2671 return true;
2672 auto It = MultiNodeScalars.find(Val: V);
2673 if (It != MultiNodeScalars.end()) {
2674 for (TreeEntry *E : It->second) {
2675 if (is_contained(Range&: E->UserTreeIndices, Element: EdgeInfo(UserTE, OpIdx))) {
2676 TE = E;
2677 return true;
2678 }
2679 }
2680 }
2681 return false;
2682 });
2683 if (It != VL.end()) {
2684 assert(TE->isSame(VL) && "Expected same scalars.");
2685 return TE;
2686 }
2687 return nullptr;
2688 }
2689
2690 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2691 /// if any. If it is not vectorized (gather node), returns nullptr.
2692 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2693 unsigned OpIdx) const {
2694 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2695 UserTE: const_cast<TreeEntry *>(UserTE), OpIdx);
2696 }
2697
2698 /// Checks if all users of \p I are the part of the vectorization tree.
2699 bool areAllUsersVectorized(
2700 Instruction *I,
2701 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2702
2703 /// Return information about the vector formed for the specified index
2704 /// of a vector of (the same) instruction.
2705 TargetTransformInfo::OperandValueInfo getOperandInfo(ArrayRef<Value *> Ops);
2706
2707 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2708 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2709
2710 /// \returns Cast context for the given graph node.
2711 TargetTransformInfo::CastContextHint
2712 getCastContextHint(const TreeEntry &TE) const;
2713
2714 /// \returns the cost of the vectorizable entry.
2715 InstructionCost getEntryCost(const TreeEntry *E,
2716 ArrayRef<Value *> VectorizedVals,
2717 SmallPtrSetImpl<Value *> &CheckedExtracts);
2718
2719 /// This is the recursive part of buildTree.
2720 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2721 const EdgeInfo &EI);
2722
2723 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2724 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2725 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2726 /// returns false, setting \p CurrentOrder to either an empty vector or a
2727 /// non-identity permutation that allows reusing the extract instructions.
2728 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2729 /// extract order.
2730 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2731 SmallVectorImpl<unsigned> &CurrentOrder,
2732 bool ResizeAllowed = false) const;
2733
2734 /// Vectorize a single entry in the tree.
2735 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2736 /// avoid issues with def-use order.
2737 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2738
2739 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2740 /// \p E.
2741 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2742 /// avoid issues with def-use order.
2743 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2744
2745 /// Create a new vector from a list of scalar values. Produces a sequence
2746 /// which exploits values reused across lanes, and arranges the inserts
2747 /// for ease of later optimization.
2748 template <typename BVTy, typename ResTy, typename... Args>
2749 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2750
2751 /// Create a new vector from a list of scalar values. Produces a sequence
2752 /// which exploits values reused across lanes, and arranges the inserts
2753 /// for ease of later optimization.
2754 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2755
2756 /// Returns the instruction in the bundle, which can be used as a base point
2757 /// for scheduling. Usually it is the last instruction in the bundle, except
2758 /// for the case when all operands are external (in this case, it is the first
2759 /// instruction in the list).
2760 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2761
2762 /// Tries to find extractelement instructions with constant indices from fixed
2763 /// vector type and gather such instructions into a bunch, which highly likely
2764 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2765 /// was successful, the matched scalars are replaced by poison values in \p VL
2766 /// for future analysis.
2767 std::optional<TargetTransformInfo::ShuffleKind>
2768 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2769 SmallVectorImpl<int> &Mask) const;
2770
2771 /// Tries to find extractelement instructions with constant indices from fixed
2772 /// vector type and gather such instructions into a bunch, which highly likely
2773 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2774 /// was successful, the matched scalars are replaced by poison values in \p VL
2775 /// for future analysis.
2776 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2777 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2778 SmallVectorImpl<int> &Mask,
2779 unsigned NumParts) const;
2780
2781 /// Checks if the gathered \p VL can be represented as a single register
2782 /// shuffle(s) of previous tree entries.
2783 /// \param TE Tree entry checked for permutation.
2784 /// \param VL List of scalars (a subset of the TE scalars), checked for
2785 /// permutations. Must form single-register vector.
2786 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2787 /// commands to build the mask using the original vector value, without
2788 /// relying on the potential reordering.
2789 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2790 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2791 std::optional<TargetTransformInfo::ShuffleKind>
2792 isGatherShuffledSingleRegisterEntry(
2793 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2794 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2795 bool ForOrder);
2796
2797 /// Checks if the gathered \p VL can be represented as multi-register
2798 /// shuffle(s) of previous tree entries.
2799 /// \param TE Tree entry checked for permutation.
2800 /// \param VL List of scalars (a subset of the TE scalars), checked for
2801 /// permutations.
2802 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2803 /// commands to build the mask using the original vector value, without
2804 /// relying on the potential reordering.
2805 /// \returns per-register series of ShuffleKind, if gathered values can be
2806 /// represented as shuffles of previous tree entries. \p Mask is filled with
2807 /// the shuffle mask (also on per-register base).
2808 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2809 isGatherShuffledEntry(
2810 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2811 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2812 unsigned NumParts, bool ForOrder = false);
2813
2814 /// \returns the scalarization cost for this list of values. Assuming that
2815 /// this subtree gets vectorized, we may need to extract the values from the
2816 /// roots. This method calculates the cost of extracting the values.
2817 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2818 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2819 Type *ScalarTy) const;
2820
2821 /// Set the Builder insert point to one after the last instruction in
2822 /// the bundle
2823 void setInsertPointAfterBundle(const TreeEntry *E);
2824
2825 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
2826 /// specified, the starting vector value is poison.
2827 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2828
2829 /// \returns whether the VectorizableTree is fully vectorizable and will
2830 /// be beneficial even if the tree height is tiny.
2831 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2832
2833 /// Reorder commutative or alt operands to get better probability of
2834 /// generating vectorized code.
2835 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2836 SmallVectorImpl<Value *> &Left,
2837 SmallVectorImpl<Value *> &Right,
2838 const BoUpSLP &R);
2839
2840 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2841 /// users of \p TE and collects the stores. It returns the map from the store
2842 /// pointers to the collected stores.
2843 DenseMap<Value *, SmallVector<StoreInst *>>
2844 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2845
2846 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2847 /// stores in \p StoresVec can form a vector instruction. If so it returns
2848 /// true and populates \p ReorderIndices with the shuffle indices of the
2849 /// stores when compared to the sorted vector.
2850 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2851 OrdersType &ReorderIndices) const;
2852
2853 /// Iterates through the users of \p TE, looking for scalar stores that can be
2854 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2855 /// their order and builds an order index vector for each store bundle. It
2856 /// returns all these order vectors found.
2857 /// We run this after the tree has formed, otherwise we may come across user
2858 /// instructions that are not yet in the tree.
2859 SmallVector<OrdersType, 1>
2860 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2861
2862 struct TreeEntry {
2863 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2864 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2865
2866 /// \returns Common mask for reorder indices and reused scalars.
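/// For example (illustrative), with ReorderIndices = {1, 2, 0} the inverse
/// permutation is {2, 0, 1}; the reuse mask, if present, is then composed on
/// top of that result.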
2867 SmallVector<int> getCommonMask() const {
2868 SmallVector<int> Mask;
2869 inversePermutation(ReorderIndices, Mask);
2870 ::addMask(Mask, ReuseShuffleIndices);
2871 return Mask;
2872 }
2873
2874 /// \returns true if the scalars in VL are equal to this entry.
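/// For example (illustrative): with Scalars = {A, B} and ReuseShuffleIndices
/// = {0, 1, 0, 1}, the list VL = {A, B, A, B} is considered the same, since
/// every element of VL matches Scalars at its reuse index.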
2875 bool isSame(ArrayRef<Value *> VL) const {
2876 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2877 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2878 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2879 return VL.size() == Mask.size() &&
2880 std::equal(VL.begin(), VL.end(), Mask.begin(),
2881 [Scalars](Value *V, int Idx) {
2882 return (isa<UndefValue>(V) &&
2883 Idx == PoisonMaskElem) ||
2884 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2885 });
2886 };
2887 if (!ReorderIndices.empty()) {
2888 // TODO: implement matching if the nodes are just reordered, still can
2889 // treat the vector as the same if the list of scalars matches VL
2890 // directly, without reordering.
2891 SmallVector<int> Mask;
2892 inversePermutation(ReorderIndices, Mask);
2893 if (VL.size() == Scalars.size())
2894 return IsSame(Scalars, Mask);
2895 if (VL.size() == ReuseShuffleIndices.size()) {
2896 ::addMask(Mask, ReuseShuffleIndices);
2897 return IsSame(Scalars, Mask);
2898 }
2899 return false;
2900 }
2901 return IsSame(Scalars, ReuseShuffleIndices);
2902 }
2903
2904 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2905 return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2906 UserTreeIndices.front().UserTE == UserEI.UserTE;
2907 }
2908
2909 /// \returns true if current entry has same operands as \p TE.
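/// The comparison is order-insensitive: each operand list of \p TE must match
/// some distinct, not-yet-matched operand list of this entry.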
2910 bool hasEqualOperands(const TreeEntry &TE) const {
2911 if (TE.getNumOperands() != getNumOperands())
2912 return false;
2913 SmallBitVector Used(getNumOperands());
2914 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2915 unsigned PrevCount = Used.count();
2916 for (unsigned K = 0; K < E; ++K) {
2917 if (Used.test(K))
2918 continue;
2919 if (getOperand(K) == TE.getOperand(I)) {
2920 Used.set(K);
2921 break;
2922 }
2923 }
2924 // Check if we actually found the matching operand.
2925 if (PrevCount == Used.count())
2926 return false;
2927 }
2928 return true;
2929 }
2930
2931 /// \return Final vectorization factor for the node. Defined by the total
2932 /// number of vectorized scalars, including those used several times in the
2933 /// entry and counted in the \a ReuseShuffleIndices, if any.
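/// For example (illustrative), an entry with 2 scalars and
/// ReuseShuffleIndices = {0, 1, 0, 1} has a vector factor of 4.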
2934 unsigned getVectorFactor() const {
2935 if (!ReuseShuffleIndices.empty())
2936 return ReuseShuffleIndices.size();
2937 return Scalars.size();
2938 }
2939
2940 /// Checks if the current node is a gather node.
2941 bool isGather() const { return State == NeedToGather; }
2942
2943 /// A vector of scalars.
2944 ValueList Scalars;
2945
2946 /// The Scalars are vectorized into this value. It is initialized to Null.
2947 WeakTrackingVH VectorizedValue = nullptr;
2948
2949 /// New vector phi instructions emitted for the vectorized phi nodes.
2950 PHINode *PHI = nullptr;
2951
2952 /// Do we need to gather this sequence or vectorize it
2953 /// (either with vector instruction or with scatter/gather
2954 /// intrinsics for store/load)?
2955 enum EntryState {
2956 Vectorize,
2957 ScatterVectorize,
2958 StridedVectorize,
2959 NeedToGather
2960 };
2961 EntryState State;
2962
2963 /// Does this sequence require some shuffling?
2964 SmallVector<int, 4> ReuseShuffleIndices;
2965
2966 /// Does this entry require reordering?
2967 SmallVector<unsigned, 4> ReorderIndices;
2968
2969 /// Points back to the VectorizableTree.
2970 ///
2971 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2972 /// to be a pointer and needs to be able to initialize the child iterator.
2973 /// Thus we need a reference back to the container to translate the indices
2974 /// to entries.
2975 VecTreeTy &Container;
2976
2977 /// The TreeEntry index containing the user of this entry. We can actually
2978 /// have multiple users so the data structure is not truly a tree.
2979 SmallVector<EdgeInfo, 1> UserTreeIndices;
2980
2981 /// The index of this treeEntry in VectorizableTree.
2982 int Idx = -1;
2983
2984 private:
2985 /// The operands of each instruction in each lane Operands[op_index][lane].
2986 /// Note: This helps avoid the replication of the code that performs the
2987 /// reordering of operands during buildTree_rec() and vectorizeTree().
2988 SmallVector<ValueList, 2> Operands;
2989
2990 /// The main/alternate instruction.
2991 Instruction *MainOp = nullptr;
2992 Instruction *AltOp = nullptr;
2993
2994 public:
2995 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2996 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2997 if (Operands.size() < OpIdx + 1)
2998 Operands.resize(OpIdx + 1);
2999 assert(Operands[OpIdx].empty() && "Already resized?");
3000 assert(OpVL.size() <= Scalars.size() &&
3001 "Number of operands is greater than the number of scalars.");
3002 Operands[OpIdx].resize(OpVL.size());
3003 copy(OpVL, Operands[OpIdx].begin());
3004 }
3005
3006 /// Set the operands of this bundle in their original order.
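/// For example (illustrative), for the bundle {add A0, B0; add A1, B1} this
/// produces Operands[0] = {A0, A1} and Operands[1] = {B0, B1}.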
3007 void setOperandsInOrder() {
3008 assert(Operands.empty() && "Already initialized?");
3009 auto *I0 = cast<Instruction>(Scalars[0]);
3010 Operands.resize(I0->getNumOperands());
3011 unsigned NumLanes = Scalars.size();
3012 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
3013 OpIdx != NumOperands; ++OpIdx) {
3014 Operands[OpIdx].resize(NumLanes);
3015 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
3016 auto *I = cast<Instruction>(Scalars[Lane]);
3017 assert(I->getNumOperands() == NumOperands &&
3018 "Expected same number of operands");
3019 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
3020 }
3021 }
3022 }
3023
3024 /// Reorders operands of the node to the given mask \p Mask.
3025 void reorderOperands(ArrayRef<int> Mask) {
3026 for (ValueList &Operand : Operands)
3027 reorderScalars(Operand, Mask);
3028 }
3029
3030 /// \returns the \p OpIdx operand of this TreeEntry.
3031 ValueList &getOperand(unsigned OpIdx) {
3032 assert(OpIdx < Operands.size() && "Off bounds");
3033 return Operands[OpIdx];
3034 }
3035
3036 /// \returns the \p OpIdx operand of this TreeEntry.
3037 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3038 assert(OpIdx < Operands.size() && "Off bounds");
3039 return Operands[OpIdx];
3040 }
3041
3042 /// \returns the number of operands.
3043 unsigned getNumOperands() const { return Operands.size(); }
3044
3045 /// \return the single \p OpIdx operand.
3046 Value *getSingleOperand(unsigned OpIdx) const {
3047 assert(OpIdx < Operands.size() && "Off bounds");
3048 assert(!Operands[OpIdx].empty() && "No operand available");
3049 return Operands[OpIdx][0];
3050 }
3051
3052 /// Some of the instructions in the list have alternate opcodes.
3053 bool isAltShuffle() const { return MainOp != AltOp; }
3054
3055 bool isOpcodeOrAlt(Instruction *I) const {
3056 unsigned CheckedOpcode = I->getOpcode();
3057 return (getOpcode() == CheckedOpcode ||
3058 getAltOpcode() == CheckedOpcode);
3059 }
3060
3061 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3062 /// alternate) opcode as the main operation of the entry, the key is \p Op.
3063 /// Otherwise the key is the main operation itself.
3064 Value *isOneOf(Value *Op) const {
3065 auto *I = dyn_cast<Instruction>(Op);
3066 if (I && isOpcodeOrAlt(I))
3067 return Op;
3068 return MainOp;
3069 }
3070
3071 void setOperations(const InstructionsState &S) {
3072 MainOp = S.MainOp;
3073 AltOp = S.AltOp;
3074 }
3075
3076 Instruction *getMainOp() const {
3077 return MainOp;
3078 }
3079
3080 Instruction *getAltOp() const {
3081 return AltOp;
3082 }
3083
3084 /// The main/alternate opcodes for the list of instructions.
3085 unsigned getOpcode() const {
3086 return MainOp ? MainOp->getOpcode() : 0;
3087 }
3088
3089 unsigned getAltOpcode() const {
3090 return AltOp ? AltOp->getOpcode() : 0;
3091 }
3092
3093 /// When ReuseShuffleIndices is empty it just returns the position of \p V
3094 /// within the vector of Scalars. Otherwise, it remaps via the reuse index.
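/// For example (illustrative), with ReorderIndices = {2, 3, 0, 1} a value
/// found at position 0 of Scalars is remapped to lane 2; if
/// ReuseShuffleIndices is non-empty, the first occurrence of that lane in it
/// gives the final result.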
3095 int findLaneForValue(Value *V) const {
3096 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
3097 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3098 if (!ReorderIndices.empty())
3099 FoundLane = ReorderIndices[FoundLane];
3100 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3101 if (!ReuseShuffleIndices.empty()) {
3102 FoundLane = std::distance(ReuseShuffleIndices.begin(),
3103 find(ReuseShuffleIndices, FoundLane));
3104 }
3105 return FoundLane;
3106 }
3107
3108 /// Build a shuffle mask for graph entry which represents a merge of main
3109 /// and alternate operations.
3110 void
3111 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3112 SmallVectorImpl<int> &Mask,
3113 SmallVectorImpl<Value *> *OpScalars = nullptr,
3114 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3115
3116 /// Return true if this is a non-power-of-2 node.
3117 bool isNonPowOf2Vec() const {
3118 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
3119 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3120 "Reshuffling not supported with non-power-of-2 vectors yet.");
3121 return IsNonPowerOf2;
3122 }
3123
3124#ifndef NDEBUG
3125 /// Debug printer.
3126 LLVM_DUMP_METHOD void dump() const {
3127 dbgs() << Idx << ".\n";
3128 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3129 dbgs() << "Operand " << OpI << ":\n";
3130 for (const Value *V : Operands[OpI])
3131 dbgs().indent(2) << *V << "\n";
3132 }
3133 dbgs() << "Scalars: \n";
3134 for (Value *V : Scalars)
3135 dbgs().indent(2) << *V << "\n";
3136 dbgs() << "State: ";
3137 switch (State) {
3138 case Vectorize:
3139 dbgs() << "Vectorize\n";
3140 break;
3141 case ScatterVectorize:
3142 dbgs() << "ScatterVectorize\n";
3143 break;
3144 case StridedVectorize:
3145 dbgs() << "StridedVectorize\n";
3146 break;
3147 case NeedToGather:
3148 dbgs() << "NeedToGather\n";
3149 break;
3150 }
3151 dbgs() << "MainOp: ";
3152 if (MainOp)
3153 dbgs() << *MainOp << "\n";
3154 else
3155 dbgs() << "NULL\n";
3156 dbgs() << "AltOp: ";
3157 if (AltOp)
3158 dbgs() << *AltOp << "\n";
3159 else
3160 dbgs() << "NULL\n";
3161 dbgs() << "VectorizedValue: ";
3162 if (VectorizedValue)
3163 dbgs() << *VectorizedValue << "\n";
3164 else
3165 dbgs() << "NULL\n";
3166 dbgs() << "ReuseShuffleIndices: ";
3167 if (ReuseShuffleIndices.empty())
3168 dbgs() << "Empty";
3169 else
3170 for (int ReuseIdx : ReuseShuffleIndices)
3171 dbgs() << ReuseIdx << ", ";
3172 dbgs() << "\n";
3173 dbgs() << "ReorderIndices: ";
3174 for (unsigned ReorderIdx : ReorderIndices)
3175 dbgs() << ReorderIdx << ", ";
3176 dbgs() << "\n";
3177 dbgs() << "UserTreeIndices: ";
3178 for (const auto &EInfo : UserTreeIndices)
3179 dbgs() << EInfo << ", ";
3180 dbgs() << "\n";
3181 }
3182#endif
3183 };
3184
3185#ifndef NDEBUG
3186 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3187 InstructionCost VecCost, InstructionCost ScalarCost,
3188 StringRef Banner) const {
3189 dbgs() << "SLP: " << Banner << ":\n";
3190 E->dump();
3191 dbgs() << "SLP: Costs:\n";
3192 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3193 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3194 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3195 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3196 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3197 }
3198#endif
3199
3200 /// Create a new VectorizableTree entry.
3201 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3202 std::optional<ScheduleData *> Bundle,
3203 const InstructionsState &S,
3204 const EdgeInfo &UserTreeIdx,
3205 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3206 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3207 TreeEntry::EntryState EntryState =
3208 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3209 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3210 ReuseShuffleIndices, ReorderIndices);
3211 }
3212
3213 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3214 TreeEntry::EntryState EntryState,
3215 std::optional<ScheduleData *> Bundle,
3216 const InstructionsState &S,
3217 const EdgeInfo &UserTreeIdx,
3218 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3219 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3220 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3221 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3222 "Need to vectorize gather entry?");
3223 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3224 TreeEntry *Last = VectorizableTree.back().get();
3225 Last->Idx = VectorizableTree.size() - 1;
3226 Last->State = EntryState;
3227 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3228 ReuseShuffleIndices.end());
3229 if (ReorderIndices.empty()) {
3230 Last->Scalars.assign(VL.begin(), VL.end());
3231 Last->setOperations(S);
3232 } else {
3233 // Reorder scalars and build final mask.
3234 Last->Scalars.assign(VL.size(), nullptr);
3235 transform(ReorderIndices, Last->Scalars.begin(),
3236 [VL](unsigned Idx) -> Value * {
3237 if (Idx >= VL.size())
3238 return UndefValue::get(VL.front()->getType());
3239 return VL[Idx];
3240 });
3241 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3242 Last->setOperations(S);
3243 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3244 }
3245 if (!Last->isGather()) {
3246 for (Value *V : VL) {
3247 const TreeEntry *TE = getTreeEntry(V);
3248 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3249 "Scalar already in tree!");
3250 if (TE) {
3251 if (TE != Last)
3252 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3253 continue;
3254 }
3255 ScalarToTreeEntry[V] = Last;
3256 }
3257 // Update the scheduler bundle to point to this TreeEntry.
3258 ScheduleData *BundleMember = *Bundle;
3259 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3260 isVectorLikeInstWithConstOps(S.MainOp) ||
3261 doesNotNeedToSchedule(VL)) &&
3262 "Bundle and VL out of sync");
3263 if (BundleMember) {
3264 for (Value *V : VL) {
3265 if (doesNotNeedToBeScheduled(V))
3266 continue;
3267 if (!BundleMember)
3268 continue;
3269 BundleMember->TE = Last;
3270 BundleMember = BundleMember->NextInBundle;
3271 }
3272 }
3273 assert(!BundleMember && "Bundle and VL out of sync");
3274 } else {
3275 // Build a map for gathered scalars to the nodes where they are used.
3276 bool AllConstsOrCasts = true;
3277 for (Value *V : VL)
3278 if (!isConstant(V)) {
3279 auto *I = dyn_cast<CastInst>(V);
3280 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3281 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3282 }
3283 if (AllConstsOrCasts)
3284 CastMaxMinBWSizes =
3285 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3286 MustGather.insert(VL.begin(), VL.end());
3287 }
3288
3289 if (UserTreeIdx.UserTE) {
3290 Last->UserTreeIndices.push_back(UserTreeIdx);
3291 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3292 "Reordering isn't implemented for non-power-of-2 nodes yet");
3293 }
3294 return Last;
3295 }
3296
3297 /// -- Vectorization State --
3298 /// Holds all of the tree entries.
3299 TreeEntry::VecTreeTy VectorizableTree;
3300
3301#ifndef NDEBUG
3302 /// Debug printer.
3303 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3304 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3305 VectorizableTree[Id]->dump();
3306 dbgs() << "\n";
3307 }
3308 }
3309#endif
3310
3311 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3312
3313 const TreeEntry *getTreeEntry(Value *V) const {
3314 return ScalarToTreeEntry.lookup(V);
3315 }
3316
3317 /// Check that the operand node of an alternate node does not generate a
3318 /// buildvector sequence. If it does, it is probably not worth building an
3319 /// alternate shuffle, if the number of buildvector operands plus the
3320 /// alternate instruction exceeds the number of buildvector instructions.
3321 /// \param S the instructions state of the analyzed values.
3322 /// \param VL list of the instructions with alternate opcodes.
3323 bool areAltOperandsProfitable(const InstructionsState &S,
3324 ArrayRef<Value *> VL) const;
3325
3326 /// Checks if the specified list of the instructions/values can be vectorized
3327 /// and fills required data before actual scheduling of the instructions.
3328 TreeEntry::EntryState getScalarsVectorizationState(
3329 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3330 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3331
3332 /// Maps a specific scalar to its tree entry.
3333 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3334
3335 /// List of scalars, used in several vectorize nodes, and the list of the
3336 /// nodes.
3337 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
3338
3339 /// Maps a value to the proposed vectorizable size.
3340 SmallDenseMap<Value *, unsigned> InstrElementSize;
3341
3342 /// A list of scalars that we found that we need to keep as scalars.
3343 ValueSet MustGather;
3344
3345 /// A set of first non-schedulable values.
3346 ValueSet NonScheduledFirst;
3347
3348 /// A map between the vectorized entries and the last instructions in the
3349 /// bundles. The bundles are built in use order, not in the def order of the
3350 /// instructions. So, we cannot rely directly on the last instruction in the
3351 /// bundle being the last instruction in program order during the
3352 /// vectorization process, because the basic blocks are modified; the last
3353 /// instructions need to be pre-gathered beforehand.
3354 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3355
3356 /// List of gather nodes, depending on other gather/vector nodes, which should
3357 /// be emitted after the vector instruction emission process to correctly
3358 /// handle order of the vector instructions and shuffles.
3359 SetVector<const TreeEntry *> PostponedGathers;
3360
3361 using ValueToGatherNodesMap =
3362 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
3363 ValueToGatherNodesMap ValueToGatherNodes;
3364
3365 /// This POD struct describes one external user in the vectorized tree.
3366 struct ExternalUser {
3367 ExternalUser(Value *S, llvm::User *U, int L)
3368 : Scalar(S), User(U), Lane(L) {}
3369
3370 // Which scalar in our function.
3371 Value *Scalar;
3372
3373 // Which user that uses the scalar.
3374 llvm::User *User;
3375
3376 // Which lane does the scalar belong to.
3377 int Lane;
3378 };
3379 using UserList = SmallVector<ExternalUser, 16>;
3380
3381 /// Checks if two instructions may access the same memory.
3382 ///
3383 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3384 /// is invariant in the calling loop.
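/// The result is cached symmetrically for (Inst1, Inst2) and (Inst2, Inst1),
/// so the potentially expensive alias query runs at most once per unordered
/// pair.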
3385 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3386 Instruction *Inst2) {
3387 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3388 return true;
3389 // First check if the result is already in the cache.
3390 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3391 auto It = AliasCache.find(Key);
3392 if (It != AliasCache.end())
3393 return It->second;
3394 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3395 // Store the result in the cache.
3396 AliasCache.try_emplace(Key, Aliased);
3397 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3398 return Aliased;
3399 }
3400
3401 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3402
3403 /// Cache for alias results.
3404 /// TODO: consider moving this to the AliasAnalysis itself.
3405 DenseMap<AliasCacheKey, bool> AliasCache;
3406
3407 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3408 // globally through SLP because we don't perform any action which
3409 // invalidates capture results.
3410 BatchAAResults BatchAA;
3411
3412 /// Temporary store for deleted instructions. Instructions will be deleted
3413 /// eventually when the BoUpSLP is destructed. The deferral is required to
3414 /// ensure that there are no incorrect collisions in the AliasCache, which
3415 /// can happen if a new instruction is allocated at the same address as a
3416 /// previously deleted instruction.
3417 DenseSet<Instruction *> DeletedInstructions;
3418
3419 /// Set of the instruction, being analyzed already for reductions.
3420 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3421
3422 /// Set of hashes for the list of reduction values already being analyzed.
3423 DenseSet<size_t> AnalyzedReductionVals;
3424
3425 /// Values that have already been analyzed for minimal bitwidth and found
3426 /// to be non-profitable.
3427 DenseSet<Value *> AnalyzedMinBWVals;
3428
3429 /// A list of values that need to extracted out of the tree.
3430 /// This list holds pairs of (Internal Scalar : External User). External User
3431 /// can be nullptr, it means that this Internal Scalar will be used later,
3432 /// after vectorization.
3433 UserList ExternalUses;
3434
3435 /// A list of GEPs which can be replaced by scalar GEPs instead of
3436 /// extractelement instructions.
3437 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3438
3439 /// Values used only by @llvm.assume calls.
3440 SmallPtrSet<const Value *, 32> EphValues;
3441
3442 /// Holds all of the instructions that we gathered, shuffle instructions and
3443 /// extractelements.
3444 SetVector<Instruction *> GatherShuffleExtractSeq;
3445
3446 /// A list of blocks that we are going to CSE.
3447 DenseSet<BasicBlock *> CSEBlocks;
3448
3449 /// Contains all scheduling relevant data for an instruction.
3450 /// A ScheduleData either represents a single instruction or a member of an
3451 /// instruction bundle (= a group of instructions which is combined into a
3452 /// vector instruction).
3453 struct ScheduleData {
3454 // The initial value for the dependency counters. It means that the
3455 // dependencies are not calculated yet.
3456 enum { InvalidDeps = -1 };
3457
3458 ScheduleData() = default;
3459
3460 void init(int BlockSchedulingRegionID, Value *OpVal) {
3461 FirstInBundle = this;
3462 NextInBundle = nullptr;
3463 NextLoadStore = nullptr;
3464 IsScheduled = false;
3465 SchedulingRegionID = BlockSchedulingRegionID;
3466 clearDependencies();
3467 OpValue = OpVal;
3468 TE = nullptr;
3469 }
3470
3471 /// Verify basic self consistency properties
3472 void verify() {
3473 if (hasValidDependencies()) {
3474 assert(UnscheduledDeps <= Dependencies && "invariant");
3475 } else {
3476 assert(UnscheduledDeps == Dependencies && "invariant");
3477 }
3478
3479 if (IsScheduled) {
3480 assert(isSchedulingEntity() &&
3481 "unexpected scheduled state");
3482 for (const ScheduleData *BundleMember = this; BundleMember;
3483 BundleMember = BundleMember->NextInBundle) {
3484 assert(BundleMember->hasValidDependencies() &&
3485 BundleMember->UnscheduledDeps == 0 &&
3486 "unexpected scheduled state");
3487 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3488 "only bundle is marked scheduled");
3489 }
3490 }
3491
3492 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3493 "all bundle members must be in same basic block");
3494 }
3495
3496 /// Returns true if the dependency information has been calculated.
3497 /// Note that dependency validity can vary between instructions within
3498 /// a single bundle.
3499 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3500
3501 /// Returns true for single instructions and for bundle representatives
3502 /// (= the head of a bundle).
3503 bool isSchedulingEntity() const { return FirstInBundle == this; }
3504
3505 /// Returns true if it represents an instruction bundle and not only a
3506 /// single instruction.
3507 bool isPartOfBundle() const {
3508 return NextInBundle != nullptr || FirstInBundle != this || TE;
3509 }
3510
3511 /// Returns true if it is ready for scheduling, i.e. it has no more
3512 /// unscheduled depending instructions/bundles.
3513 bool isReady() const {
3514 assert(isSchedulingEntity() &&
3515 "can't consider non-scheduling entity for ready list");
3516 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3517 }
3518
3519 /// Modifies the number of unscheduled dependencies for this instruction,
3520 /// and returns the number of remaining dependencies for the containing
3521 /// bundle.
3522 int incrementUnscheduledDeps(int Incr) {
3523 assert(hasValidDependencies() &&
3524 "increment of unscheduled deps would be meaningless");
3525 UnscheduledDeps += Incr;
3526 return FirstInBundle->unscheduledDepsInBundle();
3527 }
3528
3529 /// Sets the number of unscheduled dependencies to the number of
3530 /// dependencies.
3531 void resetUnscheduledDeps() {
3532 UnscheduledDeps = Dependencies;
3533 }
3534
3535 /// Clears all dependency information.
3536 void clearDependencies() {
3537 Dependencies = InvalidDeps;
3538 resetUnscheduledDeps();
3539 MemoryDependencies.clear();
3540 ControlDependencies.clear();
3541 }
3542
3543 int unscheduledDepsInBundle() const {
3544 assert(isSchedulingEntity() && "only meaningful on the bundle");
3545 int Sum = 0;
3546 for (const ScheduleData *BundleMember = this; BundleMember;
3547 BundleMember = BundleMember->NextInBundle) {
3548 if (BundleMember->UnscheduledDeps == InvalidDeps)
3549 return InvalidDeps;
3550 Sum += BundleMember->UnscheduledDeps;
3551 }
3552 return Sum;
3553 }
3554
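/// Debug printer: a plain instruction is printed as-is, a bundle head as
/// "[I0;I1;...]", and a non-head bundle member is prefixed with "/ ".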
3555 void dump(raw_ostream &os) const {
3556 if (!isSchedulingEntity()) {
3557 os << "/ " << *Inst;
3558 } else if (NextInBundle) {
3559 os << '[' << *Inst;
3560 ScheduleData *SD = NextInBundle;
3561 while (SD) {
3562 os << ';' << *SD->Inst;
3563 SD = SD->NextInBundle;
3564 }
3565 os << ']';
3566 } else {
3567 os << *Inst;
3568 }
3569 }
3570
3571 Instruction *Inst = nullptr;
3572
3573 /// Opcode of the current instruction in the schedule data.
3574 Value *OpValue = nullptr;
3575
3576 /// The TreeEntry that this instruction corresponds to.
3577 TreeEntry *TE = nullptr;
3578
3579 /// Points to the head in an instruction bundle (and always to this for
3580 /// single instructions).
3581 ScheduleData *FirstInBundle = nullptr;
3582
3583 /// Single linked list of all instructions in a bundle. Null if it is a
3584 /// single instruction.
3585 ScheduleData *NextInBundle = nullptr;
3586
3587 /// Single linked list of all memory instructions (e.g. load, store, call)
3588 /// in the block - until the end of the scheduling region.
3589 ScheduleData *NextLoadStore = nullptr;
3590
3591 /// The dependent memory instructions.
3592 /// This list is derived on demand in calculateDependencies().
3593 SmallVector<ScheduleData *, 4> MemoryDependencies;
3594
3595 /// List of instructions which this instruction could be control dependent
3596 /// on. Allowing such nodes to be scheduled below this one could introduce
3597 /// a runtime fault which didn't exist in the original program.
3598 /// ex: this is a load or udiv following a readonly call which inf loops
3599 SmallVector<ScheduleData *, 4> ControlDependencies;
3600
3601 /// This ScheduleData is in the current scheduling region if this matches
3602 /// the current SchedulingRegionID of BlockScheduling.
3603 int SchedulingRegionID = 0;
3604
3605 /// Used for getting a "good" final ordering of instructions.
3606 int SchedulingPriority = 0;
3607
3608 /// The number of dependencies. It consists of the number of users of the
3609 /// instruction plus the number of dependent memory instructions (if any).
3610 /// This value is calculated on demand.
3611 /// If InvalidDeps, the number of dependencies is not calculated yet.
3612 int Dependencies = InvalidDeps;
3613
3614 /// The number of dependencies minus the number of dependencies of scheduled
3615 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3616 /// for scheduling.
3617 /// Note that this is negative as long as Dependencies is not calculated.
3618 int UnscheduledDeps = InvalidDeps;
3619
3620 /// True if this instruction is scheduled (or considered as scheduled in the
3621 /// dry-run).
3622 bool IsScheduled = false;
3623 };
3624
3625#ifndef NDEBUG
3626 friend inline raw_ostream &operator<<(raw_ostream &os,
3627 const BoUpSLP::ScheduleData &SD) {
3628 SD.dump(os);
3629 return os;
3630 }
3631#endif
3632
3633 friend struct GraphTraits<BoUpSLP *>;
3634 friend struct DOTGraphTraits<BoUpSLP *>;
3635
3636 /// Contains all scheduling data for a basic block.
3637 /// It does not schedule instructions that are not memory read/write
3638 /// instructions and whose operands are constants, arguments, phis, or
3639 /// instructions from other blocks, or whose users are phis or belong to
3640 /// other blocks. The resulting vector instructions can be placed at the
3641 /// beginning of the basic block without scheduling (if their operands do
3642 /// not need to be scheduled) or at the end of the block (if their users are
3643 /// outside of the block). This saves some compile time and memory used by
3644 /// the compiler.
3645 /// ScheduleData is assigned to each instruction between the boundaries of
3646 /// the tree entry, even to those that are not part of the graph. This is
3647 /// required to correctly follow the dependencies between the instructions
3648 /// and to schedule them correctly. ScheduleData is not allocated for
3649 /// instructions that do not require scheduling, like phis, nodes containing
3650 /// only extractelements/insertelements, or nodes whose instructions have
3651 /// uses/operands outside of the block.
3652 struct BlockScheduling {
3653 BlockScheduling(BasicBlock *BB)
3654 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3655
3656 void clear() {
3657 ReadyInsts.clear();
3658 ScheduleStart = nullptr;
3659 ScheduleEnd = nullptr;
3660 FirstLoadStoreInRegion = nullptr;
3661 LastLoadStoreInRegion = nullptr;
3662 RegionHasStackSave = false;
3663
3664 // Reduce the maximum schedule region size by the size of the
3665 // previous scheduling run.
3666 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3667 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3668 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3669 ScheduleRegionSize = 0;
3670
3671 // Make a new scheduling region, i.e. all existing ScheduleData is not
3672 // in the new region yet.
3673 ++SchedulingRegionID;
3674 }
3675
3676 ScheduleData *getScheduleData(Instruction *I) {
3677 if (BB != I->getParent())
3678 // Avoid lookup if can't possibly be in map.
3679 return nullptr;
3680 ScheduleData *SD = ScheduleDataMap.lookup(I);
3681 if (SD && isInSchedulingRegion(SD))
3682 return SD;
3683 return nullptr;
3684 }
3685
3686 ScheduleData *getScheduleData(Value *V) {
3687 if (auto *I = dyn_cast<Instruction>(V))
3688 return getScheduleData(I);
3689 return nullptr;
3690 }
3691
3692 ScheduleData *getScheduleData(Value *V, Value *Key) {
3693 if (V == Key)
3694 return getScheduleData(V);
3695 auto I = ExtraScheduleDataMap.find(V);
3696 if (I != ExtraScheduleDataMap.end()) {
3697 ScheduleData *SD = I->second.lookup(Key);
3698 if (SD && isInSchedulingRegion(SD))
3699 return SD;
3700 }
3701 return nullptr;
3702 }
3703
3704 bool isInSchedulingRegion(ScheduleData *SD) const {
3705 return SD->SchedulingRegionID == SchedulingRegionID;
3706 }
3707
3708 /// Marks an instruction as scheduled and puts all dependent ready
3709 /// instructions into the ready-list.
3710 template <typename ReadyListType>
3711 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3712 SD->IsScheduled = true;
3713 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3714
3715 for (ScheduleData *BundleMember = SD; BundleMember;
3716 BundleMember = BundleMember->NextInBundle) {
3717 if (BundleMember->Inst != BundleMember->OpValue)
3718 continue;
3719
3720 // Handle the def-use chain dependencies.
3721
3722 // Decrement the unscheduled counter and insert to ready list if ready.
3723 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3724 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3725 if (OpDef && OpDef->hasValidDependencies() &&
3726 OpDef->incrementUnscheduledDeps(-1) == 0) {
3727 // There are no more unscheduled dependencies after
3728 // decrementing, so we can put the dependent instruction
3729 // into the ready list.
3730 ScheduleData *DepBundle = OpDef->FirstInBundle;
3731 assert(!DepBundle->IsScheduled &&
3732 "already scheduled bundle gets ready");
3733 ReadyList.insert(DepBundle);
3734 LLVM_DEBUG(dbgs()
3735 << "SLP: gets ready (def): " << *DepBundle << "\n");
3736 }
3737 });
3738 };
3739
3740 // If BundleMember is a vector bundle, its operands may have been
3741 // reordered during buildTree(). We therefore need to get its operands
3742 // through the TreeEntry.
3743 if (TreeEntry *TE = BundleMember->TE) {
3744 // Need to search for the lane since the tree entry can be reordered.
3745 int Lane = std::distance(TE->Scalars.begin(),
3746 find(TE->Scalars, BundleMember->Inst));
3747 assert(Lane >= 0 && "Lane not set");
3748
3749 // Since vectorization tree is being built recursively this assertion
3750 // ensures that the tree entry has all operands set before reaching
3751 // this code. Couple of exceptions known at the moment are extracts
3752 // where their second (immediate) operand is not added. Since
3753 // immediates do not affect scheduler behavior this is considered
3754 // okay.
3755 auto *In = BundleMember->Inst;
3756 assert(
3757 In &&
3758 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3759 In->getNumOperands() == TE->getNumOperands()) &&
3760 "Missed TreeEntry operands?");
3761 (void)In; // fake use to avoid build failure when assertions disabled
3762
3763 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3764 OpIdx != NumOperands; ++OpIdx)
3765 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3766 DecrUnsched(I);
3767 } else {
3768 // If BundleMember is a stand-alone instruction, no operand reordering
3769 // has taken place, so we directly access its operands.
3770 for (Use &U : BundleMember->Inst->operands())
3771 if (auto *I = dyn_cast<Instruction>(U.get()))
3772 DecrUnsched(I);
3773 }
3774 // Handle the memory dependencies.
3775 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3776 if (MemoryDepSD->hasValidDependencies() &&
3777 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3778 // There are no more unscheduled dependencies after decrementing,
3779 // so we can put the dependent instruction into the ready list.
3780 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3781 assert(!DepBundle->IsScheduled &&
3782 "already scheduled bundle gets ready");
3783 ReadyList.insert(DepBundle);
3784 LLVM_DEBUG(dbgs()
3785 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3786 }
3787 }
3788 // Handle the control dependencies.
3789 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3790 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3791 // There are no more unscheduled dependencies after decrementing,
3792 // so we can put the dependent instruction into the ready list.
3793 ScheduleData *DepBundle = DepSD->FirstInBundle;
3794 assert(!DepBundle->IsScheduled &&
3795 "already scheduled bundle gets ready");
3796 ReadyList.insert(DepBundle);
3797 LLVM_DEBUG(dbgs()
3798 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3799 }
3800 }
3801 }
3802 }
3803
3804 /// Verify basic self consistency properties of the data structure.
3805 void verify() {
3806 if (!ScheduleStart)
3807 return;
3808
3809 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3810 ScheduleStart->comesBefore(ScheduleEnd) &&
3811 "Not a valid scheduling region?");
3812
3813 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3814 auto *SD = getScheduleData(I);
3815 if (!SD)
3816 continue;
3817 assert(isInSchedulingRegion(SD) &&
3818 "primary schedule data not in window?");
3819 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3820 "entire bundle in window!");
3821 (void)SD;
3822 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3823 }
3824
3825 for (auto *SD : ReadyInsts) {
3826 assert(SD->isSchedulingEntity() && SD->isReady() &&
3827 "item in ready list not ready?");
3828 (void)SD;
3829 }
3830 }
3831
3832 void doForAllOpcodes(Value *V,
3833 function_ref<void(ScheduleData *SD)> Action) {
3834 if (ScheduleData *SD = getScheduleData(V))
3835 Action(SD);
3836 auto I = ExtraScheduleDataMap.find(V);
3837 if (I != ExtraScheduleDataMap.end())
3838 for (auto &P : I->second)
3839 if (isInSchedulingRegion(P.second))
3840 Action(P.second);
3841 }
3842
3843 /// Put all instructions into the ReadyList which are ready for scheduling.
3844 template <typename ReadyListType>
3845 void initialFillReadyList(ReadyListType &ReadyList) {
3846 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3847 doForAllOpcodes(I, [&](ScheduleData *SD) {
3848 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3849 SD->isReady()) {
3850 ReadyList.insert(SD);
3851 LLVM_DEBUG(dbgs()
3852 << "SLP: initially in ready list: " << *SD << "\n");
3853 }
3854 });
3855 }
3856 }
3857
3858 /// Build a bundle from the ScheduleData nodes corresponding to the
3859 /// scalar instruction for each lane.
3860 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3861
3862 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3863 /// cyclic dependencies. This is only a dry-run, no instructions are
3864 /// actually moved at this stage.
3865 /// \returns the scheduling bundle. The returned Optional value is not
3866 /// std::nullopt if \p VL is allowed to be scheduled.
3867 std::optional<ScheduleData *>
3868 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3869 const InstructionsState &S);
3870
3871 /// Un-bundles a group of instructions.
3872 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3873
3874 /// Allocates schedule data chunk.
3875 ScheduleData *allocateScheduleDataChunks();
3876
3877 /// Extends the scheduling region so that V is inside the region.
3878 /// \returns true if the region size is within the limit.
3879 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3880
3881 /// Initialize the ScheduleData structures for new instructions in the
3882 /// scheduling region.
3883 void initScheduleData(Instruction *FromI, Instruction *ToI,
3884 ScheduleData *PrevLoadStore,
3885 ScheduleData *NextLoadStore);
3886
3887 /// Updates the dependency information of a bundle and of all instructions/
3888 /// bundles which depend on the original bundle.
3889 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3890 BoUpSLP *SLP);
3891
3892 /// Sets all instructions in the scheduling region to un-scheduled.
3893 void resetSchedule();
3894
3895 BasicBlock *BB;
3896
3897 /// Simple memory allocation for ScheduleData.
3898 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
3899
3900 /// The size of a ScheduleData array in ScheduleDataChunks.
3901 int ChunkSize;
3902
3903 /// The allocator position in the current chunk, which is the last entry
3904 /// of ScheduleDataChunks.
3905 int ChunkPos;
3906
3907 /// Attaches ScheduleData to Instruction.
3908 /// Note that the mapping survives during all vectorization iterations, i.e.
3909 /// ScheduleData structures are recycled.
3910 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
3911
3912 /// Attaches ScheduleData to Instruction with the leading key.
3913 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
3914 ExtraScheduleDataMap;
3915
3916 /// The ready-list for scheduling (only used for the dry-run).
3917 SetVector<ScheduleData *> ReadyInsts;
3918
3919 /// The first instruction of the scheduling region.
3920 Instruction *ScheduleStart = nullptr;
3921
3922 /// The first instruction _after_ the scheduling region.
3923 Instruction *ScheduleEnd = nullptr;
3924
3925 /// The first memory accessing instruction in the scheduling region
3926 /// (can be null).
3927 ScheduleData *FirstLoadStoreInRegion = nullptr;
3928
3929 /// The last memory accessing instruction in the scheduling region
3930 /// (can be null).
3931 ScheduleData *LastLoadStoreInRegion = nullptr;
3932
3933 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3934 /// region? Used to optimize the dependence calculation for the
3935 /// common case where there isn't.
3936 bool RegionHasStackSave = false;
3937
3938 /// The current size of the scheduling region.
3939 int ScheduleRegionSize = 0;
3940
3941 /// The maximum size allowed for the scheduling region.
3942 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3943
3944 /// The ID of the scheduling region. For a new vectorization iteration this
3945 /// is incremented which "removes" all ScheduleData from the region.
3946 /// Make sure that the initial SchedulingRegionID is greater than the
3947 /// initial SchedulingRegionID in ScheduleData (which is 0).
3948 int SchedulingRegionID = 1;
3949 };
3950
3951 /// Attaches the BlockScheduling structures to basic blocks.
3952 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
3953
3954 /// Performs the "real" scheduling. Done before vectorization is actually
3955 /// performed in a basic block.
3956 void scheduleBlock(BlockScheduling *BS);
3957
3958 /// List of users to ignore during scheduling and that don't need extracting.
3959 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3960
3961 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3962 /// sorted SmallVectors of unsigned.
3963 struct OrdersTypeDenseMapInfo {
3964 static OrdersType getEmptyKey() {
3965 OrdersType V;
3966 V.push_back(~1U);
3967 return V;
3968 }
3969
3970 static OrdersType getTombstoneKey() {
3971 OrdersType V;
3972 V.push_back(~2U);
3973 return V;
3974 }
3975
3976 static unsigned getHashValue(const OrdersType &V) {
3977 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3978 }
3979
3980 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3981 return LHS == RHS;
3982 }
3983 };
3984
3985 // Analysis and block reference.
3986 Function *F;
3987 ScalarEvolution *SE;
3988 TargetTransformInfo *TTI;
3989 TargetLibraryInfo *TLI;
3990 LoopInfo *LI;
3991 DominatorTree *DT;
3992 AssumptionCache *AC;
3993 DemandedBits *DB;
3994 const DataLayout *DL;
3995 OptimizationRemarkEmitter *ORE;
3996
3997 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3998 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3999
4000 /// Instruction builder to construct the vectorized tree.
4001 IRBuilder<TargetFolder> Builder;
4002
4003 /// A map of scalar integer values to the smallest bit width with which they
4004 /// can legally be represented. The values map to (width, signed) pairs,
4005 /// where "width" indicates the minimum bit width and "signed" is True if the
4006 /// value must be signed-extended, rather than zero-extended, back to its
4007 /// original width.
4008 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
4009
4010 /// Final size of the reduced vector, if the current graph represents the
4011 /// input for the reduction and it was possible to narrow the size of the
4012 /// reduction.
4013 unsigned ReductionBitWidth = 0;
4014
4015 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4016 /// type sizes, used in the tree.
4017 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4018
4019 /// Indices of the vectorized nodes, which supposed to be the roots of the new
4020 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
4021 DenseSet<unsigned> ExtraBitWidthNodes;
4022};
4023
4024} // end namespace slpvectorizer
4025
4026template <> struct GraphTraits<BoUpSLP *> {
4027 using TreeEntry = BoUpSLP::TreeEntry;
4028
4029 /// NodeRef has to be a pointer per the GraphWriter.
4030 using NodeRef = TreeEntry *;
4031
4032 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
4033
4034 /// Add the VectorizableTree to the index iterator to be able to return
4035 /// TreeEntry pointers.
4036 struct ChildIteratorType
4037 : public iterator_adaptor_base<
4038 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4039 ContainerTy &VectorizableTree;
4040
4041 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
4042 ContainerTy &VT)
4043 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4044
4045 NodeRef operator*() { return I->UserTE; }
4046 };
4047
4048 static NodeRef getEntryNode(BoUpSLP &R) {
4049 return R.VectorizableTree[0].get();
4050 }
4051
4052 static ChildIteratorType child_begin(NodeRef N) {
4053 return {N->UserTreeIndices.begin(), N->Container};
4054 }
4055
4056 static ChildIteratorType child_end(NodeRef N) {
4057 return {N->UserTreeIndices.end(), N->Container};
4058 }
4059
4060 /// For the node iterator we just need to turn the TreeEntry iterator into a
4061 /// TreeEntry* iterator so that it dereferences to NodeRef.
4062 class nodes_iterator {
4063 using ItTy = ContainerTy::iterator;
4064 ItTy It;
4065
4066 public:
4067 nodes_iterator(const ItTy &It2) : It(It2) {}
4068 NodeRef operator*() { return It->get(); }
4069 nodes_iterator operator++() {
4070 ++It;
4071 return *this;
4072 }
4073 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4074 };
4075
4076 static nodes_iterator nodes_begin(BoUpSLP *R) {
4077 return nodes_iterator(R->VectorizableTree.begin());
4078 }
4079
4080 static nodes_iterator nodes_end(BoUpSLP *R) {
4081 return nodes_iterator(R->VectorizableTree.end());
4082 }
4083
4084 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4085};
4086
4087template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4088 using TreeEntry = BoUpSLP::TreeEntry;
4089
4090 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4091
4092 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4093 std::string Str;
4094 raw_string_ostream OS(Str);
4095 OS << Entry->Idx << ".\n";
4096 if (isSplat(Entry->Scalars))
4097 OS << "<splat> ";
4098 for (auto *V : Entry->Scalars) {
4099 OS << *V;
4100 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4101 return EU.Scalar == V;
4102 }))
4103 OS << " <extract>";
4104 OS << "\n";
4105 }
4106 return Str;
4107 }
4108
4109 static std::string getNodeAttributes(const TreeEntry *Entry,
4110 const BoUpSLP *) {
4111 if (Entry->isGather())
4112 return "color=red";
4113 if (Entry->State == TreeEntry::ScatterVectorize ||
4114 Entry->State == TreeEntry::StridedVectorize)
4115 return "color=blue";
4116 return "";
4117 }
4118};
4119
4120} // end namespace llvm
4121
4122BoUpSLP::~BoUpSLP() {
4123 SmallVector<WeakTrackingVH> DeadInsts;
4124 for (auto *I : DeletedInstructions) {
4125 if (!I->getParent()) {
4126 // Temporarily insert instructions back so they can be erased from their
4127 // parent and from memory later.
4128 if (isa<PHINode>(I))
4129 // Phi nodes must be the very first instructions in the block.
4130 I->insertBefore(F->getEntryBlock(),
4131 F->getEntryBlock().getFirstNonPHIIt());
4132 else
4133 I->insertBefore(F->getEntryBlock().getTerminator());
4134 continue;
4135 }
4136 for (Use &U : I->operands()) {
4137 auto *Op = dyn_cast<Instruction>(U.get());
4138 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4139 wouldInstructionBeTriviallyDead(Op, TLI))
4140 DeadInsts.emplace_back(Op);
4141 }
4142 I->dropAllReferences();
4143 }
4144 for (auto *I : DeletedInstructions) {
4145 assert(I->use_empty() &&
4146 "trying to erase instruction with users.");
4147 I->eraseFromParent();
4148 }
4149
4150 // Cleanup any dead scalar code feeding the vectorized instructions
4151 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
4152
4153#ifdef EXPENSIVE_CHECKS
4154 // If we could guarantee that this call is not extremely slow, we could
4155 // remove the ifdef limitation (see PR47712).
4156 assert(!verifyFunction(*F, &dbgs()));
4157#endif
4158}
4159
4160/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4161/// contains the original mask for the scalars reused in the node. The
4162/// procedure transforms this mask in accordance with the given \p Mask.
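/// For example (illustrative): Reuses = {3, 2, 1, 0} with Mask = {1, 0, 3, 2}
/// yields Reuses = {2, 3, 0, 1}; each old entry Prev[I] moves to position
/// Mask[I].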
4163static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4164 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4165 "Expected non-empty mask.");
4166 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4167 Prev.swap(Reuses);
4168 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4169 if (Mask[I] != PoisonMaskElem)
4170 Reuses[Mask[I]] = Prev[I];
4171}
4172
4173/// Reorders the given \p Order according to the given \p Mask. \p Order is
4174/// the original order of the scalars. The procedure transforms the provided
4175/// order in accordance with the given \p Mask. If the resulting \p Order is an
4176/// identity order, \p Order is cleared.
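/// For example (illustrative), reordering Order = {1, 0} by Mask = {1, 0}
/// produces the identity order, so \p Order is cleared.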
4177static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4178 bool BottomOrder = false) {
4179 assert(!Mask.empty() && "Expected non-empty mask.");
4180 unsigned Sz = Mask.size();
4181 if (BottomOrder) {
4182 SmallVector<unsigned> PrevOrder;
4183 if (Order.empty()) {
4184 PrevOrder.resize(N: Sz);
4185 std::iota(first: PrevOrder.begin(), last: PrevOrder.end(), value: 0);
4186 } else {
4187 PrevOrder.swap(RHS&: Order);
4188 }
4189 Order.assign(NumElts: Sz, Elt: Sz);
4190 for (unsigned I = 0; I < Sz; ++I)
4191 if (Mask[I] != PoisonMaskElem)
4192 Order[I] = PrevOrder[Mask[I]];
4193 if (all_of(Range: enumerate(First&: Order), P: [&](const auto &Data) {
4194 return Data.value() == Sz || Data.index() == Data.value();
4195 })) {
4196 Order.clear();
4197 return;
4198 }
4199 fixupOrderingIndices(Order);
4200 return;
4201 }
4202 SmallVector<int> MaskOrder;
4203 if (Order.empty()) {
4204 MaskOrder.resize(N: Sz);
4205 std::iota(first: MaskOrder.begin(), last: MaskOrder.end(), value: 0);
4206 } else {
4207 inversePermutation(Indices: Order, Mask&: MaskOrder);
4208 }
4209 reorderReuses(Reuses&: MaskOrder, Mask);
4210 if (ShuffleVectorInst::isIdentityMask(Mask: MaskOrder, NumSrcElts: Sz)) {
4211 Order.clear();
4212 return;
4213 }
4214 Order.assign(NumElts: Sz, Elt: Sz);
4215 for (unsigned I = 0; I < Sz; ++I)
4216 if (MaskOrder[I] != PoisonMaskElem)
4217 Order[MaskOrder[I]] = I;
4218 fixupOrderingIndices(Order);
4219}
4220
4221std::optional<BoUpSLP::OrdersType>
4222BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4223 assert(TE.isGather() && "Expected gather node only.");
4224 // Try to find subvector extract/insert patterns and reorder only such
4225 // patterns.
4226 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4227 Type *ScalarTy = GatheredScalars.front()->getType();
4228 int NumScalars = GatheredScalars.size();
4229 if (!isValidElementType(Ty: ScalarTy))
4230 return std::nullopt;
4231 auto *VecTy = getWidenedType(ScalarTy, VF: NumScalars);
4232 int NumParts = TTI->getNumberOfParts(Tp: VecTy);
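// If the target reports no meaningful split (or more parts than scalars),
// process the whole gather as a single part.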
4233 if (NumParts == 0 || NumParts >= NumScalars)
4234 NumParts = 1;
4235 SmallVector<int> ExtractMask;
4236 SmallVector<int> Mask;
4237 SmallVector<SmallVector<const TreeEntry *>> Entries;
4238 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
4239 tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
4240 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
4241 isGatherShuffledEntry(TE: &TE, VL: GatheredScalars, Mask, Entries, NumParts,
4242 /*ForOrder=*/true);
4243 // No shuffled operands - ignore.
4244 if (GatherShuffles.empty() && ExtractShuffles.empty())
4245 return std::nullopt;
4246 OrdersType CurrentOrder(NumScalars, NumScalars);
4247 if (GatherShuffles.size() == 1 &&
4248 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4249 Entries.front().front()->isSame(VL: TE.Scalars)) {
4250 // Perfect match in the graph, will reuse the previously vectorized
4251 // node. Cost is 0.
4252 std::iota(first: CurrentOrder.begin(), last: CurrentOrder.end(), value: 0);
4253 return CurrentOrder;
4254 }
4255 auto IsSplatMask = [](ArrayRef<int> Mask) {
4256 int SingleElt = PoisonMaskElem;
4257 return all_of(Range&: Mask, P: [&](int I) {
4258 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4259 SingleElt = I;
4260 return I == PoisonMaskElem || I == SingleElt;
4261 });
4262 };
4263 // Exclusive broadcast mask - ignore.
4264 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4265 (Entries.size() != 1 ||
4266 Entries.front().front()->ReorderIndices.empty())) ||
4267 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4268 return std::nullopt;
4269 SmallBitVector ShuffledSubMasks(NumParts);
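// Converts the per-part shuffle mask into an order of the scalars. Parts that
// would require shuffling two or more vectors are marked in ShuffledSubMasks
// and excluded from the resulting order.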
4270 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4271 ArrayRef<int> Mask, int PartSz, int NumParts,
4272 function_ref<unsigned(unsigned)> GetVF) {
4273 for (int I : seq<int>(Begin: 0, End: NumParts)) {
4274 if (ShuffledSubMasks.test(Idx: I))
4275 continue;
4276 const int VF = GetVF(I);
4277 if (VF == 0)
4278 continue;
4279 unsigned Limit = getNumElems(Size: CurrentOrder.size(), PartNumElems: PartSz, Part: I);
4280 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(N: I * PartSz, M: Limit);
4281 // Shuffle of at least 2 vectors - ignore.
4282 if (any_of(Range&: Slice, P: [&](int I) { return I != NumScalars; })) {
4283 std::fill(Slice.begin(), Slice.end(), NumScalars);
4284 ShuffledSubMasks.set(I);
4285 continue;
4286 }
4287 // Try to include as many elements from the mask as possible.
4288 int FirstMin = INT_MAX;
4289 bool SecondVecFound = false;
4290 for (int K : seq<int>(Size: Limit)) {
4291 int Idx = Mask[I * PartSz + K];
4292 if (Idx == PoisonMaskElem) {
4293 Value *V = GatheredScalars[I * PartSz + K];
4294 if (isConstant(V) && !isa<PoisonValue>(Val: V)) {
4295 SecondVecFound = true;
4296 break;
4297 }
4298 continue;
4299 }
4300 if (Idx < VF) {
4301 if (FirstMin > Idx)
4302 FirstMin = Idx;
4303 } else {
4304 SecondVecFound = true;
4305 break;
4306 }
4307 }
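// Round the minimal used index down to the part boundary; the mask indices
// below are rebased relative to it and must stay within a single part.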
4308 FirstMin = (FirstMin / PartSz) * PartSz;
4309 // Shuffle of at least 2 vectors - ignore.
4310 if (SecondVecFound) {
4311 std::fill(Slice.begin(), Slice.end(), NumScalars);
4312 ShuffledSubMasks.set(I);
4313 continue;
4314 }
4315 for (int K : seq<int>(Size: Limit)) {
4316 int Idx = Mask[I * PartSz + K];
4317 if (Idx == PoisonMaskElem)
4318 continue;
4319 Idx -= FirstMin;
4320 if (Idx >= PartSz) {
4321 SecondVecFound = true;
4322 break;
4323 }
4324 if (CurrentOrder[I * PartSz + Idx] >
4325 static_cast<unsigned>(I * PartSz + K) &&
4326 CurrentOrder[I * PartSz + Idx] !=
4327 static_cast<unsigned>(I * PartSz + Idx))
4328 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4329 }
4330 // Shuffle of at least 2 vectors - ignore.
4331 if (SecondVecFound) {
4332 std::fill(Slice.begin(), Slice.end(), NumScalars);
4333 ShuffledSubMasks.set(I);
4334 continue;
4335 }
4336 }
4337 };
4338 int PartSz = getPartNumElems(Size: NumScalars, NumParts);
4339 if (!ExtractShuffles.empty())
4340 TransformMaskToOrder(
4341 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4342 if (!ExtractShuffles[I])
4343 return 0U;
4344 unsigned VF = 0;
4345 unsigned Sz = getNumElems(Size: TE.getVectorFactor(), PartNumElems: PartSz, Part: I);
4346 for (unsigned Idx : seq<unsigned>(Size: Sz)) {
4347 int K = I * PartSz + Idx;
4348 if (ExtractMask[K] == PoisonMaskElem)
4349 continue;
4350 if (!TE.ReuseShuffleIndices.empty())
4351 K = TE.ReuseShuffleIndices[K];
4352 if (!TE.ReorderIndices.empty())
4353 K = std::distance(first: TE.ReorderIndices.begin(),
4354 last: find(Range: TE.ReorderIndices, Val: K));
4355 auto *EI = dyn_cast<ExtractElementInst>(Val: TE.Scalars[K]);
4356 if (!EI)
4357 continue;
4358 VF = std::max(a: VF, b: cast<VectorType>(Val: EI->getVectorOperandType())
4359 ->getElementCount()
4360 .getKnownMinValue());
4361 }
4362 return VF;
4363 });
4364 // Check special corner case - single shuffle of the same entry.
4365 if (GatherShuffles.size() == 1 && NumParts != 1) {
4366 if (ShuffledSubMasks.any())
4367 return std::nullopt;
4368 PartSz = NumScalars;
4369 NumParts = 1;
4370 }
4371 if (!Entries.empty())
4372 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4373 if (!GatherShuffles[I])
4374 return 0U;
4375 return std::max(a: Entries[I].front()->getVectorFactor(),
4376 b: Entries[I].back()->getVectorFactor());
4377 });
4378 int NumUndefs =
4379 count_if(Range&: CurrentOrder, P: [&](int Idx) { return Idx == NumScalars; });
4380 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4381 return std::nullopt;
4382 return std::move(CurrentOrder);
4383}
4384
4385static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4386 const TargetLibraryInfo &TLI,
4387 bool CompareOpcodes = true) {
4388 if (getUnderlyingObject(V: Ptr1) != getUnderlyingObject(V: Ptr2))
4389 return false;
4390 auto *GEP1 = dyn_cast<GetElementPtrInst>(Val: Ptr1);
4391 if (!GEP1)
4392 return false;
4393 auto *GEP2 = dyn_cast<GetElementPtrInst>(Val: Ptr2);
4394 if (!GEP2)
4395 return false;
4396 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4397 ((isConstant(V: GEP1->getOperand(i_nocapture: 1)) &&
4398 isConstant(V: GEP2->getOperand(i_nocapture: 1))) ||
4399 !CompareOpcodes ||
4400 getSameOpcode(VL: {GEP1->getOperand(i_nocapture: 1), GEP2->getOperand(i_nocapture: 1)}, TLI)
4401 .getOpcode());
4402}
4403
4404 /// Calculates the minimal alignment of all values in \p VL as the common alignment.
4405template <typename T>
4406static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4407 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4408 for (Value *V : VL.drop_front())
4409 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4410 return CommonAlignment;
4411}
4412
4413/// Check if \p Order represents reverse order.
4414static bool isReverseOrder(ArrayRef<unsigned> Order) {
4415 unsigned Sz = Order.size();
4416 return !Order.empty() && all_of(Range: enumerate(First&: Order), P: [&](const auto &Pair) {
4417 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4418 });
4419}
4420
4421 /// Checks if the provided list of pointers \p PointerOps represents strided
4422 /// pointers for type \p ElemTy. If they do not, std::nullopt is returned.
4423 /// Otherwise, if \p Inst is not specified, an engaged (but null) optional value
4424 /// is returned to show that the pointers are strided. If \p Inst is specified,
4425 /// the runtime stride is materialized before the given \p Inst.
4426 /// \returns std::nullopt if the pointers do not have a runtime stride;
4427 /// otherwise nullptr or the actual stride value.
4428static std::optional<Value *>
4429calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4430 const DataLayout &DL, ScalarEvolution &SE,
4431 SmallVectorImpl<unsigned> &SortedIndices,
4432 Instruction *Inst = nullptr) {
4433 SmallVector<const SCEV *> SCEVs;
4434 const SCEV *PtrSCEVLowest = nullptr;
4435 const SCEV *PtrSCEVHighest = nullptr;
4436 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4437 // addresses).
4438 for (Value *Ptr : PointerOps) {
4439 const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
4440 if (!PtrSCEV)
4441 return std::nullopt;
4442 SCEVs.push_back(Elt: PtrSCEV);
4443 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4444 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4445 continue;
4446 }
4447 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
4448 if (isa<SCEVCouldNotCompute>(Val: Diff))
4449 return std::nullopt;
4450 if (Diff->isNonConstantNegative()) {
4451 PtrSCEVLowest = PtrSCEV;
4452 continue;
4453 }
4454 const SCEV *Diff1 = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEV);
4455 if (isa<SCEVCouldNotCompute>(Val: Diff1))
4456 return std::nullopt;
4457 if (Diff1->isNonConstantNegative()) {
4458 PtrSCEVHighest = PtrSCEV;
4459 continue;
4460 }
4461 }
4462 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4463 const SCEV *Dist = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEVLowest);
4464 if (isa<SCEVCouldNotCompute>(Val: Dist))
4465 return std::nullopt;
4466 int Size = DL.getTypeStoreSize(Ty: ElemTy);
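// Tries to express Dist as Multiplier * Stride and returns the Stride SCEV,
// falling back to an exact unsigned division when Dist is not a SCEVMulExpr.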
4467 auto TryGetStride = [&](const SCEV *Dist,
4468 const SCEV *Multiplier) -> const SCEV * {
4469 if (const auto *M = dyn_cast<SCEVMulExpr>(Val: Dist)) {
4470 if (M->getOperand(i: 0) == Multiplier)
4471 return M->getOperand(i: 1);
4472 if (M->getOperand(i: 1) == Multiplier)
4473 return M->getOperand(i: 0);
4474 return nullptr;
4475 }
4476 if (Multiplier == Dist)
4477 return SE.getConstant(Ty: Dist->getType(), V: 1);
4478 return SE.getUDivExactExpr(LHS: Dist, RHS: Multiplier);
4479 };
4480 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4481 const SCEV *Stride = nullptr;
4482 if (Size != 1 || SCEVs.size() > 2) {
4483 const SCEV *Sz = SE.getConstant(Ty: Dist->getType(), V: Size * (SCEVs.size() - 1));
4484 Stride = TryGetStride(Dist, Sz);
4485 if (!Stride)
4486 return std::nullopt;
4487 }
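// Only a runtime (non-constant) stride is interesting here; constant strides
// are recognized via constant pointer differences elsewhere.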
4488 if (!Stride || isa<SCEVConstant>(Val: Stride))
4489 return std::nullopt;
4490 // Iterate through all pointers and check if all distances are
4491 // unique multiples of Stride.
4492 using DistOrdPair = std::pair<int64_t, int>;
4493 auto Compare = llvm::less_first();
4494 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
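// Collect (distance from the lowest pointer, original index) pairs ordered by
// distance; the set also detects duplicated distances.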
4495 int Cnt = 0;
4496 bool IsConsecutive = true;
4497 for (const SCEV *PtrSCEV : SCEVs) {
4498 unsigned Dist = 0;
4499 if (PtrSCEV != PtrSCEVLowest) {
4500 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
4501 const SCEV *Coeff = TryGetStride(Diff, Stride);
4502 if (!Coeff)
4503 return std::nullopt;
4504 const auto *SC = dyn_cast<SCEVConstant>(Val: Coeff);
4505 if (!SC || isa<SCEVCouldNotCompute>(Val: SC))
4506 return std::nullopt;
4507 if (!SE.getMinusSCEV(LHS: PtrSCEV, RHS: SE.getAddExpr(LHS: PtrSCEVLowest,
4508 RHS: SE.getMulExpr(LHS: Stride, RHS: SC)))
4509 ->isZero())
4510 return std::nullopt;
4511 Dist = SC->getAPInt().getZExtValue();
4512 }
4513 // If the strides are not the same or repeated, we can't vectorize.
4514 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4515 return std::nullopt;
4516 auto Res = Offsets.emplace(args&: Dist, args&: Cnt);
4517 if (!Res.second)
4518 return std::nullopt;
4519 // Consecutive order if the inserted element is the last one.
4520 IsConsecutive = IsConsecutive && std::next(x: Res.first) == Offsets.end();
4521 ++Cnt;
4522 }
4523 if (Offsets.size() != SCEVs.size())
4524 return std::nullopt;
4525 SortedIndices.clear();
4526 if (!IsConsecutive) {
4527 // Fill SortedIndices array only if it is non-consecutive.
4528 SortedIndices.resize(N: PointerOps.size());
4529 Cnt = 0;
4530 for (const std::pair<int64_t, int> &Pair : Offsets) {
4531 SortedIndices[Cnt] = Pair.second;
4532 ++Cnt;
4533 }
4534 }
4535 if (!Inst)
4536 return nullptr;
4537 SCEVExpander Expander(SE, DL, "strided-load-vec");
4538 return Expander.expandCodeFor(SH: Stride, Ty: Stride->getType(), I: Inst);
4539}
4540
4541static std::pair<InstructionCost, InstructionCost>
4542getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4543 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4544 Type *ScalarTy, VectorType *VecTy);
4545
4546BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
4547 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4548 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4549 // Check that a vectorized load would load the same memory as a scalar
4550 // load. For example, we don't want to vectorize loads that are smaller
4551 // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
4552 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4553 // from such a struct, we read/write packed bits disagreeing with the
4554 // unvectorized version.
4555 Type *ScalarTy = VL0->getType();
4556
4557 if (DL->getTypeSizeInBits(Ty: ScalarTy) != DL->getTypeAllocSizeInBits(Ty: ScalarTy))
4558 return LoadsState::Gather;
4559
4560 // Make sure all loads in the bundle are simple - we can't vectorize
4561 // atomic or volatile loads.
4562 PointerOps.clear();
4563 const unsigned Sz = VL.size();
4564 PointerOps.resize(N: Sz);
4565 auto *POIter = PointerOps.begin();
4566 for (Value *V : VL) {
4567 auto *L = cast<LoadInst>(Val: V);
4568 if (!L->isSimple())
4569 return LoadsState::Gather;
4570 *POIter = L->getPointerOperand();
4571 ++POIter;
4572 }
4573
4574 Order.clear();
4575 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
4576 // Check the order of pointer operands or that all pointers are the same.
4577 bool IsSorted = sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order);
4578 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4579 if (!Order.empty() && !isPowerOf2_32(Value: VL.size())) {
4580 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4581 "supported with VectorizeNonPowerOf2");
4582 return LoadsState::Gather;
4583 }
4584
4585 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
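// Not sorted by a constant distance, but a runtime stride can be proven: try
// a strided load if the target supports it for this vector type.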
4586 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(Ty: VecTy) &&
4587 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment) &&
4588 calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
4589 return LoadsState::StridedVectorize;
4590 if (IsSorted || all_of(Range&: PointerOps, P: [&](Value *P) {
4591 return arePointersCompatible(Ptr1: P, Ptr2: PointerOps.front(), TLI: *TLI);
4592 })) {
4593 if (IsSorted) {
4594 Value *Ptr0;
4595 Value *PtrN;
4596 if (Order.empty()) {
4597 Ptr0 = PointerOps.front();
4598 PtrN = PointerOps.back();
4599 } else {
4600 Ptr0 = PointerOps[Order.front()];
4601 PtrN = PointerOps[Order.back()];
4602 }
4603 std::optional<int> Diff =
4604 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
4605 // Check that the sorted loads are consecutive.
4606 if (static_cast<unsigned>(*Diff) == Sz - 1)
4607 return LoadsState::Vectorize;
4608 // Quick check for a possibly strided access: the total distance must be a multiple of the number of gaps (Sz - 1).
4609 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4610 // Try to generate strided load node if:
4611 // 1. Target with strided load support is detected.
4612 // 2. The number of loads is greater than MinProfitableStridedLoads,
4613 // or the potential stride <= MaxProfitableLoadStride and the
4614 // potential stride is power-of-2 (to avoid perf regressions for the very
4615 // small number of loads) and max distance > number of loads, or potential
4616 // stride is -1.
4617 // 3. The loads are ordered, or number of unordered loads <=
4618 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4619 // (this check is to avoid extra costs for very expensive shuffles).
4620 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4621 (static_cast<unsigned>(std::abs(x: *Diff)) <=
4622 MaxProfitableLoadStride * Sz &&
4623 isPowerOf2_32(Value: std::abs(x: *Diff)))) &&
4624 static_cast<unsigned>(std::abs(x: *Diff)) > Sz) ||
4625 *Diff == -(static_cast<int>(Sz) - 1))) {
4626 int Stride = *Diff / static_cast<int>(Sz - 1);
4627 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4628 Align Alignment =
4629 cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()])
4630 ->getAlign();
4631 if (TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment)) {
4632 // Iterate through all pointers and check if all distances are
4633 // unique multiples of Stride.
4634 SmallSet<int, 4> Dists;
4635 for (Value *Ptr : PointerOps) {
4636 int Dist = 0;
4637 if (Ptr == PtrN)
4638 Dist = *Diff;
4639 else if (Ptr != Ptr0)
4640 Dist =
4641 *getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL: *DL, SE&: *SE);
4642 // If the strides are not the same or repeated, we can't
4643 // vectorize.
4644 if (((Dist / Stride) * Stride) != Dist ||
4645 !Dists.insert(V: Dist).second)
4646 break;
4647 }
4648 if (Dists.size() == Sz)
4649 return LoadsState::StridedVectorize;
4650 }
4651 }
4652 }
4653 }
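// Estimates whether splitting the bundle into smaller power-of-2 slices of
// consecutive/strided loads plus insert-subvector shuffles is cheaper than a
// single masked gather of the whole bundle.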
4654 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4655 unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
4656 unsigned MinVF = getMinVF(Sz);
4657 unsigned MaxVF = std::max<unsigned>(a: bit_floor(Value: VL.size() / 2), b: MinVF);
4658 MaxVF = std::min(a: getMaximumVF(ElemWidth: Sz, Opcode: Instruction::Load), b: MaxVF);
4659 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4660 unsigned VectorizedCnt = 0;
4661 SmallVector<LoadsState> States;
4662 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4663 Cnt += VF, ++VectorizedCnt) {
4664 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
4665 SmallVector<unsigned> Order;
4666 SmallVector<Value *> PointerOps;
4667 LoadsState LS =
4668 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps,
4669 /*TryRecursiveCheck=*/false);
4670 // If the slice can only be gathered, give up on this VF.
4671 if (LS == LoadsState::Gather)
4672 break;
4673 // If reordering is needed, treat it as a high-cost masked gather for now.
4674 if ((LS == LoadsState::Vectorize ||
4675 LS == LoadsState::StridedVectorize) &&
4676 !Order.empty() && !isReverseOrder(Order))
4677 LS = LoadsState::ScatterVectorize;
4678 States.push_back(Elt: LS);
4679 }
4680 // Can be vectorized later as a series of loads/insertelements.
4681 if (VectorizedCnt == VL.size() / VF) {
4682 // Compare the masked gather cost and the loads + insertsubvector costs.
4683 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
4684 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4685 TTI, Ptrs: PointerOps, BasePtr: PointerOps.front(), Opcode: Instruction::GetElementPtr,
4686 CostKind, ScalarTy, VecTy);
4687 InstructionCost MaskedGatherCost =
4688 TTI.getGatherScatterOpCost(
4689 Opcode: Instruction::Load, DataTy: VecTy,
4690 Ptr: cast<LoadInst>(Val: VL0)->getPointerOperand(),
4691 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind) +
4692 VectorGEPCost - ScalarGEPCost;
4693 InstructionCost VecLdCost = 0;
4694 auto *SubVecTy = getWidenedType(ScalarTy, VF);
4695 for (auto [I, LS] : enumerate(First&: States)) {
4696 auto *LI0 = cast<LoadInst>(Val: VL[I * VF]);
4697 switch (LS) {
4698 case LoadsState::Vectorize: {
4699 auto [ScalarGEPCost, VectorGEPCost] =
4700 getGEPCosts(TTI, Ptrs: ArrayRef(PointerOps).slice(N: I * VF, M: VF),
4701 BasePtr: LI0->getPointerOperand(), Opcode: Instruction::Load,
4702 CostKind, ScalarTy, VecTy: SubVecTy);
4703 VecLdCost += TTI.getMemoryOpCost(
4704 Opcode: Instruction::Load, Src: SubVecTy, Alignment: LI0->getAlign(),
4705 AddressSpace: LI0->getPointerAddressSpace(), CostKind,
4706 OpdInfo: TTI::OperandValueInfo()) +
4707 VectorGEPCost - ScalarGEPCost;
4708 break;
4709 }
4710 case LoadsState::StridedVectorize: {
4711 auto [ScalarGEPCost, VectorGEPCost] =
4712 getGEPCosts(TTI, Ptrs: ArrayRef(PointerOps).slice(N: I * VF, M: VF),
4713 BasePtr: LI0->getPointerOperand(), Opcode: Instruction::Load,
4714 CostKind, ScalarTy, VecTy: SubVecTy);
4715 VecLdCost +=
4716 TTI.getStridedMemoryOpCost(
4717 Opcode: Instruction::Load, DataTy: SubVecTy, Ptr: LI0->getPointerOperand(),
4718 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind) +
4719 VectorGEPCost - ScalarGEPCost;
4720 break;
4721 }
4722 case LoadsState::ScatterVectorize: {
4723 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4724 TTI, Ptrs: ArrayRef(PointerOps).slice(N: I * VF, M: VF),
4725 BasePtr: LI0->getPointerOperand(), Opcode: Instruction::GetElementPtr,
4726 CostKind, ScalarTy, VecTy: SubVecTy);
4727 VecLdCost +=
4728 TTI.getGatherScatterOpCost(
4729 Opcode: Instruction::Load, DataTy: SubVecTy, Ptr: LI0->getPointerOperand(),
4730 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind) +
4731 VectorGEPCost - ScalarGEPCost;
4732 break;
4733 }
4734 case LoadsState::Gather:
4735 llvm_unreachable(
4736 "Expected only consecutive, strided or masked gather loads.");
4737 }
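// Build a mask that inserts this VF-wide slice into its position within the
// full vector and account for the insert-subvector shuffle cost.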
4738 SmallVector<int> ShuffleMask(VL.size());
4739 for (int Idx : seq<int>(Begin: 0, End: VL.size()))
4740 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4741 VecLdCost +=
4742 TTI.getShuffleCost(Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: ShuffleMask,
4743 CostKind, Index: I * VF, SubTp: SubVecTy);
4744 }
4745 // If the masked gather cost is not lower, it is better to vectorize, so
4746 // consider this bundle as a gather node. It will be estimated more
4747 // precisely later.
4748 if (MaskedGatherCost >= VecLdCost)
4749 return true;
4750 }
4751 }
4752 return false;
4753 };
4754 // TODO: Improve the analysis of the pointers: if not all of them are GEPs,
4755 // or some have more than 2 operands, we end up with a gather node, which
4756 // just increases the cost.
4757 Loop *L = LI->getLoopFor(BB: cast<LoadInst>(Val: VL0)->getParent());
4758 bool ProfitableGatherPointers =
4759 L && Sz > 2 &&
4760 static_cast<unsigned>(count_if(Range&: PointerOps, P: [L](Value *V) {
4761 return L->isLoopInvariant(V);
4762 })) <= Sz / 2;
4763 if (ProfitableGatherPointers || all_of(Range&: PointerOps, P: [IsSorted](Value *P) {
4764 auto *GEP = dyn_cast<GetElementPtrInst>(Val: P);
4765 return (IsSorted && !GEP && doesNotNeedToBeScheduled(V: P)) ||
4766 (GEP && GEP->getNumOperands() == 2 &&
4767 isa<Constant, Instruction>(Val: GEP->getOperand(i_nocapture: 1)));
4768 })) {
4769 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4770 if (TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) &&
4771 !TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment)) {
4772 // Check if potential masked gather can be represented as series
4773 // of loads + insertsubvectors.
4774 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4775 // If the masked gather cost is not lower, it is better to vectorize, so
4776 // consider this bundle as a gather node. It will be estimated more
4777 // precisely later.
4778 return LoadsState::Gather;
4779 }
4780 return LoadsState::ScatterVectorize;
4781 }
4782 }
4783 }
4784
4785 return LoadsState::Gather;
4786}
4787
4788static bool clusterSortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
4789 const DataLayout &DL, ScalarEvolution &SE,
4790 SmallVectorImpl<unsigned> &SortedIndices) {
4791 assert(llvm::all_of(
4792 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4793 "Expected list of pointer operands.");
4794 // Map from bases to a vector of (Ptr, Offset, OrigIdx) tuples. Each Ptr is
4795 // inserted into the vector of its base; the vectors are then sorted by offset
4796 // and the sorted indices are returned with related values next to one another.
4797 MapVector<Value *, SmallVector<std::tuple<Value *, int, unsigned>>> Bases;
4798 Bases[VL[0]].push_back(Elt: std::make_tuple(args: VL[0], args: 0U, args: 0U));
4799
4800 unsigned Cnt = 1;
4801 for (Value *Ptr : VL.drop_front()) {
4802 bool Found = any_of(Range&: Bases, P: [&](auto &Base) {
4803 std::optional<int> Diff =
4804 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4805 /*StrictCheck=*/true);
4806 if (!Diff)
4807 return false;
4808
4809 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4810 return true;
4811 });
4812
4813 if (!Found) {
4814 // If we haven't found enough to usefully cluster, return early.
4815 if (Bases.size() > VL.size() / 2 - 1)
4816 return false;
4817
4818 // Not found already - add a new Base
4819 Bases[Ptr].emplace_back(Args&: Ptr, Args: 0, Args: Cnt++);
4820 }
4821 }
4822
4823 // For each of the bases sort the pointers by Offset and check if any of the
4824 // bases' pointers become consecutive.
4825 bool AnyConsecutive = false;
4826 for (auto &Base : Bases) {
4827 auto &Vec = Base.second;
4828 if (Vec.size() > 1) {
4829 llvm::stable_sort(Range&: Vec, C: [](const std::tuple<Value *, int, unsigned> &X,
4830 const std::tuple<Value *, int, unsigned> &Y) {
4831 return std::get<1>(t: X) < std::get<1>(t: Y);
4832 });
4833 int InitialOffset = std::get<1>(t&: Vec[0]);
4834 AnyConsecutive |= all_of(Range: enumerate(First&: Vec), P: [InitialOffset](const auto &P) {
4835 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4836 });
4837 }
4838 }
4839
4840 // Fill the SortedIndices array only if it looks worthwhile to sort the ptrs.
4841 SortedIndices.clear();
4842 if (!AnyConsecutive)
4843 return false;
4844
4845 for (auto &Base : Bases) {
4846 for (auto &T : Base.second)
4847 SortedIndices.push_back(Elt: std::get<2>(t&: T));
4848 }
4849
4850 assert(SortedIndices.size() == VL.size() &&
4851 "Expected SortedIndices to be the size of VL");
4852 return true;
4853}
4854
4855std::optional<BoUpSLP::OrdersType>
4856BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4857 assert(TE.isGather() && "Expected gather node only.");
4858 Type *ScalarTy = TE.Scalars[0]->getType();
4859
4860 SmallVector<Value *> Ptrs;
4861 Ptrs.reserve(N: TE.Scalars.size());
4862 for (Value *V : TE.Scalars) {
4863 auto *L = dyn_cast<LoadInst>(Val: V);
4864 if (!L || !L->isSimple())
4865 return std::nullopt;
4866 Ptrs.push_back(Elt: L->getPointerOperand());
4867 }
4868
4869 BoUpSLP::OrdersType Order;
4870 if (clusterSortPtrAccesses(VL: Ptrs, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
4871 return std::move(Order);
4872 return std::nullopt;
4873}
4874
4875/// Check if two insertelement instructions are from the same buildvector.
4876static bool areTwoInsertFromSameBuildVector(
4877 InsertElementInst *VU, InsertElementInst *V,
4878 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4879 // Instructions must be from the same basic block.
4880 if (VU->getParent() != V->getParent())
4881 return false;
4882 // Checks if 2 insertelements are from the same buildvector.
4883 if (VU->getType() != V->getType())
4884 return false;
4885 // Inserts with multiple uses are separate nodes.
4886 if (!VU->hasOneUse() && !V->hasOneUse())
4887 return false;
4888 auto *IE1 = VU;
4889 auto *IE2 = V;
4890 std::optional<unsigned> Idx1 = getElementIndex(Inst: IE1);
4891 std::optional<unsigned> Idx2 = getElementIndex(Inst: IE2);
4892 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4893 return false;
4894 // Go through the vector operand of insertelement instructions trying to find
4895 // either VU as the original vector for IE2 or V as the original vector for
4896 // IE1.
4897 SmallBitVector ReusedIdx(
4898 cast<VectorType>(Val: VU->getType())->getElementCount().getKnownMinValue());
4899 bool IsReusedIdx = false;
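// Walk both insertelement chains towards their base vectors, tracking which
// lanes have already been written; hitting a reused lane means the two chains
// do not form a single buildvector.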
4900 do {
4901 if (IE2 == VU && !IE1)
4902 return VU->hasOneUse();
4903 if (IE1 == V && !IE2)
4904 return V->hasOneUse();
4905 if (IE1 && IE1 != V) {
4906 unsigned Idx1 = getElementIndex(Inst: IE1).value_or(u&: *Idx2);
4907 IsReusedIdx |= ReusedIdx.test(Idx: Idx1);
4908 ReusedIdx.set(Idx1);
4909 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4910 IE1 = nullptr;
4911 else
4912 IE1 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE1));
4913 }
4914 if (IE2 && IE2 != VU) {
4915 unsigned Idx2 = getElementIndex(Inst: IE2).value_or(u&: *Idx1);
4916 IsReusedIdx |= ReusedIdx.test(Idx: Idx2);
4917 ReusedIdx.set(Idx2);
4918 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4919 IE2 = nullptr;
4920 else
4921 IE2 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE2));
4922 }
4923 } while (!IsReusedIdx && (IE1 || IE2));
4924 return false;
4925}
4926
4927std::optional<BoUpSLP::OrdersType>
4928BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4929 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4930 if (TE.isNonPowOf2Vec())
4931 return std::nullopt;
4932
4933 // No need to reorder if we need to shuffle reuses; the node still has to be
4934 // shuffled anyway.
4935 if (!TE.ReuseShuffleIndices.empty()) {
4936 if (isSplat(VL: TE.Scalars))
4937 return std::nullopt;
4938 // Check if reuse shuffle indices can be improved by reordering.
4939 // For this, check that the reuse mask is "clustered", i.e. each scalar value
4940 // is used once in each submask of size <number_of_scalars>.
4941 // Example: 4 scalar values.
4942 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4943 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4944 // element 3 is used twice in the second submask.
4945 unsigned Sz = TE.Scalars.size();
4946 if (TE.isGather()) {
4947 if (std::optional<OrdersType> CurrentOrder =
4948 findReusedOrderedScalars(TE)) {
4949 SmallVector<int> Mask;
4950 fixupOrderingIndices(Order: *CurrentOrder);
4951 inversePermutation(Indices: *CurrentOrder, Mask);
4952 ::addMask(Mask, SubMask: TE.ReuseShuffleIndices);
4953 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4954 unsigned Sz = TE.Scalars.size();
4955 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4956 for (auto [I, Idx] : enumerate(First: ArrayRef(Mask).slice(N: K * Sz, M: Sz)))
4957 if (Idx != PoisonMaskElem)
4958 Res[Idx + K * Sz] = I + K * Sz;
4959 }
4960 return std::move(Res);
4961 }
4962 }
4963 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4964 TTI->getNumberOfParts(Tp: getWidenedType(ScalarTy: TE.Scalars.front()->getType(),
4965 VF: 2 * TE.getVectorFactor())) == 1)
4966 return std::nullopt;
4967 if (!ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
4968 VF: Sz)) {
4969 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4970 if (TE.ReorderIndices.empty())
4971 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
4972 else
4973 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
4974 ::addMask(Mask&: ReorderMask, SubMask: TE.ReuseShuffleIndices);
4975 unsigned VF = ReorderMask.size();
4976 OrdersType ResOrder(VF, VF);
4977 unsigned NumParts = divideCeil(Numerator: VF, Denominator: Sz);
4978 SmallBitVector UsedVals(NumParts);
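// For each cluster of size Sz check that it reads from a single source value
// and record that value's position; clusters mixing different values, reusing
// a value or containing too many undefs make the reordering unprofitable.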
4979 for (unsigned I = 0; I < VF; I += Sz) {
4980 int Val = PoisonMaskElem;
4981 unsigned UndefCnt = 0;
4982 unsigned Limit = std::min(a: Sz, b: VF - I);
4983 if (any_of(Range: ArrayRef(ReorderMask).slice(N: I, M: Limit),
4984 P: [&](int Idx) {
4985 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4986 Val = Idx;
4987 if (Idx == PoisonMaskElem)
4988 ++UndefCnt;
4989 return Idx != PoisonMaskElem && Idx != Val;
4990 }) ||
4991 Val >= static_cast<int>(NumParts) || UsedVals.test(Idx: Val) ||
4992 UndefCnt > Sz / 2)
4993 return std::nullopt;
4994 UsedVals.set(Val);
4995 for (unsigned K = 0; K < NumParts; ++K)
4996 ResOrder[Val + Sz * K] = I + K;
4997 }
4998 return std::move(ResOrder);
4999 }
5000 unsigned VF = TE.getVectorFactor();
5001 // Try to build the correct order for extractelement instructions.
5002 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5003 TE.ReuseShuffleIndices.end());
5004 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
5005 all_of(Range: TE.Scalars, P: [Sz](Value *V) {
5006 std::optional<unsigned> Idx = getExtractIndex(E: cast<Instruction>(Val: V));
5007 return Idx && *Idx < Sz;
5008 })) {
5009 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5010 if (TE.ReorderIndices.empty())
5011 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
5012 else
5013 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
5014 for (unsigned I = 0; I < VF; ++I) {
5015 int &Idx = ReusedMask[I];
5016 if (Idx == PoisonMaskElem)
5017 continue;
5018 Value *V = TE.Scalars[ReorderMask[Idx]];
5019 std::optional<unsigned> EI = getExtractIndex(E: cast<Instruction>(Val: V));
5020 Idx = std::distance(first: ReorderMask.begin(), last: find(Range&: ReorderMask, Val: *EI));
5021 }
5022 }
5023 // Build the order of VF size; the reuses shuffles need to be reordered, as
5024 // they are always of VF size.
5025 OrdersType ResOrder(VF);
5026 std::iota(first: ResOrder.begin(), last: ResOrder.end(), value: 0);
5027 auto *It = ResOrder.begin();
5028 for (unsigned K = 0; K < VF; K += Sz) {
5029 OrdersType CurrentOrder(TE.ReorderIndices);
5030 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(N: K, M: Sz)};
5031 if (SubMask.front() == PoisonMaskElem)
5032 std::iota(first: SubMask.begin(), last: SubMask.end(), value: 0);
5033 reorderOrder(Order&: CurrentOrder, Mask: SubMask);
5034 transform(Range&: CurrentOrder, d_first: It, F: [K](unsigned Pos) { return Pos + K; });
5035 std::advance(i&: It, n: Sz);
5036 }
5037 if (TE.isGather() && all_of(Range: enumerate(First&: ResOrder), P: [](const auto &Data) {
5038 return Data.index() == Data.value();
5039 }))
5040 return std::nullopt; // No need to reorder.
5041 return std::move(ResOrder);
5042 }
5043 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5044 any_of(Range: TE.UserTreeIndices,
5045 P: [](const EdgeInfo &EI) {
5046 return !Instruction::isBinaryOp(Opcode: EI.UserTE->getOpcode());
5047 }) &&
5048 (TE.ReorderIndices.empty() || isReverseOrder(Order: TE.ReorderIndices)))
5049 return std::nullopt;
5050 if ((TE.State == TreeEntry::Vectorize ||
5051 TE.State == TreeEntry::StridedVectorize) &&
5052 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: TE.getMainOp()) ||
5053 (TopToBottom && isa<StoreInst, InsertElementInst>(Val: TE.getMainOp()))) &&
5054 !TE.isAltShuffle())
5055 return TE.ReorderIndices;
5056 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
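// Order the PHIs by the number of their uses and, for PHIs feeding the same
// buildvector or extracting from the same vector, by the lane they are
// inserted into/extracted from.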
5057 auto PHICompare = [&](unsigned I1, unsigned I2) {
5058 Value *V1 = TE.Scalars[I1];
5059 Value *V2 = TE.Scalars[I2];
5060 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5061 return false;
5062 if (V1->getNumUses() < V2->getNumUses())
5063 return true;
5064 if (V1->getNumUses() > V2->getNumUses())
5065 return false;
5066 auto *FirstUserOfPhi1 = cast<Instruction>(Val: *V1->user_begin());
5067 auto *FirstUserOfPhi2 = cast<Instruction>(Val: *V2->user_begin());
5068 if (auto *IE1 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi1))
5069 if (auto *IE2 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi2)) {
5070 if (!areTwoInsertFromSameBuildVector(
5071 VU: IE1, V: IE2,
5072 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); }))
5073 return I1 < I2;
5074 return getElementIndex(Inst: IE1) < getElementIndex(Inst: IE2);
5075 }
5076 if (auto *EE1 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi1))
5077 if (auto *EE2 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi2)) {
5078 if (EE1->getOperand(i_nocapture: 0) != EE2->getOperand(i_nocapture: 0))
5079 return I1 < I2;
5080 return getElementIndex(Inst: EE1) < getElementIndex(Inst: EE2);
5081 }
5082 return I1 < I2;
5083 };
5084 auto IsIdentityOrder = [](const OrdersType &Order) {
5085 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Order.size()))
5086 if (Idx != Order[Idx])
5087 return false;
5088 return true;
5089 };
5090 if (!TE.ReorderIndices.empty())
5091 return TE.ReorderIndices;
5092 DenseMap<unsigned, unsigned> PhiToId;
5093 SmallVector<unsigned> Phis(TE.Scalars.size());
5094 std::iota(first: Phis.begin(), last: Phis.end(), value: 0);
5095 OrdersType ResOrder(TE.Scalars.size());
5096 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
5097 PhiToId[Id] = Id;
5098 stable_sort(Range&: Phis, C: PHICompare);
5099 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
5100 ResOrder[Id] = PhiToId[Phis[Id]];
5101 if (IsIdentityOrder(ResOrder))
5102 return std::nullopt; // No need to reorder.
5103 return std::move(ResOrder);
5104 }
5105 if (TE.isGather() && !TE.isAltShuffle() && allSameType(VL: TE.Scalars)) {
5106 // TODO: add analysis of other gather nodes with extractelement
5107 // instructions and other values/instructions, not only undefs.
5108 if ((TE.getOpcode() == Instruction::ExtractElement ||
5109 (all_of(Range: TE.Scalars, P: IsaPred<UndefValue, ExtractElementInst>) &&
5110 any_of(Range: TE.Scalars, P: IsaPred<ExtractElementInst>))) &&
5111 all_of(Range: TE.Scalars, P: [](Value *V) {
5112 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
5113 return !EE || isa<FixedVectorType>(Val: EE->getVectorOperandType());
5114 })) {
5115 // Check that gather of extractelements can be represented as
5116 // just a shuffle of a single vector.
5117 OrdersType CurrentOrder;
5118 bool Reuse = canReuseExtract(VL: TE.Scalars, OpValue: TE.getMainOp(), CurrentOrder,
5119 /*ResizeAllowed=*/true);
5120 if (Reuse || !CurrentOrder.empty())
5121 return std::move(CurrentOrder);
5122 }
5123 // If the gather node is <undef, v, .., poison> and
5124 // insertelement poison, v, 0 [+ permute]
5125 // is cheaper than
5126 // insertelement poison, v, n - try to reorder.
5127 // If rotating the whole graph, exclude the permute cost, since the whole
5128 // graph might be transformed.
5129 int Sz = TE.Scalars.size();
5130 if (isSplat(VL: TE.Scalars) && !allConstant(VL: TE.Scalars) &&
5131 count_if(Range: TE.Scalars, P: IsaPred<UndefValue>) == Sz - 1) {
5132 const auto *It =
5133 find_if(Range: TE.Scalars, P: [](Value *V) { return !isConstant(V); });
5134 if (It == TE.Scalars.begin())
5135 return OrdersType();
5136 auto *Ty = getWidenedType(ScalarTy: TE.Scalars.front()->getType(), VF: Sz);
5137 if (It != TE.Scalars.end()) {
5138 OrdersType Order(Sz, Sz);
5139 unsigned Idx = std::distance(first: TE.Scalars.begin(), last: It);
5140 Order[Idx] = 0;
5141 fixupOrderingIndices(Order);
5142 SmallVector<int> Mask;
5143 inversePermutation(Indices: Order, Mask);
5144 InstructionCost PermuteCost =
5145 TopToBottom
5146 ? 0
5147 : TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: Ty, Mask);
5148 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5149 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: 0,
5150 Op0: PoisonValue::get(T: Ty), Op1: *It);
5151 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5152 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: Idx,
5153 Op0: PoisonValue::get(T: Ty), Op1: *It);
5154 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5155 OrdersType Order(Sz, Sz);
5156 Order[Idx] = 0;
5157 return std::move(Order);
5158 }
5159 }
5160 }
5161 if (isSplat(VL: TE.Scalars))
5162 return std::nullopt;
5163 if (TE.Scalars.size() >= 4)
5164 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5165 return Order;
5166 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5167 return CurrentOrder;
5168 }
5169 return std::nullopt;
5170}
5171
5172/// Checks if the given mask is a "clustered" mask with the same clusters of
5173/// size \p Sz, which are not identity submasks.
5174static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5175 unsigned Sz) {
5176 ArrayRef<int> FirstCluster = Mask.slice(N: 0, M: Sz);
5177 if (ShuffleVectorInst::isIdentityMask(Mask: FirstCluster, NumSrcElts: Sz))
5178 return false;
5179 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5180 ArrayRef<int> Cluster = Mask.slice(N: I, M: Sz);
5181 if (Cluster != FirstCluster)
5182 return false;
5183 }
5184 return true;
5185}
5186
5187void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5188 // Reorder reuses mask.
5189 reorderReuses(Reuses&: TE.ReuseShuffleIndices, Mask);
5190 const unsigned Sz = TE.Scalars.size();
5191 // For vectorized nodes and non-clustered reuses nothing else needs to be done.
5192 if (!TE.isGather() ||
5193 !ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
5194 VF: Sz) ||
5195 !isRepeatedNonIdentityClusteredMask(Mask: TE.ReuseShuffleIndices, Sz))
5196 return;
5197 SmallVector<int> NewMask;
5198 inversePermutation(Indices: TE.ReorderIndices, Mask&: NewMask);
5199 addMask(Mask&: NewMask, SubMask: TE.ReuseShuffleIndices);
5200 // Clear reorder since it is going to be applied to the new mask.
5201 TE.ReorderIndices.clear();
5202 // Try to improve gathered nodes with clustered reuses, if possible.
5203 ArrayRef<int> Slice = ArrayRef(NewMask).slice(N: 0, M: Sz);
5204 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
5205 inversePermutation(Indices: NewOrder, Mask&: NewMask);
5206 reorderScalars(Scalars&: TE.Scalars, Mask: NewMask);
5207 // Fill the reuses mask with the identity submasks.
5208 for (auto *It = TE.ReuseShuffleIndices.begin(),
5209 *End = TE.ReuseShuffleIndices.end();
5210 It != End; std::advance(i&: It, n: Sz))
5211 std::iota(first: It, last: std::next(x: It, n: Sz), value: 0);
5212}
5213
5214static void combineOrders(MutableArrayRef<unsigned> Order,
5215 ArrayRef<unsigned> SecondaryOrder) {
5216 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5217 "Expected same size of orders");
5218 unsigned Sz = Order.size();
5219 SmallBitVector UsedIndices(Sz);
5220 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz)) {
5221 if (Order[Idx] != Sz)
5222 UsedIndices.set(Order[Idx]);
5223 }
5224 if (SecondaryOrder.empty()) {
5225 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
5226 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5227 Order[Idx] = Idx;
5228 } else {
5229 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
5230 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5231 !UsedIndices.test(Idx: SecondaryOrder[Idx]))
5232 Order[Idx] = SecondaryOrder[Idx];
5233 }
5234}
5235
5236void BoUpSLP::reorderTopToBottom() {
5237 // Maps VF to the graph nodes.
5238 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5239 // ExtractElement gather nodes which can be vectorized and need to handle
5240 // their ordering.
5241 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5242
5243 // Phi nodes can have preferred ordering based on their result users
5244 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5245
5246 // AltShuffles can also have a preferred ordering that leads to fewer
5247 // instructions, e.g., the addsub instruction in x86.
5248 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5249
5250 // Maps a TreeEntry to the reorder indices of external users.
5251 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5252 ExternalUserReorderMap;
5253 // Find all reorderable nodes with the given VF.
5254 // Currently these are vectorized stores, loads, extracts + some gathering of
5255 // extracts.
5256 for_each(Range&: VectorizableTree, F: [&, &TTIRef = *TTI](
5257 const std::unique_ptr<TreeEntry> &TE) {
5258 // Look for external users that will probably be vectorized.
5259 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5260 findExternalStoreUsersReorderIndices(TE: TE.get());
5261 if (!ExternalUserReorderIndices.empty()) {
5262 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
5263 ExternalUserReorderMap.try_emplace(Key: TE.get(),
5264 Args: std::move(ExternalUserReorderIndices));
5265 }
5266
5267 // Patterns like [fadd,fsub] can be combined into a single instruction in
5268 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5269 // to take into account their order when looking for the most used order.
5270 if (TE->isAltShuffle()) {
5271 VectorType *VecTy =
5272 getWidenedType(ScalarTy: TE->Scalars[0]->getType(), VF: TE->Scalars.size());
5273 unsigned Opcode0 = TE->getOpcode();
5274 unsigned Opcode1 = TE->getAltOpcode();
5275 SmallBitVector OpcodeMask(getAltInstrMask(VL: TE->Scalars, Opcode0, Opcode1));
5276 // If this pattern is supported by the target then we consider the order.
5277 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5278 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
5279 AltShufflesToOrders.try_emplace(Key: TE.get(), Args: OrdersType());
5280 }
5281 // TODO: Check the reverse order too.
5282 }
5283
5284 if (std::optional<OrdersType> CurrentOrder =
5285 getReorderingData(TE: *TE, /*TopToBottom=*/true)) {
5286 // Do not include ordering for nodes used in the alt opcode vectorization;
5287 // it is better to reorder them during the bottom-to-top stage. If we follow
5288 // the order here, it causes reordering of the whole graph, though it is
5289 // actually profitable just to reorder the subgraph that starts from the
5290 // alternate opcode vectorization node. Such nodes already end up with a
5291 // shuffle instruction, and it is enough to change this shuffle rather than
5292 // rotate the scalars for the whole graph.
5293 unsigned Cnt = 0;
5294 const TreeEntry *UserTE = TE.get();
5295 while (UserTE && Cnt < RecursionMaxDepth) {
5296 if (UserTE->UserTreeIndices.size() != 1)
5297 break;
5298 if (all_of(Range: UserTE->UserTreeIndices, P: [](const EdgeInfo &EI) {
5299 return EI.UserTE->State == TreeEntry::Vectorize &&
5300 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5301 }))
5302 return;
5303 UserTE = UserTE->UserTreeIndices.back().UserTE;
5304 ++Cnt;
5305 }
5306 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
5307 if (!(TE->State == TreeEntry::Vectorize ||
5308 TE->State == TreeEntry::StridedVectorize) ||
5309 !TE->ReuseShuffleIndices.empty())
5310 GathersToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
5311 if (TE->State == TreeEntry::Vectorize &&
5312 TE->getOpcode() == Instruction::PHI)
5313 PhisToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
5314 }
5315 });
5316
5317 // Reorder the graph nodes according to their vectorization factor.
5318 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5319 VF /= 2) {
5320 auto It = VFToOrderedEntries.find(Val: VF);
5321 if (It == VFToOrderedEntries.end())
5322 continue;
5323 // Try to find the most profitable order. We are just looking for the most
5324 // used order and reorder the scalar elements in the nodes according to
5325 // this most used order.
5326 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5327 // All operands are reordered and used only in this node - propagate the
5328 // most used order to the user node.
5329 MapVector<OrdersType, unsigned,
5330 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5331 OrdersUses;
5332 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
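// Count how many entries prefer each particular order; the most frequently
// used order is later chosen for the whole group of nodes with this VF.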
5333 for (const TreeEntry *OpTE : OrderedEntries) {
5334 // No need to reorder these nodes; we still need to extend them and use a
5335 // shuffle, just merge the reordering shuffle and the reuse shuffle.
5336 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(Val: OpTE))
5337 continue;
5338 // Count number of orders uses.
5339 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5340 &PhisToOrders]() -> const OrdersType & {
5341 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
5342 auto It = GathersToOrders.find(Val: OpTE);
5343 if (It != GathersToOrders.end())
5344 return It->second;
5345 }
5346 if (OpTE->isAltShuffle()) {
5347 auto It = AltShufflesToOrders.find(Val: OpTE);
5348 if (It != AltShufflesToOrders.end())
5349 return It->second;
5350 }
5351 if (OpTE->State == TreeEntry::Vectorize &&
5352 OpTE->getOpcode() == Instruction::PHI) {
5353 auto It = PhisToOrders.find(Val: OpTE);
5354 if (It != PhisToOrders.end())
5355 return It->second;
5356 }
5357 return OpTE->ReorderIndices;
5358 }();
5359 // First consider the order of the external scalar users.
5360 auto It = ExternalUserReorderMap.find(Val: OpTE);
5361 if (It != ExternalUserReorderMap.end()) {
5362 const auto &ExternalUserReorderIndices = It->second;
5363 // If the OpTE vector factor != number of scalars, use the natural order;
5364 // this is an attempt to reorder a node with reused scalars but with
5365 // external uses.
5366 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5367 OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0)).first->second +=
5368 ExternalUserReorderIndices.size();
5369 } else {
5370 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5371 ++OrdersUses.insert(KV: std::make_pair(x: ExtOrder, y: 0)).first->second;
5372 }
5373 // No other useful reorder data in this entry.
5374 if (Order.empty())
5375 continue;
5376 }
5377 // Stores actually store the mask, not the order; we need to invert it.
5378 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5379 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5380 SmallVector<int> Mask;
5381 inversePermutation(Indices: Order, Mask);
5382 unsigned E = Order.size();
5383 OrdersType CurrentOrder(E, E);
5384 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
5385 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5386 });
5387 fixupOrderingIndices(Order: CurrentOrder);
5388 ++OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second;
5389 } else {
5390 ++OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second;
5391 }
5392 }
5393 if (OrdersUses.empty())
5394 continue;
5395 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5396 const unsigned Sz = Order.size();
5397 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
5398 if (Idx != Order[Idx] && Order[Idx] != Sz)
5399 return false;
5400 return true;
5401 };
5402 // Choose the most used order.
5403 unsigned IdentityCnt = 0;
5404 unsigned FilledIdentityCnt = 0;
5405 OrdersType IdentityOrder(VF, VF);
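// Merge all identity-compatible orders into a single candidate and remember
// how many uses it represents; it competes with the explicit orders below.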
5406 for (auto &Pair : OrdersUses) {
5407 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5408 if (!Pair.first.empty())
5409 FilledIdentityCnt += Pair.second;
5410 IdentityCnt += Pair.second;
5411 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
5412 }
5413 }
5414 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5415 unsigned Cnt = IdentityCnt;
5416 for (auto &Pair : OrdersUses) {
5417 // Prefer the identity order. But if a filled identity (non-empty order) was
5418 // found with the same number of uses as the new candidate order, we can
5419 // choose this candidate order instead.
5420 if (Cnt < Pair.second ||
5421 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5422 Cnt == Pair.second && !BestOrder.empty() &&
5423 IsIdentityOrder(BestOrder))) {
5424 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
5425 BestOrder = Pair.first;
5426 Cnt = Pair.second;
5427 } else {
5428 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
5429 }
5430 }
5431 // Set order of the user node.
5432 if (IsIdentityOrder(BestOrder))
5433 continue;
5434 fixupOrderingIndices(Order: BestOrder);
5435 SmallVector<int> Mask;
5436 inversePermutation(Indices: BestOrder, Mask);
5437 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5438 unsigned E = BestOrder.size();
5439 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
5440 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5441 });
5442 // Do an actual reordering, if profitable.
5443 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5444 // Just do the reordering for the nodes with the given VF.
5445 if (TE->Scalars.size() != VF) {
5446 if (TE->ReuseShuffleIndices.size() == VF) {
5447 // Need to reorder the reuses masks of the operands with smaller VF to
5448 // be able to find the match between the graph nodes and scalar
5449 // operands of the given node during vectorization/cost estimation.
5450 assert(all_of(TE->UserTreeIndices,
5451 [VF, &TE](const EdgeInfo &EI) {
5452 return EI.UserTE->Scalars.size() == VF ||
5453 EI.UserTE->Scalars.size() ==
5454 TE->Scalars.size();
5455 }) &&
5456 "All users must be of VF size.");
5457 // Update ordering of the operands with the smaller VF than the given
5458 // one.
5459 reorderNodeWithReuses(TE&: *TE, Mask);
5460 }
5461 continue;
5462 }
5463 if ((TE->State == TreeEntry::Vectorize ||
5464 TE->State == TreeEntry::StridedVectorize) &&
5465 isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
5466 InsertElementInst>(Val: TE->getMainOp()) &&
5467 !TE->isAltShuffle()) {
5468 // Build correct orders for extract{element,value}, loads and
5469 // stores.
5470 reorderOrder(Order&: TE->ReorderIndices, Mask);
5471 if (isa<InsertElementInst, StoreInst>(Val: TE->getMainOp()))
5472 TE->reorderOperands(Mask);
5473 } else {
5474 // Reorder the node and its operands.
5475 TE->reorderOperands(Mask);
5476 assert(TE->ReorderIndices.empty() &&
5477 "Expected empty reorder sequence.");
5478 reorderScalars(Scalars&: TE->Scalars, Mask);
5479 }
5480 if (!TE->ReuseShuffleIndices.empty()) {
5481 // Apply reversed order to keep the original ordering of the reused
5482 // elements to avoid extra reorder indices shuffling.
5483 OrdersType CurrentOrder;
5484 reorderOrder(Order&: CurrentOrder, Mask: MaskOrder);
5485 SmallVector<int> NewReuses;
5486 inversePermutation(Indices: CurrentOrder, Mask&: NewReuses);
5487 addMask(Mask&: NewReuses, SubMask: TE->ReuseShuffleIndices);
5488 TE->ReuseShuffleIndices.swap(RHS&: NewReuses);
5489 }
5490 }
5491 }
5492}
5493
5494bool BoUpSLP::canReorderOperands(
5495 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5496 ArrayRef<TreeEntry *> ReorderableGathers,
5497 SmallVectorImpl<TreeEntry *> &GatherOps) {
5498 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5499 if (UserTE->isNonPowOf2Vec())
5500 return false;
5501
5502 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5503 if (any_of(Range&: Edges, P: [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5504 return OpData.first == I &&
5505 (OpData.second->State == TreeEntry::Vectorize ||
5506 OpData.second->State == TreeEntry::StridedVectorize);
5507 }))
5508 continue;
5509 if (TreeEntry *TE = getVectorizedOperand(UserTE, OpIdx: I)) {
5510 // Do not reorder if the operand node is used by multiple user nodes.
5511 if (any_of(Range&: TE->UserTreeIndices,
5512 P: [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5513 return false;
5514 // Add the node to the list of the ordered nodes with the identity
5515 // order.
5516 Edges.emplace_back(Args&: I, Args&: TE);
5517 // Add ScatterVectorize nodes to the list of operands, where just
5518 // reordering of the scalars is required. Similar to the gathers, so
5519 // simply add to the list of gathered ops.
5520 // If there are reused scalars, process this node as a regular vectorize
5521 // node, just reorder reuses mask.
5522 if (TE->State != TreeEntry::Vectorize &&
5523 TE->State != TreeEntry::StridedVectorize &&
5524 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5525 GatherOps.push_back(Elt: TE);
5526 continue;
5527 }
5528 TreeEntry *Gather = nullptr;
5529 if (count_if(Range&: ReorderableGathers,
5530 P: [&Gather, UserTE, I](TreeEntry *TE) {
5531 assert(TE->State != TreeEntry::Vectorize &&
5532 TE->State != TreeEntry::StridedVectorize &&
5533 "Only non-vectorized nodes are expected.");
5534 if (any_of(Range&: TE->UserTreeIndices,
5535 P: [UserTE, I](const EdgeInfo &EI) {
5536 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5537 })) {
5538 assert(TE->isSame(UserTE->getOperand(I)) &&
5539 "Operand entry does not match operands.");
5540 Gather = TE;
5541 return true;
5542 }
5543 return false;
5544 }) > 1 &&
5545 !allConstant(VL: UserTE->getOperand(OpIdx: I)))
5546 return false;
5547 if (Gather)
5548 GatherOps.push_back(Elt: Gather);
5549 }
5550 return true;
5551}
5552
5553void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5554 SetVector<TreeEntry *> OrderedEntries;
5555 DenseSet<const TreeEntry *> GathersToOrders;
5556 // Find all reorderable leaf nodes with the given VF.
5557  // Currently these are vectorized loads and extracts without alternate
5558  // operands, plus some gathering of extracts.
5559 SmallVector<TreeEntry *> NonVectorized;
5560 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5561 if (TE->State != TreeEntry::Vectorize &&
5562 TE->State != TreeEntry::StridedVectorize)
5563 NonVectorized.push_back(Elt: TE.get());
5564 if (std::optional<OrdersType> CurrentOrder =
5565 getReorderingData(TE: *TE, /*TopToBottom=*/false)) {
5566 OrderedEntries.insert(X: TE.get());
5567 if (!(TE->State == TreeEntry::Vectorize ||
5568 TE->State == TreeEntry::StridedVectorize) ||
5569 !TE->ReuseShuffleIndices.empty())
5570 GathersToOrders.insert(V: TE.get());
5571 }
5572 }
5573
5574 // 1. Propagate order to the graph nodes, which use only reordered nodes.
5575 // I.e., if the node has operands, that are reordered, try to make at least
5576 // one operand order in the natural order and reorder others + reorder the
5577 // user node itself.
5578 SmallPtrSet<const TreeEntry *, 4> Visited;
5579 while (!OrderedEntries.empty()) {
5580 // 1. Filter out only reordered nodes.
5581 // 2. If the entry has multiple uses - skip it and jump to the next node.
5582 DenseMap<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
5583 SmallVector<TreeEntry *> Filtered;
5584 for (TreeEntry *TE : OrderedEntries) {
5585 if (!(TE->State == TreeEntry::Vectorize ||
5586 TE->State == TreeEntry::StridedVectorize ||
5587 (TE->isGather() && GathersToOrders.contains(V: TE))) ||
5588 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5589 !all_of(Range: drop_begin(RangeOrContainer&: TE->UserTreeIndices),
5590 P: [TE](const EdgeInfo &EI) {
5591 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5592 }) ||
5593 !Visited.insert(Ptr: TE).second) {
5594 Filtered.push_back(Elt: TE);
5595 continue;
5596 }
5597      // Build a map between user nodes and their operand order to speed up the
5598      // search. The graph currently does not provide this dependency directly.
5599 for (EdgeInfo &EI : TE->UserTreeIndices) {
5600 TreeEntry *UserTE = EI.UserTE;
5601 auto It = Users.find(Val: UserTE);
5602 if (It == Users.end())
5603 It = Users.insert(KV: {UserTE, {}}).first;
5604 It->second.emplace_back(Args&: EI.EdgeIdx, Args&: TE);
5605 }
5606 }
5607 // Erase filtered entries.
5608 for (TreeEntry *TE : Filtered)
5609 OrderedEntries.remove(X: TE);
5610 SmallVector<
5611 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5612 UsersVec(Users.begin(), Users.end());
5613 sort(C&: UsersVec, Comp: [](const auto &Data1, const auto &Data2) {
5614 return Data1.first->Idx > Data2.first->Idx;
5615 });
5616 for (auto &Data : UsersVec) {
5617 // Check that operands are used only in the User node.
5618 SmallVector<TreeEntry *> GatherOps;
5619 if (!canReorderOperands(UserTE: Data.first, Edges&: Data.second, ReorderableGathers: NonVectorized,
5620 GatherOps)) {
5621 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5622 OrderedEntries.remove(X: Op.second);
5623 continue;
5624 }
5625 // All operands are reordered and used only in this node - propagate the
5626 // most used order to the user node.
5627 MapVector<OrdersType, unsigned,
5628 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
5629 OrdersUses;
5630      // Do the analysis for each tree entry only once, otherwise the order of
5631      // the same node may be considered several times, even though it might not
5632      // be profitable.
5633 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
5634 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
5635 for (const auto &Op : Data.second) {
5636 TreeEntry *OpTE = Op.second;
5637 if (!VisitedOps.insert(Ptr: OpTE).second)
5638 continue;
5639 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(V: OpTE))
5640 continue;
5641 const auto Order = [&]() -> const OrdersType {
5642 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
5643 return getReorderingData(TE: *OpTE, /*TopToBottom=*/false)
5644 .value_or(u: OrdersType(1));
5645 return OpTE->ReorderIndices;
5646 }();
5647 // The order is partially ordered, skip it in favor of fully non-ordered
5648 // orders.
5649 if (Order.size() == 1)
5650 continue;
5651 unsigned NumOps = count_if(
5652 Range&: Data.second, P: [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5653 return P.second == OpTE;
5654 });
5655        // Stores actually record the mask, not the order; we need to invert it.
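        // E.g., if the stored Order is {1, 2, 0}, the inverted order accumulated
        // into OrdersUses is {2, 0, 1}.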
5656 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5657 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5658 SmallVector<int> Mask;
5659 inversePermutation(Indices: Order, Mask);
5660 unsigned E = Order.size();
5661 OrdersType CurrentOrder(E, E);
5662 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
5663 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5664 });
5665 fixupOrderingIndices(Order: CurrentOrder);
5666 OrdersUses.insert(KV: std::make_pair(x&: CurrentOrder, y: 0)).first->second +=
5667 NumOps;
5668 } else {
5669 OrdersUses.insert(KV: std::make_pair(x: Order, y: 0)).first->second += NumOps;
5670 }
5671 auto Res = OrdersUses.insert(KV: std::make_pair(x: OrdersType(), y: 0));
5672 const auto AllowsReordering = [&](const TreeEntry *TE) {
5673 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5674 if (TE->isNonPowOf2Vec())
5675 return false;
5676 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5677 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5678 (IgnoreReorder && TE->Idx == 0))
5679 return true;
5680 if (TE->isGather()) {
5681 if (GathersToOrders.contains(V: TE))
5682 return !getReorderingData(TE: *TE, /*TopToBottom=*/false)
5683 .value_or(u: OrdersType(1))
5684 .empty();
5685 return true;
5686 }
5687 return false;
5688 };
5689 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5690 TreeEntry *UserTE = EI.UserTE;
5691 if (!VisitedUsers.insert(Ptr: UserTE).second)
5692 continue;
5693 // May reorder user node if it requires reordering, has reused
5694 // scalars, is an alternate op vectorize node or its op nodes require
5695 // reordering.
5696 if (AllowsReordering(UserTE))
5697 continue;
5698 // Check if users allow reordering.
5699          // Currently we look up just 1 level of operands to avoid an increase
5700          // in compile time.
5701          // It is profitable to reorder if definitely more operands allow
5702          // reordering than those that keep the natural order.
5703 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users[UserTE];
5704 if (static_cast<unsigned>(count_if(
5705 Range&: Ops, P: [UserTE, &AllowsReordering](
5706 const std::pair<unsigned, TreeEntry *> &Op) {
5707 return AllowsReordering(Op.second) &&
5708 all_of(Range&: Op.second->UserTreeIndices,
5709 P: [UserTE](const EdgeInfo &EI) {
5710 return EI.UserTE == UserTE;
5711 });
5712 })) <= Ops.size() / 2)
5713 ++Res.first->second;
5714 }
5715 }
5716 if (OrdersUses.empty()) {
5717 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5718 OrderedEntries.remove(X: Op.second);
5719 continue;
5720 }
5721 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5722 const unsigned Sz = Order.size();
5723 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
5724 if (Idx != Order[Idx] && Order[Idx] != Sz)
5725 return false;
5726 return true;
5727 };
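      // E.g., with Sz == 4, both {0, 1, 2, 3} and {0, 4, 2, 4} are treated as
      // identity, since an element equal to Sz acts as an "any position"
      // placeholder.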
5728 // Choose the most used order.
5729 unsigned IdentityCnt = 0;
5730 unsigned VF = Data.second.front().second->getVectorFactor();
5731 OrdersType IdentityOrder(VF, VF);
5732 for (auto &Pair : OrdersUses) {
5733 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5734 IdentityCnt += Pair.second;
5735 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
5736 }
5737 }
5738 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5739 unsigned Cnt = IdentityCnt;
5740 for (auto &Pair : OrdersUses) {
5741        // Prefer the identity order. But if a filled identity (non-empty order)
5742        // was found with the same number of uses as the new candidate order, we
5743        // can choose this candidate order.
5744 if (Cnt < Pair.second) {
5745 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
5746 BestOrder = Pair.first;
5747 Cnt = Pair.second;
5748 } else {
5749 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
5750 }
5751 }
5752 // Set order of the user node.
5753 if (IsIdentityOrder(BestOrder)) {
5754 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5755 OrderedEntries.remove(X: Op.second);
5756 continue;
5757 }
5758 fixupOrderingIndices(Order: BestOrder);
5759 // Erase operands from OrderedEntries list and adjust their orders.
5760 VisitedOps.clear();
5761 SmallVector<int> Mask;
5762 inversePermutation(Indices: BestOrder, Mask);
5763 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5764 unsigned E = BestOrder.size();
5765 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
5766 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5767 });
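      // E.g., BestOrder == {2, 0, 1, 3} gives Mask == {1, 2, 0, 3} (the inverse
      // permutation) and MaskOrder == {2, 0, 1, 3}.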
5768 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5769 TreeEntry *TE = Op.second;
5770 OrderedEntries.remove(X: TE);
5771 if (!VisitedOps.insert(Ptr: TE).second)
5772 continue;
5773 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5774 reorderNodeWithReuses(TE&: *TE, Mask);
5775 continue;
5776 }
5777 // Gathers are processed separately.
5778 if (TE->State != TreeEntry::Vectorize &&
5779 TE->State != TreeEntry::StridedVectorize &&
5780 (TE->State != TreeEntry::ScatterVectorize ||
5781 TE->ReorderIndices.empty()))
5782 continue;
5783 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5784 TE->ReorderIndices.empty()) &&
5785 "Non-matching sizes of user/operand entries.");
5786 reorderOrder(Order&: TE->ReorderIndices, Mask);
5787 if (IgnoreReorder && TE == VectorizableTree.front().get())
5788 IgnoreReorder = false;
5789 }
5790      // For gathers we just need to reorder their scalars.
5791 for (TreeEntry *Gather : GatherOps) {
5792 assert(Gather->ReorderIndices.empty() &&
5793 "Unexpected reordering of gathers.");
5794 if (!Gather->ReuseShuffleIndices.empty()) {
5795 // Just reorder reuses indices.
5796 reorderReuses(Reuses&: Gather->ReuseShuffleIndices, Mask);
5797 continue;
5798 }
5799 reorderScalars(Scalars&: Gather->Scalars, Mask);
5800 OrderedEntries.remove(X: Gather);
5801 }
5802 // Reorder operands of the user node and set the ordering for the user
5803 // node itself.
5804 if (Data.first->State != TreeEntry::Vectorize ||
5805 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5806 Val: Data.first->getMainOp()) ||
5807 Data.first->isAltShuffle())
5808 Data.first->reorderOperands(Mask);
5809 if (!isa<InsertElementInst, StoreInst>(Val: Data.first->getMainOp()) ||
5810 Data.first->isAltShuffle() ||
5811 Data.first->State == TreeEntry::StridedVectorize) {
5812 reorderScalars(Scalars&: Data.first->Scalars, Mask);
5813 reorderOrder(Order&: Data.first->ReorderIndices, Mask: MaskOrder,
5814 /*BottomOrder=*/true);
5815 if (Data.first->ReuseShuffleIndices.empty() &&
5816 !Data.first->ReorderIndices.empty() &&
5817 !Data.first->isAltShuffle()) {
5818 // Insert user node to the list to try to sink reordering deeper in
5819 // the graph.
5820 OrderedEntries.insert(X: Data.first);
5821 }
5822 } else {
5823 reorderOrder(Order&: Data.first->ReorderIndices, Mask);
5824 }
5825 }
5826 }
5827 // If the reordering is unnecessary, just remove the reorder.
5828 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5829 VectorizableTree.front()->ReuseShuffleIndices.empty())
5830 VectorizableTree.front()->ReorderIndices.clear();
5831}
5832
5833void BoUpSLP::buildExternalUses(
5834 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5835 DenseMap<Value *, unsigned> ScalarToExtUses;
5836 // Collect the values that we need to extract from the tree.
5837 for (auto &TEPtr : VectorizableTree) {
5838 TreeEntry *Entry = TEPtr.get();
5839
5840 // No need to handle users of gathered values.
5841 if (Entry->isGather())
5842 continue;
5843
5844 // For each lane:
5845 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5846 Value *Scalar = Entry->Scalars[Lane];
5847 if (!isa<Instruction>(Val: Scalar))
5848 continue;
5849 // All uses must be replaced already? No need to do it again.
5850 auto It = ScalarToExtUses.find(Val: Scalar);
5851 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5852 continue;
5853
5854 // Check if the scalar is externally used as an extra arg.
5855 const auto *ExtI = ExternallyUsedValues.find(Key: Scalar);
5856 if (ExtI != ExternallyUsedValues.end()) {
5857 int FoundLane = Entry->findLaneForValue(V: Scalar);
5858 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5859 << FoundLane << " from " << *Scalar << ".\n");
5860 ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size());
5861 ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: FoundLane);
5862 continue;
5863 }
5864 for (User *U : Scalar->users()) {
5865 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5866
5867 Instruction *UserInst = dyn_cast<Instruction>(Val: U);
5868 if (!UserInst || isDeleted(I: UserInst))
5869 continue;
5870
5871 // Ignore users in the user ignore list.
5872 if (UserIgnoreList && UserIgnoreList->contains(V: UserInst))
5873 continue;
5874
5875 // Skip in-tree scalars that become vectors
5876 if (TreeEntry *UseEntry = getTreeEntry(V: U)) {
5877 // Some in-tree scalars will remain as scalar in vectorized
5878 // instructions. If that is the case, the one in FoundLane will
5879 // be used.
5880 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5881 !doesInTreeUserNeedToExtract(
5882 Scalar, UserInst: cast<Instruction>(Val: UseEntry->Scalars.front()), TLI)) {
5883 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5884 << ".\n");
5885 assert(!UseEntry->isGather() && "Bad state");
5886 continue;
5887 }
5888 U = nullptr;
5889 if (It != ScalarToExtUses.end()) {
5890 ExternalUses[It->second].User = nullptr;
5891 break;
5892 }
5893 }
5894
5895 if (U && Scalar->hasNUsesOrMore(N: UsesLimit))
5896 U = nullptr;
5897 int FoundLane = Entry->findLaneForValue(V: Scalar);
5898 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5899 << " from lane " << FoundLane << " from " << *Scalar
5900 << ".\n");
5901 It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
5902 ExternalUses.emplace_back(Args&: Scalar, Args&: U, Args&: FoundLane);
5903 if (!U)
5904 break;
5905 }
5906 }
5907 }
5908}
5909
5910DenseMap<Value *, SmallVector<StoreInst *>>
5911BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5912 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
5913 for (unsigned Lane : seq<unsigned>(Begin: 0, End: TE->Scalars.size())) {
5914 Value *V = TE->Scalars[Lane];
5915 // To save compilation time we don't visit if we have too many users.
5916 if (V->hasNUsesOrMore(N: UsesLimit))
5917 break;
5918
5919 // Collect stores per pointer object.
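    // E.g., stores into a[0] and a[1] share the underlying object `a` and end
    // up in the same PtrToStoresMap bucket.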
5920 for (User *U : V->users()) {
5921 auto *SI = dyn_cast<StoreInst>(Val: U);
5922 if (SI == nullptr || !SI->isSimple() ||
5923 !isValidElementType(Ty: SI->getValueOperand()->getType()))
5924 continue;
5925      // Skip the store if it is already part of the tree.
5926 if (getTreeEntry(V: U))
5927 continue;
5928
5929 Value *Ptr = getUnderlyingObject(V: SI->getPointerOperand());
5930 auto &StoresVec = PtrToStoresMap[Ptr];
5931 // For now just keep one store per pointer object per lane.
5932 // TODO: Extend this to support multiple stores per pointer per lane
5933 if (StoresVec.size() > Lane)
5934 continue;
5935 // Skip if in different BBs.
5936 if (!StoresVec.empty() &&
5937 SI->getParent() != StoresVec.back()->getParent())
5938 continue;
5939 // Make sure that the stores are of the same type.
5940 if (!StoresVec.empty() &&
5941 SI->getValueOperand()->getType() !=
5942 StoresVec.back()->getValueOperand()->getType())
5943 continue;
5944 StoresVec.push_back(Elt: SI);
5945 }
5946 }
5947 return PtrToStoresMap;
5948}
5949
5950bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5951 OrdersType &ReorderIndices) const {
5952  // We check whether the stores in StoresVec can form a vector by sorting them
5953  // and checking whether they are consecutive.
5954
5955 // To avoid calling getPointersDiff() while sorting we create a vector of
5956 // pairs {store, offset from first} and sort this instead.
5957 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5958 StoreInst *S0 = StoresVec[0];
5959 StoreOffsetVec[0] = {S0, 0};
5960 Type *S0Ty = S0->getValueOperand()->getType();
5961 Value *S0Ptr = S0->getPointerOperand();
5962 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoresVec.size())) {
5963 StoreInst *SI = StoresVec[Idx];
5964 std::optional<int> Diff =
5965 getPointersDiff(ElemTyA: S0Ty, PtrA: S0Ptr, ElemTyB: SI->getValueOperand()->getType(),
5966 PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
5967 /*StrictCheck=*/true);
5968 // We failed to compare the pointers so just abandon this StoresVec.
5969 if (!Diff)
5970 return false;
5971 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5972 }
5973
5974 // Sort the vector based on the pointers. We create a copy because we may
5975 // need the original later for calculating the reorder (shuffle) indices.
5976 stable_sort(Range&: StoreOffsetVec, C: [](const std::pair<StoreInst *, int> &Pair1,
5977 const std::pair<StoreInst *, int> &Pair2) {
5978 int Offset1 = Pair1.second;
5979 int Offset2 = Pair2.second;
5980 return Offset1 < Offset2;
5981 });
5982
5983 // Check if the stores are consecutive by checking if their difference is 1.
5984 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoreOffsetVec.size()))
5985 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5986 return false;
5987
5988 // Calculate the shuffle indices according to their offset against the sorted
5989 // StoreOffsetVec.
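  // E.g., for stores to p+2, p+0, p+1, p+3 (element offsets relative to the
  // first store: {0, -2, -1, 1}), the sorted offsets are {-2, -1, 0, 1} and
  // ReorderIndices becomes {2, 0, 1, 3}.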
5990 ReorderIndices.reserve(N: StoresVec.size());
5991 for (StoreInst *SI : StoresVec) {
5992 unsigned Idx = find_if(Range&: StoreOffsetVec,
5993 P: [SI](const std::pair<StoreInst *, int> &Pair) {
5994 return Pair.first == SI;
5995 }) -
5996 StoreOffsetVec.begin();
5997 ReorderIndices.push_back(Elt: Idx);
5998 }
5999 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6000 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6001 // same convention here.
6002 auto IsIdentityOrder = [](const OrdersType &Order) {
6003 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Order.size()))
6004 if (Idx != Order[Idx])
6005 return false;
6006 return true;
6007 };
6008 if (IsIdentityOrder(ReorderIndices))
6009 ReorderIndices.clear();
6010
6011 return true;
6012}
6013
6014#ifndef NDEBUG
6015LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
6016 for (unsigned Idx : Order)
6017 dbgs() << Idx << ", ";
6018 dbgs() << "\n";
6019}
6020#endif
6021
6022SmallVector<BoUpSLP::OrdersType, 1>
6023BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6024 unsigned NumLanes = TE->Scalars.size();
6025
6026 DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap =
6027 collectUserStores(TE);
6028
6029 // Holds the reorder indices for each candidate store vector that is a user of
6030 // the current TreeEntry.
6031 SmallVector<OrdersType, 1> ExternalReorderIndices;
6032
6033 // Now inspect the stores collected per pointer and look for vectorization
6034 // candidates. For each candidate calculate the reorder index vector and push
6035  // it into `ExternalReorderIndices`.
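  // E.g., a TreeEntry with 4 scalars whose lanes feed 4 stores to consecutive
  // addresses (one store per lane, same underlying pointer object) contributes
  // a single reorder-index vector here.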
6036 for (const auto &Pair : PtrToStoresMap) {
6037 auto &StoresVec = Pair.second;
6038 // If we have fewer than NumLanes stores, then we can't form a vector.
6039 if (StoresVec.size() != NumLanes)
6040 continue;
6041
6042 // If the stores are not consecutive then abandon this StoresVec.
6043 OrdersType ReorderIndices;
6044 if (!canFormVector(StoresVec, ReorderIndices))
6045 continue;
6046
6047 // We now know that the scalars in StoresVec can form a vector instruction,
6048 // so set the reorder indices.
6049 ExternalReorderIndices.push_back(Elt: ReorderIndices);
6050 }
6051 return ExternalReorderIndices;
6052}
6053
6054void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6055 const SmallDenseSet<Value *> &UserIgnoreLst) {
6056 deleteTree();
6057 UserIgnoreList = &UserIgnoreLst;
6058 if (!allSameType(VL: Roots))
6059 return;
6060 buildTree_rec(Roots, Depth: 0, EI: EdgeInfo());
6061}
6062
6063void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6064 deleteTree();
6065 if (!allSameType(VL: Roots))
6066 return;
6067 buildTree_rec(Roots, Depth: 0, EI: EdgeInfo());
6068}
6069
6070/// \return true if the specified list of values has only one instruction that
6071/// requires scheduling, false otherwise.
6072#ifndef NDEBUG
6073static bool needToScheduleSingleInstruction(ArrayRef<Value *> VL) {
6074 Value *NeedsScheduling = nullptr;
6075 for (Value *V : VL) {
6076 if (doesNotNeedToBeScheduled(V))
6077 continue;
6078 if (!NeedsScheduling) {
6079 NeedsScheduling = V;
6080 continue;
6081 }
6082 return false;
6083 }
6084 return NeedsScheduling;
6085}
6086#endif
6087
6088/// Generates a key/subkey pair for the given value to provide effective sorting
6089/// of the values and better detection of vectorizable value sequences. The
6090/// keys/subkeys can be used for better sorting of the values themselves (keys)
6091/// and within value subgroups (subkeys).
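/// For example, simple loads get a key derived from the Load opcode and the
/// load type, while their subkey comes from \p LoadsSubkeyGenerator (which
/// typically groups loads by the distance between their pointers).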
6092static std::pair<size_t, size_t> generateKeySubkey(
6093 Value *V, const TargetLibraryInfo *TLI,
6094 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
6095 bool AllowAlternate) {
6096 hash_code Key = hash_value(value: V->getValueID() + 2);
6097 hash_code SubKey = hash_value(value: 0);
6098 // Sort the loads by the distance between the pointers.
6099 if (auto *LI = dyn_cast<LoadInst>(Val: V)) {
6100 Key = hash_combine(args: LI->getType(), args: hash_value(value: Instruction::Load), args: Key);
6101 if (LI->isSimple())
6102 SubKey = hash_value(code: LoadsSubkeyGenerator(Key, LI));
6103 else
6104 Key = SubKey = hash_value(ptr: LI);
6105 } else if (isVectorLikeInstWithConstOps(V)) {
6106 // Sort extracts by the vector operands.
6107 if (isa<ExtractElementInst, UndefValue>(Val: V))
6108 Key = hash_value(value: Value::UndefValueVal + 1);
6109 if (auto *EI = dyn_cast<ExtractElementInst>(Val: V)) {
6110 if (!isUndefVector(V: EI->getVectorOperand()).all() &&
6111 !isa<UndefValue>(Val: EI->getIndexOperand()))
6112 SubKey = hash_value(ptr: EI->getVectorOperand());
6113 }
6114 } else if (auto *I = dyn_cast<Instruction>(Val: V)) {
6115    // Sort other instructions just by the opcode, except for CmpInst.
6116    // For CmpInst also sort by the predicate kind.
6117 if ((isa<BinaryOperator, CastInst>(Val: I)) &&
6118 isValidForAlternation(Opcode: I->getOpcode())) {
6119 if (AllowAlternate)
6120 Key = hash_value(value: isa<BinaryOperator>(Val: I) ? 1 : 0);
6121 else
6122 Key = hash_combine(args: hash_value(value: I->getOpcode()), args: Key);
6123 SubKey = hash_combine(
6124 args: hash_value(value: I->getOpcode()), args: hash_value(ptr: I->getType()),
6125 args: hash_value(ptr: isa<BinaryOperator>(Val: I)
6126 ? I->getType()
6127 : cast<CastInst>(Val: I)->getOperand(i_nocapture: 0)->getType()));
6128 // For casts, look through the only operand to improve compile time.
6129 if (isa<CastInst>(Val: I)) {
6130 std::pair<size_t, size_t> OpVals =
6131 generateKeySubkey(V: I->getOperand(i: 0), TLI, LoadsSubkeyGenerator,
6132 /*AllowAlternate=*/true);
6133 Key = hash_combine(args: OpVals.first, args: Key);
6134 SubKey = hash_combine(args: OpVals.first, args: SubKey);
6135 }
6136 } else if (auto *CI = dyn_cast<CmpInst>(Val: I)) {
6137 CmpInst::Predicate Pred = CI->getPredicate();
6138 if (CI->isCommutative())
6139 Pred = std::min(a: Pred, b: CmpInst::getInversePredicate(pred: Pred));
6140 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(pred: Pred);
6141 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: Pred),
6142 args: hash_value(value: SwapPred),
6143 args: hash_value(ptr: CI->getOperand(i_nocapture: 0)->getType()));
6144 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
6145 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI);
6146 if (isTriviallyVectorizable(ID)) {
6147 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: ID));
6148 } else if (!VFDatabase(*Call).getMappings(CI: *Call).empty()) {
6149 SubKey = hash_combine(args: hash_value(value: I->getOpcode()),
6150 args: hash_value(ptr: Call->getCalledFunction()));
6151 } else {
6152 Key = hash_combine(args: hash_value(ptr: Call), args: Key);
6153 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(ptr: Call));
6154 }
6155 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
6156 SubKey = hash_combine(args: hash_value(value: Op.Begin), args: hash_value(value: Op.End),
6157 args: hash_value(ptr: Op.Tag), args: SubKey);
6158 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
6159 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Val: Gep->getOperand(i_nocapture: 1)))
6160 SubKey = hash_value(ptr: Gep->getPointerOperand());
6161 else
6162 SubKey = hash_value(ptr: Gep);
6163 } else if (BinaryOperator::isIntDivRem(Opcode: I->getOpcode()) &&
6164 !isa<ConstantInt>(Val: I->getOperand(i: 1))) {
6165 // Do not try to vectorize instructions with potentially high cost.
6166 SubKey = hash_value(ptr: I);
6167 } else {
6168 SubKey = hash_value(value: I->getOpcode());
6169 }
6170 Key = hash_combine(args: hash_value(ptr: I->getParent()), args: Key);
6171 }
6172 return std::make_pair(x&: Key, y&: SubKey);
6173}
6174
6175/// Checks if the specified instruction \p I is an alternate operation for
6176/// the given \p MainOp and \p AltOp instructions.
6177static bool isAlternateInstruction(const Instruction *I,
6178 const Instruction *MainOp,
6179 const Instruction *AltOp,
6180 const TargetLibraryInfo &TLI);
6181
6182bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
6183 ArrayRef<Value *> VL) const {
6184 unsigned Opcode0 = S.getOpcode();
6185 unsigned Opcode1 = S.getAltOpcode();
6186 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
6187 // If this pattern is supported by the target then consider it profitable.
6188 if (TTI->isLegalAltInstr(VecTy: getWidenedType(ScalarTy: S.MainOp->getType(), VF: VL.size()),
6189 Opcode0, Opcode1, OpcodeMask))
6190 return true;
6191 SmallVector<ValueList> Operands;
6192 for (unsigned I : seq<unsigned>(Begin: 0, End: S.MainOp->getNumOperands())) {
6193 Operands.emplace_back();
6194 // Prepare the operand vector.
6195 for (Value *V : VL)
6196 Operands.back().push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
6197 }
6198 if (Operands.size() == 2) {
6199    // Try to find the best operand candidates.
6200 for (unsigned I : seq<unsigned>(Begin: 0, End: VL.size() - 1)) {
6201 SmallVector<std::pair<Value *, Value *>> Candidates(3);
6202 Candidates[0] = std::make_pair(x&: Operands[0][I], y&: Operands[0][I + 1]);
6203 Candidates[1] = std::make_pair(x&: Operands[0][I], y&: Operands[1][I + 1]);
6204 Candidates[2] = std::make_pair(x&: Operands[1][I], y&: Operands[0][I + 1]);
6205 std::optional<int> Res = findBestRootPair(Candidates);
6206 switch (Res.value_or(u: 0)) {
6207 case 0:
6208 break;
6209 case 1:
6210 std::swap(a&: Operands[0][I + 1], b&: Operands[1][I + 1]);
6211 break;
6212 case 2:
6213 std::swap(a&: Operands[0][I], b&: Operands[1][I]);
6214 break;
6215 default:
6216 llvm_unreachable("Unexpected index.");
6217 }
6218 }
6219 }
6220 DenseSet<unsigned> UniqueOpcodes;
6221 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
6222 unsigned NonInstCnt = 0;
6223  // Estimate the number of instructions required for the vectorized node and
6224  // for the buildvector node.
6225 unsigned UndefCnt = 0;
6226 // Count the number of extra shuffles, required for vector nodes.
6227 unsigned ExtraShuffleInsts = 0;
6228  // Check that the operands do not contain the same values and create either a
6229  // perfect diamond match or a shuffled match.
6230 if (Operands.size() == 2) {
6231 // Do not count same operands twice.
6232 if (Operands.front() == Operands.back()) {
6233 Operands.erase(CI: Operands.begin());
6234 } else if (!allConstant(VL: Operands.front()) &&
6235 all_of(Range&: Operands.front(), P: [&](Value *V) {
6236 return is_contained(Range&: Operands.back(), Element: V);
6237 })) {
6238 Operands.erase(CI: Operands.begin());
6239 ++ExtraShuffleInsts;
6240 }
6241 }
6242 const Loop *L = LI->getLoopFor(BB: S.MainOp->getParent());
6243  // Vectorize the node if:
6244  // 1. At least a single operand is constant or splat.
6245  // 2. Operands have many loop invariants (the instructions themselves are not
6246  //    loop invariant).
6247  // 3. At least a single unique operand is supposed to be vectorized.
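  // For instance, if one operand vector is all-constant and the other consists
  // of distinct same-opcode instructions from a single block, both operands
  // generally pass the early check in the none_of below and the node is
  // treated as profitable.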
6248 return none_of(Range&: Operands,
6249 P: [&](ArrayRef<Value *> Op) {
6250 if (allConstant(VL: Op) ||
6251 (!isSplat(VL: Op) && allSameBlock(VL: Op) && allSameType(VL: Op) &&
6252 getSameOpcode(VL: Op, TLI: *TLI).MainOp))
6253 return false;
6254 DenseMap<Value *, unsigned> Uniques;
6255 for (Value *V : Op) {
6256 if (isa<Constant, ExtractElementInst>(Val: V) ||
6257 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
6258 if (isa<UndefValue>(Val: V))
6259 ++UndefCnt;
6260 continue;
6261 }
6262 auto Res = Uniques.try_emplace(Key: V, Args: 0);
6263 // Found first duplicate - need to add shuffle.
6264 if (!Res.second && Res.first->second == 1)
6265 ++ExtraShuffleInsts;
6266 ++Res.first->getSecond();
6267 if (auto *I = dyn_cast<Instruction>(Val: V))
6268 UniqueOpcodes.insert(V: I->getOpcode());
6269 else if (Res.second)
6270 ++NonInstCnt;
6271 }
6272 return none_of(Range&: Uniques, P: [&](const auto &P) {
6273 return P.first->hasNUsesOrMore(P.second + 1) &&
6274 none_of(P.first->users(), [&](User *U) {
6275 return getTreeEntry(V: U) || Uniques.contains(Val: U);
6276 });
6277 });
6278 }) ||
6279         // Do not vectorize the node if the estimated number of vector
6280         // instructions exceeds the estimated number of buildvector instructions.
6281         // The number of vector operands is the number of vector instructions +
6282         // the number of vector instructions for the operands (buildvectors).
6283         // Buildvector instructions are just number_of_operands * number_of_scalars.
6284 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6285 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
6286 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6287}
6288
6289BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6290 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
6291 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
6292 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
6293
6294 unsigned ShuffleOrOp =
6295 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
6296 auto *VL0 = cast<Instruction>(Val: S.OpValue);
6297 switch (ShuffleOrOp) {
6298 case Instruction::PHI: {
6299 // Too many operands - gather, most probably won't be vectorized.
6300 if (VL0->getNumOperands() > MaxPHINumOperands)
6301 return TreeEntry::NeedToGather;
6302 // Check for terminator values (e.g. invoke).
6303 for (Value *V : VL)
6304 for (Value *Incoming : cast<PHINode>(Val: V)->incoming_values()) {
6305 Instruction *Term = dyn_cast<Instruction>(Val: Incoming);
6306 if (Term && Term->isTerminator()) {
6307 LLVM_DEBUG(dbgs()
6308 << "SLP: Need to swizzle PHINodes (terminator use).\n");
6309 return TreeEntry::NeedToGather;
6310 }
6311 }
6312
6313 return TreeEntry::Vectorize;
6314 }
6315 case Instruction::ExtractValue:
6316 case Instruction::ExtractElement: {
6317 bool Reuse = canReuseExtract(VL, OpValue: VL0, CurrentOrder);
6318 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6319 if (!isPowerOf2_32(Value: VL.size()))
6320 return TreeEntry::NeedToGather;
6321 if (Reuse || !CurrentOrder.empty())
6322 return TreeEntry::Vectorize;
6323 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6324 return TreeEntry::NeedToGather;
6325 }
6326 case Instruction::InsertElement: {
6327 // Check that we have a buildvector and not a shuffle of 2 or more
6328 // different vectors.
6329 ValueSet SourceVectors;
6330 for (Value *V : VL) {
6331 SourceVectors.insert(Ptr: cast<Instruction>(Val: V)->getOperand(i: 0));
6332 assert(getElementIndex(V) != std::nullopt &&
6333 "Non-constant or undef index?");
6334 }
6335
6336 if (count_if(Range&: VL, P: [&SourceVectors](Value *V) {
6337 return !SourceVectors.contains(Ptr: V);
6338 }) >= 2) {
6339 // Found 2nd source vector - cancel.
6340 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6341 "different source vectors.\n");
6342 return TreeEntry::NeedToGather;
6343 }
6344
6345 return TreeEntry::Vectorize;
6346 }
6347 case Instruction::Load: {
6348 // Check that a vectorized load would load the same memory as a scalar
6349 // load. For example, we don't want to vectorize loads that are smaller
6350 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6351 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6352 // from such a struct, we read/write packed bits disagreeing with the
6353 // unvectorized version.
6354 switch (canVectorizeLoads(VL, VL0, Order&: CurrentOrder, PointerOps)) {
6355 case LoadsState::Vectorize:
6356 return TreeEntry::Vectorize;
6357 case LoadsState::ScatterVectorize:
6358 return TreeEntry::ScatterVectorize;
6359 case LoadsState::StridedVectorize:
6360 return TreeEntry::StridedVectorize;
6361 case LoadsState::Gather:
6362#ifndef NDEBUG
6363 Type *ScalarTy = VL0->getType();
6364 if (DL->getTypeSizeInBits(ScalarTy) !=
6365 DL->getTypeAllocSizeInBits(ScalarTy))
6366 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6367 else if (any_of(VL,
6368 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6369 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6370 else
6371 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6372#endif // NDEBUG
6373 return TreeEntry::NeedToGather;
6374 }
6375 llvm_unreachable("Unexpected state of loads");
6376 }
6377 case Instruction::ZExt:
6378 case Instruction::SExt:
6379 case Instruction::FPToUI:
6380 case Instruction::FPToSI:
6381 case Instruction::FPExt:
6382 case Instruction::PtrToInt:
6383 case Instruction::IntToPtr:
6384 case Instruction::SIToFP:
6385 case Instruction::UIToFP:
6386 case Instruction::Trunc:
6387 case Instruction::FPTrunc:
6388 case Instruction::BitCast: {
6389 Type *SrcTy = VL0->getOperand(i: 0)->getType();
6390 for (Value *V : VL) {
6391 Type *Ty = cast<Instruction>(Val: V)->getOperand(i: 0)->getType();
6392 if (Ty != SrcTy || !isValidElementType(Ty)) {
6393 LLVM_DEBUG(
6394 dbgs() << "SLP: Gathering casts with different src types.\n");
6395 return TreeEntry::NeedToGather;
6396 }
6397 }
6398 return TreeEntry::Vectorize;
6399 }
6400 case Instruction::ICmp:
6401 case Instruction::FCmp: {
6402 // Check that all of the compares have the same predicate.
6403 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
6404 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(pred: P0);
6405 Type *ComparedTy = VL0->getOperand(i: 0)->getType();
6406 for (Value *V : VL) {
6407 CmpInst *Cmp = cast<CmpInst>(Val: V);
6408 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6409 Cmp->getOperand(i_nocapture: 0)->getType() != ComparedTy) {
6410 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6411 return TreeEntry::NeedToGather;
6412 }
6413 }
6414 return TreeEntry::Vectorize;
6415 }
6416 case Instruction::Select:
6417 case Instruction::FNeg:
6418 case Instruction::Add:
6419 case Instruction::FAdd:
6420 case Instruction::Sub:
6421 case Instruction::FSub:
6422 case Instruction::Mul:
6423 case Instruction::FMul:
6424 case Instruction::UDiv:
6425 case Instruction::SDiv:
6426 case Instruction::FDiv:
6427 case Instruction::URem:
6428 case Instruction::SRem:
6429 case Instruction::FRem:
6430 case Instruction::Shl:
6431 case Instruction::LShr:
6432 case Instruction::AShr:
6433 case Instruction::And:
6434 case Instruction::Or:
6435 case Instruction::Xor:
6436 return TreeEntry::Vectorize;
6437 case Instruction::GetElementPtr: {
6438 // We don't combine GEPs with complicated (nested) indexing.
6439 for (Value *V : VL) {
6440 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6441 if (!I)
6442 continue;
6443 if (I->getNumOperands() != 2) {
6444 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6445 return TreeEntry::NeedToGather;
6446 }
6447 }
6448
6449 // We can't combine several GEPs into one vector if they operate on
6450 // different types.
6451 Type *Ty0 = cast<GEPOperator>(Val: VL0)->getSourceElementType();
6452 for (Value *V : VL) {
6453 auto *GEP = dyn_cast<GEPOperator>(Val: V);
6454 if (!GEP)
6455 continue;
6456 Type *CurTy = GEP->getSourceElementType();
6457 if (Ty0 != CurTy) {
6458 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6459 return TreeEntry::NeedToGather;
6460 }
6461 }
6462
6463 // We don't combine GEPs with non-constant indexes.
6464 Type *Ty1 = VL0->getOperand(i: 1)->getType();
6465 for (Value *V : VL) {
6466 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6467 if (!I)
6468 continue;
6469 auto *Op = I->getOperand(i_nocapture: 1);
6470 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
6471 (Op->getType() != Ty1 &&
6472 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
6473 Op->getType()->getScalarSizeInBits() >
6474 DL->getIndexSizeInBits(
6475 AS: V->getType()->getPointerAddressSpace())))) {
6476 LLVM_DEBUG(
6477 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6478 return TreeEntry::NeedToGather;
6479 }
6480 }
6481
6482 return TreeEntry::Vectorize;
6483 }
6484 case Instruction::Store: {
6485 // Check if the stores are consecutive or if we need to swizzle them.
6486 llvm::Type *ScalarTy = cast<StoreInst>(Val: VL0)->getValueOperand()->getType();
6487 // Avoid types that are padded when being allocated as scalars, while
6488 // being packed together in a vector (such as i1).
6489 if (DL->getTypeSizeInBits(Ty: ScalarTy) !=
6490 DL->getTypeAllocSizeInBits(Ty: ScalarTy)) {
6491 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6492 return TreeEntry::NeedToGather;
6493 }
6494 // Make sure all stores in the bundle are simple - we can't vectorize
6495 // atomic or volatile stores.
6496 for (Value *V : VL) {
6497 auto *SI = cast<StoreInst>(Val: V);
6498 if (!SI->isSimple()) {
6499 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6500 return TreeEntry::NeedToGather;
6501 }
6502 PointerOps.push_back(Elt: SI->getPointerOperand());
6503 }
6504
6505 // Check the order of pointer operands.
6506 if (llvm::sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: CurrentOrder)) {
6507 Value *Ptr0;
6508 Value *PtrN;
6509 if (CurrentOrder.empty()) {
6510 Ptr0 = PointerOps.front();
6511 PtrN = PointerOps.back();
6512 } else {
6513 Ptr0 = PointerOps[CurrentOrder.front()];
6514 PtrN = PointerOps[CurrentOrder.back()];
6515 }
6516 std::optional<int> Dist =
6517 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
6518 // Check that the sorted pointer operands are consecutive.
6519 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6520 return TreeEntry::Vectorize;
6521 }
6522
6523 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6524 return TreeEntry::NeedToGather;
6525 }
6526 case Instruction::Call: {
6527 // Check if the calls are all to the same vectorizable intrinsic or
6528 // library function.
6529 CallInst *CI = cast<CallInst>(Val: VL0);
6530 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6531
6532 VFShape Shape = VFShape::get(
6533 FTy: CI->getFunctionType(),
6534 EC: ElementCount::getFixed(MinVal: static_cast<unsigned int>(VL.size())),
6535 HasGlobalPred: false /*HasGlobalPred*/);
6536 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6537
6538 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6539 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6540 return TreeEntry::NeedToGather;
6541 }
6542 Function *F = CI->getCalledFunction();
6543 unsigned NumArgs = CI->arg_size();
6544 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6545 for (unsigned J = 0; J != NumArgs; ++J)
6546 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J))
6547 ScalarArgs[J] = CI->getArgOperand(i: J);
6548 for (Value *V : VL) {
6549 CallInst *CI2 = dyn_cast<CallInst>(Val: V);
6550 if (!CI2 || CI2->getCalledFunction() != F ||
6551 getVectorIntrinsicIDForCall(CI: CI2, TLI) != ID ||
6552 (VecFunc &&
6553 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6554 !CI->hasIdenticalOperandBundleSchema(Other: *CI2)) {
6555 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6556 << "\n");
6557 return TreeEntry::NeedToGather;
6558 }
6559      // Some intrinsics have scalar arguments, and those arguments must be the
6560      // same across all calls for them to be vectorized.
6561 for (unsigned J = 0; J != NumArgs; ++J) {
6562 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J)) {
6563 Value *A1J = CI2->getArgOperand(i: J);
6564 if (ScalarArgs[J] != A1J) {
6565 LLVM_DEBUG(dbgs()
6566 << "SLP: mismatched arguments in call:" << *CI
6567 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6568 return TreeEntry::NeedToGather;
6569 }
6570 }
6571 }
6572 // Verify that the bundle operands are identical between the two calls.
6573 if (CI->hasOperandBundles() &&
6574 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6575 CI->op_begin() + CI->getBundleOperandsEndIndex(),
6576 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6577 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6578 << "!=" << *V << '\n');
6579 return TreeEntry::NeedToGather;
6580 }
6581 }
6582
6583 return TreeEntry::Vectorize;
6584 }
6585 case Instruction::ShuffleVector: {
6586    // If this is not an alternate sequence of opcodes like add-sub
6587    // then do not vectorize this instruction.
6588 if (!S.isAltShuffle()) {
6589 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6590 return TreeEntry::NeedToGather;
6591 }
6592 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6593 LLVM_DEBUG(
6594 dbgs()
6595 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6596 "the whole alt sequence is not profitable.\n");
6597 return TreeEntry::NeedToGather;
6598 }
6599
6600 return TreeEntry::Vectorize;
6601 }
6602 default:
6603 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6604 return TreeEntry::NeedToGather;
6605 }
6606}
6607
6608namespace {
6609/// Allows correct handling of operands of the phi nodes based on the \p Main
6610/// PHINode's order of incoming basic blocks/values.
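/// E.g., if \p Main has incoming blocks (BB0, BB1) but another phi in \p Phis
/// lists them as (BB1, BB0), that phi's incoming values are remapped so that
/// getOperands(I) always corresponds to \p Main's I-th incoming block.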
6611class PHIHandler {
6612 DominatorTree &DT;
6613 PHINode *Main = nullptr;
6614 SmallVector<Value *> Phis;
6615 SmallVector<SmallVector<Value *>> Operands;
6616
6617public:
6618 PHIHandler() = delete;
6619 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
6620 : DT(DT), Main(Main), Phis(Phis),
6621 Operands(Main->getNumIncomingValues(),
6622 SmallVector<Value *>(Phis.size(), nullptr)) {}
6623 void buildOperands() {
6624 constexpr unsigned FastLimit = 4;
6625 if (Main->getNumIncomingValues() <= FastLimit) {
6626 for (unsigned I : seq<unsigned>(Begin: 0, End: Main->getNumIncomingValues())) {
6627 BasicBlock *InBB = Main->getIncomingBlock(i: I);
6628 if (!DT.isReachableFromEntry(A: InBB)) {
6629 Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
6630 continue;
6631 }
6632 // Prepare the operand vector.
6633 for (auto [Idx, V] : enumerate(First&: Phis)) {
6634 auto *P = cast<PHINode>(Val: V);
6635 if (P->getIncomingBlock(i: I) == InBB)
6636 Operands[I][Idx] = P->getIncomingValue(i: I);
6637 else
6638 Operands[I][Idx] = P->getIncomingValueForBlock(BB: InBB);
6639 }
6640 }
6641 return;
6642 }
6643 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
6644 for (unsigned I : seq<unsigned>(Begin: 0, End: Main->getNumIncomingValues())) {
6645 BasicBlock *InBB = Main->getIncomingBlock(i: I);
6646 if (!DT.isReachableFromEntry(A: InBB)) {
6647 Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
6648 continue;
6649 }
6650 Blocks.try_emplace(Key: InBB).first->second.push_back(Elt: I);
6651 }
6652 for (auto [Idx, V] : enumerate(First&: Phis)) {
6653 auto *P = cast<PHINode>(Val: V);
6654 for (unsigned I : seq<unsigned>(Begin: 0, End: P->getNumIncomingValues())) {
6655 BasicBlock *InBB = P->getIncomingBlock(i: I);
6656 if (InBB == Main->getIncomingBlock(i: I)) {
6657 if (isa_and_nonnull<PoisonValue>(Val: Operands[I][Idx]))
6658 continue;
6659 Operands[I][Idx] = P->getIncomingValue(i: I);
6660 continue;
6661 }
6662 auto It = Blocks.find(Val: InBB);
6663 if (It == Blocks.end())
6664 continue;
6665 Operands[It->second.front()][Idx] = P->getIncomingValue(i: I);
6666 }
6667 }
6668 for (const auto &P : Blocks) {
6669 if (P.getSecond().size() <= 1)
6670 continue;
6671 unsigned BasicI = P.getSecond().front();
6672 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
6673 assert(all_of(enumerate(Operands[I]),
6674 [&](const auto &Data) {
6675 return !Data.value() ||
6676 Data.value() == Operands[BasicI][Data.index()];
6677 }) &&
6678 "Expected empty operands list.");
6679 Operands[I] = Operands[BasicI];
6680 }
6681 }
6682 }
6683 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
6684};
6685} // namespace
6686
6687void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6688 const EdgeInfo &UserTreeIdx) {
6689 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6690
6691 SmallVector<int> ReuseShuffleIndices;
6692 SmallVector<Value *> UniqueValues;
6693 SmallVector<Value *> NonUniqueValueVL;
6694 auto TryToFindDuplicates = [&](const InstructionsState &S,
6695 bool DoNotFail = false) {
6696 // Check that every instruction appears once in this bundle.
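    // E.g., VL == {a, b, a, b} yields UniqueValues == {a, b} and
    // ReuseShuffleIndices == {0, 1, 0, 1}.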
6697 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6698 for (Value *V : VL) {
6699 if (isConstant(V)) {
6700 ReuseShuffleIndices.emplace_back(
6701 Args: isa<UndefValue>(Val: V) ? PoisonMaskElem : UniqueValues.size());
6702 UniqueValues.emplace_back(Args&: V);
6703 continue;
6704 }
6705 auto Res = UniquePositions.try_emplace(Key: V, Args: UniqueValues.size());
6706 ReuseShuffleIndices.emplace_back(Args&: Res.first->second);
6707 if (Res.second)
6708 UniqueValues.emplace_back(Args&: V);
6709 }
6710 size_t NumUniqueScalarValues = UniqueValues.size();
6711 if (NumUniqueScalarValues == VL.size()) {
6712 ReuseShuffleIndices.clear();
6713 } else {
6714      // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
6715 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6716 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6717 "for nodes with padding.\n");
6718 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6719 return false;
6720 }
6721 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6722 if (NumUniqueScalarValues <= 1 ||
6723 (UniquePositions.size() == 1 && all_of(Range&: UniqueValues,
6724 P: [](Value *V) {
6725 return isa<UndefValue>(Val: V) ||
6726 !isConstant(V);
6727 })) ||
6728 !llvm::has_single_bit<uint32_t>(Value: NumUniqueScalarValues)) {
6729 if (DoNotFail && UniquePositions.size() > 1 &&
6730 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6731 all_of(Range&: UniqueValues, P: [=](Value *V) {
6732 return isa<ExtractElementInst>(Val: V) ||
6733 areAllUsersVectorized(I: cast<Instruction>(Val: V),
6734 VectorizedVals: UserIgnoreList);
6735 })) {
6736 unsigned PWSz = PowerOf2Ceil(A: UniqueValues.size());
6737 if (PWSz == VL.size()) {
6738 ReuseShuffleIndices.clear();
6739 } else {
6740 NonUniqueValueVL.assign(in_start: UniqueValues.begin(), in_end: UniqueValues.end());
6741 NonUniqueValueVL.append(NumInputs: PWSz - UniqueValues.size(),
6742 Elt: UniqueValues.back());
6743 VL = NonUniqueValueVL;
6744 }
6745 return true;
6746 }
6747 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6748 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6749 return false;
6750 }
6751 VL = UniqueValues;
6752 }
6753 return true;
6754 };
6755
6756 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
6757
6758 // Don't vectorize ephemeral values.
6759 if (!EphValues.empty()) {
6760 for (Value *V : VL) {
6761 if (EphValues.count(Ptr: V)) {
6762 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6763 << ") is ephemeral.\n");
6764 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6765 return;
6766 }
6767 }
6768 }
6769
6770 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6771 // a load), in which case peek through to include it in the tree, without
6772 // ballooning over-budget.
6773 if (Depth >= RecursionMaxDepth &&
6774 !(S.MainOp && isa<Instruction>(Val: S.MainOp) && S.MainOp == S.AltOp &&
6775 VL.size() >= 4 &&
6776 (match(V: S.MainOp, P: m_Load(Op: m_Value())) || all_of(Range&: VL, P: [&S](const Value *I) {
6777 return match(V: I,
6778 P: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_OneUse(SubPattern: m_Load(Op: m_Value()))))) &&
6779 cast<Instruction>(Val: I)->getOpcode() ==
6780 cast<Instruction>(Val: S.MainOp)->getOpcode();
6781 })))) {
6782 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6783 if (TryToFindDuplicates(S))
6784 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6785 ReuseShuffleIndices);
6786 return;
6787 }
6788
6789 // Don't handle scalable vectors
6790 if (S.getOpcode() == Instruction::ExtractElement &&
6791 isa<ScalableVectorType>(
6792 Val: cast<ExtractElementInst>(Val: S.OpValue)->getVectorOperandType())) {
6793 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6794 if (TryToFindDuplicates(S))
6795 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6796 ReuseShuffleIndices);
6797 return;
6798 }
6799
6800 // Don't handle vectors.
6801 if (!SLPReVec && S.OpValue->getType()->isVectorTy() &&
6802 !isa<InsertElementInst>(Val: S.OpValue)) {
6803 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6804 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6805 return;
6806 }
6807
6808 if (StoreInst *SI = dyn_cast<StoreInst>(Val: S.OpValue))
6809 if (!SLPReVec && SI->getValueOperand()->getType()->isVectorTy()) {
6810 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6811 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6812 return;
6813 }
6814
6815 // If all of the operands are identical or constant we have a simple solution.
6816 // If we deal with insert/extract instructions, they all must have constant
6817 // indices, otherwise we should gather them, not try to vectorize.
6818  // If this is an alternate-op node with 2 elements and gathered operands, do
6819  // not vectorize.
6820 auto &&NotProfitableForVectorization = [&S, this,
6821 Depth](ArrayRef<Value *> VL) {
6822 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6823 return false;
6824 if (VectorizableTree.size() < MinTreeSize)
6825 return false;
6826 if (Depth >= RecursionMaxDepth - 1)
6827 return true;
6828    // Check if all operands are extracts, are part of a vector node, or can
6829    // build a regular vectorize node.
6830 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6831 for (Value *V : VL) {
6832 auto *I = cast<Instruction>(Val: V);
6833 InstsCount.push_back(Elt: count_if(Range: I->operand_values(), P: [](Value *Op) {
6834 return isa<Instruction>(Val: Op) || isVectorLikeInstWithConstOps(V: Op);
6835 }));
6836 }
6837 bool IsCommutative = isCommutative(I: S.MainOp) || isCommutative(I: S.AltOp);
6838 if ((IsCommutative &&
6839 std::accumulate(first: InstsCount.begin(), last: InstsCount.end(), init: 0) < 2) ||
6840 (!IsCommutative &&
6841 all_of(Range&: InstsCount, P: [](unsigned ICnt) { return ICnt < 2; })))
6842 return true;
6843 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6844 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
6845 auto *I1 = cast<Instruction>(Val: VL.front());
6846 auto *I2 = cast<Instruction>(Val: VL.back());
6847 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6848 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
6849 Args: I2->getOperand(i: Op));
6850 if (static_cast<unsigned>(count_if(
6851 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6852 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
6853 })) >= S.MainOp->getNumOperands() / 2)
6854 return false;
6855 if (S.MainOp->getNumOperands() > 2)
6856 return true;
6857 if (IsCommutative) {
6858 // Check permuted operands.
6859 Candidates.clear();
6860 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6861 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
6862 Args: I2->getOperand(i: (Op + 1) % E));
6863 if (any_of(
6864 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6865 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
6866 }))
6867 return false;
6868 }
6869 return true;
6870 };
6871 SmallVector<unsigned> SortedIndices;
6872 BasicBlock *BB = nullptr;
6873 bool IsScatterVectorizeUserTE =
6874 UserTreeIdx.UserTE &&
6875 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6876 bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL);
6877 bool AreScatterAllGEPSameBlock =
6878 (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() &&
6879 VL.size() > 2 &&
6880 all_of(Range&: VL,
6881 P: [&BB](Value *V) {
6882 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
6883 if (!I)
6884 return doesNotNeedToBeScheduled(V);
6885 if (!BB)
6886 BB = I->getParent();
6887 return BB == I->getParent() && I->getNumOperands() == 2;
6888 }) &&
6889 BB &&
6890 sortPtrAccesses(VL, ElemTy: UserTreeIdx.UserTE->getMainOp()->getType(), DL: *DL, SE&: *SE,
6891 SortedIndices));
6892 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
6893 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6894 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6895 Val: S.OpValue) &&
6896 !all_of(Range&: VL, P: isVectorLikeInstWithConstOps)) ||
6897 NotProfitableForVectorization(VL)) {
6898 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6899 if (TryToFindDuplicates(S))
6900 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6901 ReuseShuffleIndices);
6902 return;
6903 }
6904
6905 // We now know that this is a vector of instructions of the same type from
6906 // the same block.
6907
6908 // Check if this is a duplicate of another entry.
6909 if (TreeEntry *E = getTreeEntry(V: S.OpValue)) {
6910 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6911 if (!E->isSame(VL)) {
6912 auto It = MultiNodeScalars.find(Val: S.OpValue);
6913 if (It != MultiNodeScalars.end()) {
6914 auto *TEIt = find_if(Range&: It->getSecond(),
6915 P: [&](TreeEntry *ME) { return ME->isSame(VL); });
6916 if (TEIt != It->getSecond().end())
6917 E = *TEIt;
6918 else
6919 E = nullptr;
6920 } else {
6921 E = nullptr;
6922 }
6923 }
6924 if (!E) {
6925 if (!doesNotNeedToBeScheduled(V: S.OpValue)) {
6926 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6927 if (TryToFindDuplicates(S))
6928 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6929 ReuseShuffleIndices);
6930 return;
6931 }
6932 } else {
6933      // Record the reuse of the tree node. FIXME: currently this is only used
6934      // to properly draw the graph rather than for the actual vectorization.
6935 E->UserTreeIndices.push_back(Elt: UserTreeIdx);
6936 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6937 << ".\n");
6938 return;
6939 }
6940 }
6941
6942 // Check that none of the instructions in the bundle are already in the tree.
6943 for (Value *V : VL) {
6944 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(Val: V)) ||
6945 doesNotNeedToBeScheduled(V))
6946 continue;
6947 if (getTreeEntry(V)) {
6948 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6949 << ") is already in tree.\n");
6950 if (TryToFindDuplicates(S))
6951 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6952 ReuseShuffleIndices);
6953 return;
6954 }
6955 }
6956
6957 // The reduction nodes (stored in UserIgnoreList) should also stay scalar.
6958 if (UserIgnoreList && !UserIgnoreList->empty()) {
6959 for (Value *V : VL) {
6960 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6961 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6962 if (TryToFindDuplicates(S))
6963 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
6964 ReuseShuffleIndices);
6965 return;
6966 }
6967 }
6968 }
6969
6970 // Special processing for sorted pointers for ScatterVectorize node with
6971 // constant indices only.
6972 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
6973 assert(S.OpValue->getType()->isPointerTy() &&
6974 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6975 "Expected pointers only.");
6976 // Reset S to make it GetElementPtr kind of node.
6977 const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
6978 assert(It != VL.end() && "Expected at least one GEP.");
6979 S = getSameOpcode(VL: *It, TLI: *TLI);
6980 }
6981
6982 // Check that all of the users of the scalars that we want to vectorize are
6983 // schedulable.
6984 auto *VL0 = cast<Instruction>(Val: S.OpValue);
6985 BB = VL0->getParent();
6986
6987 if (!DT->isReachableFromEntry(A: BB)) {
6988 // Don't go into unreachable blocks. They may contain instructions with
6989 // dependency cycles which confuse the final scheduling.
6990 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6991 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
6992 return;
6993 }
6994
6995 // Don't go into catchswitch blocks, which can happen with PHIs.
6996 // Such blocks can only have PHIs and the catchswitch. There is no
6997 // place to insert a shuffle if we need to, so just avoid that issue.
6998 if (isa<CatchSwitchInst>(Val: BB->getTerminator())) {
6999 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
7000 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx);
7001 return;
7002 }
7003
7004 // Check that every instruction appears once in this bundle.
7005 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
7006 return;
7007
7008 // Perform specific checks for each particular instruction kind.
7009 OrdersType CurrentOrder;
7010 SmallVector<Value *> PointerOps;
7011 TreeEntry::EntryState State = getScalarsVectorizationState(
7012 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
7013 if (State == TreeEntry::NeedToGather) {
7014 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
7015 ReuseShuffleIndices);
7016 return;
7017 }
7018
7019 auto &BSRef = BlocksSchedules[BB];
7020 if (!BSRef)
7021 BSRef = std::make_unique<BlockScheduling>(args&: BB);
7022
7023 BlockScheduling &BS = *BSRef;
7024
7025 std::optional<ScheduleData *> Bundle =
7026 BS.tryScheduleBundle(VL: UniqueValues, SLP: this, S);
7027#ifdef EXPENSIVE_CHECKS
7028 // Make sure we didn't break any internal invariants
7029 BS.verify();
7030#endif
7031 if (!Bundle) {
7032 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
7033 assert((!BS.getScheduleData(VL0) ||
7034 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
7035 "tryScheduleBundle should cancelScheduling on failure");
7036 newTreeEntry(VL, Bundle: std::nullopt /*not vectorized*/, S, UserTreeIdx,
7037 ReuseShuffleIndices);
7038 NonScheduledFirst.insert(Ptr: VL.front());
7039 return;
7040 }
7041 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
7042
7043 unsigned ShuffleOrOp = S.isAltShuffle() ?
7044 (unsigned) Instruction::ShuffleVector : S.getOpcode();
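  // Alternate-opcode bundles (e.g. an interleaved mix of add and sub) are
  // handled below as a ShuffleVector node: both opcodes are vectorized and
  // the results are later blended with the mask built by
  // buildAltOpShuffleMask().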
7045 switch (ShuffleOrOp) {
7046 case Instruction::PHI: {
7047 auto *PH = cast<PHINode>(Val: VL0);
7048
7049 TreeEntry *TE =
7050 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
7051 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
7052
7053 // Keeps the reordered operands to avoid code duplication.
7054 PHIHandler Handler(*DT, PH, VL);
7055 Handler.buildOperands();
7056 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumOperands()))
7057 TE->setOperand(OpIdx: I, OpVL: Handler.getOperands(I));
7058 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumOperands()))
7059 buildTree_rec(VL: Handler.getOperands(I), Depth: Depth + 1, UserTreeIdx: {TE, I});
7060 return;
7061 }
7062 case Instruction::ExtractValue:
7063 case Instruction::ExtractElement: {
7064 if (CurrentOrder.empty()) {
7065 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
7066 } else {
7067 LLVM_DEBUG({
7068 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
7069 "with order";
7070 for (unsigned Idx : CurrentOrder)
7071 dbgs() << " " << Idx;
7072 dbgs() << "\n";
7073 });
7074 fixupOrderingIndices(Order: CurrentOrder);
7075 }
7076 // Create the tree entry for the extracts, recording the (fixed-up)
7077 // reordering indices, if any.
7078 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7079 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
7080 // This is a special case, as it does not gather, but at the same time
7081 // we are not extending buildTree_rec() towards the operands.
7082 ValueList Op0;
7083 Op0.assign(NumElts: VL.size(), Elt: VL0->getOperand(i: 0));
7084 VectorizableTree.back()->setOperand(OpIdx: 0, OpVL: Op0);
7085 return;
7086 }
7087 case Instruction::InsertElement: {
7088 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
7089
7090 auto OrdCompare = [](const std::pair<int, int> &P1,
7091 const std::pair<int, int> &P2) {
7092 return P1.first > P2.first;
7093 };
7094 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
7095 decltype(OrdCompare)>
7096 Indices(OrdCompare);
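      // The two loops below compute, for each scalar in VL, the rank of its
      // insertelement lane. A sketch (not from an actual test case): if
      // VL[0..3] insert into lanes 2, 0, 1, 3, CurrentOrder becomes
      // {2, 0, 1, 3} and IsIdentity is false; inserts into lanes 0, 1, 2, 3
      // keep the identity order and CurrentOrder is cleared.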
7097 for (int I = 0, E = VL.size(); I < E; ++I) {
7098 unsigned Idx = *getElementIndex(Inst: VL[I]);
7099 Indices.emplace(args&: Idx, args&: I);
7100 }
7101 OrdersType CurrentOrder(VL.size(), VL.size());
7102 bool IsIdentity = true;
7103 for (int I = 0, E = VL.size(); I < E; ++I) {
7104 CurrentOrder[Indices.top().second] = I;
7105 IsIdentity &= Indices.top().second == I;
7106 Indices.pop();
7107 }
7108 if (IsIdentity)
7109 CurrentOrder.clear();
7110 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7111 ReuseShuffleIndices: std::nullopt, ReorderIndices: CurrentOrder);
7112 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
7113
7114 TE->setOperandsInOrder();
7115 buildTree_rec(VL: TE->getOperand(OpIdx: 1), Depth: Depth + 1, UserTreeIdx: {TE, 1});
7116 return;
7117 }
7118 case Instruction::Load: {
7119 // Check that a vectorized load would load the same memory as a scalar
7120 // load. For example, we don't want to vectorize loads of types smaller
7121 // than 8 bits. Even though a packed struct {<i2, i2, i2, i2>} is loaded
7122 // and stored by LLVM as an i8, vectorizing loads/stores from such a
7123 // struct would read/write packed bits and disagree with the unvectorized
7124 // version.
7125 TreeEntry *TE = nullptr;
7126 fixupOrderingIndices(Order: CurrentOrder);
7127 switch (State) {
7128 case TreeEntry::Vectorize:
7129 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7130 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
7131 if (CurrentOrder.empty())
7132 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
7133 else
7134 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
7135 TE->setOperandsInOrder();
7136 break;
7137 case TreeEntry::StridedVectorize:
7138 // Vectorizing loads with a constant stride as strided loads.
7139 TE = newTreeEntry(VL, EntryState: TreeEntry::StridedVectorize, Bundle, S,
7140 UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
7141 TE->setOperandsInOrder();
7142 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
7143 break;
7144 case TreeEntry::ScatterVectorize:
7145 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
7146 TE = newTreeEntry(VL, EntryState: TreeEntry::ScatterVectorize, Bundle, S,
7147 UserTreeIdx, ReuseShuffleIndices);
7148 TE->setOperandsInOrder();
7149 buildTree_rec(VL: PointerOps, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7150 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
7151 break;
7152 case TreeEntry::NeedToGather:
7153 llvm_unreachable("Unexpected loads state.");
7154 }
7155 return;
7156 }
7157 case Instruction::ZExt:
7158 case Instruction::SExt:
7159 case Instruction::FPToUI:
7160 case Instruction::FPToSI:
7161 case Instruction::FPExt:
7162 case Instruction::PtrToInt:
7163 case Instruction::IntToPtr:
7164 case Instruction::SIToFP:
7165 case Instruction::UIToFP:
7166 case Instruction::Trunc:
7167 case Instruction::FPTrunc:
7168 case Instruction::BitCast: {
7169 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
7170 u: std::make_pair(x: std::numeric_limits<unsigned>::min(),
7171 y: std::numeric_limits<unsigned>::max()));
7172 if (ShuffleOrOp == Instruction::ZExt ||
7173 ShuffleOrOp == Instruction::SExt) {
7174 CastMaxMinBWSizes = std::make_pair(
7175 x: std::max<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
7176 b: PrevMaxBW),
7177 y: std::min<unsigned>(
7178 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
7179 b: PrevMinBW));
7180 } else if (ShuffleOrOp == Instruction::Trunc) {
7181 CastMaxMinBWSizes = std::make_pair(
7182 x: std::max<unsigned>(
7183 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
7184 b: PrevMaxBW),
7185 y: std::min<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
7186 b: PrevMinBW));
7187 ExtraBitWidthNodes.insert(V: VectorizableTree.size() + 1);
7188 } else if (ShuffleOrOp == Instruction::SIToFP ||
7189 ShuffleOrOp == Instruction::UIToFP) {
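        // E.g. (illustrative): for `sitofp i32 %x to float` where %x is
        // known to have at least 17 sign bits (say %x = sext i16 %y to i32),
        // half of the source bits are redundant, so the node is recorded in
        // ExtraBitWidthNodes and the later bitwidth analysis may narrow the
        // conversion source.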
7190 unsigned NumSignBits =
7191 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
7192 if (auto *OpI = dyn_cast<Instruction>(Val: VL0->getOperand(i: 0))) {
7193 APInt Mask = DB->getDemandedBits(I: OpI);
7194 NumSignBits = std::max(a: NumSignBits, b: Mask.countl_zero());
7195 }
7196 if (NumSignBits * 2 >=
7197 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
7198 ExtraBitWidthNodes.insert(V: VectorizableTree.size() + 1);
7199 }
7200 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7201 ReuseShuffleIndices);
7202 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
7203
7204 TE->setOperandsInOrder();
7205 for (unsigned I : seq<unsigned>(Begin: 0, End: VL0->getNumOperands()))
7206 buildTree_rec(VL: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
7207 return;
7208 }
7209 case Instruction::ICmp:
7210 case Instruction::FCmp: {
7211 // Check that all of the compares have the same predicate.
7212 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
7213 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7214 ReuseShuffleIndices);
7215 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
7216
7217 ValueList Left, Right;
7218 if (cast<CmpInst>(Val: VL0)->isCommutative()) {
7219 // Commutative predicate - collect + sort operands of the instructions
7220 // so that each side is more likely to have the same opcode.
7221 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
7222 "Commutative Predicate mismatch");
7223 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
7224 } else {
7225 // Collect operands - commute if it uses the swapped predicate.
7226 for (Value *V : VL) {
7227 auto *Cmp = cast<CmpInst>(Val: V);
7228 Value *LHS = Cmp->getOperand(i_nocapture: 0);
7229 Value *RHS = Cmp->getOperand(i_nocapture: 1);
7230 if (Cmp->getPredicate() != P0)
7231 std::swap(a&: LHS, b&: RHS);
7232 Left.push_back(Elt: LHS);
7233 Right.push_back(Elt: RHS);
7234 }
7235 }
7236 TE->setOperand(OpIdx: 0, OpVL: Left);
7237 TE->setOperand(OpIdx: 1, OpVL: Right);
7238 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7239 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
7240 if (ShuffleOrOp == Instruction::ICmp) {
7241 unsigned NumSignBits0 =
7242 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
7243 if (NumSignBits0 * 2 >=
7244 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
7245 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
7246 unsigned NumSignBits1 =
7247 ComputeNumSignBits(Op: VL0->getOperand(i: 1), DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
7248 if (NumSignBits1 * 2 >=
7249 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 1)->getType()))
7250 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 1)->Idx);
7251 }
7252 return;
7253 }
7254 case Instruction::Select:
7255 case Instruction::FNeg:
7256 case Instruction::Add:
7257 case Instruction::FAdd:
7258 case Instruction::Sub:
7259 case Instruction::FSub:
7260 case Instruction::Mul:
7261 case Instruction::FMul:
7262 case Instruction::UDiv:
7263 case Instruction::SDiv:
7264 case Instruction::FDiv:
7265 case Instruction::URem:
7266 case Instruction::SRem:
7267 case Instruction::FRem:
7268 case Instruction::Shl:
7269 case Instruction::LShr:
7270 case Instruction::AShr:
7271 case Instruction::And:
7272 case Instruction::Or:
7273 case Instruction::Xor: {
7274 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7275 ReuseShuffleIndices);
7276 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
7277
7278 // Sort operands of the instructions so that each side is more likely to
7279 // have the same opcode.
7280 if (isa<BinaryOperator>(Val: VL0) && isCommutative(I: VL0)) {
7281 ValueList Left, Right;
7282 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
7283 TE->setOperand(OpIdx: 0, OpVL: Left);
7284 TE->setOperand(OpIdx: 1, OpVL: Right);
7285 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7286 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
7287 return;
7288 }
7289
7290 TE->setOperandsInOrder();
7291 for (unsigned I : seq<unsigned>(Begin: 0, End: VL0->getNumOperands()))
7292 buildTree_rec(VL: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
7293 return;
7294 }
7295 case Instruction::GetElementPtr: {
7296 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7297 ReuseShuffleIndices);
7298 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
7299 SmallVector<ValueList, 2> Operands(2);
7300 // Prepare the operand vector for pointer operands.
7301 for (Value *V : VL) {
7302 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
7303 if (!GEP) {
7304 Operands.front().push_back(Elt: V);
7305 continue;
7306 }
7307 Operands.front().push_back(Elt: GEP->getPointerOperand());
7308 }
7309 TE->setOperand(OpIdx: 0, OpVL: Operands.front());
7310 // Need to cast all indices to the same type before vectorization to
7311 // avoid a crash.
7312 // Required to be able to find correct matches between different gather
7313 // nodes and reuse the vectorized values rather than trying to gather them
7314 // again.
7315 int IndexIdx = 1;
7316 Type *VL0Ty = VL0->getOperand(i: IndexIdx)->getType();
7317 Type *Ty = all_of(Range&: VL,
7318 P: [VL0Ty, IndexIdx](Value *V) {
7319 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
7320 if (!GEP)
7321 return true;
7322 return VL0Ty == GEP->getOperand(i_nocapture: IndexIdx)->getType();
7323 })
7324 ? VL0Ty
7325 : DL->getIndexType(PtrTy: cast<GetElementPtrInst>(Val: VL0)
7326 ->getPointerOperandType()
7327 ->getScalarType());
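      // E.g. (illustrative): for a bundle { getelementptr i32, ptr %p, i64 1;
      // getelementptr i32, ptr %p, i32 2 } the index types differ, so Ty
      // falls back to the DataLayout index type and both indices are folded
      // to it below.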
7328 // Prepare the operand vector.
7329 for (Value *V : VL) {
7330 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
7331 if (!I) {
7332 Operands.back().push_back(
7333 Elt: ConstantInt::get(Ty, V: 0, /*isSigned=*/IsSigned: false));
7334 continue;
7335 }
7336 auto *Op = I->getOperand(i_nocapture: IndexIdx);
7337 auto *CI = dyn_cast<ConstantInt>(Val: Op);
7338 if (!CI)
7339 Operands.back().push_back(Elt: Op);
7340 else
7341 Operands.back().push_back(Elt: ConstantFoldIntegerCast(
7342 C: CI, DestTy: Ty, IsSigned: CI->getValue().isSignBitSet(), DL: *DL));
7343 }
7344 TE->setOperand(OpIdx: IndexIdx, OpVL: Operands.back());
7345
7346 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7347 buildTree_rec(VL: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
7348 return;
7349 }
7350 case Instruction::Store: {
7351 bool Consecutive = CurrentOrder.empty();
7352 if (!Consecutive)
7353 fixupOrderingIndices(Order: CurrentOrder);
7354 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7355 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
7356 TE->setOperandsInOrder();
7357 buildTree_rec(VL: TE->getOperand(OpIdx: 0), Depth: Depth + 1, UserTreeIdx: {TE, 0});
7358 if (Consecutive)
7359 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7360 else
7361 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7362 return;
7363 }
7364 case Instruction::Call: {
7365 // Check if the calls are all to the same vectorizable intrinsic or
7366 // library function.
7367 CallInst *CI = cast<CallInst>(Val: VL0);
7368 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7369
7370 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7371 ReuseShuffleIndices);
7372 // Sort operands of the instructions so that each side is more likely to
7373 // have the same opcode.
7374 if (isCommutative(I: VL0)) {
7375 ValueList Left, Right;
7376 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
7377 TE->setOperand(OpIdx: 0, OpVL: Left);
7378 TE->setOperand(OpIdx: 1, OpVL: Right);
7379 SmallVector<ValueList> Operands;
7380 for (unsigned I : seq<unsigned>(Begin: 2, End: CI->arg_size())) {
7381 Operands.emplace_back();
7382 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I))
7383 continue;
7384 for (Value *V : VL) {
7385 auto *CI2 = cast<CallInst>(Val: V);
7386 Operands.back().push_back(Elt: CI2->getArgOperand(i: I));
7387 }
7388 TE->setOperand(OpIdx: I, OpVL: Operands.back());
7389 }
7390 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7391 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
7392 for (unsigned I : seq<unsigned>(Begin: 2, End: CI->arg_size())) {
7393 if (Operands[I - 2].empty())
7394 continue;
7395 buildTree_rec(VL: Operands[I - 2], Depth: Depth + 1, UserTreeIdx: {TE, I});
7396 }
7397 return;
7398 }
7399 TE->setOperandsInOrder();
7400 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
7401 // For scalar operands there is no need to create an entry since there is
7402 // nothing to vectorize.
7403 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I))
7404 continue;
7405 ValueList Operands;
7406 // Prepare the operand vector.
7407 for (Value *V : VL) {
7408 auto *CI2 = cast<CallInst>(Val: V);
7409 Operands.push_back(Elt: CI2->getArgOperand(i: I));
7410 }
7411 buildTree_rec(VL: Operands, Depth: Depth + 1, UserTreeIdx: {TE, I});
7412 }
7413 return;
7414 }
7415 case Instruction::ShuffleVector: {
7416 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7417 ReuseShuffleIndices);
7418 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7419
7420 // Reorder operands if reordering would enable vectorization.
7421 auto *CI = dyn_cast<CmpInst>(Val: VL0);
7422 if (isa<BinaryOperator>(Val: VL0) || CI) {
7423 ValueList Left, Right;
7424 if (!CI || all_of(Range&: VL, P: [](Value *V) {
7425 return cast<CmpInst>(Val: V)->isCommutative();
7426 })) {
7427 reorderInputsAccordingToOpcode(VL, Left, Right, R: *this);
7428 } else {
7429 auto *MainCI = cast<CmpInst>(Val: S.MainOp);
7430 auto *AltCI = cast<CmpInst>(Val: S.AltOp);
7431 CmpInst::Predicate MainP = MainCI->getPredicate();
7432 CmpInst::Predicate AltP = AltCI->getPredicate();
7433 assert(MainP != AltP &&
7434 "Expected different main/alternate predicates.");
7435 // Collect operands - commute if it uses the swapped predicate or
7436 // alternate operation.
7437 for (Value *V : VL) {
7438 auto *Cmp = cast<CmpInst>(Val: V);
7439 Value *LHS = Cmp->getOperand(i_nocapture: 0);
7440 Value *RHS = Cmp->getOperand(i_nocapture: 1);
7441
7442 if (isAlternateInstruction(I: Cmp, MainOp: MainCI, AltOp: AltCI, TLI: *TLI)) {
7443 if (AltP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
7444 std::swap(a&: LHS, b&: RHS);
7445 } else {
7446 if (MainP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
7447 std::swap(a&: LHS, b&: RHS);
7448 }
7449 Left.push_back(Elt: LHS);
7450 Right.push_back(Elt: RHS);
7451 }
7452 }
7453 TE->setOperand(OpIdx: 0, OpVL: Left);
7454 TE->setOperand(OpIdx: 1, OpVL: Right);
7455 buildTree_rec(VL: Left, Depth: Depth + 1, UserTreeIdx: {TE, 0});
7456 buildTree_rec(VL: Right, Depth: Depth + 1, UserTreeIdx: {TE, 1});
7457 return;
7458 }
7459
7460 TE->setOperandsInOrder();
7461 for (unsigned I : seq<unsigned>(Begin: 0, End: VL0->getNumOperands()))
7462 buildTree_rec(VL: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
7463 return;
7464 }
7465 default:
7466 break;
7467 }
7468 llvm_unreachable("Unexpected vectorization of the instructions.");
7469}
7470
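// Illustrative behaviour (sketch): a homogeneous aggregate such as
// {float, float, float, float} or [4 x i32] maps to 4 elements of its scalar
// type; a non-homogeneous struct like {i32, float}, or a type whose widened
// form falls outside the [MinVecRegSize, MaxVecRegSize] range or differs in
// store size from the original type, yields 0.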
7471unsigned BoUpSLP::canMapToVector(Type *T) const {
7472 unsigned N = 1;
7473 Type *EltTy = T;
7474
7475 while (isa<StructType, ArrayType, FixedVectorType>(Val: EltTy)) {
7476 if (auto *ST = dyn_cast<StructType>(Val: EltTy)) {
7477 // Check that struct is homogeneous.
7478 for (const auto *Ty : ST->elements())
7479 if (Ty != *ST->element_begin())
7480 return 0;
7481 N *= ST->getNumElements();
7482 EltTy = *ST->element_begin();
7483 } else if (auto *AT = dyn_cast<ArrayType>(Val: EltTy)) {
7484 N *= AT->getNumElements();
7485 EltTy = AT->getElementType();
7486 } else {
7487 auto *VT = cast<FixedVectorType>(Val: EltTy);
7488 N *= VT->getNumElements();
7489 EltTy = VT->getElementType();
7490 }
7491 }
7492
7493 if (!isValidElementType(Ty: EltTy))
7494 return 0;
7495 uint64_t VTSize = DL->getTypeStoreSizeInBits(Ty: getWidenedType(ScalarTy: EltTy, VF: N));
7496 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7497 VTSize != DL->getTypeStoreSizeInBits(Ty: T))
7498 return 0;
7499 return N;
7500}
7501
7502bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7503 SmallVectorImpl<unsigned> &CurrentOrder,
7504 bool ResizeAllowed) const {
7505 const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst, ExtractValueInst>);
7506 assert(It != VL.end() && "Expected at least one extract instruction.");
7507 auto *E0 = cast<Instruction>(Val: *It);
7508 assert(
7509 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7510 "Invalid opcode");
7511 // Check if all of the extracts come from the same vector and from the
7512 // correct offset.
7513 Value *Vec = E0->getOperand(i: 0);
7514
7515 CurrentOrder.clear();
7516
7517 // We have to extract from a vector/aggregate with the same number of elements.
7518 unsigned NElts;
7519 if (E0->getOpcode() == Instruction::ExtractValue) {
7520 NElts = canMapToVector(T: Vec->getType());
7521 if (!NElts)
7522 return false;
7523 // Check if load can be rewritten as load of vector.
7524 LoadInst *LI = dyn_cast<LoadInst>(Val: Vec);
7525 if (!LI || !LI->isSimple() || !LI->hasNUses(N: VL.size()))
7526 return false;
7527 } else {
7528 NElts = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
7529 }
7530
7531 unsigned E = VL.size();
7532 if (!ResizeAllowed && NElts != E)
7533 return false;
7534 SmallVector<int> Indices(E, PoisonMaskElem);
7535 unsigned MinIdx = NElts, MaxIdx = 0;
7536 for (auto [I, V] : enumerate(First&: VL)) {
7537 auto *Inst = dyn_cast<Instruction>(Val: V);
7538 if (!Inst)
7539 continue;
7540 if (Inst->getOperand(i: 0) != Vec)
7541 return false;
7542 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst))
7543 if (isa<UndefValue>(Val: EE->getIndexOperand()))
7544 continue;
7545 std::optional<unsigned> Idx = getExtractIndex(E: Inst);
7546 if (!Idx)
7547 return false;
7548 const unsigned ExtIdx = *Idx;
7549 if (ExtIdx >= NElts)
7550 continue;
7551 Indices[I] = ExtIdx;
7552 if (MinIdx > ExtIdx)
7553 MinIdx = ExtIdx;
7554 if (MaxIdx < ExtIdx)
7555 MaxIdx = ExtIdx;
7556 }
7557 if (MaxIdx - MinIdx + 1 > E)
7558 return false;
7559 if (MaxIdx + 1 <= E)
7560 MinIdx = 0;
7561
7562 // Check that all of the indices extract from the correct offset.
7563 bool ShouldKeepOrder = true;
7564 // Assign to all items the initial value E so we can check if the extract
7565 // instruction index was used already.
7566 // Also, later we can check that all the indices are used and we have a
7567 // consecutive access in the extract instructions, by checking that no
7568 // element of CurrentOrder still has value E.
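  // E.g. (illustrative): extracts with indices 1, 0, 2, 3 from one 4-element
  // vector produce CurrentOrder = {1, 0, 2, 3} and the function returns
  // false, while in-order indices 0, 1, 2, 3 clear CurrentOrder and return
  // true.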
7569 CurrentOrder.assign(NumElts: E, Elt: E);
7570 for (unsigned I = 0; I < E; ++I) {
7571 if (Indices[I] == PoisonMaskElem)
7572 continue;
7573 const unsigned ExtIdx = Indices[I] - MinIdx;
7574 if (CurrentOrder[ExtIdx] != E) {
7575 CurrentOrder.clear();
7576 return false;
7577 }
7578 ShouldKeepOrder &= ExtIdx == I;
7579 CurrentOrder[ExtIdx] = I;
7580 }
7581 if (ShouldKeepOrder)
7582 CurrentOrder.clear();
7583
7584 return ShouldKeepOrder;
7585}
7586
7587bool BoUpSLP::areAllUsersVectorized(
7588 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7589 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(V: I))) ||
7590 all_of(Range: I->users(), P: [this](User *U) {
7591 return ScalarToTreeEntry.contains(Val: U) ||
7592 isVectorLikeInstWithConstOps(V: U) ||
7593 (isa<ExtractElementInst>(Val: U) && MustGather.contains(Ptr: U));
7594 });
7595}
7596
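/// Returns the cost of vectorizing the call \p CI as a target intrinsic and
/// the cost of calling a matching vector library function. The library cost
/// is only computed when such a function exists and the call is not
/// 'nobuiltin'; otherwise it defaults to the intrinsic cost.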
7597static std::pair<InstructionCost, InstructionCost>
7598getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7599 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7600 ArrayRef<Type *> ArgTys) {
7601 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7602
7603 // Calculate the costs of the vector intrinsic and vector library calls.
7604 FastMathFlags FMF;
7605 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: CI))
7606 FMF = FPCI->getFastMathFlags();
7607 SmallVector<const Value *> Arguments(CI->args());
7608 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7609 dyn_cast<IntrinsicInst>(Val: CI));
7610 auto IntrinsicCost =
7611 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind: TTI::TCK_RecipThroughput);
7612
7613 auto Shape = VFShape::get(FTy: CI->getFunctionType(),
7614 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
7615 HasGlobalPred: false /*HasGlobalPred*/);
7616 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7617 auto LibCost = IntrinsicCost;
7618 if (!CI->isNoBuiltin() && VecFunc) {
7619 // Calculate the cost of the vector library call.
7620 // If the corresponding vector call is cheaper, return its cost.
7621 LibCost =
7622 TTI->getCallInstrCost(F: nullptr, RetTy: VecTy, Tys: ArgTys, CostKind: TTI::TCK_RecipThroughput);
7623 }
7624 return {IntrinsicCost, LibCost};
7625}
7626
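// E.g. (illustrative): for a 4-wide alternate-opcode node {add, sub, add,
// sub} with no reordering or reuse, the routine below produces the mask
// <0, 5, 2, 7>: main-opcode lanes select from the first vector, alternate
// lanes from the second.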
7627void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7628 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7629 SmallVectorImpl<Value *> *OpScalars,
7630 SmallVectorImpl<Value *> *AltScalars) const {
7631 unsigned Sz = Scalars.size();
7632 Mask.assign(NumElts: Sz, Elt: PoisonMaskElem);
7633 SmallVector<int> OrderMask;
7634 if (!ReorderIndices.empty())
7635 inversePermutation(Indices: ReorderIndices, Mask&: OrderMask);
7636 for (unsigned I = 0; I < Sz; ++I) {
7637 unsigned Idx = I;
7638 if (!ReorderIndices.empty())
7639 Idx = OrderMask[I];
7640 auto *OpInst = cast<Instruction>(Val: Scalars[Idx]);
7641 if (IsAltOp(OpInst)) {
7642 Mask[I] = Sz + Idx;
7643 if (AltScalars)
7644 AltScalars->push_back(Elt: OpInst);
7645 } else {
7646 Mask[I] = Idx;
7647 if (OpScalars)
7648 OpScalars->push_back(Elt: OpInst);
7649 }
7650 }
7651 if (!ReuseShuffleIndices.empty()) {
7652 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7653 transform(Range: ReuseShuffleIndices, d_first: NewMask.begin(), F: [&Mask](int Idx) {
7654 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7655 });
7656 Mask.swap(RHS&: NewMask);
7657 }
7658}
7659
7660static bool isAlternateInstruction(const Instruction *I,
7661 const Instruction *MainOp,
7662 const Instruction *AltOp,
7663 const TargetLibraryInfo &TLI) {
7664 if (auto *MainCI = dyn_cast<CmpInst>(Val: MainOp)) {
7665 auto *AltCI = cast<CmpInst>(Val: AltOp);
7666 CmpInst::Predicate MainP = MainCI->getPredicate();
7667 CmpInst::Predicate AltP = AltCI->getPredicate();
7668 assert(MainP != AltP && "Expected different main/alternate predicates.");
7669 auto *CI = cast<CmpInst>(Val: I);
7670 if (isCmpSameOrSwapped(BaseCI: MainCI, CI, TLI))
7671 return false;
7672 if (isCmpSameOrSwapped(BaseCI: AltCI, CI, TLI))
7673 return true;
7674 CmpInst::Predicate P = CI->getPredicate();
7675 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(pred: P);
7676
7677 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7678 "CmpInst expected to match either main or alternate predicate or "
7679 "their swap.");
7680 (void)AltP;
7681 return MainP != P && MainP != SwappedP;
7682 }
7683 return I->getOpcode() == AltOp->getOpcode();
7684}
7685
7686TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7687 assert(!Ops.empty());
7688 const auto *Op0 = Ops.front();
7689
7690 const bool IsConstant = all_of(Range&: Ops, P: [](Value *V) {
7691 // TODO: We should allow undef elements here
7692 return isConstant(V) && !isa<UndefValue>(Val: V);
7693 });
7694 const bool IsUniform = all_of(Range&: Ops, P: [=](Value *V) {
7695 // TODO: We should allow undef elements here
7696 return V == Op0;
7697 });
7698 const bool IsPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
7699 // TODO: We should allow undef elements here
7700 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
7701 return CI->getValue().isPowerOf2();
7702 return false;
7703 });
7704 const bool IsNegatedPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
7705 // TODO: We should allow undef elements here
7706 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
7707 return CI->getValue().isNegatedPowerOf2();
7708 return false;
7709 });
7710
7711 TTI::OperandValueKind VK = TTI::OK_AnyValue;
7712 if (IsConstant && IsUniform)
7713 VK = TTI::OK_UniformConstantValue;
7714 else if (IsConstant)
7715 VK = TTI::OK_NonUniformConstantValue;
7716 else if (IsUniform)
7717 VK = TTI::OK_UniformValue;
7718
7719 TTI::OperandValueProperties VP = TTI::OP_None;
7720 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7721 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7722
7723 return {.Kind: VK, .Properties: VP};
7724}
7725
7726namespace {
7727/// The base class for shuffle instruction emission and shuffle cost estimation.
7728class BaseShuffleAnalysis {
7729protected:
7730 /// Checks if the mask is an identity mask.
7731 /// \param IsStrict if true, the function returns false if the mask size does
7732 /// not match the vector size.
7733 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7734 bool IsStrict) {
7735 int Limit = Mask.size();
7736 int VF = VecTy->getNumElements();
7737 int Index = -1;
7738 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Limit))
7739 return true;
7740 if (!IsStrict) {
7741 // Consider extract subvector starting from index 0.
7742 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
7743 Index == 0)
7744 return true;
7745 // All VF-size submasks are identity (e.g.
7746 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7747 if (Limit % VF == 0 && all_of(Range: seq<int>(Begin: 0, End: Limit / VF), P: [=](int Idx) {
7748 ArrayRef<int> Slice = Mask.slice(N: Idx * VF, M: VF);
7749 return all_of(Range&: Slice, P: [](int I) { return I == PoisonMaskElem; }) ||
7750 ShuffleVectorInst::isIdentityMask(Mask: Slice, NumSrcElts: VF);
7751 }))
7752 return true;
7753 }
7754 return false;
7755 }
7756
7757 /// Tries to combine 2 different masks into a single one.
7758 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7759 /// change the size of the vector, \p LocalVF is the original size of the
7760 /// shuffled vector.
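/// E.g. (illustrative): an inner swap mask <1, 0> over a 2-element source
/// (LocalVF == 2) combined with the outer mask <1, 0> yields <0, 1>, i.e.
/// two swaps fold into an identity.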
7761 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7762 ArrayRef<int> ExtMask) {
7763 unsigned VF = Mask.size();
7764 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7765 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7766 if (ExtMask[I] == PoisonMaskElem)
7767 continue;
7768 int MaskedIdx = Mask[ExtMask[I] % VF];
7769 NewMask[I] =
7770 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7771 }
7772 Mask.swap(RHS&: NewMask);
7773 }
7774
7775 /// Looks through shuffles trying to reduce the final number of shuffles in
7776 /// the code. The function looks through the previously emitted shuffle
7777 /// instructions and properly marks indices in the mask as undef.
7778 /// For example, given the code
7779 /// \code
7780 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7781 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7782 /// \endcode
7783 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>,
7784 /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
7785 /// <0, 1, 2, 3> for the shuffle.
7786 /// If the 2 operands are of different sizes, the smaller one will be resized
7787 /// and the mask recalculated properly.
7788 /// For example, given the code
7789 /// \code
7790 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7791 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7792 /// \endcode
7793 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>,
7794 /// it will look through %s1 and %s2 and select vectors %0 and %1 with mask
7795 /// <0, 1, 2, 3> for the shuffle.
7796 /// So, it tries to transform permutations to simple vector merge, if
7797 /// possible.
7798 /// \param V The input vector which must be shuffled using the given \p Mask.
7799 /// If the better candidate is found, \p V is set to this best candidate
7800 /// vector.
7801 /// \param Mask The input mask for the shuffle. If the best candidate is found
7802 /// during looking-through-shuffles attempt, it is updated accordingly.
7803 /// \param SinglePermute true if the shuffle operation is originally a
7804 /// single-value-permutation. In this case the look-through-shuffles procedure
7805 /// may look for resizing shuffles as the best candidates.
7806 /// \return true if the shuffle results in the non-resizing identity shuffle
7807 /// (and thus can be ignored), false otherwise.
7808 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7809 bool SinglePermute) {
7810 Value *Op = V;
7811 ShuffleVectorInst *IdentityOp = nullptr;
7812 SmallVector<int> IdentityMask;
7813 while (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Op)) {
7814 // Exit if this is not a fixed vector type or a size-changing shuffle.
7815 auto *SVTy = dyn_cast<FixedVectorType>(Val: SV->getType());
7816 if (!SVTy)
7817 break;
7818 // Remember the identity or broadcast mask, if it is not a resizing
7819 // shuffle. If no better candidates are found, this Op and Mask will be
7820 // used in the final shuffle.
7821 if (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/false)) {
7822 if (!IdentityOp || !SinglePermute ||
7823 (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/true) &&
7824 !ShuffleVectorInst::isZeroEltSplatMask(Mask: IdentityMask,
7825 NumSrcElts: IdentityMask.size()))) {
7826 IdentityOp = SV;
7827 // Store the current mask in IdentityMask so that we do not lose this
7828 // info later if IdentityOp is selected as the best candidate for the
7829 // permutation.
7830 IdentityMask.assign(RHS: Mask);
7831 }
7832 }
7833 // Remember the broadcast mask. If no better candidates are found, this Op
7834 // and Mask will be used in the final shuffle.
7835 // Zero splat can be used as identity too, since it might be used with
7836 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7837 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
7838 // is expensive, and the analysis finds out that the source vector is just
7839 // a broadcast, the original mask can be transformed into the identity mask
7840 // <0, 1, 2, 3>.
7841 // \code
7842 // %0 = shuffle %v, poison, zeroinitalizer
7843 // %res = shuffle %0, poison, <3, 1, 2, 0>
7844 // \endcode
7845 // may be transformed to
7846 // \code
7847 // %0 = shuffle %v, poison, zeroinitalizer
7848 // %res = shuffle %0, poison, <0, 1, 2, 3>
7849 // \endcode
7850 if (SV->isZeroEltSplat()) {
7851 IdentityOp = SV;
7852 IdentityMask.assign(RHS: Mask);
7853 }
7854 int LocalVF = Mask.size();
7855 if (auto *SVOpTy =
7856 dyn_cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType()))
7857 LocalVF = SVOpTy->getNumElements();
7858 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7859 for (auto [Idx, I] : enumerate(First&: Mask)) {
7860 if (I == PoisonMaskElem ||
7861 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7862 continue;
7863 ExtMask[Idx] = SV->getMaskValue(Elt: I);
7864 }
7865 bool IsOp1Undef =
7866 isUndefVector(V: SV->getOperand(i_nocapture: 0),
7867 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::FirstArg))
7868 .all();
7869 bool IsOp2Undef =
7870 isUndefVector(V: SV->getOperand(i_nocapture: 1),
7871 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::SecondArg))
7872 .all();
7873 if (!IsOp1Undef && !IsOp2Undef) {
7874 // Update mask and mark undef elems.
7875 for (int &I : Mask) {
7876 if (I == PoisonMaskElem)
7877 continue;
7878 if (SV->getMaskValue(Elt: I % SV->getShuffleMask().size()) ==
7879 PoisonMaskElem)
7880 I = PoisonMaskElem;
7881 }
7882 break;
7883 }
7884 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7885 SV->getShuffleMask().end());
7886 combineMasks(LocalVF, Mask&: ShuffleMask, ExtMask: Mask);
7887 Mask.swap(RHS&: ShuffleMask);
7888 if (IsOp2Undef)
7889 Op = SV->getOperand(i_nocapture: 0);
7890 else
7891 Op = SV->getOperand(i_nocapture: 1);
7892 }
7893 if (auto *OpTy = dyn_cast<FixedVectorType>(Val: Op->getType());
7894 !OpTy || !isIdentityMask(Mask, VecTy: OpTy, IsStrict: SinglePermute) ||
7895 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())) {
7896 if (IdentityOp) {
7897 V = IdentityOp;
7898 assert(Mask.size() == IdentityMask.size() &&
7899 "Expected masks of same sizes.");
7900 // Clear known poison elements.
7901 for (auto [I, Idx] : enumerate(First&: Mask))
7902 if (Idx == PoisonMaskElem)
7903 IdentityMask[I] = PoisonMaskElem;
7904 Mask.swap(RHS&: IdentityMask);
7905 auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: V);
7906 return SinglePermute &&
7907 (isIdentityMask(Mask, VecTy: cast<FixedVectorType>(Val: V->getType()),
7908 /*IsStrict=*/true) ||
7909 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7910 Shuffle->isZeroEltSplat() &&
7911 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())));
7912 }
7913 V = Op;
7914 return false;
7915 }
7916 V = Op;
7917 return true;
7918 }
7919
7920 /// Smart shuffle instruction emission, walks through shuffles trees and
7921 /// tries to find the best matching vector for the actual shuffle
7922 /// instruction.
7923 template <typename T, typename ShuffleBuilderTy>
7924 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7925 ShuffleBuilderTy &Builder) {
7926 assert(V1 && "Expected at least one vector value.");
7927 if (V2)
7928 Builder.resizeToMatch(V1, V2);
7929 int VF = Mask.size();
7930 if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
7931 VF = FTy->getNumElements();
7932 if (V2 &&
7933 !isUndefVector(V: V2, UseMask: buildUseMask(VF, Mask, MaskArg: UseMask::SecondArg)).all()) {
7934 // Peek through shuffles.
7935 Value *Op1 = V1;
7936 Value *Op2 = V2;
7937 int VF =
7938 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
7939 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7940 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7941 for (int I = 0, E = Mask.size(); I < E; ++I) {
7942 if (Mask[I] < VF)
7943 CombinedMask1[I] = Mask[I];
7944 else
7945 CombinedMask2[I] = Mask[I] - VF;
7946 }
7947 Value *PrevOp1;
7948 Value *PrevOp2;
7949 do {
7950 PrevOp1 = Op1;
7951 PrevOp2 = Op2;
7952 (void)peekThroughShuffles(V&: Op1, Mask&: CombinedMask1, /*SinglePermute=*/false);
7953 (void)peekThroughShuffles(V&: Op2, Mask&: CombinedMask2, /*SinglePermute=*/false);
7954 // Check if we have 2 resizing shuffles - need to peek through operands
7955 // again.
7956 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Val: Op1))
7957 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Val: Op2)) {
7958 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7959 for (auto [Idx, I] : enumerate(First&: CombinedMask1)) {
7960 if (I == PoisonMaskElem)
7961 continue;
7962 ExtMask1[Idx] = SV1->getMaskValue(Elt: I);
7963 }
7964 SmallBitVector UseMask1 = buildUseMask(
7965 VF: cast<FixedVectorType>(Val: SV1->getOperand(i_nocapture: 1)->getType())
7966 ->getNumElements(),
7967 Mask: ExtMask1, MaskArg: UseMask::SecondArg);
7968 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7969 for (auto [Idx, I] : enumerate(First&: CombinedMask2)) {
7970 if (I == PoisonMaskElem)
7971 continue;
7972 ExtMask2[Idx] = SV2->getMaskValue(Elt: I);
7973 }
7974 SmallBitVector UseMask2 = buildUseMask(
7975 VF: cast<FixedVectorType>(Val: SV2->getOperand(i_nocapture: 1)->getType())
7976 ->getNumElements(),
7977 Mask: ExtMask2, MaskArg: UseMask::SecondArg);
7978 if (SV1->getOperand(i_nocapture: 0)->getType() ==
7979 SV2->getOperand(i_nocapture: 0)->getType() &&
7980 SV1->getOperand(i_nocapture: 0)->getType() != SV1->getType() &&
7981 isUndefVector(V: SV1->getOperand(i_nocapture: 1), UseMask: UseMask1).all() &&
7982 isUndefVector(V: SV2->getOperand(i_nocapture: 1), UseMask: UseMask2).all()) {
7983 Op1 = SV1->getOperand(i_nocapture: 0);
7984 Op2 = SV2->getOperand(i_nocapture: 0);
7985 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7986 SV1->getShuffleMask().end());
7987 int LocalVF = ShuffleMask1.size();
7988 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op1->getType()))
7989 LocalVF = FTy->getNumElements();
7990 combineMasks(LocalVF, Mask&: ShuffleMask1, ExtMask: CombinedMask1);
7991 CombinedMask1.swap(RHS&: ShuffleMask1);
7992 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7993 SV2->getShuffleMask().end());
7994 LocalVF = ShuffleMask2.size();
7995 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op2->getType()))
7996 LocalVF = FTy->getNumElements();
7997 combineMasks(LocalVF, Mask&: ShuffleMask2, ExtMask: CombinedMask2);
7998 CombinedMask2.swap(RHS&: ShuffleMask2);
7999 }
8000 }
8001 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
8002 Builder.resizeToMatch(Op1, Op2);
8003 VF = std::max(a: cast<VectorType>(Val: Op1->getType())
8004 ->getElementCount()
8005 .getKnownMinValue(),
8006 b: cast<VectorType>(Val: Op2->getType())
8007 ->getElementCount()
8008 .getKnownMinValue());
8009 for (int I = 0, E = Mask.size(); I < E; ++I) {
8010 if (CombinedMask2[I] != PoisonMaskElem) {
8011 assert(CombinedMask1[I] == PoisonMaskElem &&
8012 "Expected undefined mask element");
8013 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
8014 }
8015 }
8016 if (Op1 == Op2 &&
8017 (ShuffleVectorInst::isIdentityMask(Mask: CombinedMask1, NumSrcElts: VF) ||
8018 (ShuffleVectorInst::isZeroEltSplatMask(Mask: CombinedMask1, NumSrcElts: VF) &&
8019 isa<ShuffleVectorInst>(Val: Op1) &&
8020 cast<ShuffleVectorInst>(Val: Op1)->getShuffleMask() ==
8021 ArrayRef(CombinedMask1))))
8022 return Builder.createIdentity(Op1);
8023 return Builder.createShuffleVector(
8024 Op1, Op1 == Op2 ? PoisonValue::get(T: Op1->getType()) : Op2,
8025 CombinedMask1);
8026 }
8027 if (isa<PoisonValue>(Val: V1))
8028 return Builder.createPoison(
8029 cast<VectorType>(Val: V1->getType())->getElementType(), Mask.size());
8030 SmallVector<int> NewMask(Mask.begin(), Mask.end());
8031 bool IsIdentity = peekThroughShuffles(V&: V1, Mask&: NewMask, /*SinglePermute=*/true);
8032 assert(V1 && "Expected non-null value after looking through shuffles.");
8033
8034 if (!IsIdentity)
8035 return Builder.createShuffleVector(V1, NewMask);
8036 return Builder.createIdentity(V1);
8037 }
8038};
8039} // namespace
8040
8041/// Returns the cost of the shuffle instructions with the given \p Kind, vector
8042/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
8043/// subvector pattern.
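/// E.g. (illustrative): a two-source permute mask like
/// <0, 1, 2, 3, 4, 5, poison, poison> over <4 x i32> sources appends the
/// first half of the second source to the first one; such masks may be
/// costed as an insert-subvector into a widened vector rather than as a
/// generic two-source permute.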
8044static InstructionCost
8045getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
8046 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
8047 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
8048 int Index = 0, VectorType *SubTp = nullptr,
8049 ArrayRef<const Value *> Args = std::nullopt) {
8050 if (Kind != TTI::SK_PermuteTwoSrc)
8051 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8052 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
8053 int NumSubElts;
8054 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
8055 Mask, NumSrcElts, NumSubElts, Index)) {
8056 if (Index + NumSubElts > NumSrcElts &&
8057 Index + NumSrcElts <= static_cast<int>(Mask.size()))
8058 return TTI.getShuffleCost(
8059 Kind: TTI::SK_InsertSubvector,
8060 Tp: getWidenedType(ScalarTy: Tp->getElementType(), VF: Mask.size()), Mask,
8061 CostKind: TTI::TCK_RecipThroughput, Index, SubTp: Tp);
8062 }
8063 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
8064}
8065
8066/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
8067static std::pair<InstructionCost, InstructionCost>
8068getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
8069 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
8070 Type *ScalarTy, VectorType *VecTy) {
8071 InstructionCost ScalarCost = 0;
8072 InstructionCost VecCost = 0;
8073 // Here we differentiate two cases: (1) when Ptrs represent a regular
8074 // vectorization tree node (as they are pointer arguments of scattered
8075 // loads) or (2) when Ptrs are the arguments of loads or stores being
8076 // vectorized as a plain wide unit-stride load/store since all the
8077 // loads/stores are known to be from/to adjacent locations.
8078 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
8079 // Case 2: estimate the pointer-related costs when vectorizing to
8080 // a wide load/store.
8081 // Scalar cost is estimated as a set of pointers with known relationship
8082 // between them.
8083 // For vector code we will use BasePtr as argument for the wide load/store
8084 // but we also need to account for all the instructions which are going to
8085 // stay in vectorized code due to uses outside of these scalar
8086 // loads/stores.
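    // E.g. (illustrative): for four stores to p[0], p[1], p[2], p[3]
    // addressed through GEPs, the scalar cost covers the whole unit-stride
    // pointer chain, while the vector cost only covers the pointers that
    // survive vectorization (the base pointer plus any GEPs with uses
    // outside the scalar stores).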
8087 ScalarCost = TTI.getPointersChainCost(
8088 Ptrs, Base: BasePtr, Info: TTI::PointersChainInfo::getUnitStride(), AccessTy: ScalarTy,
8089 CostKind);
8090
8091 SmallVector<const Value *> PtrsRetainedInVecCode;
8092 for (Value *V : Ptrs) {
8093 if (V == BasePtr) {
8094 PtrsRetainedInVecCode.push_back(Elt: V);
8095 continue;
8096 }
8097 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
8098 // For simplicity assume Ptr stays in vectorized code if it's not a
8099 // GEP instruction. We don't care since its cost is considered free.
8100 // TODO: We should check for any uses outside of vectorizable tree
8101 // rather than just single use.
8102 if (!Ptr || !Ptr->hasOneUse())
8103 PtrsRetainedInVecCode.push_back(Elt: V);
8104 }
8105
8106 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
8107 // If all pointers stay in vectorized code then we don't have
8108 // any savings on that.
8109 return std::make_pair(x: TTI::TCC_Free, y: TTI::TCC_Free);
8110 }
8111 VecCost = TTI.getPointersChainCost(Ptrs: PtrsRetainedInVecCode, Base: BasePtr,
8112 Info: TTI::PointersChainInfo::getKnownStride(),
8113 AccessTy: VecTy, CostKind);
8114 } else {
8115 // Case 1: Ptrs are the arguments of loads that we are going to transform
8116 // into masked gather load intrinsic.
8117 // All the scalar GEPs will be removed as a result of vectorization.
8118 // For any external uses of some lanes, extractelement instructions will
8119 // be generated (whose cost is estimated separately).
8120 TTI::PointersChainInfo PtrsInfo =
8121 all_of(Range&: Ptrs,
8122 P: [](const Value *V) {
8123 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
8124 return Ptr && !Ptr->hasAllConstantIndices();
8125 })
8126 ? TTI::PointersChainInfo::getUnknownStride()
8127 : TTI::PointersChainInfo::getKnownStride();
8128
8129 ScalarCost =
8130 TTI.getPointersChainCost(Ptrs, Base: BasePtr, Info: PtrsInfo, AccessTy: ScalarTy, CostKind);
8131 auto *BaseGEP = dyn_cast<GEPOperator>(Val: BasePtr);
8132 if (!BaseGEP) {
8133 auto *It = find_if(Range&: Ptrs, P: IsaPred<GEPOperator>);
8134 if (It != Ptrs.end())
8135 BaseGEP = cast<GEPOperator>(Val: *It);
8136 }
8137 if (BaseGEP) {
8138 SmallVector<const Value *> Indices(BaseGEP->indices());
8139 VecCost = TTI.getGEPCost(PointeeType: BaseGEP->getSourceElementType(),
8140 Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: VecTy,
8141 CostKind);
8142 }
8143 }
8144
8145 return std::make_pair(x&: ScalarCost, y&: VecCost);
8146}
8147
8148void BoUpSLP::transformNodes() {
8149 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8150 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8151 TreeEntry &E = *TE;
8152 switch (E.getOpcode()) {
8153 case Instruction::Load: {
8154 // No need to reorder masked gather loads, just reorder the scalar
8155 // operands.
8156 if (E.State != TreeEntry::Vectorize)
8157 break;
8158 Type *ScalarTy = E.getMainOp()->getType();
8159 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
8160 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E.Scalars);
8161 // Check if profitable to represent consecutive load + reverse as strided
8162 // load with stride -1.
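      // E.g. (illustrative): loads of p[3], p[2], p[1], p[0] would otherwise
      // be emitted as a consecutive <4 x ty> load of p[0..3] plus a reverse
      // shuffle; a single strided load starting at &p[3] with a negative
      // stride may be cheaper on targets with legal strided accesses.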
8163 if (isReverseOrder(Order: E.ReorderIndices) &&
8164 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
8165 SmallVector<int> Mask;
8166 inversePermutation(Indices: E.ReorderIndices, Mask);
8167 auto *BaseLI = cast<LoadInst>(Val: E.Scalars.back());
8168 InstructionCost OriginalVecCost =
8169 TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: BaseLI->getAlign(),
8170 AddressSpace: BaseLI->getPointerAddressSpace(), CostKind,
8171 OpdInfo: TTI::OperandValueInfo()) +
8172 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
8173 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8174 Opcode: Instruction::Load, DataTy: VecTy, Ptr: BaseLI->getPointerOperand(),
8175 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: BaseLI);
8176 if (StridedCost < OriginalVecCost)
8177 // Strided load is more profitable than consecutive load + reverse -
8178 // transform the node to strided load.
8179 E.State = TreeEntry::StridedVectorize;
8180 }
8181 break;
8182 }
8183 case Instruction::Store: {
8184 Type *ScalarTy =
8185 cast<StoreInst>(Val: E.getMainOp())->getValueOperand()->getType();
8186 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
8187 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E.Scalars);
8188 // Check if profitable to represent consecutive store + reverse as strided
8189 // store with stride -1.
8190 if (isReverseOrder(Order: E.ReorderIndices) &&
8191 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
8192 SmallVector<int> Mask;
8193 inversePermutation(Indices: E.ReorderIndices, Mask);
8194 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.back());
8195 InstructionCost OriginalVecCost =
8196 TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
8197 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind,
8198 OpdInfo: TTI::OperandValueInfo()) +
8199 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
8200 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8201 Opcode: Instruction::Store, DataTy: VecTy, Ptr: BaseSI->getPointerOperand(),
8202 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: BaseSI);
8203 if (StridedCost < OriginalVecCost)
8204 // Strided store is more profitable than consecutive store + reverse -
8205 // transform the node to strided store.
8206 E.State = TreeEntry::StridedVectorize;
8207 }
8208 break;
8209 }
8210 default:
8211 break;
8212 }
8213 }
8214}
8215
8216/// Merges shuffle masks and emits the final shuffle instruction, if required.
8217/// It supports shuffling of 2 input vectors. It implements lazy shuffle
8218/// emission: the actual shuffle instruction is generated only if it is
8219/// actually required. Otherwise, the shuffle instruction emission is delayed
8220/// till the end of the process, to reduce the number of emitted instructions
8221/// and simplify further analysis/transformations.
8222class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
8223 bool IsFinalized = false;
8224 SmallVector<int> CommonMask;
8225 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
8226 Type *ScalarTy = nullptr;
8227 const TargetTransformInfo &TTI;
8228 InstructionCost Cost = 0;
8229 SmallDenseSet<Value *> VectorizedVals;
8230 BoUpSLP &R;
8231 SmallPtrSetImpl<Value *> &CheckedExtracts;
8232 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8233 /// While set, we are still trying to estimate the cost for the same nodes and
8234 /// can delay the actual cost estimation (virtual shuffle instruction emission).
8235 /// This may help to better estimate the cost if the same nodes must be
8236 /// permuted, and allows moving most of the long-shuffle cost estimation to TTI.
8237 bool SameNodesEstimated = true;
8238
8239 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
8240 if (Ty->getScalarType()->isPointerTy()) {
8241 Constant *Res = ConstantExpr::getIntToPtr(
8242 C: ConstantInt::getAllOnesValue(
8243 Ty: IntegerType::get(C&: Ty->getContext(),
8244 NumBits: DL.getTypeStoreSizeInBits(Ty: Ty->getScalarType()))),
8245 Ty: Ty->getScalarType());
8246 if (auto *VTy = dyn_cast<VectorType>(Val: Ty))
8247 Res = ConstantVector::getSplat(EC: VTy->getElementCount(), Elt: Res);
8248 return Res;
8249 }
8250 return Constant::getAllOnesValue(Ty);
8251 }
8252
8253 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
8254 if ((!Root && allConstant(VL)) || all_of(Range&: VL, P: IsaPred<UndefValue>))
8255 return TTI::TCC_Free;
8256 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
8257 InstructionCost GatherCost = 0;
8258 SmallVector<Value *> Gathers(VL.begin(), VL.end());
8259 // Improve gather cost for gather of loads, if we can group some of the
8260 // loads into vector loads.
8261 InstructionsState S = getSameOpcode(VL, TLI: *R.TLI);
8262 const unsigned Sz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
8263 unsigned MinVF = R.getMinVF(Sz: 2 * Sz);
8264 if (VL.size() > 2 &&
8265 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
8266 (InVectors.empty() &&
8267 any_of(Range: seq<unsigned>(Begin: 0, End: VL.size() / MinVF),
8268 P: [&](unsigned Idx) {
8269 ArrayRef<Value *> SubVL = VL.slice(N: Idx * MinVF, M: MinVF);
8270 InstructionsState S = getSameOpcode(VL: SubVL, TLI: *R.TLI);
8271 return S.getOpcode() == Instruction::Load &&
8272 !S.isAltShuffle();
8273 }))) &&
8274 !all_of(Range&: Gathers, P: [&](Value *V) { return R.getTreeEntry(V); }) &&
8275 !isSplat(VL: Gathers)) {
8276 InstructionCost BaseCost = R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root, ScalarTy);
8277 SetVector<Value *> VectorizedLoads;
8278 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
8279 SmallVector<unsigned> ScatterVectorized;
8280 unsigned StartIdx = 0;
8281 unsigned VF = VL.size() / 2;
8282 for (; VF >= MinVF; VF /= 2) {
8283 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
8284 Cnt += VF) {
8285 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
8286 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
8287 InstructionsState SliceS = getSameOpcode(VL: Slice, TLI: *R.TLI);
8288 if (SliceS.getOpcode() != Instruction::Load ||
8289 SliceS.isAltShuffle())
8290 continue;
8291 }
8292 if (!VectorizedLoads.count(key: Slice.front()) &&
8293 !VectorizedLoads.count(key: Slice.back()) && allSameBlock(VL: Slice)) {
8294 SmallVector<Value *> PointerOps;
8295 OrdersType CurrentOrder;
8296 LoadsState LS = R.canVectorizeLoads(VL: Slice, VL0: Slice.front(),
8297 Order&: CurrentOrder, PointerOps);
8298 switch (LS) {
8299 case LoadsState::Vectorize:
8300 case LoadsState::ScatterVectorize:
8301 case LoadsState::StridedVectorize:
8302 // Mark the vectorized loads so that we don't vectorize them
8303 // again.
8304 // TODO: better handling of loads with reorders.
8305 if (((LS == LoadsState::Vectorize ||
8306 LS == LoadsState::StridedVectorize) &&
8307 CurrentOrder.empty()) ||
8308 (LS == LoadsState::StridedVectorize &&
8309 isReverseOrder(Order: CurrentOrder)))
8310 VectorizedStarts.emplace_back(Args&: Cnt, Args&: LS);
8311 else
8312 ScatterVectorized.push_back(Elt: Cnt);
8313 VectorizedLoads.insert(Start: Slice.begin(), End: Slice.end());
8314 // If we vectorized initial block, no need to try to vectorize
8315 // it again.
8316 if (Cnt == StartIdx)
8317 StartIdx += VF;
8318 break;
8319 case LoadsState::Gather:
8320 break;
8321 }
8322 }
8323 }
8324 // Check if the whole array was vectorized already - exit.
8325 if (StartIdx >= VL.size())
8326 break;
8327 // Found vectorizable parts - exit.
8328 if (!VectorizedLoads.empty())
8329 break;
8330 }
8331 if (!VectorizedLoads.empty()) {
8332 unsigned NumParts = TTI.getNumberOfParts(Tp: VecTy);
8333 bool NeedInsertSubvectorAnalysis =
8334 !NumParts || (VL.size() / VF) > NumParts;
8335 // Get the cost for gathered loads.
8336 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
8337 if (VectorizedLoads.contains(key: VL[I]))
8338 continue;
8339 GatherCost +=
8340 getBuildVectorCost(VL: VL.slice(N: I, M: std::min(a: End - I, b: VF)), Root);
8341 }
8342 // Exclude potentially vectorized loads from list of gathered
8343 // scalars.
8344 Gathers.assign(NumElts: Gathers.size(), Elt: PoisonValue::get(T: VL.front()->getType()));
8345 // The cost for vectorized loads.
8346 InstructionCost ScalarsCost = 0;
8347 for (Value *V : VectorizedLoads) {
8348 auto *LI = cast<LoadInst>(Val: V);
8349 ScalarsCost +=
8350 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LI->getType(),
8351 Alignment: LI->getAlign(), AddressSpace: LI->getPointerAddressSpace(),
8352 CostKind, OpdInfo: TTI::OperandValueInfo(), I: LI);
8353 }
8354 auto *LoadTy = getWidenedType(ScalarTy: VL.front()->getType(), VF);
8355 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
8356 auto *LI = cast<LoadInst>(Val: VL[P.first]);
8357 Align Alignment = LI->getAlign();
8358 GatherCost +=
8359 P.second == LoadsState::Vectorize
8360 ? TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadTy, Alignment,
8361 AddressSpace: LI->getPointerAddressSpace(), CostKind,
8362 OpdInfo: TTI::OperandValueInfo(), I: LI)
8363 : TTI.getStridedMemoryOpCost(
8364 Opcode: Instruction::Load, DataTy: LoadTy, Ptr: LI->getPointerOperand(),
8365 /*VariableMask=*/false, Alignment, CostKind, I: LI);
8366 // Estimate GEP cost.
8367 SmallVector<Value *> PointerOps(VF);
8368 for (auto [I, V] : enumerate(First: VL.slice(N: P.first, M: VF)))
8369 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
8370 auto [ScalarGEPCost, VectorGEPCost] =
8371 getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: LI->getPointerOperand(),
8372 Opcode: Instruction::Load, CostKind, ScalarTy: LI->getType(), VecTy: LoadTy);
8373 GatherCost += VectorGEPCost - ScalarGEPCost;
8374 }
8375 for (unsigned P : ScatterVectorized) {
8376 auto *LI0 = cast<LoadInst>(Val: VL[P]);
8377 ArrayRef<Value *> Slice = VL.slice(N: P, M: VF);
8378 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: Slice);
8379 GatherCost += TTI.getGatherScatterOpCost(
8380 Opcode: Instruction::Load, DataTy: LoadTy, Ptr: LI0->getPointerOperand(),
8381 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind, I: LI0);
8382 // Estimate GEP cost.
8383 SmallVector<Value *> PointerOps(VF);
8384 for (auto [I, V] : enumerate(First&: Slice))
8385 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
8386 OrdersType Order;
8387 if (sortPtrAccesses(VL: PointerOps, ElemTy: LI0->getType(), DL: *R.DL, SE&: *R.SE,
8388 SortedIndices&: Order)) {
8389 // TODO: improve checks if GEPs can be vectorized.
8390 Value *Ptr0 = PointerOps.front();
8391 Type *ScalarTy = Ptr0->getType();
8392 auto *VecTy = getWidenedType(ScalarTy, VF);
8393 auto [ScalarGEPCost, VectorGEPCost] =
8394 getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: Ptr0, Opcode: Instruction::GetElementPtr,
8395 CostKind, ScalarTy, VecTy);
8396 GatherCost += VectorGEPCost - ScalarGEPCost;
8397 if (!Order.empty()) {
8398 SmallVector<int> Mask;
8399 inversePermutation(Indices: Order, Mask);
8400 GatherCost += ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc,
8401 Tp: VecTy, Mask, CostKind);
8402 }
8403 } else {
8404 GatherCost += R.getGatherCost(VL: PointerOps, /*ForPoisonSrc=*/true,
8405 ScalarTy: PointerOps.front()->getType());
8406 }
8407 }
8408 if (NeedInsertSubvectorAnalysis) {
8409 // Add the cost for the subvectors insert.
8410 SmallVector<int> ShuffleMask(VL.size());
8411 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8412 for (unsigned Idx : seq<unsigned>(Begin: 0, End: E))
8413 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8414 GatherCost += TTI.getShuffleCost(Kind: TTI::SK_InsertSubvector, Tp: VecTy,
8415 Mask: ShuffleMask, CostKind, Index: I, SubTp: LoadTy);
8416 }
8417 }
8418 GatherCost -= ScalarsCost;
8419 }
8420 GatherCost = std::min(a: BaseCost, b: GatherCost);
8421 } else if (!Root && isSplat(VL)) {
// Found a broadcast of a single scalar; calculate the cost as a broadcast.
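// E.g., for VL = {x, x, x, x} the estimate is one insertelement plus one
// SK_Broadcast shuffle; for VL = {x, undef, undef, undef} a single
// insertelement is enough and no shuffle cost is added.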
8424 const auto *It = find_if_not(Range&: VL, P: IsaPred<UndefValue>);
8425 assert(It != VL.end() && "Expected at least one non-undef value.");
8426 // Add broadcast for non-identity shuffle only.
8427 bool NeedShuffle =
8428 count(Range&: VL, Element: *It) > 1 &&
8429 (VL.front() != *It || !all_of(Range: VL.drop_front(), P: IsaPred<UndefValue>));
8430 if (!NeedShuffle)
8431 return TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy,
8432 CostKind, Index: std::distance(first: VL.begin(), last: It),
8433 Op0: PoisonValue::get(T: VecTy), Op1: *It);
8434
8435 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8436 transform(Range&: VL, d_first: ShuffleMask.begin(), F: [](Value *V) {
8437 return isa<PoisonValue>(Val: V) ? PoisonMaskElem : 0;
8438 });
8439 InstructionCost InsertCost =
8440 TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: 0,
8441 Op0: PoisonValue::get(T: VecTy), Op1: *It);
8442 return InsertCost + TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast,
8443 Tp: VecTy, Mask: ShuffleMask, CostKind,
8444 /*Index=*/0, /*SubTp=*/nullptr,
8445 /*Args=*/*It);
8446 }
8447 return GatherCost +
8448 (all_of(Range&: Gathers, P: IsaPred<UndefValue>)
8449 ? TTI::TCC_Free
8450 : R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root && VL.equals(RHS: Gathers),
8451 ScalarTy));
}
8453
8454 /// Compute the cost of creating a vector containing the extracted values from
8455 /// \p VL.
8456 InstructionCost
8457 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8458 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8459 unsigned NumParts) {
8460 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8461 unsigned NumElts =
8462 std::accumulate(first: VL.begin(), last: VL.end(), init: 0, binary_op: [](unsigned Sz, Value *V) {
8463 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
8464 if (!EE)
8465 return Sz;
8466 auto *VecTy = dyn_cast<FixedVectorType>(Val: EE->getVectorOperandType());
8467 if (!VecTy)
8468 return Sz;
8469 return std::max(a: Sz, b: VecTy->getNumElements());
8470 });
8471 // FIXME: this must be moved to TTI for better estimation.
8472 unsigned EltsPerVector = getPartNumElems(Size: VL.size(), NumParts);
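// The lambda below checks whether a sub-mask reads from at most two physical
// registers and rewrites its indices to be register-local. E.g., with two
// 4-element registers (EltsPerVector == 4), the sub-mask <5, 7, poison, 4>
// touches only the second register, so it is rewritten to <1, 3, poison, 0>
// and costed as an extract of that register plus a cheap single-source
// permute.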
8473 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
8474 SmallVectorImpl<unsigned> &Indices)
8475 -> std::optional<TTI::ShuffleKind> {
8476 if (NumElts <= EltsPerVector)
8477 return std::nullopt;
8478 int OffsetReg0 =
8479 alignDown(Value: std::accumulate(first: Mask.begin(), last: Mask.end(), INT_MAX,
8480 binary_op: [](int S, int I) {
8481 if (I == PoisonMaskElem)
8482 return S;
8483 return std::min(a: S, b: I);
8484 }),
8485 Align: EltsPerVector);
8486 int OffsetReg1 = OffsetReg0;
8487 DenseSet<int> RegIndices;
// Check whether we are trying to permute the same single input vector or two
// input vectors.
8489 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8490 int FirstRegId = -1;
8491 Indices.assign(NumElts: 1, Elt: OffsetReg0);
8492 for (auto [Pos, I] : enumerate(First&: Mask)) {
8493 if (I == PoisonMaskElem)
8494 continue;
8495 int Idx = I - OffsetReg0;
8496 int RegId =
8497 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
8498 if (FirstRegId < 0)
8499 FirstRegId = RegId;
8500 RegIndices.insert(V: RegId);
8501 if (RegIndices.size() > 2)
8502 return std::nullopt;
8503 if (RegIndices.size() == 2) {
8504 ShuffleKind = TTI::SK_PermuteTwoSrc;
8505 if (Indices.size() == 1) {
8506 OffsetReg1 = alignDown(
8507 Value: std::accumulate(
8508 first: std::next(x: Mask.begin(), n: Pos), last: Mask.end(), INT_MAX,
8509 binary_op: [&](int S, int I) {
8510 if (I == PoisonMaskElem)
8511 return S;
8512 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
8513 ((I - OffsetReg0) % NumElts) / EltsPerVector;
8514 if (RegId == FirstRegId)
8515 return S;
8516 return std::min(a: S, b: I);
8517 }),
8518 Align: EltsPerVector);
8519 Indices.push_back(Elt: OffsetReg1 % NumElts);
8520 }
8521 Idx = I - OffsetReg1;
8522 }
8523 I = (Idx % NumElts) % EltsPerVector +
8524 (RegId == FirstRegId ? 0 : EltsPerVector);
8525 }
8526 return ShuffleKind;
8527 };
8528 InstructionCost Cost = 0;
8529
8530 // Process extracts in blocks of EltsPerVector to check if the source vector
8531 // operand can be re-used directly. If not, add the cost of creating a
8532 // shuffle to extract the values into a vector register.
8533 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
8534 if (!ShuffleKinds[Part])
8535 continue;
8536 ArrayRef<int> MaskSlice = Mask.slice(
8537 N: Part * EltsPerVector, M: getNumElems(Size: Mask.size(), PartNumElems: EltsPerVector, Part));
8538 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8539 copy(Range&: MaskSlice, Out: SubMask.begin());
8540 SmallVector<unsigned, 2> Indices;
8541 std::optional<TTI::ShuffleKind> RegShuffleKind =
8542 CheckPerRegistersShuffle(SubMask, Indices);
8543 if (!RegShuffleKind) {
8544 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
8545 !ShuffleVectorInst::isIdentityMask(
8546 Mask: MaskSlice, NumSrcElts: std::max<unsigned>(a: NumElts, b: MaskSlice.size())))
8547 Cost +=
8548 ::getShuffleCost(TTI, Kind: *ShuffleKinds[Part],
8549 Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: MaskSlice);
8550 continue;
8551 }
8552 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8553 !ShuffleVectorInst::isIdentityMask(Mask: SubMask, NumSrcElts: EltsPerVector)) {
8554 Cost +=
8555 ::getShuffleCost(TTI, Kind: *RegShuffleKind,
8556 Tp: getWidenedType(ScalarTy, VF: EltsPerVector), Mask: SubMask);
8557 }
8558 for (unsigned Idx : Indices) {
8559 assert((Idx + EltsPerVector) <= alignTo(NumElts, EltsPerVector) &&
8560 "SK_ExtractSubvector index out of range");
8561 Cost += ::getShuffleCost(
8562 TTI, Kind: TTI::SK_ExtractSubvector,
8563 Tp: getWidenedType(ScalarTy, VF: alignTo(Value: NumElts, Align: EltsPerVector)),
8564 Mask: std::nullopt, CostKind, Index: Idx,
8565 SubTp: getWidenedType(ScalarTy, VF: EltsPerVector));
8566 }
// Second attempt: check whether a single permute has a lower estimated cost
// than the subvector extracts.
8569 SubMask.assign(NumElts, Elt: PoisonMaskElem);
8570 copy(Range&: MaskSlice, Out: SubMask.begin());
8571 InstructionCost OriginalCost = ::getShuffleCost(
8572 TTI, Kind: *ShuffleKinds[Part], Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: SubMask);
8573 if (OriginalCost < Cost)
8574 Cost = OriginalCost;
8575 }
8576 return Cost;
8577 }
/// Transforms the mask \p CommonMask according to the given \p Mask so that it
/// is properly set up after shuffle emission.
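/// E.g., once a shuffle with mask <3, 0, poison, 2> has been accounted for,
/// the common mask becomes <0, 1, poison, 3>: the freshly produced vector is
/// addressed by element position rather than by the original indices.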
8580 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8581 ArrayRef<int> Mask) {
8582 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8583 if (Mask[Idx] != PoisonMaskElem)
8584 CommonMask[Idx] = Idx;
8585 }
/// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the given
/// mask \p Mask for register number \p Part, which contains \p SliceSize
/// elements.
8589 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8590 ArrayRef<int> Mask, unsigned Part,
8591 unsigned SliceSize) {
8592 if (SameNodesEstimated) {
// Delay the cost estimation if the same nodes are being reshuffled.
// If we have already requested the cost of reshuffling E1 and E2 before, there
// is no need to estimate another cost with the sub-Mask; instead, include this
// sub-Mask into the CommonMask to estimate it later and avoid double cost
// estimation.
8598 if ((InVectors.size() == 2 &&
8599 InVectors.front().get<const TreeEntry *>() == &E1 &&
8600 InVectors.back().get<const TreeEntry *>() == E2) ||
8601 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8602 unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part);
8603 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
8604 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8605 "Expected all poisoned elements.");
8606 ArrayRef<int> SubMask = ArrayRef(Mask).slice(N: Part * SliceSize, M: Limit);
8607 copy(Range&: SubMask, Out: std::next(x: CommonMask.begin(), n: SliceSize * Part));
8608 return;
8609 }
// Found non-matching nodes - estimate the cost for the nodes matched so far
// and transform the mask.
8612 Cost += createShuffle(P1: InVectors.front(),
8613 P2: InVectors.size() == 1 ? nullptr : InVectors.back(),
8614 Mask: CommonMask);
8615 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8616 }
8617 SameNodesEstimated = false;
8618 if (!E2 && InVectors.size() == 1) {
8619 unsigned VF = E1.getVectorFactor();
8620 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8621 VF = std::max(a: VF,
8622 b: cast<FixedVectorType>(Val: V1->getType())->getNumElements());
8623 } else {
8624 const auto *E = InVectors.front().get<const TreeEntry *>();
8625 VF = std::max(a: VF, b: E->getVectorFactor());
8626 }
8627 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8628 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8629 CommonMask[Idx] = Mask[Idx] + VF;
8630 Cost += createShuffle(P1: InVectors.front(), P2: &E1, Mask: CommonMask);
8631 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8632 } else {
8633 Cost += createShuffle(P1: &E1, P2: E2, Mask);
8634 transformMaskAfterShuffle(CommonMask, Mask);
8635 }
8636 }
8637
8638 class ShuffleCostBuilder {
8639 const TargetTransformInfo &TTI;
8640
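// Shuffles that do not require a real instruction are treated as free: an
// empty mask, a full-width identity mask such as <0, 1, 2, 3> on a 4-element
// source, or an extract of the leading subvector (index 0).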
8641 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8642 int Index = -1;
8643 return Mask.empty() ||
8644 (VF == Mask.size() &&
8645 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) ||
8646 (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
8647 Index == 0);
8648 }
8649
8650 public:
8651 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8652 ~ShuffleCostBuilder() = default;
8653 InstructionCost createShuffleVector(Value *V1, Value *,
8654 ArrayRef<int> Mask) const {
8655 // Empty mask or identity mask are free.
8656 unsigned VF =
8657 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
8658 if (isEmptyOrIdentity(Mask, VF))
8659 return TTI::TCC_Free;
8660 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
8661 Tp: cast<VectorType>(Val: V1->getType()), Mask);
8662 }
8663 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8664 // Empty mask or identity mask are free.
8665 unsigned VF =
8666 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
8667 if (isEmptyOrIdentity(Mask, VF))
8668 return TTI::TCC_Free;
8669 return TTI.getShuffleCost(Kind: TTI::SK_PermuteSingleSrc,
8670 Tp: cast<VectorType>(Val: V1->getType()), Mask);
8671 }
8672 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8673 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8674 return TTI::TCC_Free;
8675 }
8676 void resizeToMatch(Value *&, Value *&) const {}
8677 };
8678
/// Smart shuffle instruction emission: walks through the shuffle trees and
/// tries to find the best matching vector for the actual shuffle instruction.
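/// Note that no real vectors are needed for cost estimation: tree entries are
/// materialized as placeholder constants (a null vector and an all-ones vector
/// of the common width) so that ShuffleCostBuilder can query TTI for the
/// shuffle cost without emitting any IR.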
8682 InstructionCost
8683 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8684 const PointerUnion<Value *, const TreeEntry *> &P2,
8685 ArrayRef<int> Mask) {
8686 ShuffleCostBuilder Builder(TTI);
8687 SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8688 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8689 unsigned CommonVF = Mask.size();
8690 InstructionCost ExtraCost = 0;
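// The two lambdas below account for minimum-bitwidth adjustments: if a node
// (or an external vector value) has a scalar type that differs from the common
// ScalarTy, the cost of casting the whole vector (trunc or sext/zext) is added
// on top of the shuffle cost.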
8691 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
8692 unsigned VF) -> InstructionCost {
8693 if (E.isGather() && allConstant(VL: E.Scalars))
8694 return TTI::TCC_Free;
8695 Type *EScalarTy = E.Scalars.front()->getType();
8696 bool IsSigned = true;
8697 if (auto It = R.MinBWs.find(Val: &E); It != R.MinBWs.end()) {
8698 EScalarTy = IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
8699 IsSigned = It->second.second;
8700 }
8701 if (EScalarTy != ScalarTy) {
8702 unsigned CastOpcode = Instruction::Trunc;
8703 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
8704 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
8705 if (DstSz > SrcSz)
8706 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8707 return TTI.getCastInstrCost(Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF),
8708 Src: getWidenedType(ScalarTy: EScalarTy, VF),
8709 CCH: TTI::CastContextHint::None, CostKind);
8710 }
8711 return TTI::TCC_Free;
8712 };
8713 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
8714 if (isa<Constant>(Val: V))
8715 return TTI::TCC_Free;
8716 auto *VecTy = cast<VectorType>(Val: V->getType());
8717 Type *EScalarTy = VecTy->getElementType();
8718 if (EScalarTy != ScalarTy) {
8719 bool IsSigned = !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL));
8720 unsigned CastOpcode = Instruction::Trunc;
8721 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
8722 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
8723 if (DstSz > SrcSz)
8724 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8725 return TTI.getCastInstrCost(
8726 Opcode: CastOpcode, Dst: VectorType::get(ElementType: ScalarTy, EC: VecTy->getElementCount()),
8727 Src: VecTy, CCH: TTI::CastContextHint::None, CostKind);
8728 }
8729 return TTI::TCC_Free;
8730 };
8731 if (!V1 && !V2 && !P2.isNull()) {
8732 // Shuffle 2 entry nodes.
8733 const TreeEntry *E = P1.get<const TreeEntry *>();
8734 unsigned VF = E->getVectorFactor();
8735 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8736 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
8737 assert(all_of(Mask,
8738 [=](int Idx) {
8739 return Idx < 2 * static_cast<int>(CommonVF);
8740 }) &&
8741 "All elements in mask must be less than 2 * CommonVF.");
8742 if (E->Scalars.size() == E2->Scalars.size()) {
8743 SmallVector<int> EMask = E->getCommonMask();
8744 SmallVector<int> E2Mask = E2->getCommonMask();
8745 if (!EMask.empty() || !E2Mask.empty()) {
8746 for (int &Idx : CommonMask) {
8747 if (Idx == PoisonMaskElem)
8748 continue;
8749 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8750 Idx = EMask[Idx];
8751 else if (Idx >= static_cast<int>(CommonVF))
8752 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8753 E->Scalars.size();
8754 }
8755 }
8756 CommonVF = E->Scalars.size();
8757 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8758 GetNodeMinBWAffectedCost(*E2, CommonVF);
8759 } else {
8760 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8761 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8762 }
8763 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
8764 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
8765 } else if (!V1 && P2.isNull()) {
8766 // Shuffle single entry node.
8767 const TreeEntry *E = P1.get<const TreeEntry *>();
8768 unsigned VF = E->getVectorFactor();
8769 CommonVF = VF;
8770 assert(
8771 all_of(Mask,
8772 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8773 "All elements in mask must be less than CommonVF.");
8774 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8775 SmallVector<int> EMask = E->getCommonMask();
8776 assert(!EMask.empty() && "Expected non-empty common mask.");
8777 for (int &Idx : CommonMask) {
8778 if (Idx != PoisonMaskElem)
8779 Idx = EMask[Idx];
8780 }
8781 CommonVF = E->Scalars.size();
8782 }
8783 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8784 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
8785 // Not identity/broadcast? Try to see if the original vector is better.
8786 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8787 CommonVF == CommonMask.size() &&
8788 any_of(Range: enumerate(First&: CommonMask),
8789 P: [](const auto &&P) {
8790 return P.value() != PoisonMaskElem &&
8791 static_cast<unsigned>(P.value()) != P.index();
8792 }) &&
8793 any_of(Range&: CommonMask,
8794 P: [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8795 SmallVector<int> ReorderMask;
8796 inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
8797 ::addMask(Mask&: CommonMask, SubMask: ReorderMask);
8798 }
8799 } else if (V1 && P2.isNull()) {
8800 // Shuffle single vector.
8801 ExtraCost += GetValueMinBWAffectedCost(V1);
8802 CommonVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8803 assert(
8804 all_of(Mask,
8805 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8806 "All elements in mask must be less than CommonVF.");
8807 } else if (V1 && !V2) {
8808 // Shuffle vector and tree node.
8809 unsigned VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8810 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8811 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
8812 assert(all_of(Mask,
8813 [=](int Idx) {
8814 return Idx < 2 * static_cast<int>(CommonVF);
8815 }) &&
8816 "All elements in mask must be less than 2 * CommonVF.");
8817 if (E2->Scalars.size() == VF && VF != CommonVF) {
8818 SmallVector<int> E2Mask = E2->getCommonMask();
8819 assert(!E2Mask.empty() && "Expected non-empty common mask.");
8820 for (int &Idx : CommonMask) {
8821 if (Idx == PoisonMaskElem)
8822 continue;
8823 if (Idx >= static_cast<int>(CommonVF))
8824 Idx = E2Mask[Idx - CommonVF] + VF;
8825 }
8826 CommonVF = VF;
8827 }
8828 ExtraCost += GetValueMinBWAffectedCost(V1);
8829 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
8830 ExtraCost += GetNodeMinBWAffectedCost(
8831 *E2, std::min(a: CommonVF, b: E2->getVectorFactor()));
8832 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
8833 } else if (!V1 && V2) {
8834 // Shuffle vector and tree node.
8835 unsigned VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
8836 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8837 CommonVF = std::max(a: VF, b: E1->getVectorFactor());
8838 assert(all_of(Mask,
8839 [=](int Idx) {
8840 return Idx < 2 * static_cast<int>(CommonVF);
8841 }) &&
8842 "All elements in mask must be less than 2 * CommonVF.");
8843 if (E1->Scalars.size() == VF && VF != CommonVF) {
8844 SmallVector<int> E1Mask = E1->getCommonMask();
8845 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8846 for (int &Idx : CommonMask) {
8847 if (Idx == PoisonMaskElem)
8848 continue;
8849 if (Idx >= static_cast<int>(CommonVF))
8850 Idx = E1Mask[Idx - CommonVF] + VF;
8851 else
8852 Idx = E1Mask[Idx];
8853 }
8854 CommonVF = VF;
8855 }
8856 ExtraCost += GetNodeMinBWAffectedCost(
8857 *E1, std::min(a: CommonVF, b: E1->getVectorFactor()));
8858 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
8859 ExtraCost += GetValueMinBWAffectedCost(V2);
8860 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
8861 } else {
8862 assert(V1 && V2 && "Expected both vectors.");
8863 unsigned VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
8864 CommonVF =
8865 std::max(a: VF, b: cast<FixedVectorType>(Val: V2->getType())->getNumElements());
8866 assert(all_of(Mask,
8867 [=](int Idx) {
8868 return Idx < 2 * static_cast<int>(CommonVF);
8869 }) &&
8870 "All elements in mask must be less than 2 * CommonVF.");
8871 ExtraCost +=
8872 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8873 if (V1->getType() != V2->getType()) {
8874 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
8875 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
8876 } else {
8877 if (cast<VectorType>(Val: V1->getType())->getElementType() != ScalarTy)
8878 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
8879 if (cast<VectorType>(Val: V2->getType())->getElementType() != ScalarTy)
8880 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
8881 }
8882 }
8883 InVectors.front() =
8884 Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
8885 if (InVectors.size() == 2)
8886 InVectors.pop_back();
8887 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8888 V1, V2, Mask: CommonMask, Builder);
8889 }
8890
8891public:
8892 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
8893 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8894 SmallPtrSetImpl<Value *> &CheckedExtracts)
8895 : ScalarTy(ScalarTy), TTI(TTI),
8896 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8897 CheckedExtracts(CheckedExtracts) {}
8898 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8899 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8900 unsigned NumParts, bool &UseVecBaseAsInput) {
8901 UseVecBaseAsInput = false;
8902 if (Mask.empty())
8903 return nullptr;
8904 Value *VecBase = nullptr;
8905 ArrayRef<Value *> VL = E->Scalars;
8906 // If the resulting type is scalarized, do not adjust the cost.
8907 if (NumParts == VL.size())
8908 return nullptr;
// Check if the extracts can be considered reused, i.e. if the same
// extractelements were already handled in a previously built tree entry.
8911 bool PrevNodeFound = any_of(
8912 Range: ArrayRef(R.VectorizableTree).take_front(N: E->Idx),
8913 P: [&](const std::unique_ptr<TreeEntry> &TE) {
8914 return ((!TE->isAltShuffle() &&
8915 TE->getOpcode() == Instruction::ExtractElement) ||
8916 TE->isGather()) &&
8917 all_of(Range: enumerate(First&: TE->Scalars), P: [&](auto &&Data) {
8918 return VL.size() > Data.index() &&
8919 (Mask[Data.index()] == PoisonMaskElem ||
8920 isa<UndefValue>(VL[Data.index()]) ||
8921 Data.value() == VL[Data.index()]);
8922 });
8923 });
8924 SmallPtrSet<Value *, 4> UniqueBases;
8925 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
8926 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
8927 unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
8928 ArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
8929 for (auto [I, V] : enumerate(First: VL.slice(N: Part * SliceSize, M: Limit))) {
8930 // Ignore non-extractelement scalars.
8931 if (isa<UndefValue>(Val: V) ||
8932 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8933 continue;
8934 // If all users of instruction are going to be vectorized and this
8935 // instruction itself is not going to be vectorized, consider this
8936 // instruction as dead and remove its cost from the final cost of the
8937 // vectorized tree.
8938 // Also, avoid adjusting the cost for extractelements with multiple uses
8939 // in different graph entries.
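// E.g., if %e = extractelement <4 x i32> %v, i32 1 is used only by
// instructions that are part of this tree, %e becomes dead after
// vectorization, so its scalar extraction cost is subtracted below.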
8940 auto *EE = cast<ExtractElementInst>(Val: V);
8941 VecBase = EE->getVectorOperand();
8942 UniqueBases.insert(Ptr: VecBase);
8943 const TreeEntry *VE = R.getTreeEntry(V);
8944 if (!CheckedExtracts.insert(Ptr: V).second ||
8945 !R.areAllUsersVectorized(I: cast<Instruction>(Val: V), VectorizedVals: &VectorizedVals) ||
8946 any_of(Range: EE->users(),
8947 P: [&](User *U) {
8948 return isa<GetElementPtrInst>(Val: U) &&
8949 !R.areAllUsersVectorized(I: cast<Instruction>(Val: U),
8950 VectorizedVals: &VectorizedVals);
8951 }) ||
8952 (VE && VE != E))
8953 continue;
8954 std::optional<unsigned> EEIdx = getExtractIndex(E: EE);
8955 if (!EEIdx)
8956 continue;
8957 unsigned Idx = *EEIdx;
8958 // Take credit for instruction that will become dead.
8959 if (EE->hasOneUse() || !PrevNodeFound) {
8960 Instruction *Ext = EE->user_back();
8961 if (isa<SExtInst, ZExtInst>(Val: Ext) &&
8962 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
8963 // Use getExtractWithExtendCost() to calculate the cost of
8964 // extractelement/ext pair.
8965 Cost -=
8966 TTI.getExtractWithExtendCost(Opcode: Ext->getOpcode(), Dst: Ext->getType(),
8967 VecTy: EE->getVectorOperandType(), Index: Idx);
8968 // Add back the cost of s|zext which is subtracted separately.
8969 Cost += TTI.getCastInstrCost(
8970 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: EE->getType(),
8971 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
8972 continue;
8973 }
8974 }
8975 Cost -= TTI.getVectorInstrCost(I: *EE, Val: EE->getVectorOperandType(),
8976 CostKind, Index: Idx);
8977 }
8978 }
// Check that the gather of extractelements can be represented as just a
// shuffle of one or two vectors from which the scalars are extracted.
// We have found a bunch of extractelement instructions that must be gathered
// into a vector and that can be represented as a permutation of the elements
// of one or two input vectors.
// The extract cost is skipped for reused nodes, i.e. if the same
// extractelements were vectorized already.
8985 if (!PrevNodeFound)
8986 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8987 InVectors.assign(NumElts: 1, Elt: E);
8988 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
8989 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
8990 SameNodesEstimated = false;
8991 if (NumParts != 1 && UniqueBases.size() != 1) {
8992 UseVecBaseAsInput = true;
8993 VecBase =
8994 Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
8995 }
8996 return VecBase;
8997 }
8998 /// Checks if the specified entry \p E needs to be delayed because of its
8999 /// dependency nodes.
9000 std::optional<InstructionCost>
9001 needToDelay(const TreeEntry *,
9002 ArrayRef<SmallVector<const TreeEntry *>>) const {
9003 // No need to delay the cost estimation during analysis.
9004 return std::nullopt;
9005 }
9006 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
9007 if (&E1 == &E2) {
9008 assert(all_of(Mask,
9009 [&](int Idx) {
9010 return Idx < static_cast<int>(E1.getVectorFactor());
9011 }) &&
9012 "Expected single vector shuffle mask.");
9013 add(E1, Mask);
9014 return;
9015 }
9016 if (InVectors.empty()) {
9017 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
9018 InVectors.assign(IL: {&E1, &E2});
9019 return;
9020 }
9021 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9022 auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
9023 unsigned NumParts = TTI.getNumberOfParts(Tp: MaskVecTy);
9024 if (NumParts == 0 || NumParts >= Mask.size())
9025 NumParts = 1;
9026 unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
9027 const auto *It =
9028 find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; });
9029 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
9030 estimateNodesPermuteCost(E1, E2: &E2, Mask, Part, SliceSize);
9031 }
9032 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
9033 if (InVectors.empty()) {
9034 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
9035 InVectors.assign(NumElts: 1, Elt: &E1);
9036 return;
9037 }
9038 assert(!CommonMask.empty() && "Expected non-empty common mask.");
9039 auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
9040 unsigned NumParts = TTI.getNumberOfParts(Tp: MaskVecTy);
9041 if (NumParts == 0 || NumParts >= Mask.size())
9042 NumParts = 1;
9043 unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
9044 const auto *It =
9045 find_if(Range&: Mask, P: [](int Idx) { return Idx != PoisonMaskElem; });
9046 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
9047 estimateNodesPermuteCost(E1, E2: nullptr, Mask, Part, SliceSize);
9048 if (!SameNodesEstimated && InVectors.size() == 1)
9049 InVectors.emplace_back(Args: &E1);
9050 }
9051 /// Adds 2 input vectors and the mask for their shuffling.
9052 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
// This may only occur when shuffling two vectors of extractelements, which is
// already handled in adjustExtracts.
9055 assert(InVectors.size() == 1 &&
9056 all_of(enumerate(CommonMask),
9057 [&](auto P) {
9058 if (P.value() == PoisonMaskElem)
9059 return Mask[P.index()] == PoisonMaskElem;
9060 auto *EI =
9061 cast<ExtractElementInst>(InVectors.front()
9062 .get<const TreeEntry *>()
9063 ->Scalars[P.index()]);
9064 return EI->getVectorOperand() == V1 ||
9065 EI->getVectorOperand() == V2;
9066 }) &&
9067 "Expected extractelement vectors.");
9068 }
/// Adds one more input vector and the mask for the shuffling.
9070 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
9071 if (InVectors.empty()) {
9072 assert(CommonMask.empty() && !ForExtracts &&
9073 "Expected empty input mask/vectors.");
9074 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
9075 InVectors.assign(NumElts: 1, Elt: V1);
9076 return;
9077 }
9078 if (ForExtracts) {
9079 // No need to add vectors here, already handled them in adjustExtracts.
9080 assert(InVectors.size() == 1 &&
9081 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
9082 all_of(enumerate(CommonMask),
9083 [&](auto P) {
9084 Value *Scalar = InVectors.front()
9085 .get<const TreeEntry *>()
9086 ->Scalars[P.index()];
9087 if (P.value() == PoisonMaskElem)
9088 return P.value() == Mask[P.index()] ||
9089 isa<UndefValue>(Scalar);
9090 if (isa<Constant>(V1))
9091 return true;
9092 auto *EI = cast<ExtractElementInst>(Scalar);
9093 return EI->getVectorOperand() == V1;
9094 }) &&
9095 "Expected only tree entry for extractelement vectors.");
9096 return;
9097 }
9098 assert(!InVectors.empty() && !CommonMask.empty() &&
9099 "Expected only tree entries from extracts/reused buildvectors.");
9100 unsigned VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
9101 if (InVectors.size() == 2) {
9102 Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
9103 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
9104 VF = std::max<unsigned>(a: VF, b: CommonMask.size());
9105 } else if (const auto *InTE =
9106 InVectors.front().dyn_cast<const TreeEntry *>()) {
9107 VF = std::max(a: VF, b: InTE->getVectorFactor());
9108 } else {
9109 VF = std::max(
9110 a: VF, b: cast<FixedVectorType>(Val: InVectors.front().get<Value *>()->getType())
9111 ->getNumElements());
9112 }
9113 InVectors.push_back(Elt: V1);
9114 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9115 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
9116 CommonMask[Idx] = Mask[Idx] + VF;
9117 }
9118 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
9119 Value *Root = nullptr) {
9120 Cost += getBuildVectorCost(VL, Root);
9121 if (!Root) {
9122 // FIXME: Need to find a way to avoid use of getNullValue here.
9123 SmallVector<Constant *> Vals;
9124 unsigned VF = VL.size();
9125 if (MaskVF != 0)
9126 VF = std::min(a: VF, b: MaskVF);
9127 for (Value *V : VL.take_front(N: VF)) {
9128 if (isa<UndefValue>(Val: V)) {
9129 Vals.push_back(Elt: cast<Constant>(Val: V));
9130 continue;
9131 }
9132 Vals.push_back(Elt: Constant::getNullValue(Ty: V->getType()));
9133 }
9134 return ConstantVector::get(V: Vals);
9135 }
9136 return ConstantVector::getSplat(
9137 EC: ElementCount::getFixed(
9138 MinVal: cast<FixedVectorType>(Val: Root->getType())->getNumElements()),
9139 Elt: getAllOnesValue(DL: *R.DL, Ty: ScalarTy));
9140 }
9141 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
9142 /// Finalize emission of the shuffles.
9143 InstructionCost
9144 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
9145 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
9146 IsFinalized = true;
9147 if (Action) {
9148 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
9149 if (InVectors.size() == 2)
9150 Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
9151 else
9152 Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
9153 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
9154 if (CommonMask[Idx] != PoisonMaskElem)
9155 CommonMask[Idx] = Idx;
9156 assert(VF > 0 &&
9157 "Expected vector length for the final value before action.");
9158 Value *V = Vec.get<Value *>();
9159 Action(V, CommonMask);
9160 InVectors.front() = V;
9161 }
9162 ::addMask(Mask&: CommonMask, SubMask: ExtMask, /*ExtendingManyInputs=*/true);
9163 if (CommonMask.empty()) {
9164 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
9165 return Cost;
9166 }
9167 return Cost +
9168 createShuffle(P1: InVectors.front(),
9169 P2: InVectors.size() == 2 ? InVectors.back() : nullptr,
9170 Mask: CommonMask);
9171 }
9172
9173 ~ShuffleCostEstimator() {
9174 assert((IsFinalized || CommonMask.empty()) &&
9175 "Shuffle construction must be finalized.");
9176 }
9177};
9178
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  Value *Op = E->getOperand(Idx).front();
  if (const TreeEntry *TE = getTreeEntry(Op)) {
    if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.EdgeIdx == Idx && EI.UserTE == E;
        }) != TE->UserTreeIndices.end())
      return TE;
    auto MIt = MultiNodeScalars.find(Op);
    if (MIt != MultiNodeScalars.end()) {
      for (const TreeEntry *TE : MIt->second) {
        if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
              return EI.EdgeIdx == Idx && EI.UserTE == E;
            }) != TE->UserTreeIndices.end())
          return TE;
      }
    }
  }
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}
9207
TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
      !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
    inversePermutation(TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
9223
/// Builds the vector of argument types for the given call instruction with the
/// given intrinsic \p ID and the specified vector factor \p VF.
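/// E.g., for a call to llvm.smax.i32 with VF == 4 and MinBW == 0, both
/// argument types become <4 x i32>; operands that must stay scalar for the
/// vector intrinsic (per isVectorIntrinsicWithScalarOpAtArg, e.g. the integer
/// exponent of llvm.powi) keep their original scalar type.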
static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
                                                  const Intrinsic::ID ID,
                                                  const unsigned VF,
                                                  unsigned MinBW) {
  SmallVector<Type *> ArgTys;
  for (auto [Idx, Arg] : enumerate(CI->args())) {
    if (ID != Intrinsic::not_intrinsic) {
      if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
        ArgTys.push_back(Arg->getType());
        continue;
      }
      if (MinBW > 0) {
        ArgTys.push_back(
            getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
        continue;
      }
    }
    ArgTys.push_back(getWidenedType(Arg->getType(), VF));
  }
  return ArgTys;
}
9247
9248InstructionCost
9249BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9250 SmallPtrSetImpl<Value *> &CheckedExtracts) {
9251 ArrayRef<Value *> VL = E->Scalars;
9252
9253 Type *ScalarTy = VL[0]->getType();
9254 if (!E->isGather()) {
9255 if (auto *SI = dyn_cast<StoreInst>(Val: VL[0]))
9256 ScalarTy = SI->getValueOperand()->getType();
9257 else if (auto *CI = dyn_cast<CmpInst>(Val: VL[0]))
9258 ScalarTy = CI->getOperand(i_nocapture: 0)->getType();
9259 else if (auto *IE = dyn_cast<InsertElementInst>(Val: VL[0]))
9260 ScalarTy = IE->getOperand(i_nocapture: 1)->getType();
9261 }
9262 if (!isValidElementType(Ty: ScalarTy))
9263 return InstructionCost::getInvalid();
9264 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9265
9266 // If we have computed a smaller type for the expression, update VecTy so
9267 // that the costs will be accurate.
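// E.g., if the expression was proven to need only 16 bits, a node of i32
// scalars is costed with type <VF x i16> instead of <VF x i32>.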
9268 auto It = MinBWs.find(Val: E);
9269 Type *OrigScalarTy = ScalarTy;
9270 if (It != MinBWs.end())
9271 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
9272 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
9273 unsigned EntryVF = E->getVectorFactor();
9274 auto *FinalVecTy = getWidenedType(ScalarTy, VF: EntryVF);
9275
9276 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
9277 if (E->isGather()) {
9278 if (allConstant(VL))
9279 return 0;
9280 if (isa<InsertElementInst>(Val: VL[0]))
9281 return InstructionCost::getInvalid();
9282 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
9283 E, ScalarTy, Params&: *TTI, Params&: VectorizedVals, Params&: *this, Params&: CheckedExtracts);
9284 }
9285 InstructionCost CommonCost = 0;
9286 SmallVector<int> Mask;
9287 bool IsReverseOrder = isReverseOrder(Order: E->ReorderIndices);
9288 if (!E->ReorderIndices.empty() &&
9289 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
9290 SmallVector<int> NewMask;
9291 if (E->getOpcode() == Instruction::Store) {
9292 // For stores the order is actually a mask.
9293 NewMask.resize(N: E->ReorderIndices.size());
9294 copy(Range: E->ReorderIndices, Out: NewMask.begin());
9295 } else {
9296 inversePermutation(Indices: E->ReorderIndices, Mask&: NewMask);
9297 }
9298 ::addMask(Mask, SubMask: NewMask);
9299 }
9300 if (NeedToShuffleReuses)
9301 ::addMask(Mask, SubMask: E->ReuseShuffleIndices);
9302 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
9303 CommonCost =
9304 TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
9305 assert((E->State == TreeEntry::Vectorize ||
9306 E->State == TreeEntry::ScatterVectorize ||
9307 E->State == TreeEntry::StridedVectorize) &&
9308 "Unhandled state");
9309 assert(E->getOpcode() &&
9310 ((allSameType(VL) && allSameBlock(VL)) ||
9311 (E->getOpcode() == Instruction::GetElementPtr &&
9312 E->getMainOp()->getType()->isPointerTy())) &&
9313 "Invalid VL");
9314 Instruction *VL0 = E->getMainOp();
9315 unsigned ShuffleOrOp =
9316 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
9317 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
9318 const unsigned Sz = UniqueValues.size();
9319 SmallBitVector UsedScalars(Sz, false);
9320 for (unsigned I = 0; I < Sz; ++I) {
9321 if (getTreeEntry(V: UniqueValues[I]) == E)
9322 continue;
9323 UsedScalars.set(I);
9324 }
9325 auto GetCastContextHint = [&](Value *V) {
9326 if (const TreeEntry *OpTE = getTreeEntry(V))
9327 return getCastContextHint(TE: *OpTE);
9328 InstructionsState SrcState = getSameOpcode(VL: E->getOperand(OpIdx: 0), TLI: *TLI);
9329 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
9330 return TTI::CastContextHint::GatherScatter;
9331 return TTI::CastContextHint::None;
9332 };
9333 auto GetCostDiff =
9334 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
9335 function_ref<InstructionCost(InstructionCost)> VectorCost) {
9336 // Calculate the cost of this instruction.
9337 InstructionCost ScalarCost = 0;
9338 if (isa<CastInst, CallInst>(Val: VL0)) {
// For some of the instructions there is no need to calculate the cost for each
// particular instruction; we can use the cost of a single instruction
// multiplied by the total number of scalar instructions.
9342 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
9343 } else {
9344 for (unsigned I = 0; I < Sz; ++I) {
9345 if (UsedScalars.test(Idx: I))
9346 continue;
9347 ScalarCost += ScalarEltCost(I);
9348 }
9349 }
9350
9351 InstructionCost VecCost = VectorCost(CommonCost);
9352 // Check if the current node must be resized, if the parent node is not
9353 // resized.
9354 if (!UnaryInstruction::isCast(Opcode: E->getOpcode()) && E->Idx != 0) {
9355 const EdgeInfo &EI = E->UserTreeIndices.front();
9356 if ((EI.UserTE->getOpcode() != Instruction::Select ||
9357 EI.EdgeIdx != 0) &&
9358 It != MinBWs.end()) {
9359 auto UserBWIt = MinBWs.find(Val: EI.UserTE);
9360 Type *UserScalarTy =
9361 EI.UserTE->getOperand(OpIdx: EI.EdgeIdx).front()->getType();
9362 if (UserBWIt != MinBWs.end())
9363 UserScalarTy = IntegerType::get(C&: ScalarTy->getContext(),
9364 NumBits: UserBWIt->second.first);
9365 if (ScalarTy != UserScalarTy) {
9366 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
9367 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: UserScalarTy);
9368 unsigned VecOpcode;
9369 auto *UserVecTy =
9370 getWidenedType(ScalarTy: UserScalarTy, VF: E->getVectorFactor());
9371 if (BWSz > SrcBWSz)
9372 VecOpcode = Instruction::Trunc;
9373 else
9374 VecOpcode =
9375 It->second.second ? Instruction::SExt : Instruction::ZExt;
9376 TTI::CastContextHint CCH = GetCastContextHint(VL0);
9377 VecCost += TTI->getCastInstrCost(Opcode: VecOpcode, Dst: UserVecTy, Src: VecTy, CCH,
9378 CostKind);
9379 }
9380 }
9381 }
9382 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
9383 ScalarCost, "Calculated costs for Tree"));
9384 return VecCost - ScalarCost;
9385 };
9386 // Calculate cost difference from vectorizing set of GEPs.
9387 // Negative value means vectorizing is profitable.
9388 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
9389 assert((E->State == TreeEntry::Vectorize ||
9390 E->State == TreeEntry::StridedVectorize) &&
9391 "Entry state expected to be Vectorize or StridedVectorize here.");
9392 InstructionCost ScalarCost = 0;
9393 InstructionCost VecCost = 0;
9394 std::tie(args&: ScalarCost, args&: VecCost) = getGEPCosts(
9395 TTI: *TTI, Ptrs, BasePtr, Opcode: E->getOpcode(), CostKind, ScalarTy: OrigScalarTy, VecTy);
9396 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9397 "Calculated GEPs cost for Tree"));
9398
9399 return VecCost - ScalarCost;
9400 };
9401
9402 switch (ShuffleOrOp) {
9403 case Instruction::PHI: {
9404 // Count reused scalars.
9405 InstructionCost ScalarCost = 0;
9406 SmallPtrSet<const TreeEntry *, 4> CountedOps;
9407 for (Value *V : UniqueValues) {
9408 auto *PHI = dyn_cast<PHINode>(Val: V);
9409 if (!PHI)
9410 continue;
9411
9412 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
9413 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
9414 Value *Op = PHI->getIncomingValue(i: I);
9415 Operands[I] = Op;
9416 }
9417 if (const TreeEntry *OpTE = getTreeEntry(V: Operands.front()))
9418 if (OpTE->isSame(VL: Operands) && CountedOps.insert(Ptr: OpTE).second)
9419 if (!OpTE->ReuseShuffleIndices.empty())
9420 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9421 OpTE->Scalars.size());
9422 }
9423
9424 return CommonCost - ScalarCost;
9425 }
9426 case Instruction::ExtractValue:
9427 case Instruction::ExtractElement: {
9428 auto GetScalarCost = [&](unsigned Idx) {
9429 auto *I = cast<Instruction>(Val: UniqueValues[Idx]);
9430 VectorType *SrcVecTy;
9431 if (ShuffleOrOp == Instruction::ExtractElement) {
9432 auto *EE = cast<ExtractElementInst>(Val: I);
9433 SrcVecTy = EE->getVectorOperandType();
9434 } else {
9435 auto *EV = cast<ExtractValueInst>(Val: I);
9436 Type *AggregateTy = EV->getAggregateOperand()->getType();
9437 unsigned NumElts;
9438 if (auto *ATy = dyn_cast<ArrayType>(Val: AggregateTy))
9439 NumElts = ATy->getNumElements();
9440 else
9441 NumElts = AggregateTy->getStructNumElements();
9442 SrcVecTy = getWidenedType(ScalarTy: OrigScalarTy, VF: NumElts);
9443 }
9444 if (I->hasOneUse()) {
9445 Instruction *Ext = I->user_back();
9446 if ((isa<SExtInst>(Val: Ext) || isa<ZExtInst>(Val: Ext)) &&
9447 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
9448 // Use getExtractWithExtendCost() to calculate the cost of
9449 // extractelement/ext pair.
9450 InstructionCost Cost = TTI->getExtractWithExtendCost(
9451 Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: SrcVecTy, Index: *getExtractIndex(E: I));
9452 // Subtract the cost of s|zext which is subtracted separately.
9453 Cost -= TTI->getCastInstrCost(
9454 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: I->getType(),
9455 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
9456 return Cost;
9457 }
9458 }
9459 return TTI->getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: SrcVecTy,
9460 CostKind, Index: *getExtractIndex(E: I));
9461 };
9462 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9463 return GetCostDiff(GetScalarCost, GetVectorCost);
9464 }
9465 case Instruction::InsertElement: {
9466 assert(E->ReuseShuffleIndices.empty() &&
9467 "Unique insertelements only are expected.");
9468 auto *SrcVecTy = cast<FixedVectorType>(Val: VL0->getType());
9469 unsigned const NumElts = SrcVecTy->getNumElements();
9470 unsigned const NumScalars = VL.size();
9471
9472 unsigned NumOfParts = TTI->getNumberOfParts(Tp: SrcVecTy);
9473
9474 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9475 unsigned OffsetBeg = *getElementIndex(Inst: VL.front());
9476 unsigned OffsetEnd = OffsetBeg;
9477 InsertMask[OffsetBeg] = 0;
9478 for (auto [I, V] : enumerate(First: VL.drop_front())) {
9479 unsigned Idx = *getElementIndex(Inst: V);
9480 if (OffsetBeg > Idx)
9481 OffsetBeg = Idx;
9482 else if (OffsetEnd < Idx)
9483 OffsetEnd = Idx;
9484 InsertMask[Idx] = I + 1;
9485 }
9486 unsigned VecScalarsSz = PowerOf2Ceil(A: NumElts);
9487 if (NumOfParts > 0)
9488 VecScalarsSz = PowerOf2Ceil(A: (NumElts + NumOfParts - 1) / NumOfParts);
9489 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9490 VecScalarsSz;
9491 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9492 unsigned InsertVecSz = std::min<unsigned>(
9493 a: PowerOf2Ceil(A: OffsetEnd - OffsetBeg + 1),
9494 b: ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9495 bool IsWholeSubvector =
9496 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
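// E.g., assuming the whole 8-element destination vector fits into a single
// register (NumOfParts == 1) and the scalars are inserted at positions 2..5:
// VecScalarsSz == 8, VecSz == 8, Offset == 0, InsertVecSz == 4 and
// IsWholeSubvector is false, so the code below may model the insertion of a
// 4-wide subvector at offset 2 into the 8-wide destination.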
9497 // Check if we can safely insert a subvector. If it is not possible, just
9498 // generate a whole-sized vector and shuffle the source vector and the new
9499 // subvector.
9500 if (OffsetBeg + InsertVecSz > VecSz) {
9501 // Align OffsetBeg to generate correct mask.
9502 OffsetBeg = alignDown(Value: OffsetBeg, Align: VecSz, Skew: Offset);
9503 InsertVecSz = VecSz;
9504 }
9505
9506 APInt DemandedElts = APInt::getZero(numBits: NumElts);
9507 // TODO: Add support for Instruction::InsertValue.
9508 SmallVector<int> Mask;
9509 if (!E->ReorderIndices.empty()) {
9510 inversePermutation(Indices: E->ReorderIndices, Mask);
9511 Mask.append(NumInputs: InsertVecSz - Mask.size(), Elt: PoisonMaskElem);
9512 } else {
9513 Mask.assign(NumElts: VecSz, Elt: PoisonMaskElem);
9514 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: InsertVecSz), value: 0);
9515 }
9516 bool IsIdentity = true;
9517 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9518 Mask.swap(RHS&: PrevMask);
9519 for (unsigned I = 0; I < NumScalars; ++I) {
9520 unsigned InsertIdx = *getElementIndex(Inst: VL[PrevMask[I]]);
9521 DemandedElts.setBit(InsertIdx);
9522 IsIdentity &= InsertIdx - OffsetBeg == I;
9523 Mask[InsertIdx - OffsetBeg] = I;
9524 }
9525 assert(Offset < NumElts && "Failed to find vector index offset");
9526
9527 InstructionCost Cost = 0;
9528 Cost -= TTI->getScalarizationOverhead(Ty: SrcVecTy, DemandedElts,
9529 /*Insert*/ true, /*Extract*/ false,
9530 CostKind);
9531
9532 // First cost - resize to actual vector size if not identity shuffle or
9533 // need to shift the vector.
9534 // Do not calculate the cost if the actual size is the register size and
9535 // we can merge this shuffle with the following SK_Select.
9536 auto *InsertVecTy = getWidenedType(ScalarTy, VF: InsertVecSz);
9537 if (!IsIdentity)
9538 Cost += TTI->getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc,
9539 Tp: InsertVecTy, Mask);
9540 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: [E](Value *V) {
9541 return !is_contained(Range: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
9542 }));
9543 // Second cost - permutation with subvector, if some elements are from the
9544 // initial vector or inserting a subvector.
9545 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9546 // subvector of ActualVecTy.
9547 SmallBitVector InMask =
9548 isUndefVector(V: FirstInsert->getOperand(i: 0),
9549 UseMask: buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask));
9550 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9551 if (InsertVecSz != VecSz) {
9552 auto *ActualVecTy = getWidenedType(ScalarTy, VF: VecSz);
9553 Cost += TTI->getShuffleCost(Kind: TTI::SK_InsertSubvector, Tp: ActualVecTy,
9554 Mask: std::nullopt, CostKind, Index: OffsetBeg - Offset,
9555 SubTp: InsertVecTy);
9556 } else {
9557 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9558 Mask[I] = InMask.test(Idx: I) ? PoisonMaskElem : I;
9559 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9560 I <= End; ++I)
9561 if (Mask[I] != PoisonMaskElem)
9562 Mask[I] = I + VecSz;
9563 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9564 Mask[I] =
9565 ((I >= InMask.size()) || InMask.test(Idx: I)) ? PoisonMaskElem : I;
9566 Cost +=
9567 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: InsertVecTy, Mask);
9568 }
9569 }
9570 return Cost;
9571 }
9572 case Instruction::ZExt:
9573 case Instruction::SExt:
9574 case Instruction::FPToUI:
9575 case Instruction::FPToSI:
9576 case Instruction::FPExt:
9577 case Instruction::PtrToInt:
9578 case Instruction::IntToPtr:
9579 case Instruction::SIToFP:
9580 case Instruction::UIToFP:
9581 case Instruction::Trunc:
9582 case Instruction::FPTrunc:
9583 case Instruction::BitCast: {
9584 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
9585 Type *SrcScalarTy = VL0->getOperand(i: 0)->getType();
9586 auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size());
9587 unsigned Opcode = ShuffleOrOp;
9588 unsigned VecOpcode = Opcode;
9589 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9590 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9591 // Check if the values are candidates to demote.
9592 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
9593 if (SrcIt != MinBWs.end()) {
9594 SrcBWSz = SrcIt->second.first;
9595 SrcScalarTy = IntegerType::get(C&: F->getContext(), NumBits: SrcBWSz);
9596 SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size());
9597 }
9598 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
9599 if (BWSz == SrcBWSz) {
9600 VecOpcode = Instruction::BitCast;
9601 } else if (BWSz < SrcBWSz) {
9602 VecOpcode = Instruction::Trunc;
9603 } else if (It != MinBWs.end()) {
9604 assert(BWSz > SrcBWSz && "Invalid cast!");
9605 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9606 } else if (SrcIt != MinBWs.end()) {
9607 assert(BWSz > SrcBWSz && "Invalid cast!");
9608 VecOpcode =
9609 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9610 }
9611 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9612 !SrcIt->second.second) {
9613 VecOpcode = Instruction::UIToFP;
9614 }
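// E.g., a scalar zext i8 -> i32 whose source and result were both demoted to
// i16 turns into a vector bitcast, which GetVectorCost below treats as free
// (only CommonCost remains).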
9615 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9616 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9617 return TTI->getCastInstrCost(Opcode, Dst: VL0->getType(),
9618 Src: VL0->getOperand(i: 0)->getType(),
9619 CCH: TTI::getCastContextHint(I: VI), CostKind, I: VI);
9620 };
9621 auto GetVectorCost = [=](InstructionCost CommonCost) {
9622 // Do not count cost here if minimum bitwidth is in effect and it is just
9623 // a bitcast (here it is just a noop).
9624 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9625 return CommonCost;
9626 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9627 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(i: 0));
9628 return CommonCost +
9629 TTI->getCastInstrCost(Opcode: VecOpcode, Dst: VecTy, Src: SrcVecTy, CCH, CostKind,
9630 I: VecOpcode == Opcode ? VI : nullptr);
9631 };
9632 return GetCostDiff(GetScalarCost, GetVectorCost);
9633 }
9634 case Instruction::FCmp:
9635 case Instruction::ICmp:
9636 case Instruction::Select: {
9637 CmpInst::Predicate VecPred, SwappedVecPred;
9638 auto MatchCmp = m_Cmp(Pred&: VecPred, L: m_Value(), R: m_Value());
9639 if (match(V: VL0, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) ||
9640 match(V: VL0, P: MatchCmp))
9641 SwappedVecPred = CmpInst::getSwappedPredicate(pred: VecPred);
9642 else
9643 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9644 ? CmpInst::BAD_FCMP_PREDICATE
9645 : CmpInst::BAD_ICMP_PREDICATE;
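 // Note: GetScalarCost below also resets VecPred/SwappedVecPred to the "bad"
 // predicate if any lane is not a compare/select or uses a predicate that
 // matches neither VecPred nor its swapped form, so mixed-predicate bundles
 // effectively fall back to the conservative predicate for the vector cost.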
9646 auto GetScalarCost = [&](unsigned Idx) {
9647 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9648 CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9649 ? CmpInst::BAD_FCMP_PREDICATE
9650 : CmpInst::BAD_ICMP_PREDICATE;
9651 auto MatchCmp = m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value());
9652 if ((!match(V: VI, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) &&
9653 !match(V: VI, P: MatchCmp)) ||
9654 (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9655 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9656 ? CmpInst::BAD_FCMP_PREDICATE
9657 : CmpInst::BAD_ICMP_PREDICATE;
9658
9659 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
9660 Opcode: E->getOpcode(), ValTy: OrigScalarTy, CondTy: Builder.getInt1Ty(), VecPred: CurrentPred,
9661 CostKind, I: VI);
9662 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: VI);
9663 if (MinMaxID != Intrinsic::not_intrinsic) {
9664 Type *CanonicalType = OrigScalarTy;
9665 if (CanonicalType->isPtrOrPtrVectorTy())
9666 CanonicalType = CanonicalType->getWithNewType(EltTy: IntegerType::get(
9667 C&: CanonicalType->getContext(),
9668 NumBits: DL->getTypeSizeInBits(Ty: CanonicalType->getScalarType())));
9669
9670 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9671 {CanonicalType, CanonicalType});
9672 InstructionCost IntrinsicCost =
9673 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
9674 // If the selects are the only uses of the compares, they will be
9675 // dead and we can adjust the cost by removing their cost.
9676 if (SelectOnly) {
9677 auto *CI = cast<CmpInst>(Val: VI->getOperand(i: 0));
9678 IntrinsicCost -= TTI->getCmpSelInstrCost(
9679 Opcode: CI->getOpcode(), ValTy: OrigScalarTy, CondTy: Builder.getInt1Ty(),
9680 VecPred: CI->getPredicate(), CostKind, I: CI);
9681 }
9682 ScalarCost = std::min(a: ScalarCost, b: IntrinsicCost);
9683 }
9684
9685 return ScalarCost;
9686 };
9687 auto GetVectorCost = [&](InstructionCost CommonCost) {
9688 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
9689
9690 InstructionCost VecCost = TTI->getCmpSelInstrCost(
9691 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred, CostKind, I: VL0);
9692 // Check if it is possible and profitable to use min/max for selects
9693 // in VL.
9694 //
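 // E.g. lanes of the form select (icmp slt %a, %b), %a, %b can be lowered as
 // @llvm.smin, so the vector cost is taken as the cheaper of the cmp+select
 // cost and the corresponding min/max intrinsic cost.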
9695 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL);
9696 if (MinMaxID != Intrinsic::not_intrinsic) {
9697 Type *CanonicalType = VecTy;
9698 if (CanonicalType->isPtrOrPtrVectorTy())
9699 CanonicalType = CanonicalType->getWithNewType(EltTy: IntegerType::get(
9700 C&: CanonicalType->getContext(),
9701 NumBits: DL->getTypeSizeInBits(Ty: CanonicalType->getScalarType())));
9702 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
9703 {CanonicalType, CanonicalType});
9704 InstructionCost IntrinsicCost =
9705 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
9706 // If the selects are the only uses of the compares, they will be
9707 // dead and we can adjust the cost by removing their cost.
9708 if (SelectOnly) {
9709 auto *CI =
9710 cast<CmpInst>(Val: cast<Instruction>(Val: VL.front())->getOperand(i: 0));
9711 IntrinsicCost -= TTI->getCmpSelInstrCost(Opcode: CI->getOpcode(), ValTy: VecTy,
9712 CondTy: MaskTy, VecPred, CostKind);
9713 }
9714 VecCost = std::min(a: VecCost, b: IntrinsicCost);
9715 }
9716 return VecCost + CommonCost;
9717 };
9718 return GetCostDiff(GetScalarCost, GetVectorCost);
9719 }
9720 case Instruction::FNeg:
9721 case Instruction::Add:
9722 case Instruction::FAdd:
9723 case Instruction::Sub:
9724 case Instruction::FSub:
9725 case Instruction::Mul:
9726 case Instruction::FMul:
9727 case Instruction::UDiv:
9728 case Instruction::SDiv:
9729 case Instruction::FDiv:
9730 case Instruction::URem:
9731 case Instruction::SRem:
9732 case Instruction::FRem:
9733 case Instruction::Shl:
9734 case Instruction::LShr:
9735 case Instruction::AShr:
9736 case Instruction::And:
9737 case Instruction::Or:
9738 case Instruction::Xor: {
9739 auto GetScalarCost = [&](unsigned Idx) {
9740 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9741 unsigned OpIdx = isa<UnaryOperator>(Val: VI) ? 0 : 1;
9742 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(V: VI->getOperand(i: 0));
9743 TTI::OperandValueInfo Op2Info =
9744 TTI::getOperandInfo(V: VI->getOperand(i: OpIdx));
9745 SmallVector<const Value *> Operands(VI->operand_values());
9746 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: OrigScalarTy, CostKind,
9747 Opd1Info: Op1Info, Opd2Info: Op2Info, Args: Operands, CxtI: VI);
9748 };
9749 auto GetVectorCost = [=](InstructionCost CommonCost) {
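 // If this is an 'and' that is going to be demoted to It->second.first bits
 // and one of its operand bundles consists entirely of constants with at
 // least that many trailing one bits (e.g. 'and %x, 255' with a demotion to
 // i8), the 'and' is a no-op on the demoted type, so only the common cost is
 // charged.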
9750 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
9751 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
9752 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
9753 if (all_of(Range&: Ops, P: [&](Value *Op) {
9754 auto *CI = dyn_cast<ConstantInt>(Val: Op);
9755 return CI && CI->getValue().countr_one() >= It->second.first;
9756 }))
9757 return CommonCost;
9758 }
9759 }
9760 unsigned OpIdx = isa<UnaryOperator>(Val: VL0) ? 0 : 1;
9761 TTI::OperandValueInfo Op1Info = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
9762 TTI::OperandValueInfo Op2Info = getOperandInfo(Ops: E->getOperand(OpIdx));
9763 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: VecTy, CostKind, Opd1Info: Op1Info,
9764 Opd2Info: Op2Info, Args: std::nullopt, CxtI: nullptr, TLibInfo: TLI) +
9765 CommonCost;
9766 };
9767 return GetCostDiff(GetScalarCost, GetVectorCost);
9768 }
9769 case Instruction::GetElementPtr: {
9770 return CommonCost + GetGEPCostDiff(VL, VL0);
9771 }
9772 case Instruction::Load: {
9773 auto GetScalarCost = [&](unsigned Idx) {
9774 auto *VI = cast<LoadInst>(Val: UniqueValues[Idx]);
9775 return TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: OrigScalarTy,
9776 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
9777 CostKind, OpdInfo: TTI::OperandValueInfo(), I: VI);
9778 };
9779 auto *LI0 = cast<LoadInst>(Val: VL0);
9780 auto GetVectorCost = [&](InstructionCost CommonCost) {
9781 InstructionCost VecLdCost;
9782 if (E->State == TreeEntry::Vectorize) {
9783 VecLdCost = TTI->getMemoryOpCost(
9784 Opcode: Instruction::Load, Src: VecTy, Alignment: LI0->getAlign(),
9785 AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
9786 } else if (E->State == TreeEntry::StridedVectorize) {
9787 Align CommonAlignment =
9788 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
9789 VecLdCost = TTI->getStridedMemoryOpCost(
9790 Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(),
9791 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
9792 } else {
9793 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9794 Align CommonAlignment =
9795 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
9796 VecLdCost = TTI->getGatherScatterOpCost(
9797 Opcode: Instruction::Load, DataTy: VecTy, Ptr: LI0->getPointerOperand(),
9798 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
9799 }
9800 return VecLdCost + CommonCost;
9801 };
9802
9803 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9804 // If this node generates a masked gather load then it is not a terminal
9805 // node, hence the address operand cost is estimated separately.
9806 if (E->State == TreeEntry::ScatterVectorize)
9807 return Cost;
9808
9809 // Estimate the cost of the GEPs since this tree node is a terminal node.
9810 SmallVector<Value *> PointerOps(VL.size());
9811 for (auto [I, V] : enumerate(First&: VL))
9812 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
9813 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9814 }
9815 case Instruction::Store: {
9816 bool IsReorder = !E->ReorderIndices.empty();
9817 auto GetScalarCost = [=](unsigned Idx) {
9818 auto *VI = cast<StoreInst>(Val: VL[Idx]);
9819 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: VI->getValueOperand());
9820 return TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: OrigScalarTy,
9821 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
9822 CostKind, OpdInfo: OpInfo, I: VI);
9823 };
9824 auto *BaseSI =
9825 cast<StoreInst>(Val: IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9826 auto GetVectorCost = [=](InstructionCost CommonCost) {
9827 // We know that we can merge the stores. Calculate the cost.
9828 InstructionCost VecStCost;
9829 if (E->State == TreeEntry::StridedVectorize) {
9830 Align CommonAlignment =
9831 computeCommonAlignment<StoreInst>(VL: UniqueValues.getArrayRef());
9832 VecStCost = TTI->getStridedMemoryOpCost(
9833 Opcode: Instruction::Store, DataTy: VecTy, Ptr: BaseSI->getPointerOperand(),
9834 /*VariableMask=*/false, Alignment: CommonAlignment, CostKind);
9835 } else {
9836 assert(E->State == TreeEntry::Vectorize &&
9837 "Expected either strided or consecutive stores.");
9838 TTI::OperandValueInfo OpInfo = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
9839 VecStCost = TTI->getMemoryOpCost(
9840 Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
9841 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind, OpdInfo: OpInfo);
9842 }
9843 return VecStCost + CommonCost;
9844 };
9845 SmallVector<Value *> PointerOps(VL.size());
9846 for (auto [I, V] : enumerate(First&: VL)) {
9847 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9848 PointerOps[Idx] = cast<StoreInst>(Val: V)->getPointerOperand();
9849 }
9850
9851 return GetCostDiff(GetScalarCost, GetVectorCost) +
9852 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9853 }
9854 case Instruction::Call: {
9855 auto GetScalarCost = [&](unsigned Idx) {
9856 auto *CI = cast<CallInst>(Val: UniqueValues[Idx]);
9857 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9858 if (ID != Intrinsic::not_intrinsic) {
9859 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9860 return TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
9861 }
9862 return TTI->getCallInstrCost(F: CI->getCalledFunction(),
9863 RetTy: CI->getFunctionType()->getReturnType(),
9864 Tys: CI->getFunctionType()->params(), CostKind);
9865 };
9866 auto GetVectorCost = [=](InstructionCost CommonCost) {
9867 auto *CI = cast<CallInst>(Val: VL0);
9868 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9869 SmallVector<Type *> ArgTys =
9870 buildIntrinsicArgTypes(CI, ID, VF: VecTy->getNumElements(),
9871 MinBW: It != MinBWs.end() ? It->second.first : 0);
9872 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9873 return std::min(a: VecCallCosts.first, b: VecCallCosts.second) + CommonCost;
9874 };
9875 return GetCostDiff(GetScalarCost, GetVectorCost);
9876 }
9877 case Instruction::ShuffleVector: {
9878 assert(E->isAltShuffle() &&
9879 ((Instruction::isBinaryOp(E->getOpcode()) &&
9880 Instruction::isBinaryOp(E->getAltOpcode())) ||
9881 (Instruction::isCast(E->getOpcode()) &&
9882 Instruction::isCast(E->getAltOpcode())) ||
9883 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9884 "Invalid Shuffle Vector Operand");
9885 // Try to find the previous shuffle node with the same operands and same
9886 // main/alternate ops.
9887 auto TryFindNodeWithEqualOperands = [=]() {
9888 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9889 if (TE.get() == E)
9890 break;
9891 if (TE->isAltShuffle() &&
9892 ((TE->getOpcode() == E->getOpcode() &&
9893 TE->getAltOpcode() == E->getAltOpcode()) ||
9894 (TE->getOpcode() == E->getAltOpcode() &&
9895 TE->getAltOpcode() == E->getOpcode())) &&
9896 TE->hasEqualOperands(TE: *E))
9897 return true;
9898 }
9899 return false;
9900 };
9901 auto GetScalarCost = [&](unsigned Idx) {
9902 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
9903 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9904 (void)E;
9905 return TTI->getInstructionCost(U: VI, CostKind);
9906 };
9907 // Need to clear CommonCost since the final shuffle cost is already
9908 // included in the vector cost.
9909 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9910 // VecCost is equal to sum of the cost of creating 2 vectors
9911 // and the cost of creating shuffle.
9912 InstructionCost VecCost = 0;
9913 if (TryFindNodeWithEqualOperands()) {
9914 LLVM_DEBUG({
9915 dbgs() << "SLP: diamond match for alternate node found.\n";
9916 E->dump();
9917 });
9918 // No need to add new vector costs here since we're going to reuse the
9919 // same main/alternate vector ops, just with different shuffling.
9920 } else if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
9921 VecCost =
9922 TTIRef.getArithmeticInstrCost(Opcode: E->getOpcode(), Ty: VecTy, CostKind);
9923 VecCost +=
9924 TTIRef.getArithmeticInstrCost(Opcode: E->getAltOpcode(), Ty: VecTy, CostKind);
9925 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
9926 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
9927 VecCost = TTIRef.getCmpSelInstrCost(Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy,
9928 VecPred: CI0->getPredicate(), CostKind, I: VL0);
9929 VecCost += TTIRef.getCmpSelInstrCost(
9930 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy,
9931 VecPred: cast<CmpInst>(Val: E->getAltOp())->getPredicate(), CostKind,
9932 I: E->getAltOp());
9933 } else {
9934 Type *SrcSclTy = E->getMainOp()->getOperand(i: 0)->getType();
9935 auto *SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
9936 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9937 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
9938 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
9939 unsigned SrcBWSz =
9940 DL->getTypeSizeInBits(Ty: E->getMainOp()->getOperand(i: 0)->getType());
9941 if (SrcIt != MinBWs.end()) {
9942 SrcBWSz = SrcIt->second.first;
9943 SrcSclTy = IntegerType::get(C&: SrcSclTy->getContext(), NumBits: SrcBWSz);
9944 SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
9945 }
9946 if (BWSz <= SrcBWSz) {
9947 if (BWSz < SrcBWSz)
9948 VecCost =
9949 TTIRef.getCastInstrCost(Opcode: Instruction::Trunc, Dst: VecTy, Src: SrcTy,
9950 CCH: TTI::CastContextHint::None, CostKind);
9951 LLVM_DEBUG({
9952 dbgs()
9953 << "SLP: alternate extension, which should be truncated.\n";
9954 E->dump();
9955 });
9956 return VecCost;
9957 }
9958 }
9959 VecCost = TTIRef.getCastInstrCost(Opcode: E->getOpcode(), Dst: VecTy, Src: SrcTy,
9960 CCH: TTI::CastContextHint::None, CostKind);
9961 VecCost +=
9962 TTIRef.getCastInstrCost(Opcode: E->getAltOpcode(), Dst: VecTy, Src: SrcTy,
9963 CCH: TTI::CastContextHint::None, CostKind);
9964 }
9965 SmallVector<int> Mask;
9966 E->buildAltOpShuffleMask(
9967 IsAltOp: [E](Instruction *I) {
9968 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9969 return I->getOpcode() == E->getAltOpcode();
9970 },
9971 Mask);
9972 VecCost += ::getShuffleCost(TTI: TTIRef, Kind: TargetTransformInfo::SK_PermuteTwoSrc,
9973 Tp: FinalVecTy, Mask);
9974 // Patterns like [fadd,fsub] can be combined into a single instruction
9975 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
9976 // need to take into account their order when looking for the most used
9977 // order.
9978 unsigned Opcode0 = E->getOpcode();
9979 unsigned Opcode1 = E->getAltOpcode();
9980 SmallBitVector OpcodeMask(getAltInstrMask(VL: E->Scalars, Opcode0, Opcode1));
9981 // If this pattern is supported by the target then we consider the
9982 // order.
9983 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9984 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9985 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9986 return AltVecCost < VecCost ? AltVecCost : VecCost;
9987 }
9988 // TODO: Check the reverse order too.
9989 return VecCost;
9990 };
9991 return GetCostDiff(GetScalarCost, GetVectorCost);
9992 }
9993 default:
9994 llvm_unreachable("Unknown instruction");
9995 }
9996}
9997
9998bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9999 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
10000 << VectorizableTree.size() << " is fully vectorizable.\n");
10001
10002 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
10003 SmallVector<int> Mask;
10004 return TE->isGather() &&
10005 !any_of(Range: TE->Scalars,
10006 P: [this](Value *V) { return EphValues.contains(Ptr: V); }) &&
10007 (allConstant(VL: TE->Scalars) || isSplat(VL: TE->Scalars) ||
10008 TE->Scalars.size() < Limit ||
10009 ((TE->getOpcode() == Instruction::ExtractElement ||
10010 all_of(Range: TE->Scalars, P: IsaPred<ExtractElementInst, UndefValue>)) &&
10011 isFixedVectorShuffle(VL: TE->Scalars, Mask)) ||
10012 (TE->isGather() && TE->getOpcode() == Instruction::Load &&
10013 !TE->isAltShuffle()));
10014 };
10015
10016 // We only handle trees of heights 1 and 2.
10017 if (VectorizableTree.size() == 1 &&
10018 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
10019 (ForReduction &&
10020 AreVectorizableGathers(VectorizableTree[0].get(),
10021 VectorizableTree[0]->Scalars.size()) &&
10022 VectorizableTree[0]->getVectorFactor() > 2)))
10023 return true;
10024
10025 if (VectorizableTree.size() != 2)
10026 return false;
10027
10028 // Handle splat and all-constants stores. Also try to vectorize tiny trees
10029 // whose second node is a gather with fewer scalar operands than the initial
10030 // tree element (it may be profitable to shuffle the second gather) or whose
10031 // scalars are extractelements that form a shuffle.
10032 SmallVector<int> Mask;
10033 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
10034 AreVectorizableGathers(VectorizableTree[1].get(),
10035 VectorizableTree[0]->Scalars.size()))
10036 return true;
10037
10038 // Gathering cost would be too much for tiny trees.
10039 if (VectorizableTree[0]->isGather() ||
10040 (VectorizableTree[1]->isGather() &&
10041 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
10042 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
10043 return false;
10044
10045 return true;
10046}
10047
10048static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
10049 TargetTransformInfo *TTI,
10050 bool MustMatchOrInst) {
10051 // Look past the root to find a source value. Arbitrarily follow the
10052 // path through operand 0 of any 'or'. Also, peek through optional
10053 // shift-left-by-multiple-of-8-bits.
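 // E.g. for a root like
 //   or (shl (zext i8 %hi to i32), 8), (zext i8 %lo to i32)
 // following operand 0 walks through the 'or' and the 'shl' and stops at the
 // zext, which is then required to be a zero-extended load below.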
10054 Value *ZextLoad = Root;
10055 const APInt *ShAmtC;
10056 bool FoundOr = false;
10057 while (!isa<ConstantExpr>(Val: ZextLoad) &&
10058 (match(V: ZextLoad, P: m_Or(L: m_Value(), R: m_Value())) ||
10059 (match(V: ZextLoad, P: m_Shl(L: m_Value(), R: m_APInt(Res&: ShAmtC))) &&
10060 ShAmtC->urem(RHS: 8) == 0))) {
10061 auto *BinOp = cast<BinaryOperator>(Val: ZextLoad);
10062 ZextLoad = BinOp->getOperand(i_nocapture: 0);
10063 if (BinOp->getOpcode() == Instruction::Or)
10064 FoundOr = true;
10065 }
10066 // Check that the walk ended at a zero-extended load (and saw an 'or' if required).
10067 Value *Load;
10068 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
10069 !match(V: ZextLoad, P: m_ZExt(Op: m_Value(V&: Load))) || !isa<LoadInst>(Val: Load))
10070 return false;
10071
10072 // Require that the total load bit width is a legal integer type.
10073 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
10074 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
10075 Type *SrcTy = Load->getType();
10076 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
10077 if (!TTI->isTypeLegal(Ty: IntegerType::get(C&: Root->getContext(), NumBits: LoadBitWidth)))
10078 return false;
10079
10080 // Everything matched - assume that we can fold the whole sequence using
10081 // load combining.
10082 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
10083 << *(cast<Instruction>(Root)) << "\n");
10084
10085 return true;
10086}
10087
10088bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
10089 if (RdxKind != RecurKind::Or)
10090 return false;
10091
10092 unsigned NumElts = VectorizableTree[0]->Scalars.size();
10093 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
10094 return isLoadCombineCandidateImpl(Root: FirstReduced, NumElts, TTI,
10095 /* MatchOr */ MustMatchOrInst: false);
10096}
10097
10098bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
10099 // Peek through a final sequence of stores and check if all operations are
10100 // likely to be load-combined.
10101 unsigned NumElts = Stores.size();
10102 for (Value *Scalar : Stores) {
10103 Value *X;
10104 if (!match(V: Scalar, P: m_Store(ValueOp: m_Value(V&: X), PointerOp: m_Value())) ||
10105 !isLoadCombineCandidateImpl(Root: X, NumElts, TTI, /* MatchOr */ MustMatchOrInst: true))
10106 return false;
10107 }
10108 return true;
10109}
10110
10111bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
10112 // No need to vectorize inserts of gathered values.
10113 if (VectorizableTree.size() == 2 &&
10114 isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
10115 VectorizableTree[1]->isGather() &&
10116 (VectorizableTree[1]->getVectorFactor() <= 2 ||
10117 !(isSplat(VL: VectorizableTree[1]->Scalars) ||
10118 allConstant(VL: VectorizableTree[1]->Scalars))))
10119 return true;
10120
10121 // If the graph includes only PHI nodes and gathers, it is definitely not
10122 // profitable to vectorize, so we can skip it as long as the cost threshold
10123 // is the default. The cost of vectorized PHI nodes is almost always 0 plus
10124 // the cost of gathers/buildvectors.
10125 constexpr int Limit = 4;
10126 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
10127 !VectorizableTree.empty() &&
10128 all_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
10129 return (TE->isGather() &&
10130 TE->getOpcode() != Instruction::ExtractElement &&
10131 count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <= Limit) ||
10132 TE->getOpcode() == Instruction::PHI;
10133 }))
10134 return true;
10135
10136 // We can vectorize the tree if its size is greater than or equal to the
10137 // minimum size specified by the MinTreeSize command line option.
10138 if (VectorizableTree.size() >= MinTreeSize)
10139 return false;
10140
10141 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
10142 // can vectorize it if we can prove it fully vectorizable.
10143 if (isFullyVectorizableTinyTree(ForReduction))
10144 return false;
10145
10146 // Check if any of the gather nodes forms an insertelement buildvector
10147 // somewhere.
10148 bool IsAllowedSingleBVNode =
10149 VectorizableTree.size() > 1 ||
10150 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
10151 !VectorizableTree.front()->isAltShuffle() &&
10152 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
10153 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
10154 allSameBlock(VL: VectorizableTree.front()->Scalars));
10155 if (any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
10156 return TE->isGather() && all_of(Range&: TE->Scalars, P: [&](Value *V) {
10157 return isa<ExtractElementInst, UndefValue>(Val: V) ||
10158 (IsAllowedSingleBVNode &&
10159 !V->hasNUsesOrMore(N: UsesLimit) &&
10160 any_of(Range: V->users(), P: IsaPred<InsertElementInst>));
10161 });
10162 }))
10163 return false;
10164
10165 assert((!VectorizableTree.empty() || ExternalUses.empty()) &&
10166 "We shouldn't have any external users");
10168
10169 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
10170 // vectorizable.
10171 return true;
10172}
10173
10174InstructionCost BoUpSLP::getSpillCost() const {
10175 // Walk from the bottom of the tree to the top, tracking which values are
10176 // live. When we see a call instruction that is not part of our tree,
10177 // query TTI to see if there is a cost to keeping values live over it
10178 // (for example, if spills and fills are required).
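 // For example, a vectorized value that is defined before a real call and
 // used after it may need to be spilled around that call; for each such call
 // the cost of keeping the currently live vectors alive is added via
 // TTI::getCostOfKeepingLiveOverCall.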
10179 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
10180 InstructionCost Cost = 0;
10181
10182 SmallPtrSet<Instruction *, 4> LiveValues;
10183 Instruction *PrevInst = nullptr;
10184
10185 // The entries in VectorizableTree are not necessarily ordered by their
10186 // position in basic blocks. Collect them and order them by dominance so later
10187 // instructions are guaranteed to be visited first. For instructions in
10188 // different basic blocks, we only scan to the beginning of the block, so
10189 // their order does not matter, as long as all instructions in a basic block
10190 // are grouped together. Using dominance ensures a deterministic order.
10191 SmallVector<Instruction *, 16> OrderedScalars;
10192 for (const auto &TEPtr : VectorizableTree) {
10193 if (TEPtr->State != TreeEntry::Vectorize)
10194 continue;
10195 Instruction *Inst = dyn_cast<Instruction>(Val: TEPtr->Scalars[0]);
10196 if (!Inst)
10197 continue;
10198 OrderedScalars.push_back(Elt: Inst);
10199 }
10200 llvm::sort(C&: OrderedScalars, Comp: [&](Instruction *A, Instruction *B) {
10201 auto *NodeA = DT->getNode(BB: A->getParent());
10202 auto *NodeB = DT->getNode(BB: B->getParent());
10203 assert(NodeA && "Should only process reachable instructions");
10204 assert(NodeB && "Should only process reachable instructions");
10205 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
10206 "Different nodes should have different DFS numbers");
10207 if (NodeA != NodeB)
10208 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
10209 return B->comesBefore(Other: A);
10210 });
10211
10212 for (Instruction *Inst : OrderedScalars) {
10213 if (!PrevInst) {
10214 PrevInst = Inst;
10215 continue;
10216 }
10217
10218 // Update LiveValues.
10219 LiveValues.erase(Ptr: PrevInst);
10220 for (auto &J : PrevInst->operands()) {
10221 if (isa<Instruction>(Val: &*J) && getTreeEntry(V: &*J))
10222 LiveValues.insert(Ptr: cast<Instruction>(Val: &*J));
10223 }
10224
10225 LLVM_DEBUG({
10226 dbgs() << "SLP: #LV: " << LiveValues.size();
10227 for (auto *X : LiveValues)
10228 dbgs() << " " << X->getName();
10229 dbgs() << ", Looking at ";
10230 Inst->dump();
10231 });
10232
10233 // Now find the sequence of instructions between PrevInst and Inst.
10234 unsigned NumCalls = 0;
10235 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
10236 PrevInstIt =
10237 PrevInst->getIterator().getReverse();
10238 while (InstIt != PrevInstIt) {
10239 if (PrevInstIt == PrevInst->getParent()->rend()) {
10240 PrevInstIt = Inst->getParent()->rbegin();
10241 continue;
10242 }
10243
10244 auto NoCallIntrinsic = [this](Instruction *I) {
10245 if (auto *II = dyn_cast<IntrinsicInst>(Val: I)) {
10246 if (II->isAssumeLikeIntrinsic())
10247 return true;
10248 FastMathFlags FMF;
10249 SmallVector<Type *, 4> Tys;
10250 for (auto &ArgOp : II->args())
10251 Tys.push_back(Elt: ArgOp->getType());
10252 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: II))
10253 FMF = FPMO->getFastMathFlags();
10254 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
10255 FMF);
10256 InstructionCost IntrCost =
10257 TTI->getIntrinsicInstrCost(ICA, CostKind: TTI::TCK_RecipThroughput);
10258 InstructionCost CallCost = TTI->getCallInstrCost(
10259 F: nullptr, RetTy: II->getType(), Tys, CostKind: TTI::TCK_RecipThroughput);
10260 if (IntrCost < CallCost)
10261 return true;
10262 }
10263 return false;
10264 };
10265
10266 // Debug information does not impact spill cost.
10267 if (isa<CallBase>(Val: &*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10268 &*PrevInstIt != PrevInst)
10269 NumCalls++;
10270
10271 ++PrevInstIt;
10272 }
10273
10274 if (NumCalls) {
10275 SmallVector<Type *, 4> V;
10276 for (auto *II : LiveValues) {
10277 auto *ScalarTy = II->getType();
10278 if (auto *VectorTy = dyn_cast<FixedVectorType>(Val: ScalarTy))
10279 ScalarTy = VectorTy->getElementType();
10280 V.push_back(Elt: getWidenedType(ScalarTy, VF: BundleWidth));
10281 }
10282 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(Tys: V);
10283 }
10284
10285 PrevInst = Inst;
10286 }
10287
10288 return Cost;
10289}
10290
10291/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
10292/// the buildvector sequence.
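/// E.g. for the chain
///   %i0 = insertelement <4 x i32> poison, i32 %a, i32 0
///   %i1 = insertelement <4 x i32> %i0, i32 %b, i32 1
/// isFirstInsertElement(%i0, %i1) is true, since %i0 starts the buildvector
/// sequence that %i1 is built on top of.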
10293static bool isFirstInsertElement(const InsertElementInst *IE1,
10294 const InsertElementInst *IE2) {
10295 if (IE1 == IE2)
10296 return false;
10297 const auto *I1 = IE1;
10298 const auto *I2 = IE2;
10299 const InsertElementInst *PrevI1;
10300 const InsertElementInst *PrevI2;
10301 unsigned Idx1 = *getElementIndex(Inst: IE1);
10302 unsigned Idx2 = *getElementIndex(Inst: IE2);
10303 do {
10304 if (I2 == IE1)
10305 return true;
10306 if (I1 == IE2)
10307 return false;
10308 PrevI1 = I1;
10309 PrevI2 = I2;
10310 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10311 getElementIndex(Inst: I1).value_or(u&: Idx2) != Idx2)
10312 I1 = dyn_cast<InsertElementInst>(Val: I1->getOperand(i_nocapture: 0));
10313 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
10314 getElementIndex(Inst: I2).value_or(u&: Idx1) != Idx1)
10315 I2 = dyn_cast<InsertElementInst>(Val: I2->getOperand(i_nocapture: 0));
10316 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10317 llvm_unreachable("Two different buildvectors not expected.");
10318}
10319
10320namespace {
10321/// Returns the incoming Value * if the requested type is Value * too, or a
10322/// default-constructed value otherwise.
10323struct ValueSelect {
10324 template <typename U>
10325 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
10326 return V;
10327 }
10328 template <typename U>
10329 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
10330 return U();
10331 }
10332};
10333} // namespace
10334
10335/// Does the analysis of the provided shuffle masks and performs the requested
10336/// actions on the vectors with the given shuffle masks. It tries to do it in
10337/// several steps.
10338/// 1. If the Base vector is not an undef vector, resize the very first mask to
10339/// have a common VF and perform the action for 2 input vectors (including the
10340/// non-undef Base). Other shuffle masks are combined with the result of the
10341/// first stage and processed as a shuffle of 2 elements.
10342/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
10343/// the action only for 1 vector with the given mask, if it is not the identity
10344/// mask.
10345/// 3. If 2 or more masks are used, perform the remaining shuffle actions for 2
10346/// vectors, combining the masks properly between the steps.
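/// For example, with an undef Base and three shuffle masks, the first two
/// input vectors are shuffled into a single value and each remaining mask is
/// then folded into that intermediate result, one vector at a time.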
10347template <typename T>
10348static T *performExtractsShuffleAction(
10349 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
10350 function_ref<unsigned(T *)> GetVF,
10351 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
10352 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
10353 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
10354 SmallVector<int> Mask(ShuffleMask.begin()->second);
10355 auto VMIt = std::next(ShuffleMask.begin());
10356 T *Prev = nullptr;
10357 SmallBitVector UseMask =
10358 buildUseMask(VF: Mask.size(), Mask, MaskArg: UseMask::UndefsAsMask);
10359 SmallBitVector IsBaseUndef = isUndefVector(V: Base, UseMask);
10360 if (!IsBaseUndef.all()) {
10361 // Base is not undef, need to combine it with the next subvectors.
10362 std::pair<T *, bool> Res =
10363 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
10364 SmallBitVector IsBasePoison = isUndefVector<true>(V: Base, UseMask);
10365 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
10366 if (Mask[Idx] == PoisonMaskElem)
10367 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
10368 else
10369 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
10370 }
10371 auto *V = ValueSelect::get<T *>(Base);
10372 (void)V;
10373 assert((!V || GetVF(V) == Mask.size()) &&
10374 "Expected base vector of VF number of elements.");
10375 Prev = Action(Mask, {nullptr, Res.first});
10376 } else if (ShuffleMask.size() == 1) {
10377 // Base is undef and only 1 vector is shuffled - perform the action only for
10378 // single vector, if the mask is not the identity mask.
10379 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10380 /*ForSingleMask=*/true);
10381 if (Res.second)
10382 // Identity mask is found.
10383 Prev = Res.first;
10384 else
10385 Prev = Action(Mask, {ShuffleMask.begin()->first});
10386 } else {
10387 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
10388 // shuffles step by step, combining shuffle between the steps.
10389 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10390 unsigned Vec2VF = GetVF(VMIt->first);
10391 if (Vec1VF == Vec2VF) {
10392 // No need to resize the input vectors since they are of the same size; we
10393 // can shuffle them directly.
10394 ArrayRef<int> SecMask = VMIt->second;
10395 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10396 if (SecMask[I] != PoisonMaskElem) {
10397 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10398 Mask[I] = SecMask[I] + Vec1VF;
10399 }
10400 }
10401 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10402 } else {
10403 // Vectors of different sizes - resize and reshuffle.
10404 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10405 /*ForSingleMask=*/false);
10406 std::pair<T *, bool> Res2 =
10407 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10408 ArrayRef<int> SecMask = VMIt->second;
10409 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10410 if (Mask[I] != PoisonMaskElem) {
10411 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10412 if (Res1.second)
10413 Mask[I] = I;
10414 } else if (SecMask[I] != PoisonMaskElem) {
10415 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10416 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
10417 }
10418 }
10419 Prev = Action(Mask, {Res1.first, Res2.first});
10420 }
10421 VMIt = std::next(VMIt);
10422 }
10423 bool IsBaseNotUndef = !IsBaseUndef.all();
10424 (void)IsBaseNotUndef;
10425 // Perform requested actions for the remaining masks/vectors.
10426 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10427 // Shuffle other input vectors, if any.
10428 std::pair<T *, bool> Res =
10429 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10430 ArrayRef<int> SecMask = VMIt->second;
10431 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10432 if (SecMask[I] != PoisonMaskElem) {
10433 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
10434 "Multiple uses of scalars.");
10435 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
10436 } else if (Mask[I] != PoisonMaskElem) {
10437 Mask[I] = I;
10438 }
10439 }
10440 Prev = Action(Mask, {Prev, Res.first});
10441 }
10442 return Prev;
10443}
10444
10445InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
10446 InstructionCost Cost = 0;
10447 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
10448 << VectorizableTree.size() << ".\n");
10449
10450 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10451
10452 SmallPtrSet<Value *, 4> CheckedExtracts;
10453 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
10454 TreeEntry &TE = *VectorizableTree[I];
10455 if (TE.isGather()) {
10456 if (const TreeEntry *E = getTreeEntry(V: TE.getMainOp());
10457 E && E->getVectorFactor() == TE.getVectorFactor() &&
10458 E->isSame(VL: TE.Scalars)) {
10459 // Some gather nodes might be absolutely the same as some vectorizable
10460 // nodes after reordering; we need to handle that.
10461 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
10462 << shortBundleName(TE.Scalars) << ".\n"
10463 << "SLP: Current total cost = " << Cost << "\n");
10464 continue;
10465 }
10466 }
10467
10468 InstructionCost C = getEntryCost(E: &TE, VectorizedVals, CheckedExtracts);
10469 Cost += C;
10470 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
10471 << shortBundleName(TE.Scalars) << ".\n"
10472 << "SLP: Current total cost = " << Cost << "\n");
10473 }
10474
10475 SmallPtrSet<Value *, 16> ExtractCostCalculated;
10476 InstructionCost ExtractCost = 0;
10477 SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
10478 SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
10479 SmallVector<APInt> DemandedElts;
10480 SmallDenseSet<Value *, 4> UsedInserts;
10481 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
10482 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
10483 for (ExternalUser &EU : ExternalUses) {
10484 // We only add extract cost once for the same scalar.
10485 if (!isa_and_nonnull<InsertElementInst>(Val: EU.User) &&
10486 !ExtractCostCalculated.insert(Ptr: EU.Scalar).second)
10487 continue;
10488
10489 // Uses by ephemeral values are free (because the ephemeral value will be
10490 // removed prior to code generation, and so the extraction will be
10491 // removed as well).
10492 if (EphValues.count(Ptr: EU.User))
10493 continue;
10494
10495 // No extract cost for vector "scalar"
10496 if (isa<FixedVectorType>(Val: EU.Scalar->getType()))
10497 continue;
10498
10499 // If the found user is an insertelement, do not calculate the extract cost
10500 // but try to detect it as a final shuffled/identity match.
10501 if (auto *VU = dyn_cast_or_null<InsertElementInst>(Val: EU.User);
10502 VU && VU->getOperand(i_nocapture: 1) == EU.Scalar) {
10503 if (auto *FTy = dyn_cast<FixedVectorType>(Val: VU->getType())) {
10504 if (!UsedInserts.insert(V: VU).second)
10505 continue;
10506 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
10507 if (InsertIdx) {
10508 const TreeEntry *ScalarTE = getTreeEntry(V: EU.Scalar);
10509 auto *It = find_if(
10510 Range&: FirstUsers,
10511 P: [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10512 return areTwoInsertFromSameBuildVector(
10513 VU, V: cast<InsertElementInst>(Val: Pair.first),
10514 GetBaseOperand: [this](InsertElementInst *II) -> Value * {
10515 Value *Op0 = II->getOperand(i_nocapture: 0);
10516 if (getTreeEntry(V: II) && !getTreeEntry(V: Op0))
10517 return nullptr;
10518 return Op0;
10519 });
10520 });
10521 int VecId = -1;
10522 if (It == FirstUsers.end()) {
10523 (void)ShuffleMasks.emplace_back();
10524 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10525 if (Mask.empty())
10526 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
10527 // Find the insertvector, vectorized in tree, if any.
10528 Value *Base = VU;
10529 while (auto *IEBase = dyn_cast<InsertElementInst>(Val: Base)) {
10530 if (IEBase != EU.User &&
10531 (!IEBase->hasOneUse() ||
10532 getElementIndex(Inst: IEBase).value_or(u&: *InsertIdx) == *InsertIdx))
10533 break;
10534 // Build the mask for the vectorized insertelement instructions.
10535 if (const TreeEntry *E = getTreeEntry(V: IEBase)) {
10536 VU = IEBase;
10537 do {
10538 IEBase = cast<InsertElementInst>(Val: Base);
10539 int Idx = *getElementIndex(Inst: IEBase);
10540 assert(Mask[Idx] == PoisonMaskElem &&
10541 "InsertElementInstruction used already.");
10542 Mask[Idx] = Idx;
10543 Base = IEBase->getOperand(i_nocapture: 0);
10544 } while (E == getTreeEntry(V: Base));
10545 break;
10546 }
10547 Base = cast<InsertElementInst>(Val: Base)->getOperand(i_nocapture: 0);
10548 }
10549 FirstUsers.emplace_back(Args&: VU, Args&: ScalarTE);
10550 DemandedElts.push_back(Elt: APInt::getZero(numBits: FTy->getNumElements()));
10551 VecId = FirstUsers.size() - 1;
10552 auto It = MinBWs.find(Val: ScalarTE);
10553 if (It != MinBWs.end() &&
10554 VectorCasts
10555 .insert(V: std::make_pair(x&: ScalarTE, y: FTy->getElementType()))
10556 .second) {
10557 unsigned BWSz = It->second.first;
10558 unsigned DstBWSz = DL->getTypeSizeInBits(Ty: FTy->getElementType());
10559 unsigned VecOpcode;
10560 if (DstBWSz < BWSz)
10561 VecOpcode = Instruction::Trunc;
10562 else
10563 VecOpcode =
10564 It->second.second ? Instruction::SExt : Instruction::ZExt;
10565 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10566 InstructionCost C = TTI->getCastInstrCost(
10567 Opcode: VecOpcode, Dst: FTy,
10568 Src: getWidenedType(ScalarTy: IntegerType::get(C&: FTy->getContext(), NumBits: BWSz),
10569 VF: FTy->getNumElements()),
10570 CCH: TTI::CastContextHint::None, CostKind);
10571 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10572 << " for extending externally used vector with "
10573 "non-equal minimum bitwidth.\n");
10574 Cost += C;
10575 }
10576 } else {
10577 if (isFirstInsertElement(IE1: VU, IE2: cast<InsertElementInst>(Val: It->first)))
10578 It->first = VU;
10579 VecId = std::distance(first: FirstUsers.begin(), last: It);
10580 }
10581 int InIdx = *InsertIdx;
10582 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10583 if (Mask.empty())
10584 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
10585 Mask[InIdx] = EU.Lane;
10586 DemandedElts[VecId].setBit(InIdx);
10587 continue;
10588 }
10589 }
10590 }
10591 // Leave the GEPs as is; they are free in most cases and it is better to
10592 // keep them as GEPs.
10593 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10594 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: EU.Scalar)) {
10595 if (!ValueToExtUses) {
10596 ValueToExtUses.emplace();
10597 for_each(Range: enumerate(First&: ExternalUses), F: [&](const auto &P) {
10598 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10599 });
10600 }
10601 // The original GEP can be used if every operand is either not vectorized
10602 // or is already marked as externally used.
10603 bool CanBeUsedAsGEP = all_of(Range: GEP->operands(), P: [&](Value *V) {
10604 if (!getTreeEntry(V))
10605 return true;
10606 auto It = ValueToExtUses->find(Val: V);
10607 if (It != ValueToExtUses->end()) {
10608 // Replace all uses to avoid compiler crash.
10609 ExternalUses[It->second].User = nullptr;
10610 return true;
10611 }
10612 return false;
10613 });
10614 if (CanBeUsedAsGEP) {
10615 ExtractCost += TTI->getInstructionCost(U: GEP, CostKind);
10616 ExternalUsesAsGEPs.insert(Ptr: EU.Scalar);
10617 continue;
10618 }
10619 }
10620
10621 // If we plan to rewrite the tree in a smaller type, we will need to sign
10622 // extend the extracted value back to the original type. Here, we account
10623 // for the extract and the added cost of the sign extend if needed.
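 // E.g. if the tree is rewritten in i8 but this user still expects the
 // original i32 scalar, the cost is taken from getExtractWithExtendCost
 // (extract plus sext/zext) rather than from a plain extractelement.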
10624 auto *VecTy = getWidenedType(ScalarTy: EU.Scalar->getType(), VF: BundleWidth);
10625 auto It = MinBWs.find(Val: getTreeEntry(V: EU.Scalar));
10626 if (It != MinBWs.end()) {
10627 auto *MinTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
10628 unsigned Extend =
10629 It->second.second ? Instruction::SExt : Instruction::ZExt;
10630 VecTy = getWidenedType(ScalarTy: MinTy, VF: BundleWidth);
10631 ExtractCost += TTI->getExtractWithExtendCost(Opcode: Extend, Dst: EU.Scalar->getType(),
10632 VecTy, Index: EU.Lane);
10633 } else {
10634 ExtractCost += TTI->getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy,
10635 CostKind, Index: EU.Lane);
10636 }
10637 }
10638 // Add reduced value cost, if resized.
10639 if (!VectorizedVals.empty()) {
10640 const TreeEntry &Root = *VectorizableTree.front();
10641 auto BWIt = MinBWs.find(Val: &Root);
10642 if (BWIt != MinBWs.end()) {
10643 Type *DstTy = Root.Scalars.front()->getType();
10644 unsigned OriginalSz = DL->getTypeSizeInBits(Ty: DstTy);
10645 unsigned SrcSz =
10646 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10647 if (OriginalSz != SrcSz) {
10648 unsigned Opcode = Instruction::Trunc;
10649 if (OriginalSz > SrcSz)
10650 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10651 Type *SrcTy = IntegerType::get(C&: DstTy->getContext(), NumBits: SrcSz);
10652 Cost += TTI->getCastInstrCost(Opcode, Dst: DstTy, Src: SrcTy,
10653 CCH: TTI::CastContextHint::None,
10654 CostKind: TTI::TCK_RecipThroughput);
10655 }
10656 }
10657 }
10658
10659 InstructionCost SpillCost = getSpillCost();
10660 Cost += SpillCost + ExtractCost;
10661 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10662 bool) {
10663 InstructionCost C = 0;
10664 unsigned VF = Mask.size();
10665 unsigned VecVF = TE->getVectorFactor();
10666 if (VF != VecVF &&
10667 (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10668 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))) {
10669 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10670 std::copy(Mask.begin(), std::next(x: Mask.begin(), n: std::min(a: VF, b: VecVF)),
10671 OrigMask.begin());
10672 C = TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc,
10673 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF),
10674 Mask: OrigMask);
10675 LLVM_DEBUG(
10676 dbgs() << "SLP: Adding cost " << C
10677 << " for final shuffle of insertelement external users.\n";
10678 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10679 Cost += C;
10680 return std::make_pair(x&: TE, y: true);
10681 }
10682 return std::make_pair(x&: TE, y: false);
10683 };
10684 // Calculate the cost of the reshuffled vectors, if any.
10685 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10686 Value *Base = cast<Instruction>(Val: FirstUsers[I].first)->getOperand(i: 0);
10687 auto Vector = ShuffleMasks[I].takeVector();
10688 unsigned VF = 0;
10689 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10690 ArrayRef<const TreeEntry *> TEs) {
10691 assert((TEs.size() == 1 || TEs.size() == 2) &&
10692 "Expected exactly 1 or 2 tree entries.");
10693 if (TEs.size() == 1) {
10694 if (VF == 0)
10695 VF = TEs.front()->getVectorFactor();
10696 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
10697 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF) &&
10698 !all_of(Range: enumerate(First&: Mask), P: [=](const auto &Data) {
10699 return Data.value() == PoisonMaskElem ||
10700 (Data.index() < VF &&
10701 static_cast<int>(Data.index()) == Data.value());
10702 })) {
10703 InstructionCost C =
10704 TTI->getShuffleCost(Kind: TTI::SK_PermuteSingleSrc, Tp: FTy, Mask);
10705 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10706 << " for final shuffle of insertelement "
10707 "external users.\n";
10708 TEs.front()->dump();
10709 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10710 Cost += C;
10711 }
10712 } else {
10713 if (VF == 0) {
10714 if (TEs.front() &&
10715 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10716 VF = TEs.front()->getVectorFactor();
10717 else
10718 VF = Mask.size();
10719 }
10720 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
10721 InstructionCost C =
10722 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: FTy, Mask);
10723 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10724 << " for final shuffle of vector node and external "
10725 "insertelement users.\n";
10726 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10727 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10728 Cost += C;
10729 }
10730 VF = Mask.size();
10731 return TEs.back();
10732 };
10733 (void)performExtractsShuffleAction<const TreeEntry>(
10734 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()), Base,
10735 GetVF: [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeAction: ResizeToVF,
10736 Action: EstimateShufflesCost);
10737 InstructionCost InsertCost = TTI->getScalarizationOverhead(
10738 Ty: cast<FixedVectorType>(Val: FirstUsers[I].first->getType()), DemandedElts: DemandedElts[I],
10739 /*Insert*/ true, /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
10740 Cost -= InsertCost;
10741 }
10742
10743 // Add the cost for reduced value resize (if required).
10744 if (ReductionBitWidth != 0) {
10745 assert(UserIgnoreList && "Expected reduction tree.");
10746 const TreeEntry &E = *VectorizableTree.front();
10747 auto It = MinBWs.find(Val: &E);
10748 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10749 unsigned SrcSize = It->second.first;
10750 unsigned DstSize = ReductionBitWidth;
10751 unsigned Opcode = Instruction::Trunc;
10752 if (SrcSize < DstSize)
10753 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10754 auto *SrcVecTy =
10755 getWidenedType(ScalarTy: Builder.getIntNTy(N: SrcSize), VF: E.getVectorFactor());
10756 auto *DstVecTy =
10757 getWidenedType(ScalarTy: Builder.getIntNTy(N: DstSize), VF: E.getVectorFactor());
10758 TTI::CastContextHint CCH = getCastContextHint(TE: E);
10759 InstructionCost CastCost;
10760 switch (E.getOpcode()) {
10761 case Instruction::SExt:
10762 case Instruction::ZExt:
10763 case Instruction::Trunc: {
10764 const TreeEntry *OpTE = getOperandEntry(E: &E, Idx: 0);
10765 CCH = getCastContextHint(TE: *OpTE);
10766 break;
10767 }
10768 default:
10769 break;
10770 }
10771 CastCost += TTI->getCastInstrCost(Opcode, Dst: DstVecTy, Src: SrcVecTy, CCH,
10772 CostKind: TTI::TCK_RecipThroughput);
10773 Cost += CastCost;
10774 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10775 << " for final resize for reduction from " << SrcVecTy
10776 << " to " << DstVecTy << "\n";
10777 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10778 }
10779 }
10780
10781#ifndef NDEBUG
10782 SmallString<256> Str;
10783 {
10784 raw_svector_ostream OS(Str);
10785 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10786 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10787 << "SLP: Total Cost = " << Cost << ".\n";
10788 }
10789 LLVM_DEBUG(dbgs() << Str);
10790 if (ViewSLPTree)
10791 ViewGraph(this, "SLP" + F->getName(), false, Str);
10792#endif
10793
10794 return Cost;
10795}
10796
10797/// Tries to find extractelement instructions with constant indices from a
10798/// fixed vector type and gathers such instructions into a bunch, which can
10799/// very likely be matched as a shuffle of 1 or 2 input vectors. If this
10800/// attempt was successful, the matched scalars are replaced by poison values
10801/// in \p VL for future analysis.
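/// E.g. a gather of
///   extractelement <4 x i32> %v, i32 0 ... extractelement <4 x i32> %v, i32 3
/// (one extract per lane) can be represented as a single shuffle of %v, and
/// the matched extracts are then replaced by poison in \p VL.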
10802std::optional<TTI::ShuffleKind>
10803BoUpSLP::tryToGatherSingleRegisterExtractElements(
10804 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10805 // Scan list of gathered scalars for extractelements that can be represented
10806 // as shuffles.
10807 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10808 SmallVector<int> UndefVectorExtracts;
10809 for (int I = 0, E = VL.size(); I < E; ++I) {
10810 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
10811 if (!EI) {
10812 if (isa<UndefValue>(Val: VL[I]))
10813 UndefVectorExtracts.push_back(Elt: I);
10814 continue;
10815 }
10816 auto *VecTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
10817 if (!VecTy || !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()))
10818 continue;
10819 std::optional<unsigned> Idx = getExtractIndex(E: EI);
10820 // Undefined index.
10821 if (!Idx) {
10822 UndefVectorExtracts.push_back(Elt: I);
10823 continue;
10824 }
10825 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10826 ExtractMask.reset(Idx: *Idx);
10827 if (isUndefVector(V: EI->getVectorOperand(), UseMask: ExtractMask).all()) {
10828 UndefVectorExtracts.push_back(Elt: I);
10829 continue;
10830 }
10831 VectorOpToIdx[EI->getVectorOperand()].push_back(Elt: I);
10832 }
10833 // Sort the vector operands by the maximum number of uses in extractelements.
10834 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
10835 VectorOpToIdx.takeVector();
10836 stable_sort(Range&: Vectors, C: [](const auto &P1, const auto &P2) {
10837 return P1.second.size() > P2.second.size();
10838 });
10839 // Find the best pair of the vectors or a single vector.
10840 const int UndefSz = UndefVectorExtracts.size();
10841 unsigned SingleMax = 0;
10842 unsigned PairMax = 0;
10843 if (!Vectors.empty()) {
10844 SingleMax = Vectors.front().second.size() + UndefSz;
10845 if (Vectors.size() > 1) {
10846 auto *ItNext = std::next(x: Vectors.begin());
10847 PairMax = SingleMax + ItNext->second.size();
10848 }
10849 }
10850 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10851 return std::nullopt;
10852 // Check whether it is better to perform a shuffle of 2 vectors or just of
10853 // a single vector.
10854 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10855 SmallVector<Value *> GatheredExtracts(
10856 VL.size(), PoisonValue::get(T: VL.front()->getType()));
10857 if (SingleMax >= PairMax && SingleMax) {
10858 for (int Idx : Vectors.front().second)
10859 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
10860 } else if (!Vectors.empty()) {
10861 for (unsigned VecIdx : {0, 1})
10862 for (int Idx : Vectors[VecIdx].second)
10863 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
10864 }
10865 // Add extracts from undefs too.
10866 for (int Idx : UndefVectorExtracts)
10867 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
10868 // Check that the gather of extractelements can be represented as just a
10869 // shuffle of one or two vectors that the scalars are extracted from.
10870 std::optional<TTI::ShuffleKind> Res =
10871 isFixedVectorShuffle(VL: GatheredExtracts, Mask);
10872 if (!Res) {
10873 // TODO: try to check other subsets if possible.
10874 // Restore the original VL if attempt was not successful.
10875 copy(Range&: SavedVL, Out: VL.begin());
10876 return std::nullopt;
10877 }
10878 // Restore unused scalars from mask, if some of the extractelements were not
10879 // selected for shuffle.
10880 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10881 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(Val: GatheredExtracts[I]) &&
10882 isa<UndefValue>(Val: GatheredExtracts[I])) {
10883 std::swap(a&: VL[I], b&: GatheredExtracts[I]);
10884 continue;
10885 }
10886 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
10887 if (!EI || !isa<FixedVectorType>(Val: EI->getVectorOperandType()) ||
10888 !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()) ||
10889 is_contained(Range&: UndefVectorExtracts, Element: I))
10890 continue;
10891 }
10892 return Res;
10893}
10894
10895/// Same as tryToGatherSingleRegisterExtractElements, but splits \p VL into
10896/// \p NumParts slices and performs the analysis on each slice separately,
10897/// returning one shuffle kind (if any) per part.
10898/// The matched scalars are likewise replaced by poison values in \p VL for
10899/// future analysis.
10900SmallVector<std::optional<TTI::ShuffleKind>>
10901BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10902 SmallVectorImpl<int> &Mask,
10903 unsigned NumParts) const {
10904 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
10905 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10906 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
10907 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
10908 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
10909 // Scan list of gathered scalars for extractelements that can be represented
10910 // as shuffles.
10911 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
10912 N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part));
10913 SmallVector<int> SubMask;
10914 std::optional<TTI::ShuffleKind> Res =
10915 tryToGatherSingleRegisterExtractElements(VL: SubVL, Mask&: SubMask);
10916 ShufflesRes[Part] = Res;
10917 copy(Range&: SubMask, Out: std::next(x: Mask.begin(), n: Part * SliceSize));
10918 }
10919 if (none_of(Range&: ShufflesRes, P: [](const std::optional<TTI::ShuffleKind> &Res) {
10920 return Res.has_value();
10921 }))
10922 ShufflesRes.clear();
10923 return ShufflesRes;
10924}
10925
10926std::optional<TargetTransformInfo::ShuffleKind>
10927BoUpSLP::isGatherShuffledSingleRegisterEntry(
10928 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10929 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10930 Entries.clear();
10931 // TODO: currently checking only for Scalars in the tree entry, need to count
10932 // reused elements too for better cost estimation.
10933 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10934 const Instruction *TEInsertPt = &getLastInstructionInBundle(E: TEUseEI.UserTE);
10935 const BasicBlock *TEInsertBlock = nullptr;
10936 // Main node of PHI entries keeps the correct order of operands/incoming
10937 // blocks.
10938 if (auto *PHI = dyn_cast<PHINode>(Val: TEUseEI.UserTE->getMainOp())) {
10939 TEInsertBlock = PHI->getIncomingBlock(i: TEUseEI.EdgeIdx);
10940 TEInsertPt = TEInsertBlock->getTerminator();
10941 } else {
10942 TEInsertBlock = TEInsertPt->getParent();
10943 }
10944 if (!DT->isReachableFromEntry(A: TEInsertBlock))
10945 return std::nullopt;
10946 auto *NodeUI = DT->getNode(BB: TEInsertBlock);
10947 assert(NodeUI && "Should only process reachable instructions");
10948 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10949 auto CheckOrdering = [&](const Instruction *InsertPt) {
10950 // Argument InsertPt is an instruction where vector code for some other
10951 // tree entry (one that shares one or more scalars with TE) is going to be
10952 // generated. This lambda returns true if that insertion point dominates
10953 // the insertion point of the vector code for TE, i.e. the other entry's
10954 // vector value will be available for TE to reuse (otherwise the dependency
10955 // is the other way around). The other node is not limited to be of a gather
10956 // kind. Gather nodes are not scheduled and their vector code is inserted
10957 // before their first user. If the user is a PHI, that point is at the end
10958 // of the corresponding predecessor block. Otherwise it is the last
10959 // instruction among the scalars of the user node. So, instead of checking
10960 // the dependency between instructions themselves, we check the dependency
10961 // between their vector-code insertion points (each scalar ends up as a lane of a vector instruction).
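// For example (illustrative): if the other entry's vector code is emitted in
// a block that strictly dominates TEInsertBlock, its value is available at
// the TE insertion point and the lambda returns true.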
10962 const BasicBlock *InsertBlock = InsertPt->getParent();
10963 auto *NodeEUI = DT->getNode(BB: InsertBlock);
10964 if (!NodeEUI)
10965 return false;
10966 assert((NodeUI == NodeEUI) ==
10967 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10968 "Different nodes should have different DFS numbers");
10969 // Check the order of the gather nodes users.
10970 if (TEInsertPt->getParent() != InsertBlock &&
10971 (DT->dominates(A: NodeUI, B: NodeEUI) || !DT->dominates(A: NodeEUI, B: NodeUI)))
10972 return false;
10973 if (TEInsertPt->getParent() == InsertBlock &&
10974 TEInsertPt->comesBefore(Other: InsertPt))
10975 return false;
10976 return true;
10977 };
10978 // Find all tree entries used by the gathered values. If no common entries
10979 // are found - not a shuffle.
10980 // Here we build a set of tree nodes for each gathered value and try to
10981 // find the intersection between these sets. If we have at least one common
10982 // tree node for each gathered value - we have just a permutation of a
10983 // single vector. If we end up with 2 different sets, we have a permutation
10984 // of 2 input vectors.
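// For example (an illustrative case): if scalars %a and %b are both found in
// tree entry TE1 and scalar %c is found in both TE1 and TE2, the intersection
// keeps TE1 as the single common source. If instead %c is found only in TE2,
// we end up with two sets {TE1} and {TE2}, i.e. a permutation of 2 vectors.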
10985 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10986 DenseMap<Value *, int> UsedValuesEntry;
10987 for (Value *V : VL) {
10988 if (isConstant(V))
10989 continue;
10990 // Build a list of tree entries where V is used.
10991 SmallPtrSet<const TreeEntry *, 4> VToTEs;
10992 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(Val: V)->second) {
10993 if (TEPtr == TE)
10994 continue;
10995 assert(any_of(TEPtr->Scalars,
10996 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10997 "Must contain at least single gathered value.");
10998 assert(TEPtr->UserTreeIndices.size() == 1 &&
10999 "Expected only single user of a gather node.");
11000 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
11001
11002 PHINode *UserPHI = dyn_cast<PHINode>(Val: UseEI.UserTE->getMainOp());
11003 const Instruction *InsertPt =
11004 UserPHI ? UserPHI->getIncomingBlock(i: UseEI.EdgeIdx)->getTerminator()
11005 : &getLastInstructionInBundle(E: UseEI.UserTE);
11006 if (TEInsertPt == InsertPt) {
11007 // If 2 gathers are operands of the same entry (regardless of whether the
11008 // user is a PHI or something else), compare the operand indices and use
11009 // the earlier one as the base.
11010 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
11011 continue;
11012 // If the user instruction is used in different vectorized nodes for some
11013 // reason - make the choice depend on the node index.
11014 if (TEUseEI.UserTE != UseEI.UserTE &&
11015 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
11016 continue;
11017 }
11018
11019 // Check if the user node of the TE comes after the user node of TEPtr;
11020 // otherwise TEPtr depends on TE.
11021 if ((TEInsertBlock != InsertPt->getParent() ||
11022 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
11023 !CheckOrdering(InsertPt))
11024 continue;
11025 VToTEs.insert(Ptr: TEPtr);
11026 }
11027 if (const TreeEntry *VTE = getTreeEntry(V)) {
11028 if (ForOrder) {
11029 if (VTE->State != TreeEntry::Vectorize) {
11030 auto It = MultiNodeScalars.find(Val: V);
11031 if (It == MultiNodeScalars.end())
11032 continue;
11033 VTE = *It->getSecond().begin();
11034 // Iterate through all vectorized nodes.
11035 auto *MIt = find_if(Range&: It->getSecond(), P: [](const TreeEntry *MTE) {
11036 return MTE->State == TreeEntry::Vectorize;
11037 });
11038 if (MIt == It->getSecond().end())
11039 continue;
11040 VTE = *MIt;
11041 }
11042 }
11043 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
11044 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
11045 continue;
11046 VToTEs.insert(Ptr: VTE);
11047 }
11048 if (VToTEs.empty())
11049 continue;
11050 if (UsedTEs.empty()) {
11051 // On the first iteration, just insert the list of nodes into the vector.
11052 UsedTEs.push_back(Elt: VToTEs);
11053 UsedValuesEntry.try_emplace(Key: V, Args: 0);
11054 } else {
11055 // Need to check if there are any previously used tree nodes which use V.
11056 // If there are no such nodes, consider that we have one more input
11057 // vector.
11058 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
11059 unsigned Idx = 0;
11060 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
11061 // Do we have a non-empty intersection of previously listed tree entries
11062 // and tree entries using current V?
11063 set_intersect(S1&: VToTEs, S2: Set);
11064 if (!VToTEs.empty()) {
11065 // Yes, write the new subset and continue analysis for the next
11066 // scalar.
11067 Set.swap(RHS&: VToTEs);
11068 break;
11069 }
11070 VToTEs = SavedVToTEs;
11071 ++Idx;
11072 }
11073 // No non-empty intersection found - need to add a second set of possible
11074 // source vectors.
11075 if (Idx == UsedTEs.size()) {
11076 // If the number of input vectors is greater than 2 - not a permutation,
11077 // fall back to the regular gather.
11078 // TODO: support multiple reshuffled nodes.
11079 if (UsedTEs.size() == 2)
11080 continue;
11081 UsedTEs.push_back(Elt: SavedVToTEs);
11082 Idx = UsedTEs.size() - 1;
11083 }
11084 UsedValuesEntry.try_emplace(Key: V, Args&: Idx);
11085 }
11086 }
11087
11088 if (UsedTEs.empty()) {
11089 Entries.clear();
11090 return std::nullopt;
11091 }
11092
11093 unsigned VF = 0;
11094 if (UsedTEs.size() == 1) {
11095 // Keep the order to avoid non-determinism.
11096 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
11097 UsedTEs.front().end());
11098 sort(C&: FirstEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
11099 return TE1->Idx < TE2->Idx;
11100 });
11101 // First, try to find the perfect match in another gather node.
11102 auto *It = find_if(Range&: FirstEntries, P: [=](const TreeEntry *EntryPtr) {
11103 return EntryPtr->isSame(VL) || EntryPtr->isSame(VL: TE->Scalars);
11104 });
11105 if (It != FirstEntries.end() &&
11106 ((*It)->getVectorFactor() == VL.size() ||
11107 ((*It)->getVectorFactor() == TE->Scalars.size() &&
11108 TE->ReuseShuffleIndices.size() == VL.size() &&
11109 (*It)->isSame(VL: TE->Scalars)))) {
11110 Entries.push_back(Elt: *It);
11111 if ((*It)->getVectorFactor() == VL.size()) {
11112 std::iota(first: std::next(x: Mask.begin(), n: Part * VL.size()),
11113 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: 0);
11114 } else {
11115 SmallVector<int> CommonMask = TE->getCommonMask();
11116 copy(Range&: CommonMask, Out: Mask.begin());
11117 }
11118 // Clear undef scalars.
11119 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11120 if (isa<PoisonValue>(Val: VL[I]))
11121 Mask[I] = PoisonMaskElem;
11122 return TargetTransformInfo::SK_PermuteSingleSrc;
11123 }
11124 // No perfect match, just shuffle, so choose the first tree node from the
11125 // tree.
11126 Entries.push_back(Elt: FirstEntries.front());
11127 } else {
11128 // Try to find nodes with the same vector factor.
11129 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
11130 // Keep the order of tree nodes to avoid non-determinism.
11131 DenseMap<int, const TreeEntry *> VFToTE;
11132 for (const TreeEntry *TE : UsedTEs.front()) {
11133 unsigned VF = TE->getVectorFactor();
11134 auto It = VFToTE.find(Val: VF);
11135 if (It != VFToTE.end()) {
11136 if (It->second->Idx > TE->Idx)
11137 It->getSecond() = TE;
11138 continue;
11139 }
11140 VFToTE.try_emplace(Key: VF, Args&: TE);
11141 }
11142 // Same, keep the order to avoid non-determinism.
11143 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
11144 UsedTEs.back().end());
11145 sort(C&: SecondEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
11146 return TE1->Idx < TE2->Idx;
11147 });
11148 for (const TreeEntry *TE : SecondEntries) {
11149 auto It = VFToTE.find(Val: TE->getVectorFactor());
11150 if (It != VFToTE.end()) {
11151 VF = It->first;
11152 Entries.push_back(Elt: It->second);
11153 Entries.push_back(Elt: TE);
11154 break;
11155 }
11156 }
11157 // No 2 source vectors with the same vector factor - just choose 2 with max
11158 // index.
11159 if (Entries.empty()) {
11160 Entries.push_back(Elt: *llvm::max_element(
11161 Range&: UsedTEs.front(), C: [](const TreeEntry *TE1, const TreeEntry *TE2) {
11162 return TE1->Idx < TE2->Idx;
11163 }));
11164 Entries.push_back(Elt: SecondEntries.front());
11165 VF = std::max(a: Entries.front()->getVectorFactor(),
11166 b: Entries.back()->getVectorFactor());
11167 }
11168 }
11169
11170 bool IsSplatOrUndefs = isSplat(VL) || all_of(Range&: VL, P: IsaPred<UndefValue>);
11171 // Checks if the 2 PHIs are compatible, i.e. have a high chance of being
11172 // vectorized together.
11173 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
11174 auto *PHI = cast<PHINode>(Val: V);
11175 auto *PHI1 = cast<PHINode>(Val: V1);
11176 // Check that all incoming values are compatible/from the same parent (if
11177 // they are instructions).
11178 // The incoming values are compatible if they all are constants, or
11179 // instructions with the same/alternate opcodes from the same basic block.
11180 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
11181 Value *In = PHI->getIncomingValue(i: I);
11182 Value *In1 = PHI1->getIncomingValue(i: I);
11183 if (isConstant(V: In) && isConstant(V: In1))
11184 continue;
11185 if (!getSameOpcode(VL: {In, In1}, TLI: *TLI).getOpcode())
11186 return false;
11187 if (cast<Instruction>(Val: In)->getParent() !=
11188 cast<Instruction>(Val: In1)->getParent())
11189 return false;
11190 }
11191 return true;
11192 };
11193 // Check if the value can be ignored during analysis of shuffled gathers.
11194 // We suppose it is better to ignore instructions which do not form splats,
11195 // are not vectorized and not extractelements (those are handled by the
11196 // extractelements processing), or may form a vector node in the future.
11197 auto MightBeIgnored = [=](Value *V) {
11198 auto *I = dyn_cast<Instruction>(Val: V);
11199 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(Val: I) &&
11200 !isVectorLikeInstWithConstOps(V: I) &&
11201 !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList) && isSimple(I);
11202 };
11203 // Check that the neighbor instruction may form a full vector node with the
11204 // current instruction V. This is possible if they have the same/alternate
11205 // opcode and the same parent basic block.
11206 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
11207 Value *V1 = VL[Idx];
11208 bool UsedInSameVTE = false;
11209 auto It = UsedValuesEntry.find(Val: V1);
11210 if (It != UsedValuesEntry.end())
11211 UsedInSameVTE = It->second == UsedValuesEntry.find(Val: V)->second;
11212 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11213 getSameOpcode(VL: {V, V1}, TLI: *TLI).getOpcode() &&
11214 cast<Instruction>(Val: V)->getParent() ==
11215 cast<Instruction>(Val: V1)->getParent() &&
11216 (!isa<PHINode>(Val: V1) || AreCompatiblePHIs(V, V1));
11217 };
11218 // Build a shuffle mask for better cost estimation and vector emission.
11219 SmallBitVector UsedIdxs(Entries.size());
11220 SmallVector<std::pair<unsigned, int>> EntryLanes;
11221 for (int I = 0, E = VL.size(); I < E; ++I) {
11222 Value *V = VL[I];
11223 auto It = UsedValuesEntry.find(Val: V);
11224 if (It == UsedValuesEntry.end())
11225 continue;
11226 // Do not try to shuffle scalars if they are constants, or instructions
11227 // that may be vectorized later as part of the subsequent buildvector
11228 // vectorization.
11229 if (isConstant(V) || (MightBeIgnored(V) &&
11230 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
11231 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
11232 continue;
11233 unsigned Idx = It->second;
11234 EntryLanes.emplace_back(Args&: Idx, Args&: I);
11235 UsedIdxs.set(Idx);
11236 }
11237 // Iterate through all shuffled scalars and select entries which can be used
11238 // for the final shuffle.
11239 SmallVector<const TreeEntry *> TempEntries;
11240 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
11241 if (!UsedIdxs.test(Idx: I))
11242 continue;
11243 // Fix the entry number for the given scalar. If it is the first entry, set
11244 // Pair.first to 0, otherwise to 1 (currently at most 2 nodes are selected).
11245 // These indices are used as the vector offset when calculating the final
11246 // shuffle mask.
11247 for (std::pair<unsigned, int> &Pair : EntryLanes)
11248 if (Pair.first == I)
11249 Pair.first = TempEntries.size();
11250 TempEntries.push_back(Elt: Entries[I]);
11251 }
11252 Entries.swap(RHS&: TempEntries);
11253 if (EntryLanes.size() == Entries.size() &&
11254 !VL.equals(RHS: ArrayRef(TE->Scalars)
11255 .slice(N: Part * VL.size(),
11256 M: std::min<int>(a: VL.size(), b: TE->Scalars.size())))) {
11257 // We may have only 1 or 2 entries here. If the number of scalars is equal
11258 // to the number of entries, no need to do the analysis, it is not very
11259 // profitable. Since VL is not the same as TE->Scalars, it means we already
11260 // have some shuffles before. Cut off this unprofitable case.
11261 Entries.clear();
11262 return std::nullopt;
11263 }
11264 // Build the final mask, check for the identity shuffle, if possible.
11265 bool IsIdentity = Entries.size() == 1;
11266 // Pair.first is the index of the source vector (used as the vector offset),
11267 // while Pair.second is the index of the scalar in the list.
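// For example (illustrative): with VF == 4, a scalar at list position 3 found
// at lane 2 of the second selected entry (Pair.first == 1) gets the mask
// value 1 * 4 + 2 == 6.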
11268 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
11269 unsigned Idx = Part * VL.size() + Pair.second;
11270 Mask[Idx] =
11271 Pair.first * VF +
11272 (ForOrder ? std::distance(
11273 first: Entries[Pair.first]->Scalars.begin(),
11274 last: find(Range: Entries[Pair.first]->Scalars, Val: VL[Pair.second]))
11275 : Entries[Pair.first]->findLaneForValue(V: VL[Pair.second]));
11276 IsIdentity &= Mask[Idx] == Pair.second;
11277 }
11278 switch (Entries.size()) {
11279 case 1:
11280 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11281 return TargetTransformInfo::SK_PermuteSingleSrc;
11282 break;
11283 case 2:
11284 if (EntryLanes.size() > 2 || VL.size() <= 2)
11285 return TargetTransformInfo::SK_PermuteTwoSrc;
11286 break;
11287 default:
11288 break;
11289 }
11290 Entries.clear();
11291 // Clear the corresponding mask elements.
11292 std::fill(std::next(x: Mask.begin(), n: Part * VL.size()),
11293 std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), PoisonMaskElem);
11294 return std::nullopt;
11295}
11296
11297SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
11298BoUpSLP::isGatherShuffledEntry(
11299 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
11300 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
11301 bool ForOrder) {
11302 assert(NumParts > 0 && NumParts < VL.size() &&
11303 "Expected positive number of registers.");
11304 Entries.clear();
11305 // No need to check for the topmost gather node.
11306 if (TE == VectorizableTree.front().get())
11307 return {};
11308 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
11309 if (TE->isNonPowOf2Vec())
11310 return {};
11311 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
11312 assert(TE->UserTreeIndices.size() == 1 &&
11313 "Expected only single user of the gather node.");
11314 assert(VL.size() % NumParts == 0 &&
11315 "Number of scalars must be divisible by NumParts.");
11316 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
11317 SmallVector<std::optional<TTI::ShuffleKind>> Res;
11318 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
11319 ArrayRef<Value *> SubVL =
11320 VL.slice(N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part));
11321 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
11322 std::optional<TTI::ShuffleKind> SubRes =
11323 isGatherShuffledSingleRegisterEntry(TE, VL: SubVL, Mask, Entries&: SubEntries, Part,
11324 ForOrder);
11325 if (!SubRes)
11326 SubEntries.clear();
11327 Res.push_back(Elt: SubRes);
11328 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
11329 SubEntries.front()->getVectorFactor() == VL.size() &&
11330 (SubEntries.front()->isSame(VL: TE->Scalars) ||
11331 SubEntries.front()->isSame(VL))) {
11332 SmallVector<const TreeEntry *> LocalSubEntries;
11333 LocalSubEntries.swap(RHS&: SubEntries);
11334 Entries.clear();
11335 Res.clear();
11336 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
11337 // Clear undef scalars.
11338 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11339 if (isa<PoisonValue>(Val: VL[I]))
11340 Mask[I] = PoisonMaskElem;
11341 Entries.emplace_back(Args: 1, Args&: LocalSubEntries.front());
11342 Res.push_back(Elt: TargetTransformInfo::SK_PermuteSingleSrc);
11343 return Res;
11344 }
11345 }
11346 if (all_of(Range&: Res,
11347 P: [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
11348 Entries.clear();
11349 return {};
11350 }
11351 return Res;
11352}
11353
11354InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
11355 Type *ScalarTy) const {
11356 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
11357 bool DuplicateNonConst = false;
11358 // Find the cost of inserting/extracting values from the vector.
11359 // Check if the same elements are inserted several times and count them as
11360 // shuffle candidates.
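// For example (illustrative): for VL == {%a, %b, %a, 0} with ForPoisonSrc set,
// lanes 0 and 1 are costed as inserts, lane 3 is a constant, and lane 2
// repeats %a, so the repeat is modeled by a single-source shuffle with mask
// <0, 1, 0, 3> instead of an extra insert.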
11361 APInt ShuffledElements = APInt::getZero(numBits: VL.size());
11362 DenseMap<Value *, unsigned> UniqueElements;
11363 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11364 InstructionCost Cost;
11365 auto EstimateInsertCost = [&](unsigned I, Value *V) {
11366 if (V->getType() != ScalarTy) {
11367 Cost += TTI->getCastInstrCost(Opcode: Instruction::Trunc, Dst: ScalarTy, Src: V->getType(),
11368 CCH: TTI::CastContextHint::None, CostKind);
11369 V = nullptr;
11370 }
11371 if (!ForPoisonSrc)
11372 Cost +=
11373 TTI->getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind,
11374 Index: I, Op0: Constant::getNullValue(Ty: VecTy), Op1: V);
11375 };
11376 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
11377 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
11378 Value *V = VL[I];
11379 // No need to shuffle duplicates for constants.
11380 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(Val: V)) {
11381 ShuffledElements.setBit(I);
11382 ShuffleMask[I] = isa<PoisonValue>(Val: V) ? PoisonMaskElem : I;
11383 continue;
11384 }
11385
11386 auto Res = UniqueElements.try_emplace(Key: V, Args&: I);
11387 if (Res.second) {
11388 EstimateInsertCost(I, V);
11389 ShuffleMask[I] = I;
11390 continue;
11391 }
11392
11393 DuplicateNonConst = true;
11394 ShuffledElements.setBit(I);
11395 ShuffleMask[I] = Res.first->second;
11396 }
11397 if (ForPoisonSrc)
11398 Cost =
11399 TTI->getScalarizationOverhead(Ty: VecTy, DemandedElts: ~ShuffledElements, /*Insert*/ true,
11400 /*Extract*/ false, CostKind);
11401 if (DuplicateNonConst)
11402 Cost += TTI->getShuffleCost(Kind: TargetTransformInfo::SK_PermuteSingleSrc,
11403 Tp: VecTy, Mask: ShuffleMask);
11404 return Cost;
11405}
11406
11407// Perform operand reordering on the instructions in VL and return the reordered
11408// operands in Left and Right.
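// For example (illustrative): for VL == {add %a, %b; add %c, %d} the result
// may be Left == {%a, %c} and Right == {%b, %d}, with operands swapped per
// lane where that makes the operand vectors more isomorphic.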
11409void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
11410 SmallVectorImpl<Value *> &Left,
11411 SmallVectorImpl<Value *> &Right,
11412 const BoUpSLP &R) {
11413 if (VL.empty())
11414 return;
11415 VLOperands Ops(VL, R);
11416 // Reorder the operands in place.
11417 Ops.reorder();
11418 Left = Ops.getVL(OpIdx: 0);
11419 Right = Ops.getVL(OpIdx: 1);
11420}
11421
11422Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
11423 auto &Res = EntryToLastInstruction.FindAndConstruct(Key: E);
11424 if (Res.second)
11425 return *Res.second;
11426 // Get the basic block this bundle is in. All instructions in the bundle
11427 // should be in this block (except for extractelement-like instructions with
11428 // constant indices).
11429 auto *Front = E->getMainOp();
11430 auto *BB = Front->getParent();
11431 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
11432 if (E->getOpcode() == Instruction::GetElementPtr &&
11433 !isa<GetElementPtrInst>(V))
11434 return true;
11435 auto *I = cast<Instruction>(V);
11436 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11437 isVectorLikeInstWithConstOps(I);
11438 }));
11439
11440 auto FindLastInst = [&]() {
11441 Instruction *LastInst = Front;
11442 for (Value *V : E->Scalars) {
11443 auto *I = dyn_cast<Instruction>(Val: V);
11444 if (!I)
11445 continue;
11446 if (LastInst->getParent() == I->getParent()) {
11447 if (LastInst->comesBefore(Other: I))
11448 LastInst = I;
11449 continue;
11450 }
11451 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11452 !isa<GetElementPtrInst>(I)) ||
11453 (isVectorLikeInstWithConstOps(LastInst) &&
11454 isVectorLikeInstWithConstOps(I))) &&
11455 "Expected vector-like or non-GEP in GEP node insts only.");
11456 if (!DT->isReachableFromEntry(A: LastInst->getParent())) {
11457 LastInst = I;
11458 continue;
11459 }
11460 if (!DT->isReachableFromEntry(A: I->getParent()))
11461 continue;
11462 auto *NodeA = DT->getNode(BB: LastInst->getParent());
11463 auto *NodeB = DT->getNode(BB: I->getParent());
11464 assert(NodeA && "Should only process reachable instructions");
11465 assert(NodeB && "Should only process reachable instructions");
11466 assert((NodeA == NodeB) ==
11467 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11468 "Different nodes should have different DFS numbers");
11469 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11470 LastInst = I;
11471 }
11472 BB = LastInst->getParent();
11473 return LastInst;
11474 };
11475
11476 auto FindFirstInst = [&]() {
11477 Instruction *FirstInst = Front;
11478 for (Value *V : E->Scalars) {
11479 auto *I = dyn_cast<Instruction>(Val: V);
11480 if (!I)
11481 continue;
11482 if (FirstInst->getParent() == I->getParent()) {
11483 if (I->comesBefore(Other: FirstInst))
11484 FirstInst = I;
11485 continue;
11486 }
11487 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11488 !isa<GetElementPtrInst>(I)) ||
11489 (isVectorLikeInstWithConstOps(FirstInst) &&
11490 isVectorLikeInstWithConstOps(I))) &&
11491 "Expected vector-like or non-GEP in GEP node insts only.");
11492 if (!DT->isReachableFromEntry(A: FirstInst->getParent())) {
11493 FirstInst = I;
11494 continue;
11495 }
11496 if (!DT->isReachableFromEntry(A: I->getParent()))
11497 continue;
11498 auto *NodeA = DT->getNode(BB: FirstInst->getParent());
11499 auto *NodeB = DT->getNode(BB: I->getParent());
11500 assert(NodeA && "Should only process reachable instructions");
11501 assert(NodeB && "Should only process reachable instructions");
11502 assert((NodeA == NodeB) ==
11503 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11504 "Different nodes should have different DFS numbers");
11505 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11506 FirstInst = I;
11507 }
11508 return FirstInst;
11509 };
11510
11511 // Set the insert point to the beginning of the basic block if the entry
11512 // should not be scheduled.
11513 if (doesNotNeedToSchedule(VL: E->Scalars) ||
11514 (!E->isGather() && all_of(Range: E->Scalars, P: isVectorLikeInstWithConstOps))) {
11515 if ((E->getOpcode() == Instruction::GetElementPtr &&
11516 any_of(Range: E->Scalars,
11517 P: [](Value *V) {
11518 return !isa<GetElementPtrInst>(Val: V) && isa<Instruction>(Val: V);
11519 })) ||
11520 all_of(Range: E->Scalars,
11521 P: [](Value *V) {
11522 return !isVectorLikeInstWithConstOps(V) &&
11523 isUsedOutsideBlock(V);
11524 }) ||
11525 (E->isGather() && E->Idx == 0 && all_of(Range: E->Scalars, P: [](Value *V) {
11526 return isa<ExtractElementInst, UndefValue>(Val: V) ||
11527 areAllOperandsNonInsts(V);
11528 })))
11529 Res.second = FindLastInst();
11530 else
11531 Res.second = FindFirstInst();
11532 return *Res.second;
11533 }
11534
11535 // Find the last instruction. The common case should be that BB has been
11536 // scheduled, and the last instruction is VL.back(). So we start with
11537 // VL.back() and iterate over schedule data until we reach the end of the
11538 // bundle. The end of the bundle is marked by null ScheduleData.
11539 if (BlocksSchedules.count(Key: BB)) {
11540 Value *V = E->isOneOf(Op: E->Scalars.back());
11541 if (doesNotNeedToBeScheduled(V))
11542 V = *find_if_not(Range: E->Scalars, P: doesNotNeedToBeScheduled);
11543 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11544 if (Bundle && Bundle->isPartOfBundle())
11545 for (; Bundle; Bundle = Bundle->NextInBundle)
11546 if (Bundle->OpValue == Bundle->Inst)
11547 Res.second = Bundle->Inst;
11548 }
11549
11550 // LastInst can still be null at this point if there's either not an entry
11551 // for BB in BlocksSchedules or there's no ScheduleData available for
11552 // VL.back(). This can be the case if buildTree_rec aborts for various
11553 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11554 // size is reached, etc.). ScheduleData is initialized in the scheduling
11555 // "dry-run".
11556 //
11557 // If this happens, we can still find the last instruction by brute force. We
11558 // iterate forwards from Front (inclusive) until we either see all
11559 // instructions in the bundle or reach the end of the block. If Front is the
11560 // last instruction in program order, LastInst will be set to Front, and we
11561 // will visit all the remaining instructions in the block.
11562 //
11563 // One of the reasons we exit early from buildTree_rec is to place an upper
11564 // bound on compile-time. Thus, taking an additional compile-time hit here is
11565 // not ideal. However, this should be exceedingly rare since it requires that
11566 // we both exit early from buildTree_rec and that the bundle be out-of-order
11567 // (causing us to iterate all the way to the end of the block).
11568 if (!Res.second)
11569 Res.second = FindLastInst();
11570 assert(Res.second && "Failed to find last instruction in bundle");
11571 return *Res.second;
11572}
11573
11574void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11575 auto *Front = E->getMainOp();
11576 Instruction *LastInst = &getLastInstructionInBundle(E);
11577 assert(LastInst && "Failed to find last instruction in bundle");
11578 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11579 // If the instruction is a PHI, set the insert point after all the PHIs.
11580 bool IsPHI = isa<PHINode>(Val: LastInst);
11581 if (IsPHI)
11582 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11583 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(VL: E->Scalars))) {
11584 Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: LastInstIt);
11585 } else {
11586 // Set the insertion point after the last instruction in the bundle. Set the
11587 // debug location to Front.
11588 Builder.SetInsertPoint(
11589 TheBB: LastInst->getParent(),
11590 IP: LastInst->getNextNonDebugInstruction()->getIterator());
11591 }
11592 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11593}
11594
11595Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
11596 // List of instructions/lanes from the current block and/or the blocks which
11597 // are part of the current loop. These instructions will be inserted at the
11598 // end to make it possible to optimize loops and hoist invariant instructions
11599 // out of the loop body with better chances for success.
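// For example (illustrative): when gathering {%inv0, %loop_val, %inv1} at a
// point inside a loop, the insertelement for %loop_val is emitted last, so the
// chain building the loop-invariant part can still be hoisted out of the loop.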
11600 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
11601 SmallSet<int, 4> PostponedIndices;
11602 Loop *L = LI->getLoopFor(BB: Builder.GetInsertBlock());
11603 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11604 SmallPtrSet<BasicBlock *, 4> Visited;
11605 while (InsertBB && InsertBB != InstBB && Visited.insert(Ptr: InsertBB).second)
11606 InsertBB = InsertBB->getSinglePredecessor();
11607 return InsertBB && InsertBB == InstBB;
11608 };
11609 for (int I = 0, E = VL.size(); I < E; ++I) {
11610 if (auto *Inst = dyn_cast<Instruction>(Val: VL[I]))
11611 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11612 getTreeEntry(V: Inst) ||
11613 (L && (!Root || L->isLoopInvariant(V: Root)) && L->contains(Inst))) &&
11614 PostponedIndices.insert(V: I).second)
11615 PostponedInsts.emplace_back(Args&: Inst, Args&: I);
11616 }
11617
11618 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11619 Type *Ty) {
11620 Value *Scalar = V;
11621 if (Scalar->getType() != Ty) {
11622 assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11623 "Expected integer types only.");
11624 Value *V = Scalar;
11625 if (auto *CI = dyn_cast<CastInst>(Val: Scalar);
11626 isa_and_nonnull<SExtInst, ZExtInst>(Val: CI)) {
11627 Value *Op = CI->getOperand(i_nocapture: 0);
11628 if (auto *IOp = dyn_cast<Instruction>(Val: Op);
11629 !IOp || !(isDeleted(I: IOp) || getTreeEntry(V: IOp)))
11630 V = Op;
11631 }
11632 Scalar = Builder.CreateIntCast(
11633 V, DestTy: Ty, isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
11634 }
11635
11636 Vec = Builder.CreateInsertElement(Vec, NewElt: Scalar, Idx: Builder.getInt32(C: Pos));
11637 auto *InsElt = dyn_cast<InsertElementInst>(Val: Vec);
11638 if (!InsElt)
11639 return Vec;
11640 GatherShuffleExtractSeq.insert(X: InsElt);
11641 CSEBlocks.insert(V: InsElt->getParent());
11642 // Add to our 'need-to-extract' list.
11643 if (isa<Instruction>(Val: V)) {
11644 if (TreeEntry *Entry = getTreeEntry(V)) {
11645 // Find which lane we need to extract.
11646 User *UserOp = nullptr;
11647 if (Scalar != V) {
11648 if (auto *SI = dyn_cast<Instruction>(Val: Scalar))
11649 UserOp = SI;
11650 } else {
11651 UserOp = InsElt;
11652 }
11653 if (UserOp) {
11654 unsigned FoundLane = Entry->findLaneForValue(V);
11655 ExternalUses.emplace_back(Args&: V, Args&: UserOp, Args&: FoundLane);
11656 }
11657 }
11658 }
11659 return Vec;
11660 };
11661 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
11662 Value *Vec = Root ? Root : PoisonValue::get(T: VecTy);
11663 SmallVector<int> NonConsts;
11664 // Insert constant values first.
11665 for (int I = 0, E = VL.size(); I < E; ++I) {
11666 if (PostponedIndices.contains(V: I))
11667 continue;
11668 if (!isConstant(V: VL[I])) {
11669 NonConsts.push_back(Elt: I);
11670 continue;
11671 }
11672 if (Root) {
11673 if (!isa<UndefValue>(Val: VL[I])) {
11674 NonConsts.push_back(Elt: I);
11675 continue;
11676 }
11677 if (isa<PoisonValue>(Val: VL[I]))
11678 continue;
11679 if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Root)) {
11680 if (SV->getMaskValue(Elt: I) == PoisonMaskElem)
11681 continue;
11682 }
11683 }
11684 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11685 }
11686 // Insert non-constant values.
11687 for (int I : NonConsts)
11688 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11689 // Append the instructions which are or may be part of the loop at the end,
11690 // to make it possible to hoist non-loop-based instructions.
11691 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11692 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11693
11694 return Vec;
11695}
11696
11697/// Merges shuffle masks and emits the final shuffle instruction, if required.
11698/// It supports shuffling of 2 input vectors. It implements lazy shuffle
11699/// emission: the actual shuffle instruction is generated only if it is really
11700/// required. Otherwise, the shuffle instruction emission is delayed until the
11701/// end of the process, to reduce the number of emitted instructions and
11702/// simplify further analysis/transformations.
11703/// The class will also look through the previously emitted shuffle
11704/// instructions and properly mark indices in the mask as undef.
11705/// For example, given the code
11706/// \code
11707/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11708/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11709/// \endcode
11710/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
11711/// will look through %s1 and %s2 and emit
11712/// \code
11713/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11714/// \endcode
11715/// instead.
11716/// If the 2 operands are of different sizes, the smaller one will be resized
11717/// and the mask recalculated accordingly.
11718/// For example, given the code
11719/// \code
11720/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11721/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11722/// \endcode
11723/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
11724/// will look through %s1 and %s2 and emit
11725/// \code
11726/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11727/// \endcode
11728/// instead.
11729class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11730 bool IsFinalized = false;
11731 /// Combined mask for all applied operands and masks. It is built during
11732 /// analysis and actual emission of shuffle vector instructions.
11733 SmallVector<int> CommonMask;
11734 /// List of operands for the shuffle vector instruction. It holds at most 2
11735 /// operands. If the 3rd one is going to be added, the first 2 are combined
11736 /// into a shuffle with the \p CommonMask mask, the first operand is set to
11737 /// the resulting shuffle and the second operand is set to the newly added
11738 /// operand. The \p CommonMask is transformed in the proper way after that.
11739 SmallVector<Value *, 2> InVectors;
11740 Type *ScalarTy = nullptr;
11741 IRBuilderBase &Builder;
11742 BoUpSLP &R;
11743
11744 class ShuffleIRBuilder {
11745 IRBuilderBase &Builder;
11746 /// Holds all of the instructions that we gathered.
11747 SetVector<Instruction *> &GatherShuffleExtractSeq;
11748 /// A list of blocks that we are going to CSE.
11749 DenseSet<BasicBlock *> &CSEBlocks;
11750 /// Data layout.
11751 const DataLayout &DL;
11752
11753 public:
11754 ShuffleIRBuilder(IRBuilderBase &Builder,
11755 SetVector<Instruction *> &GatherShuffleExtractSeq,
11756 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11757 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11758 CSEBlocks(CSEBlocks), DL(DL) {}
11759 ~ShuffleIRBuilder() = default;
11760 /// Creates shufflevector for the 2 operands with the given mask.
11761 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11762 if (V1->getType() != V2->getType()) {
11763 assert(V1->getType()->isIntOrIntVectorTy() &&
11764 V1->getType()->isIntOrIntVectorTy() &&
11765 "Expected integer vector types only.");
11766 if (V1->getType() != V2->getType()) {
11767 if (cast<VectorType>(Val: V2->getType())
11768 ->getElementType()
11769 ->getIntegerBitWidth() < cast<VectorType>(Val: V1->getType())
11770 ->getElementType()
11771 ->getIntegerBitWidth())
11772 V2 = Builder.CreateIntCast(
11773 V: V2, DestTy: V1->getType(), isSigned: !isKnownNonNegative(V: V2, SQ: SimplifyQuery(DL)));
11774 else
11775 V1 = Builder.CreateIntCast(
11776 V: V1, DestTy: V2->getType(), isSigned: !isKnownNonNegative(V: V1, SQ: SimplifyQuery(DL)));
11777 }
11778 }
11779 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11780 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
11781 GatherShuffleExtractSeq.insert(X: I);
11782 CSEBlocks.insert(V: I->getParent());
11783 }
11784 return Vec;
11785 }
11786 /// Creates a permutation of the single vector operand with the given mask,
11787 /// if it is not an identity mask.
11788 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11789 if (Mask.empty())
11790 return V1;
11791 unsigned VF = Mask.size();
11792 unsigned LocalVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
11793 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))
11794 return V1;
11795 Value *Vec = Builder.CreateShuffleVector(V: V1, Mask);
11796 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
11797 GatherShuffleExtractSeq.insert(X: I);
11798 CSEBlocks.insert(V: I->getParent());
11799 }
11800 return Vec;
11801 }
11802 Value *createIdentity(Value *V) { return V; }
11803 Value *createPoison(Type *Ty, unsigned VF) {
11804 return PoisonValue::get(T: getWidenedType(ScalarTy: Ty, VF));
11805 }
11806 /// Resizes the 2 input vectors to match in size, if they are not equal
11807 /// yet. The smaller vector is resized to the size of the larger vector.
11808 void resizeToMatch(Value *&V1, Value *&V2) {
11809 if (V1->getType() == V2->getType())
11810 return;
11811 int V1VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
11812 int V2VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
11813 int VF = std::max(a: V1VF, b: V2VF);
11814 int MinVF = std::min(a: V1VF, b: V2VF);
11815 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11816 std::iota(first: IdentityMask.begin(), last: std::next(x: IdentityMask.begin(), n: MinVF),
11817 value: 0);
11818 Value *&Op = MinVF == V1VF ? V1 : V2;
11819 Op = Builder.CreateShuffleVector(V: Op, Mask: IdentityMask);
11820 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
11821 GatherShuffleExtractSeq.insert(X: I);
11822 CSEBlocks.insert(V: I->getParent());
11823 }
11824 if (MinVF == V1VF)
11825 V1 = Op;
11826 else
11827 V2 = Op;
11828 }
11829 };
11830
11831 /// Smart shuffle instruction emission; walks through the shuffle trees and
11832 /// tries to find the best matching vector for the actual shuffle
11833 /// instruction.
11834 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11835 assert(V1 && "Expected at least one vector value.");
11836 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11837 R.CSEBlocks, *R.DL);
11838 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11839 Builder&: ShuffleBuilder);
11840 }
11841
11842 /// Transforms \p CommonMask per the given \p Mask to keep it a proper mask
11843 /// after the shuffle has been emitted.
11844 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11845 ArrayRef<int> Mask) {
11846 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11847 if (Mask[Idx] != PoisonMaskElem)
11848 CommonMask[Idx] = Idx;
11849 }
11850
11851 /// Cast value \p V to the vector type with the same number of elements, but
11852 /// the base type \p ScalarTy.
11853 Value *castToScalarTyElem(Value *V,
11854 std::optional<bool> IsSigned = std::nullopt) {
11855 auto *VecTy = cast<VectorType>(Val: V->getType());
11856 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
11857 if (VecTy->getElementType() == ScalarTy->getScalarType())
11858 return V;
11859 return Builder.CreateIntCast(
11860 V, DestTy: VectorType::get(ElementType: ScalarTy->getScalarType(), EC: VecTy->getElementCount()),
11861 isSigned: IsSigned.value_or(u: !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL))));
11862 }
11863
11864public:
11865 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
11866 : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
11867
11868 /// Adjusts extractelements after reusing them.
11869 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11870 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11871 unsigned NumParts, bool &UseVecBaseAsInput) {
11872 UseVecBaseAsInput = false;
11873 SmallPtrSet<Value *, 4> UniqueBases;
11874 Value *VecBase = nullptr;
11875 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11876 int Idx = Mask[I];
11877 if (Idx == PoisonMaskElem)
11878 continue;
11879 auto *EI = cast<ExtractElementInst>(Val: E->Scalars[I]);
11880 VecBase = EI->getVectorOperand();
11881 if (const TreeEntry *TE = R.getTreeEntry(V: VecBase))
11882 VecBase = TE->VectorizedValue;
11883 assert(VecBase && "Expected vectorized value.");
11884 UniqueBases.insert(Ptr: VecBase);
11885 // If the only use is vectorized - the extractelement itself can be
11886 // deleted.
11887 if (!EI->hasOneUse() || (NumParts != 1 && count(Range: E->Scalars, Element: EI) > 1) ||
11888 any_of(Range: EI->users(), P: [&](User *U) {
11889 const TreeEntry *UTE = R.getTreeEntry(V: U);
11890 return !UTE || R.MultiNodeScalars.contains(Val: U) ||
11891 (isa<GetElementPtrInst>(Val: U) &&
11892 !R.areAllUsersVectorized(I: cast<Instruction>(Val: U))) ||
11893 count_if(Range&: R.VectorizableTree,
11894 P: [&](const std::unique_ptr<TreeEntry> &TE) {
11895 return any_of(Range&: TE->UserTreeIndices,
11896 P: [&](const EdgeInfo &Edge) {
11897 return Edge.UserTE == UTE;
11898 }) &&
11899 is_contained(Range&: TE->Scalars, Element: EI);
11900 }) != 1;
11901 }))
11902 continue;
11903 R.eraseInstruction(I: EI);
11904 }
11905 if (NumParts == 1 || UniqueBases.size() == 1) {
11906 assert(VecBase && "Expected vectorized value.");
11907 return castToScalarTyElem(V: VecBase);
11908 }
11909 UseVecBaseAsInput = true;
11910 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11911 for (auto [I, Idx] : enumerate(First&: Mask))
11912 if (Idx != PoisonMaskElem)
11913 Idx = I;
11914 };
11915 // Perform a multi-register vector shuffle, joining the parts into a single
11916 // virtual long vector.
11917 // Each part needs to be shuffled independently and then all the parts are
11918 // inserted into a long virtual vector register, forming the original vector.
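// For example (illustrative): with NumParts == 2 and 4 scalars per part, each
// part is handled on its own (a shuffle of at most 2 bases) and the parts are
// then combined by one more shuffle into the full-width vector.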
11919 Value *Vec = nullptr;
11920 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11921 unsigned SliceSize = getPartNumElems(Size: E->Scalars.size(), NumParts);
11922 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
11923 unsigned Limit = getNumElems(Size: E->Scalars.size(), PartNumElems: SliceSize, Part);
11924 ArrayRef<Value *> VL =
11925 ArrayRef(E->Scalars).slice(N: Part * SliceSize, M: Limit);
11926 MutableArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
11927 constexpr int MaxBases = 2;
11928 SmallVector<Value *, MaxBases> Bases(MaxBases);
11929 auto VLMask = zip(t&: VL, u&: SubMask);
11930 const unsigned VF = std::accumulate(
11931 first: VLMask.begin(), last: VLMask.end(), init: 0U, binary_op: [&](unsigned S, const auto &D) {
11932 if (std::get<1>(D) == PoisonMaskElem)
11933 return S;
11934 Value *VecOp =
11935 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
11936 if (const TreeEntry *TE = R.getTreeEntry(V: VecOp))
11937 VecOp = TE->VectorizedValue;
11938 assert(VecOp && "Expected vectorized value.");
11939 const unsigned Size =
11940 cast<FixedVectorType>(Val: VecOp->getType())->getNumElements();
11941 return std::max(a: S, b: Size);
11942 });
11943 for (const auto [V, I] : VLMask) {
11944 if (I == PoisonMaskElem)
11945 continue;
11946 Value *VecOp = cast<ExtractElementInst>(Val: V)->getVectorOperand();
11947 if (const TreeEntry *TE = R.getTreeEntry(V: VecOp))
11948 VecOp = TE->VectorizedValue;
11949 assert(VecOp && "Expected vectorized value.");
11950 VecOp = castToScalarTyElem(V: VecOp);
11951 Bases[I / VF] = VecOp;
11952 }
11953 if (!Bases.front())
11954 continue;
11955 Value *SubVec;
11956 if (Bases.back()) {
11957 SubVec = createShuffle(V1: Bases.front(), V2: Bases.back(), Mask: SubMask);
11958 TransformToIdentity(SubMask);
11959 } else {
11960 SubVec = Bases.front();
11961 }
11962 if (!Vec) {
11963 Vec = SubVec;
11964 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11965 [&](unsigned P) {
11966 ArrayRef<int> SubMask =
11967 Mask.slice(P * SliceSize,
11968 getNumElems(Mask.size(),
11969 SliceSize, P));
11970 return all_of(SubMask, [](int Idx) {
11971 return Idx == PoisonMaskElem;
11972 });
11973 })) &&
11974 "Expected first part or all previous parts masked.");
11975 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
11976 } else {
11977 unsigned NewVF =
11978 cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
11979 if (Vec->getType() != SubVec->getType()) {
11980 unsigned SubVecVF =
11981 cast<FixedVectorType>(Val: SubVec->getType())->getNumElements();
11982 NewVF = std::max(a: NewVF, b: SubVecVF);
11983 }
11984 // Adjust SubMask.
11985 for (int &Idx : SubMask)
11986 if (Idx != PoisonMaskElem)
11987 Idx += NewVF;
11988 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
11989 Vec = createShuffle(V1: Vec, V2: SubVec, Mask: VecMask);
11990 TransformToIdentity(VecMask);
11991 }
11992 }
11993 copy(Range&: VecMask, Out: Mask.begin());
11994 return Vec;
11995 }
11996 /// Checks if the specified entry \p E needs to be delayed because of its
11997 /// dependency nodes.
11998 std::optional<Value *>
11999 needToDelay(const TreeEntry *E,
12000 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
12001 // No need to delay emission if all deps are ready.
12002 if (all_of(Range&: Deps, P: [](ArrayRef<const TreeEntry *> TEs) {
12003 return all_of(
12004 Range&: TEs, P: [](const TreeEntry *TE) { return TE->VectorizedValue; });
12005 }))
12006 return std::nullopt;
12007 // Postpone the gather emission; it will be emitted after the end of the
12008 // process to keep the correct order.
12009 auto *ResVecTy = getWidenedType(ScalarTy, VF: E->getVectorFactor());
12010 return Builder.CreateAlignedLoad(
12011 Ty: ResVecTy,
12012 Ptr: PoisonValue::get(T: PointerType::getUnqual(C&: ScalarTy->getContext())),
12013 Align: MaybeAlign());
12014 }
12015 /// Adds 2 input vectors (in the form of tree entries) and the mask for their
12016 /// shuffling.
12017 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
12018 Value *V1 = E1.VectorizedValue;
12019 if (V1->getType()->isIntOrIntVectorTy())
12020 V1 = castToScalarTyElem(V: V1, IsSigned: any_of(Range: E1.Scalars, P: [&](Value *V) {
12021 return !isKnownNonNegative(
12022 V, SQ: SimplifyQuery(*R.DL));
12023 }));
12024 Value *V2 = E2.VectorizedValue;
12025 if (V2->getType()->isIntOrIntVectorTy())
12026 V2 = castToScalarTyElem(V: V2, IsSigned: any_of(Range: E2.Scalars, P: [&](Value *V) {
12027 return !isKnownNonNegative(
12028 V, SQ: SimplifyQuery(*R.DL));
12029 }));
12030 add(V1, V2, Mask);
12031 }
12032 /// Adds a single input vector (in the form of a tree entry) and the mask for
12033 /// its shuffling.
12034 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
12035 Value *V1 = E1.VectorizedValue;
12036 if (V1->getType()->isIntOrIntVectorTy())
12037 V1 = castToScalarTyElem(V: V1, IsSigned: any_of(Range: E1.Scalars, P: [&](Value *V) {
12038 return !isKnownNonNegative(
12039 V, SQ: SimplifyQuery(*R.DL));
12040 }));
12041 add(V1, Mask);
12042 }
12043 /// Adds 2 input vectors and the mask for their shuffling.
12044 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
12045 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
12046 V1 = castToScalarTyElem(V: V1);
12047 V2 = castToScalarTyElem(V: V2);
12048 if (InVectors.empty()) {
12049 InVectors.push_back(Elt: V1);
12050 InVectors.push_back(Elt: V2);
12051 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
12052 return;
12053 }
12054 Value *Vec = InVectors.front();
12055 if (InVectors.size() == 2) {
12056 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
12057 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12058 } else if (cast<FixedVectorType>(Val: Vec->getType())->getNumElements() !=
12059 Mask.size()) {
12060 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
12061 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12062 }
12063 V1 = createShuffle(V1, V2, Mask);
12064 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12065 if (Mask[Idx] != PoisonMaskElem)
12066 CommonMask[Idx] = Idx + Sz;
12067 InVectors.front() = Vec;
12068 if (InVectors.size() == 2)
12069 InVectors.back() = V1;
12070 else
12071 InVectors.push_back(Elt: V1);
12072 }
12073 /// Adds one more input vector and the mask for its shuffling.
12074 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
12075 V1 = castToScalarTyElem(V: V1);
12076 if (InVectors.empty()) {
12077 if (!isa<FixedVectorType>(Val: V1->getType())) {
12078 V1 = createShuffle(V1, V2: nullptr, Mask: CommonMask);
12079 CommonMask.assign(NumElts: Mask.size(), Elt: PoisonMaskElem);
12080 transformMaskAfterShuffle(CommonMask, Mask);
12081 }
12082 InVectors.push_back(Elt: V1);
12083 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
12084 return;
12085 }
12086 const auto *It = find(Range&: InVectors, Val: V1);
12087 if (It == InVectors.end()) {
12088 if (InVectors.size() == 2 ||
12089 InVectors.front()->getType() != V1->getType() ||
12090 !isa<FixedVectorType>(Val: V1->getType())) {
12091 Value *V = InVectors.front();
12092 if (InVectors.size() == 2) {
12093 V = createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
12094 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12095 } else if (cast<FixedVectorType>(Val: V->getType())->getNumElements() !=
12096 CommonMask.size()) {
12097 V = createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
12098 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
12099 }
12100 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12101 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
12102 CommonMask[Idx] =
12103 V->getType() != V1->getType()
12104 ? Idx + Sz
12105 : Mask[Idx] + cast<FixedVectorType>(Val: V1->getType())
12106 ->getNumElements();
12107 if (V->getType() != V1->getType())
12108 V1 = createShuffle(V1, V2: nullptr, Mask);
12109 InVectors.front() = V;
12110 if (InVectors.size() == 2)
12111 InVectors.back() = V1;
12112 else
12113 InVectors.push_back(Elt: V1);
12114 return;
12115 }
12116 // Check if the second vector is really required: it is needed only if some
12117 // used element is not already provided by the first one.
12118 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12119 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
12120 InVectors.push_back(Elt: V1);
12121 break;
12122 }
12123 }
12124 int VF = CommonMask.size();
12125 if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
12126 VF = FTy->getNumElements();
12127 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12128 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
12129 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
12130 }
12131 /// Adds one more input vector and the mask for its shuffling.
12132 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
12133 SmallVector<int> NewMask;
12134 inversePermutation(Indices: Order, Mask&: NewMask);
12135 add(V1, Mask: NewMask);
12136 }
12137 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
12138 Value *Root = nullptr) {
12139 return R.gather(VL, Root, ScalarTy);
12140 }
12141 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
12142 /// Finalize emission of the shuffles.
12143 /// \param Action the action (if any) to be performed before the final
12144 /// application of the \p ExtMask mask.
12145 Value *
12146 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
12147 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
12148 IsFinalized = true;
12149 if (Action) {
12150 Value *Vec = InVectors.front();
12151 if (InVectors.size() == 2) {
12152 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
12153 InVectors.pop_back();
12154 } else {
12155 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
12156 }
12157 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
12158 if (CommonMask[Idx] != PoisonMaskElem)
12159 CommonMask[Idx] = Idx;
12160 assert(VF > 0 &&
12161 "Expected vector length for the final value before action.");
12162 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
12163 if (VecVF < VF) {
12164 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
12165 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
12166 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: ResizeMask);
12167 }
12168 Action(Vec, CommonMask);
12169 InVectors.front() = Vec;
12170 }
12171 if (!ExtMask.empty()) {
12172 if (CommonMask.empty()) {
12173 CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
12174 } else {
12175 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12176 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12177 if (ExtMask[I] == PoisonMaskElem)
12178 continue;
12179 NewMask[I] = CommonMask[ExtMask[I]];
12180 }
12181 CommonMask.swap(RHS&: NewMask);
12182 }
12183 }
12184 if (CommonMask.empty()) {
12185 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
12186 return InVectors.front();
12187 }
12188 if (InVectors.size() == 2)
12189 return createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
12190 return createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
12191 }
12192
12193 ~ShuffleInstructionBuilder() {
12194 assert((IsFinalized || CommonMask.empty()) &&
12195 "Shuffle construction must be finalized.");
12196 }
12197};
12198
12199Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
12200 bool PostponedPHIs) {
12201 ValueList &VL = E->getOperand(OpIdx: NodeIdx);
12202 const unsigned VF = VL.size();
12203 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
12204 // Special processing for a GEP bundle, which may include non-GEP values.
12205 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
12206 const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
12207 if (It != VL.end())
12208 S = getSameOpcode(VL: *It, TLI: *TLI);
12209 }
12210 if (S.getOpcode()) {
12211 auto CheckSameVE = [&](const TreeEntry *VE) {
12212 return VE->isSame(VL) &&
12213 (any_of(Range: VE->UserTreeIndices,
12214 P: [E, NodeIdx](const EdgeInfo &EI) {
12215 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12216 }) ||
12217 any_of(Range&: VectorizableTree,
12218 P: [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12219 return TE->isOperandGatherNode(UserEI: {E, NodeIdx}) &&
12220 VE->isSame(VL: TE->Scalars);
12221 }));
12222 };
12223 TreeEntry *VE = getTreeEntry(V: S.OpValue);
12224 bool IsSameVE = VE && CheckSameVE(VE);
12225 if (!IsSameVE) {
12226 auto It = MultiNodeScalars.find(Val: S.OpValue);
12227 if (It != MultiNodeScalars.end()) {
12228 auto *I = find_if(Range&: It->getSecond(), P: [&](const TreeEntry *TE) {
12229 return TE != VE && CheckSameVE(TE);
12230 });
12231 if (I != It->getSecond().end()) {
12232 VE = *I;
12233 IsSameVE = true;
12234 }
12235 }
12236 }
12237 if (IsSameVE) {
12238 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12239 ShuffleInstructionBuilder ShuffleBuilder(
12240 cast<VectorType>(Val: V->getType())->getElementType(), Builder, *this);
12241 ShuffleBuilder.add(V1: V, Mask);
12242 return ShuffleBuilder.finalize(ExtMask: std::nullopt);
12243 };
12244 Value *V = vectorizeTree(E: VE, PostponedPHIs);
12245 if (VF * getNumElements(Ty: VL[0]->getType()) !=
12246 cast<FixedVectorType>(Val: V->getType())->getNumElements()) {
12247 if (!VE->ReuseShuffleIndices.empty()) {
12248 // Reshuffle to get only unique values.
12249 // If some of the scalars are duplicated in the vectorization
12250 // tree entry, we do not vectorize them but instead generate a
12251 // mask for the reuses. But if there are several users of the
12252 // same entry, they may have different vectorization factors.
12253 // This is especially important for PHI nodes. In this case, we
12254 // need to adapt the resulting instruction for the user
12255 // vectorization factor and have to reshuffle it again to take
12256 // only unique elements of the vector. Without this code the
12257 // function incorrectly returns a reduced vector instruction with
12258 // the same elements, not with the unique ones.
12259
12260 // block:
12261 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12262 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12263 // ... (use %2)
12264 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12265 // br %block
12266 SmallVector<int> Mask(VF, PoisonMaskElem);
12267 for (auto [I, V] : enumerate(First&: VL)) {
12268 if (isa<PoisonValue>(Val: V))
12269 continue;
12270 Mask[I] = VE->findLaneForValue(V);
12271 }
12272 V = FinalShuffle(V, Mask);
12273 } else {
12274 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12275 "Expected vectorization factor less "
12276 "than original vector size.");
12277 SmallVector<int> UniformMask(VF, 0);
12278 std::iota(first: UniformMask.begin(), last: UniformMask.end(), value: 0);
12279 V = FinalShuffle(V, UniformMask);
12280 }
12281 }
12282 // Update the operand gather node if the operand is not itself a
12283 // vectorized node but a buildvector/gather node that matches one of
12284 // the vectorized nodes.
12285 if (find_if(Range&: VE->UserTreeIndices, P: [&](const EdgeInfo &EI) {
12286 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12287 }) == VE->UserTreeIndices.end()) {
12288 auto *It = find_if(
12289 Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
12290 return TE->isGather() &&
12291 TE->UserTreeIndices.front().UserTE == E &&
12292 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12293 });
12294 assert(It != VectorizableTree.end() && "Expected gather node operand.");
12295 (*It)->VectorizedValue = V;
12296 }
12297 return V;
12298 }
12299 }
12300
12301 // Find the corresponding gather entry and vectorize it.
12302 // This keeps tree/graph transformations more precise and, in many cases,
12303 // acts as a correctness check for those transformations.
12304 auto *I = find_if(Range&: VectorizableTree,
12305 P: [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
12306 return TE->isOperandGatherNode(UserEI: {E, NodeIdx});
12307 });
12308 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
12309 assert(I->get()->UserTreeIndices.size() == 1 &&
12310 "Expected only single user for the gather node.");
12311 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
12312 return vectorizeTree(E: I->get(), PostponedPHIs);
12313}
12314
12315template <typename BVTy, typename ResTy, typename... Args>
12316ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
12317 Args &...Params) {
12318 assert(E->isGather() && "Expected gather node.");
12319 unsigned VF = E->getVectorFactor();
12320
12321 bool NeedFreeze = false;
12322 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
12323 E->ReuseShuffleIndices.end());
12324 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
12325 // Build a mask out of the reorder indices and reorder scalars per this
12326 // mask.
12327 SmallVector<int> ReorderMask;
12328 inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
12329 if (!ReorderMask.empty())
12330 reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
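  // Editorial example (assumes inversePermutation builds
  // Mask[Indices[I]] = I; illustrative values only): ReorderIndices = {2, 0, 1}
  // yields ReorderMask = {1, 2, 0}, and reorderScalars then places each
  // gathered scalar into the lane the mask assigns to it.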
12331 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
12332 unsigned I, unsigned SliceSize) {
12333 if (!isSplat(VL: E->Scalars) || none_of(E->Scalars, [](Value *V) {
12334 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
12335 }))
12336 return false;
12337 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12338 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12339 if (UserTE->getNumOperands() != 2)
12340 return false;
12341 auto *It =
12342 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
12343 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
12344 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12345 }) != TE->UserTreeIndices.end();
12346 });
12347 if (It == VectorizableTree.end())
12348 return false;
12349 int Idx;
12350 if ((Mask.size() < InputVF &&
12351 ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: InputVF, Index&: Idx) &&
12352 Idx == 0) ||
12353 (Mask.size() == InputVF &&
12354 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))) {
12355 std::iota(
12356 first: std::next(x: Mask.begin(), n: I * SliceSize),
12357 last: std::next(x: Mask.begin(),
12358 n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
12359 value: 0);
12360 } else {
12361 unsigned IVal =
12362 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
12363 std::fill(
12364 std::next(x: Mask.begin(), n: I * SliceSize),
12365 std::next(x: Mask.begin(),
12366 n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
12367 IVal);
12368 }
12369 return true;
12370 };
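  // Editorial sketch of FindReusedSplat (illustrative masks): for a splat
  // operand, if the current mask is an extract-subvector mask at offset 0
  // (e.g. {0, 1, poison, poison} with InputVF = 8) or a full identity mask,
  // the slice for part I is rewritten to the identity prefix {0, 1, 2, ...};
  // otherwise every lane of the slice is filled with the single reused lane,
  // e.g. {3, poison, 3, poison} becomes {3, 3, 3, 3}.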
12371 BVTy ShuffleBuilder(ScalarTy, Params...);
12372 ResTy Res = ResTy();
12373 SmallVector<int> Mask;
12374 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
12375 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
12376 Value *ExtractVecBase = nullptr;
12377 bool UseVecBaseAsInput = false;
12378 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
12379 SmallVector<SmallVector<const TreeEntry *>> Entries;
12380 Type *OrigScalarTy = GatheredScalars.front()->getType();
12381 auto *VecTy = getWidenedType(ScalarTy, VF: GatheredScalars.size());
12382 unsigned NumParts = TTI->getNumberOfParts(Tp: VecTy);
12383 if (NumParts == 0 || NumParts >= GatheredScalars.size())
12384 NumParts = 1;
12385 if (!all_of(Range&: GatheredScalars, P: IsaPred<UndefValue>)) {
12386 // Check for gathered extracts.
12387 bool Resized = false;
12388 ExtractShuffles =
12389 tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
12390 if (!ExtractShuffles.empty()) {
12391 SmallVector<const TreeEntry *> ExtractEntries;
12392 for (auto [Idx, I] : enumerate(First&: ExtractMask)) {
12393 if (I == PoisonMaskElem)
12394 continue;
12395 if (const auto *TE = getTreeEntry(
12396 V: cast<ExtractElementInst>(Val: E->Scalars[Idx])->getVectorOperand()))
12397 ExtractEntries.push_back(Elt: TE);
12398 }
12399 if (std::optional<ResTy> Delayed =
12400 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12401 // Delay emission of gathers which are not ready yet.
12402 PostponedGathers.insert(X: E);
12403 // Postpone gather emission; it will be emitted after the end of the
12404 // process to keep the correct order.
12405 return *Delayed;
12406 }
12407 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
12408 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12409 ExtractVecBase = VecBase;
12410 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(Val: VecBase->getType()))
12411 if (VF == VecBaseTy->getNumElements() &&
12412 GatheredScalars.size() != VF) {
12413 Resized = true;
12414 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
12415 Elt: PoisonValue::get(T: OrigScalarTy));
12416 }
12417 }
12418 }
12419 // Gather extracts only after we have checked for fully matched gathers.
12420 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
12421 E->isAltShuffle() ||
12422 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
12423 isSplat(VL: E->Scalars) ||
12424 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12425 GatherShuffles =
12426 isGatherShuffledEntry(TE: E, VL: GatheredScalars, Mask, Entries, NumParts);
12427 }
12428 if (!GatherShuffles.empty()) {
12429 if (std::optional<ResTy> Delayed =
12430 ShuffleBuilder.needToDelay(E, Entries)) {
12431 // Delay emission of gathers which are not ready yet.
12432 PostponedGathers.insert(X: E);
12433 // Postpone gather emission; it will be emitted after the end of the
12434 // process to keep the correct order.
12435 return *Delayed;
12436 }
12437 if (GatherShuffles.size() == 1 &&
12438 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
12439 Entries.front().front()->isSame(VL: E->Scalars)) {
12440 // Perfect match in the graph, will reuse the previously vectorized
12441 // node. Cost is 0.
12442 LLVM_DEBUG(
12443 dbgs()
12444 << "SLP: perfect diamond match for gather bundle "
12445 << shortBundleName(E->Scalars) << ".\n");
12446 // Restore the mask for previous partially matched values.
12447 Mask.resize(N: E->Scalars.size());
12448 const TreeEntry *FrontTE = Entries.front().front();
12449 if (FrontTE->ReorderIndices.empty() &&
12450 ((FrontTE->ReuseShuffleIndices.empty() &&
12451 E->Scalars.size() == FrontTE->Scalars.size()) ||
12452 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12453 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
12454 } else {
12455 for (auto [I, V] : enumerate(First: E->Scalars)) {
12456 if (isa<PoisonValue>(Val: V)) {
12457 Mask[I] = PoisonMaskElem;
12458 continue;
12459 }
12460 Mask[I] = FrontTE->findLaneForValue(V);
12461 }
12462 }
12463 ShuffleBuilder.add(*FrontTE, Mask);
12464 Res = ShuffleBuilder.finalize(E->getCommonMask());
12465 return Res;
12466 }
12467 if (!Resized) {
12468 if (GatheredScalars.size() != VF &&
12469 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
12470 return any_of(TEs, [&](const TreeEntry *TE) {
12471 return TE->getVectorFactor() == VF;
12472 });
12473 }))
12474 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
12475 Elt: PoisonValue::get(T: OrigScalarTy));
12476 }
12477 // Remove shuffled elements from list of gathers.
12478 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12479 if (Mask[I] != PoisonMaskElem)
12480 GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
12481 }
12482 }
12483 }
12484 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
12485 SmallVectorImpl<int> &ReuseMask,
12486 bool IsRootPoison) {
12487 // For splats we can emit broadcasts instead of gathers, so try to find
12488 // such sequences.
12489 bool IsSplat = IsRootPoison && isSplat(VL: Scalars) &&
12490 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
12491 Scalars.append(NumInputs: VF - Scalars.size(), Elt: PoisonValue::get(T: OrigScalarTy));
12492 SmallVector<int> UndefPos;
12493 DenseMap<Value *, unsigned> UniquePositions;
12494 // Gather unique non-const values and all constant values.
12495 // For repeated values, just shuffle them.
12496 int NumNonConsts = 0;
12497 int SinglePos = 0;
12498 for (auto [I, V] : enumerate(First&: Scalars)) {
12499 if (isa<UndefValue>(Val: V)) {
12500 if (!isa<PoisonValue>(Val: V)) {
12501 ReuseMask[I] = I;
12502 UndefPos.push_back(Elt: I);
12503 }
12504 continue;
12505 }
12506 if (isConstant(V)) {
12507 ReuseMask[I] = I;
12508 continue;
12509 }
12510 ++NumNonConsts;
12511 SinglePos = I;
12512 Value *OrigV = V;
12513 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
12514 if (IsSplat) {
12515 Scalars.front() = OrigV;
12516 ReuseMask[I] = 0;
12517 } else {
12518 const auto Res = UniquePositions.try_emplace(Key: OrigV, Args&: I);
12519 Scalars[Res.first->second] = OrigV;
12520 ReuseMask[I] = Res.first->second;
12521 }
12522 }
12523 if (NumNonConsts == 1) {
12524 // Restore single insert element.
12525 if (IsSplat) {
12526 ReuseMask.assign(NumElts: VF, Elt: PoisonMaskElem);
12527 std::swap(a&: Scalars.front(), b&: Scalars[SinglePos]);
12528 if (!UndefPos.empty() && UndefPos.front() == 0)
12529 Scalars.front() = UndefValue::get(T: OrigScalarTy);
12530 }
12531 ReuseMask[SinglePos] = SinglePos;
12532 } else if (!UndefPos.empty() && IsSplat) {
12533 // For undef values, try to replace them with a simple broadcast.
12534 // We can do this if the broadcast value is guaranteed to be
12535 // non-poisonous, or otherwise by freezing the incoming scalar value first.
12536 auto *It = find_if(Scalars, [this, E](Value *V) {
12537 return !isa<UndefValue>(Val: V) &&
12538 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
12539 (E->UserTreeIndices.size() == 1 &&
12540 any_of(V->uses(), [E](const Use &U) {
12541 // Check if the value is already used in the same operation in
12542 // one of the other nodes.
12543 return E->UserTreeIndices.front().EdgeIdx !=
12544 U.getOperandNo() &&
12545 is_contained(
12546 Range&: E->UserTreeIndices.front().UserTE->Scalars,
12547 Element: U.getUser());
12548 })));
12549 });
12550 if (It != Scalars.end()) {
12551 // Replace undefs by the non-poisoned scalars and emit broadcast.
12552 int Pos = std::distance(Scalars.begin(), It);
12553 for (int I : UndefPos) {
12554 // Set the undef position to the non-poisoned scalar.
12555 ReuseMask[I] = Pos;
12556 // Replace the undef by poison; in the mask it has already been
12557 // replaced by the non-poisoned scalar.
12558 if (I != Pos)
12559 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
12560 }
12561 } else {
12562 // Replace undefs by poison, emit the broadcast and then emit a
12563 // freeze.
12564 for (int I : UndefPos) {
12565 ReuseMask[I] = PoisonMaskElem;
12566 if (isa<UndefValue>(Val: Scalars[I]))
12567 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
12568 }
12569 NeedFreeze = true;
12570 }
12571 }
12572 };
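  // Editorial example of TryPackScalars (illustrative values): for
  // non-constant, non-splat scalars {%a, %b, %a, %c} with VF = 4 the packed
  // vector becomes {%a, %b, poison, %c} and ReuseMask = {0, 1, 0, 3}; the
  // duplicated %a is re-read from lane 0 via the mask instead of being
  // inserted twice.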
12573 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12574 bool IsNonPoisoned = true;
12575 bool IsUsedInExpr = true;
12576 Value *Vec1 = nullptr;
12577 if (!ExtractShuffles.empty()) {
12578 // A gather of extractelements can be represented as just a shuffle of
12579 // the one or two vectors the scalars are extracted from.
12580 // Find input vectors.
12581 Value *Vec2 = nullptr;
12582 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12583 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12584 ExtractMask[I] = PoisonMaskElem;
12585 }
12586 if (UseVecBaseAsInput) {
12587 Vec1 = ExtractVecBase;
12588 } else {
12589 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12590 if (ExtractMask[I] == PoisonMaskElem)
12591 continue;
12592 if (isa<UndefValue>(Val: E->Scalars[I]))
12593 continue;
12594 auto *EI = cast<ExtractElementInst>(Val: E->Scalars[I]);
12595 Value *VecOp = EI->getVectorOperand();
12596 if (const auto *TE = getTreeEntry(V: VecOp))
12597 if (TE->VectorizedValue)
12598 VecOp = TE->VectorizedValue;
12599 if (!Vec1) {
12600 Vec1 = VecOp;
12601 } else if (Vec1 != VecOp) {
12602 assert((!Vec2 || Vec2 == VecOp) &&
12603 "Expected only 1 or 2 vectors shuffle.");
12604 Vec2 = VecOp;
12605 }
12606 }
12607 }
12608 if (Vec2) {
12609 IsUsedInExpr = false;
12610 IsNonPoisoned &=
12611 isGuaranteedNotToBePoison(V: Vec1) && isGuaranteedNotToBePoison(V: Vec2);
12612 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12613 } else if (Vec1) {
12614 IsUsedInExpr &= FindReusedSplat(
12615 ExtractMask,
12616 cast<FixedVectorType>(Val: Vec1->getType())->getNumElements(), 0,
12617 ExtractMask.size());
12618 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12619 IsNonPoisoned &= isGuaranteedNotToBePoison(V: Vec1);
12620 } else {
12621 IsUsedInExpr = false;
12622 ShuffleBuilder.add(PoisonValue::get(T: VecTy), ExtractMask,
12623 /*ForExtracts=*/true);
12624 }
12625 }
12626 if (!GatherShuffles.empty()) {
12627 unsigned SliceSize = getPartNumElems(Size: E->Scalars.size(), NumParts);
12628 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12629 for (const auto [I, TEs] : enumerate(First&: Entries)) {
12630 if (TEs.empty()) {
12631 assert(!GatherShuffles[I] &&
12632 "No shuffles with empty entries list expected.");
12633 continue;
12634 }
12635 assert((TEs.size() == 1 || TEs.size() == 2) &&
12636 "Expected shuffle of 1 or 2 entries.");
12637 unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I);
12638 auto SubMask = ArrayRef(Mask).slice(N: I * SliceSize, M: Limit);
12639 VecMask.assign(NumElts: VecMask.size(), Elt: PoisonMaskElem);
12640 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: I * SliceSize));
12641 if (TEs.size() == 1) {
12642 IsUsedInExpr &= FindReusedSplat(
12643 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12644 ShuffleBuilder.add(*TEs.front(), VecMask);
12645 if (TEs.front()->VectorizedValue)
12646 IsNonPoisoned &=
12647 isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue);
12648 } else {
12649 IsUsedInExpr = false;
12650 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12651 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12652 IsNonPoisoned &=
12653 isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue) &&
12654 isGuaranteedNotToBePoison(V: TEs.back()->VectorizedValue);
12655 }
12656 }
12657 }
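    // Editorial example (illustrative sizes): with 8 gathered scalars split
    // into NumParts = 2 registers, SliceSize = 4; for part I = 1 only
    // Mask[4..7] is copied into VecMask (all other lanes stay poison), so
    // each matched tree entry is shuffled into its own 4-lane slice.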
12658 // Try to figure out the best way to combine the values: build a shuffle
12659 // plus insertelements, or just build several shuffles.
12660 // Insert non-constant scalars.
12661 SmallVector<Value *> NonConstants(GatheredScalars);
12662 int EMSz = ExtractMask.size();
12663 int MSz = Mask.size();
12664 // Try to build a constant vector and shuffle with it only if we currently
12665 // have a single permutation and more than one scalar constant.
12666 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12667 bool IsIdentityShuffle =
12668 ((UseVecBaseAsInput ||
12669 all_of(ExtractShuffles,
12670 [](const std::optional<TTI::ShuffleKind> &SK) {
12671 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
12672 TTI::SK_PermuteSingleSrc;
12673 })) &&
12674 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12675 ShuffleVectorInst::isIdentityMask(Mask: ExtractMask, NumSrcElts: EMSz)) ||
12676 (!GatherShuffles.empty() &&
12677 all_of(GatherShuffles,
12678 [](const std::optional<TTI::ShuffleKind> &SK) {
12679 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
12680 TTI::SK_PermuteSingleSrc;
12681 }) &&
12682 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12683 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: MSz));
12684 bool EnoughConstsForShuffle =
12685 IsSingleShuffle &&
12686 (none_of(GatheredScalars,
12687 [](Value *V) {
12688 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
12689 }) ||
12690 any_of(GatheredScalars,
12691 [](Value *V) {
12692 return isa<Constant>(Val: V) && !isa<UndefValue>(Val: V);
12693 })) &&
12694 (!IsIdentityShuffle ||
12695 (GatheredScalars.size() == 2 &&
12696 any_of(GatheredScalars,
12697 [](Value *V) { return !isa<UndefValue>(Val: V); })) ||
12698 count_if(GatheredScalars, [](Value *V) {
12699 return isa<Constant>(Val: V) && !isa<PoisonValue>(Val: V);
12700 }) > 1);
12701 // The NonConstants array contains only non-constant values; GatheredScalars
12702 // contains only the constants used to build the final vector, then shuffle.
12703 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12704 if (EnoughConstsForShuffle && isa<Constant>(Val: GatheredScalars[I]))
12705 NonConstants[I] = PoisonValue::get(T: OrigScalarTy);
12706 else
12707 GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
12708 }
12709 // Generate constants for final shuffle and build a mask for them.
12710 if (!all_of(Range&: GatheredScalars, P: IsaPred<PoisonValue>)) {
12711 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12712 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12713 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12714 ShuffleBuilder.add(BV, BVMask);
12715 }
12716 if (all_of(NonConstants, [=](Value *V) {
12717 return isa<PoisonValue>(Val: V) ||
12718 (IsSingleShuffle && ((IsIdentityShuffle &&
12719 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(Val: V));
12720 }))
12721 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12722 else
12723 Res = ShuffleBuilder.finalize(
12724 E->ReuseShuffleIndices, E->Scalars.size(),
12725 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12726 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12727 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12728 });
12729 } else if (!allConstant(VL: GatheredScalars)) {
12730 // Gather unique scalars and all constants.
12731 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12732 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12733 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12734 ShuffleBuilder.add(BV, ReuseMask);
12735 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12736 } else {
12737 // Gather all constants.
12738 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12739 for (auto [I, V] : enumerate(First: E->Scalars)) {
12740 if (!isa<PoisonValue>(Val: V))
12741 Mask[I] = I;
12742 }
12743 Value *BV = ShuffleBuilder.gather(E->Scalars);
12744 ShuffleBuilder.add(BV, Mask);
12745 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12746 }
12747
12748 if (NeedFreeze)
12749 Res = ShuffleBuilder.createFreeze(Res);
12750 return Res;
12751}
12752
12753Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
12754 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12755 Params&: Builder, Params&: *this);
12756}
12757
12758Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12759 IRBuilderBase::InsertPointGuard Guard(Builder);
12760
12761 if (E->VectorizedValue &&
12762 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12763 E->isAltShuffle())) {
12764 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12765 return E->VectorizedValue;
12766 }
12767
12768 Value *V = E->Scalars.front();
12769 Type *ScalarTy = V->getType();
12770 if (auto *Store = dyn_cast<StoreInst>(Val: V))
12771 ScalarTy = Store->getValueOperand()->getType();
12772 else if (auto *IE = dyn_cast<InsertElementInst>(Val: V))
12773 ScalarTy = IE->getOperand(i_nocapture: 1)->getType();
12774 auto It = MinBWs.find(Val: E);
12775 if (It != MinBWs.end())
12776 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
12777 auto *VecTy = getWidenedType(ScalarTy, VF: E->Scalars.size());
12778 if (E->isGather()) {
12779 // Set insert point for non-reduction initial nodes.
12780 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12781 setInsertPointAfterBundle(E);
12782 Value *Vec = createBuildVector(E, ScalarTy);
12783 E->VectorizedValue = Vec;
12784 return Vec;
12785 }
12786
12787 bool IsReverseOrder = isReverseOrder(Order: E->ReorderIndices);
12788 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12789 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
12790 if (E->getOpcode() == Instruction::Store &&
12791 E->State == TreeEntry::Vectorize) {
12792 ArrayRef<int> Mask =
12793 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12794 E->ReorderIndices.size());
12795 ShuffleBuilder.add(V1: V, Mask);
12796 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12797 ShuffleBuilder.addOrdered(V1: V, Order: std::nullopt);
12798 } else {
12799 ShuffleBuilder.addOrdered(V1: V, Order: E->ReorderIndices);
12800 }
12801 return ShuffleBuilder.finalize(ExtMask: E->ReuseShuffleIndices);
12802 };
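  // Editorial note on FinalShuffle (behavior restated for clarity): vectorized
  // store nodes apply E->ReorderIndices directly as a shuffle mask (e.g.
  // {3, 2, 1, 0} reverses the lanes), strided-vectorized nodes with a reversed
  // order skip the reorder because the negative stride already accounts for
  // it, and all other nodes are reordered via addOrdered().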
12803
12804 assert((E->State == TreeEntry::Vectorize ||
12805 E->State == TreeEntry::ScatterVectorize ||
12806 E->State == TreeEntry::StridedVectorize) &&
12807 "Unhandled state");
12808 unsigned ShuffleOrOp =
12809 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12810 Instruction *VL0 = E->getMainOp();
12811 auto GetOperandSignedness = [&](unsigned Idx) {
12812 const TreeEntry *OpE = getOperandEntry(E, Idx);
12813 bool IsSigned = false;
12814 auto It = MinBWs.find(Val: OpE);
12815 if (It != MinBWs.end())
12816 IsSigned = It->second.second;
12817 else
12818 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
12819 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
12820 });
12821 return IsSigned;
12822 };
12823 switch (ShuffleOrOp) {
12824 case Instruction::PHI: {
12825 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12826 E != VectorizableTree.front().get() ||
12827 !E->UserTreeIndices.empty()) &&
12828 "PHI reordering is free.");
12829 if (PostponedPHIs && E->VectorizedValue)
12830 return E->VectorizedValue;
12831 auto *PH = cast<PHINode>(Val: VL0);
12832 Builder.SetInsertPoint(TheBB: PH->getParent(),
12833 IP: PH->getParent()->getFirstNonPHIIt());
12834 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12835 if (PostponedPHIs || !E->VectorizedValue) {
12836 PHINode *NewPhi = Builder.CreatePHI(Ty: VecTy, NumReservedValues: PH->getNumIncomingValues());
12837 E->PHI = NewPhi;
12838 Value *V = NewPhi;
12839
12840 // Adjust the insertion point once all PHIs have been generated.
12841 Builder.SetInsertPoint(TheBB: PH->getParent(),
12842 IP: PH->getParent()->getFirstInsertionPt());
12843 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12844
12845 V = FinalShuffle(V, E, VecTy);
12846
12847 E->VectorizedValue = V;
12848 if (PostponedPHIs)
12849 return V;
12850 }
12851 PHINode *NewPhi = cast<PHINode>(Val: E->PHI);
12852 // If the phi node is fully emitted, exit.
12853 if (NewPhi->getNumIncomingValues() != 0)
12854 return NewPhi;
12855
12856 // PHINodes may have multiple entries from the same block. We want to
12857 // visit every block once.
12858 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12859
12860 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
12861 ValueList Operands;
12862 BasicBlock *IBB = PH->getIncomingBlock(i: I);
12863
12864 // Stop emission if all incoming values are generated.
12865 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12866 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12867 return NewPhi;
12868 }
12869
12870 if (!VisitedBBs.insert(Ptr: IBB).second) {
12871 NewPhi->addIncoming(V: NewPhi->getIncomingValueForBlock(BB: IBB), BB: IBB);
12872 continue;
12873 }
12874
12875 Builder.SetInsertPoint(IBB->getTerminator());
12876 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12877 Value *Vec = vectorizeOperand(E, NodeIdx: I, /*PostponedPHIs=*/true);
12878 if (VecTy != Vec->getType()) {
12879 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
12880 MinBWs.contains(getOperandEntry(E, I))) &&
12881 "Expected item in MinBWs.");
12882 Vec = Builder.CreateIntCast(V: Vec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
12883 }
12884 NewPhi->addIncoming(V: Vec, BB: IBB);
12885 }
12886
12887 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12888 "Invalid number of incoming values");
12889 return NewPhi;
12890 }
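  // Editorial sketch (illustrative IR): for a bundle of scalar PHIs over two
  // predecessors the emitted value is a single vector PHI, e.g.
  //   %vec.phi = phi <2 x i32> [ %vec.entry, %entry ], [ %vec.latch, %latch ]
  // The empty vector PHI is created in the PostponedPHIs pass first, and its
  // incoming vectors are filled in on the second pass, so operands that are
  // themselves defined through the PHI do not create a cyclic dependency.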
12891
12892 case Instruction::ExtractElement: {
12893 Value *V = E->getSingleOperand(OpIdx: 0);
12894 if (const TreeEntry *TE = getTreeEntry(V))
12895 V = TE->VectorizedValue;
12896 setInsertPointAfterBundle(E);
12897 V = FinalShuffle(V, E, VecTy);
12898 E->VectorizedValue = V;
12899 return V;
12900 }
12901 case Instruction::ExtractValue: {
12902 auto *LI = cast<LoadInst>(Val: E->getSingleOperand(OpIdx: 0));
12903 Builder.SetInsertPoint(LI);
12904 Value *Ptr = LI->getPointerOperand();
12905 LoadInst *V = Builder.CreateAlignedLoad(Ty: VecTy, Ptr, Align: LI->getAlign());
12906 Value *NewV = propagateMetadata(I: V, VL: E->Scalars);
12907 NewV = FinalShuffle(NewV, E, VecTy);
12908 E->VectorizedValue = NewV;
12909 return NewV;
12910 }
12911 case Instruction::InsertElement: {
12912 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12913 Builder.SetInsertPoint(cast<Instruction>(Val: E->Scalars.back()));
12914 Value *V = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
12915 ArrayRef<Value *> Op = E->getOperand(OpIdx: 1);
12916 Type *ScalarTy = Op.front()->getType();
12917 if (cast<VectorType>(Val: V->getType())->getElementType() != ScalarTy) {
12918 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12919 std::pair<unsigned, bool> Res = MinBWs.lookup(Val: getOperandEntry(E, Idx: 1));
12920 assert(Res.first > 0 && "Expected item in MinBWs.");
12921 V = Builder.CreateIntCast(
12922 V,
12923 DestTy: getWidenedType(
12924 ScalarTy,
12925 VF: cast<FixedVectorType>(Val: V->getType())->getNumElements()),
12926 isSigned: Res.second);
12927 }
12928
12929 // Create InsertVector shuffle if necessary
12930 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range&: E->Scalars, P: [E](Value *V) {
12931 return !is_contained(Range&: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
12932 }));
12933 const unsigned NumElts =
12934 cast<FixedVectorType>(Val: FirstInsert->getType())->getNumElements();
12935 const unsigned NumScalars = E->Scalars.size();
12936
12937 unsigned Offset = *getElementIndex(Inst: VL0);
12938 assert(Offset < NumElts && "Failed to find vector index offset");
12939
12940 // Create shuffle to resize vector
12941 SmallVector<int> Mask;
12942 if (!E->ReorderIndices.empty()) {
12943 inversePermutation(Indices: E->ReorderIndices, Mask);
12944 Mask.append(NumInputs: NumElts - NumScalars, Elt: PoisonMaskElem);
12945 } else {
12946 Mask.assign(NumElts, Elt: PoisonMaskElem);
12947 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: NumScalars), value: 0);
12948 }
12949 // Create InsertVector shuffle if necessary
12950 bool IsIdentity = true;
12951 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12952 Mask.swap(RHS&: PrevMask);
12953 for (unsigned I = 0; I < NumScalars; ++I) {
12954 Value *Scalar = E->Scalars[PrevMask[I]];
12955 unsigned InsertIdx = *getElementIndex(Inst: Scalar);
12956 IsIdentity &= InsertIdx - Offset == I;
12957 Mask[InsertIdx - Offset] = I;
12958 }
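    // Editorial example (illustrative values): four inserts into an
    // <8 x float> at positions 2, 3, 4 and 5 give Offset = 2 and
    // Mask = {0, 1, 2, 3} (lane InsertIdx - Offset takes scalar I). IsIdentity
    // stays true, but since NumElts (8) != NumScalars (4) a resizing shuffle
    // is still emitted below.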
12959 if (!IsIdentity || NumElts != NumScalars) {
12960 Value *V2 = nullptr;
12961 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12962 SmallVector<int> InsertMask(Mask);
12963 if (NumElts != NumScalars && Offset == 0) {
12964 // Follow all insert element instructions from the current buildvector
12965 // sequence.
12966 InsertElementInst *Ins = cast<InsertElementInst>(Val: VL0);
12967 do {
12968 std::optional<unsigned> InsertIdx = getElementIndex(Inst: Ins);
12969 if (!InsertIdx)
12970 break;
12971 if (InsertMask[*InsertIdx] == PoisonMaskElem)
12972 InsertMask[*InsertIdx] = *InsertIdx;
12973 if (!Ins->hasOneUse())
12974 break;
12975 Ins = dyn_cast_or_null<InsertElementInst>(
12976 Val: Ins->getUniqueUndroppableUser());
12977 } while (Ins);
12978 SmallBitVector UseMask =
12979 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
12980 SmallBitVector IsFirstPoison =
12981 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
12982 SmallBitVector IsFirstUndef =
12983 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
12984 if (!IsFirstPoison.all()) {
12985 unsigned Idx = 0;
12986 for (unsigned I = 0; I < NumElts; I++) {
12987 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I) &&
12988 IsFirstUndef.test(Idx: I)) {
12989 if (IsVNonPoisonous) {
12990 InsertMask[I] = I < NumScalars ? I : 0;
12991 continue;
12992 }
12993 if (!V2)
12994 V2 = UndefValue::get(T: V->getType());
12995 if (Idx >= NumScalars)
12996 Idx = NumScalars - 1;
12997 InsertMask[I] = NumScalars + Idx;
12998 ++Idx;
12999 } else if (InsertMask[I] != PoisonMaskElem &&
13000 Mask[I] == PoisonMaskElem) {
13001 InsertMask[I] = PoisonMaskElem;
13002 }
13003 }
13004 } else {
13005 InsertMask = Mask;
13006 }
13007 }
13008 if (!V2)
13009 V2 = PoisonValue::get(T: V->getType());
13010 V = Builder.CreateShuffleVector(V1: V, V2, Mask: InsertMask);
13011 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13012 GatherShuffleExtractSeq.insert(X: I);
13013 CSEBlocks.insert(V: I->getParent());
13014 }
13015 }
13016
13017 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
13018 for (unsigned I = 0; I < NumElts; I++) {
13019 if (Mask[I] != PoisonMaskElem)
13020 InsertMask[Offset + I] = I;
13021 }
13022 SmallBitVector UseMask =
13023 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
13024 SmallBitVector IsFirstUndef =
13025 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
13026 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
13027 NumElts != NumScalars) {
13028 if (IsFirstUndef.all()) {
13029 if (!ShuffleVectorInst::isIdentityMask(Mask: InsertMask, NumSrcElts: NumElts)) {
13030 SmallBitVector IsFirstPoison =
13031 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
13032 if (!IsFirstPoison.all()) {
13033 for (unsigned I = 0; I < NumElts; I++) {
13034 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I))
13035 InsertMask[I] = I + NumElts;
13036 }
13037 }
13038 V = Builder.CreateShuffleVector(
13039 V1: V,
13040 V2: IsFirstPoison.all() ? PoisonValue::get(T: V->getType())
13041 : FirstInsert->getOperand(i: 0),
13042 Mask: InsertMask, Name: cast<Instruction>(Val: E->Scalars.back())->getName());
13043 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13044 GatherShuffleExtractSeq.insert(X: I);
13045 CSEBlocks.insert(V: I->getParent());
13046 }
13047 }
13048 } else {
13049 SmallBitVector IsFirstPoison =
13050 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
13051 for (unsigned I = 0; I < NumElts; I++) {
13052 if (InsertMask[I] == PoisonMaskElem)
13053 InsertMask[I] = IsFirstPoison.test(Idx: I) ? PoisonMaskElem : I;
13054 else
13055 InsertMask[I] += NumElts;
13056 }
13057 V = Builder.CreateShuffleVector(
13058 V1: FirstInsert->getOperand(i: 0), V2: V, Mask: InsertMask,
13059 Name: cast<Instruction>(Val: E->Scalars.back())->getName());
13060 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13061 GatherShuffleExtractSeq.insert(X: I);
13062 CSEBlocks.insert(V: I->getParent());
13063 }
13064 }
13065 }
13066
13067 ++NumVectorInstructions;
13068 E->VectorizedValue = V;
13069 return V;
13070 }
13071 case Instruction::ZExt:
13072 case Instruction::SExt:
13073 case Instruction::FPToUI:
13074 case Instruction::FPToSI:
13075 case Instruction::FPExt:
13076 case Instruction::PtrToInt:
13077 case Instruction::IntToPtr:
13078 case Instruction::SIToFP:
13079 case Instruction::UIToFP:
13080 case Instruction::Trunc:
13081 case Instruction::FPTrunc:
13082 case Instruction::BitCast: {
13083 setInsertPointAfterBundle(E);
13084
13085 Value *InVec = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13086 if (E->VectorizedValue) {
13087 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13088 return E->VectorizedValue;
13089 }
13090
13091 auto *CI = cast<CastInst>(Val: VL0);
13092 Instruction::CastOps VecOpcode = CI->getOpcode();
13093 Type *SrcScalarTy = cast<VectorType>(Val: InVec->getType())->getElementType();
13094 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
13095 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
13096 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
13097 SrcScalarTy != CI->getOperand(i_nocapture: 0)->getType())) {
13098 // Check if the values are candidates to demote.
13099 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
13100 if (SrcIt != MinBWs.end())
13101 SrcBWSz = SrcIt->second.first;
13102 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
13103 if (BWSz == SrcBWSz) {
13104 VecOpcode = Instruction::BitCast;
13105 } else if (BWSz < SrcBWSz) {
13106 VecOpcode = Instruction::Trunc;
13107 } else if (It != MinBWs.end()) {
13108 assert(BWSz > SrcBWSz && "Invalid cast!");
13109 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
13110 } else if (SrcIt != MinBWs.end()) {
13111 assert(BWSz > SrcBWSz && "Invalid cast!");
13112 VecOpcode =
13113 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
13114 }
13115 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
13116 !SrcIt->second.second) {
13117 VecOpcode = Instruction::UIToFP;
13118 }
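    // Editorial example (illustrative bit widths): for a zext i8 -> i32 bundle
    // where MinBWs demoted both this entry and its operand to 16 bits,
    // BWSz == SrcBWSz == 16, so the cast degenerates to a bitcast and is
    // dropped below. If only the source were demoted to i16 while the result
    // stays i32, the vector opcode becomes SExt or ZExt depending on the
    // recorded signedness.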
13119 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
13120 ? InVec
13121 : Builder.CreateCast(Op: VecOpcode, V: InVec, DestTy: VecTy);
13122 V = FinalShuffle(V, E, VecTy);
13123
13124 E->VectorizedValue = V;
13125 ++NumVectorInstructions;
13126 return V;
13127 }
13128 case Instruction::FCmp:
13129 case Instruction::ICmp: {
13130 setInsertPointAfterBundle(E);
13131
13132 Value *L = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13133 if (E->VectorizedValue) {
13134 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13135 return E->VectorizedValue;
13136 }
13137 Value *R = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
13138 if (E->VectorizedValue) {
13139 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13140 return E->VectorizedValue;
13141 }
13142 if (L->getType() != R->getType()) {
13143 assert((getOperandEntry(E, 0)->isGather() ||
13144 getOperandEntry(E, 1)->isGather() ||
13145 MinBWs.contains(getOperandEntry(E, 0)) ||
13146 MinBWs.contains(getOperandEntry(E, 1))) &&
13147 "Expected item in MinBWs.");
13148 if (cast<VectorType>(Val: L->getType())
13149 ->getElementType()
13150 ->getIntegerBitWidth() < cast<VectorType>(Val: R->getType())
13151 ->getElementType()
13152 ->getIntegerBitWidth()) {
13153 Type *CastTy = R->getType();
13154 L = Builder.CreateIntCast(V: L, DestTy: CastTy, isSigned: GetOperandSignedness(0));
13155 } else {
13156 Type *CastTy = L->getType();
13157 R = Builder.CreateIntCast(V: R, DestTy: CastTy, isSigned: GetOperandSignedness(1));
13158 }
13159 }
13160
13161 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
13162 Value *V = Builder.CreateCmp(Pred: P0, LHS: L, RHS: R);
13163 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
13164 // Do not cast for cmps.
13165 VecTy = cast<FixedVectorType>(Val: V->getType());
13166 V = FinalShuffle(V, E, VecTy);
13167
13168 E->VectorizedValue = V;
13169 ++NumVectorInstructions;
13170 return V;
13171 }
13172 case Instruction::Select: {
13173 setInsertPointAfterBundle(E);
13174
13175 Value *Cond = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13176 if (E->VectorizedValue) {
13177 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13178 return E->VectorizedValue;
13179 }
13180 Value *True = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
13181 if (E->VectorizedValue) {
13182 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13183 return E->VectorizedValue;
13184 }
13185 Value *False = vectorizeOperand(E, NodeIdx: 2, PostponedPHIs);
13186 if (E->VectorizedValue) {
13187 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13188 return E->VectorizedValue;
13189 }
13190 if (True->getType() != VecTy || False->getType() != VecTy) {
13191 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
13192 getOperandEntry(E, 2)->isGather() ||
13193 MinBWs.contains(getOperandEntry(E, 1)) ||
13194 MinBWs.contains(getOperandEntry(E, 2))) &&
13195 "Expected item in MinBWs.");
13196 if (True->getType() != VecTy)
13197 True = Builder.CreateIntCast(V: True, DestTy: VecTy, isSigned: GetOperandSignedness(1));
13198 if (False->getType() != VecTy)
13199 False = Builder.CreateIntCast(V: False, DestTy: VecTy, isSigned: GetOperandSignedness(2));
13200 }
13201
13202 Value *V = Builder.CreateSelect(C: Cond, True, False);
13203 V = FinalShuffle(V, E, VecTy);
13204
13205 E->VectorizedValue = V;
13206 ++NumVectorInstructions;
13207 return V;
13208 }
13209 case Instruction::FNeg: {
13210 setInsertPointAfterBundle(E);
13211
13212 Value *Op = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13213
13214 if (E->VectorizedValue) {
13215 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13216 return E->VectorizedValue;
13217 }
13218
13219 Value *V = Builder.CreateUnOp(
13220 Opc: static_cast<Instruction::UnaryOps>(E->getOpcode()), V: Op);
13221 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
13222 if (auto *I = dyn_cast<Instruction>(Val: V))
13223 V = propagateMetadata(I, VL: E->Scalars);
13224
13225 V = FinalShuffle(V, E, VecTy);
13226
13227 E->VectorizedValue = V;
13228 ++NumVectorInstructions;
13229
13230 return V;
13231 }
13232 case Instruction::Add:
13233 case Instruction::FAdd:
13234 case Instruction::Sub:
13235 case Instruction::FSub:
13236 case Instruction::Mul:
13237 case Instruction::FMul:
13238 case Instruction::UDiv:
13239 case Instruction::SDiv:
13240 case Instruction::FDiv:
13241 case Instruction::URem:
13242 case Instruction::SRem:
13243 case Instruction::FRem:
13244 case Instruction::Shl:
13245 case Instruction::LShr:
13246 case Instruction::AShr:
13247 case Instruction::And:
13248 case Instruction::Or:
13249 case Instruction::Xor: {
13250 setInsertPointAfterBundle(E);
13251
13252 Value *LHS = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13253 if (E->VectorizedValue) {
13254 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13255 return E->VectorizedValue;
13256 }
13257 Value *RHS = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
13258 if (E->VectorizedValue) {
13259 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13260 return E->VectorizedValue;
13261 }
13262 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
13263 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
13264 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
13265 if (all_of(Range&: Ops, P: [&](Value *Op) {
13266 auto *CI = dyn_cast<ConstantInt>(Val: Op);
13267 return CI && CI->getValue().countr_one() >= It->second.first;
13268 })) {
13269 V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
13270 E->VectorizedValue = V;
13271 ++NumVectorInstructions;
13272 return V;
13273 }
13274 }
13275 }
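    // Editorial example (illustrative constants): if this entry was demoted to
    // i8 (It->second.first == 8) and one operand is the constant 255 in every
    // lane, countr_one() == 8 >= 8, so the 'and' is a no-op on the demoted
    // value and the other operand is returned directly (after FinalShuffle).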
13276 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
13277 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13278 getOperandEntry(E, 1)->isGather() ||
13279 MinBWs.contains(getOperandEntry(E, 0)) ||
13280 MinBWs.contains(getOperandEntry(E, 1))) &&
13281 "Expected item in MinBWs.");
13282 if (LHS->getType() != VecTy)
13283 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: GetOperandSignedness(0));
13284 if (RHS->getType() != VecTy)
13285 RHS = Builder.CreateIntCast(V: RHS, DestTy: VecTy, isSigned: GetOperandSignedness(1));
13286 }
13287
13288 Value *V = Builder.CreateBinOp(
13289 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
13290 RHS);
13291 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0, IncludeWrapFlags: It == MinBWs.end());
13292 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13293 V = propagateMetadata(I, VL: E->Scalars);
13294 // Drop nuw flags for abs(sub(commutative), true).
13295 if (!MinBWs.contains(Val: E) && ShuffleOrOp == Instruction::Sub &&
13296 any_of(Range&: E->Scalars, P: [](Value *V) {
13297 return isCommutative(I: cast<Instruction>(Val: V));
13298 }))
13299 I->setHasNoUnsignedWrap(/*b=*/false);
13300 }
13301
13302 V = FinalShuffle(V, E, VecTy);
13303
13304 E->VectorizedValue = V;
13305 ++NumVectorInstructions;
13306
13307 return V;
13308 }
13309 case Instruction::Load: {
13310 // Loads are inserted at the head of the tree because we don't want to
13311 // sink them all the way down past store instructions.
13312 setInsertPointAfterBundle(E);
13313
13314 LoadInst *LI = cast<LoadInst>(Val: VL0);
13315 Instruction *NewLI;
13316 Value *PO = LI->getPointerOperand();
13317 if (E->State == TreeEntry::Vectorize) {
13318 NewLI = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: PO, Align: LI->getAlign());
13319 } else if (E->State == TreeEntry::StridedVectorize) {
13320 Value *Ptr0 = cast<LoadInst>(Val: E->Scalars.front())->getPointerOperand();
13321 Value *PtrN = cast<LoadInst>(Val: E->Scalars.back())->getPointerOperand();
13322 PO = IsReverseOrder ? PtrN : Ptr0;
13323 std::optional<int> Diff = getPointersDiff(
13324 ElemTyA: VL0->getType(), PtrA: Ptr0, ElemTyB: VL0->getType(), PtrB: PtrN, DL: *DL, SE&: *SE);
13325 Type *StrideTy = DL->getIndexType(PtrTy: PO->getType());
13326 Value *StrideVal;
13327 if (Diff) {
13328 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
13329 StrideVal =
13330 ConstantInt::get(Ty: StrideTy, V: (IsReverseOrder ? -1 : 1) * Stride *
13331 DL->getTypeAllocSize(Ty: ScalarTy));
13332 } else {
13333 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
13334 transform(Range&: E->Scalars, d_first: PointerOps.begin(), F: [](Value *V) {
13335 return cast<LoadInst>(Val: V)->getPointerOperand();
13336 });
13337 OrdersType Order;
13338 std::optional<Value *> Stride =
13339 calculateRtStride(PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order,
13340 Inst: &*Builder.GetInsertPoint());
13341 Value *NewStride =
13342 Builder.CreateIntCast(V: *Stride, DestTy: StrideTy, /*isSigned=*/true);
13343 StrideVal = Builder.CreateMul(
13344 LHS: NewStride,
13345 RHS: ConstantInt::get(
13346 Ty: StrideTy,
13347 V: (IsReverseOrder ? -1 : 1) *
13348 static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))));
13349 }
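      // Editorial example (illustrative addresses): four i32 loads at byte
      // offsets 0, 8, 16 and 24 from the same base give Diff = 6 elements,
      // Stride = 6 / 3 = 2 elements, and StrideVal = 2 * 4 = 8 bytes (negated
      // for a reversed order); this byte stride feeds the
      // llvm.experimental.vp.strided.load emitted below.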
13350 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
13351 auto *Inst = Builder.CreateIntrinsic(
13352 ID: Intrinsic::experimental_vp_strided_load,
13353 Types: {VecTy, PO->getType(), StrideTy},
13354 Args: {PO, StrideVal, Builder.getAllOnesMask(NumElts: VecTy->getElementCount()),
13355 Builder.getInt32(C: E->Scalars.size())});
13356 Inst->addParamAttr(
13357 /*ArgNo=*/0,
13358 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
13359 NewLI = Inst;
13360 } else {
13361 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
13362 Value *VecPtr = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13363 if (E->VectorizedValue) {
13364 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13365 return E->VectorizedValue;
13366 }
13367 // Use the minimum alignment of the gathered loads.
13368 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
13369 NewLI = Builder.CreateMaskedGather(Ty: VecTy, Ptrs: VecPtr, Alignment: CommonAlignment);
13370 }
13371 Value *V = propagateMetadata(I: NewLI, VL: E->Scalars);
13372
13373 V = FinalShuffle(V, E, VecTy);
13374 E->VectorizedValue = V;
13375 ++NumVectorInstructions;
13376 return V;
13377 }
13378 case Instruction::Store: {
13379 auto *SI = cast<StoreInst>(Val: VL0);
13380
13381 setInsertPointAfterBundle(E);
13382
13383 Value *VecValue = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13384 if (VecValue->getType() != VecTy)
13385 VecValue =
13386 Builder.CreateIntCast(V: VecValue, DestTy: VecTy, isSigned: GetOperandSignedness(0));
13387 VecValue = FinalShuffle(VecValue, E, VecTy);
13388
13389 Value *Ptr = SI->getPointerOperand();
13390 Instruction *ST;
13391 if (E->State == TreeEntry::Vectorize) {
13392 ST = Builder.CreateAlignedStore(Val: VecValue, Ptr, Align: SI->getAlign());
13393 } else {
13394 assert(E->State == TreeEntry::StridedVectorize &&
13395 "Expected either strided or conseutive stores.");
13396 if (!E->ReorderIndices.empty()) {
13397 SI = cast<StoreInst>(Val: E->Scalars[E->ReorderIndices.front()]);
13398 Ptr = SI->getPointerOperand();
13399 }
13400 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E->Scalars);
13401 Type *StrideTy = DL->getIndexType(PtrTy: SI->getPointerOperandType());
13402 auto *Inst = Builder.CreateIntrinsic(
13403 ID: Intrinsic::experimental_vp_strided_store,
13404 Types: {VecTy, Ptr->getType(), StrideTy},
13405 Args: {VecValue, Ptr,
13406 ConstantInt::get(
13407 Ty: StrideTy, V: -static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))),
13408 Builder.getAllOnesMask(NumElts: VecTy->getElementCount()),
13409 Builder.getInt32(C: E->Scalars.size())});
13410 Inst->addParamAttr(
13411 /*ArgNo=*/1,
13412 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
13413 ST = Inst;
13414 }
13415
13416 Value *V = propagateMetadata(I: ST, VL: E->Scalars);
13417
13418 E->VectorizedValue = V;
13419 ++NumVectorInstructions;
13420 return V;
13421 }
13422 case Instruction::GetElementPtr: {
13423 auto *GEP0 = cast<GetElementPtrInst>(Val: VL0);
13424 setInsertPointAfterBundle(E);
13425
13426 Value *Op0 = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13427 if (E->VectorizedValue) {
13428 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13429 return E->VectorizedValue;
13430 }
13431
13432 SmallVector<Value *> OpVecs;
13433 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
13434 Value *OpVec = vectorizeOperand(E, NodeIdx: J, PostponedPHIs);
13435 if (E->VectorizedValue) {
13436 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13437 return E->VectorizedValue;
13438 }
13439 OpVecs.push_back(Elt: OpVec);
13440 }
13441
13442 Value *V = Builder.CreateGEP(Ty: GEP0->getSourceElementType(), Ptr: Op0, IdxList: OpVecs);
13443 if (Instruction *I = dyn_cast<GetElementPtrInst>(Val: V)) {
13444 SmallVector<Value *> GEPs;
13445 for (Value *V : E->Scalars) {
13446 if (isa<GetElementPtrInst>(Val: V))
13447 GEPs.push_back(Elt: V);
13448 }
13449 V = propagateMetadata(I, VL: GEPs);
13450 }
13451
13452 V = FinalShuffle(V, E, VecTy);
13453
13454 E->VectorizedValue = V;
13455 ++NumVectorInstructions;
13456
13457 return V;
13458 }
13459 case Instruction::Call: {
13460 CallInst *CI = cast<CallInst>(Val: VL0);
13461 setInsertPointAfterBundle(E);
13462
13463 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13464
13465 SmallVector<Type *> ArgTys =
13466 buildIntrinsicArgTypes(CI, ID, VF: VecTy->getNumElements(),
13467 MinBW: It != MinBWs.end() ? It->second.first : 0);
13468 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
13469 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
13470 VecCallCosts.first <= VecCallCosts.second;
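    // Editorial note (illustrative case): for a bundle of four calls to
    // llvm.fabs.f32, this compares the cost of llvm.fabs.v4f32 against a
    // vector library call; the intrinsic form is chosen only when ID is a real
    // vector intrinsic and its cost does not exceed the library-call cost.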
13471
13472 Value *ScalarArg = nullptr;
13473 SmallVector<Value *> OpVecs;
13474 SmallVector<Type *, 2> TysForDecl;
13475 // Add return type if intrinsic is overloaded on it.
13476 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: -1))
13477 TysForDecl.push_back(Elt: VecTy);
13478 auto *CEI = cast<CallInst>(Val: VL0);
13479 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
13480 ValueList OpVL;
13481 // Some intrinsics have scalar arguments. Such arguments should not be
13482 // vectorized.
13483 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I)) {
13484 ScalarArg = CEI->getArgOperand(i: I);
13485 // If we decided to reduce the bitwidth of the abs intrinsic, its second
13486 // argument must be set to false (do not return poison if the value is the signed minimum).
13487 if (ID == Intrinsic::abs && It != MinBWs.end() &&
13488 It->second.first < DL->getTypeSizeInBits(Ty: CEI->getType()))
13489 ScalarArg = Builder.getFalse();
13490 OpVecs.push_back(Elt: ScalarArg);
13491 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I))
13492 TysForDecl.push_back(Elt: ScalarArg->getType());
13493 continue;
13494 }
13495
13496 Value *OpVec = vectorizeOperand(E, NodeIdx: I, PostponedPHIs);
13497 if (E->VectorizedValue) {
13498 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13499 return E->VectorizedValue;
13500 }
13501 ScalarArg = CEI->getArgOperand(i: I);
13502 if (cast<VectorType>(Val: OpVec->getType())->getElementType() !=
13503 ScalarArg->getType()->getScalarType() &&
13504 It == MinBWs.end()) {
13505 auto *CastTy =
13506 getWidenedType(ScalarTy: ScalarArg->getType(), VF: VecTy->getNumElements());
13507 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: CastTy, isSigned: GetOperandSignedness(I));
13508 } else if (It != MinBWs.end()) {
13509 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
13510 }
13511 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
13512 OpVecs.push_back(Elt: OpVec);
13513 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I))
13514 TysForDecl.push_back(Elt: OpVec->getType());
13515 }
13516
13517 Function *CF;
13518 if (!UseIntrinsic) {
13519 VFShape Shape =
13520 VFShape::get(FTy: CI->getFunctionType(),
13521 EC: ElementCount::getFixed(
13522 MinVal: static_cast<unsigned>(VecTy->getNumElements())),
13523 HasGlobalPred: false /*HasGlobalPred*/);
13524 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
13525 } else {
13526 CF = Intrinsic::getDeclaration(M: F->getParent(), id: ID, Tys: TysForDecl);
13527 }
13528
13529 SmallVector<OperandBundleDef, 1> OpBundles;
13530 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
13531 Value *V = Builder.CreateCall(Callee: CF, Args: OpVecs, OpBundles);
13532
13533 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
13534 V = FinalShuffle(V, E, VecTy);
13535
13536 E->VectorizedValue = V;
13537 ++NumVectorInstructions;
13538 return V;
13539 }
13540 case Instruction::ShuffleVector: {
13541 assert(E->isAltShuffle() &&
13542 ((Instruction::isBinaryOp(E->getOpcode()) &&
13543 Instruction::isBinaryOp(E->getAltOpcode())) ||
13544 (Instruction::isCast(E->getOpcode()) &&
13545 Instruction::isCast(E->getAltOpcode())) ||
13546 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13547 "Invalid Shuffle Vector Operand");
13548
13549 Value *LHS = nullptr, *RHS = nullptr;
13550 if (Instruction::isBinaryOp(Opcode: E->getOpcode()) || isa<CmpInst>(Val: VL0)) {
13551 setInsertPointAfterBundle(E);
13552 LHS = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13553 if (E->VectorizedValue) {
13554 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13555 return E->VectorizedValue;
13556 }
13557 RHS = vectorizeOperand(E, NodeIdx: 1, PostponedPHIs);
13558 } else {
13559 setInsertPointAfterBundle(E);
13560 LHS = vectorizeOperand(E, NodeIdx: 0, PostponedPHIs);
13561 }
13562 if (E->VectorizedValue) {
13563 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13564 return E->VectorizedValue;
13565 }
13566 if (LHS && RHS &&
13567 ((Instruction::isBinaryOp(Opcode: E->getOpcode()) &&
13568 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
13569 (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()))) {
13570 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
13571 getOperandEntry(E, 1)->isGather() ||
13572 MinBWs.contains(getOperandEntry(E, 0)) ||
13573 MinBWs.contains(getOperandEntry(E, 1))) &&
13574 "Expected item in MinBWs.");
13575 Type *CastTy = VecTy;
13576 if (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()) {
13577 if (cast<VectorType>(Val: LHS->getType())
13578 ->getElementType()
13579 ->getIntegerBitWidth() < cast<VectorType>(Val: RHS->getType())
13580 ->getElementType()
13581 ->getIntegerBitWidth())
13582 CastTy = RHS->getType();
13583 else
13584 CastTy = LHS->getType();
13585 }
13586 if (LHS->getType() != CastTy)
13587 LHS = Builder.CreateIntCast(V: LHS, DestTy: CastTy, isSigned: GetOperandSignedness(0));
13588 if (RHS->getType() != CastTy)
13589 RHS = Builder.CreateIntCast(V: RHS, DestTy: CastTy, isSigned: GetOperandSignedness(1));
13590 }
13591
13592 Value *V0, *V1;
13593 if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
13594 V0 = Builder.CreateBinOp(
13595 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13596 V1 = Builder.CreateBinOp(
13597 Opc: static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13598 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
13599 V0 = Builder.CreateCmp(Pred: CI0->getPredicate(), LHS, RHS);
13600 auto *AltCI = cast<CmpInst>(Val: E->getAltOp());
13601 CmpInst::Predicate AltPred = AltCI->getPredicate();
13602 V1 = Builder.CreateCmp(Pred: AltPred, LHS, RHS);
13603 } else {
13604 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13605 unsigned SrcBWSz = DL->getTypeSizeInBits(
13606 Ty: cast<VectorType>(Val: LHS->getType())->getElementType());
13607 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
13608 if (BWSz <= SrcBWSz) {
13609 if (BWSz < SrcBWSz)
13610 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: It->second.first);
13611 assert(LHS->getType() == VecTy && "Expected same type as operand.");
13612 if (auto *I = dyn_cast<Instruction>(Val: LHS))
13613 LHS = propagateMetadata(I, VL: E->Scalars);
13614 E->VectorizedValue = LHS;
13615 ++NumVectorInstructions;
13616 return LHS;
13617 }
13618 }
13619 V0 = Builder.CreateCast(
13620 Op: static_cast<Instruction::CastOps>(E->getOpcode()), V: LHS, DestTy: VecTy);
13621 V1 = Builder.CreateCast(
13622 Op: static_cast<Instruction::CastOps>(E->getAltOpcode()), V: LHS, DestTy: VecTy);
13623 }
13624 // Add V0 and V1 to later analysis to try to find and remove matching
13625 // instructions, if any.
13626 for (Value *V : {V0, V1}) {
13627 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13628 GatherShuffleExtractSeq.insert(X: I);
13629 CSEBlocks.insert(V: I->getParent());
13630 }
13631 }
13632
13633 // Create shuffle to take alternate operations from the vector.
13634 // Also, gather up main and alt scalar ops to propagate IR flags to
13635 // each vector operation.
13636 ValueList OpScalars, AltScalars;
13637 SmallVector<int> Mask;
13638 E->buildAltOpShuffleMask(
13639 IsAltOp: [E, this](Instruction *I) {
13640 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13641 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
13642 TLI: *TLI);
13643 },
13644 Mask, OpScalars: &OpScalars, AltScalars: &AltScalars);
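    // Editorial example (illustrative scalars): for the bundle
    // {add, sub, add, sub}, V0 is the vector add, V1 is the vector sub, and
    // the mask selects {0, 4 + 1, 2, 4 + 3} = {0, 5, 2, 7}, i.e. even lanes
    // come from V0 and odd lanes from V1 in the final shuffle.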
13645
13646 propagateIRFlags(I: V0, VL: OpScalars, OpValue: E->getMainOp(), IncludeWrapFlags: It == MinBWs.end());
13647 propagateIRFlags(I: V1, VL: AltScalars, OpValue: E->getAltOp(), IncludeWrapFlags: It == MinBWs.end());
13648 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13649 // Drop nuw flags for abs(sub(commutative), true).
13650 if (auto *I = dyn_cast<Instruction>(Val: Vec);
13651 I && Opcode == Instruction::Sub && !MinBWs.contains(Val: E) &&
13652 any_of(Range&: E->Scalars, P: [](Value *V) {
13653 auto *IV = cast<Instruction>(Val: V);
13654 return IV->getOpcode() == Instruction::Sub &&
13655 isCommutative(I: cast<Instruction>(Val: IV));
13656 }))
13657 I->setHasNoUnsignedWrap(/*b=*/false);
13658 };
13659 DropNuwFlag(V0, E->getOpcode());
13660 DropNuwFlag(V1, E->getAltOpcode());
13661
13662 Value *V = Builder.CreateShuffleVector(V1: V0, V2: V1, Mask);
13663 if (auto *I = dyn_cast<Instruction>(Val: V)) {
13664 V = propagateMetadata(I, VL: E->Scalars);
13665 GatherShuffleExtractSeq.insert(X: I);
13666 CSEBlocks.insert(V: I->getParent());
13667 }
13668
13669 E->VectorizedValue = V;
13670 ++NumVectorInstructions;
13671
13672 return V;
13673 }
13674 default:
13675 llvm_unreachable("unknown inst");
13676 }
13677 return nullptr;
13678}
13679
13680Value *BoUpSLP::vectorizeTree() {
13681 ExtraValueToDebugLocsMap ExternallyUsedValues;
13682 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13683 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13684}
13685
13686namespace {
/// Data type for handling buildvector sequences with scalars reused from
/// other tree entries.
13689struct ShuffledInsertData {
13690 /// List of insertelements to be replaced by shuffles.
13691 SmallVector<InsertElementInst *> InsertElements;
13692 /// The parent vectors and shuffle mask for the given list of inserts.
13693 MapVector<Value *, SmallVector<int>> ValueMasks;
13694};
13695} // namespace
13696
13697Value *BoUpSLP::vectorizeTree(
13698 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13699 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13700 Instruction *ReductionRoot) {
13701 // All blocks must be scheduled before any instructions are inserted.
13702 for (auto &BSIter : BlocksSchedules) {
13703 scheduleBlock(BS: BSIter.second.get());
13704 }
  // Clear the Entry-to-LastInstruction table: it can be invalidated by
  // scheduling, so it needs to be rebuilt.
13707 EntryToLastInstruction.clear();
13708
13709 if (ReductionRoot)
13710 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
13711 IP: ReductionRoot->getIterator());
13712 else
13713 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
13714
13715 // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
13716 (void)vectorizeTree(E: VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13717 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13718 if (TE->State == TreeEntry::Vectorize &&
13719 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13720 TE->VectorizedValue)
13721 (void)vectorizeTree(E: TE.get(), /*PostponedPHIs=*/false);
  // Run through the list of postponed gathers and emit them, replacing the
  // temporarily emitted allocas with the actual vector instructions.
13724 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13725 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13726 for (const TreeEntry *E : PostponedNodes) {
13727 auto *TE = const_cast<TreeEntry *>(E);
13728 if (auto *VecTE = getTreeEntry(V: TE->Scalars.front()))
13729 if (VecTE->isSame(VL: TE->UserTreeIndices.front().UserTE->getOperand(
13730 OpIdx: TE->UserTreeIndices.front().EdgeIdx)) &&
13731 VecTE->isSame(VL: TE->Scalars))
        // Found a gather node which is exactly the same as one of the
        // vectorized nodes. This may happen after reordering.
13734 continue;
13735 auto *PrevVec = cast<Instruction>(Val&: TE->VectorizedValue);
13736 TE->VectorizedValue = nullptr;
13737 auto *UserI =
13738 cast<Instruction>(Val&: TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were some
    // unresolved dependencies at the moment the stub instruction was emitted.
    // If any of these dependencies turn out to be an operand of another PHI
    // from the same block, the position of the stub instruction becomes
    // invalid: the source vector that is supposed to feed this gather node was
    // inserted at the end of the block, after the stub instruction. So we need
    // to move the insertion point back to the end of the block.
13747 if (isa<PHINode>(Val: UserI)) {
13748 // Insert before all users.
13749 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13750 for (User *U : PrevVec->users()) {
13751 if (U == UserI)
13752 continue;
13753 auto *UI = dyn_cast<Instruction>(Val: U);
13754 if (!UI || isa<PHINode>(Val: UI) || UI->getParent() != InsertPt->getParent())
13755 continue;
13756 if (UI->comesBefore(Other: InsertPt))
13757 InsertPt = UI;
13758 }
13759 Builder.SetInsertPoint(InsertPt);
13760 } else {
13761 Builder.SetInsertPoint(PrevVec);
13762 }
13763 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13764 Value *Vec = vectorizeTree(E: TE, /*PostponedPHIs=*/false);
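    // Minimal bitwidth analysis may have narrowed the re-emitted gather, so
    // its type can differ from the stub's type. Recover the signedness from
    // the matching tree entries and cast back to the stub's type before
    // replacing its uses.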
13765 if (Vec->getType() != PrevVec->getType()) {
13766 assert(Vec->getType()->isIntOrIntVectorTy() &&
13767 PrevVec->getType()->isIntOrIntVectorTy() &&
13768 "Expected integer vector types only.");
13769 std::optional<bool> IsSigned;
13770 for (Value *V : TE->Scalars) {
13771 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13772 auto It = MinBWs.find(Val: BaseTE);
13773 if (It != MinBWs.end()) {
13774 IsSigned = IsSigned.value_or(u: false) || It->second.second;
13775 if (*IsSigned)
13776 break;
13777 }
13778 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(Val: V)) {
13779 auto It = MinBWs.find(Val: MNTE);
13780 if (It != MinBWs.end()) {
13781 IsSigned = IsSigned.value_or(u: false) || It->second.second;
13782 if (*IsSigned)
13783 break;
13784 }
13785 }
13786 if (IsSigned.value_or(u: false))
13787 break;
13788 // Scan through gather nodes.
13789 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
13790 auto It = MinBWs.find(Val: BVE);
13791 if (It != MinBWs.end()) {
13792 IsSigned = IsSigned.value_or(u: false) || It->second.second;
13793 if (*IsSigned)
13794 break;
13795 }
13796 }
13797 if (IsSigned.value_or(u: false))
13798 break;
13799 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V)) {
13800 IsSigned =
13801 IsSigned.value_or(u: false) ||
13802 !isKnownNonNegative(V: EE->getVectorOperand(), SQ: SimplifyQuery(*DL));
13803 continue;
13804 }
13805 if (IsSigned.value_or(u: false))
13806 break;
13807 }
13808 }
13809 if (IsSigned.value_or(u: false)) {
13810 // Final attempt - check user node.
13811 auto It = MinBWs.find(Val: TE->UserTreeIndices.front().UserTE);
13812 if (It != MinBWs.end())
13813 IsSigned = It->second.second;
13814 }
13815 assert(IsSigned &&
13816 "Expected user node or perfect diamond match in MinBWs.");
13817 Vec = Builder.CreateIntCast(V: Vec, DestTy: PrevVec->getType(), isSigned: *IsSigned);
13818 }
13819 PrevVec->replaceAllUsesWith(V: Vec);
13820 PostponedValues.try_emplace(Key: Vec).first->second.push_back(Elt: TE);
    // Replace the stub vector node if it was already used for one of the
    // buildvector nodes.
13823 auto It = PostponedValues.find(Val: PrevVec);
13824 if (It != PostponedValues.end()) {
13825 for (TreeEntry *VTE : It->getSecond())
13826 VTE->VectorizedValue = Vec;
13827 }
13828 eraseInstruction(I: PrevVec);
13829 }
13830
13831 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
13832 << " values .\n");
13833
13834 SmallVector<ShuffledInsertData> ShuffledInserts;
  // Maps a vector instruction to the original insertelement instruction.
13836 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13837 // Maps extract Scalar to the corresponding extractelement instruction in the
13838 // basic block. Only one extractelement per block should be emitted.
13839 DenseMap<Value *,
13840 DenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13841 ScalarToEEs;
13842 SmallDenseSet<Value *, 4> UsedInserts;
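  // Caches integer casts of vectorized values, keyed by the vector and the
  // element type it was cast to, so the same cast is reused across several
  // external insertelement users.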
13843 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13844 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13845 // Extract all of the elements with the external uses.
13846 for (const auto &ExternalUse : ExternalUses) {
13847 Value *Scalar = ExternalUse.Scalar;
13848 llvm::User *User = ExternalUse.User;
13849
    // Skip users that we have already RAUWed. This happens when one
    // instruction has multiple uses of the same value.
13852 if (User && !is_contained(Range: Scalar->users(), Element: User))
13853 continue;
13854 TreeEntry *E = getTreeEntry(V: Scalar);
13855 assert(E && "Invalid scalar");
13856 assert(!E->isGather() && "Extracting from a gather list");
13857 // Non-instruction pointers are not deleted, just skip them.
13858 if (E->getOpcode() == Instruction::GetElementPtr &&
13859 !isa<GetElementPtrInst>(Val: Scalar))
13860 continue;
13861
13862 Value *Vec = E->VectorizedValue;
13863 assert(Vec && "Can't find vectorizable value");
13864
13865 Value *Lane = Builder.getInt32(C: ExternalUse.Lane);
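    // Emits (or reuses) an extractelement for Scalar from Vec at the current
    // insert point; if the tree entry was demoted, the extracted value is
    // widened back to Scalar's original type.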
13866 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13867 if (Scalar->getType() != Vec->getType()) {
13868 Value *Ex = nullptr;
13869 Value *ExV = nullptr;
13870 auto *GEP = dyn_cast<GetElementPtrInst>(Val: Scalar);
13871 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(Ptr: GEP);
13872 auto It = ScalarToEEs.find(Val: Scalar);
13873 if (It != ScalarToEEs.end()) {
13874 // No need to emit many extracts, just move the only one in the
13875 // current block.
13876 auto EEIt = It->second.find(Val: Builder.GetInsertBlock());
13877 if (EEIt != It->second.end()) {
13878 Instruction *I = EEIt->second.first;
13879 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13880 Builder.GetInsertPoint()->comesBefore(Other: I)) {
13881 I->moveBefore(BB&: *Builder.GetInsertPoint()->getParent(),
13882 I: Builder.GetInsertPoint());
13883 if (auto *CI = EEIt->second.second)
13884 CI->moveAfter(MovePos: I);
13885 }
13886 Ex = I;
13887 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13888 }
13889 }
13890 if (!Ex) {
13891 // "Reuse" the existing extract to improve final codegen.
13892 if (auto *ES = dyn_cast<ExtractElementInst>(Val: Scalar)) {
13893 Value *V = ES->getVectorOperand();
13894 if (const TreeEntry *ETE = getTreeEntry(V))
13895 V = ETE->VectorizedValue;
13896 Ex = Builder.CreateExtractElement(Vec: V, Idx: ES->getIndexOperand());
13897 } else if (ReplaceGEP) {
            // Leave the GEPs as-is; they are free in most cases and it is
            // better to keep them as GEPs.
13900 auto *CloneGEP = GEP->clone();
13901 if (isa<Instruction>(Val: Vec))
13902 CloneGEP->insertBefore(BB&: *Builder.GetInsertBlock(),
13903 InsertPos: Builder.GetInsertPoint());
13904 else
13905 CloneGEP->insertBefore(InsertPos: GEP);
13906 if (GEP->hasName())
13907 CloneGEP->takeName(V: GEP);
13908 Ex = CloneGEP;
13909 } else {
13910 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
13911 }
13912 // If necessary, sign-extend or zero-extend ScalarRoot
13913 // to the larger type.
13914 ExV = Ex;
13915 if (Scalar->getType() != Ex->getType())
13916 ExV = Builder.CreateIntCast(V: Ex, DestTy: Scalar->getType(),
13917 isSigned: MinBWs.find(Val: E)->second.second);
13918 if (auto *I = dyn_cast<Instruction>(Val: Ex))
13919 ScalarToEEs[Scalar].try_emplace(
13920 Key: Builder.GetInsertBlock(),
13921 Args: std::make_pair(x&: I, y: cast<Instruction>(Val: ExV)));
13922 }
        // The then-branch of the previous if may produce constants, since
        // operand 0 might be a constant.
13925 if (auto *ExI = dyn_cast<Instruction>(Val: Ex)) {
13926 GatherShuffleExtractSeq.insert(X: ExI);
13927 CSEBlocks.insert(V: ExI->getParent());
13928 }
13929 return ExV;
13930 }
13931 assert(isa<FixedVectorType>(Scalar->getType()) &&
13932 isa<InsertElementInst>(Scalar) &&
13933 "In-tree scalar of vector type is not insertelement?");
13934 auto *IE = cast<InsertElementInst>(Val: Scalar);
13935 VectorToInsertElement.try_emplace(Key: Vec, Args&: IE);
13936 return Vec;
13937 };
    // If User == nullptr, the Scalar remains as a scalar in the vectorized
    // instructions or is used as an extra argument. Generate an ExtractElement
    // instruction and update the record for this scalar in
    // ExternallyUsedValues.
13941 if (!User) {
13942 if (!ScalarsWithNullptrUser.insert(V: Scalar).second)
13943 continue;
13944 assert((ExternallyUsedValues.count(Scalar) ||
13945 Scalar->hasNUsesOrMore(UsesLimit) ||
13946 any_of(Scalar->users(),
13947 [&](llvm::User *U) {
13948 if (ExternalUsesAsGEPs.contains(U))
13949 return true;
13950 TreeEntry *UseEntry = getTreeEntry(U);
13951 return UseEntry &&
13952 (UseEntry->State == TreeEntry::Vectorize ||
13953 UseEntry->State ==
13954 TreeEntry::StridedVectorize) &&
13955 (E->State == TreeEntry::Vectorize ||
13956 E->State == TreeEntry::StridedVectorize) &&
13957 doesInTreeUserNeedToExtract(
13958 Scalar,
13959 cast<Instruction>(UseEntry->Scalars.front()),
13960 TLI);
13961 })) &&
13962 "Scalar with nullptr User must be registered in "
13963 "ExternallyUsedValues map or remain as scalar in vectorized "
13964 "instructions");
13965 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
13966 if (auto *PHI = dyn_cast<PHINode>(Val: VecI))
13967 Builder.SetInsertPoint(TheBB: PHI->getParent(),
13968 IP: PHI->getParent()->getFirstNonPHIIt());
13969 else
13970 Builder.SetInsertPoint(TheBB: VecI->getParent(),
13971 IP: std::next(x: VecI->getIterator()));
13972 } else {
13973 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
13974 }
13975 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13976 // Required to update internally referenced instructions.
13977 Scalar->replaceAllUsesWith(V: NewInst);
13978 ReplacedExternals.emplace_back(Args&: Scalar, Args&: NewInst);
13979 continue;
13980 }
13981
13982 if (auto *VU = dyn_cast<InsertElementInst>(Val: User);
13983 VU && VU->getOperand(i_nocapture: 1) == Scalar) {
13984 // Skip if the scalar is another vector op or Vec is not an instruction.
13985 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Val: Vec)) {
13986 if (auto *FTy = dyn_cast<FixedVectorType>(Val: User->getType())) {
13987 if (!UsedInserts.insert(V: VU).second)
13988 continue;
13989 // Need to use original vector, if the root is truncated.
13990 auto BWIt = MinBWs.find(Val: E);
13991 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13992 auto *ScalarTy = FTy->getElementType();
13993 auto Key = std::make_pair(x&: Vec, y&: ScalarTy);
13994 auto VecIt = VectorCasts.find(Val: Key);
13995 if (VecIt == VectorCasts.end()) {
13996 IRBuilderBase::InsertPointGuard Guard(Builder);
13997 if (auto *IVec = dyn_cast<PHINode>(Val: Vec))
13998 Builder.SetInsertPoint(
13999 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
14000 else if (auto *IVec = dyn_cast<Instruction>(Val: Vec))
14001 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
14002 Vec = Builder.CreateIntCast(
14003 V: Vec,
14004 DestTy: getWidenedType(
14005 ScalarTy,
14006 VF: cast<FixedVectorType>(Val: Vec->getType())->getNumElements()),
14007 isSigned: BWIt->second.second);
14008 VectorCasts.try_emplace(Key, Args&: Vec);
14009 } else {
14010 Vec = VecIt->second;
14011 }
14012 }
14013
14014 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
14015 if (InsertIdx) {
14016 auto *It =
14017 find_if(Range&: ShuffledInserts, P: [VU](const ShuffledInsertData &Data) {
14018 // Checks if 2 insertelements are from the same buildvector.
14019 InsertElementInst *VecInsert = Data.InsertElements.front();
14020 return areTwoInsertFromSameBuildVector(
14021 VU, V: VecInsert,
14022 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); });
14023 });
14024 unsigned Idx = *InsertIdx;
14025 if (It == ShuffledInserts.end()) {
14026 (void)ShuffledInserts.emplace_back();
14027 It = std::next(x: ShuffledInserts.begin(),
14028 n: ShuffledInserts.size() - 1);
14029 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14030 if (Mask.empty())
14031 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
14032 // Find the insertvector, vectorized in tree, if any.
14033 Value *Base = VU;
14034 while (auto *IEBase = dyn_cast<InsertElementInst>(Val: Base)) {
14035 if (IEBase != User &&
14036 (!IEBase->hasOneUse() ||
14037 getElementIndex(Inst: IEBase).value_or(u&: Idx) == Idx))
14038 break;
14039 // Build the mask for the vectorized insertelement instructions.
14040 if (const TreeEntry *E = getTreeEntry(V: IEBase)) {
14041 do {
14042 IEBase = cast<InsertElementInst>(Val: Base);
14043 int IEIdx = *getElementIndex(Inst: IEBase);
14044 assert(Mask[IEIdx] == PoisonMaskElem &&
14045 "InsertElementInstruction used already.");
14046 Mask[IEIdx] = IEIdx;
14047 Base = IEBase->getOperand(i_nocapture: 0);
14048 } while (E == getTreeEntry(V: Base));
14049 break;
14050 }
14051 Base = cast<InsertElementInst>(Val: Base)->getOperand(i_nocapture: 0);
              // After vectorization the def-use chain has changed, so we need
              // to look through the original insertelement instructions if
              // they were replaced by vector instructions.
14055 auto It = VectorToInsertElement.find(Val: Base);
14056 if (It != VectorToInsertElement.end())
14057 Base = It->second;
14058 }
14059 }
14060 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
14061 if (Mask.empty())
14062 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
14063 Mask[Idx] = ExternalUse.Lane;
14064 It->InsertElements.push_back(Elt: cast<InsertElementInst>(Val: User));
14065 continue;
14066 }
14067 }
14068 }
14069 }
14070
14071 // Generate extracts for out-of-tree users.
14072 // Find the insertion point for the extractelement lane.
14073 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
14074 if (PHINode *PH = dyn_cast<PHINode>(Val: User)) {
14075 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
14076 if (PH->getIncomingValue(i: I) == Scalar) {
14077 Instruction *IncomingTerminator =
14078 PH->getIncomingBlock(i: I)->getTerminator();
14079 if (isa<CatchSwitchInst>(Val: IncomingTerminator)) {
14080 Builder.SetInsertPoint(TheBB: VecI->getParent(),
14081 IP: std::next(x: VecI->getIterator()));
14082 } else {
14083 Builder.SetInsertPoint(PH->getIncomingBlock(i: I)->getTerminator());
14084 }
14085 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14086 PH->setOperand(i_nocapture: I, Val_nocapture: NewInst);
14087 }
14088 }
14089 } else {
14090 Builder.SetInsertPoint(cast<Instruction>(Val: User));
14091 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14092 User->replaceUsesOfWith(From: Scalar, To: NewInst);
14093 }
14094 } else {
14095 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
14096 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
14097 User->replaceUsesOfWith(From: Scalar, To: NewInst);
14098 }
14099
14100 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
14101 }
14102
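  // Builds a one- or two-source shuffle from the combined mask: indices below
  // V1's width select from V1, the rest are rebased onto V2.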
14103 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14104 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
14105 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
14106 int VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
14107 for (int I = 0, E = Mask.size(); I < E; ++I) {
14108 if (Mask[I] < VF)
14109 CombinedMask1[I] = Mask[I];
14110 else
14111 CombinedMask2[I] = Mask[I] - VF;
14112 }
14113 ShuffleInstructionBuilder ShuffleBuilder(
14114 cast<VectorType>(Val: V1->getType())->getElementType(), Builder, *this);
14115 ShuffleBuilder.add(V1, Mask: CombinedMask1);
14116 if (V2)
14117 ShuffleBuilder.add(V1: V2, Mask: CombinedMask2);
14118 return ShuffleBuilder.finalize(ExtMask: std::nullopt);
14119 };
14120
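  // Brings Vec to the vector factor implied by Mask: if the mask already
  // selects lanes at or beyond that factor it is applied directly, otherwise
  // an identity resize over the referenced lanes is emitted.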
14121 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
14122 bool ForSingleMask) {
14123 unsigned VF = Mask.size();
14124 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
14125 if (VF != VecVF) {
14126 if (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
14127 Vec = CreateShuffle(Vec, nullptr, Mask);
14128 return std::make_pair(x&: Vec, y: true);
14129 }
14130 if (!ForSingleMask) {
14131 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14132 for (unsigned I = 0; I < VF; ++I) {
14133 if (Mask[I] != PoisonMaskElem)
14134 ResizeMask[Mask[I]] = Mask[I];
14135 }
14136 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
14137 }
14138 }
14139
14140 return std::make_pair(x&: Vec, y: false);
14141 };
  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
14144 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
14145 // Find the first and the last instruction in the list of insertelements.
14146 sort(C&: ShuffledInserts[I].InsertElements, Comp: isFirstInsertElement);
14147 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
14148 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
14149 Builder.SetInsertPoint(LastInsert);
14150 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
14151 Value *NewInst = performExtractsShuffleAction<Value>(
14152 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()),
14153 Base: FirstInsert->getOperand(i_nocapture: 0),
14154 GetVF: [](Value *Vec) {
14155 return cast<VectorType>(Val: Vec->getType())
14156 ->getElementCount()
14157 .getKnownMinValue();
14158 },
14159 ResizeAction: ResizeToVF,
14160 Action: [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
14161 ArrayRef<Value *> Vals) {
14162 assert((Vals.size() == 1 || Vals.size() == 2) &&
14163 "Expected exactly 1 or 2 input values.");
14164 if (Vals.size() == 1) {
14165 // Do not create shuffle if the mask is a simple identity
14166 // non-resizing mask.
14167 if (Mask.size() != cast<FixedVectorType>(Val: Vals.front()->getType())
14168 ->getNumElements() ||
14169 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
14170 return CreateShuffle(Vals.front(), nullptr, Mask);
14171 return Vals.front();
14172 }
14173 return CreateShuffle(Vals.front() ? Vals.front()
14174 : FirstInsert->getOperand(i_nocapture: 0),
14175 Vals.back(), Mask);
14176 });
14177 auto It = ShuffledInserts[I].InsertElements.rbegin();
14178 // Rebuild buildvector chain.
14179 InsertElementInst *II = nullptr;
14180 if (It != ShuffledInserts[I].InsertElements.rend())
14181 II = *It;
14182 SmallVector<Instruction *> Inserts;
14183 while (It != ShuffledInserts[I].InsertElements.rend()) {
14184 assert(II && "Must be an insertelement instruction.");
14185 if (*It == II)
14186 ++It;
14187 else
14188 Inserts.push_back(Elt: cast<Instruction>(Val: II));
14189 II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
14190 }
14191 for (Instruction *II : reverse(C&: Inserts)) {
14192 II->replaceUsesOfWith(From: II->getOperand(i: 0), To: NewInst);
14193 if (auto *NewI = dyn_cast<Instruction>(Val: NewInst))
14194 if (II->getParent() == NewI->getParent() && II->comesBefore(Other: NewI))
14195 II->moveAfter(MovePos: NewI);
14196 NewInst = II;
14197 }
14198 LastInsert->replaceAllUsesWith(V: NewInst);
14199 for (InsertElementInst *IE : reverse(C&: ShuffledInserts[I].InsertElements)) {
14200 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 0),
14201 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 0)->getType()));
14202 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 1),
14203 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 1)->getType()));
14204 eraseInstruction(I: IE);
14205 }
14206 CSEBlocks.insert(V: LastInsert->getParent());
14207 }
14208
14209 SmallVector<Instruction *> RemovedInsts;
14210 // For each vectorized value:
14211 for (auto &TEPtr : VectorizableTree) {
14212 TreeEntry *Entry = TEPtr.get();
14213
14214 // No need to handle users of gathered values.
14215 if (Entry->isGather())
14216 continue;
14217
14218 assert(Entry->VectorizedValue && "Can't find vectorizable value");
14219
14220 // For each lane:
14221 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
14222 Value *Scalar = Entry->Scalars[Lane];
14223
14224 if (Entry->getOpcode() == Instruction::GetElementPtr &&
14225 !isa<GetElementPtrInst>(Val: Scalar))
14226 continue;
14227#ifndef NDEBUG
14228 Type *Ty = Scalar->getType();
14229 if (!Ty->isVoidTy()) {
14230 for (User *U : Scalar->users()) {
14231 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
14232
14233 // It is legal to delete users in the ignorelist.
14234 assert((getTreeEntry(U) ||
14235 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14236 (isa_and_nonnull<Instruction>(U) &&
14237 isDeleted(cast<Instruction>(U)))) &&
14238 "Deleting out-of-tree value");
14239 }
14240 }
14241#endif
14242 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
14243 auto *I = cast<Instruction>(Val: Scalar);
14244 RemovedInsts.push_back(Elt: I);
14245 }
14246 }
14247
14248 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
14249 // new vector instruction.
14250 if (auto *V = dyn_cast<Instruction>(Val&: VectorizableTree[0]->VectorizedValue))
14251 V->mergeDIAssignID(SourceInstructions: RemovedInsts);
14252
14253 // Clear up reduction references, if any.
14254 if (UserIgnoreList) {
14255 for (Instruction *I : RemovedInsts) {
14256 if (getTreeEntry(V: I)->Idx != 0)
14257 continue;
14258 SmallVector<SelectInst *> LogicalOpSelects;
14259 I->replaceUsesWithIf(New: PoisonValue::get(T: I->getType()), ShouldReplace: [&](Use &U) {
      // Do not replace the condition of logical ops in select form
      // (select <cond>, ...).
14261 bool IsPoisoningLogicalOp = isa<SelectInst>(Val: U.getUser()) &&
14262 (match(V: U.getUser(), P: m_LogicalAnd()) ||
14263 match(V: U.getUser(), P: m_LogicalOr())) &&
14264 U.getOperandNo() == 0;
14265 if (IsPoisoningLogicalOp) {
14266 LogicalOpSelects.push_back(Elt: cast<SelectInst>(Val: U.getUser()));
14267 return false;
14268 }
14269 return UserIgnoreList->contains(V: U.getUser());
14270 });
14271 // Replace conditions of the poisoning logical ops with the non-poison
14272 // constant value.
14273 for (SelectInst *SI : LogicalOpSelects)
14274 SI->setCondition(Constant::getNullValue(Ty: SI->getCondition()->getType()));
14275 }
14276 }
14277 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
14278 // cache correctness.
  // NOTE: removeInstructionsAndOperands only marks the instructions for
  // deletion - they are not actually deleted until later.
14281 removeInstructionsAndOperands(DeadVals: ArrayRef(RemovedInsts));
14282
14283 Builder.ClearInsertionPoint();
14284 InstrElementSize.clear();
14285
14286 const TreeEntry &RootTE = *VectorizableTree.front();
14287 Value *Vec = RootTE.VectorizedValue;
14288 if (auto It = MinBWs.find(Val: &RootTE); ReductionBitWidth != 0 &&
14289 It != MinBWs.end() &&
14290 ReductionBitWidth != It->second.first) {
14291 IRBuilder<>::InsertPointGuard Guard(Builder);
14292 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
14293 IP: ReductionRoot->getIterator());
14294 Vec = Builder.CreateIntCast(
14295 V: Vec,
14296 DestTy: VectorType::get(ElementType: Builder.getIntNTy(N: ReductionBitWidth),
14297 EC: cast<VectorType>(Val: Vec->getType())->getElementCount()),
14298 isSigned: It->second.second);
14299 }
14300 return Vec;
14301}
14302
14303void BoUpSLP::optimizeGatherSequence() {
14304 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequence instructions.\n");
14306 // LICM InsertElementInst sequences.
14307 for (Instruction *I : GatherShuffleExtractSeq) {
14308 if (isDeleted(I))
14309 continue;
14310
14311 // Check if this block is inside a loop.
14312 Loop *L = LI->getLoopFor(BB: I->getParent());
14313 if (!L)
14314 continue;
14315
14316 // Check if it has a preheader.
14317 BasicBlock *PreHeader = L->getLoopPreheader();
14318 if (!PreHeader)
14319 continue;
14320
    // If the vector or the element that we insert into it is an instruction
    // that is defined inside this loop, then we can't hoist this instruction
    // out of the loop.
14324 if (any_of(Range: I->operands(), P: [L](Value *V) {
14325 auto *OpI = dyn_cast<Instruction>(Val: V);
14326 return OpI && L->contains(Inst: OpI);
14327 }))
14328 continue;
14329
14330 // We can hoist this instruction. Move it to the pre-header.
14331 I->moveBefore(MovePos: PreHeader->getTerminator());
14332 CSEBlocks.insert(V: PreHeader);
14333 }
14334
14335 // Make a list of all reachable blocks in our CSE queue.
14336 SmallVector<const DomTreeNode *, 8> CSEWorkList;
14337 CSEWorkList.reserve(N: CSEBlocks.size());
14338 for (BasicBlock *BB : CSEBlocks)
14339 if (DomTreeNode *N = DT->getNode(BB)) {
14340 assert(DT->isReachableFromEntry(N));
14341 CSEWorkList.push_back(Elt: N);
14342 }
14343
14344 // Sort blocks by domination. This ensures we visit a block after all blocks
14345 // dominating it are visited.
14346 llvm::sort(C&: CSEWorkList, Comp: [](const DomTreeNode *A, const DomTreeNode *B) {
14347 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
14348 "Different nodes should have different DFS numbers");
14349 return A->getDFSNumIn() < B->getDFSNumIn();
14350 });
14351
  // Less defined shuffles can be replaced by their more defined copies.
  // Between two shuffles, one is less defined if it has the same vector
  // operands and each of its mask indices is either the same as in the other
  // one or undef. E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined
  // than shuffle %0, poison, <0, 0, 0, 0>.
14357 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
14358 SmallVectorImpl<int> &NewMask) {
14359 if (I1->getType() != I2->getType())
14360 return false;
14361 auto *SI1 = dyn_cast<ShuffleVectorInst>(Val: I1);
14362 auto *SI2 = dyn_cast<ShuffleVectorInst>(Val: I2);
14363 if (!SI1 || !SI2)
14364 return I1->isIdenticalTo(I: I2);
14365 if (SI1->isIdenticalTo(I: SI2))
14366 return true;
14367 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
14368 if (SI1->getOperand(i_nocapture: I) != SI2->getOperand(i_nocapture: I))
14369 return false;
14370 // Check if the second instruction is more defined than the first one.
14371 NewMask.assign(in_start: SI2->getShuffleMask().begin(), in_end: SI2->getShuffleMask().end());
14372 ArrayRef<int> SM1 = SI1->getShuffleMask();
14373 // Count trailing undefs in the mask to check the final number of used
14374 // registers.
14375 unsigned LastUndefsCnt = 0;
14376 for (int I = 0, E = NewMask.size(); I < E; ++I) {
14377 if (SM1[I] == PoisonMaskElem)
14378 ++LastUndefsCnt;
14379 else
14380 LastUndefsCnt = 0;
14381 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
14382 NewMask[I] != SM1[I])
14383 return false;
14384 if (NewMask[I] == PoisonMaskElem)
14385 NewMask[I] = SM1[I];
14386 }
14387 // Check if the last undefs actually change the final number of used vector
14388 // registers.
14389 return SM1.size() - LastUndefsCnt > 1 &&
14390 TTI->getNumberOfParts(Tp: SI1->getType()) ==
14391 TTI->getNumberOfParts(
14392 Tp: getWidenedType(ScalarTy: SI1->getType()->getElementType(),
14393 VF: SM1.size() - LastUndefsCnt));
14394 };
14395 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
14396 // instructions. TODO: We can further optimize this scan if we split the
14397 // instructions into different buckets based on the insert lane.
14398 SmallVector<Instruction *, 16> Visited;
14399 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
14400 assert(*I &&
14401 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
14402 "Worklist not sorted properly!");
14403 BasicBlock *BB = (*I)->getBlock();
14404 // For all instructions in blocks containing gather sequences:
14405 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
14406 if (isDeleted(I: &In))
14407 continue;
14408 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(Val: &In) &&
14409 !GatherShuffleExtractSeq.contains(key: &In))
14410 continue;
14411
14412 // Check if we can replace this instruction with any of the
14413 // visited instructions.
14414 bool Replaced = false;
14415 for (Instruction *&V : Visited) {
14416 SmallVector<int> NewMask;
14417 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
14418 DT->dominates(A: V->getParent(), B: In.getParent())) {
14419 In.replaceAllUsesWith(V);
14420 eraseInstruction(I: &In);
14421 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: V))
14422 if (!NewMask.empty())
14423 SI->setShuffleMask(NewMask);
14424 Replaced = true;
14425 break;
14426 }
14427 if (isa<ShuffleVectorInst>(Val: In) && isa<ShuffleVectorInst>(Val: V) &&
14428 GatherShuffleExtractSeq.contains(key: V) &&
14429 IsIdenticalOrLessDefined(V, &In, NewMask) &&
14430 DT->dominates(A: In.getParent(), B: V->getParent())) {
14431 In.moveAfter(MovePos: V);
14432 V->replaceAllUsesWith(V: &In);
14433 eraseInstruction(I: V);
14434 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: &In))
14435 if (!NewMask.empty())
14436 SI->setShuffleMask(NewMask);
14437 V = &In;
14438 Replaced = true;
14439 break;
14440 }
14441 }
14442 if (!Replaced) {
14443 assert(!is_contained(Visited, &In));
14444 Visited.push_back(Elt: &In);
14445 }
14446 }
14447 }
14448 CSEBlocks.clear();
14449 GatherShuffleExtractSeq.clear();
14450}
14451
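// Links the ScheduleData of all schedulable values in VL into a single bundle
// headed by the first member; the head later acts as the scheduling entity for
// the whole bundle.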
14452BoUpSLP::ScheduleData *
14453BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
14454 ScheduleData *Bundle = nullptr;
14455 ScheduleData *PrevInBundle = nullptr;
14456 for (Value *V : VL) {
14457 if (doesNotNeedToBeScheduled(V))
14458 continue;
14459 ScheduleData *BundleMember = getScheduleData(V);
14460 assert(BundleMember &&
14461 "no ScheduleData for bundle member "
14462 "(maybe not in same basic block)");
14463 assert(BundleMember->isSchedulingEntity() &&
14464 "bundle member already part of other bundle");
14465 if (PrevInBundle) {
14466 PrevInBundle->NextInBundle = BundleMember;
14467 } else {
14468 Bundle = BundleMember;
14469 }
14470
14471 // Group the instructions to a bundle.
14472 BundleMember->FirstInBundle = Bundle;
14473 PrevInBundle = BundleMember;
14474 }
14475 assert(Bundle && "Failed to find schedule bundle");
14476 return Bundle;
14477}
14478
// Groups the instructions into a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
14481std::optional<BoUpSLP::ScheduleData *>
14482BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
14483 const InstructionsState &S) {
14484 // No need to schedule PHIs, insertelement, extractelement and extractvalue
14485 // instructions.
14486 if (isa<PHINode>(Val: S.OpValue) || isVectorLikeInstWithConstOps(V: S.OpValue) ||
14487 doesNotNeedToSchedule(VL))
14488 return nullptr;
14489
14490 // Initialize the instruction bundle.
14491 Instruction *OldScheduleEnd = ScheduleEnd;
14492 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
14493
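  // Clears stale dependencies if the scheduling region grew, computes the
  // dependencies of the new bundle (if any), and keeps scheduling ready
  // bundles until the new bundle becomes ready (or, when there is no bundle,
  // until the ready list drains).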
14494 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
14495 ScheduleData *Bundle) {
14496 // The scheduling region got new instructions at the lower end (or it is a
14497 // new region for the first bundle). This makes it necessary to
14498 // recalculate all dependencies.
14499 // It is seldom that this needs to be done a second time after adding the
14500 // initial bundle to the region.
14501 if (ScheduleEnd != OldScheduleEnd) {
14502 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
14503 doForAllOpcodes(V: I, Action: [](ScheduleData *SD) { SD->clearDependencies(); });
14504 ReSchedule = true;
14505 }
14506 if (Bundle) {
14507 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
14508 << " in block " << BB->getName() << "\n");
14509 calculateDependencies(SD: Bundle, /*InsertInReadyList=*/true, SLP);
14510 }
14511
14512 if (ReSchedule) {
14513 resetSchedule();
14514 initialFillReadyList(ReadyList&: ReadyInsts);
14515 }
14516
14517 // Now try to schedule the new bundle or (if no bundle) just calculate
14518 // dependencies. As soon as the bundle is "ready" it means that there are no
    // cyclic dependencies and we can schedule it. Note that it's important
    // that we don't "schedule" the bundle yet (see cancelScheduling).
14521 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14522 !ReadyInsts.empty()) {
14523 ScheduleData *Picked = ReadyInsts.pop_back_val();
14524 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14525 "must be ready to schedule");
14526 schedule(SD: Picked, ReadyList&: ReadyInsts);
14527 }
14528 };
14529
14530 // Make sure that the scheduling region contains all
14531 // instructions of the bundle.
14532 for (Value *V : VL) {
14533 if (doesNotNeedToBeScheduled(V))
14534 continue;
14535 if (!extendSchedulingRegion(V, S)) {
      // The scheduling region got new instructions at the lower end (or it is
      // a new region for the first bundle). This makes it necessary to
      // recalculate all dependencies.
      // Otherwise the compiler may crash trying to calculate dependencies
      // incorrectly and emit instructions in the wrong order during the actual
      // scheduling.
14542 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
14543 return std::nullopt;
14544 }
14545 }
14546
14547 bool ReSchedule = false;
14548 for (Value *V : VL) {
14549 if (doesNotNeedToBeScheduled(V))
14550 continue;
14551 ScheduleData *BundleMember = getScheduleData(V);
14552 assert(BundleMember &&
14553 "no ScheduleData for bundle member (maybe not in same basic block)");
14554
    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
14557 ReadyInsts.remove(X: BundleMember);
14558
14559 if (!BundleMember->IsScheduled)
14560 continue;
    // A bundle member was scheduled as a single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
14564 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
14565 << " was already scheduled\n");
14566 ReSchedule = true;
14567 }
14568
14569 auto *Bundle = buildBundle(VL);
14570 TryScheduleBundleImpl(ReSchedule, Bundle);
14571 if (!Bundle->isReady()) {
14572 cancelScheduling(VL, OpValue: S.OpValue);
14573 return std::nullopt;
14574 }
14575 return Bundle;
14576}
14577
14578void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
14579 Value *OpValue) {
14580 if (isa<PHINode>(Val: OpValue) || isVectorLikeInstWithConstOps(V: OpValue) ||
14581 doesNotNeedToSchedule(VL))
14582 return;
14583
14584 if (doesNotNeedToBeScheduled(V: OpValue))
14585 OpValue = *find_if_not(Range&: VL, P: doesNotNeedToBeScheduled);
14586 ScheduleData *Bundle = getScheduleData(V: OpValue);
14587 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
14588 assert(!Bundle->IsScheduled &&
14589 "Can't cancel bundle which is already scheduled");
14590 assert(Bundle->isSchedulingEntity() &&
14591 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
14592 "tried to unbundle something which is not a bundle");
14593
14594 // Remove the bundle from the ready list.
14595 if (Bundle->isReady())
14596 ReadyInsts.remove(X: Bundle);
14597
14598 // Un-bundle: make single instructions out of the bundle.
14599 ScheduleData *BundleMember = Bundle;
14600 while (BundleMember) {
14601 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
14602 BundleMember->FirstInBundle = BundleMember;
14603 ScheduleData *Next = BundleMember->NextInBundle;
14604 BundleMember->NextInBundle = nullptr;
14605 BundleMember->TE = nullptr;
14606 if (BundleMember->unscheduledDepsInBundle() == 0) {
14607 ReadyInsts.insert(X: BundleMember);
14608 }
14609 BundleMember = Next;
14610 }
14611}
14612
14613BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14614 // Allocate a new ScheduleData for the instruction.
14615 if (ChunkPos >= ChunkSize) {
14616 ScheduleDataChunks.push_back(Elt: std::make_unique<ScheduleData[]>(num: ChunkSize));
14617 ChunkPos = 0;
14618 }
14619 return &(ScheduleDataChunks.back()[ChunkPos++]);
14620}
14621
14622bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14623 const InstructionsState &S) {
14624 if (getScheduleData(V, Key: isOneOf(S, Op: V)))
14625 return true;
14626 Instruction *I = dyn_cast<Instruction>(Val: V);
14627 assert(I && "bundle member must be an instruction");
14628 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14629 !doesNotNeedToBeScheduled(I) &&
14630 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14631 "be scheduled");
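  // If I already has ScheduleData in the current region, allocate an extra
  // entry keyed by the bundle's main value (S.OpValue) so the same instruction
  // can be tracked separately for this bundle.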
14632 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14633 ScheduleData *ISD = getScheduleData(I);
14634 if (!ISD)
14635 return false;
14636 assert(isInSchedulingRegion(ISD) &&
14637 "ScheduleData not in scheduling region");
14638 ScheduleData *SD = allocateScheduleDataChunks();
14639 SD->Inst = I;
14640 SD->init(BlockSchedulingRegionID: SchedulingRegionID, OpVal: S.OpValue);
14641 ExtraScheduleDataMap[I][S.OpValue] = SD;
14642 return true;
14643 };
14644 if (CheckScheduleForI(I))
14645 return true;
14646 if (!ScheduleStart) {
14647 // It's the first instruction in the new region.
14648 initScheduleData(FromI: I, ToI: I->getNextNode(), PrevLoadStore: nullptr, NextLoadStore: nullptr);
14649 ScheduleStart = I;
14650 ScheduleEnd = I->getNextNode();
14651 if (isOneOf(S, Op: I) != I)
14652 CheckScheduleForI(I);
14653 assert(ScheduleEnd && "tried to vectorize a terminator?");
14654 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
14655 return true;
14656 }
14657 // Search up and down at the same time, because we don't know if the new
14658 // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so they are not
  // counted against the budget. Otherwise debug info could affect codegen.
14661 BasicBlock::reverse_iterator UpIter =
14662 ++ScheduleStart->getIterator().getReverse();
14663 BasicBlock::reverse_iterator UpperEnd = BB->rend();
14664 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14665 BasicBlock::iterator LowerEnd = BB->end();
14666 auto IsAssumeLikeIntr = [](const Instruction &I) {
14667 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
14668 return II->isAssumeLikeIntrinsic();
14669 return false;
14670 };
14671 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
14672 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
14673 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14674 &*DownIter != I) {
14675 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14676 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
14677 return false;
14678 }
14679
14680 ++UpIter;
14681 ++DownIter;
14682
14683 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
14684 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
14685 }
14686 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14687 assert(I->getParent() == ScheduleStart->getParent() &&
14688 "Instruction is in wrong basic block.");
14689 initScheduleData(FromI: I, ToI: ScheduleStart, PrevLoadStore: nullptr, NextLoadStore: FirstLoadStoreInRegion);
14690 ScheduleStart = I;
14691 if (isOneOf(S, Op: I) != I)
14692 CheckScheduleForI(I);
14693 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
14694 << "\n");
14695 return true;
14696 }
14697 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14698 "Expected to reach top of the basic block or instruction down the "
14699 "lower end.");
14700 assert(I->getParent() == ScheduleEnd->getParent() &&
14701 "Instruction is in wrong basic block.");
14702 initScheduleData(FromI: ScheduleEnd, ToI: I->getNextNode(), PrevLoadStore: LastLoadStoreInRegion,
14703 NextLoadStore: nullptr);
14704 ScheduleEnd = I->getNextNode();
14705 if (isOneOf(S, Op: I) != I)
14706 CheckScheduleForI(I);
14707 assert(ScheduleEnd && "tried to vectorize a terminator?");
14708 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
14709 return true;
14710}
14711
14712void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14713 Instruction *ToI,
14714 ScheduleData *PrevLoadStore,
14715 ScheduleData *NextLoadStore) {
14716 ScheduleData *CurrentLoadStore = PrevLoadStore;
14717 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14718 // No need to allocate data for non-schedulable instructions.
14719 if (doesNotNeedToBeScheduled(V: I))
14720 continue;
14721 ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
14722 if (!SD) {
14723 SD = allocateScheduleDataChunks();
14724 ScheduleDataMap[I] = SD;
14725 SD->Inst = I;
14726 }
14727 assert(!isInSchedulingRegion(SD) &&
14728 "new ScheduleData already in scheduling region");
14729 SD->init(BlockSchedulingRegionID: SchedulingRegionID, OpVal: I);
14730
14731 if (I->mayReadOrWriteMemory() &&
14732 (!isa<IntrinsicInst>(Val: I) ||
14733 (cast<IntrinsicInst>(Val: I)->getIntrinsicID() != Intrinsic::sideeffect &&
14734 cast<IntrinsicInst>(Val: I)->getIntrinsicID() !=
14735 Intrinsic::pseudoprobe))) {
14736 // Update the linked list of memory accessing instructions.
14737 if (CurrentLoadStore) {
14738 CurrentLoadStore->NextLoadStore = SD;
14739 } else {
14740 FirstLoadStoreInRegion = SD;
14741 }
14742 CurrentLoadStore = SD;
14743 }
14744
14745 if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
14746 match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
14747 RegionHasStackSave = true;
14748 }
14749 if (NextLoadStore) {
14750 if (CurrentLoadStore)
14751 CurrentLoadStore->NextLoadStore = NextLoadStore;
14752 } else {
14753 LastLoadStoreInRegion = CurrentLoadStore;
14754 }
14755}
14756
14757void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14758 bool InsertInReadyList,
14759 BoUpSLP *SLP) {
14760 assert(SD->isSchedulingEntity());
14761
14762 SmallVector<ScheduleData *, 10> WorkList;
14763 WorkList.push_back(Elt: SD);
14764
14765 while (!WorkList.empty()) {
14766 ScheduleData *SD = WorkList.pop_back_val();
14767 for (ScheduleData *BundleMember = SD; BundleMember;
14768 BundleMember = BundleMember->NextInBundle) {
14769 assert(isInSchedulingRegion(BundleMember));
14770 if (BundleMember->hasValidDependencies())
14771 continue;
14772
14773 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14774 << "\n");
14775 BundleMember->Dependencies = 0;
14776 BundleMember->resetUnscheduledDeps();
14777
14778 // Handle def-use chain dependencies.
14779 if (BundleMember->OpValue != BundleMember->Inst) {
14780 if (ScheduleData *UseSD = getScheduleData(I: BundleMember->Inst)) {
14781 BundleMember->Dependencies++;
14782 ScheduleData *DestBundle = UseSD->FirstInBundle;
14783 if (!DestBundle->IsScheduled)
14784 BundleMember->incrementUnscheduledDeps(Incr: 1);
14785 if (!DestBundle->hasValidDependencies())
14786 WorkList.push_back(Elt: DestBundle);
14787 }
14788 } else {
14789 for (User *U : BundleMember->Inst->users()) {
14790 if (ScheduleData *UseSD = getScheduleData(I: cast<Instruction>(Val: U))) {
14791 BundleMember->Dependencies++;
14792 ScheduleData *DestBundle = UseSD->FirstInBundle;
14793 if (!DestBundle->IsScheduled)
14794 BundleMember->incrementUnscheduledDeps(Incr: 1);
14795 if (!DestBundle->hasValidDependencies())
14796 WorkList.push_back(Elt: DestBundle);
14797 }
14798 }
14799 }
14800
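      // Records that I is control-dependent on BundleMember->Inst: the bundle
      // member gains a dependency that is only resolved once I is scheduled,
      // which keeps I ordered after it in the final schedule.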
14801 auto MakeControlDependent = [&](Instruction *I) {
14802 auto *DepDest = getScheduleData(I);
14803 assert(DepDest && "must be in schedule window");
14804 DepDest->ControlDependencies.push_back(Elt: BundleMember);
14805 BundleMember->Dependencies++;
14806 ScheduleData *DestBundle = DepDest->FirstInBundle;
14807 if (!DestBundle->IsScheduled)
14808 BundleMember->incrementUnscheduledDeps(Incr: 1);
14809 if (!DestBundle->hasValidDependencies())
14810 WorkList.push_back(Elt: DestBundle);
14811 };
14812
      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
14816 if (!isGuaranteedToTransferExecutionToSuccessor(I: BundleMember->Inst)) {
14817 for (Instruction *I = BundleMember->Inst->getNextNode();
14818 I != ScheduleEnd; I = I->getNextNode()) {
14819 if (isSafeToSpeculativelyExecute(I, CtxI: &*BB->begin(), AC: SLP->AC))
14820 continue;
14821
14822 // Add the dependency
14823 MakeControlDependent(I);
14824
14825 if (!isGuaranteedToTransferExecutionToSuccessor(I))
14826 // Everything past here must be control dependent on I.
14827 break;
14828 }
14829 }
14830
14831 if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from reordering above a preceding stackrestore.
14835 if (match(V: BundleMember->Inst, P: m_Intrinsic<Intrinsic::stacksave>()) ||
14836 match(V: BundleMember->Inst, P: m_Intrinsic<Intrinsic::stackrestore>())) {
14837 for (Instruction *I = BundleMember->Inst->getNextNode();
14838 I != ScheduleEnd; I = I->getNextNode()) {
14839 if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
14840 match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
14841 // Any allocas past here must be control dependent on I, and I
            // must be memory dependent on BundleMember->Inst.
14843 break;
14844
14845 if (!isa<AllocaInst>(Val: I))
14846 continue;
14847
14848 // Add the dependency
14849 MakeControlDependent(I);
14850 }
14851 }
14852
        // In addition to the cases handled just above, we need to prevent
        // allocas and loads/stores from moving below a stacksave or a
        // stackrestore. Avoiding moving allocas below a stackrestore is
        // currently thought to be conservative. Moving loads/stores below a
        // stackrestore can lead to incorrect code.
14858 if (isa<AllocaInst>(Val: BundleMember->Inst) ||
14859 BundleMember->Inst->mayReadOrWriteMemory()) {
14860 for (Instruction *I = BundleMember->Inst->getNextNode();
14861 I != ScheduleEnd; I = I->getNextNode()) {
14862 if (!match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) &&
14863 !match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
14864 continue;
14865
14866 // Add the dependency
14867 MakeControlDependent(I);
14868 break;
14869 }
14870 }
14871 }
14872
14873 // Handle the memory dependencies (if any).
14874 ScheduleData *DepDest = BundleMember->NextLoadStore;
14875 if (!DepDest)
14876 continue;
14877 Instruction *SrcInst = BundleMember->Inst;
14878 assert(SrcInst->mayReadOrWriteMemory() &&
             "NextLoadStore list for non-memory-affecting bundle?");
14880 MemoryLocation SrcLoc = getLocation(I: SrcInst);
14881 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14882 unsigned NumAliased = 0;
14883 unsigned DistToSrc = 1;
14884
14885 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14886 assert(isInSchedulingRegion(DepDest));
14887
14888 // We have two limits to reduce the complexity:
14889 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14890 // SLP->isAliased (which is the expensive part in this loop).
14891 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14892 // the whole loop (even if the loop is fast, it's quadratic).
14893 // It's important for the loop break condition (see below) to
14894 // check this limit even between two read-only instructions.
14895 if (DistToSrc >= MaxMemDepDistance ||
14896 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14897 (NumAliased >= AliasedCheckLimit ||
14898 SLP->isAliased(Loc1: SrcLoc, Inst1: SrcInst, Inst2: DepDest->Inst)))) {
14899
14900 // We increment the counter only if the locations are aliased
14901 // (instead of counting all alias checks). This gives a better
14902 // balance between reduced runtime and accurate dependencies.
14903 NumAliased++;
14904
14905 DepDest->MemoryDependencies.push_back(Elt: BundleMember);
14906 BundleMember->Dependencies++;
14907 ScheduleData *DestBundle = DepDest->FirstInBundle;
14908 if (!DestBundle->IsScheduled) {
14909 BundleMember->incrementUnscheduledDeps(Incr: 1);
14910 }
14911 if (!DestBundle->hasValidDependencies()) {
14912 WorkList.push_back(Elt: DestBundle);
14913 }
14914 }
14915
14916 // Example, explaining the loop break condition: Let's assume our
14917 // starting instruction is i0 and MaxMemDepDistance = 3.
14918 //
14919 // +--------v--v--v
14920 // i0,i1,i2,i3,i4,i5,i6,i7,i8
14921 // +--------^--^--^
14922 //
        // MaxMemDepDistance lets us stop alias-checking at i3 and we add
14924 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14925 // Previously we already added dependencies from i3 to i6,i7,i8
14926 // (because of MaxMemDepDistance). As we added a dependency from
14927 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14928 // and we can abort this loop at i6.
14929 if (DistToSrc >= 2 * MaxMemDepDistance)
14930 break;
14931 DistToSrc++;
14932 }
14933 }
14934 if (InsertInReadyList && SD->isReady()) {
14935 ReadyInsts.insert(X: SD);
14936 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
14937 << "\n");
14938 }
14939 }
14940}
14941
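// Resets the scheduling state (IsScheduled flags and unscheduled-dependency
// counters) of every ScheduleData in the region and clears the ready list so
// the region can be scheduled again from scratch.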
14942void BoUpSLP::BlockScheduling::resetSchedule() {
14943 assert(ScheduleStart &&
14944 "tried to reset schedule on block which has not been scheduled");
14945 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14946 doForAllOpcodes(V: I, Action: [&](ScheduleData *SD) {
14947 assert(isInSchedulingRegion(SD) &&
14948 "ScheduleData not in scheduling region");
14949 SD->IsScheduled = false;
14950 SD->resetUnscheduledDeps();
14951 });
14952 }
14953 ReadyInsts.clear();
14954}
14955
14956void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14957 if (!BS->ScheduleStart)
14958 return;
14959
14960 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14961
14962 // A key point - if we got here, pre-scheduling was able to find a valid
14963 // scheduling of the sub-graph of the scheduling window which consists
14964 // of all vector bundles and their transitive users. As such, we do not
14965 // need to reschedule anything *outside of* that subgraph.
14966
14967 BS->resetSchedule();
14968
14969 // For the real scheduling we use a more sophisticated ready-list: it is
14970 // sorted by the original instruction location. This lets the final schedule
14971 // be as close as possible to the original instruction order.
14972 // WARNING: If changing this order causes a correctness issue, that means
14973 // there is some missing dependence edge in the schedule data graph.
14974 struct ScheduleDataCompare {
14975 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14976 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14977 }
14978 };
14979 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14980
14981 // Ensure that all dependency data is updated (for nodes in the sub-graph)
14982 // and fill the ready-list with initial instructions.
14983 int Idx = 0;
14984 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14985 I = I->getNextNode()) {
14986 BS->doForAllOpcodes(V: I, Action: [this, &Idx, BS](ScheduleData *SD) {
14987 TreeEntry *SDTE = getTreeEntry(V: SD->Inst);
14988 (void)SDTE;
14989 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14990 SD->isPartOfBundle() ==
14991 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14992 "scheduler and vectorizer bundle mismatch");
14993 SD->FirstInBundle->SchedulingPriority = Idx++;
14994
14995 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14996 BS->calculateDependencies(SD, InsertInReadyList: false, SLP: this);
14997 });
14998 }
14999 BS->initialFillReadyList(ReadyList&: ReadyInsts);
15000
15001 Instruction *LastScheduledInst = BS->ScheduleEnd;
15002
15003 // Do the "real" scheduling.
15004 while (!ReadyInsts.empty()) {
15005 ScheduleData *Picked = *ReadyInsts.begin();
15006 ReadyInsts.erase(position: ReadyInsts.begin());
15007
15008 // Move the scheduled instruction(s) to their dedicated places, if not
15009 // there yet.
15010 for (ScheduleData *BundleMember = Picked; BundleMember;
15011 BundleMember = BundleMember->NextInBundle) {
15012 Instruction *PickedInst = BundleMember->Inst;
15013 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
15014 PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
15015 LastScheduledInst = PickedInst;
15016 }
15017
15018 BS->schedule(SD: Picked, ReadyList&: ReadyInsts);
15019 }
15020
15021 // Check that we didn't break any of our invariants.
15022#ifdef EXPENSIVE_CHECKS
15023 BS->verify();
15024#endif
15025
15026#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
15027 // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
15029 BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
15030 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
15031 assert(SD->IsScheduled && "must be scheduled at this point");
15032 }
15033 });
15034 }
15035#endif
15036
15037 // Avoid duplicate scheduling of the block.
15038 BS->ScheduleStart = nullptr;
15039}
15040
15041unsigned BoUpSLP::getVectorElementSize(Value *V) {
15042 // If V is a store, just return the width of the stored value (or value
15043 // truncated just before storing) without traversing the expression tree.
15044 // This is the common case.
15045 if (auto *Store = dyn_cast<StoreInst>(Val: V))
15046 return DL->getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
15047
15048 if (auto *IEI = dyn_cast<InsertElementInst>(Val: V))
15049 return getVectorElementSize(V: IEI->getOperand(i_nocapture: 1));
15050
15051 auto E = InstrElementSize.find(Val: V);
15052 if (E != InstrElementSize.end())
15053 return E->second;
15054
15055 // If V is not a store, we can traverse the expression tree to find loads
15056 // that feed it. The type of the loaded value may indicate a more suitable
15057 // width than V's type. We want to base the vector element size on the width
15058 // of memory operations where possible.
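  // Illustrative example (not taken from a test case): for
  //   %l = load i16, ptr %p
  //   %e = sext i16 %l to i32
  //   %a = add i32 %e, 1
  // querying the element size of %a walks back through the add and the sext
  // to the load and yields 16 rather than 32, which allows a wider VF.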
15059 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
15060 SmallPtrSet<Instruction *, 16> Visited;
15061 if (auto *I = dyn_cast<Instruction>(Val: V)) {
15062 Worklist.emplace_back(Args&: I, Args: I->getParent(), Args: 0);
15063 Visited.insert(Ptr: I);
15064 }
15065
15066 // Traverse the expression tree in bottom-up order looking for loads. If we
15067 // encounter an instruction we don't yet handle, we give up.
15068 auto Width = 0u;
15069 Value *FirstNonBool = nullptr;
15070 while (!Worklist.empty()) {
15071 auto [I, Parent, Level] = Worklist.pop_back_val();
15072
15073 // We should only be looking at scalar instructions here. If the current
15074 // instruction has a vector type, skip.
15075 auto *Ty = I->getType();
15076 if (isa<VectorType>(Val: Ty))
15077 continue;
15078 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
15079 FirstNonBool = I;
15080 if (Level > RecursionMaxDepth)
15081 continue;
15082
    // If the current instruction is a load, extractelement or extractvalue,
    // update Width to reflect the width of the accessed value.
15085 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: I))
15086 Width = std::max<unsigned>(a: Width, b: DL->getTypeSizeInBits(Ty));
15087
15088 // Otherwise, we need to visit the operands of the instruction. We only
15089 // handle the interesting cases from buildTree here. If an operand is an
15090 // instruction we haven't yet visited and from the same basic block as the
15091 // user or the use is a PHI node, we add it to the worklist.
15092 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
15093 BinaryOperator, UnaryOperator>(Val: I)) {
15094 for (Use &U : I->operands()) {
15095 if (auto *J = dyn_cast<Instruction>(Val: U.get()))
15096 if (Visited.insert(Ptr: J).second &&
15097 (isa<PHINode>(Val: I) || J->getParent() == Parent)) {
15098 Worklist.emplace_back(Args&: J, Args: J->getParent(), Args: Level + 1);
15099 continue;
15100 }
15101 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
15102 FirstNonBool = U.get();
15103 }
15104 } else {
15105 break;
15106 }
15107 }
15108
15109 // If we didn't encounter a memory access in the expression tree, or if we
15110 // gave up for some reason, just return the width of V. Otherwise, return the
15111 // maximum width we found.
15112 if (!Width) {
15113 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
15114 V = FirstNonBool;
15115 Width = DL->getTypeSizeInBits(Ty: V->getType());
15116 }
15117
15118 for (Instruction *I : Visited)
15119 InstrElementSize[I] = Width;
15120
15121 return Width;
15122}
15123
15124bool BoUpSLP::collectValuesToDemote(
15125 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
15126 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
15127 unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
15128 bool IsTruncRoot) const {
15129 // We can always demote constants.
15130 if (all_of(Range: E.Scalars, P: IsaPred<Constant>))
15131 return true;
15132
15133 unsigned OrigBitWidth = DL->getTypeSizeInBits(Ty: E.Scalars.front()->getType());
15134 if (OrigBitWidth == BitWidth) {
15135 MaxDepthLevel = 1;
15136 return true;
15137 }
15138
  // If the value is not a vectorized instruction in the expression, is not
  // used by an insertelement instruction, and is not used in multiple vector
  // nodes, it cannot be demoted.
15142 bool IsSignedNode = any_of(Range: E.Scalars, P: [&](Value *R) {
15143 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
15144 });
15145 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
15146 if (MultiNodeScalars.contains(Val: V))
15147 return false;
    // For a last shuffle of sext/zext with many uses we need to check the
    // extra bit for unsigned values; otherwise we may end up with incorrect
    // casts for the reused scalars.
15151 bool IsSignedVal = !isKnownNonNegative(V, SQ: SimplifyQuery(*DL));
15152 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
15153 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
15154 if (MaskedValueIsZero(V, Mask, DL: SimplifyQuery(*DL)))
15155 return true;
15156 }
15157 unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
15158 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
15159 if (IsSignedNode)
15160 ++BitWidth1;
15161 if (auto *I = dyn_cast<Instruction>(Val: V)) {
15162 APInt Mask = DB->getDemandedBits(I);
15163 unsigned BitWidth2 =
15164 std::max<unsigned>(a: 1, b: Mask.getBitWidth() - Mask.countl_zero());
15165 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
15166 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth2 - 1);
15167 if (MaskedValueIsZero(V, Mask, DL: SimplifyQuery(*DL)))
15168 break;
15169 BitWidth2 *= 2;
15170 }
15171 BitWidth1 = std::min(a: BitWidth1, b: BitWidth2);
15172 }
15173 BitWidth = std::max(a: BitWidth, b: BitWidth1);
15174 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
15175 };
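  // Worked example (illustrative): with OrigBitWidth == 32, a scalar known to
  // have 25 sign bits gives BitWidth1 = 32 - 25 = 7 (8 for a signed node). If
  // DemandedBits does not narrow it further, BitWidth is raised to at least 8,
  // and the value counts as potentially truncatable as long as the resulting
  // BitWidth stays at or below 16 (OrigBitWidth >= 2 * BitWidth).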
15176 using namespace std::placeholders;
15177 auto FinalAnalysis = [&]() {
15178 if (!IsProfitableToDemote)
15179 return false;
15180 bool Res = all_of(
15181 Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1, args: std::ref(t&: BitWidth)));
15182 // Demote gathers.
15183 if (Res && E.isGather()) {
15184 // Check possible extractelement instructions bases and final vector
15185 // length.
15186 SmallPtrSet<Value *, 4> UniqueBases;
15187 for (Value *V : E.Scalars) {
15188 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
15189 if (!EE)
15190 continue;
15191 UniqueBases.insert(Ptr: EE->getVectorOperand());
15192 }
15193 const unsigned VF = E.Scalars.size();
15194 Type *OrigScalarTy = E.Scalars.front()->getType();
15195 if (UniqueBases.size() <= 2 ||
15196 TTI->getNumberOfParts(Tp: getWidenedType(ScalarTy: OrigScalarTy, VF)) ==
15197 TTI->getNumberOfParts(Tp: getWidenedType(
15198 ScalarTy: IntegerType::get(C&: OrigScalarTy->getContext(), NumBits: BitWidth), VF)))
15199 ToDemote.push_back(Elt: E.Idx);
15200 }
15201 return Res;
15202 };
15203 if (E.isGather() || !Visited.insert(V: &E).second ||
15204 any_of(Range: E.Scalars, P: [&](Value *V) {
15205 return all_of(Range: V->users(), P: [&](User *U) {
15206 return isa<InsertElementInst>(Val: U) && !getTreeEntry(V: U);
15207 });
15208 }))
15209 return FinalAnalysis();
15210
15211 if (any_of(Range: E.Scalars, P: [&](Value *V) {
15212 return !all_of(Range: V->users(), P: [=](User *U) {
15213 return getTreeEntry(V: U) ||
15214 (E.Idx == 0 && UserIgnoreList &&
15215 UserIgnoreList->contains(V: U)) ||
15216 (!isa<CmpInst>(Val: U) && U->getType()->isSized() &&
15217 !U->getType()->isScalableTy() &&
15218 DL->getTypeSizeInBits(Ty: U->getType()) <= BitWidth);
15219 }) && !IsPotentiallyTruncated(V, BitWidth);
15220 }))
15221 return false;
15222
15223 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
15224 bool &NeedToExit) {
15225 NeedToExit = false;
15226 unsigned InitLevel = MaxDepthLevel;
15227 for (const TreeEntry *Op : Operands) {
15228 unsigned Level = InitLevel;
15229 if (!collectValuesToDemote(E: *Op, IsProfitableToDemoteRoot, BitWidth,
15230 ToDemote, Visited, MaxDepthLevel&: Level, IsProfitableToDemote,
15231 IsTruncRoot)) {
15232 if (!IsProfitableToDemote)
15233 return false;
15234 NeedToExit = true;
15235 if (!FinalAnalysis())
15236 return false;
15237 continue;
15238 }
15239 MaxDepthLevel = std::max(a: MaxDepthLevel, b: Level);
15240 }
15241 return true;
15242 };
15243 auto AttemptCheckBitwidth =
15244 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
15245 // Try all bitwidth < OrigBitWidth.
15246 NeedToExit = false;
15247 unsigned BestFailBitwidth = 0;
15248 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
15249 if (Checker(BitWidth, OrigBitWidth))
15250 return true;
15251 if (BestFailBitwidth == 0 && FinalAnalysis())
15252 BestFailBitwidth = BitWidth;
15253 }
15254 if (BitWidth >= OrigBitWidth) {
15255 if (BestFailBitwidth == 0) {
15256 BitWidth = OrigBitWidth;
15257 return false;
15258 }
15259 MaxDepthLevel = 1;
15260 BitWidth = BestFailBitwidth;
15261 NeedToExit = true;
15262 return true;
15263 }
15264 return false;
15265 };
15266 auto TryProcessInstruction =
15267 [&](unsigned &BitWidth,
15268 ArrayRef<const TreeEntry *> Operands = std::nullopt,
15269 function_ref<bool(unsigned, unsigned)> Checker = {}) {
15270 if (Operands.empty()) {
15271 if (!IsTruncRoot)
15272 MaxDepthLevel = 1;
15273 (void)for_each(Range: E.Scalars, F: std::bind(f&: IsPotentiallyTruncated, args: _1,
15274 args: std::ref(t&: BitWidth)));
15275 } else {
          // Several vectorized uses? Check if we can truncate it; otherwise
          // exit.
15278 if (E.UserTreeIndices.size() > 1 &&
15279 !all_of(Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1,
15280 args: std::ref(t&: BitWidth))))
15281 return false;
15282 bool NeedToExit = false;
15283 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
15284 return false;
15285 if (NeedToExit)
15286 return true;
15287 if (!ProcessOperands(Operands, NeedToExit))
15288 return false;
15289 if (NeedToExit)
15290 return true;
15291 }
15292
15293 ++MaxDepthLevel;
15294 // Record the entry that we can demote.
15295 ToDemote.push_back(Elt: E.Idx);
15296 return IsProfitableToDemote;
15297 };
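  // The switch below dispatches on the node opcode: truncations and extensions
  // are always demotable; binary operators, shifts, udiv/urem, selects, phis
  // and the abs/smin/smax/umin/umax intrinsics can be demoted when their
  // operand entries (and, where needed, known bits) allow it; everything else
  // falls back to the conservative FinalAnalysis.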
15298 switch (E.getOpcode()) {
15299
15300 // We can always demote truncations and extensions. Since truncations can
15301 // seed additional demotion, we save the truncated value.
15302 case Instruction::Trunc:
15303 if (IsProfitableToDemoteRoot)
15304 IsProfitableToDemote = true;
15305 return TryProcessInstruction(BitWidth);
15306 case Instruction::ZExt:
15307 case Instruction::SExt:
15308 IsProfitableToDemote = true;
15309 return TryProcessInstruction(BitWidth);
15310
15311 // We can demote certain binary operations if we can demote both of their
15312 // operands.
15313 case Instruction::Add:
15314 case Instruction::Sub:
15315 case Instruction::Mul:
15316 case Instruction::And:
15317 case Instruction::Or:
15318 case Instruction::Xor: {
15319 return TryProcessInstruction(
15320 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)});
15321 }
15322 case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it is a shift of an
    // in-range amount, we can always perform the SHL in a smaller type.
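    // For example (illustrative, not from a test case): a trunc to i16 of
    // (shl i32 %x, 4) can be rewritten as an i16 shl of the truncated %x by 4,
    // as long as the shift amount is known to be smaller than 16.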
15325 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
15326 return all_of(Range: E.Scalars, P: [&](Value *V) {
15327 auto *I = cast<Instruction>(Val: V);
15328 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
15329 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth);
15330 });
15331 };
15332 return TryProcessInstruction(
15333 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, ShlChecker);
15334 }
15335 case Instruction::LShr: {
15336 // If this is a truncate of a logical shr, we can truncate it to a smaller
15337 // lshr iff we know that the bits we would otherwise be shifting in are
15338 // already zeros.
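    // For example (illustrative, not from a test case): a trunc to i16 of
    // (lshr i32 %x, 8) can be performed as an i16 lshr when bits 16..31 of %x
    // are known to be zero and the shift amount is known to be smaller than
    // 16, which is what the checker below verifies.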
15339 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15340 return all_of(Range: E.Scalars, P: [&](Value *V) {
15341 auto *I = cast<Instruction>(Val: V);
15342 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
15343 APInt ShiftedBits = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
15344 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
15345 MaskedValueIsZero(V: I->getOperand(i: 0), Mask: ShiftedBits,
15346 DL: SimplifyQuery(*DL));
15347 });
15348 };
15349 return TryProcessInstruction(
15350 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
15351 LShrChecker);
15352 }
15353 case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits between the sign bit of the
    // truncated type and the sign bit of the original type are the same.
15357 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15358 return all_of(Range: E.Scalars, P: [&](Value *V) {
15359 auto *I = cast<Instruction>(Val: V);
15360 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
15361 unsigned ShiftedBits = OrigBitWidth - BitWidth;
15362 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
15363 ShiftedBits < ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, Depth: 0, AC,
15364 CxtI: nullptr, DT);
15365 });
15366 };
15367 return TryProcessInstruction(
15368 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
15369 AShrChecker);
15370 }
15371 case Instruction::UDiv:
15372 case Instruction::URem: {
15373 // UDiv and URem can be truncated if all the truncated bits are zero.
15374 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15375 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15376 return all_of(Range: E.Scalars, P: [&](Value *V) {
15377 auto *I = cast<Instruction>(Val: V);
15378 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
15379 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask, DL: SimplifyQuery(*DL)) &&
15380 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, DL: SimplifyQuery(*DL));
15381 });
15382 };
15383 return TryProcessInstruction(
15384 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, Checker);
15385 }
15386
15387 // We can demote selects if we can demote their true and false values.
15388 case Instruction::Select: {
15389 return TryProcessInstruction(
15390 BitWidth, {getOperandEntry(E: &E, Idx: 1), getOperandEntry(E: &E, Idx: 2)});
15391 }
15392
15393 // We can demote phis if we can demote all their incoming operands. Note that
15394 // we don't need to worry about cycles since we ensure single use above.
15395 case Instruction::PHI: {
15396 const unsigned NumOps = E.getNumOperands();
15397 SmallVector<const TreeEntry *> Ops(NumOps);
15398 transform(Range: seq<unsigned>(Begin: 0, End: NumOps), d_first: Ops.begin(),
15399 F: std::bind(f: &BoUpSLP::getOperandEntry, args: this, args: &E, args: _1));
15400
15401 return TryProcessInstruction(BitWidth, Ops);
15402 }
15403
15404 case Instruction::Call: {
15405 auto *IC = dyn_cast<IntrinsicInst>(Val: E.getMainOp());
15406 if (!IC)
15407 break;
15408 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: IC, TLI);
15409 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
15410 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
15411 break;
15412 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(E: &E, Idx: 0));
15413 function_ref<bool(unsigned, unsigned)> CallChecker;
15414 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15415 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15416 return all_of(Range: E.Scalars, P: [&](Value *V) {
15417 auto *I = cast<Instruction>(Val: V);
15418 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
15419 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
15420 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
15421 DL: SimplifyQuery(*DL)) &&
15422 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, DL: SimplifyQuery(*DL));
15423 }
15424 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
15425 "Expected min/max intrinsics only.");
15426 unsigned SignBits = OrigBitWidth - BitWidth;
15427 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
15428 unsigned Op0SignBits = ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, Depth: 0, AC,
15429 CxtI: nullptr, DT);
15430 unsigned Op1SignBits = ComputeNumSignBits(Op: I->getOperand(i: 1), DL: *DL, Depth: 0, AC,
15431 CxtI: nullptr, DT);
15432 return SignBits <= Op0SignBits &&
15433 ((SignBits != Op0SignBits &&
15434 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
15435 MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
15436 DL: SimplifyQuery(*DL))) &&
15437 SignBits <= Op1SignBits &&
15438 ((SignBits != Op1SignBits &&
15439 !isKnownNonNegative(V: I->getOperand(i: 1), SQ: SimplifyQuery(*DL))) ||
15440 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, DL: SimplifyQuery(*DL)));
15441 });
15442 };
15443 if (ID != Intrinsic::abs) {
15444 Operands.push_back(Elt: getOperandEntry(E: &E, Idx: 1));
15445 CallChecker = CompChecker;
15446 }
15447 InstructionCost BestCost =
15448 std::numeric_limits<InstructionCost::CostType>::max();
15449 unsigned BestBitWidth = BitWidth;
15450 unsigned VF = E.Scalars.size();
15451 // Choose the best bitwidth based on cost estimations.
15452 auto Checker = [&](unsigned BitWidth, unsigned) {
15453 unsigned MinBW = PowerOf2Ceil(A: BitWidth);
15454 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(CI: IC, ID, VF, MinBW);
15455 auto VecCallCosts = getVectorCallCosts(
15456 CI: IC, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: IC->getContext(), NumBits: MinBW), VF),
15457 TTI, TLI, ArgTys);
15458 InstructionCost Cost = std::min(a: VecCallCosts.first, b: VecCallCosts.second);
15459 if (Cost < BestCost) {
15460 BestCost = Cost;
15461 BestBitWidth = BitWidth;
15462 }
15463 return false;
15464 };
15465 [[maybe_unused]] bool NeedToExit;
15466 (void)AttemptCheckBitwidth(Checker, NeedToExit);
15467 BitWidth = BestBitWidth;
15468 return TryProcessInstruction(BitWidth, Operands, CallChecker);
15469 }
15470
15471 // Otherwise, conservatively give up.
15472 default:
15473 break;
15474 }
15475 MaxDepthLevel = 1;
15476 return FinalAnalysis();
15477}
15478
15479static RecurKind getRdxKind(Value *V);
15480
15481void BoUpSLP::computeMinimumValueSizes() {
15482 // We only attempt to truncate integer expressions.
15483 bool IsStoreOrInsertElt =
15484 VectorizableTree.front()->getOpcode() == Instruction::Store ||
15485 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
15486 if ((IsStoreOrInsertElt || UserIgnoreList) &&
15487 ExtraBitWidthNodes.size() <= 1 &&
15488 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
15489 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
15490 return;
15491
15492 unsigned NodeIdx = 0;
15493 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
15494 NodeIdx = 1;
15495
15496 // Ensure the roots of the vectorizable tree don't form a cycle.
15497 if (VectorizableTree[NodeIdx]->isGather() ||
15498 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
15499 (NodeIdx != 0 && any_of(Range&: VectorizableTree[NodeIdx]->UserTreeIndices,
15500 P: [NodeIdx](const EdgeInfo &EI) {
15501 return EI.UserTE->Idx >
15502 static_cast<int>(NodeIdx);
15503 })))
15504 return;
15505
  // If the first value node for a store/insertelement is a sext/zext/trunc,
  // skip it and resize to the final type.
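  // Illustrative shape of such a graph: a store of i16 fed by a trunc of an
  // i32 value. The store is node 0, the trunc is node 1 and is recorded in
  // RootDemotes below, and the minimum-bit-width analysis continues from the
  // next node (typically the trunc's operand).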
15508 bool IsTruncRoot = false;
15509 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
15510 SmallVector<unsigned> RootDemotes;
15511 if (NodeIdx != 0 &&
15512 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15513 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15514 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
15515 IsTruncRoot = true;
15516 RootDemotes.push_back(Elt: NodeIdx);
15517 IsProfitableToDemoteRoot = true;
15518 ++NodeIdx;
15519 }
15520
  // If the reduction was already analyzed and found not profitable, exit.
15522 if (AnalyzedMinBWVals.contains(V: VectorizableTree[NodeIdx]->Scalars.front()))
15523 return;
15524
15525 SmallVector<unsigned> ToDemote;
15526 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
15527 bool IsProfitableToDemoteRoot, unsigned Opcode,
15528 unsigned Limit, bool IsTruncRoot,
15529 bool IsSignedCmp) -> unsigned {
15530 ToDemote.clear();
    // If the root is a trunc and the next node is a gather/buildvector, keep
    // the trunc in scalars; it is free in most cases.
15533 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
15534 E.Idx > (IsStoreOrInsertElt ? 2 : 1) &&
15535 all_of(Range: E.Scalars, P: [&](Value *V) {
15536 return V->hasOneUse() || isa<Constant>(Val: V) ||
15537 (!V->hasNUsesOrMore(N: UsesLimit) &&
15538 none_of(Range: V->users(), P: [&](User *U) {
15539 const TreeEntry *TE = getTreeEntry(V: U);
15540 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15541 if (TE == UserTE || !TE)
15542 return false;
15543 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
15544 SelectInst>(Val: U) ||
15545 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
15546 SelectInst>(Val: UserTE->getMainOp()))
15547 return true;
15548 unsigned UserTESz = DL->getTypeSizeInBits(
15549 Ty: UserTE->Scalars.front()->getType());
15550 auto It = MinBWs.find(Val: TE);
15551 if (It != MinBWs.end() && It->second.first > UserTESz)
15552 return true;
15553 return DL->getTypeSizeInBits(Ty: U->getType()) > UserTESz;
15554 }));
15555 })) {
15556 ToDemote.push_back(Elt: E.Idx);
15557 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
15558 auto It = MinBWs.find(Val: UserTE);
15559 if (It != MinBWs.end())
15560 return It->second.first;
15561 unsigned MaxBitWidth =
15562 DL->getTypeSizeInBits(Ty: UserTE->Scalars.front()->getType());
15563 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
15564 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15565 MaxBitWidth = 8;
15566 return MaxBitWidth;
15567 }
15568
15569 unsigned VF = E.getVectorFactor();
15570 auto *TreeRootIT = dyn_cast<IntegerType>(Val: E.Scalars.front()->getType());
15571 if (!TreeRootIT || !Opcode)
15572 return 0u;
15573
15574 if (any_of(Range: E.Scalars,
15575 P: [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
15576 return 0u;
15577
15578 unsigned NumParts = TTI->getNumberOfParts(Tp: getWidenedType(ScalarTy: TreeRootIT, VF));
15579
15580 // The maximum bit width required to represent all the values that can be
15581 // demoted without loss of precision. It would be safe to truncate the roots
15582 // of the expression to this width.
15583 unsigned MaxBitWidth = 1u;
15584
    // True if the roots can be zero-extended back to their original type
    // rather than sign-extended: if the leading bits are not demanded, we can
    // safely zero-extend. IsKnownPositive is set only when the comparison is
    // not signed and the sign bit of every root is known to be zero.
15591 bool IsKnownPositive = !IsSignedCmp && all_of(Range: E.Scalars, P: [&](Value *R) {
15592 KnownBits Known = computeKnownBits(V: R, DL: *DL);
15593 return Known.isNonNegative();
15594 });
15595
15596 // We first check if all the bits of the roots are demanded. If they're not,
15597 // we can truncate the roots to this narrower type.
15598 for (Value *Root : E.Scalars) {
15599 unsigned NumSignBits = ComputeNumSignBits(Op: Root, DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
15600 TypeSize NumTypeBits = DL->getTypeSizeInBits(Ty: Root->getType());
15601 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15602 // If we can't prove that the sign bit is zero, we must add one to the
15603 // maximum bit width to account for the unknown sign bit. This preserves
15604 // the existing sign bit so we can safely sign-extend the root back to the
15605 // original type. Otherwise, if we know the sign bit is zero, we will
15606 // zero-extend the root instead.
15607 //
15608 // FIXME: This is somewhat suboptimal, as there will be cases where adding
15609 // one to the maximum bit width will yield a larger-than-necessary
15610 // type. In general, we need to add an extra bit only if we can't
15611 // prove that the upper bit of the original type is equal to the
15612 // upper bit of the proposed smaller type. If these two bits are
15613 // the same (either zero or one) we know that sign-extending from
15614 // the smaller type will result in the same value. Here, since we
15615 // can't yet prove this, we are just making the proposed smaller
15616 // type larger to ensure correctness.
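      // For example (illustrative): an i32 root with 24 known sign bits gives
      // BitWidth1 = 8; if IsKnownPositive is false, the width is bumped to 9
      // so that the sign bit is preserved when sign-extending back.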
15617 if (!IsKnownPositive)
15618 ++BitWidth1;
15619
15620 APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: Root));
15621 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15622 MaxBitWidth =
15623 std::max<unsigned>(a: std::min(a: BitWidth1, b: BitWidth2), b: MaxBitWidth);
15624 }
15625
15626 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15627 MaxBitWidth = 8;
15628
    // If the original type is large but the reduced type does not improve
    // register usage, ignore it.
15631 if (NumParts > 1 &&
15632 NumParts ==
15633 TTI->getNumberOfParts(Tp: getWidenedType(
15634 ScalarTy: IntegerType::get(C&: F->getContext(), NumBits: bit_ceil(Value: MaxBitWidth)), VF)))
15635 return 0u;
15636
15637 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
15638 Opcode == Instruction::SExt ||
15639 Opcode == Instruction::ZExt || NumParts > 1;
    // Conservatively determine if we can actually truncate the roots of the
    // expression. Collect the entries that can be demoted in ToDemote.
15643 DenseSet<const TreeEntry *> Visited;
15644 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
15645 bool NeedToDemote = IsProfitableToDemote;
15646
15647 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, BitWidth&: MaxBitWidth,
15648 ToDemote, Visited, MaxDepthLevel, IsProfitableToDemote&: NeedToDemote,
15649 IsTruncRoot) ||
15650 (MaxDepthLevel <= Limit &&
15651 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
15652 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
15653 DL->getTypeSizeInBits(Ty: TreeRootIT) /
15654 DL->getTypeSizeInBits(Ty: cast<Instruction>(Val: E.Scalars.front())
15655 ->getOperand(i: 0)
15656 ->getType()) >
15657 2)))))
15658 return 0u;
15659 // Round MaxBitWidth up to the next power-of-two.
15660 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
15661
15662 return MaxBitWidth;
15663 };
15664
15665 // If we can truncate the root, we must collect additional values that might
15666 // be demoted as a result. That is, those seeded by truncations we will
15667 // modify.
15668 // Add reduction ops sizes, if any.
15669 if (UserIgnoreList &&
15670 isa<IntegerType>(Val: VectorizableTree.front()->Scalars.front()->getType())) {
15671 for (Value *V : *UserIgnoreList) {
15672 auto NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, Depth: 0, AC, CxtI: nullptr, DT);
15673 auto NumTypeBits = DL->getTypeSizeInBits(Ty: V->getType());
15674 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15675 if (!isKnownNonNegative(V, SQ: SimplifyQuery(*DL)))
15676 ++BitWidth1;
15677 unsigned BitWidth2 = BitWidth1;
15678 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: ::getRdxKind(V))) {
15679 auto Mask = DB->getDemandedBits(I: cast<Instruction>(Val: V));
15680 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15681 }
15682 ReductionBitWidth =
15683 std::max(a: std::min(a: BitWidth1, b: BitWidth2), b: ReductionBitWidth);
15684 }
15685 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15686 ReductionBitWidth = 8;
15687
15688 ReductionBitWidth = bit_ceil(Value: ReductionBitWidth);
15689 }
15690 bool IsTopRoot = NodeIdx == 0;
15691 while (NodeIdx < VectorizableTree.size() &&
15692 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15693 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15694 RootDemotes.push_back(Elt: NodeIdx);
15695 ++NodeIdx;
15696 IsTruncRoot = true;
15697 }
15698 bool IsSignedCmp = false;
15699 while (NodeIdx < VectorizableTree.size()) {
15700 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15701 unsigned Limit = 2;
15702 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15703 if (IsTopRoot &&
15704 ReductionBitWidth ==
15705 DL->getTypeSizeInBits(
15706 Ty: VectorizableTree.front()->Scalars.front()->getType()))
15707 Limit = 3;
15708 unsigned MaxBitWidth = ComputeMaxBitWidth(
15709 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Opcode,
15710 Limit, IsTruncRoot, IsSignedCmp);
15711 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15712 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15713 ReductionBitWidth = bit_ceil(Value: MaxBitWidth);
15714 else if (MaxBitWidth == 0)
15715 ReductionBitWidth = 0;
15716 }
15717
15718 for (unsigned Idx : RootDemotes) {
15719 if (all_of(Range&: VectorizableTree[Idx]->Scalars, P: [&](Value *V) {
15720 uint32_t OrigBitWidth = DL->getTypeSizeInBits(Ty: V->getType());
15721 if (OrigBitWidth > MaxBitWidth) {
15722 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: MaxBitWidth);
15723 return MaskedValueIsZero(V, Mask, DL: SimplifyQuery(*DL));
15724 }
15725 return false;
15726 }))
15727 ToDemote.push_back(Elt: Idx);
15728 }
15729 RootDemotes.clear();
15730 IsTopRoot = false;
15731 IsProfitableToDemoteRoot = true;
15732
15733 if (ExtraBitWidthNodes.empty()) {
15734 NodeIdx = VectorizableTree.size();
15735 } else {
15736 unsigned NewIdx = 0;
15737 do {
15738 NewIdx = *ExtraBitWidthNodes.begin();
15739 ExtraBitWidthNodes.erase(I: ExtraBitWidthNodes.begin());
15740 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15741 NodeIdx = NewIdx;
15742 IsTruncRoot =
15743 NodeIdx < VectorizableTree.size() &&
15744 any_of(Range&: VectorizableTree[NodeIdx]->UserTreeIndices,
15745 P: [](const EdgeInfo &EI) {
15746 return EI.EdgeIdx == 0 &&
15747 EI.UserTE->getOpcode() == Instruction::Trunc &&
15748 !EI.UserTE->isAltShuffle();
15749 });
15750 IsSignedCmp =
15751 NodeIdx < VectorizableTree.size() &&
15752 any_of(Range&: VectorizableTree[NodeIdx]->UserTreeIndices,
15753 P: [&](const EdgeInfo &EI) {
15754 return EI.UserTE->getOpcode() == Instruction::ICmp &&
15755 any_of(Range&: EI.UserTE->Scalars, P: [&](Value *V) {
15756 auto *IC = dyn_cast<ICmpInst>(Val: V);
15757 return IC &&
15758 (IC->isSigned() ||
15759 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 0),
15760 SQ: SimplifyQuery(*DL)) ||
15761 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 1),
15762 SQ: SimplifyQuery(*DL)));
15763 });
15764 });
15765 }
15766
    // If the maximum bit width we computed is less than the width of the
    // roots' type, we can proceed with the narrowing. Otherwise, do nothing.
15769 if (MaxBitWidth == 0 ||
15770 MaxBitWidth >=
15771 cast<IntegerType>(Val: TreeRoot.front()->getType())->getBitWidth()) {
15772 if (UserIgnoreList)
15773 AnalyzedMinBWVals.insert(I: TreeRoot.begin(), E: TreeRoot.end());
15774 continue;
15775 }
15776
    // Finally, map the values we can demote to the maximum bit width we
    // computed.
15779 for (unsigned Idx : ToDemote) {
15780 TreeEntry *TE = VectorizableTree[Idx].get();
15781 if (MinBWs.contains(Val: TE))
15782 continue;
15783 bool IsSigned = any_of(Range&: TE->Scalars, P: [&](Value *R) {
15784 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
15785 });
15786 MinBWs.try_emplace(Key: TE, Args&: MaxBitWidth, Args&: IsSigned);
15787 }
15788 }
15789}
15790
PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
15792 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
15793 auto *TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
15794 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(IR&: F);
15795 auto *AA = &AM.getResult<AAManager>(IR&: F);
15796 auto *LI = &AM.getResult<LoopAnalysis>(IR&: F);
15797 auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
15798 auto *AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
15799 auto *DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
15800 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
15801
15802 bool Changed = runImpl(F, SE_: SE, TTI_: TTI, TLI_: TLI, AA_: AA, LI_: LI, DT_: DT, AC_: AC, DB_: DB, ORE_: ORE);
15803 if (!Changed)
15804 return PreservedAnalyses::all();
15805
15806 PreservedAnalyses PA;
15807 PA.preserveSet<CFGAnalyses>();
15808 return PA;
15809}
15810
15811bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
15812 TargetTransformInfo *TTI_,
15813 TargetLibraryInfo *TLI_, AAResults *AA_,
15814 LoopInfo *LI_, DominatorTree *DT_,
15815 AssumptionCache *AC_, DemandedBits *DB_,
15816 OptimizationRemarkEmitter *ORE_) {
15817 if (!RunSLPVectorization)
15818 return false;
15819 SE = SE_;
15820 TTI = TTI_;
15821 TLI = TLI_;
15822 AA = AA_;
15823 LI = LI_;
15824 DT = DT_;
15825 AC = AC_;
15826 DB = DB_;
15827 DL = &F.getDataLayout();
15828
15829 Stores.clear();
15830 GEPs.clear();
15831 bool Changed = false;
15832
15833 // If the target claims to have no vector registers don't attempt
15834 // vectorization.
15835 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true))) {
15836 LLVM_DEBUG(
15837 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15838 return false;
15839 }
15840
15841 // Don't vectorize when the attribute NoImplicitFloat is used.
15842 if (F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
15843 return false;
15844
15845 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15846
15847 // Use the bottom up slp vectorizer to construct chains that start with
15848 // store instructions.
15849 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15850
15851 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15852 // delete instructions.
15853
15854 // Update DFS numbers now so that we can use them for ordering.
15855 DT->updateDFSNumbers();
15856
15857 // Scan the blocks in the function in post order.
15858 for (auto *BB : post_order(G: &F.getEntryBlock())) {
15859 // Start new block - clear the list of reduction roots.
15860 R.clearReductionData();
15861 collectSeedInstructions(BB);
15862
15863 // Vectorize trees that end at stores.
15864 if (!Stores.empty()) {
15865 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15866 << " underlying objects.\n");
15867 Changed |= vectorizeStoreChains(R);
15868 }
15869
15870 // Vectorize trees that end at reductions.
15871 Changed |= vectorizeChainsInBlock(BB, R);
15872
15873 // Vectorize the index computations of getelementptr instructions. This
15874 // is primarily intended to catch gather-like idioms ending at
15875 // non-consecutive loads.
15876 if (!GEPs.empty()) {
15877 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15878 << " underlying objects.\n");
15879 Changed |= vectorizeGEPIndices(BB, R);
15880 }
15881 }
15882
15883 if (Changed) {
15884 R.optimizeGatherSequence();
15885 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15886 }
15887 return Changed;
15888}
15889
15890std::optional<bool>
15891SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15892 unsigned Idx, unsigned MinVF,
15893 unsigned &Size) {
15894 Size = 0;
15895 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15896 << "\n");
15897 const unsigned Sz = R.getVectorElementSize(V: Chain[0]);
15898 unsigned VF = Chain.size();
15899
15900 if (!isPowerOf2_32(Value: Sz) || !isPowerOf2_32(Value: VF) || VF < 2 || VF < MinVF) {
15901 // Check if vectorizing with a non-power-of-2 VF should be considered. At
15902 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15903 // all vector lanes are used.
15904 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15905 return false;
15906 }
15907
15908 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15909 << "\n");
15910
15911 SetVector<Value *> ValOps;
15912 for (Value *V : Chain)
15913 ValOps.insert(X: cast<StoreInst>(Val: V)->getValueOperand());
  // If the operands are not the same/alternate opcodes, or the number of
  // unique values is not a power of two, exit.
15915 InstructionsState S = getSameOpcode(VL: ValOps.getArrayRef(), TLI: *TLI);
15916 if (all_of(Range&: ValOps, P: IsaPred<Instruction>) && ValOps.size() > 1) {
15917 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
15918 bool IsPowerOf2 =
15919 isPowerOf2_32(Value: ValOps.size()) ||
15920 (VectorizeNonPowerOf2 && isPowerOf2_32(Value: ValOps.size() + 1));
15921 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
15922 (!S.MainOp->isSafeToRemove() ||
15923 any_of(Range: ValOps.getArrayRef(),
15924 P: [&](Value *V) {
15925 return !isa<ExtractElementInst>(Val: V) &&
15926 (V->getNumUses() > Chain.size() ||
15927 any_of(Range: V->users(), P: [&](User *U) {
15928 return !Stores.contains(V: U);
15929 }));
15930 }))) ||
15931 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
15932 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
15933 return false;
15934 }
15935 }
15936 if (R.isLoadCombineCandidate(Stores: Chain))
15937 return true;
15938 R.buildTree(Roots: Chain);
  // Check if the tree is tiny and the store itself or its stored value is not
  // vectorized.
15940 if (R.isTreeTinyAndNotFullyVectorizable()) {
15941 if (R.isGathered(V: Chain.front()) ||
15942 R.isNotScheduled(V: cast<StoreInst>(Val: Chain.front())->getValueOperand()))
15943 return std::nullopt;
15944 Size = R.getTreeSize();
15945 return false;
15946 }
15947 R.reorderTopToBottom();
15948 R.reorderBottomToTop();
15949 R.buildExternalUses();
15950
15951 R.computeMinimumValueSizes();
15952 R.transformNodes();
15953
15954 Size = R.getTreeSize();
15955 if (S.getOpcode() == Instruction::Load)
15956 Size = 2; // cut off masked gather small trees
15957 InstructionCost Cost = R.getTreeCost();
15958
15959 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15960 if (Cost < -SLPCostThreshold) {
15961 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15962
15963 using namespace ore;
15964
15965 R.getORE()->emit(OptDiag&: OptimizationRemark(SV_NAME, "StoresVectorized",
15966 cast<StoreInst>(Val: Chain[0]))
15967 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15968 << " and with tree size "
15969 << NV("TreeSize", R.getTreeSize()));
15970
15971 R.vectorizeTree();
15972 return true;
15973 }
15974
15975 return false;
15976}
15977
15978/// Checks if the quadratic mean deviation is less than 90% of the mean size.
15979static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
15980 bool First) {
15981 unsigned Num = 0;
15982 uint64_t Sum = std::accumulate(
15983 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
15984 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15985 unsigned Size = First ? Val.first : Val.second;
15986 if (Size == 1)
15987 return V;
15988 ++Num;
15989 return V + Size;
15990 });
15991 if (Num == 0)
15992 return true;
15993 uint64_t Mean = Sum / Num;
15994 if (Mean == 0)
15995 return true;
15996 uint64_t Dev = std::accumulate(
15997 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
15998 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15999 unsigned P = First ? Val.first : Val.second;
16000 if (P == 1)
16001 return V;
16002 return V + (P - Mean) * (P - Mean);
16003 }) /
16004 Num;
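  // With integer division, the check below is equivalent to
  // Dev * 81 < Mean * Mean, i.e. the mean squared deviation must be smaller
  // than (Mean / 9)^2.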
16005 return Dev * 81 / (Mean * Mean) == 0;
16006}
16007
16008bool SLPVectorizerPass::vectorizeStores(
16009 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
16010 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
16011 &Visited) {
16012 // We may run into multiple chains that merge into a single chain. We mark the
16013 // stores that we vectorized so that we don't visit the same store twice.
16014 BoUpSLP::ValueSet VectorizedStores;
16015 bool Changed = false;
16016
16017 struct StoreDistCompare {
16018 bool operator()(const std::pair<unsigned, int> &Op1,
16019 const std::pair<unsigned, int> &Op2) const {
16020 return Op1.second < Op2.second;
16021 }
16022 };
16023 // A set of pairs (index of store in Stores array ref, Distance of the store
16024 // address relative to base store address in units).
16025 using StoreIndexToDistSet =
16026 std::set<std::pair<unsigned, int>, StoreDistCompare>;
16027 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
16028 int PrevDist = -1;
16029 BoUpSLP::ValueList Operands;
16030 // Collect the chain into a list.
16031 for (auto [Idx, Data] : enumerate(First: Set)) {
16032 if (Operands.empty() || Data.second - PrevDist == 1) {
16033 Operands.push_back(Elt: Stores[Data.first]);
16034 PrevDist = Data.second;
16035 if (Idx != Set.size() - 1)
16036 continue;
16037 }
16038 auto E = make_scope_exit(F: [&, &DataVar = Data]() {
16039 Operands.clear();
16040 Operands.push_back(Elt: Stores[DataVar.first]);
16041 PrevDist = DataVar.second;
16042 });
16043
16044 if (Operands.size() <= 1 ||
16045 !Visited
16046 .insert(V: {Operands.front(),
16047 cast<StoreInst>(Val: Operands.front())->getValueOperand(),
16048 Operands.back(),
16049 cast<StoreInst>(Val: Operands.back())->getValueOperand(),
16050 Operands.size()})
16051 .second)
16052 continue;
16053
16054 unsigned MaxVecRegSize = R.getMaxVecRegSize();
16055 unsigned EltSize = R.getVectorElementSize(V: Operands[0]);
16056 unsigned MaxElts = llvm::bit_floor(Value: MaxVecRegSize / EltSize);
16057
16058 unsigned MaxVF =
16059 std::min(a: R.getMaximumVF(ElemWidth: EltSize, Opcode: Instruction::Store), b: MaxElts);
16060 unsigned MaxRegVF = MaxVF;
16061 auto *Store = cast<StoreInst>(Val: Operands[0]);
16062 Type *StoreTy = Store->getValueOperand()->getType();
16063 Type *ValueTy = StoreTy;
16064 if (auto *Trunc = dyn_cast<TruncInst>(Val: Store->getValueOperand()))
16065 ValueTy = Trunc->getSrcTy();
16066 if (ValueTy == StoreTy &&
16067 R.getVectorElementSize(V: Store->getValueOperand()) <= EltSize)
16068 MaxVF = std::min<unsigned>(a: MaxVF, b: bit_floor(Value: Operands.size()));
16069 unsigned MinVF = std::max<unsigned>(
16070 a: 2, b: PowerOf2Ceil(A: TTI->getStoreMinimumVF(
16071 VF: R.getMinVF(Sz: DL->getTypeStoreSizeInBits(Ty: StoreTy)), ScalarMemTy: StoreTy,
16072 ScalarValTy: ValueTy)));
16073
16074 if (MaxVF < MinVF) {
16075 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
16076 << ") < "
16077 << "MinVF (" << MinVF << ")\n");
16078 continue;
16079 }
16080
16081 unsigned NonPowerOf2VF = 0;
16082 if (VectorizeNonPowerOf2) {
16083 // First try vectorizing with a non-power-of-2 VF. At the moment, only
16084 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
16085 // lanes are used.
16086 unsigned CandVF = Operands.size();
16087 if (isPowerOf2_32(Value: CandVF + 1) && CandVF <= MaxRegVF)
16088 NonPowerOf2VF = CandVF;
16089 }
16090
16091 unsigned Sz = 1 + Log2_32(Value: MaxVF) - Log2_32(Value: MinVF);
16092 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
16093 unsigned Size = MinVF;
16094 for_each(Range: reverse(C&: CandidateVFs), F: [&](unsigned &VF) {
16095 VF = Size > MaxVF ? NonPowerOf2VF : Size;
16096 Size *= 2;
16097 });
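      // For example (illustrative): with MinVF == 2, MaxVF == 16 and no
      // non-power-of-2 candidate this produces {16, 8, 4, 2}; a
      // non-power-of-2 candidate, if any, occupies the first slot and is
      // tried first.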
16098 unsigned End = Operands.size();
16099 unsigned Repeat = 0;
16100 constexpr unsigned MaxAttempts = 4;
16101 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
16102 for_each(Range&: RangeSizes, F: [](std::pair<unsigned, unsigned> &P) {
16103 P.first = P.second = 1;
16104 });
16105 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
16106 auto IsNotVectorized = [](bool First,
16107 const std::pair<unsigned, unsigned> &P) {
16108 return First ? P.first > 0 : P.second > 0;
16109 };
16110 auto IsVectorized = [](bool First,
16111 const std::pair<unsigned, unsigned> &P) {
16112 return First ? P.first == 0 : P.second == 0;
16113 };
16114 auto VFIsProfitable = [](bool First, unsigned Size,
16115 const std::pair<unsigned, unsigned> &P) {
16116 return First ? Size >= P.first : Size >= P.second;
16117 };
16118 auto FirstSizeSame = [](unsigned Size,
16119 const std::pair<unsigned, unsigned> &P) {
16120 return Size == P.first;
16121 };
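      // The loop below repeatedly sweeps the candidate VFs (largest first)
      // over the ranges of stores that have not been vectorized yet.
      // RangeSizes records, per store, the best tree size seen so far; the
      // predicates above use it to skip ranges where another attempt with the
      // current VF is unlikely to pay off. At most MaxAttempts sweeps are
      // made, with a possible final sweep that uses the maximum number of
      // elements.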
16122 while (true) {
16123 ++Repeat;
16124 bool RepeatChanged = false;
16125 bool AnyProfitableGraph = false;
16126 for (unsigned Size : CandidateVFs) {
16127 AnyProfitableGraph = false;
16128 unsigned StartIdx = std::distance(
16129 first: RangeSizes.begin(),
16130 last: find_if(Range&: RangeSizes, P: std::bind(f&: IsNotVectorized, args: Size >= MaxRegVF,
16131 args: std::placeholders::_1)));
16132 while (StartIdx < End) {
16133 unsigned EndIdx =
16134 std::distance(first: RangeSizes.begin(),
16135 last: find_if(Range: RangeSizes.drop_front(N: StartIdx),
16136 P: std::bind(f&: IsVectorized, args: Size >= MaxRegVF,
16137 args: std::placeholders::_1)));
16138 unsigned Sz = EndIdx >= End ? End : EndIdx;
16139 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
16140 if (!checkTreeSizes(Sizes: RangeSizes.slice(N: Cnt, M: Size),
16141 First: Size >= MaxRegVF)) {
16142 ++Cnt;
16143 continue;
16144 }
16145 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(N: Cnt, M: Size);
16146 assert(all_of(Slice,
16147 [&](Value *V) {
16148 return cast<StoreInst>(V)
16149 ->getValueOperand()
16150 ->getType() ==
16151 cast<StoreInst>(Slice.front())
16152 ->getValueOperand()
16153 ->getType();
16154 }) &&
16155 "Expected all operands of same type.");
16156 if (!NonSchedulable.empty()) {
16157 auto [NonSchedSizeMax, NonSchedSizeMin] =
16158 NonSchedulable.lookup(Val: Slice.front());
16159 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
16160 Cnt += NonSchedSizeMax;
16161 continue;
16162 }
16163 }
16164 unsigned TreeSize;
16165 std::optional<bool> Res =
16166 vectorizeStoreChain(Chain: Slice, R, Idx: Cnt, MinVF, Size&: TreeSize);
16167 if (!Res) {
16168 NonSchedulable
16169 .try_emplace(Key: Slice.front(), Args: std::make_pair(x&: Size, y&: Size))
16170 .first->getSecond()
16171 .second = Size;
16172 } else if (*Res) {
16173 // Mark the vectorized stores so that we don't vectorize them
16174 // again.
16175 VectorizedStores.insert(I: Slice.begin(), E: Slice.end());
16178 AnyProfitableGraph = RepeatChanged = Changed = true;
16179 // If we vectorized initial block, no need to try to vectorize
16180 // it again.
16181 for_each(Range: RangeSizes.slice(N: Cnt, M: Size),
16182 F: [](std::pair<unsigned, unsigned> &P) {
16183 P.first = P.second = 0;
16184 });
16185 if (Cnt < StartIdx + MinVF) {
16186 for_each(Range: RangeSizes.slice(N: StartIdx, M: Cnt - StartIdx),
16187 F: [](std::pair<unsigned, unsigned> &P) {
16188 P.first = P.second = 0;
16189 });
16190 StartIdx = Cnt + Size;
16191 }
16192 if (Cnt > Sz - Size - MinVF) {
16193 for_each(Range: RangeSizes.slice(N: Cnt + Size, M: Sz - (Cnt + Size)),
16194 F: [](std::pair<unsigned, unsigned> &P) {
16195 P.first = P.second = 0;
16196 });
16197 if (Sz == End)
16198 End = Cnt;
16199 Sz = Cnt;
16200 }
16201 Cnt += Size;
16202 continue;
16203 }
16204 if (Size > 2 && Res &&
16205 !all_of(Range: RangeSizes.slice(N: Cnt, M: Size),
16206 P: std::bind(f&: VFIsProfitable, args: Size >= MaxRegVF, args&: TreeSize,
16207 args: std::placeholders::_1))) {
16208 Cnt += Size;
16209 continue;
16210 }
              // For very large VFs, check that we are not rebuilding the same
              // trees, just with a larger number of elements.
16213 if (Size > MaxRegVF && TreeSize > 1 &&
16214 all_of(Range: RangeSizes.slice(N: Cnt, M: Size),
16215 P: std::bind(f&: FirstSizeSame, args&: TreeSize,
16216 args: std::placeholders::_1))) {
16217 Cnt += Size;
16218 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
16219 ++Cnt;
16220 continue;
16221 }
16222 if (TreeSize > 1)
16223 for_each(Range: RangeSizes.slice(N: Cnt, M: Size),
16224 F: [&](std::pair<unsigned, unsigned> &P) {
16225 if (Size >= MaxRegVF)
16226 P.second = std::max(a: P.second, b: TreeSize);
16227 else
16228 P.first = std::max(a: P.first, b: TreeSize);
16229 });
16230 ++Cnt;
16231 AnyProfitableGraph = true;
16232 }
16233 if (StartIdx >= End)
16234 break;
16235 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
16236 AnyProfitableGraph = true;
16237 StartIdx = std::distance(
16238 first: RangeSizes.begin(),
16239 last: find_if(Range: RangeSizes.drop_front(N: Sz),
16240 P: std::bind(f&: IsNotVectorized, args: Size >= MaxRegVF,
16241 args: std::placeholders::_1)));
16242 }
16243 if (!AnyProfitableGraph && Size >= MaxRegVF)
16244 break;
16245 }
16246 // All values vectorized - exit.
16247 if (all_of(Range&: RangeSizes, P: [](const std::pair<unsigned, unsigned> &P) {
16248 return P.first == 0 && P.second == 0;
16249 }))
16250 break;
        // Check if we have tried all attempts or if there is no need for any
        // further attempts at all.
16252 if (Repeat >= MaxAttempts ||
16253 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
16254 break;
16255 constexpr unsigned StoresLimit = 64;
16256 const unsigned MaxTotalNum = bit_floor(Value: std::min<unsigned>(
16257 a: Operands.size(),
16258 b: static_cast<unsigned>(
16259 End -
16260 std::distance(
16261 first: RangeSizes.begin(),
16262 last: find_if(Range&: RangeSizes, P: std::bind(f&: IsNotVectorized, args: true,
16263 args: std::placeholders::_1))) +
16264 1)));
16265 unsigned VF = PowerOf2Ceil(A: CandidateVFs.front()) * 2;
16266 if (VF > MaxTotalNum || VF >= StoresLimit)
16267 break;
16268 for_each(Range&: RangeSizes, F: [&](std::pair<unsigned, unsigned> &P) {
16269 if (P.first != 0)
16270 P.first = std::max(a: P.second, b: P.first);
16271 });
        // Last attempt: vectorize the maximum number of elements, if all
        // previous attempts were unsuccessful because of cost issues.
16274 CandidateVFs.clear();
16275 CandidateVFs.push_back(Elt: VF);
16276 }
16277 }
16278 };
16279
  // Stores a pair (first: index of the store in the Stores array ref whose
  // address is taken as the base, second: sorted set of pairs {index, dist},
  // which are the indices of stores in the set and their store location
  // distances relative to the base address).

  // The index of the very first store is kept separately because the set may
  // be reordered after an insertion and the first store may be moved. This
  // container reduces the number of calls to getPointersDiff().
16288 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
  // Inserts the store SI with the given index Idx into the set of stores. If
  // a store with the same distance has already been found, stop the insertion
  // and try to vectorize the stores found so far. If some stores from this
  // sequence were not vectorized, try to vectorize them together with the new
  // store later. This logic is applied only to the stores that come before
  // the previous store with the same distance.
16295 // Example:
16296 // 1. store x, %p
16297 // 2. store y, %p+1
16298 // 3. store z, %p+2
16299 // 4. store a, %p
16300 // 5. store b, %p+3
16301 // - Scan this from the last to first store. The very first bunch of stores is
16302 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
16303 // vector).
16304 // - The next store in the list - #1 - has the same distance from store #5 as
16305 // the store #4.
16306 // - Try to vectorize sequence of stores 4,2,3,5.
16307 // - If all these stores are vectorized - just drop them.
16308 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
16309 // - Start new stores sequence.
16310 // The new bunch of stores is {1, {1, 0}}.
  // - Add the stores from the previous sequence that were not vectorized.
  // Here we consider the stores in reverse order, rather than the order in
  // which they are used in the IR (Stores are reversed already, see
  // vectorizeStoreChains()).
  // Store #3 can be added - it comes after store #4 with the same distance as
  // store #1.
  // Store #5 cannot be added - it comes before store #4.
  // This logic improves compile time: we assume that the stores after a
  // previous store with the same distance most likely have memory
  // dependencies, so there is no need to waste compile time trying to
  // vectorize them.
16320 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
16321 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
16322 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16323 std::optional<int> Diff = getPointersDiff(
16324 ElemTyA: Stores[Set.first]->getValueOperand()->getType(),
16325 PtrA: Stores[Set.first]->getPointerOperand(),
16326 ElemTyB: SI->getValueOperand()->getType(), PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
16327 /*StrictCheck=*/true);
16328 if (!Diff)
16329 continue;
16330 auto It = Set.second.find(x: std::make_pair(x&: Idx, y&: *Diff));
16331 if (It == Set.second.end()) {
16332 Set.second.emplace(args&: Idx, args&: *Diff);
16333 return;
16334 }
16335 // Try to vectorize the first found set to avoid duplicate analysis.
16336 TryToVectorize(Set.second);
16337 StoreIndexToDistSet PrevSet;
16338 PrevSet.swap(x&: Set.second);
16339 Set.first = Idx;
16340 Set.second.emplace(args&: Idx, args: 0);
16341 // Insert stores that followed previous match to try to vectorize them
16342 // with this store.
16343 unsigned StartIdx = It->first + 1;
16344 SmallBitVector UsedStores(Idx - StartIdx);
16345 // Distances to previously found dup store (or this store, since they
16346 // store to the same addresses).
16347 SmallVector<int> Dists(Idx - StartIdx, 0);
16348 for (const std::pair<unsigned, int> &Pair : reverse(C&: PrevSet)) {
        // Do not try to vectorize sequences we have already tried.
16350 if (Pair.first <= It->first ||
16351 VectorizedStores.contains(Ptr: Stores[Pair.first]))
16352 break;
16353 unsigned BI = Pair.first - StartIdx;
16354 UsedStores.set(BI);
16355 Dists[BI] = Pair.second - It->second;
16356 }
16357 for (unsigned I = StartIdx; I < Idx; ++I) {
16358 unsigned BI = I - StartIdx;
16359 if (UsedStores.test(Idx: BI))
16360 Set.second.emplace(args&: I, args&: Dists[BI]);
16361 }
16362 return;
16363 }
16364 auto &Res = SortedStores.emplace_back();
16365 Res.first = Idx;
16366 Res.second.emplace(args&: Idx, args: 0);
16367 };
16368 Type *PrevValTy = nullptr;
16369 for (auto [I, SI] : enumerate(First&: Stores)) {
16370 if (R.isDeleted(I: SI))
16371 continue;
16372 if (!PrevValTy)
16373 PrevValTy = SI->getValueOperand()->getType();
16374 // Check that we do not try to vectorize stores of different types.
16375 if (PrevValTy != SI->getValueOperand()->getType()) {
16376 for (auto &Set : SortedStores)
16377 TryToVectorize(Set.second);
16378 SortedStores.clear();
16379 PrevValTy = SI->getValueOperand()->getType();
16380 }
16381 FillStoresSet(I, SI);
16382 }
16383
16384 // Final vectorization attempt.
16385 for (auto &Set : SortedStores)
16386 TryToVectorize(Set.second);
16387
16388 return Changed;
16389}
16390
16391void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
16392 // Initialize the collections. We will make a single pass over the block.
16393 Stores.clear();
16394 GEPs.clear();
16395
16396 // Visit the store and getelementptr instructions in BB and organize them in
16397 // Stores and GEPs according to the underlying objects of their pointer
16398 // operands.
16399 for (Instruction &I : *BB) {
16400 // Ignore store instructions that are volatile or have a pointer operand
16401 // that doesn't point to a scalar type.
16402 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
16403 if (!SI->isSimple())
16404 continue;
16405 if (!isValidElementType(Ty: SI->getValueOperand()->getType()))
16406 continue;
16407 Stores[getUnderlyingObject(V: SI->getPointerOperand())].push_back(Elt: SI);
16408 }
16409
16410 // Ignore getelementptr instructions that have more than one index, a
16411 // constant index, or a pointer operand that doesn't point to a scalar
16412 // type.
16413 else if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) {
16414 if (GEP->getNumIndices() != 1)
16415 continue;
16416 Value *Idx = GEP->idx_begin()->get();
16417 if (isa<Constant>(Val: Idx))
16418 continue;
16419 if (!isValidElementType(Ty: Idx->getType()))
16420 continue;
16421 if (GEP->getType()->isVectorTy())
16422 continue;
16423 GEPs[GEP->getPointerOperand()].push_back(Elt: GEP);
16424 }
16425 }
16426}
16427
16428bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
16429 bool MaxVFOnly) {
16430 if (VL.size() < 2)
16431 return false;
16432
16433 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
16434 << VL.size() << ".\n");
16435
  // Check that all of the parts are instructions of the same type;
  // an alternate opcode is permitted via InstructionsState.
16438 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
16439 if (!S.getOpcode())
16440 return false;
16441
16442 Instruction *I0 = cast<Instruction>(Val: S.OpValue);
16443 // Make sure invalid types (including vector type) are rejected before
16444 // determining vectorization factor for scalar instructions.
16445 for (Value *V : VL) {
16446 Type *Ty = V->getType();
16447 if (!isa<InsertElementInst>(Val: V) && !isValidElementType(Ty)) {
      // NOTE: the following will print the internal LLVM type name, which may
      // not be meaningful to the user.
16450 R.getORE()->emit(RemarkBuilder: [&]() {
16451 std::string TypeStr;
16452 llvm::raw_string_ostream rso(TypeStr);
16453 Ty->print(O&: rso);
16454 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
16455 << "Cannot SLP vectorize list: type "
16456 << TypeStr + " is unsupported by vectorizer";
16457 });
16458 return false;
16459 }
16460 }
16461
16462 unsigned Sz = R.getVectorElementSize(V: I0);
16463 unsigned MinVF = R.getMinVF(Sz);
16464 unsigned MaxVF = std::max<unsigned>(a: llvm::bit_floor(Value: VL.size()), b: MinVF);
16465 MaxVF = std::min(a: R.getMaximumVF(ElemWidth: Sz, Opcode: S.getOpcode()), b: MaxVF);
16466 if (MaxVF < 2) {
16467 R.getORE()->emit(RemarkBuilder: [&]() {
16468 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
16469 << "Cannot SLP vectorize list: vectorization factor "
16470 << "less than 2 is not supported";
16471 });
16472 return false;
16473 }
16474
16475 bool Changed = false;
16476 bool CandidateFound = false;
16477 InstructionCost MinCost = SLPCostThreshold.getValue();
16478 Type *ScalarTy = VL[0]->getType();
16479 if (auto *IE = dyn_cast<InsertElementInst>(Val: VL[0]))
16480 ScalarTy = IE->getOperand(i_nocapture: 1)->getType();
16481
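  // Walk the list with decreasing power-of-two vectorization factors: for
  // each VF, try to vectorize a VF-wide bundle starting at each position.
  // On success, skip past the vectorized bundle; values that were not
  // vectorized are retried with smaller VFs.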
16482 unsigned NextInst = 0, MaxInst = VL.size();
16483 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
    // No actual vectorization should happen if the number of parts is the
    // same as the provided vectorization factor (i.e. the scalar type would
    // be used for the vector code during codegen).
16487 auto *VecTy = getWidenedType(ScalarTy, VF);
16488 if (TTI->getNumberOfParts(Tp: VecTy) == VF)
16489 continue;
16490 for (unsigned I = NextInst; I < MaxInst; ++I) {
16491 unsigned ActualVF = std::min(a: MaxInst - I, b: VF);
16492
16493 if (!isPowerOf2_32(Value: ActualVF))
16494 continue;
16495
16496 if (MaxVFOnly && ActualVF < MaxVF)
16497 break;
16498 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
16499 break;
16500
16501 ArrayRef<Value *> Ops = VL.slice(N: I, M: ActualVF);
16502 // Check that a previous iteration of this loop did not delete the Value.
16503 if (llvm::any_of(Range&: Ops, P: [&R](Value *V) {
16504 auto *I = dyn_cast<Instruction>(Val: V);
16505 return I && R.isDeleted(I);
16506 }))
16507 continue;
16508
16509 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
16510 << "\n");
16511
16512 R.buildTree(Roots: Ops);
16513 if (R.isTreeTinyAndNotFullyVectorizable())
16514 continue;
16515 R.reorderTopToBottom();
16516 R.reorderBottomToTop(
16517 /*IgnoreReorder=*/!isa<InsertElementInst>(Val: Ops.front()) &&
16518 !R.doesRootHaveInTreeUses());
16519 R.buildExternalUses();
16520
16521 R.computeMinimumValueSizes();
16522 R.transformNodes();
16523 InstructionCost Cost = R.getTreeCost();
16524 CandidateFound = true;
16525 MinCost = std::min(a: MinCost, b: Cost);
16526
16527 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16528 << " for VF=" << ActualVF << "\n");
16529 if (Cost < -SLPCostThreshold) {
16530 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
16531 R.getORE()->emit(OptDiag&: OptimizationRemark(SV_NAME, "VectorizedList",
16532 cast<Instruction>(Val: Ops[0]))
16533 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
16534 << " and with tree size "
16535 << ore::NV("TreeSize", R.getTreeSize()));
16536
16537 R.vectorizeTree();
16538 // Move to the next bundle.
16539 I += VF - 1;
16540 NextInst = I + 1;
16541 Changed = true;
16542 }
16543 }
16544 }
16545
16546 if (!Changed && CandidateFound) {
16547 R.getORE()->emit(RemarkBuilder: [&]() {
16548 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
16549 << "List vectorization was possible but not beneficial with cost "
16550 << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Threshold", -SLPCostThreshold);
16552 });
16553 } else if (!Changed) {
16554 R.getORE()->emit(RemarkBuilder: [&]() {
16555 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
16556 << "Cannot SLP vectorize list: vectorization was impossible"
16557 << " with available vectorization factors";
16558 });
16559 }
16560 return Changed;
16561}
16562
16563bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
16564 if (!I)
16565 return false;
16566
16567 if (!isa<BinaryOperator, CmpInst>(Val: I) || isa<VectorType>(Val: I->getType()))
16568 return false;
16569
16570 Value *P = I->getParent();
16571
16572 // Vectorize in current basic block only.
16573 auto *Op0 = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
16574 auto *Op1 = dyn_cast<Instruction>(Val: I->getOperand(i: 1));
16575 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
16576 return false;
16577
16578 // First collect all possible candidates
16579 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
16580 Candidates.emplace_back(Args&: Op0, Args&: Op1);
16581
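  // If an operand is itself a single-use binary operator in the same block,
  // its operands are also plausible roots: e.g. for I = (A0 + A1) + (B0 * B1)
  // the pairs (A, B0), (A, B1), (A0, B) and (A1, B) are considered as well,
  // and the most promising pair is selected below.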
16582 auto *A = dyn_cast<BinaryOperator>(Val: Op0);
16583 auto *B = dyn_cast<BinaryOperator>(Val: Op1);
16584 // Try to skip B.
16585 if (A && B && B->hasOneUse()) {
16586 auto *B0 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 0));
16587 auto *B1 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 1));
16588 if (B0 && B0->getParent() == P)
16589 Candidates.emplace_back(Args&: A, Args&: B0);
16590 if (B1 && B1->getParent() == P)
16591 Candidates.emplace_back(Args&: A, Args&: B1);
16592 }
16593 // Try to skip A.
16594 if (B && A && A->hasOneUse()) {
16595 auto *A0 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 0));
16596 auto *A1 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 1));
16597 if (A0 && A0->getParent() == P)
16598 Candidates.emplace_back(Args&: A0, Args&: B);
16599 if (A1 && A1->getParent() == P)
16600 Candidates.emplace_back(Args&: A1, Args&: B);
16601 }
16602
16603 if (Candidates.size() == 1)
16604 return tryToVectorizeList(VL: {Op0, Op1}, R);
16605
16606 // We have multiple options. Try to pick the single best.
16607 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
16608 if (!BestCandidate)
16609 return false;
16610 return tryToVectorizeList(
16611 VL: {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
16612}
16613
16614namespace {
16615
16616/// Model horizontal reductions.
16617///
16618/// A horizontal reduction is a tree of reduction instructions that has values
16619/// that can be put into a vector as its leaves. For example:
16620///
16621/// mul mul mul mul
16622/// \ / \ /
16623/// + +
16624/// \ /
16625/// +
16626/// This tree has "mul" as its leaf values and "+" as its reduction
16627/// instructions. A reduction can feed into a store or a binary operation
16628/// feeding a phi.
16629/// ...
16630/// \ /
16631/// +
16632/// |
16633/// phi +=
16634///
16635/// Or:
16636/// ...
16637/// \ /
16638/// +
16639/// |
16640/// *p =
16641///
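///
/// For instance (an illustrative sketch with made-up value names, not taken
/// from an actual test case), a linear chain of integer adds over multiplied
/// leaves feeding a store:
///   %m0 = mul i32 %a0, %b0
///   %m1 = mul i32 %a1, %b1
///   %m2 = mul i32 %a2, %b2
///   %m3 = mul i32 %a3, %b3
///   %s0 = add i32 %m0, %m1
///   %s1 = add i32 %s0, %m2
///   %s2 = add i32 %s1, %m3
///   store i32 %s2, ptr %p
/// Here the "mul" instructions are the leaves (reduced values) and the "add"
/// chain forms the reduction operations.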
16642class HorizontalReduction {
16643 using ReductionOpsType = SmallVector<Value *, 16>;
16644 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
16645 ReductionOpsListType ReductionOps;
16646 /// List of possibly reduced values.
16647 SmallVector<SmallVector<Value *>> ReducedVals;
16648 /// Maps reduced value to the corresponding reduction operation.
16649 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
16650 // Use map vector to make stable output.
16651 MapVector<Instruction *, Value *> ExtraArgs;
16652 WeakTrackingVH ReductionRoot;
16653 /// The type of reduction operation.
16654 RecurKind RdxKind;
16655 /// Checks if the optimization of original scalar identity operations on
16656 /// matched horizontal reductions is enabled and allowed.
16657 bool IsSupportedHorRdxIdentityOp = false;
16658
16659 static bool isCmpSelMinMax(Instruction *I) {
16660 return match(V: I, P: m_Select(C: m_Cmp(), L: m_Value(), R: m_Value())) &&
16661 RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: getRdxKind(V: I));
16662 }
16663
16664 // And/or are potentially poison-safe logical patterns like:
16665 // select x, y, false
16666 // select x, true, y
16667 static bool isBoolLogicOp(Instruction *I) {
16668 return isa<SelectInst>(Val: I) &&
16669 (match(V: I, P: m_LogicalAnd()) || match(V: I, P: m_LogicalOr()));
16670 }
16671
16672 /// Checks if instruction is associative and can be vectorized.
16673 static bool isVectorizable(RecurKind Kind, Instruction *I) {
16674 if (Kind == RecurKind::None)
16675 return false;
16676
16677 // Integer ops that map to select instructions or intrinsics are fine.
16678 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
16679 isBoolLogicOp(I))
16680 return true;
16681
16682 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16683 // FP min/max are associative except for NaN and -0.0. We do not
16684 // have to rule out -0.0 here because the intrinsic semantics do not
16685 // specify a fixed result for it.
16686 return I->getFastMathFlags().noNaNs();
16687 }
16688
16689 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16690 return true;
16691
16692 return I->isAssociative();
16693 }
16694
16695 static Value *getRdxOperand(Instruction *I, unsigned Index) {
16696 // Poison-safe 'or' takes the form: select X, true, Y
16697 // To make that work with the normal operand processing, we skip the
16698 // true value operand.
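    // For example, given the poison-safe 'or'
    //   %r = select i1 %a, i1 true, i1 %b
    // Index 0 yields %a and Index 1 yields %b, mirroring the operands of a
    // plain 'or %a, %b'.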
16699 // TODO: Change the code and data structures to handle this without a hack.
16700 if (getRdxKind(V: I) == RecurKind::Or && isa<SelectInst>(Val: I) && Index == 1)
16701 return I->getOperand(i: 2);
16702 return I->getOperand(i: Index);
16703 }
16704
16705 /// Creates reduction operation with the current opcode.
16706 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
16707 Value *RHS, const Twine &Name, bool UseSelect) {
16708 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
16709 switch (Kind) {
16710 case RecurKind::Or:
16711 if (UseSelect &&
16712 LHS->getType() == CmpInst::makeCmpResultType(opnd_type: LHS->getType()))
16713 return Builder.CreateSelect(C: LHS, True: Builder.getTrue(), False: RHS, Name);
16714 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16715 Name);
16716 case RecurKind::And:
16717 if (UseSelect &&
16718 LHS->getType() == CmpInst::makeCmpResultType(opnd_type: LHS->getType()))
16719 return Builder.CreateSelect(C: LHS, True: RHS, False: Builder.getFalse(), Name);
16720 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16721 Name);
16722 case RecurKind::Add:
16723 case RecurKind::Mul:
16724 case RecurKind::Xor:
16725 case RecurKind::FAdd:
16726 case RecurKind::FMul:
16727 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16728 Name);
16729 case RecurKind::FMax:
16730 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::maxnum, LHS, RHS);
16731 case RecurKind::FMin:
16732 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::minnum, LHS, RHS);
16733 case RecurKind::FMaximum:
16734 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::maximum, LHS, RHS);
16735 case RecurKind::FMinimum:
16736 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::minimum, LHS, RHS);
16737 case RecurKind::SMax:
16738 if (UseSelect) {
16739 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
16740 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
16741 }
16742 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::smax, LHS, RHS);
16743 case RecurKind::SMin:
16744 if (UseSelect) {
16745 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
16746 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
16747 }
16748 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::smin, LHS, RHS);
16749 case RecurKind::UMax:
16750 if (UseSelect) {
16751 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
16752 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
16753 }
16754 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::umax, LHS, RHS);
16755 case RecurKind::UMin:
16756 if (UseSelect) {
16757 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
16758 return Builder.CreateSelect(C: Cmp, True: LHS, False: RHS, Name);
16759 }
16760 return Builder.CreateBinaryIntrinsic(ID: Intrinsic::umin, LHS, RHS);
16761 default:
16762 llvm_unreachable("Unknown reduction operation.");
16763 }
16764 }
16765
16766 /// Creates reduction operation with the current opcode with the IR flags
16767 /// from \p ReductionOps, dropping nuw/nsw flags.
16768 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
16769 Value *RHS, const Twine &Name,
16770 const ReductionOpsListType &ReductionOps) {
16771 bool UseSelect = ReductionOps.size() == 2 ||
16772 // Logical or/and.
16773 (ReductionOps.size() == 1 &&
16774 any_of(Range: ReductionOps.front(), P: IsaPred<SelectInst>));
16775 assert((!UseSelect || ReductionOps.size() != 2 ||
16776 isa<SelectInst>(ReductionOps[1][0])) &&
16777 "Expected cmp + select pairs for reduction");
16778 Value *Op = createOp(Builder, Kind: RdxKind, LHS, RHS, Name, UseSelect);
16779 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: RdxKind)) {
16780 if (auto *Sel = dyn_cast<SelectInst>(Val: Op)) {
16781 propagateIRFlags(I: Sel->getCondition(), VL: ReductionOps[0], OpValue: nullptr,
16782 /*IncludeWrapFlags=*/false);
16783 propagateIRFlags(I: Op, VL: ReductionOps[1], OpValue: nullptr,
16784 /*IncludeWrapFlags=*/false);
16785 return Op;
16786 }
16787 }
16788 propagateIRFlags(I: Op, VL: ReductionOps[0], OpValue: nullptr, /*IncludeWrapFlags=*/false);
16789 return Op;
16790 }
16791
16792public:
16793 static RecurKind getRdxKind(Value *V) {
16794 auto *I = dyn_cast<Instruction>(Val: V);
16795 if (!I)
16796 return RecurKind::None;
16797 if (match(V: I, P: m_Add(L: m_Value(), R: m_Value())))
16798 return RecurKind::Add;
16799 if (match(V: I, P: m_Mul(L: m_Value(), R: m_Value())))
16800 return RecurKind::Mul;
16801 if (match(V: I, P: m_And(L: m_Value(), R: m_Value())) ||
16802 match(V: I, P: m_LogicalAnd(L: m_Value(), R: m_Value())))
16803 return RecurKind::And;
16804 if (match(V: I, P: m_Or(L: m_Value(), R: m_Value())) ||
16805 match(V: I, P: m_LogicalOr(L: m_Value(), R: m_Value())))
16806 return RecurKind::Or;
16807 if (match(V: I, P: m_Xor(L: m_Value(), R: m_Value())))
16808 return RecurKind::Xor;
16809 if (match(V: I, P: m_FAdd(L: m_Value(), R: m_Value())))
16810 return RecurKind::FAdd;
16811 if (match(V: I, P: m_FMul(L: m_Value(), R: m_Value())))
16812 return RecurKind::FMul;
16813
16814 if (match(V: I, P: m_Intrinsic<Intrinsic::maxnum>(Op0: m_Value(), Op1: m_Value())))
16815 return RecurKind::FMax;
16816 if (match(V: I, P: m_Intrinsic<Intrinsic::minnum>(Op0: m_Value(), Op1: m_Value())))
16817 return RecurKind::FMin;
16818
16819 if (match(V: I, P: m_Intrinsic<Intrinsic::maximum>(Op0: m_Value(), Op1: m_Value())))
16820 return RecurKind::FMaximum;
16821 if (match(V: I, P: m_Intrinsic<Intrinsic::minimum>(Op0: m_Value(), Op1: m_Value())))
16822 return RecurKind::FMinimum;
16823 // This matches either cmp+select or intrinsics. SLP is expected to handle
16824 // either form.
16825 // TODO: If we are canonicalizing to intrinsics, we can remove several
16826 // special-case paths that deal with selects.
16827 if (match(V: I, P: m_SMax(L: m_Value(), R: m_Value())))
16828 return RecurKind::SMax;
16829 if (match(V: I, P: m_SMin(L: m_Value(), R: m_Value())))
16830 return RecurKind::SMin;
16831 if (match(V: I, P: m_UMax(L: m_Value(), R: m_Value())))
16832 return RecurKind::UMax;
16833 if (match(V: I, P: m_UMin(L: m_Value(), R: m_Value())))
16834 return RecurKind::UMin;
16835
16836 if (auto *Select = dyn_cast<SelectInst>(Val: I)) {
16837 // Try harder: look for min/max pattern based on instructions producing
16838 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
16839 // During the intermediate stages of SLP, it's very common to have
16840 // pattern like this (since optimizeGatherSequence is run only once
16841 // at the end):
16842 // %1 = extractelement <2 x i32> %a, i32 0
16843 // %2 = extractelement <2 x i32> %a, i32 1
16844 // %cond = icmp sgt i32 %1, %2
16845 // %3 = extractelement <2 x i32> %a, i32 0
16846 // %4 = extractelement <2 x i32> %a, i32 1
16847 // %select = select i1 %cond, i32 %3, i32 %4
16848 CmpInst::Predicate Pred;
16849 Instruction *L1;
16850 Instruction *L2;
16851
16852 Value *LHS = Select->getTrueValue();
16853 Value *RHS = Select->getFalseValue();
16854 Value *Cond = Select->getCondition();
16855
16856 // TODO: Support inverse predicates.
16857 if (match(V: Cond, P: m_Cmp(Pred, L: m_Specific(V: LHS), R: m_Instruction(I&: L2)))) {
16858 if (!isa<ExtractElementInst>(Val: RHS) ||
16859 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
16860 return RecurKind::None;
16861 } else if (match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Specific(V: RHS)))) {
16862 if (!isa<ExtractElementInst>(Val: LHS) ||
16863 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)))
16864 return RecurKind::None;
16865 } else {
16866 if (!isa<ExtractElementInst>(Val: LHS) || !isa<ExtractElementInst>(Val: RHS))
16867 return RecurKind::None;
16868 if (!match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Instruction(I&: L2))) ||
16869 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)) ||
16870 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
16871 return RecurKind::None;
16872 }
16873
16874 switch (Pred) {
16875 default:
16876 return RecurKind::None;
16877 case CmpInst::ICMP_SGT:
16878 case CmpInst::ICMP_SGE:
16879 return RecurKind::SMax;
16880 case CmpInst::ICMP_SLT:
16881 case CmpInst::ICMP_SLE:
16882 return RecurKind::SMin;
16883 case CmpInst::ICMP_UGT:
16884 case CmpInst::ICMP_UGE:
16885 return RecurKind::UMax;
16886 case CmpInst::ICMP_ULT:
16887 case CmpInst::ICMP_ULE:
16888 return RecurKind::UMin;
16889 }
16890 }
16891 return RecurKind::None;
16892 }
16893
16894 /// Get the index of the first operand.
16895 static unsigned getFirstOperandIndex(Instruction *I) {
16896 return isCmpSelMinMax(I) ? 1 : 0;
16897 }
16898
16899private:
16900 /// Total number of operands in the reduction operation.
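  /// For a cmp+select min/max such as
  ///   %s = select i1 %c, i32 %x, i32 %y
  /// the select has three operands and the reduction operands start at
  /// index 1 (the condition at index 0 is handled separately); plain binary
  /// reductions have two operands starting at index 0.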
16901 static unsigned getNumberOfOperands(Instruction *I) {
16902 return isCmpSelMinMax(I) ? 3 : 2;
16903 }
16904
16905 /// Checks if the instruction is in basic block \p BB.
16906 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
16907 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16908 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16909 auto *Sel = cast<SelectInst>(Val: I);
16910 auto *Cmp = dyn_cast<Instruction>(Val: Sel->getCondition());
16911 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16912 }
16913 return I->getParent() == BB;
16914 }
16915
  /// Checks that the reduction operation or reduced value \p I has the
  /// expected number of uses.
16917 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16918 if (IsCmpSelMinMax) {
      // The SelectInst must be used twice, while the condition op must have
      // a single use only.
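      // E.g. in a min/max chain
      //   %c = icmp sgt i32 %prev, %x
      //   %cur = select i1 %c, i32 %prev, i32 %x
      // the previous link %prev is used both by the compare and by the
      // select, hence exactly two uses.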
16921 if (auto *Sel = dyn_cast<SelectInst>(Val: I))
16922 return Sel->hasNUses(N: 2) && Sel->getCondition()->hasOneUse();
16923 return I->hasNUses(N: 2);
16924 }
16925
    // An arithmetic reduction operation must have a single use.
16927 return I->hasOneUse();
16928 }
16929
16930 /// Initializes the list of reduction operations.
16931 void initReductionOps(Instruction *I) {
16932 if (isCmpSelMinMax(I))
16933 ReductionOps.assign(NumElts: 2, Elt: ReductionOpsType());
16934 else
16935 ReductionOps.assign(NumElts: 1, Elt: ReductionOpsType());
16936 }
16937
16938 /// Add all reduction operations for the reduction instruction \p I.
16939 void addReductionOps(Instruction *I) {
16940 if (isCmpSelMinMax(I)) {
16941 ReductionOps[0].emplace_back(Args: cast<SelectInst>(Val: I)->getCondition());
16942 ReductionOps[1].emplace_back(Args&: I);
16943 } else {
16944 ReductionOps[0].emplace_back(Args&: I);
16945 }
16946 }
16947
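  /// Returns true if \p Data is worth keeping as its own group of reduced
  /// values: more than one element, a single constant, or a single non-load
  /// instruction whose opcode is valid for alternation.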
16948 static bool isGoodForReduction(ArrayRef<Value *> Data) {
16949 int Sz = Data.size();
16950 auto *I = dyn_cast<Instruction>(Val: Data.front());
16951 return Sz > 1 || isConstant(V: Data.front()) ||
16952 (I && !isa<LoadInst>(Val: I) && isValidForAlternation(Opcode: I->getOpcode()));
16953 }
16954
16955public:
16956 HorizontalReduction() = default;
16957
16958 /// Try to find a reduction tree.
16959 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
16960 ScalarEvolution &SE, const DataLayout &DL,
16961 const TargetLibraryInfo &TLI) {
16962 RdxKind = HorizontalReduction::getRdxKind(V: Root);
16963 if (!isVectorizable(Kind: RdxKind, I: Root))
16964 return false;
16965
16966 // Analyze "regular" integer/FP types for reductions - no target-specific
16967 // types or pointers.
16968 Type *Ty = Root->getType();
16969 if (!isValidElementType(Ty) || Ty->isPointerTy())
16970 return false;
16971
    // Though the ultimate reduction may have multiple uses, its condition
    // must have only a single use.
16974 if (auto *Sel = dyn_cast<SelectInst>(Val: Root))
16975 if (!Sel->getCondition()->hasOneUse())
16976 return false;
16977
16978 ReductionRoot = Root;
16979
16980 // Iterate through all the operands of the possible reduction tree and
16981 // gather all the reduced values, sorting them by their value id.
16982 BasicBlock *BB = Root->getParent();
16983 bool IsCmpSelMinMax = isCmpSelMinMax(I: Root);
16984 SmallVector<Instruction *> Worklist(1, Root);
16985 // Checks if the operands of the \p TreeN instruction are also reduction
16986 // operations or should be treated as reduced values or an extra argument,
16987 // which is not part of the reduction.
16988 auto CheckOperands = [&](Instruction *TreeN,
16989 SmallVectorImpl<Value *> &ExtraArgs,
16990 SmallVectorImpl<Value *> &PossibleReducedVals,
16991 SmallVectorImpl<Instruction *> &ReductionOps) {
16992 for (int I : reverse(C: seq<int>(Begin: getFirstOperandIndex(I: TreeN),
16993 End: getNumberOfOperands(I: TreeN)))) {
16994 Value *EdgeVal = getRdxOperand(I: TreeN, Index: I);
16995 ReducedValsToOps[EdgeVal].push_back(Elt: TreeN);
16996 auto *EdgeInst = dyn_cast<Instruction>(Val: EdgeVal);
16997 // Edge has wrong parent - mark as an extra argument.
16998 if (EdgeInst && !isVectorLikeInstWithConstOps(V: EdgeInst) &&
16999 !hasSameParent(I: EdgeInst, BB)) {
17000 ExtraArgs.push_back(Elt: EdgeVal);
17001 continue;
17002 }
        // If the edge is not an instruction, differs from the main reduction
        // opcode, or has too many uses, treat it as a possible reduced value.
        // Also, do not try to reduce constant values if the operation is not
        // foldable.
17007 if (!EdgeInst || getRdxKind(V: EdgeInst) != RdxKind ||
17008 IsCmpSelMinMax != isCmpSelMinMax(I: EdgeInst) ||
17009 !hasRequiredNumberOfUses(IsCmpSelMinMax, I: EdgeInst) ||
17010 !isVectorizable(Kind: RdxKind, I: EdgeInst) ||
17011 (R.isAnalyzedReductionRoot(I: EdgeInst) &&
17012 all_of(Range: EdgeInst->operands(), P: IsaPred<Constant>))) {
17013 PossibleReducedVals.push_back(Elt: EdgeVal);
17014 continue;
17015 }
17016 ReductionOps.push_back(Elt: EdgeInst);
17017 }
17018 };
    // Try to regroup the reduced values so that reducing them becomes more
    // profitable. Values are grouped by their value ids, instructions by
    // their opcode id and/or alternate opcode id; extra analysis is done for
    // loads (grouping them by the distance between pointers) and for cmp
    // instructions (grouping them by the predicate).
17024 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
17025 PossibleReducedVals;
17026 initReductionOps(I: Root);
17027 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
17028 SmallSet<size_t, 2> LoadKeyUsed;
17029
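    // Subkey generator for loads: a load whose pointer is provably related
    // to an already-seen load from the same underlying object (constant
    // pointer distance, or otherwise compatible pointers) reuses that load's
    // pointer hash as its subkey, so potentially consecutive loads land in
    // the same group of possible reduced values.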
17030 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
17031 Value *Ptr = getUnderlyingObject(V: LI->getPointerOperand());
17032 if (LoadKeyUsed.contains(V: Key)) {
17033 auto LIt = LoadsMap.find(Val: Ptr);
17034 if (LIt != LoadsMap.end()) {
17035 for (LoadInst *RLI : LIt->second) {
17036 if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
17037 ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL, SE,
17038 /*StrictCheck=*/true))
17039 return hash_value(ptr: RLI->getPointerOperand());
17040 }
17041 for (LoadInst *RLI : LIt->second) {
17042 if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
17043 Ptr2: LI->getPointerOperand(), TLI)) {
17044 hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
17045 return SubKey;
17046 }
17047 }
17048 if (LIt->second.size() > 2) {
17049 hash_code SubKey =
17050 hash_value(ptr: LIt->second.back()->getPointerOperand());
17051 return SubKey;
17052 }
17053 }
17054 }
17055 LoadKeyUsed.insert(V: Key);
17056 LoadsMap.try_emplace(Key: Ptr).first->second.push_back(Elt: LI);
17057 return hash_value(ptr: LI->getPointerOperand());
17058 };
17059
17060 while (!Worklist.empty()) {
17061 Instruction *TreeN = Worklist.pop_back_val();
17062 SmallVector<Value *> Args;
17063 SmallVector<Value *> PossibleRedVals;
17064 SmallVector<Instruction *> PossibleReductionOps;
17065 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
      // If the instruction has at most one extra argument, record it as a
      // reduction operation; otherwise (too many extra args) treat the
      // instruction itself as a reduction value, not a reduction operation.
17068 if (Args.size() < 2) {
17069 addReductionOps(I: TreeN);
17070 // Add extra args.
17071 if (!Args.empty()) {
17072 assert(Args.size() == 1 && "Expected only single argument.");
17073 ExtraArgs[TreeN] = Args.front();
17074 }
17075 // Add reduction values. The values are sorted for better vectorization
17076 // results.
17077 for (Value *V : PossibleRedVals) {
17078 size_t Key, Idx;
17079 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
17080 /*AllowAlternate=*/false);
17081 ++PossibleReducedVals[Key][Idx]
17082 .insert(KV: std::make_pair(x&: V, y: 0))
17083 .first->second;
17084 }
17085 Worklist.append(in_start: PossibleReductionOps.rbegin(),
17086 in_end: PossibleReductionOps.rend());
17087 } else {
17088 size_t Key, Idx;
17089 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V: TreeN, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
17090 /*AllowAlternate=*/false);
17091 ++PossibleReducedVals[Key][Idx]
17092 .insert(KV: std::make_pair(x&: TreeN, y: 0))
17093 .first->second;
17094 }
17095 }
17096 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort the values by the total number of value kinds so that the
    // reduction starts from the longest possible sequences of reduced values.
17099 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
17100 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
17101 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
17102 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
17103 It != E; ++It) {
17104 PossibleRedValsVect.emplace_back();
17105 auto RedValsVect = It->second.takeVector();
17106 stable_sort(Range&: RedValsVect, C: llvm::less_second());
17107 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
17108 PossibleRedValsVect.back().append(NumInputs: Data.second, Elt: Data.first);
17109 }
17110 stable_sort(Range&: PossibleRedValsVect, C: [](const auto &P1, const auto &P2) {
17111 return P1.size() > P2.size();
17112 });
17113 int NewIdx = -1;
17114 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
17115 if (NewIdx < 0 ||
17116 (!isGoodForReduction(Data) &&
17117 (!isa<LoadInst>(Val: Data.front()) ||
17118 !isa<LoadInst>(Val: ReducedVals[NewIdx].front()) ||
17119 getUnderlyingObject(
17120 V: cast<LoadInst>(Val: Data.front())->getPointerOperand()) !=
17121 getUnderlyingObject(
17122 V: cast<LoadInst>(Val: ReducedVals[NewIdx].front())
17123 ->getPointerOperand())))) {
17124 NewIdx = ReducedVals.size();
17125 ReducedVals.emplace_back();
17126 }
17127 ReducedVals[NewIdx].append(in_start: Data.rbegin(), in_end: Data.rend());
17128 }
17129 }
    // Sort the groups of reduced values by the number of values with the
    // same/alternate opcode and/or pointer operand.
17132 stable_sort(Range&: ReducedVals, C: [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
17133 return P1.size() > P2.size();
17134 });
17135 return true;
17136 }
17137
17138 /// Attempt to vectorize the tree found by matchAssociativeReduction.
17139 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
17140 const TargetLibraryInfo &TLI) {
17141 constexpr int ReductionLimit = 4;
17142 constexpr unsigned RegMaxNumber = 4;
17143 constexpr unsigned RedValsMaxNumber = 128;
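    // ReductionLimit is the minimum number of reduced values worth
    // vectorizing; RegMaxNumber and RedValsMaxNumber roughly bound how wide
    // the (possibly oversized) reduction vector may get before the backend
    // is relied on to split it.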
17144 // If there are a sufficient number of reduction values, reduce
17145 // to a nearby power-of-2. We can safely generate oversized
17146 // vectors and rely on the backend to split them to legal sizes.
17147 unsigned NumReducedVals =
17148 std::accumulate(first: ReducedVals.begin(), last: ReducedVals.end(), init: 0,
17149 binary_op: [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
17150 if (!isGoodForReduction(Data: Vals))
17151 return Num;
17152 return Num + Vals.size();
17153 });
17154 if (NumReducedVals < ReductionLimit &&
17155 (!AllowHorRdxIdenityOptimization ||
17156 all_of(Range&: ReducedVals, P: [](ArrayRef<Value *> RedV) {
17157 return RedV.size() < 2 || !allConstant(VL: RedV) || !isSplat(VL: RedV);
17158 }))) {
17159 for (ReductionOpsType &RdxOps : ReductionOps)
17160 for (Value *RdxOp : RdxOps)
17161 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
17162 return nullptr;
17163 }
17164
17165 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
17166 TargetFolder(DL));
17167 Builder.SetInsertPoint(cast<Instruction>(Val&: ReductionRoot));
17168
    // Track the reduced values in case they are replaced by extractelement
    // instructions because of the vectorization.
17171 DenseMap<Value *, WeakTrackingVH> TrackedVals(
17172 ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
17173 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
17174 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
17175 ExternallyUsedValues.reserve(NumEntries: ExtraArgs.size() + 1);
17176 // The same extra argument may be used several times, so log each attempt
17177 // to use it.
17178 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
17179 assert(Pair.first && "DebugLoc must be set.");
17180 ExternallyUsedValues[Pair.second].push_back(Elt: Pair.first);
17181 TrackedVals.try_emplace(Key: Pair.second, Args: Pair.second);
17182 }
17183
17184 // The compare instruction of a min/max is the insertion point for new
17185 // instructions and may be replaced with a new compare instruction.
17186 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
17187 assert(isa<SelectInst>(RdxRootInst) &&
17188 "Expected min/max reduction to have select root instruction");
17189 Value *ScalarCond = cast<SelectInst>(Val: RdxRootInst)->getCondition();
17190 assert(isa<Instruction>(ScalarCond) &&
17191 "Expected min/max reduction to have compare condition");
17192 return cast<Instruction>(Val: ScalarCond);
17193 };
17194
17195 // Return new VectorizedTree, based on previous value.
17196 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
17197 if (VectorizedTree) {
17198 // Update the final value in the reduction.
17199 Builder.SetCurrentDebugLocation(
17200 cast<Instruction>(Val: ReductionOps.front().front())->getDebugLoc());
17201 if ((isa<PoisonValue>(Val: VectorizedTree) && !isa<PoisonValue>(Val: Res)) ||
17202 (isGuaranteedNotToBePoison(V: Res) &&
17203 !isGuaranteedNotToBePoison(V: VectorizedTree))) {
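        // Prefer the operand that is known not to be poison as the LHS of
        // the new op: for boolean logic reductions the LHS becomes the
        // select condition of the poison-safe form, where poison would
        // propagate unconditionally.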
17204 auto It = ReducedValsToOps.find(Val: Res);
17205 if (It != ReducedValsToOps.end() &&
17206 any_of(Range&: It->getSecond(),
17207 P: [](Instruction *I) { return isBoolLogicOp(I); }))
17208 std::swap(a&: VectorizedTree, b&: Res);
17209 }
17210
17211 return createOp(Builder, RdxKind, LHS: VectorizedTree, RHS: Res, Name: "op.rdx",
17212 ReductionOps);
17213 }
17214 // Initialize the final value in the reduction.
17215 return Res;
17216 };
17217 bool AnyBoolLogicOp =
17218 any_of(Range&: ReductionOps.back(), P: [](Value *V) {
17219 return isBoolLogicOp(I: cast<Instruction>(Val: V));
17220 });
17221 // The reduction root is used as the insertion point for new instructions,
17222 // so set it as externally used to prevent it from being deleted.
17223 ExternallyUsedValues[ReductionRoot];
17224 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
17225 ReductionOps.front().size());
17226 for (ReductionOpsType &RdxOps : ReductionOps)
17227 for (Value *RdxOp : RdxOps) {
17228 if (!RdxOp)
17229 continue;
17230 IgnoreList.insert(V: RdxOp);
17231 }
17232 // Intersect the fast-math-flags from all reduction operations.
17233 FastMathFlags RdxFMF;
17234 RdxFMF.set();
17235 for (Value *U : IgnoreList)
17236 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: U))
17237 RdxFMF &= FPMO->getFastMathFlags();
17238 bool IsCmpSelMinMax = isCmpSelMinMax(I: cast<Instruction>(Val&: ReductionRoot));
17239
17240 // Need to track reduced vals, they may be changed during vectorization of
17241 // subvectors.
17242 for (ArrayRef<Value *> Candidates : ReducedVals)
17243 for (Value *V : Candidates)
17244 TrackedVals.try_emplace(Key: V, Args&: V);
17245
17246 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
    // List of the values that were reduced in other trees as part of gather
    // nodes and thus require an extract if they are fully vectorized there.
17249 SmallPtrSet<Value *, 4> RequiredExtract;
17250 Value *VectorizedTree = nullptr;
17251 bool CheckForReusedReductionOps = false;
17252 // Try to vectorize elements based on their type.
17253 SmallVector<InstructionsState> States;
17254 for (ArrayRef<Value *> RV : ReducedVals)
17255 States.push_back(Elt: getSameOpcode(VL: RV, TLI));
17256 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
17257 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
17258 InstructionsState S = States[I];
17259 SmallVector<Value *> Candidates;
17260 Candidates.reserve(N: 2 * OrigReducedVals.size());
17261 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
17262 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
17263 Value *RdxVal = TrackedVals.find(Val: OrigReducedVals[Cnt])->second;
        // Check if the reduction value was overridden by an extractelement
        // instruction because of the vectorization, and exclude it if it is
        // not compatible with the other values.
        // Also check if the instruction was folded to a constant/other value.
17268 auto *Inst = dyn_cast<Instruction>(Val: RdxVal);
17269 if ((Inst && isVectorLikeInstWithConstOps(V: Inst) &&
17270 (!S.getOpcode() || !S.isOpcodeOrAlt(I: Inst))) ||
17271 (S.getOpcode() && !Inst))
17272 continue;
17273 Candidates.push_back(Elt: RdxVal);
17274 TrackedToOrig.try_emplace(Key: RdxVal, Args: OrigReducedVals[Cnt]);
17275 }
17276 bool ShuffledExtracts = false;
17277 // Try to handle shuffled extractelements.
17278 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
17279 I + 1 < E) {
17280 InstructionsState NextS = getSameOpcode(VL: ReducedVals[I + 1], TLI);
17281 if (NextS.getOpcode() == Instruction::ExtractElement &&
17282 !NextS.isAltShuffle()) {
17283 SmallVector<Value *> CommonCandidates(Candidates);
17284 for (Value *RV : ReducedVals[I + 1]) {
17285 Value *RdxVal = TrackedVals.find(Val: RV)->second;
            // Check if the reduction value was overridden by the
            // extractelement instruction because of the vectorization, and
            // exclude it if it is not compatible with the other values.
17289 if (auto *Inst = dyn_cast<Instruction>(Val: RdxVal))
17290 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(I: Inst))
17291 continue;
17292 CommonCandidates.push_back(Elt: RdxVal);
17293 TrackedToOrig.try_emplace(Key: RdxVal, Args&: RV);
17294 }
17295 SmallVector<int> Mask;
17296 if (isFixedVectorShuffle(VL: CommonCandidates, Mask)) {
17297 ++I;
17298 Candidates.swap(RHS&: CommonCandidates);
17299 ShuffledExtracts = true;
17300 }
17301 }
17302 }
17303
17304 // Emit code for constant values.
17305 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
17306 allConstant(VL: Candidates)) {
17307 Value *Res = Candidates.front();
17308 ++VectorizedVals.try_emplace(Key: Candidates.front(), Args: 0).first->getSecond();
17309 for (Value *VC : ArrayRef(Candidates).drop_front()) {
17310 Res = createOp(Builder, RdxKind, LHS: Res, RHS: VC, Name: "const.rdx", ReductionOps);
17311 ++VectorizedVals.try_emplace(Key: VC, Args: 0).first->getSecond();
17312 if (auto *ResI = dyn_cast<Instruction>(Val: Res))
17313 V.analyzedReductionRoot(I: ResI);
17314 }
17315 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17316 continue;
17317 }
17318
17319 unsigned NumReducedVals = Candidates.size();
17320 if (NumReducedVals < ReductionLimit &&
17321 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
17322 !isSplat(VL: Candidates)))
17323 continue;
17324
      // Check if we support the processing of repeated scalar values
      // (optimization of the original scalar identity operations on matched
      // horizontal reductions).
17327 IsSupportedHorRdxIdentityOp =
17328 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
17329 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17330 // Gather same values.
17331 MapVector<Value *, unsigned> SameValuesCounter;
17332 if (IsSupportedHorRdxIdentityOp)
17333 for (Value *V : Candidates)
17334 ++SameValuesCounter.insert(KV: std::make_pair(x&: V, y: 0)).first->second;
      // Used to check if the reduced values are used the same number of
      // times. In this case the compiler may produce better code. E.g. if
      // the reduced values are aabbccdd (8 x values), then the first node of
      // the tree will have a node for 4 x abcd + shuffle <4 x abcd>,
      // <0, 0, 1, 1, 2, 2, 3, 3>, and the final reduction will be performed
      // on <8 x aabbccdd>. Instead, the compiler may build the <4 x abcd>
      // tree immediately and reduce (4 x abcd) * 2.
      // Currently this only handles add/fadd/xor; and/or/min/max do not
      // require this analysis, and other operations may require an extra
      // profitability estimation.
17345 bool SameScaleFactor = false;
17346 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17347 SameValuesCounter.size() != Candidates.size();
17348 if (OptReusedScalars) {
17349 SameScaleFactor =
17350 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17351 RdxKind == RecurKind::Xor) &&
17352 all_of(Range: drop_begin(RangeOrContainer&: SameValuesCounter),
17353 P: [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
17354 return P.second == SameValuesCounter.front().second;
17355 });
17356 Candidates.resize(N: SameValuesCounter.size());
17357 transform(Range&: SameValuesCounter, d_first: Candidates.begin(),
17358 F: [](const auto &P) { return P.first; });
17359 NumReducedVals = Candidates.size();
17360 // Have a reduction of the same element.
17361 if (NumReducedVals == 1) {
17362 Value *OrigV = TrackedToOrig.find(Val: Candidates.front())->second;
17363 unsigned Cnt = SameValuesCounter.lookup(Key: OrigV);
17364 Value *RedVal =
17365 emitScaleForReusedOps(VectorizedValue: Candidates.front(), Builder, Cnt);
17366 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17367 VectorizedVals.try_emplace(Key: OrigV, Args&: Cnt);
17368 continue;
17369 }
17370 }
17371
17372 unsigned MaxVecRegSize = V.getMaxVecRegSize();
17373 unsigned EltSize = V.getVectorElementSize(V: Candidates[0]);
17374 unsigned MaxElts =
17375 RegMaxNumber * llvm::bit_floor(Value: MaxVecRegSize / EltSize);
17376
17377 unsigned ReduxWidth = std::min<unsigned>(
17378 a: llvm::bit_floor(Value: NumReducedVals),
17379 b: std::clamp<unsigned>(val: MaxElts, lo: RedValsMaxNumber,
17380 hi: RegMaxNumber * RedValsMaxNumber));
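      // ReduxWidth is the widest power-of-two slice of the candidates that
      // still fits the (clamped) register budget; on the failure path,
      // AdjustReducedVals halves it once every starting position at the
      // current width has been tried.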
17381 unsigned Start = 0;
17382 unsigned Pos = Start;
17383 // Restarts vectorization attempt with lower vector factor.
17384 unsigned PrevReduxWidth = ReduxWidth;
17385 bool CheckForReusedReductionOpsLocal = false;
17386 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17387 &CheckForReusedReductionOpsLocal,
17388 &PrevReduxWidth, &V,
17389 &IgnoreList](bool IgnoreVL = false) {
17390 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(Vals: IgnoreList);
17391 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
          // Check if any of the reduction ops are gathered. If so, it is
          // worth trying again with a smaller number of reduction ops.
17394 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17395 }
17396 ++Pos;
17397 if (Pos < NumReducedVals - ReduxWidth + 1)
17398 return IsAnyRedOpGathered;
17399 Pos = Start;
17400 ReduxWidth /= 2;
17401 return IsAnyRedOpGathered;
17402 };
17403 bool AnyVectorized = false;
17404 while (Pos < NumReducedVals - ReduxWidth + 1 &&
17405 ReduxWidth >= ReductionLimit) {
17406 // Dependency in tree of the reduction ops - drop this attempt, try
17407 // later.
17408 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17409 Start == 0) {
17410 CheckForReusedReductionOps = true;
17411 break;
17412 }
17413 PrevReduxWidth = ReduxWidth;
17414 ArrayRef<Value *> VL(std::next(x: Candidates.begin(), n: Pos), ReduxWidth);
        // Already being analyzed - skip.
17416 if (V.areAnalyzedReductionVals(VL)) {
17417 (void)AdjustReducedVals(/*IgnoreVL=*/true);
17418 continue;
17419 }
17420 // Early exit if any of the reduction values were deleted during
17421 // previous vectorization attempts.
17422 if (any_of(Range&: VL, P: [&V](Value *RedVal) {
17423 auto *RedValI = dyn_cast<Instruction>(Val: RedVal);
17424 if (!RedValI)
17425 return false;
17426 return V.isDeleted(I: RedValI);
17427 }))
17428 break;
17429 V.buildTree(Roots: VL, UserIgnoreLst: IgnoreList);
17430 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
17431 if (!AdjustReducedVals())
17432 V.analyzedReductionVals(VL);
17433 continue;
17434 }
17435 if (V.isLoadCombineReductionCandidate(RdxKind)) {
17436 if (!AdjustReducedVals())
17437 V.analyzedReductionVals(VL);
17438 continue;
17439 }
17440 V.reorderTopToBottom();
17441 // No need to reorder the root node at all.
17442 V.reorderBottomToTop(/*IgnoreReorder=*/true);
        // Keep the other extracted reduction values if they are used in the
        // vectorization trees.
17445 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
17446 ExternallyUsedValues);
17447 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
17448 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
17449 continue;
17450 for (Value *V : ReducedVals[Cnt])
17451 if (isa<Instruction>(Val: V))
17452 LocalExternallyUsedValues[TrackedVals[V]];
17453 }
17454 if (!IsSupportedHorRdxIdentityOp) {
17455 // Number of uses of the candidates in the vector of values.
17456 assert(SameValuesCounter.empty() &&
17457 "Reused values counter map is not empty");
17458 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17459 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17460 continue;
17461 Value *V = Candidates[Cnt];
17462 Value *OrigV = TrackedToOrig.find(Val: V)->second;
17463 ++SameValuesCounter[OrigV];
17464 }
17465 }
17466 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
17467 // Gather externally used values.
17468 SmallPtrSet<Value *, 4> Visited;
17469 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17470 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17471 continue;
17472 Value *RdxVal = Candidates[Cnt];
17473 if (!Visited.insert(Ptr: RdxVal).second)
17474 continue;
17475 // Check if the scalar was vectorized as part of the vectorization
17476 // tree but not the top node.
17477 if (!VLScalars.contains(Ptr: RdxVal) && V.isVectorized(V: RdxVal)) {
17478 LocalExternallyUsedValues[RdxVal];
17479 continue;
17480 }
17481 Value *OrigV = TrackedToOrig.find(Val: RdxVal)->second;
17482 unsigned NumOps =
17483 VectorizedVals.lookup(Val: RdxVal) + SameValuesCounter[OrigV];
17484 if (NumOps != ReducedValsToOps.find(Val: OrigV)->second.size())
17485 LocalExternallyUsedValues[RdxVal];
17486 }
17487 // Do not need the list of reused scalars in regular mode anymore.
17488 if (!IsSupportedHorRdxIdentityOp)
17489 SameValuesCounter.clear();
17490 for (Value *RdxVal : VL)
17491 if (RequiredExtract.contains(Ptr: RdxVal))
17492 LocalExternallyUsedValues[RdxVal];
17493 // Update LocalExternallyUsedValues for the scalar, replaced by
17494 // extractelement instructions.
17495 DenseMap<Value *, Value *> ReplacementToExternal;
17496 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
17497 ReplacementToExternal.try_emplace(Key: Pair.second, Args: Pair.first);
17498 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
17499 Value *Ext = Pair.first;
17500 auto RIt = ReplacementToExternal.find(Val: Ext);
17501 while (RIt != ReplacementToExternal.end()) {
17502 Ext = RIt->second;
17503 RIt = ReplacementToExternal.find(Val: Ext);
17504 }
17505 auto *It = ExternallyUsedValues.find(Key: Ext);
17506 if (It == ExternallyUsedValues.end())
17507 continue;
17508 LocalExternallyUsedValues[Pair.second].append(RHS: It->second);
17509 }
17510 V.buildExternalUses(ExternallyUsedValues: LocalExternallyUsedValues);
17511
17512 V.computeMinimumValueSizes();
17513 V.transformNodes();
17514
17515 // Estimate cost.
17516 InstructionCost TreeCost = V.getTreeCost(VectorizedVals: VL);
17517 InstructionCost ReductionCost =
17518 getReductionCost(TTI, ReducedVals: VL, IsCmpSelMinMax, ReduxWidth, FMF: RdxFMF);
17519 InstructionCost Cost = TreeCost + ReductionCost;
17520 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
17521 << " for reduction\n");
17522 if (!Cost.isValid())
17523 break;
17524 if (Cost >= -SLPCostThreshold) {
17525 V.getORE()->emit(RemarkBuilder: [&]() {
17526 return OptimizationRemarkMissed(
17527 SV_NAME, "HorSLPNotBeneficial",
17528 ReducedValsToOps.find(Val: VL[0])->second.front())
17529 << "Vectorizing horizontal reduction is possible "
17530 << "but not beneficial with cost " << ore::NV("Cost", Cost)
17531 << " and threshold "
17532 << ore::NV("Threshold", -SLPCostThreshold);
17533 });
17534 if (!AdjustReducedVals())
17535 V.analyzedReductionVals(VL);
17536 continue;
17537 }
17538
17539 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
17540 << Cost << ". (HorRdx)\n");
17541 V.getORE()->emit(RemarkBuilder: [&]() {
17542 return OptimizationRemark(
17543 SV_NAME, "VectorizedHorizontalReduction",
17544 ReducedValsToOps.find(Val: VL[0])->second.front())
17545 << "Vectorized horizontal reduction with cost "
17546 << ore::NV("Cost", Cost) << " and with tree size "
17547 << ore::NV("TreeSize", V.getTreeSize());
17548 });
17549
17550 Builder.setFastMathFlags(RdxFMF);
17551
17552 // Emit a reduction. If the root is a select (min/max idiom), the insert
17553 // point is the compare condition of that select.
17554 Instruction *RdxRootInst = cast<Instruction>(Val&: ReductionRoot);
17555 Instruction *InsertPt = RdxRootInst;
17556 if (IsCmpSelMinMax)
17557 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17558
17559 // Vectorize a tree.
17560 Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues: LocalExternallyUsedValues,
17561 ReplacedExternals, ReductionRoot: InsertPt);
17562
17563 Builder.SetInsertPoint(InsertPt);
17564
17565 // To prevent poison from leaking across what used to be sequential,
17566 // safe, scalar boolean logic operations, the reduction operand must be
17567 // frozen.
17568 if ((isBoolLogicOp(I: RdxRootInst) ||
17569 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17570 !isGuaranteedNotToBePoison(V: VectorizedRoot))
17571 VectorizedRoot = Builder.CreateFreeze(V: VectorizedRoot);
17572
17573 // Emit code to correctly handle reused reduced values, if required.
17574 if (OptReusedScalars && !SameScaleFactor) {
17575 VectorizedRoot = emitReusedOps(VectorizedValue: VectorizedRoot, Builder, R&: V,
17576 SameValuesCounter, TrackedToOrig);
17577 }
17578
17579 Value *ReducedSubTree =
17580 emitReduction(VectorizedValue: VectorizedRoot, Builder, ReduxWidth, TTI);
17581 if (ReducedSubTree->getType() != VL.front()->getType()) {
17582 assert(ReducedSubTree->getType() != VL.front()->getType() &&
17583 "Expected different reduction type.");
17584 ReducedSubTree =
17585 Builder.CreateIntCast(V: ReducedSubTree, DestTy: VL.front()->getType(),
17586 isSigned: V.isSignedMinBitwidthRootNode());
17587 }
17588
17589 // Improved analysis for add/fadd/xor reductions with same scale factor
17590 // for all operands of reductions. We can emit scalar ops for them
17591 // instead.
17592 if (OptReusedScalars && SameScaleFactor)
17593 ReducedSubTree = emitScaleForReusedOps(
17594 VectorizedValue: ReducedSubTree, Builder, Cnt: SameValuesCounter.front().second);
17595
17596 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17597 // Count vectorized reduced values to exclude them from final reduction.
17598 for (Value *RdxVal : VL) {
17599 Value *OrigV = TrackedToOrig.find(Val: RdxVal)->second;
17600 if (IsSupportedHorRdxIdentityOp) {
17601 VectorizedVals.try_emplace(Key: OrigV, Args&: SameValuesCounter[RdxVal]);
17602 continue;
17603 }
17604 ++VectorizedVals.try_emplace(Key: OrigV, Args: 0).first->getSecond();
17605 if (!V.isVectorized(V: RdxVal))
17606 RequiredExtract.insert(Ptr: RdxVal);
17607 }
17608 Pos += ReduxWidth;
17609 Start = Pos;
17610 ReduxWidth = llvm::bit_floor(Value: NumReducedVals - Pos);
17611 AnyVectorized = true;
17612 }
17613 if (OptReusedScalars && !AnyVectorized) {
17614 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
17615 Value *RedVal = emitScaleForReusedOps(VectorizedValue: P.first, Builder, Cnt: P.second);
17616 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17617 Value *OrigV = TrackedToOrig.find(Val: P.first)->second;
17618 VectorizedVals.try_emplace(Key: OrigV, Args: P.second);
17619 }
17620 continue;
17621 }
17622 }
17623 if (VectorizedTree) {
17624 // Reorder operands of bool logical op in the natural order to avoid
17625 // possible problem with poison propagation. If not possible to reorder
17626 // (both operands are originally RHS), emit an extra freeze instruction
17627 // for the LHS operand.
17628 // I.e., if we have original code like this:
17629 // RedOp1 = select i1 ?, i1 LHS, i1 false
17630 // RedOp2 = select i1 RHS, i1 ?, i1 false
17631
17632 // Then, we swap LHS/RHS to create a new op that matches the poison
17633 // semantics of the original code.
17634
17635 // If we have original code like this and both values could be poison:
17636 // RedOp1 = select i1 ?, i1 LHS, i1 false
17637 // RedOp2 = select i1 ?, i1 RHS, i1 false
17638
17639 // Then, we must freeze LHS in the new op.
17640 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
17641 Instruction *RedOp1,
17642 Instruction *RedOp2,
17643 bool InitStep) {
17644 if (!AnyBoolLogicOp)
17645 return;
17646 if (isBoolLogicOp(I: RedOp1) &&
17647 ((!InitStep && LHS == VectorizedTree) ||
17648 getRdxOperand(I: RedOp1, Index: 0) == LHS || isGuaranteedNotToBePoison(V: LHS)))
17649 return;
17650 if (isBoolLogicOp(I: RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
17651 getRdxOperand(I: RedOp2, Index: 0) == RHS ||
17652 isGuaranteedNotToBePoison(V: RHS))) {
17653 std::swap(a&: LHS, b&: RHS);
17654 return;
17655 }
17656 if (LHS != VectorizedTree)
17657 LHS = Builder.CreateFreeze(V: LHS);
17658 };
      // Finish the reduction.
      // The extra arguments and the possible reduction values that were not
      // vectorized still need to be added.
      // Try to avoid dependencies between the scalar remainders after the
      // reductions.
17664 auto FinalGen =
17665 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
17666 bool InitStep) {
17667 unsigned Sz = InstVals.size();
17668 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
17669 Sz % 2);
17670 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
17671 Instruction *RedOp = InstVals[I + 1].first;
17672 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
17673 Value *RdxVal1 = InstVals[I].second;
17674 Value *StableRdxVal1 = RdxVal1;
17675 auto It1 = TrackedVals.find(Val: RdxVal1);
17676 if (It1 != TrackedVals.end())
17677 StableRdxVal1 = It1->second;
17678 Value *RdxVal2 = InstVals[I + 1].second;
17679 Value *StableRdxVal2 = RdxVal2;
17680 auto It2 = TrackedVals.find(Val: RdxVal2);
17681 if (It2 != TrackedVals.end())
17682 StableRdxVal2 = It2->second;
17683 // To prevent poison from leaking across what used to be
17684 // sequential, safe, scalar boolean logic operations, the
17685 // reduction operand must be frozen.
17686 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
17687 RedOp, InitStep);
17688 Value *ExtraRed = createOp(Builder, RdxKind, LHS: StableRdxVal1,
17689 RHS: StableRdxVal2, Name: "op.rdx", ReductionOps);
17690 ExtraReds[I / 2] = std::make_pair(x: InstVals[I].first, y&: ExtraRed);
17691 }
17692 if (Sz % 2 == 1)
17693 ExtraReds[Sz / 2] = InstVals.back();
17694 return ExtraReds;
17695 };
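      // FinalGen halves the worklist on every round, so the leftover scalar
      // reductions form a balanced tree, e.g. ((a op b) op (c op d)) rather
      // than (((a op b) op c) op d), which shortens the dependency chains
      // mentioned above.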
17696 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
17697 ExtraReductions.emplace_back(Args: cast<Instruction>(Val&: ReductionRoot),
17698 Args&: VectorizedTree);
17699 SmallPtrSet<Value *, 8> Visited;
17700 for (ArrayRef<Value *> Candidates : ReducedVals) {
17701 for (Value *RdxVal : Candidates) {
17702 if (!Visited.insert(Ptr: RdxVal).second)
17703 continue;
17704 unsigned NumOps = VectorizedVals.lookup(Val: RdxVal);
17705 for (Instruction *RedOp :
17706 ArrayRef(ReducedValsToOps.find(Val: RdxVal)->second)
17707 .drop_back(N: NumOps))
17708 ExtraReductions.emplace_back(Args&: RedOp, Args&: RdxVal);
17709 }
17710 }
17711 for (auto &Pair : ExternallyUsedValues) {
17712 // Add each externally used value to the final reduction.
17713 for (auto *I : Pair.second)
17714 ExtraReductions.emplace_back(Args&: I, Args&: Pair.first);
17715 }
17716 // Iterate through all not-vectorized reduction values/extra arguments.
17717 bool InitStep = true;
17718 while (ExtraReductions.size() > 1) {
17719 SmallVector<std::pair<Instruction *, Value *>> NewReds =
17720 FinalGen(ExtraReductions, InitStep);
17721 ExtraReductions.swap(RHS&: NewReds);
17722 InitStep = false;
17723 }
17724 VectorizedTree = ExtraReductions.front().second;
17725
17726 ReductionRoot->replaceAllUsesWith(V: VectorizedTree);
17727
      // The original scalar reduction is expected to have no remaining
      // uses outside the reduction tree itself. Assert that we got this
      // correct, replace internal uses with poison, and mark for eventual
      // deletion.
17732#ifndef NDEBUG
17733 SmallSet<Value *, 4> IgnoreSet;
17734 for (ArrayRef<Value *> RdxOps : ReductionOps)
17735 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
17736#endif
17737 for (ArrayRef<Value *> RdxOps : ReductionOps) {
17738 for (Value *Ignore : RdxOps) {
17739 if (!Ignore)
17740 continue;
17741#ifndef NDEBUG
17742 for (auto *U : Ignore->users()) {
17743 assert(IgnoreSet.count(U) &&
                 "All users must be in the reduction ops list.");
17745 }
17746#endif
17747 if (!Ignore->use_empty()) {
17748 Value *P = PoisonValue::get(T: Ignore->getType());
17749 Ignore->replaceAllUsesWith(V: P);
17750 }
17751 }
17752 V.removeInstructionsAndOperands(DeadVals: RdxOps);
17753 }
17754 } else if (!CheckForReusedReductionOps) {
17755 for (ReductionOpsType &RdxOps : ReductionOps)
17756 for (Value *RdxOp : RdxOps)
17757 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
17758 }
17759 return VectorizedTree;
17760 }
17761
17762private:
17763 /// Calculate the cost of a reduction.
17764 InstructionCost getReductionCost(TargetTransformInfo *TTI,
17765 ArrayRef<Value *> ReducedVals,
17766 bool IsCmpSelMinMax, unsigned ReduxWidth,
17767 FastMathFlags FMF) {
17768 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17769 Type *ScalarTy = ReducedVals.front()->getType();
17770 FixedVectorType *VectorTy = getWidenedType(ScalarTy, VF: ReduxWidth);
17771 InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at compile time.
17774 bool AllConsts = allConstant(VL: ReducedVals);
17775 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
17776 InstructionCost Cost = 0;
17777 // Scalar cost is repeated for N-1 elements.
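      // E.g. (illustrative) for a 4-element integer add reduction
      // a + b + c + d, the scalar form needs 3 add instructions.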
17778 int Cnt = ReducedVals.size();
17779 for (Value *RdxVal : ReducedVals) {
17780 if (Cnt == 1)
17781 break;
17782 --Cnt;
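        // A cmp+select min/max reduction uses each reduced value in both the
        // compare and the select, hence the higher use threshold below.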
17783 if (RdxVal->hasNUsesOrMore(N: IsCmpSelMinMax ? 3 : 2)) {
17784 Cost += GenCostFn();
17785 continue;
17786 }
17787 InstructionCost ScalarCost = 0;
17788 for (User *U : RdxVal->users()) {
17789 auto *RdxOp = cast<Instruction>(Val: U);
17790 if (hasRequiredNumberOfUses(IsCmpSelMinMax, I: RdxOp)) {
17791 ScalarCost += TTI->getInstructionCost(U: RdxOp, CostKind);
17792 continue;
17793 }
17794 ScalarCost = InstructionCost::getInvalid();
17795 break;
17796 }
17797 if (ScalarCost.isValid())
17798 Cost += ScalarCost;
17799 else
17800 Cost += GenCostFn();
17801 }
17802 return Cost;
17803 };
17804 switch (RdxKind) {
17805 case RecurKind::Add:
17806 case RecurKind::Mul:
17807 case RecurKind::Or:
17808 case RecurKind::And:
17809 case RecurKind::Xor:
17810 case RecurKind::FAdd:
17811 case RecurKind::FMul: {
17812 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
17813 if (!AllConsts)
17814 VectorCost =
17815 TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VectorTy, FMF, CostKind);
17816 ScalarCost = EvaluateScalarCost([&]() {
17817 return TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: ScalarTy, CostKind);
17818 });
17819 break;
17820 }
17821 case RecurKind::FMax:
17822 case RecurKind::FMin:
17823 case RecurKind::FMaximum:
17824 case RecurKind::FMinimum:
17825 case RecurKind::SMax:
17826 case RecurKind::SMin:
17827 case RecurKind::UMax:
17828 case RecurKind::UMin: {
17829 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
17830 if (!AllConsts)
17831 VectorCost = TTI->getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF, CostKind);
17832 ScalarCost = EvaluateScalarCost([&]() {
17833 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
17834 return TTI->getIntrinsicInstrCost(ICA, CostKind);
17835 });
17836 break;
17837 }
17838 default:
17839 llvm_unreachable("Expected arithmetic or min/max reduction operation");
17840 }
17841
17842 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
17843 << " for reduction of " << shortBundleName(ReducedVals)
17844 << " (It is a splitting reduction)\n");
17845 return VectorCost - ScalarCost;
17846 }
17847
17848 /// Emit a horizontal reduction of the vectorized value.
17849 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
17850 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
17851 assert(VectorizedValue && "Need to have a vectorized tree node");
17852 assert(isPowerOf2_32(ReduxWidth) &&
17853 "We only handle power-of-two reductions for now");
17854 assert(RdxKind != RecurKind::FMulAdd &&
17855 "A call to the llvm.fmuladd intrinsic is not handled yet");
17856
17857 ++NumVectorInstructions;
17858 return createSimpleTargetReduction(B&: Builder, Src: VectorizedValue, RdxKind);
17859 }
17860
17861 /// Emits optimized code for unique scalar value reused \p Cnt times.
17862 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17863 unsigned Cnt) {
17864 assert(IsSupportedHorRdxIdentityOp &&
17865 "The optimization of matched scalar identity horizontal reductions "
17866 "must be supported.");
17867 switch (RdxKind) {
17868 case RecurKind::Add: {
17869 // res = mul vv, n
17870 Value *Scale = ConstantInt::get(Ty: VectorizedValue->getType(), V: Cnt);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
                        << VectorizedValue << ". (HorRdx)\n");
17873 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
17874 }
17875 case RecurKind::Xor: {
17876 // res = n % 2 ? 0 : vv
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
                        << ". (HorRdx)\n");
17879 if (Cnt % 2 == 0)
17880 return Constant::getNullValue(Ty: VectorizedValue->getType());
17881 return VectorizedValue;
17882 }
17883 case RecurKind::FAdd: {
17884 // res = fmul v, n
17885 Value *Scale = ConstantFP::get(Ty: VectorizedValue->getType(), V: Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
                        << VectorizedValue << ". (HorRdx)\n");
17888 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
17889 }
17890 case RecurKind::And:
17891 case RecurKind::Or:
17892 case RecurKind::SMax:
17893 case RecurKind::SMin:
17894 case RecurKind::UMax:
17895 case RecurKind::UMin:
17896 case RecurKind::FMax:
17897 case RecurKind::FMin:
17898 case RecurKind::FMaximum:
17899 case RecurKind::FMinimum:
17900 // res = vv
17901 return VectorizedValue;
17902 case RecurKind::Mul:
17903 case RecurKind::FMul:
17904 case RecurKind::FMulAdd:
17905 case RecurKind::IAnyOf:
17906 case RecurKind::FAnyOf:
17907 case RecurKind::None:
17908 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
17909 }
17910 return nullptr;
17911 }
17912
17913 /// Emits actual operation for the scalar identity values, found during
17914 /// horizontal reduction analysis.
17915 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17916 BoUpSLP &R,
17917 const MapVector<Value *, unsigned> &SameValuesCounter,
17918 const DenseMap<Value *, Value *> &TrackedToOrig) {
17919 assert(IsSupportedHorRdxIdentityOp &&
17920 "The optimization of matched scalar identity horizontal reductions "
17921 "must be supported.");
17922 ArrayRef<Value *> VL = R.getRootNodeScalars();
17923 auto *VTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
17924 if (VTy->getElementType() != VL.front()->getType()) {
17925 VectorizedValue = Builder.CreateIntCast(
17926 V: VectorizedValue,
17927 DestTy: getWidenedType(ScalarTy: VL.front()->getType(), VF: VTy->getNumElements()),
17928 isSigned: R.isSignedMinBitwidthRootNode());
17929 }
17930 switch (RdxKind) {
17931 case RecurKind::Add: {
17932 // root = mul prev_root, <1, 1, n, 1>
17933 SmallVector<Constant *> Vals;
17934 for (Value *V : VL) {
17935 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.find(Val: V)->second);
17936 Vals.push_back(Elt: ConstantInt::get(Ty: V->getType(), V: Cnt, /*IsSigned=*/false));
17937 }
17938 auto *Scale = ConstantVector::get(V: Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
                        << VectorizedValue << ". (HorRdx)\n");
17941 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
17942 }
17943 case RecurKind::And:
17944 case RecurKind::Or:
17945 // No need for multiple or/and(s).
17946 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
17947 << ". (HorRdx)\n");
17948 return VectorizedValue;
17949 case RecurKind::SMax:
17950 case RecurKind::SMin:
17951 case RecurKind::UMax:
17952 case RecurKind::UMin:
17953 case RecurKind::FMax:
17954 case RecurKind::FMin:
17955 case RecurKind::FMaximum:
17956 case RecurKind::FMinimum:
17957 // No need for multiple min/max(s) of the same value.
17958 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
17959 << ". (HorRdx)\n");
17960 return VectorizedValue;
17961 case RecurKind::Xor: {
17962 // Replace values with even number of repeats with 0, since
17963 // x xor x = 0.
      // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
      // 7>, if the 4th and 6th elements have an even number of repeats.
17966 SmallVector<int> Mask(
17967 cast<FixedVectorType>(Val: VectorizedValue->getType())->getNumElements(),
17968 PoisonMaskElem);
17969 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
17970 bool NeedShuffle = false;
17971 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17972 Value *V = VL[I];
17973 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.find(Val: V)->second);
17974 if (Cnt % 2 == 0) {
17975 Mask[I] = VF;
17976 NeedShuffle = true;
17977 }
17978 }
      LLVM_DEBUG(dbgs() << "SLP: Xor <";
                 for (int I : Mask) dbgs() << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17983 if (NeedShuffle)
17984 VectorizedValue = Builder.CreateShuffleVector(
17985 V1: VectorizedValue,
17986 V2: ConstantVector::getNullValue(Ty: VectorizedValue->getType()), Mask);
17987 return VectorizedValue;
17988 }
17989 case RecurKind::FAdd: {
17990 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
17991 SmallVector<Constant *> Vals;
17992 for (Value *V : VL) {
17993 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.find(Val: V)->second);
17994 Vals.push_back(Elt: ConstantFP::get(Ty: V->getType(), V: Cnt));
17995 }
17996 auto *Scale = ConstantVector::get(V: Vals);
17997 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
17998 }
17999 case RecurKind::Mul:
18000 case RecurKind::FMul:
18001 case RecurKind::FMulAdd:
18002 case RecurKind::IAnyOf:
18003 case RecurKind::FAnyOf:
18004 case RecurKind::None:
18005 llvm_unreachable("Unexpected reduction kind for reused scalars.");
18006 }
18007 return nullptr;
18008 }
18009};
18010} // end anonymous namespace
18011
18012/// Gets recurrence kind from the specified value.
18013static RecurKind getRdxKind(Value *V) {
18014 return HorizontalReduction::getRdxKind(V);
18015}
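
/// Returns the total number of scalar elements in the homogeneous aggregate
/// (nested structs/arrays/fixed vectors of a single element type) built by
/// \p InsertInst, or std::nullopt if the aggregate is not homogeneous.
/// Illustrative example: {[2 x <2 x float>], [2 x <2 x float>]} yields
/// 2 * 2 * 2 = 8.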
18016static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
18017 if (auto *IE = dyn_cast<InsertElementInst>(Val: InsertInst))
18018 return cast<FixedVectorType>(Val: IE->getType())->getNumElements();
18019
18020 unsigned AggregateSize = 1;
18021 auto *IV = cast<InsertValueInst>(Val: InsertInst);
18022 Type *CurrentType = IV->getType();
18023 do {
18024 if (auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
18025 for (auto *Elt : ST->elements())
18026 if (Elt != ST->getElementType(N: 0)) // check homogeneity
18027 return std::nullopt;
18028 AggregateSize *= ST->getNumElements();
18029 CurrentType = ST->getElementType(N: 0);
18030 } else if (auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
18031 AggregateSize *= AT->getNumElements();
18032 CurrentType = AT->getElementType();
18033 } else if (auto *VT = dyn_cast<FixedVectorType>(Val: CurrentType)) {
18034 AggregateSize *= VT->getNumElements();
18035 return AggregateSize;
18036 } else if (CurrentType->isSingleValueType()) {
18037 return AggregateSize;
18038 } else {
18039 return std::nullopt;
18040 }
18041 } while (true);
18042}
18043
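/// Walks the chain of insertelement/insertvalue instructions starting at
/// \p LastInsertInst towards its aggregate operand, recording each inserted
/// scalar operand and the corresponding insert instruction at the aggregate
/// index it targets (offset by \p OperandOffset), recursing into nested
/// inserts. The walk stops at the first operand that is not a single-use
/// insertelement/insertvalue instruction.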
18044static void findBuildAggregate_rec(Instruction *LastInsertInst,
18045 TargetTransformInfo *TTI,
18046 SmallVectorImpl<Value *> &BuildVectorOpds,
18047 SmallVectorImpl<Value *> &InsertElts,
18048 unsigned OperandOffset) {
18049 do {
18050 Value *InsertedOperand = LastInsertInst->getOperand(i: 1);
18051 std::optional<unsigned> OperandIndex =
18052 getElementIndex(Inst: LastInsertInst, Offset: OperandOffset);
18053 if (!OperandIndex)
18054 return;
18055 if (isa<InsertElementInst, InsertValueInst>(Val: InsertedOperand)) {
18056 findBuildAggregate_rec(LastInsertInst: cast<Instruction>(Val: InsertedOperand), TTI,
18057 BuildVectorOpds, InsertElts, OperandOffset: *OperandIndex);
18058
18059 } else {
18060 BuildVectorOpds[*OperandIndex] = InsertedOperand;
18061 InsertElts[*OperandIndex] = LastInsertInst;
18062 }
18063 LastInsertInst = dyn_cast<Instruction>(Val: LastInsertInst->getOperand(i: 0));
18064 } while (LastInsertInst != nullptr &&
18065 isa<InsertValueInst, InsertElementInst>(Val: LastInsertInst) &&
18066 LastInsertInst->hasOneUse());
18067}
18068
18069/// Recognize construction of vectors like
18070/// %ra = insertelement <4 x float> poison, float %s0, i32 0
18071/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
18072/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
18073/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
18074/// starting from the last insertelement or insertvalue instruction.
18075///
18076/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
18077/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
18078/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
18079///
18080/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
18081///
18082/// \return true if it matches.
18083static bool findBuildAggregate(Instruction *LastInsertInst,
18084 TargetTransformInfo *TTI,
18085 SmallVectorImpl<Value *> &BuildVectorOpds,
18086 SmallVectorImpl<Value *> &InsertElts) {
18087
18088 assert((isa<InsertElementInst>(LastInsertInst) ||
18089 isa<InsertValueInst>(LastInsertInst)) &&
18090 "Expected insertelement or insertvalue instruction!");
18091
18092 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
18093 "Expected empty result vectors!");
18094
18095 std::optional<unsigned> AggregateSize = getAggregateSize(InsertInst: LastInsertInst);
18096 if (!AggregateSize)
18097 return false;
18098 BuildVectorOpds.resize(N: *AggregateSize);
18099 InsertElts.resize(N: *AggregateSize);
18100
18101 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, OperandOffset: 0);
18102 llvm::erase(C&: BuildVectorOpds, V: nullptr);
18103 llvm::erase(C&: InsertElts, V: nullptr);
18104 if (BuildVectorOpds.size() >= 2)
18105 return true;
18106
18107 return false;
18108}
18109
18110/// Try and get a reduction instruction from a phi node.
18111///
18112/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
18113/// if they come from either \p ParentBB or a containing loop latch.
18114///
18115/// \returns A candidate reduction value if possible, or \code nullptr \endcode
18116/// if not possible.
18117static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
18118 BasicBlock *ParentBB, LoopInfo *LI) {
18119 // There are situations where the reduction value is not dominated by the
18120 // reduction phi. Vectorizing such cases has been reported to cause
18121 // miscompiles. See PR25787.
18122 auto DominatedReduxValue = [&](Value *R) {
18123 return isa<Instruction>(Val: R) &&
18124 DT->dominates(A: P->getParent(), B: cast<Instruction>(Val: R)->getParent());
18125 };
18126
18127 Instruction *Rdx = nullptr;
18128
18129 // Return the incoming value if it comes from the same BB as the phi node.
18130 if (P->getIncomingBlock(i: 0) == ParentBB) {
18131 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
18132 } else if (P->getIncomingBlock(i: 1) == ParentBB) {
18133 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
18134 }
18135
18136 if (Rdx && DominatedReduxValue(Rdx))
18137 return Rdx;
18138
18139 // Otherwise, check whether we have a loop latch to look at.
18140 Loop *BBL = LI->getLoopFor(BB: ParentBB);
18141 if (!BBL)
18142 return nullptr;
18143 BasicBlock *BBLatch = BBL->getLoopLatch();
18144 if (!BBLatch)
18145 return nullptr;
18146
18147 // There is a loop latch, return the incoming value if it comes from
18148 // that. This reduction pattern occasionally turns up.
18149 if (P->getIncomingBlock(i: 0) == BBLatch) {
18150 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
18151 } else if (P->getIncomingBlock(i: 1) == BBLatch) {
18152 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
18153 }
18154
18155 if (Rdx && DominatedReduxValue(Rdx))
18156 return Rdx;
18157
18158 return nullptr;
18159}
18160
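/// Matches a candidate reduction operation: either a plain binary operator or
/// one of the two-operand min/max intrinsics (maxnum, minnum, maximum,
/// minimum, smax, smin, umax, umin). On success the two operands are captured
/// in \p V0 and \p V1.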
18161static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
18162 if (match(V: I, P: m_BinOp(L: m_Value(V&: V0), R: m_Value(V&: V1))))
18163 return true;
18164 if (match(V: I, P: m_Intrinsic<Intrinsic::maxnum>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18165 return true;
18166 if (match(V: I, P: m_Intrinsic<Intrinsic::minnum>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18167 return true;
18168 if (match(V: I, P: m_Intrinsic<Intrinsic::maximum>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18169 return true;
18170 if (match(V: I, P: m_Intrinsic<Intrinsic::minimum>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18171 return true;
18172 if (match(V: I, P: m_Intrinsic<Intrinsic::smax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18173 return true;
18174 if (match(V: I, P: m_Intrinsic<Intrinsic::smin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18175 return true;
18176 if (match(V: I, P: m_Intrinsic<Intrinsic::umax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18177 return true;
18178 if (match(V: I, P: m_Intrinsic<Intrinsic::umin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
18179 return true;
18180 return false;
18181}
18182
18183/// We could have an initial reduction that is not an add.
18184/// r *= v1 + v2 + v3 + v4
18185/// In such a case start looking for a tree rooted in the first '+'.
18186/// \Returns the new root if found, which may be nullptr if not an instruction.
18187static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
18188 Instruction *Root) {
18189 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
18190 isa<IntrinsicInst>(Root)) &&
18191 "Expected binop, select, or intrinsic for reduction matching");
18192 Value *LHS =
18193 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root));
18194 Value *RHS =
18195 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root) + 1);
18196 if (LHS == Phi)
18197 return dyn_cast<Instruction>(Val: RHS);
18198 if (RHS == Phi)
18199 return dyn_cast<Instruction>(Val: LHS);
18200 return nullptr;
18201}
18202
/// \returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction, returns nullptr.
18205static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
18206 Value *Op0 = nullptr;
18207 Value *Op1 = nullptr;
18208 if (!matchRdxBop(I, V0&: Op0, V1&: Op1))
18209 return nullptr;
18210 return dyn_cast<Instruction>(Val: Op0 == Phi ? Op1 : Op0);
18211}
18212
/// \Returns true if \p I is a candidate instruction for reduction
/// vectorization.
18214static bool isReductionCandidate(Instruction *I) {
18215 bool IsSelect = match(V: I, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value()));
18216 Value *B0 = nullptr, *B1 = nullptr;
18217 bool IsBinop = matchRdxBop(I, V0&: B0, V1&: B1);
18218 return IsBinop || IsSelect;
18219}
18220
18221bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    TargetTransformInfo *TTI,
18223 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
18224 if (!ShouldVectorizeHor)
18225 return false;
18226 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Val: Root);
18227
18228 if (Root->getParent() != BB || isa<PHINode>(Val: Root))
18229 return false;
18230
18231 // If we can find a secondary reduction root, use that instead.
18232 auto SelectRoot = [&]() {
18233 if (TryOperandsAsNewSeeds && isReductionCandidate(I: Root) &&
18234 HorizontalReduction::getRdxKind(V: Root) != RecurKind::None)
18235 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(Phi: P, Root))
18236 return NewRoot;
18237 return Root;
18238 };
18239
  // Start analysis from the Root instruction. If a horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction, or
  // vectorization is not possible or not effective, and the currently analyzed
  // instruction is a binary operation, try to vectorize the operands using
  // pre-order DFS traversal order. If the operands were not vectorized, repeat
  // the same procedure considering each operand as a possible root of the
  // horizontal reduction.
  // Interrupt the process if the Root instruction itself was vectorized or all
  // sub-trees no higher than RecursionMaxDepth were analyzed/vectorized.
  // If a horizontal reduction was not matched or vectorized, we collect
  // instructions for possible later attempts at vectorization.
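  // Illustrative IR sketch of a horizontal reduction matched here:
  //   %add1 = add i32 %a, %b
  //   %add2 = add i32 %add1, %c
  //   %add3 = add i32 %add2, %d   ; root
  // The values %a, %b, %c and %d become the reduced values of the tree.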
18251 std::queue<std::pair<Instruction *, unsigned>> Stack;
18252 Stack.emplace(args: SelectRoot(), args: 0);
18253 SmallPtrSet<Value *, 8> VisitedInstrs;
18254 bool Res = false;
18255 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
18256 if (R.isAnalyzedReductionRoot(I: Inst))
18257 return nullptr;
18258 if (!isReductionCandidate(I: Inst))
18259 return nullptr;
18260 HorizontalReduction HorRdx;
18261 if (!HorRdx.matchAssociativeReduction(R, Root: Inst, SE&: *SE, DL: *DL, TLI: *TLI))
18262 return nullptr;
18263 return HorRdx.tryToReduce(V&: R, DL: *DL, TTI, TLI: *TLI);
18264 };
18265 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
18266 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
18267 FutureSeed = getNonPhiOperand(I: Root, Phi: P);
18268 if (!FutureSeed)
18269 return false;
18270 }
18271 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
18272 // analysis is done separately.
18273 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Val: FutureSeed))
18274 PostponedInsts.push_back(Elt: FutureSeed);
18275 return true;
18276 };
18277
18278 while (!Stack.empty()) {
18279 Instruction *Inst;
18280 unsigned Level;
18281 std::tie(args&: Inst, args&: Level) = Stack.front();
18282 Stack.pop();
18283 // Do not try to analyze instruction that has already been vectorized.
18284 // This may happen when we vectorize instruction operands on a previous
18285 // iteration while stack was populated before that happened.
18286 if (R.isDeleted(I: Inst))
18287 continue;
18288 if (Value *VectorizedV = TryToReduce(Inst)) {
18289 Res = true;
18290 if (auto *I = dyn_cast<Instruction>(Val: VectorizedV)) {
18291 // Try to find another reduction.
18292 Stack.emplace(args&: I, args&: Level);
18293 continue;
18294 }
18295 if (R.isDeleted(I: Inst))
18296 continue;
18297 } else {
18298 // We could not vectorize `Inst` so try to use it as a future seed.
18299 if (!TryAppendToPostponedInsts(Inst)) {
18300 assert(Stack.empty() && "Expected empty stack");
18301 break;
18302 }
18303 }
18304
18305 // Try to vectorize operands.
18306 // Continue analysis for the instruction from the same basic block only to
18307 // save compile time.
18308 if (++Level < RecursionMaxDepth)
18309 for (auto *Op : Inst->operand_values())
18310 if (VisitedInstrs.insert(Ptr: Op).second)
18311 if (auto *I = dyn_cast<Instruction>(Val: Op))
18312 // Do not try to vectorize CmpInst operands, this is done
18313 // separately.
18314 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(Val: I) &&
18315 !R.isDeleted(I) && I->getParent() == BB)
18316 Stack.emplace(args&: I, args&: Level);
18317 }
18318 return Res;
18319}
18320
18321bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
18322 BasicBlock *BB, BoUpSLP &R,
18323 TargetTransformInfo *TTI) {
18324 SmallVector<WeakTrackingVH> PostponedInsts;
18325 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
18326 Res |= tryToVectorize(Insts: PostponedInsts, R);
18327 return Res;
18328}
18329
18330bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
18331 BoUpSLP &R) {
18332 bool Res = false;
18333 for (Value *V : Insts)
18334 if (auto *Inst = dyn_cast<Instruction>(Val: V); Inst && !R.isDeleted(I: Inst))
18335 Res |= tryToVectorize(I: Inst, R);
18336 return Res;
18337}
18338
18339bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
18340 BasicBlock *BB, BoUpSLP &R,
18341 bool MaxVFOnly) {
18342 if (!R.canMapToVector(T: IVI->getType()))
18343 return false;
18344
18345 SmallVector<Value *, 16> BuildVectorOpds;
18346 SmallVector<Value *, 16> BuildVectorInsts;
18347 if (!findBuildAggregate(LastInsertInst: IVI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts))
18348 return false;
18349
18350 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
18351 R.getORE()->emit(RemarkBuilder: [&]() {
18352 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
18353 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
18354 "trying reduction first.";
18355 });
18356 return false;
18357 }
18358 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in a vector register.
18360 return tryToVectorizeList(VL: BuildVectorOpds, R, MaxVFOnly);
18361}
18362
18363bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
18364 BasicBlock *BB, BoUpSLP &R,
18365 bool MaxVFOnly) {
18366 SmallVector<Value *, 16> BuildVectorInsts;
18367 SmallVector<Value *, 16> BuildVectorOpds;
18368 SmallVector<int> Mask;
18369 if (!findBuildAggregate(LastInsertInst: IEI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts) ||
18370 (llvm::all_of(Range&: BuildVectorOpds, P: IsaPred<ExtractElementInst, UndefValue>) &&
18371 isFixedVectorShuffle(VL: BuildVectorOpds, Mask)))
18372 return false;
18373
18374 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
18375 R.getORE()->emit(RemarkBuilder: [&]() {
18376 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
18377 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
18378 "trying reduction first.";
18379 });
18380 return false;
18381 }
18382 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
18383 return tryToVectorizeList(VL: BuildVectorInsts, R, MaxVFOnly);
18384}
18385
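/// Generic driver that sorts \p Incoming with \p Comparator, groups adjacent
/// elements that \p AreCompatible considers compatible, and hands each group
/// of two or more elements to \p TryToVectorizeHelper. Used below for PHIs,
/// compares and stores.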
18386template <typename T>
18387static bool tryToVectorizeSequence(
18388 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
18389 function_ref<bool(T *, T *)> AreCompatible,
18390 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
18391 bool MaxVFOnly, BoUpSLP &R) {
18392 bool Changed = false;
18393 // Sort by type, parent, operands.
18394 stable_sort(Incoming, Comparator);
18395
  // Try to vectorize elements based on their type.
18397 SmallVector<T *> Candidates;
18398 SmallVector<T *> VL;
18399 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
18400 VL.clear()) {
18401 // Look for the next elements with the same type, parent and operand
18402 // kinds.
18403 auto *I = dyn_cast<Instruction>(*IncIt);
18404 if (!I || R.isDeleted(I)) {
18405 ++IncIt;
18406 continue;
18407 }
18408 auto *SameTypeIt = IncIt;
18409 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
18410 R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
18411 AreCompatible(*SameTypeIt, *IncIt))) {
18412 auto *I = dyn_cast<Instruction>(*SameTypeIt);
18413 ++SameTypeIt;
18414 if (I && !R.isDeleted(I))
18415 VL.push_back(cast<T>(I));
18416 }
18417
18418 // Try to vectorize them.
18419 unsigned NumElts = VL.size();
18420 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
18421 << NumElts << ")\n");
    // The vectorization is a 3-state attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with
    //    the size of the maximal register at first.
    // 2. Try to vectorize the remaining instructions with the same type, if
    //    possible. This may give better results than vectorizing only
    //    instructions with the same/alternate opcodes.
    // 3. Make a final attempt to vectorize all instructions with the
    //    same/alternate ops only; this may result in some extra final
    //    vectorization.
18431 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success. Start over because instructions might have been changed.
18433 Changed = true;
18434 VL.swap(Candidates);
18435 Candidates.clear();
18436 for (T *V : VL) {
18437 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18438 Candidates.push_back(V);
18439 }
18440 } else {
18441 /// \Returns the minimum number of elements that we will attempt to
18442 /// vectorize.
18443 auto GetMinNumElements = [&R](Value *V) {
18444 unsigned EltSize = R.getVectorElementSize(V);
18445 return std::max(a: 2U, b: R.getMaxVecRegSize() / EltSize);
18446 };
18447 if (NumElts < GetMinNumElements(*IncIt) &&
18448 (Candidates.empty() ||
18449 Candidates.front()->getType() == (*IncIt)->getType())) {
18450 for (T *V : VL) {
18451 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18452 Candidates.push_back(V);
18453 }
18454 }
18455 }
18456 // Final attempt to vectorize instructions with the same types.
18457 if (Candidates.size() > 1 &&
18458 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18459 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success. Start over because instructions might have been changed.
18461 Changed = true;
18462 } else if (MaxVFOnly) {
18463 // Try to vectorize using small vectors.
18464 SmallVector<T *> VL;
18465 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
18466 VL.clear()) {
18467 auto *I = dyn_cast<Instruction>(*It);
18468 if (!I || R.isDeleted(I)) {
18469 ++It;
18470 continue;
18471 }
18472 auto *SameTypeIt = It;
18473 while (SameTypeIt != End &&
18474 (!isa<Instruction>(*SameTypeIt) ||
18475 R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
18476 AreCompatible(*SameTypeIt, *It))) {
18477 auto *I = dyn_cast<Instruction>(*SameTypeIt);
18478 ++SameTypeIt;
18479 if (I && !R.isDeleted(I))
18480 VL.push_back(cast<T>(I));
18481 }
18482 unsigned NumElts = VL.size();
18483 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
18484 /*MaxVFOnly=*/false))
18485 Changed = true;
18486 It = SameTypeIt;
18487 }
18488 }
18489 Candidates.clear();
18490 }
18491
18492 // Start over at the next instruction of a different type (or the end).
18493 IncIt = SameTypeIt;
18494 }
18495 return Changed;
18496}
18497
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps have the same or swapped predicates and
/// compatible corresponding operands. If IsCompatibility is false, the
/// function implements a strict weak ordering relation between the two cmp
/// instructions, returning true if the first instruction is "less" than the
/// second, i.e. its predicate is less than the predicate of the second or its
/// operand IDs are less than the operand IDs of the second cmp instruction.
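/// Illustrative example: 'icmp sgt %a, %b' and 'icmp slt %b, %a' are treated
/// as compatible here, since they have swapped predicates and swapped
/// operands.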
18505template <bool IsCompatibility>
18506static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
18507 const DominatorTree &DT) {
18508 assert(isValidElementType(V->getType()) &&
18509 isValidElementType(V2->getType()) &&
18510 "Expected valid element types only.");
18511 if (V == V2)
18512 return IsCompatibility;
18513 auto *CI1 = cast<CmpInst>(Val: V);
18514 auto *CI2 = cast<CmpInst>(Val: V2);
18515 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() <
18516 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
18517 return !IsCompatibility;
18518 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() >
18519 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
18520 return false;
18521 CmpInst::Predicate Pred1 = CI1->getPredicate();
18522 CmpInst::Predicate Pred2 = CI2->getPredicate();
18523 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(pred: Pred1);
18524 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(pred: Pred2);
18525 CmpInst::Predicate BasePred1 = std::min(a: Pred1, b: SwapPred1);
18526 CmpInst::Predicate BasePred2 = std::min(a: Pred2, b: SwapPred2);
18527 if (BasePred1 < BasePred2)
18528 return !IsCompatibility;
18529 if (BasePred1 > BasePred2)
18530 return false;
18531 // Compare operands.
18532 bool CI1Preds = Pred1 == BasePred1;
18533 bool CI2Preds = Pred2 == BasePred1;
18534 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
18535 auto *Op1 = CI1->getOperand(i_nocapture: CI1Preds ? I : E - I - 1);
18536 auto *Op2 = CI2->getOperand(i_nocapture: CI2Preds ? I : E - I - 1);
18537 if (Op1 == Op2)
18538 continue;
18539 if (Op1->getValueID() < Op2->getValueID())
18540 return !IsCompatibility;
18541 if (Op1->getValueID() > Op2->getValueID())
18542 return false;
18543 if (auto *I1 = dyn_cast<Instruction>(Val: Op1))
18544 if (auto *I2 = dyn_cast<Instruction>(Val: Op2)) {
18545 if (IsCompatibility) {
18546 if (I1->getParent() != I2->getParent())
18547 return false;
18548 } else {
18549 // Try to compare nodes with same parent.
18550 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(BB: I1->getParent());
18551 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(BB: I2->getParent());
18552 if (!NodeI1)
18553 return NodeI2 != nullptr;
18554 if (!NodeI2)
18555 return false;
18556 assert((NodeI1 == NodeI2) ==
18557 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18558 "Different nodes should have different DFS numbers");
18559 if (NodeI1 != NodeI2)
18560 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18561 }
18562 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI);
18563 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18564 continue;
18565 if (IsCompatibility)
18566 return false;
18567 if (I1->getOpcode() != I2->getOpcode())
18568 return I1->getOpcode() < I2->getOpcode();
18569 }
18570 }
18571 return IsCompatibility;
18572}
18573
18574template <typename ItT>
18575bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
18576 BasicBlock *BB, BoUpSLP &R) {
18577 bool Changed = false;
18578 // Try to find reductions first.
18579 for (CmpInst *I : CmpInsts) {
18580 if (R.isDeleted(I))
18581 continue;
18582 for (Value *Op : I->operands())
18583 if (auto *RootOp = dyn_cast<Instruction>(Val: Op))
18584 Changed |= vectorizeRootInstruction(P: nullptr, Root: RootOp, BB, R, TTI);
18585 }
18586 // Try to vectorize operands as vector bundles.
18587 for (CmpInst *I : CmpInsts) {
18588 if (R.isDeleted(I))
18589 continue;
18590 Changed |= tryToVectorize(I, R);
18591 }
18592 // Try to vectorize list of compares.
18593 // Sort by type, compare predicate, etc.
18594 auto CompareSorter = [&](Value *V, Value *V2) {
18595 if (V == V2)
18596 return false;
18597 return compareCmp<false>(V, V2, TLI&: *TLI, DT: *DT);
18598 };
18599
18600 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
18601 if (V1 == V2)
18602 return true;
18603 return compareCmp<true>(V: V1, V2, TLI&: *TLI, DT: *DT);
18604 };
18605
18606 SmallVector<Value *> Vals;
18607 for (Instruction *V : CmpInsts)
18608 if (!R.isDeleted(I: V) && isValidElementType(Ty: V->getType()))
18609 Vals.push_back(Elt: V);
18610 if (Vals.size() <= 1)
18611 return Changed;
18612 Changed |= tryToVectorizeSequence<Value>(
18613 Vals, CompareSorter, AreCompatibleCompares,
18614 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18615 // Exclude possible reductions from other blocks.
18616 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
18617 return any_of(V->users(), [V](User *U) {
18618 auto *Select = dyn_cast<SelectInst>(Val: U);
18619 return Select &&
18620 Select->getParent() != cast<Instruction>(Val: V)->getParent();
18621 });
18622 });
18623 if (ArePossiblyReducedInOtherBlock)
18624 return false;
18625 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
18626 },
18627 /*MaxVFOnly=*/true, R);
18628 return Changed;
18629}
18630
18631bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18632 BasicBlock *BB, BoUpSLP &R) {
18633 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18634 "This function only accepts Insert instructions");
18635 bool OpsChanged = false;
18636 SmallVector<WeakTrackingVH> PostponedInsts;
18637 for (auto *I : reverse(C&: Instructions)) {
18638 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
18639 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
18640 continue;
18641 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
18642 OpsChanged |=
18643 vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/true);
18644 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
18645 OpsChanged |=
18646 vectorizeInsertElementInst(IEI: LastInsertElem, BB, R, /*MaxVFOnly=*/true);
18647 }
18648 // pass2 - try to vectorize reductions only
18649 if (R.isDeleted(I))
18650 continue;
18651 OpsChanged |= vectorizeHorReduction(P: nullptr, Root: I, BB, R, TTI, PostponedInsts);
18652 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
18653 continue;
18654 // pass3 - try to match and vectorize a buildvector sequence.
18655 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
18656 OpsChanged |=
18657 vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/false);
18658 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
18659 OpsChanged |= vectorizeInsertElementInst(IEI: LastInsertElem, BB, R,
18660 /*MaxVFOnly=*/false);
18661 }
18662 }
18663 // Now try to vectorize postponed instructions.
18664 OpsChanged |= tryToVectorize(Insts: PostponedInsts, R);
18665
18666 Instructions.clear();
18667 return OpsChanged;
18668}
18669
18670bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
18671 bool Changed = false;
18672 SmallVector<Value *, 4> Incoming;
18673 SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. This helps to better identify the chains that can be vectorized.
18677 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
18678 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
18679 assert(isValidElementType(V1->getType()) &&
18680 isValidElementType(V2->getType()) &&
18681 "Expected vectorizable types only.");
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers; we don't care about other types.
18684 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
18685 return true;
18686 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
18687 return false;
18688 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18689 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18690 if (Opcodes1.size() < Opcodes2.size())
18691 return true;
18692 if (Opcodes1.size() > Opcodes2.size())
18693 return false;
18694 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18695 {
18696 // Instructions come first.
18697 auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]);
18698 auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I]);
18699 if (I1 && I2) {
18700 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
18701 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
18702 if (!NodeI1)
18703 return NodeI2 != nullptr;
18704 if (!NodeI2)
18705 return false;
18706 assert((NodeI1 == NodeI2) ==
18707 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18708 "Different nodes should have different DFS numbers");
18709 if (NodeI1 != NodeI2)
18710 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18711 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
18712 if (S.getOpcode() && !S.isAltShuffle())
18713 continue;
18714 return I1->getOpcode() < I2->getOpcode();
18715 }
18716 if (I1)
18717 return true;
18718 if (I2)
18719 return false;
18720 }
18721 {
18722 // Non-undef constants come next.
18723 bool C1 = isa<Constant>(Val: Opcodes1[I]) && !isa<UndefValue>(Val: Opcodes1[I]);
18724 bool C2 = isa<Constant>(Val: Opcodes2[I]) && !isa<UndefValue>(Val: Opcodes2[I]);
18725 if (C1 && C2)
18726 continue;
18727 if (C1)
18728 return true;
18729 if (C2)
18730 return false;
18731 }
18732 bool U1 = isa<UndefValue>(Val: Opcodes1[I]);
18733 bool U2 = isa<UndefValue>(Val: Opcodes2[I]);
18734 {
18735 // Non-constant non-instructions come next.
18736 if (!U1 && !U2) {
18737 auto ValID1 = Opcodes1[I]->getValueID();
18738 auto ValID2 = Opcodes2[I]->getValueID();
18739 if (ValID1 == ValID2)
18740 continue;
18741 if (ValID1 < ValID2)
18742 return true;
18743 if (ValID1 > ValID2)
18744 return false;
18745 }
18746 if (!U1)
18747 return true;
18748 if (!U2)
18749 return false;
18750 }
18751 // Undefs come last.
18752 assert(U1 && U2 && "The only thing left should be undef & undef.");
18753 continue;
18754 }
18755 return false;
18756 };
18757 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
18758 if (V1 == V2)
18759 return true;
18760 if (V1->getType() != V2->getType())
18761 return false;
18762 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18763 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18764 if (Opcodes1.size() != Opcodes2.size())
18765 return false;
18766 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18767 // Undefs are compatible with any other value.
18768 if (isa<UndefValue>(Val: Opcodes1[I]) || isa<UndefValue>(Val: Opcodes2[I]))
18769 continue;
18770 if (auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]))
18771 if (auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I])) {
18772 if (R.isDeleted(I: I1) || R.isDeleted(I: I2))
18773 return false;
18774 if (I1->getParent() != I2->getParent())
18775 return false;
18776 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
18777 if (S.getOpcode())
18778 continue;
18779 return false;
18780 }
18781 if (isa<Constant>(Val: Opcodes1[I]) && isa<Constant>(Val: Opcodes2[I]))
18782 continue;
18783 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
18784 return false;
18785 }
18786 return true;
18787 };
18788
18789 bool HaveVectorizedPhiNodes = false;
18790 do {
18791 // Collect the incoming values from the PHIs.
18792 Incoming.clear();
18793 for (Instruction &I : *BB) {
18794 auto *P = dyn_cast<PHINode>(Val: &I);
18795 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
18796 break;
18797
18798 // No need to analyze deleted, vectorized and non-vectorizable
18799 // instructions.
18800 if (!VisitedInstrs.count(Ptr: P) && !R.isDeleted(I: P) &&
18801 isValidElementType(Ty: P->getType()))
18802 Incoming.push_back(Elt: P);
18803 }
18804
18805 if (Incoming.size() <= 1)
18806 break;
18807
18808 // Find the corresponding non-phi nodes for better matching when trying to
18809 // build the tree.
18810 for (Value *V : Incoming) {
18811 SmallVectorImpl<Value *> &Opcodes =
18812 PHIToOpcodes.try_emplace(Key: V).first->getSecond();
18813 if (!Opcodes.empty())
18814 continue;
18815 SmallVector<Value *, 4> Nodes(1, V);
18816 SmallPtrSet<Value *, 4> Visited;
18817 while (!Nodes.empty()) {
18818 auto *PHI = cast<PHINode>(Val: Nodes.pop_back_val());
18819 if (!Visited.insert(Ptr: PHI).second)
18820 continue;
18821 for (Value *V : PHI->incoming_values()) {
18822 if (auto *PHI1 = dyn_cast<PHINode>(Val: (V))) {
18823 Nodes.push_back(Elt: PHI1);
18824 continue;
18825 }
18826 Opcodes.emplace_back(Args&: V);
18827 }
18828 }
18829 }
18830
18831 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18832 Incoming, Comparator: PHICompare, AreCompatible: AreCompatiblePHIs,
18833 TryToVectorizeHelper: [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18834 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
18835 },
18836 /*MaxVFOnly=*/true, R);
18837 Changed |= HaveVectorizedPhiNodes;
18838 if (HaveVectorizedPhiNodes && any_of(Range&: PHIToOpcodes, P: [&](const auto &P) {
18839 auto *PHI = dyn_cast<PHINode>(P.first);
18840 return !PHI || R.isDeleted(I: PHI);
18841 }))
18842 PHIToOpcodes.clear();
18843 VisitedInstrs.insert(I: Incoming.begin(), E: Incoming.end());
18844 } while (HaveVectorizedPhiNodes);
18845
18846 VisitedInstrs.clear();
18847
18848 InstSetVector PostProcessInserts;
18849 SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
  // true, also vectorizes `PostProcessCmps`.
18852 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
18853 bool Changed = vectorizeInserts(Instructions&: PostProcessInserts, BB, R);
18854 if (VectorizeCmps) {
18855 Changed |= vectorizeCmpInsts(CmpInsts: reverse(C&: PostProcessCmps), BB, R);
18856 PostProcessCmps.clear();
18857 }
18858 PostProcessInserts.clear();
18859 return Changed;
18860 };
18861 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
18862 auto IsInPostProcessInstrs = [&](Instruction *I) {
18863 if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
18864 return PostProcessCmps.contains(key: Cmp);
18865 return isa<InsertElementInst, InsertValueInst>(Val: I) &&
18866 PostProcessInserts.contains(key: I);
18867 };
  // Returns true if `I` is an instruction without users, like a terminator, a
  // store, or a function call with an ignored return value. An instruction is
  // treated as unused based on its type (void result), except for CallInst and
  // InvokeInst, which are accepted even if their result is non-void.
18871 auto HasNoUsers = [](Instruction *I) {
18872 return I->use_empty() &&
18873 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(Val: I));
18874 };
18875 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
18876 // Skip instructions with scalable type. The num of elements is unknown at
18877 // compile-time for scalable type.
18878 if (isa<ScalableVectorType>(Val: It->getType()))
18879 continue;
18880
18881 // Skip instructions marked for the deletion.
18882 if (R.isDeleted(I: &*It))
18883 continue;
    // We may go through BB multiple times, so skip instructions we have
    // already visited.
18885 if (!VisitedInstrs.insert(Ptr: &*It).second) {
18886 if (HasNoUsers(&*It) &&
18887 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
18890 Changed = true;
18891 It = BB->begin();
18892 E = BB->end();
18893 }
18894 continue;
18895 }
18896
18897 if (isa<DbgInfoIntrinsic>(Val: It))
18898 continue;
18899
18900 // Try to vectorize reductions that use PHINodes.
18901 if (PHINode *P = dyn_cast<PHINode>(Val&: It)) {
18902 // Check that the PHI is a reduction PHI.
18903 if (P->getNumIncomingValues() == 2) {
18904 // Try to match and vectorize a horizontal reduction.
18905 Instruction *Root = getReductionInstr(DT, P, ParentBB: BB, LI);
18906 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
18907 Changed = true;
18908 It = BB->begin();
18909 E = BB->end();
18910 continue;
18911 }
18912 }
18913 // Try to vectorize the incoming values of the PHI, to catch reductions
18914 // that feed into PHIs.
18915 for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
18916 // Skip if the incoming block is the current BB for now. Also, bypass
18917 // unreachable IR for efficiency and to avoid crashing.
18918 // TODO: Collect the skipped incoming values and try to vectorize them
18919 // after processing BB.
18920 if (BB == P->getIncomingBlock(i: I) ||
18921 !DT->isReachableFromEntry(A: P->getIncomingBlock(i: I)))
18922 continue;
18923
18924 // Postponed instructions should not be vectorized here, delay their
18925 // vectorization.
18926 if (auto *PI = dyn_cast<Instruction>(Val: P->getIncomingValue(i: I));
18927 PI && !IsInPostProcessInstrs(PI)) {
18928 bool Res = vectorizeRootInstruction(P: nullptr, Root: PI,
18929 BB: P->getIncomingBlock(i: I), R, TTI);
18930 Changed |= Res;
18931 if (Res && R.isDeleted(I: P)) {
18932 It = BB->begin();
18933 E = BB->end();
18934 break;
18935 }
18936 }
18937 }
18938 continue;
18939 }
18940
18941 if (HasNoUsers(&*It)) {
18942 bool OpsChanged = false;
18943 auto *SI = dyn_cast<StoreInst>(Val&: It);
18944 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
18945 if (SI) {
18946 auto *I = Stores.find(Key: getUnderlyingObject(V: SI->getPointerOperand()));
        // Try to vectorize the chain feeding the store, if this is the only
        // store to the address in the block.
        // TODO: This is just a temporary solution to save compile time. Need
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
18953 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
18954 SI->getValueOperand()->hasOneUse();
18955 }
18956 if (TryToVectorizeRoot) {
18957 for (auto *V : It->operand_values()) {
18958 // Postponed instructions should not be vectorized here, delay their
18959 // vectorization.
18960 if (auto *VI = dyn_cast<Instruction>(Val: V);
18961 VI && !IsInPostProcessInstrs(VI))
18962 // Try to match and vectorize a horizontal reduction.
18963 OpsChanged |= vectorizeRootInstruction(P: nullptr, Root: VI, BB, R, TTI);
18964 }
18965 }
18966 // Start vectorization of post-process list of instructions from the
18967 // top-tree instructions to try to vectorize as many instructions as
18968 // possible.
18969 OpsChanged |=
18970 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
18971 if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
18974 Changed = true;
18975 It = BB->begin();
18976 E = BB->end();
18977 continue;
18978 }
18979 }
18980
18981 if (isa<InsertElementInst, InsertValueInst>(Val: It))
18982 PostProcessInserts.insert(X: &*It);
18983 else if (isa<CmpInst>(Val: It))
18984 PostProcessCmps.insert(X: cast<CmpInst>(Val: &*It));
18985 }
18986
18987 return Changed;
18988}
18989
18990bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
18991 auto Changed = false;
18992 for (auto &Entry : GEPs) {
18993 // If the getelementptr list has fewer than two elements, there's nothing
18994 // to do.
18995 if (Entry.second.size() < 2)
18996 continue;
18997
18998 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
18999 << Entry.second.size() << ".\n");
19000
19001 // Process the GEP list in chunks suitable for the target's supported
19002 // vector size. If a vector register can't hold 1 element, we are done. We
19003 // are trying to vectorize the index computations, so the maximum number of
19004 // elements is based on the size of the index expression, rather than the
19005 // size of the GEP itself (the target's pointer size).
19006 auto *It = find_if(Range&: Entry.second, P: [&](GetElementPtrInst *GEP) {
19007 return !R.isDeleted(I: GEP);
19008 });
19009 if (It == Entry.second.end())
19010 continue;
19011 unsigned MaxVecRegSize = R.getMaxVecRegSize();
19012 unsigned EltSize = R.getVectorElementSize(V: *(*It)->idx_begin());
19013 if (MaxVecRegSize < EltSize)
19014 continue;
19015
19016 unsigned MaxElts = MaxVecRegSize / EltSize;
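    // E.g. (illustrative): with a 128-bit vector register and 64-bit index
    // expressions, the GEP list is processed in chunks of at most 2.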
19017 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
19018 auto Len = std::min<unsigned>(a: BE - BI, b: MaxElts);
19019 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
19020
      // Initialize a set of candidate getelementptrs. Note that we use a
19022 // SetVector here to preserve program order. If the index computations
19023 // are vectorizable and begin with loads, we want to minimize the chance
19024 // of having to reorder them later.
19025 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
19026
19027 // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index was optimized to a constant
      // value. If so, they are marked as deleted, so remove them from the set
      // of candidates.
19031 Candidates.remove_if(P: [&R](Value *I) {
19032 return R.isDeleted(I: cast<Instruction>(Val: I)) ||
19033 isa<Constant>(Val: cast<GetElementPtrInst>(Val: I)->idx_begin()->get());
19034 });
19035
19036 // Remove from the set of candidates all pairs of getelementptrs with
19037 // constant differences. Such getelementptrs are likely not good
19038 // candidates for vectorization in a bottom-up phase since one can be
19039 // computed from the other. We also ensure all candidate getelementptr
19040 // indices are unique.
19041 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
19042 auto *GEPI = GEPList[I];
19043 if (!Candidates.count(key: GEPI))
19044 continue;
19045 auto *SCEVI = SE->getSCEV(V: GEPList[I]);
19046 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
19047 auto *GEPJ = GEPList[J];
19048 auto *SCEVJ = SE->getSCEV(V: GEPList[J]);
19049 if (isa<SCEVConstant>(Val: SE->getMinusSCEV(LHS: SCEVI, RHS: SCEVJ))) {
19050 Candidates.remove(X: GEPI);
19051 Candidates.remove(X: GEPJ);
19052 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
19053 Candidates.remove(X: GEPJ);
19054 }
19055 }
19056 }
19057
19058 // We break out of the above computation as soon as we know there are
19059 // fewer than two candidates remaining.
19060 if (Candidates.size() < 2)
19061 continue;
19062
19063 // Add the single, non-constant index of each candidate to the bundle. We
19064 // ensured the indices met these constraints when we originally collected
19065 // the getelementptrs.
19066 SmallVector<Value *, 16> Bundle(Candidates.size());
19067 auto BundleIndex = 0u;
19068 for (auto *V : Candidates) {
19069 auto *GEP = cast<GetElementPtrInst>(Val: V);
19070 auto *GEPIdx = GEP->idx_begin()->get();
19071 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
19072 Bundle[BundleIndex++] = GEPIdx;
19073 }
19074
19075 // Try and vectorize the indices. We are currently only interested in
19076 // gather-like cases of the form:
19077 //
19078 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
19079 //
19080 // where the loads of "a", the loads of "b", and the subtractions can be
19081 // performed in parallel. It's likely that detecting this pattern in a
19082 // bottom-up phase will be simpler and less costly than building a
19083 // full-blown top-down phase beginning at the consecutive loads.
19084 Changed |= tryToVectorizeList(VL: Bundle, R);
19085 }
19086 }
19087 return Changed;
19088}
19089
19090bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
19091 bool Changed = false;
  // Sort by type, base pointers and value operands. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
19095 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
19096 if (V->getValueOperand()->getType()->getTypeID() <
19097 V2->getValueOperand()->getType()->getTypeID())
19098 return true;
19099 if (V->getValueOperand()->getType()->getTypeID() >
19100 V2->getValueOperand()->getType()->getTypeID())
19101 return false;
19102 if (V->getPointerOperandType()->getTypeID() <
19103 V2->getPointerOperandType()->getTypeID())
19104 return true;
19105 if (V->getPointerOperandType()->getTypeID() >
19106 V2->getPointerOperandType()->getTypeID())
19107 return false;
    // UndefValues are compatible with all other values, so treat them as
    // equal for ordering purposes.
19109 if (isa<UndefValue>(Val: V->getValueOperand()) ||
19110 isa<UndefValue>(Val: V2->getValueOperand()))
19111 return false;
19112 if (auto *I1 = dyn_cast<Instruction>(Val: V->getValueOperand()))
19113 if (auto *I2 = dyn_cast<Instruction>(Val: V2->getValueOperand())) {
19114 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
19115 DT->getNode(BB: I1->getParent());
19116 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
19117 DT->getNode(BB: I2->getParent());
19118 assert(NodeI1 && "Should only process reachable instructions");
19119 assert(NodeI2 && "Should only process reachable instructions");
19120 assert((NodeI1 == NodeI2) ==
19121 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
19122 "Different nodes should have different DFS numbers");
19123 if (NodeI1 != NodeI2)
19124 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
19125 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
19126 if (S.getOpcode())
19127 return false;
19128 return I1->getOpcode() < I2->getOpcode();
19129 }
19130 if (isa<Constant>(Val: V->getValueOperand()) &&
19131 isa<Constant>(Val: V2->getValueOperand()))
19132 return false;
19133 return V->getValueOperand()->getValueID() <
19134 V2->getValueOperand()->getValueID();
19135 };
19136
19137 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
19138 if (V1 == V2)
19139 return true;
19140 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
19141 return false;
19142 if (V1->getPointerOperandType() != V2->getPointerOperandType())
19143 return false;
19144 // Undefs are compatible with any other value.
19145 if (isa<UndefValue>(Val: V1->getValueOperand()) ||
19146 isa<UndefValue>(Val: V2->getValueOperand()))
19147 return true;
19148 if (auto *I1 = dyn_cast<Instruction>(Val: V1->getValueOperand()))
19149 if (auto *I2 = dyn_cast<Instruction>(Val: V2->getValueOperand())) {
19150 if (I1->getParent() != I2->getParent())
19151 return false;
19152 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
19153 return S.getOpcode() > 0;
19154 }
19155 if (isa<Constant>(Val: V1->getValueOperand()) &&
19156 isa<Constant>(Val: V2->getValueOperand()))
19157 return true;
19158 return V1->getValueOperand()->getValueID() ==
19159 V2->getValueOperand()->getValueID();
19160 };
19161
19162 // Attempt to sort and vectorize each of the store-groups.
19163 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
19164 for (auto &Pair : Stores) {
19165 if (Pair.second.size() < 2)
19166 continue;
19167
19168 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
19169 << Pair.second.size() << ".\n");
19170
19171 if (!isValidElementType(Ty: Pair.second.front()->getValueOperand()->getType()))
19172 continue;
19173
    // Reverse stores to do bottom-to-top analysis. This is important if there
    // are several stores to the same address: in this case we need to follow
    // the store order (reversed to meet the memory dependencies).
19177 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
19178 Pair.second.rend());
19179 Changed |= tryToVectorizeSequence<StoreInst>(
19180 Incoming&: ReversedStores, Comparator: StoreSorter, AreCompatible: AreCompatibleStores,
19181 TryToVectorizeHelper: [&](ArrayRef<StoreInst *> Candidates, bool) {
19182 return vectorizeStores(Stores: Candidates, R, Visited&: Attempted);
19183 },
19184 /*MaxVFOnly=*/false, R);
19185 }
19186 return Changed;
19187}
19188